## Main Purpose
針對新聞文章進行前處理
- Input: 
    - 新聞文章之 txt 檔（一篇文章一個檔案，檔名為文章 id）
    - 日期與媒體來源標記
    - 停用詞庫
    - 連續詞庫
    - 同義詞庫
- Output: 二維的陣列（每篇文章對應一個前處理後的詞彙串列），並輸出為 `../results/preprocessed_news.csv`

## Read the data

In [1]:
import os
import json
import string
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the news articles (one "<id>.txt" file per article) from ../txt/.
all_files = os.listdir("../txt/")

# Collect the rows in a plain list and build the frame once: growing a
# DataFrame row by row with .loc copies the data on every insert.
article_rows = []
for file in all_files:
    # Only "*.txt" entries are articles. This skips labels.csv as well as
    # stray entries such as .ipynb_checkpoints or .DS_Store, which would
    # otherwise crash open() or int().
    if not file.endswith('.txt'):
        continue
    with open("../txt/" + file, 'r', encoding='utf-8') as f:
        # The file stem is the integer article id. Lines are joined with a
        # single space; each line keeps its own trailing newline.
        article_rows.append([int(file.replace('.txt', '')), " ".join(f.readlines())])
df_articles = pd.DataFrame(article_rows, columns=['id', 'contents'])

# Load the date / media-source labels. Reading the file directly raises a
# clear FileNotFoundError when it is missing.
df_label = pd.read_csv("../txt/labels.csv")

# Attach the labels to the articles by id, keeping every article.
df_news = pd.merge(df_articles, df_label, on='id', how='left').reset_index(drop=True)

# Drop rows with any missing value (e.g. articles without a label row).
df_news = df_news.dropna()

df_news.head()

Unnamed: 0,id,contents,source,date
0,0,7/1/2021\n https://www-proquest-com.falcon.lib...,Boston Globe,202004
1,1,7/2/2021\n https://www-proquest-com.falcon.lib...,Boston Globe,202004
2,10,Find a copy\n Abstract\n document 1 of 1\n Ful...,Boston Globe,202103
3,100,6/24/2021\n https://www-proquest-com.falcon.li...,Others,202003
4,101,6/24/2021\n https://www-proquest-com.falcon.li...,Others,202003


In [3]:
# Load the multi-word ("continuous") term list. The encoding is given
# explicitly so decoding does not depend on the platform default.
with open('../wordbook/continous-words.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Split every line into a tuple of words, then drop the first and last line.
# NOTE(review): presumably those two lines are header/footer markers of the
# file rather than terms - confirm against the word book.
data = [tuple(line.strip().split()) for line in lines][1:-1]

# Remove ASCII punctuation from every word in a single pass and re-join the
# words of each term with one space, e.g. ('climate', 'change,') ->
# 'climate change'.
punctuation_table = str.maketrans('', '', string.punctuation)
continuous_words = [
    " ".join(word.translate(punctuation_table) for word in item)
    for item in data
]

In [4]:
# Load the project-specific stopword list. Only the first line of the file is
# used; it holds the words as a quoted, comma-separated array: ["a","b",...].
with open('../wordbook/stopwords.txt', 'r', encoding='utf-8') as file:
    lines = file.readline()

# Parse the line as JSON so that surrounding whitespace, a trailing newline or
# spaces after the commas cannot leak quotes/brackets into the words.
try:
    parsed_stopwords = json.loads(lines)
except json.JSONDecodeError:
    parsed_stopwords = None

if isinstance(parsed_stopwords, list):
    our_stopwords = [str(word) for word in parsed_stopwords]
else:
    # Fall back to the legacy manual split when the line is not valid JSON.
    our_stopwords = lines.strip().strip('["').strip('"]').split('","')
our_stopwords[0:10]

['publication',
 'document',
 'newspaper',
 'database',
 'login',
 'docview',
 'copyright',
 'se',
 'llc',
 'inc']

## Data Preprocessing

In [5]:
from nltk import pos_tag
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

In [6]:
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    The tag is matched by its leading letter: J -> adjective, V -> verb,
    N -> noun, R -> adverb, S -> satellite adjective. Any other tag is
    treated as a noun.
    """
    # Checked in order; the first matching prefix wins.
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
        ('S', wordnet.ADJ_SAT),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Fallback for every tag that matched none of the prefixes.
    return wordnet.NOUN

# Penn Treebank tags that survive the POS filter: NN/NNS nouns and all verb
# forms. Proper-noun tags (NNP/NNPS) are not in the list.
_KEPT_POS_TAGS = frozenset(['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])


def _merge_continuous_words(tokens, phrases):
    """Join adjacent token pairs found in *phrases* into one 'a_b' token."""
    merged = []
    i = 0
    last = len(tokens) - 1
    while i <= last:
        if i < last and tokens[i] + ' ' + tokens[i + 1] in phrases:
            merged.append(tokens[i] + '_' + tokens[i + 1])
            # Skip the consumed partner; a merged token is never merged again.
            i += 2
        else:
            merged.append(tokens[i])
            i += 1
    return merged


def _extract_body(tokens):
    """Keep the tokens after the first 'full_text' and before the next 'subject'."""
    if "full_text" in tokens:
        tokens = tokens[tokens.index("full_text") + 1:]
    if "subject" in tokens:
        tokens = tokens[:tokens.index("subject")]
    return tokens


def preprocess(articles):
    """Turn raw article texts into lists of cleaned, lemmatized tokens.

    Steps applied to every article: lowercase, remove CJK ideographs
    (U+4E00-U+9FFF), tokenize, merge two-word terms listed in the module-level
    ``continuous_words``, cut the token list down to the part between the
    ``full_text`` and ``subject`` markers (when present), POS-tag, keep only
    nouns and verbs, lemmatize, drop tokens shorter than three characters and
    drop NLTK English stopwords plus the module-level ``our_stopwords``.

    Args:
        articles: Iterable of article texts (strings).

    Returns:
        A list with one list of tokens per input article, in input order.
    """
    tokenizer = RegexpTokenizer(r'\b[^\'\d\W]+\b')
    chinese_chars = re.compile(r'[\u4e00-\u9fff]+')
    lemmatizer = WordNetLemmatizer()

    # Build the lookup sets once: these collections are lists, and testing
    # membership against a list for every token is a linear scan each time.
    phrases = set(continuous_words)
    blocked_words = set(stopwords.words('english'))
    blocked_words.update(our_stopwords)

    preprocessed_articles = []
    for article in articles:
        text = chinese_chars.sub('', article.lower())
        tokens = tokenizer.tokenize(text)
        tokens = _merge_continuous_words(tokens, phrases)
        tokens = _extract_body(tokens)
        # Tag first, then lemmatize only the nouns and verbs.
        lemmas = [
            lemmatizer.lemmatize(word, get_wordnet_pos(pos))
            for word, pos in pos_tag(tokens)
            if pos in _KEPT_POS_TAGS
        ]
        preprocessed_articles.append([
            word for word in lemmas
            if len(word) > 2 and word.lower() not in blocked_words
        ])
    return preprocessed_articles

In [7]:
# Run the preprocessing pipeline on the raw article texts and store the
# resulting token lists as a new column next to the original contents.
articles_to_preprocess = df_news['contents'].to_list()
preprocessed_articles = preprocess(articles_to_preprocess)
df_news['preprocessed_articles'] = preprocessed_articles

In [8]:
# Make sure the output folder exists so the export does not fail on a fresh
# checkout.
os.makedirs('../results', exist_ok=True)
# NOTE: each token list is written to the CSV as its string representation.
df_news.to_csv('../results/preprocessed_news.csv', index=False)