In [1]:
import pandas as pd
import numpy as np
import os
import re
import math

In [2]:
def preprocess(sent, n_gram, stopword):
    """Cut a text into overlapping character n-grams.

    Every non-word character (punctuation, whitespace, ...) and every ASCII
    letter or digit is deleted first. The sliding window then runs over the
    remaining characters, so an n-gram may span what used to be a punctuation
    boundary. Underscores are word characters and are therefore kept.

    Parameters
    ----------
    sent : str
        Raw text (the notebook passes title + body of one article).
    n_gram : int
        Window length in characters.
    stopword : collection of str
        N-grams to discard. Only windows equal to an entry are removed, so
        entries whose length differs from ``n_gram`` have no effect.

    Returns
    -------
    pandas.DataFrame
        A single column ``'word'`` with one row per kept n-gram occurrence,
        in text order and with duplicates retained, e.g.::

            word
            ----
            A
            B
            A

        The frame is empty when the cleaned text is shorter than ``n_gram``.
    """
    sent = re.sub(r'[^\w]', '', sent)
    sent = re.sub(r'[A-Za-z0-9]', '', sent)

    # Membership in a list/tuple is a linear scan repeated for every window.
    # Hash the entries once instead; fall back to the original container if
    # it holds something unhashable.
    excluded = stopword
    if isinstance(stopword, (list, tuple)):
        try:
            excluded = frozenset(stopword)
        except TypeError:
            excluded = stopword

    window_count = len(sent) - n_gram + 1
    kept = [
        sent[start:start + n_gram]
        for start in range(window_count)
        if sent[start:start + n_gram] not in excluded
    ]
    return pd.DataFrame(kept, columns=['word'])

In [3]:
# will create case_list and article_list
class case:
    """One topic keyword together with the articles that mention it."""

    def __init__(self, topic):
        # keyword that defines the topic
        self.topic = topic
        # labels of the articles that belong to the topic, e.g. [7, 9, ...]
        self._articles = list()
        # word tables of the whole topic keyed by n-gram length: {2: df, 3: df, ...}
        self._words = dict()

    def add_articles(self, _index):
        """Append one article label to this topic's article list."""
        self._articles.append(_index)
        
class article:
    """Term-frequency tables of a single article, one per n-gram length."""

    def __init__(self, _index):
        self.index = _index
        # {2: df, 3: df, ...}; each df has the columns 'word' and 'tf'
        self._ngram = {}

    def addNgram(self, n, gram_list):
        """Store the term frequencies of the article's n-grams of length ``n``.

        Parameters
        ----------
        n : int
            N-gram length; used as the key in ``self._ngram``.
        gram_list : pandas.DataFrame
            One row per n-gram occurrence in a column named ``'word'``
            (the notebook passes the result of the global ``preprocess``).

        The occurrences are grouped by word and counted, giving::

            word    tf
            ----------
            A       2
            B       1
            ...

        The table is computed before it is stored, so a failing ``groupby``
        (e.g. no ``'word'`` column) does not leave a raw, un-aggregated frame
        behind in ``self._ngram[n]``.
        """
        # size() returns the number of rows in each group
        term_freq = gram_list.groupby('word').size().reset_index(name='tf')
        self._ngram[n] = term_freq

In [4]:
%%time
# Load the article collection and the stop-word list ("stopwords.txt").
# Runs in about 10 s.
collections = pd.read_excel('bda2020_hw1/hw1_text.xlsx', index_col=0)
n = len(collections)  # number of rows in the collection
stopword = []
with open('bda2020_hw1/stopwords.txt', 'r', encoding='utf-8') as file:
    # one entry per line, with surrounding whitespace / newline removed
    stopword.extend(line.strip() for line in file.readlines())

Wall time: 8.88 s


In [5]:
%%time
# Identify the articles that belong to each topic.
# The original version took about 1 min because it ran iterrows() over the
# whole collection once per topic; here the collection is scanned once.

topics = ['銀行','信用卡','匯率','台積電','台灣','日本']

case_list = [case(i) for i in topics]
# Article labels are used directly as positions in article_list, hence one
# extra slot for position 0 (presumably the labels run from 1 to the row
# count; verify against the workbook). The size is taken from `collections`
# instead of the global `n`, because a later cell rebinds `n` to a dict and
# re-running this cell would then fail.
article_list = [article(index) for index in range(len(collections) + 1)]

# Title + body, concatenated once per article rather than once per
# (topic, article) pair.
full_text = collections["標題"] + collections["內容"]

for index, text in full_text.items():
    for c in case_list:
        if c.topic in text:
            c.add_articles(index)

# Labels of the articles that will be processed later. An article matching
# several topics is listed once per topic; content and order are the same as
# when the labels were collected topic by topic.
deal = [index for c in case_list for index in c._articles]
           

Wall time: 1min 4s


In [6]:
# Show how many articles were matched to each topic.

N = len(collections)
# NOTE: from here on `n` is a {topic: number of articles} mapping, no longer
# the row count assigned when the data was loaded.
n = {}
for entry in case_list:
    matched = len(entry._articles)
    n[entry.topic] = matched
    print(entry.topic, matched, sep='\n')

銀行
6674
信用卡
653
匯率
1951
台積電
1738
台灣
27232
日本
8235


In [2]:
%%time
# Cut the n-grams (for every n in K) of each article that matched a topic.
# The original version took about 15 min.
K = [2, 3, 4, 5, 6]

# `deal` is a long list with repeated labels; testing membership against it
# for every row is a linear scan each time, so look labels up in a set.
wanted = set(deal)

for index, val in collections.iterrows():  # about 90000 rows
    if index not in wanted:
        continue
    text = val["標題"] + val["內容"]  # built once, reused for every k
    for k in K:
        article_list[index].addNgram(k, preprocess(text, k, stopword))

In [4]:
%%time
# Compute document frequency (case_df) and term frequency (case_tf) per topic.
#
# For every topic and every n-gram length k, the per-article tables
# (columns: word, tf) are stacked and aggregated in a single pass:
#   case_df = number of the topic's articles that contain the word
#   case_tf = total number of occurrences of the word in those articles
#
#  article 1        article 2             result
#  word  tf         word  tf              word  case_df  case_tf
#   A     2          B     3               A      1        2
#   C     5          C     1               C      2        6
#                    D     2               B      1        3
#                                          D      1        2
#
# This replaces the earlier approach of outer-merging one article at a time
# into the running table. That approach rescanned the whole vocabulary for
# every article (about 2 hours in total, which is why it was limited to one
# topic and one k per run), depended on chained `fillna(inplace=True)` and on
# `DataFrame.sum(axis=1)` silently skipping the string column, and left
# case_df / case_tf at 0 for a topic with a single article.
# Because the aggregation is cheap, all topics and all k are processed here;
# the tf-idf cell further down reads case_list[4] for every k.
#
# Optional speed-up kept from the original notes: words whose case_df is very
# small can be dropped afterwards, e.g.
#   c._words[k] = c._words[k][c._words[k]['case_df'] > discard]
# (`discard` is a threshold that is not defined in this notebook).

for c in case_list:
    for k in K:
        per_article = [article_list[i]._ngram[k] for i in c._articles]

        if not per_article:
            # topic without articles: keep the expected columns, no rows
            c._words[k] = pd.DataFrame({
                'word': pd.Series(dtype=object),
                'case_df': pd.Series(dtype=float),
                'case_tf': pd.Series(dtype=float),
            })
            continue

        stacked = pd.concat(per_article, ignore_index=True)
        c._words[k] = (
            # sort=False keeps the words in order of first appearance
            stacked.groupby('word', sort=False)['tf']
            # each article contributes at most one row per word, so the
            # group size is the number of articles containing the word
            .agg(case_df='size', case_tf='sum')
            # float columns, matching the tables produced previously
            .astype(float)
            .reset_index()
        )

In [120]:
# tf-idf weight of every word, using base-10 logarithms:
#   tf-idf = (1 + log10(case_tf)) * log10(doc / case_df)
# where doc is the number of articles of the topic.

for c in case_list[4:5]:  # 台灣 only
    doc = n[c.topic]
    for k in K:
        words = c._words[k]
        tf_weight = 1 + np.log10(words['case_tf'])
        idf = np.log10(doc / words['case_df'])
        words.loc[:, 'tf-idf'] = tf_weight * idf
        print(c.topic, 'k =', k)
        print(words)


台灣 k = 2
      word  case_df  case_tf    tf-idf
0       一年   1450.0   1911.0  5.453091
1       一片    233.0    257.0  7.050799
2       不少   1496.0   1794.0  5.360446
3       世界   3522.0   5863.0  4.235474
4       主題   1136.0   1647.0  5.817777
...    ...      ...      ...       ...
74225   長巫      1.0      1.0  4.435080
74226   陸維      1.0      1.0  4.435080
74227   項任      1.0      1.0  4.435080
74228   願與      1.0      1.0  4.435080
74229   麼期      1.0      1.0  4.435080

[74230 rows x 4 columns]
台灣 k = 3
      word  case_df  case_tf    tf-idf
0      台灣最    630.0    731.0  6.320360
1      新的一     54.0     63.0  7.565738
2      的一年    114.0    143.0  7.503940
3      的開始    120.0    135.0  7.374748
4      下半年    838.0   1368.0  6.253082
...    ...      ...      ...       ...
89206  非常趨      1.0      1.0  4.435080
89207  項任命      1.0      1.0  4.435080
89208  願與大      1.0      1.0  4.435080
89209  顯示她      1.0      1.0  4.435080
89210  麼期待      1.0      1.0  4.435080

[89211 rows x 4 col

In [104]:
# Write the word table of case_list[4] (台灣) for each n-gram length to its
# own workbook; the sheet is named "<k>-gram".
taiwan_words = case_list[4]._words
exports = [
    (2, 'TW.xlsx'),
    (3, 'TW1.xlsx'),
    (4, 'TW2.xlsx'),
    (5, 'TW3.xlsx'),
    (6, 'TW4.xlsx'),
]
for k, filename in exports:
    taiwan_words[k].to_excel(filename, sheet_name='%d-gram' % k)