# Preprocessing
1. Remove stop word
2. Create dictionary - order is not important --> save file
3. Calculate tf-idf for each document --> filter top 400 words
4. Replace word by index (of dictionary) - KEEP order
5. Create a array of sequences - each sequence is a document --> save file

In [1]:
import os
import pandas as pd
from nltk import word_tokenize
import time
import nltk
import itertools
import operator
import codecs
from sklearn.feature_extraction.text import TfidfVectorizer
import heapq
import numpy as np
import re
import threading
import bottleneck as bn

In [57]:
def clean_str(string):
    """
    Remove ending character such as .,?;!:
    """
    endings = ['.', '?', ',', '!', ':', ';', '`']
    while len(string) > 0 and string[-1] in endings:
        string = string[:-1]
    
    return string.strip().lower()

In [58]:
def is_string(string):
    return not (re.match('(\d{1,})', string)
                or re.match('(\d{1,},\d{1,})', string) 
                or re.match('(\d{1,}/\d{1,})', string) 
                or re.match('(\d{1,}.\d{1,})', string))

### Load data - split sentences to words and lowercase

In [60]:
df = pd.read_csv('data/1_data.csv', encoding='utf-8')

In [61]:
# split sentences to word
start_time = time.time()
df['tokens'] = df['content'].apply(lambda x: word_tokenize(x))
print('Finish spliting sentences into words: %f s' % (time.time() - start_time))

Finish spliting sentences into words: 123.785000 s


In [62]:
df.head()

Unnamed: 0,content,label,tokens
0,Nhà_vua và Hoàng_hậu Nhật_Bản thăm Cố_đô Huế Đ...,the-gioi,"[Nhà_vua, và, Hoàng_hậu, Nhật_Bản, thăm, Cố_đô..."
1,Chiếc tuần_dương_hạm ngầm xấu_số của hải_quân ...,the-gioi,"[Chiếc, tuần_dương_hạm, ngầm, xấu_số, của, hải..."
2,Máy_bay chở gia_đình Bin_Laden rơi vì hạ_cánh ...,the-gioi,"[Máy_bay, chở, gia_đình, Bin_Laden, rơi, vì, h..."
3,Putin gửi lời chúc tới Trump không nhắc đến Ob...,the-gioi,"[Putin, gửi, lời, chúc, tới, Trump, không, nhắ..."
4,Trump không đưa Nga vào danh_sách ưu_tiên của ...,the-gioi,"[Trump, không, đưa, Nga, vào, danh_sách, ưu_ti..."


In [63]:
# Remove single-character tokens (mostly punctuation)
# Remove numbers
# convert all words into lowercase
start_time = time.time()
df['tokens'] = df['tokens'].apply(lambda x: [clean_str(word) for word in x if len(word) > 1 and is_string(word)])

# The last word is often author of document, so we remove it
df['tokens'] = df['tokens'].apply(lambda x: x[:-1])

print('Finish cleaning data: %f s' % (time.time() - start_time))
df.head()

Finish cleaning data: 193.131000 s


Unnamed: 0,content,label,tokens
0,Nhà_vua và Hoàng_hậu Nhật_Bản thăm Cố_đô Huế Đ...,the-gioi,"[nhà_vua, và, hoàng_hậu, nhật_bản, thăm, cố_đô..."
1,Chiếc tuần_dương_hạm ngầm xấu_số của hải_quân ...,the-gioi,"[chiếc, tuần_dương_hạm, ngầm, xấu_số, của, hải..."
2,Máy_bay chở gia_đình Bin_Laden rơi vì hạ_cánh ...,the-gioi,"[máy_bay, chở, gia_đình, bin_laden, rơi, vì, h..."
3,Putin gửi lời chúc tới Trump không nhắc đến Ob...,the-gioi,"[putin, gửi, lời, chúc, tới, trump, không, nhắ..."
4,Trump không đưa Nga vào danh_sách ưu_tiên của ...,the-gioi,"[trump, không, đưa, nga, vào, danh_sách, ưu_ti..."


In [67]:
df['content'] = df['tokens'].apply(lambda x: ' '.join(x))
df.drop('tokens', axis=1, inplace=True)
df.to_csv(os.path.join('data', '2_cleaned_data.csv'), index=False, encoding='utf-8')

### Step 1: Remove stopword

In [68]:
df = pd.read_csv(os.path.join('data', '2_cleaned_data.csv'), encoding='utf-8')
df['content'] = df['content'].apply(lambda x: x.split(' '))
df.head()

Unnamed: 0,content,label
0,"[nhà_vua, và, hoàng_hậu, nhật_bản, thăm, cố_đô...",the-gioi
1,"[chiếc, tuần_dương_hạm, ngầm, xấu_số, của, hải...",the-gioi
2,"[máy_bay, chở, gia_đình, bin_laden, rơi, vì, h...",the-gioi
3,"[putin, gửi, lời, chúc, tới, trump, không, nhắ...",the-gioi
4,"[trump, không, đưa, nga, vào, danh_sách, ưu_ti...",the-gioi


In [69]:
stopwords = []
with codecs.open(os.path.join('data', 'vietnamese-stopwords-dash.txt'), 'r', encoding='utf-8') as f:
    stopwords = f.readlines()

# you may also want to remove whitespace characters like `\n` at the end of each line
stopwords = [x.strip() for x in stopwords]

In [70]:
print(stopwords[1596])
print(len(df['content'][0]))

và
165


In [71]:
start_time = time.time()
df['content'] = df['content'].apply(lambda x: [word for word in x if word not in stopwords])
print('Finish removing stopwords: %f s' % (time.time() - start_time))
print(len(df['content'][0]))
df.head()

Finish removing stopwords: 700.040000 s
106


Unnamed: 0,content,label
0,"[nhà_vua, hoàng_hậu, nhật_bản, thăm, cố_đô, hu...",the-gioi
1,"[tuần_dương_hạm, ngầm, xấu_số, hải_quân, pháp,...",the-gioi
2,"[máy_bay, chở, gia_đình, bin_laden, rơi, hạ_cá...",the-gioi
3,"[putin, gửi, chúc, trump, nhắc, obama, thông_đ...",the-gioi
4,"[trump, nga, danh_sách, ưu_tiên, lầu_năm_góc, ...",the-gioi


In [72]:
# replace content with new text (remove stopword)
df['content'] = df['content'].apply(lambda x: ' '.join(x))

In [73]:
df.head()

Unnamed: 0,content,label
0,nhà_vua hoàng_hậu nhật_bản thăm cố_đô huế đông...,the-gioi
1,tuần_dương_hạm ngầm xấu_số hải_quân pháp tàu_n...,the-gioi
2,máy_bay chở gia_đình bin_laden rơi hạ_cánh tốc...,the-gioi
3,putin gửi chúc trump nhắc obama thông_điệp tổn...,the-gioi
4,trump nga danh_sách ưu_tiên lầu_năm_góc danh_s...,the-gioi


In [74]:
df.to_csv(os.path.join('data', '3_removed_stopwords_data.csv'), index=False, encoding='utf-8')

### Step 1b: add index into document, then split into seperate file to prevent the lack of memory

In [2]:
df = pd.read_csv(os.path.join('data', '3_removed_stopwords_data.csv'), encoding='utf-8')
df['doc_id'] = range(1, len(df) + 1)
df.head()

Unnamed: 0,content,label,doc_id
0,nhà_vua hoàng_hậu nhật_bản thăm cố_đô huế đông...,the-gioi,1
1,tuần_dương_hạm ngầm xấu_số hải_quân pháp tàu_n...,the-gioi,2
2,máy_bay chở gia_đình bin_laden rơi hạ_cánh tốc...,the-gioi,3
3,putin gửi chúc trump nhắc obama thông_điệp tổn...,the-gioi,4
4,trump nga danh_sách ưu_tiên lầu_năm_góc danh_s...,the-gioi,5


In [4]:
df.to_csv(os.path.join('data', '4_index_data.csv'), 
          columns=['doc_id', 'content', 'label'], 
          index=False, encoding='utf-8')

In [8]:
for topic in df.label.unique():
    df_filter = df[df.label == topic]
    file_name = '4_index_data.%s.csv' % topic
    df_filter.to_csv(os.path.join('data', file_name),
                     columns=['doc_id', 'content', 'label'], 
                     index=False, encoding='utf-8')

### Step 2. Calculate tf-idf for each document

In [2]:
df = pd.read_csv(os.path.join('data', '4_index_data.csv'), encoding='utf-8')
df.head()

Unnamed: 0,doc_id,content,label
0,1,nhà_vua hoàng_hậu nhật_bản thăm cố_đô huế đông...,the-gioi
1,2,tuần_dương_hạm ngầm xấu_số hải_quân pháp tàu_n...,the-gioi
2,3,máy_bay chở gia_đình bin_laden rơi hạ_cánh tốc...,the-gioi
3,4,putin gửi chúc trump nhắc obama thông_điệp tổn...,the-gioi
4,5,trump nga danh_sách ưu_tiên lầu_năm_góc danh_s...,the-gioi


In [3]:
start_time = time.time()

corpus = df['content']
vectorizer = TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus)
words = vectorizer.get_feature_names()

print('Finish calculating tfidf: %f s' % (time.time() - start_time))

Finish calculating tfidf: 13.423000 s


In [4]:
topics = df['label'].unique()
topics

array([u'the-gioi', u'khoa-hoc', u'du-lich', u'giao-duc', u'the-thao',
       u'thoi-su', u'phap-luat', u'so-hoa', u'oto-xe-may', u'giai-tri',
       u'kinh-doanh'], dtype=object)

### Step 3. Create dictionary - save file

In [5]:
sorted_words = sorted(words)
word_to_index = dict([(w, i) for i, w in enumerate(sorted_words)])

with open(os.path.join('data', 'dic_words.txt'), 'w') as f:
    for i, w in enumerate(sorted_words):
        f.write('%s,%s\n' % (w.encode('utf-8'), i))

### Step 4. Get top N words - Replace word by index (of dictionary) - KEEP order - then save file

In [76]:
# ref: http://stackoverflow.com/questions/10337533/a-fast-way-to-find-the-largest-n-elements-in-an-numpy-array
# but I see that numpy works better than bottleneck
start_time = time.time()
bn.argpartition(-X[0,:].toarray()[0], 400)[:400]
print('Total time: %fs' % (time.time() - start_time))

Total time: 0.007000s


In [137]:
start_time = time.time()
top_index_freq_words = np.argpartition(-X[0,:].toarray()[0], 400).astype(np.int32)

start_time = time.time()
df.loc[0, 'top_words'] = operator.itemgetter(*top_index_freq_words)(words) 
#list(words[top_index_freq_words]) 
#[words[index] for index in top_index_freq_words]
print('Total time: %fs' % (time.time() - start_time))

Total time: 0.118000s


In [95]:
class TextThread (threading.Thread):
    def __init__(self, thread_id, name, nb_top_words, idx_from, idx_to):
        threading.Thread.__init__(self)
        self.threadID = thread_id
        self.name = name
        self.nb_top_words = nb_top_words
        self.idx_from = idx_from
        self.idx_to = idx_to
    def run(self):
        print("Starting %s - from %d to %d\n" % (self.name, self.idx_from, self.idx_to))        
        get_top_n_words(self.name, self.nb_top_words, self.idx_from, self.idx_to)

In [92]:
def get_top_n_words(thread_name, nb_top_words, idx_from, idx_to):
    start_time = time.time()
    for i in range(idx_from, idx_to):
        doc_id = df.doc_id[i]
        top_index_freq_words = np.argpartition(-X[doc_id - 1,:].toarray()[0], nb_top_words)[:nb_top_words]
        df.at[i, 'top_words'] = list(operator.itemgetter(*top_index_freq_words)(words))
        print('%s - document %d' % (thread_name, doc_id))
        
    print('Total time for %s: %fs\n' % (thread_name, time.time() - start_time))

In [62]:
# nb_top_words = 400

# df = pd.read_csv(os.path.join('data', '4_index_data.du-lich.csv'), encoding='utf-8')
# df['top_words'] = None
# start_time = time.time()

# # Create 100 docs per thread
# doc_per_thread = 100
# for i in range(0, len(df), doc_per_thread):
#     # define idx_to
#     if i + doc_per_thread > len(df) - 1:
#         idx_to = len(df) - 1
#     else:
#         idx_to = i + doc_per_thread
    
#     get_top_n_words('thread_name', nb_top_words, i, idx_to)
#     break

In [111]:
# if word in content belong to top_words, replace it by index of word (in dictionary), otherwise, remove it
def replace_content_by_index():
    start_time = time.time()
    df['content'] = df['content'].apply(lambda x: x.split(' '))
    print('Splitting content into words in %fs\n' % (time.time() - start_time))
    
    start_time = time.time()
    for i in range(0, len(df)):
        if len(df.loc[i, 'top_words']) == 0:
            print('WARN: there is no topwords for doc %d' % i)
            continue

        index_of_top_words = []
        for word in df.loc[i, 'content']:
            if word in df.loc[i, 'top_words']:
                index_of_top_words.append(str(word_to_index[word]))

        df.loc[i, 'content'] = ' '.join(index_of_top_words)

    print('Done replace_content_by_index in %fs\n' % (time.time() - start_time))

In [112]:
nb_top_words = 400
doc_per_thread = 1000
# for each topic, read corresponding file
del df
for topic in topics:
    df = pd.read_csv(os.path.join('data', ('4_index_data.%s.csv' % topic)), encoding='utf-8')
    df['top_words'] = None

    start_time = time.time()
    threads = []

    # Create 1000 docs per thread
    for i in range(0, len(df), doc_per_thread):
        # define idx_to
        if i + doc_per_thread > len(df) - 1:
            idx_to = len(df)
        else:
            idx_to = i + doc_per_thread
        
        thread = TextThread(i + 1, 'thread-%d' % (i + 1), n, i, idx_to)
        threads.append(thread)
        thread.start()
        
    # Wait for all threads to complete
    for t in threads:
        t.join()

    # replace content by index of top freq words
    replace_content_by_index()
    
    # save result to file
    df.to_csv(os.path.join('data', 'matrix_data.%s.csv' % (topic)), 
              columns=['doc_id', 'content', 'label'],
              index=False, encoding='utf-8')
    print("TOTAL TIME FOR TOPIC '%s': %fs\n" % (topic, time.time() - start_time))

Starting thread-0
 - from 0 to 1000
Starting thread-1
 - from 1000 to 2000Starting thread-2
 - from 2000 to 3000

Starting thread-3
 - from 3000 to 4000Starting thread-4
 - from 4000 to 4479
thread-0 - document 1thread-1 - document 1001thread-2 - document 2001
thread-3 - document 3001


thread-4 - document 4001
thread-0 - document 2thread-1 - document 1002thread-2 - document 2002
thread-3 - document 3002


thread-4 - document 4002
thread-0 - document 3thread-1 - document 1003thread-2 - document 2003
thread-3 - document 3003


thread-4 - document 4003
thread-0 - document 4 thread-2 - document 2004
thread-3 - document 3004
thread-1 - document 1004
thread-4 - document 4004
thread-0 - document 5
thread-2 - document 2005
thread-3 - document 3005
thread-1 - document 1005
thread-4 - document 4005
thread-0 - document 6
thread-2 - document 2006
thread-3 - document 3006
thread-1 - document 1006
thread-4 - document 4006
thread-0 - document 7
thread-2 - document 2007
thread-3 - document 3007
threa