In [1]:
from __future__ import print_function

import codecs
import pickle as pk

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from textrank4zh import TextRank4Keyword, TextRank4Sentence
# Module-level TextRank4Keyword instance; chinese_word_cut() below calls its
# analyze() method and reads words_no_filter from it for every text.
tr4w = TextRank4Keyword()

## Read data

In [2]:
from sklearn.externals import joblib
# Each archive is expected to unpack into five objects (presumably post texts,
# post times, comment texts, comment times and a comment -> post index map;
# verify against the notebook that wrote these files).
# NOTE: joblib.load unpickles its input -- only load files you trust.
(
    kp_sum_posts,
    kp_posts_time,
    kp_sum_comments,
    kp_comments_time,
    kp_comments_to_post_index,
) = joblib.load("result/kp_all.pkl")
(
    yao_sum_posts,
    yao_posts_time,
    yao_sum_comments,
    yao_comments_time,
    yao_comments_to_post_index,
) = joblib.load("result/yao_all.pkl")

In [3]:
# Report how many posts and comments were loaded for each source.
for label, items in (
    ('kp posts num: ', kp_sum_posts),
    ('kp comments num: ', kp_sum_comments),
    ('yao posts num: ', yao_sum_posts),
    ('yao comments num: ', yao_sum_comments),
):
    print(label + str(len(items)))

kp posts num: 1526
kp comments num: 74705
yao posts num: 977
yao comments num: 15497


In [24]:
def _read_stopwords(path):
    """Return the lines of the UTF-8 text file at ``path``, right-stripped.

    One list entry is produced per line of the file, in file order; blank
    lines are kept as empty strings.
    """
    with open(path, encoding='utf-8') as handle:
        return [entry.rstrip() for entry in handle]


# The combined list is the Chinese stopwords followed by the English ones.
stopword_ch = _read_stopwords('data/stopwords.txt')
stopword_en = _read_stopwords('data/stopwords-en.txt')
stopword = stopword_ch + stopword_en
print('stopword len: ' + str(len(stopword)))


stopword len: 2506


## Word segmentation (jieba, via TextRank4Keyword)


In [25]:
def chinese_word_cut(mytext):
    """Segment ``mytext`` and return its tokens as one space-separated string.

    The module-level ``tr4w`` analyzer is run on the text with ``lower=True``
    and ``window=2``; the result is then built from ``tr4w.words_no_filter``.
    Each entry of ``words_no_filter`` is a sequence of words: the words are
    joined with single spaces and every entry is followed by one space, so a
    non-empty result always ends with a trailing space.

    Args:
        mytext: the raw text to segment.

    Returns:
        str: the space-separated tokens ('' when there are no entries).
    """
    tr4w.analyze(text=mytext, lower=True, window=2)
    return ''.join(' '.join(words) + ' ' for words in tr4w.words_no_filter)

In [26]:
def sentence_jieba(sentence_list):
    """Segment every text in ``sentence_list`` with ``chinese_word_cut``.

    Before each text is processed a progress line ``"<index>/<total>"`` is
    printed, where ``index`` starts at 0 and ``total`` is
    ``len(sentence_list)``.

    Args:
        sentence_list: sized sequence of raw texts.

    Returns:
        list: one segmented string per input text, in input order.
    """
    segmented = []
    for position, sentence in enumerate(sentence_list):
        print(str(position) + '/' + str(len(sentence_list)))
        segmented.append(chinese_word_cut(sentence))
    return segmented

In [75]:
# Segment every post and comment text of both sources with sentence_jieba().
# Each call prints one progress line per input text, so this cell produces a
# very large amount of output for the comment corpora.
kp_sum_posts_seg = sentence_jieba(kp_sum_posts)
kp_sum_comments_seg = sentence_jieba(kp_sum_comments)
yao_sum_posts_seg = sentence_jieba(yao_sum_posts)
yao_sum_comments_seg = sentence_jieba(yao_sum_comments)

## Remove English words and digits

In [83]:
import re
import copy
def sentence_clean(old_seg_list):
    """Remove digits and Latin-letter-initial tokens from segmented strings.

    Each input string is processed as follows:

    1. every character for which ``str.isdigit()`` is true is deleted;
    2. the remainder is split on single spaces;
    3. tokens whose first character matches ``[A-Z]`` case-insensitively are
       dropped (only the start of the token is tested, so a token that merely
       contains Latin letters later on is kept), as are empty tokens;
    4. the surviving tokens are re-joined with single spaces.

    Args:
        old_seg_list: iterable of space-separated token strings, e.g. the
            list returned by ``sentence_jieba`` or a 1-D numpy array of
            strings. It is not modified.

    Returns:
        list: a new list of cleaned ``str`` objects, one per input string and
        in the same order.
    """
    # Build a fresh list rather than copying the input and calling .tolist()
    # on it: plain lists have no .tolist(), so the previous version raised
    # AttributeError for list input.
    cleaned = []
    for seg_sentence in old_seg_list:
        without_digits = ''.join(ch for ch in seg_sentence if not ch.isdigit())
        kept_tokens = [
            token
            for token in without_digits.split(' ')
            if token and not re.match(r'[A-Z]+', token, re.I)
        ]
        cleaned.append(' '.join(kept_tokens))
    return cleaned

In [84]:
# Apply sentence_clean() to each segmented corpus. The cleaned lists are the
# documents the TF-IDF vectorizers are fitted on later in this notebook.
kp_sum_posts_clean_seg = sentence_clean(kp_sum_posts_seg)
kp_sum_comments_clean_seg = sentence_clean(kp_sum_comments_seg)
yao_sum_posts_clean_seg = sentence_clean(yao_sum_posts_seg)
yao_sum_comments_clean_seg = sentence_clean(yao_sum_comments_seg)

## Applying Term Weighting with TF-IDF

In [88]:
# Upper bound on the vocabulary size kept by each vectorizer.
n_features = 50000


def _make_tfidf_vectorizer():
    """Build a TfidfVectorizer with the settings shared by all four corpora."""
    return TfidfVectorizer(
        strip_accents='unicode',
        max_features=n_features,
        stop_words=stopword,
    )


def _fit_tfidf(vectorizer, documents):
    """Fit ``vectorizer`` on ``documents``, report the matrix shape, return the matrix."""
    matrix = vectorizer.fit_transform(documents)
    print( "Created %d X %d TF-IDF-normalized document-term matrix" % (matrix.shape[0], matrix.shape[1]) )
    return matrix


# One independent vectorizer (and therefore vocabulary) per corpus.
tf_vectorizer_kp_posts = _make_tfidf_vectorizer()
tf_vectorizer_yao_posts = _make_tfidf_vectorizer()
tf_vectorizer_kp_comments = _make_tfidf_vectorizer()
tf_vectorizer_yao_comments = _make_tfidf_vectorizer()

tf_kp_posts = _fit_tfidf(tf_vectorizer_kp_posts, kp_sum_posts_clean_seg)

tf_yao_posts = _fit_tfidf(tf_vectorizer_yao_posts, yao_sum_posts_clean_seg)

tf_kp_comments = _fit_tfidf(tf_vectorizer_kp_comments, kp_sum_comments_clean_seg)

tf_yao_comments = _fit_tfidf(tf_vectorizer_yao_comments, yao_sum_comments_clean_seg)

Created 1526 X 17121 TF-IDF-normalized document-term matrix
Created 977 X 13900 TF-IDF-normalized document-term matrix
Created 74705 X 50000 TF-IDF-normalized document-term matrix
Created 15497 X 21698 TF-IDF-normalized document-term matrix


In [90]:
def _vocabulary_of(vectorizer):
    """Return the fitted vectorizer's feature names after printing how many there are."""
    names = vectorizer.get_feature_names()
    print("Vocabulary has %d distinct terms" % len(names))
    return names


# Vocabulary of each fitted vectorizer, in the same order the corpora were fitted.
tf_kp_posts_feature_names = _vocabulary_of(tf_vectorizer_kp_posts)

tf_yao_posts_feature_names = _vocabulary_of(tf_vectorizer_yao_posts)

tf_kp_comments_feature_names = _vocabulary_of(tf_vectorizer_kp_comments)

tf_yao_comments_feature_names = _vocabulary_of(tf_vectorizer_yao_comments)

Vocabulary has 17121 distinct terms
Vocabulary has 13900 distinct terms
Vocabulary has 50000 distinct terms
Vocabulary has 21698 distinct terms


In [92]:
from sklearn.externals import joblib
# Persist, per source, the post and comment TF-IDF matrices together with
# their feature names and the cleaned documents they were fitted on.
kp_tfidf_bundle = (
    tf_kp_posts,
    tf_kp_posts_feature_names,
    kp_sum_posts_clean_seg,
    tf_kp_comments,
    tf_kp_comments_feature_names,
    kp_sum_comments_clean_seg,
)
yao_tfidf_bundle = (
    tf_yao_posts,
    tf_yao_posts_feature_names,
    yao_sum_posts_clean_seg,
    tf_yao_comments,
    tf_yao_comments_feature_names,
    yao_sum_comments_clean_seg,
)
joblib.dump(kp_tfidf_bundle, "result/tf_idf_kp_all.pkl")
joblib.dump(yao_tfidf_bundle, "result/tf_idf_yao_all.pkl")

['result/tf_idf_yao_all.pkl']