In [7]:
import math
import jieba
import jieba.posseg as psg
from gensim import corpora, models
from jieba import analyse
import functools
import numpy as np

In [2]:
def get_stopword_list(stop_word_path='./stopword.txt', encoding=None):
    """Load the stop-word list, one word per line.

    Args:
        stop_word_path: Path of the stop-word file. Defaults to
            './stopword.txt', the location this notebook has always read.
        encoding: Text encoding handed to ``open``. ``None`` keeps the
            platform default, which is what the previous version used.

    Returns:
        list[str]: Every line of the file, in file order, with newline
        characters removed. Blank lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even when reading fails; the
    # earlier bare open() call never closed it explicitly.
    with open(stop_word_path, encoding=encoding) as stop_word_file:
        return [line.replace('\n', '') for line in stop_word_file]

In [3]:
def seg_to_list(sentence, pos=False):
    """Segment ``sentence`` with jieba.

    Args:
        sentence: Text to segment.
        pos: When truthy, segment with ``jieba.posseg.cut``; otherwise use
            the plain ``jieba.cut`` tokenizer.

    Returns:
        Whatever the selected jieba ``cut`` call returns, unchanged.
    """
    if pos:
        return psg.cut(sentence)
    return jieba.cut(sentence)

In [4]:
def word_filter(seg_list, pos=False, stopwords=None):
    """Drop stop words, one-character tokens and (optionally) non-nouns.

    Args:
        seg_list: Iterable of tokens. With ``pos`` falsy each item is the
            word itself; with ``pos`` truthy each item must expose ``.word``
            and ``.flag`` attributes.
        pos: When truthy, keep only tokens whose ``flag`` starts with 'n'.
            When falsy, every token is treated as if its flag were 'n', so
            no flag-based filtering takes place.
        stopwords: Optional iterable of words to discard. When ``None`` the
            list is loaded through ``get_stopword_list()``; passing it in
            lets a caller load the file once and reuse it across documents.

    Returns:
        list[str]: The surviving words in their original order (duplicates
        are kept).
    """
    if stopwords is None:
        stopwords = get_stopword_list()
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list for every token.
    stopword_set = set(stopwords)

    filter_list = []
    for seg in seg_list:
        if pos:
            word, flag = seg.word, seg.flag
        else:
            word, flag = seg, 'n'
        # Was `flag.startwith`, which does not exist on str and raised
        # AttributeError for every token.
        if not flag.startswith('n'):
            continue
        if len(word) > 1 and word not in stopword_set:
            filter_list.append(word)
    return filter_list

In [5]:
def load_data(pos=False, corpus_path='./corpus.txt'):
    """Read the corpus and turn every line into a filtered word list.

    Each line of the file is treated as one document: it is stripped of
    surrounding whitespace, segmented with ``seg_to_list`` and cleaned with
    ``word_filter``.

    Args:
        pos: Forwarded to both ``seg_to_list`` and ``word_filter``.
        corpus_path: Path of the corpus file, one document per line.

    Returns:
        list[list[str]]: One filtered word list per line of the file, in
        file order. An empty file yields an empty list.

    Raises:
        OSError: If the corpus file cannot be opened.
    """
    doc_list = []
    # The context manager closes the corpus file when the loop finishes or
    # fails; the earlier bare open() call never closed it explicitly.
    with open(corpus_path) as corpus_file:
        for line in corpus_file:
            content = line.strip()
            seg_list = seg_to_list(content, pos)
            doc_list.append(word_filter(seg_list, pos))
    return doc_list

In [6]:
def train_idf(doc_list):
    """Compute smoothed inverse document frequencies for a corpus.

    Args:
        doc_list: Sequence of documents, each an iterable of hashable words.

    Returns:
        tuple: ``(idf_dic, default_idf)`` where ``idf_dic`` maps every word
        seen in the corpus to ``log(N / (1 + df))`` (``N`` = number of
        documents, ``df`` = number of documents containing the word) and
        ``default_idf`` is ``log(N)``, intended for out-of-vocabulary words.

    Raises:
        ValueError: From ``math.log`` when ``doc_list`` is empty.
    """
    total_docs = len(doc_list)

    # Number of documents each word appears in (set() counts a word once
    # per document regardless of how often it repeats).
    doc_freq = {}
    for doc in doc_list:
        for word in set(doc):
            if word in doc_freq:
                doc_freq[word] += 1
            else:
                doc_freq[word] = 1

    # Convert the document frequencies to idf values.
    smoothed_idf = {
        word: math.log(total_docs / (1 + freq))
        for word, freq in doc_freq.items()
    }

    # Fallback idf for words that never occurred in the corpus.
    oov_idf = math.log(total_docs)
    return smoothed_idf, oov_idf
    

In [8]:
def cmp(e1, e2):
    """Three-way comparison of two indexable pairs.

    The pairs are ordered by the item at index 1 first. When those items
    are equal, the tie is broken by comparing the two concatenations
    ``e1[0] + e2[0]`` and ``e2[0] + e1[0]``.

    Args:
        e1: First pair; ``e1[1]`` must support subtraction and ``e1[0]``
            must support ``+`` with ``e2[0]``.
        e2: Second pair, same requirements.

    Returns:
        The non-zero result of ``np.sign(e1[1] - e2[1])`` when the index-1
        items differ; otherwise the int 1, 0 or -1 from the tie-break.
    """
    score_order = np.sign(e1[1] - e2[1])
    if score_order != 0:
        return score_order

    # Index-1 items are equal: fall back to the concatenation tie-break.
    forward = e1[0] + e2[0]
    backward = e2[0] + e1[0]
    if forward > backward:
        return 1
    if forward == backward:
        return 0
    return -1

In [9]:
class TfIdf(object):
    """TF-IDF keyword extractor for a single, already tokenized document.

    Attributes are set in ``__init__``:
        word_list: The document's words.
        idf_dic / default_idf: Per-word idf values and the fallback used for
            words missing from ``idf_dic``.
        tf_dict: Term frequency of every distinct word in ``word_list``.
        keyword_num: How many top-ranked words ``get_tfidf`` prints.
    """

    def __init__(self, idf_dic, default_idf, word_list, keyword_num):
        """Store the inputs and pre-compute the term frequencies.

        Args:
            idf_dic: Mapping word -> idf value.
            default_idf: idf value used for words absent from ``idf_dic``.
            word_list: Words of the document to analyse.
            keyword_num: Number of keywords ``get_tfidf`` prints.
        """
        self.word_list = word_list
        self.idf_dic, self.default_idf = idf_dic, default_idf
        self.tf_dict = self.get_tf_dic()
        self.keyword_num = keyword_num

    def get_tf_dic(self):
        """Return a dict mapping each distinct word to count / total words.

        An empty ``word_list`` yields an empty dict (no division happens).
        """
        tf_dict = {}
        for word in self.word_list:
            tf_dict[word] = tf_dict.get(word, 0) + 1
        tt_count = len(self.word_list)
        # Normalise the raw counts. This loop used to iterate
        # `word_list.items()`, an undefined name (and a list has no
        # .items()), so constructing the class always failed.
        for word, count in tf_dict.items():
            tf_dict[word] = count / tt_count
        return tf_dict

    def get_tfidf(self):
        """Print the ``keyword_num`` highest tf-idf words on one line.

        Each word is printed followed by "/ "; the line is terminated with
        a newline. Words are ranked in descending order using the
        module-level ``cmp`` comparator. Nothing is returned.
        """
        tfidf_dict = {}
        for word in self.word_list:
            tf = self.tf_dict.get(word, 0)
            idf = self.idf_dic.get(word, self.default_idf)
            tfidf_dict[word] = tf * idf
        # Was `tfidf_dic.items()`, a misspelling of the local dict that
        # raised NameError.
        ranked = sorted(tfidf_dict.items(), key=functools.cmp_to_key(cmp), reverse=True)
        for word, _score in ranked[:self.keyword_num]:
            print(word + "/ ", end='')
        print()