In [1]:
import numpy as np
from JapaneseTokenizer import MecabWrapper

## Convert Text to Paragraphs

In [2]:
def text2sentences(text):
    list_sentence = [sentence.strip() for sentence in text.split('。') if sentence]
    return list_sentence

## Tokenizer (Mecab)

In [3]:
def sent2token(sentence):
    mecab_wrapper = MecabWrapper(dictType='neologd')
    return mecab_wrapper.tokenize(sentence=sentence, return_list=1)

## TF-IDF
$tf(t,d) = \frac{f(t,d)}{max{f(w,d): w \in d}}$  
$idf(t,D) = log\frac{|D|}{|{d \in D: t \in d}|}$  
$tfidf(t,d,D) = tf(t,d) * idf(t,D)$

In [4]:
def compute_TF(sent_dict):
    tf_dict = {}
    max_val = max(sent_dict.values())
    for word, value in sent_dict.items():
        tf_dict[word] = value/max_val
    
    return tf_dict

In [5]:
def compute_IDF(doc_list):
    import math
    idf_dict = {}
    N = len(doc_list)
    
    #count number of paragraphs that contain this word
    idf_dict = dict.fromkeys(doc_list[0].keys(), 0)
    for doc in doc_list:
        for word, count in doc.items():
            if count > 0:
                idf_dict[word] += 1
                
    for word, count in idf_dict.items():
        idf_dict[word] = math.log(N/float(count))
        
    return idf_dict

In [6]:
def compute_TFIDF(tf_bow, idfs):
    tfidf = {}
    for word, val in tf_bow.items():
        tfidf[word] = val*idfs[word]
    return tfidf

## doc2dicts

In [7]:
def doc2dicts(list_sentence):
    # Find all Bag of words
    bows = []
    for par in list_sentence:
        idx = list_sentence.index(par)
        bows.append(sent2token(par))
    
    # Create dictionary of all words
    total_word_dict = set()
    for bow in bows:
        total_word_dict = total_word_dict.union(bow)
    
    # Create dictionary for each sentence
    word_dict = []
    for i in range(len(list_sentence)):
        word_dict.append(dict.fromkeys(total_word_dict, 0))
    for bow in bows:
        idx = bows.index(bow)
        for word in bow:
            word_dict[idx][word] +=1
    
    return total_word_dict, word_dict

## Sample

In [8]:
text = """２月２７～２８日の米朝首脳会談で、トランプ米大統領が北朝鮮の金正恩キムジョンウン朝鮮労働党委員長に日本人拉致問題を提起したのは、初日に行われた１対１の会談の冒頭だったことがわかった。

　複数の日本政府関係者が明らかにした。

　安倍首相は２月２０日のトランプ氏との電話会談で、拉致問題を正恩氏に提起するよう要請した。トランプ氏の発言は、これに配慮したものとみられる。正恩氏は核・ミサイル問題が最初の議題と想定していたのか、その場で「驚いた表情」を見せたという。

　トランプ氏は１対１の会談に続き、２７日の夕食会でも拉致問題を取り上げた。日本政府は「首相の注文通り」（関係者）と歓迎している。政府は日米連携をテコに日朝の首脳による直接対話につなげ、拉致問題を打開したい考えだ。"""

In [9]:
list_sentence = text2sentences(text)

In [10]:
total_word_dict, word_dict = doc2dicts(list_sentence)

In [11]:
tf_bows = []
for i in range(len(word_dict)):
    tf_bows.append(compute_TF(word_dict[i]))

In [12]:
idfs = compute_IDF(word_dict)

In [13]:
tfidf_list = []

In [14]:
for i in range(len(word_dict)):
    tfidf_list.append(compute_TFIDF(tf_bows[i], idfs))

In [20]:
max(tfidf_list[5].values())

1.3862943611198906

In [21]:
from heapq import nlargest
ten_largest = nlargest(10, tfidf_list[5], key=tfidf_list[0].get)

In [22]:
ten_largest

['1', 'こと', '金正恩', '日本人拉致', '米朝首脳会談', '行う', 'わかる', '冒頭', '2月', '朝鮮労働党委員長']

In [23]:
import pandas as pd
df = pd.DataFrame(tfidf_list)

In [24]:
df

Unnamed: 0,),1,2728,27日,2月,2月20日,、,「,」,」(,...,議題,通り,連携,配慮,金正恩,関係者,電話会談,首相,首脳,驚く
0,0.0,0.554518,0.415888,0.0,0.415888,0.0,0.115073,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.415888,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.039721,0.143841,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.039721,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.287682,0.0,0.0,0.0,...,0.0,0.0,0.0,2.079442,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.095894,0.462098,0.693147,0.0,...,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147
5,0.0,1.386294,0.0,1.039721,0.0,0.0,0.143841,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2.079442,0.0,0.0,0.0,0.0,0.0,0.0,1.386294,0.0,2.079442,...,0.0,2.079442,0.0,0.0,0.0,1.386294,0.0,2.079442,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.143841,0.0,0.0,0.0,...,0.0,0.0,1.039721,0.0,0.0,0.0,0.0,0.0,1.039721,0.0


In [26]:
list_sentence[5]

'トランプ氏は１対１の会談に続き、２７日の夕食会でも拉致問題を取り上げた'