# tf-idf
tf-idf (term frequency-inverse document frequency) 是一种统计方法，用以评估某个字词对于语料库中某一份文件的重要程度。字词的重要性会随着它在该文件中出现的次数成正比增加，但同时会随着它在整个语料库中出现的频率成反比下降。
## tf
在一份给定的文件里，tf 指的是某个给定的词在该文件中出现的频率，对于在某一特定文件里的词语 $t_i$ 来说，它的重要性表示为：
$$tf_{i,j} = \frac{n_{i,j}}{\sum_{k}n_{k,j}}$$
其中，$n_{i,j}$ 是该词在文件 $d_j$ 中出现的次数，分母是文件 $d_j$ 中所有字词的出现次数之和。
## idf
idf 是一个词语普遍重要性的度量。某一特定词语的 idf 可由总文件数目除以包含该词语的文件的数目，再将得到的商取对数得到（通常取以 10 为底的对数；下文代码使用 `math.log`，即自然对数，底数不同只会使所有 idf 值相差一个常数倍）：
$$idf_i = \log{\frac{|D|}{|\{j:t_i \in d_j\}|}}$$
其中
* $|D|$: 语料库中的文件总数
* $|\{j:t_i \in d_j\}|$: 包含词语 $t_i$ 的文件数目。如果该词语不在语料库中，分母就会为零，因此一般使用 $1+|\{j:t_i \in d_j\}|$ 作为分母。

然后 $tfidf_{i,j} = tf_{i,j} \cdot idf_i$

In [9]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import math

# Sample corpus. It is split into sentences further down, and each sentence
# is then treated as one "document" when the tf-idf scores are computed.
text1 = """
If you like tuna and tomato sauce- try combining the two.
It's really not as bad as it sounds.
If the Easter Bunny and the Tooth Fairy had babies would they take
your teeth and leave chocolate for you?
"""

In [12]:
def remove_string_special_characters(s):
    """
    Strip special characters from a string and normalize its whitespace.

    Every character that is neither a word character nor whitespace is
    deleted (not replaced by a space), and so is the underscore. Runs of
    whitespace, including newlines, collapse to a single space, and leading
    and trailing whitespace is removed.

    Args:
        s: The text to clean.

    Returns:
        The cleaned string, e.g. "It's not bad." becomes "Its not bad".
    """
    # Raw strings keep \w and \s from being parsed as (invalid) string
    # escapes, which newer Python versions warn about.
    # '_' is a word character as far as \w is concerned, so it has to be
    # listed explicitly to be removed.
    stripped = re.sub(r'[^\w\s]|_', '', s)

    # Change any run of whitespace to one space
    stripped = re.sub(r'\s+', ' ', stripped)

    # Remove start and end white spaces
    return stripped.strip()

def get_doc(text_sents_clean):
    """
    Build one metadata record per sentence.

    Each sentence is treated as a document. Ids are handed out as
    1, 2, 3, ... in input order.

    Args:
        text_sents_clean: Iterable of sentence strings.

    Returns:
        A list of dicts, each with 'doc_id' and 'doc_length' (the value
        count_words returns for that sentence).
    """
    return [
        {'doc_id': position, 'doc_length': count_words(sentence)}
        for position, sentence in enumerate(text_sents_clean, start=1)
    ]

def count_words(sent):
    """Return how many tokens word_tokenize produces for sent."""
    tokens = word_tokenize(sent)
    return len(tokens)

def create_freq_dict(sents):
    """
    Count lower-cased token occurrences for every sentence.

    Args:
        sents: Iterable of sentence strings; each one is a document.

    Returns:
        A list with one dict per sentence, holding 'doc_id' (1-based
        position in sents) and 'freq_dict' (lower-cased token -> count).
    """
    per_doc_counts = []
    for doc_number, sentence in enumerate(sents, start=1):
        counts = {}
        for token in word_tokenize(sentence):
            lowered = token.lower()
            if lowered in counts:
                counts[lowered] += 1
            else:
                counts[lowered] = 1
        per_doc_counts.append({'doc_id': doc_number, 'freq_dict': counts})

    return per_doc_counts

def computeTF(doc_info, freqDict_list):
    """
    Compute the term frequency of every term in every document.

    The score is the term's count divided by the length of its document.

    Args:
        doc_info: List of dicts with 'doc_length'. The record for document
            id n is read from index n - 1.
        freqDict_list: List of dicts with 'doc_id' and 'freq_dict'
            (term -> count).

    Returns:
        A list of dicts with 'doc_id', 'TF_score' and 'key' (the term),
        in the order the terms are encountered.
    """
    TF_scores = []
    for entry in freqDict_list:
        doc_id = entry['doc_id']
        for term, occurrences in entry['freq_dict'].items():
            # Document ids are 1-based, list positions are 0-based
            doc_length = doc_info[doc_id - 1]['doc_length']
            TF_scores.append({
                'doc_id': doc_id,
                'TF_score': occurrences / doc_length,
                'key': term,
            })

    return TF_scores

def computeIDF(doc_info, freqDict_list):
    """
    Compute the inverse document frequency of every term in every document.

    For each term the score is math.log(N / df), a natural logarithm, where
    N is len(doc_info) and df is the number of entries in freqDict_list
    whose 'freq_dict' contains the term.

    Args:
        doc_info: List with one record per document; only its length is used.
        freqDict_list: List of dicts with a 'freq_dict' (term -> count).

    Returns:
        A list of dicts with 'doc_id', 'IDF_score' and 'key' (the term).
        'doc_id' is the 1-based position of the entry in freqDict_list.
    """
    total_docs = len(doc_info)

    # Count once, up front, how many documents contain each term, instead of
    # rescanning every document for every (document, term) pair.
    doc_frequency = {}
    for entry in freqDict_list:
        for term in entry['freq_dict']:
            doc_frequency[term] = doc_frequency.get(term, 0) + 1

    IDF_scores = []
    for doc_id, entry in enumerate(freqDict_list, start=1):
        for term in entry['freq_dict']:
            # doc_frequency[term] is at least 1 here because the term occurs
            # in this very document, so the division cannot hit zero.
            IDF_scores.append({
                'doc_id': doc_id,
                'IDF_score': math.log(total_docs / doc_frequency[term]),
                'key': term,
            })

    return IDF_scores

def computeTFIDF(TF_scores, IDF_scores):
    """
    Combine TF and IDF scores into tf-idf scores.

    An IDF row and a TF row belong together when both their 'doc_id' and
    their 'key' are equal. The tf-idf score is the product of the two scores.

    Args:
        TF_scores: List of dicts with 'doc_id', 'TF_score' and 'key'.
        IDF_scores: List of dicts with 'doc_id', 'IDF_score' and 'key'.

    Returns:
        A list of dicts with 'doc_id', 'TFIDF_score' and 'key', in the order
        of IDF_scores. IDF rows without a matching TF row are left out.
    """
    # Index the TF rows by (doc_id, term) so each IDF row is resolved with a
    # single lookup instead of a scan over all TF rows. If a pair occurs more
    # than once, the last TF row wins.
    tf_lookup = {(row['doc_id'], row['key']): row['TF_score'] for row in TF_scores}

    TFIDF_scores = []
    for idf_row in IDF_scores:
        pair = (idf_row['doc_id'], idf_row['key'])
        if pair not in tf_lookup:
            # Append only on a real match; otherwise a result left over from
            # an earlier row would be emitted again for this one.
            continue
        TFIDF_scores.append({
            'doc_id': idf_row['doc_id'],
            'TFIDF_score': idf_row['IDF_score'] * tf_lookup[pair],
            'key': idf_row['key'],
        })

    return TFIDF_scores

# Split the sample text into sentences; each sentence acts as one document.
text_sents = sent_tokenize(text1)
# Strip special characters and normalize whitespace in every sentence.
text_sents_clean = [remove_string_special_characters(s) for s in text_sents]
# Per-document metadata: id and length.
doc_info = get_doc(text_sents_clean)

# Per-document term counts, then TF, IDF and their product.
freqDict_list = create_freq_dict(text_sents_clean)
TF_scores = computeTF(doc_info, freqDict_list)
IDF_scores = computeIDF(doc_info, freqDict_list)
TFIDF_scores = computeTFIDF(TF_scores, IDF_scores)
# Bare expression: shown as the cell's output when run in a notebook.
TFIDF_scores

[{'doc_id': 1, 'TFIDF_score': 0.036860464373469494, 'key': 'if'},
 {'doc_id': 1, 'TFIDF_score': 0.036860464373469494, 'key': 'you'},
 {'doc_id': 1, 'TFIDF_score': 0.09987384442437362, 'key': 'like'},
 {'doc_id': 1, 'TFIDF_score': 0.09987384442437362, 'key': 'tuna'},
 {'doc_id': 1, 'TFIDF_score': 0.036860464373469494, 'key': 'and'},
 {'doc_id': 1, 'TFIDF_score': 0.09987384442437362, 'key': 'tomato'},
 {'doc_id': 1, 'TFIDF_score': 0.09987384442437362, 'key': 'sauce'},
 {'doc_id': 1, 'TFIDF_score': 0.09987384442437362, 'key': 'try'},
 {'doc_id': 1, 'TFIDF_score': 0.09987384442437362, 'key': 'combining'},
 {'doc_id': 1, 'TFIDF_score': 0.036860464373469494, 'key': 'the'},
 {'doc_id': 1, 'TFIDF_score': 0.09987384442437362, 'key': 'two'},
 {'doc_id': 2, 'TFIDF_score': 0.13732653608351372, 'key': 'its'},
 {'doc_id': 2, 'TFIDF_score': 0.13732653608351372, 'key': 'really'},
 {'doc_id': 2, 'TFIDF_score': 0.13732653608351372, 'key': 'not'},
 {'doc_id': 2, 'TFIDF_score': 0.27465307216702745, 'key':