# Term Frequency / Inverse Document Frequency (TF-IDF)

In [1]:
from collections import Counter

import pandas as pd

In [2]:
# Toy corpus: five short sentences, each treated as one document.
corpus = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]

In [3]:
# Tokenize every sentence on whitespace; one list of words per document.
word_lists = [sentence.split() for sentence in corpus]
word_lists

[['the', 'house', 'had', 'a', 'tiny', 'little', 'mouse'],
 ['the', 'cat', 'saw', 'the', 'mouse'],
 ['the', 'mouse', 'ran', 'away', 'from', 'the', 'house'],
 ['the', 'cat', 'finally', 'ate', 'the', 'mouse'],
 ['the', 'end', 'of', 'the', 'mouse', 'story']]

In [4]:
def count_words(word_lists):
    """Build a bag-of-words count matrix from tokenized documents.

    Parameters
    ----------
    word_lists : list of list of str
        One list of tokens per document.

    Returns
    -------
    pandas.DataFrame
        One row per document (index 0..n-1, in input order) and one column
        per distinct word, with columns ordered by first appearance in the
        corpus. A cell holds how many times the word occurs in that
        document, or NaN when the word does not occur in it. An empty
        ``word_lists`` yields an empty frame instead of raising.
    """
    # Count each document in one pass and build the frame once, rather than
    # growing a one-row frame column by column per document and concatenating.
    # Iterating the tokens in order (not a set) keeps column order deterministic.
    records = [dict(Counter(words)) for words in word_lists]
    return pd.DataFrame(records)
            
# Document-term count matrix for the toy corpus (one row per sentence).
bag_of_words = count_words(word_lists)
bag_of_words

Unnamed: 0,house,had,a,tiny,little,mouse,the,cat,saw,away,ran,from,ate,finally,end,story,of
0,1.0,1.0,1.0,1.0,1.0,1,1,,,,,,,,,,
1,,,,,,1,2,1.0,1.0,,,,,,,,
2,1.0,,,,,1,2,,,1.0,1.0,1.0,,,,,
3,,,,,,1,2,1.0,,,,,1.0,1.0,,,
4,,,,,,1,2,,,,,,,,1.0,1.0,1.0


In [7]:
def tf(bag_of_words):
    """Return each word's share of all word occurrences in the corpus.

    Counts are summed over every row (document) of ``bag_of_words`` and
    divided by the grand total, so the result is one corpus-wide value per
    word rather than a per-document frequency. NaN cells are skipped by the
    sums.

    Returns
    -------
    pandas.Series
        Indexed by word (the frame's columns).
    """
    occurrences = bag_of_words.sum()
    return occurrences / occurrences.sum()

# Corpus-wide term frequencies, most frequent word first.
tf(bag_of_words).sort_values(ascending = False)

the        0.290323
mouse      0.161290
house      0.064516
cat        0.064516
had        0.032258
a          0.032258
tiny       0.032258
little     0.032258
of         0.032258
story      0.032258
away       0.032258
ran        0.032258
from       0.032258
ate        0.032258
finally    0.032258
end        0.032258
saw        0.032258
dtype: float64

In [8]:
from math import log

In [10]:
def idf(bag_of_words):
    """Return the inverse document frequency of every word.

    For each column (word) this computes ``log(n_docs / doc_freq)`` with the
    natural logarithm, where ``n_docs`` is the number of rows and
    ``doc_freq`` is the number of rows in which the word's count is
    strictly positive.

    Parameters
    ----------
    bag_of_words : pandas.DataFrame
        Document-term counts, one row per document. Absent words may be
        encoded either as NaN or as an explicit 0.

    Returns
    -------
    pandas.Series
        Indexed by word. A word present in every document scores 0; a word
        present in no document scores ``inf``.
    """
    n_docs = bag_of_words.shape[0]
    # `NaN > 0` is False, so missing cells and explicit zeros are both treated
    # as "word not in this document". Testing only for non-null would count a
    # zero-filled cell as an occurrence and understate the IDF.
    doc_freq = bag_of_words.gt(0).sum()
    return (n_docs / doc_freq).apply(log)

# Inverse document frequencies, largest first; words found in every sentence score 0.
idf(bag_of_words).sort_values(ascending = False)

of         1.609438
away       1.609438
had        1.609438
a          1.609438
tiny       1.609438
little     1.609438
story      1.609438
saw        1.609438
ran        1.609438
from       1.609438
ate        1.609438
finally    1.609438
end        1.609438
cat        0.916291
house      0.916291
the        0.000000
mouse      0.000000
dtype: float64

In [11]:
def tf_idf(bag_of_words):
    """Return the per-word product of ``tf`` and ``idf`` for the bag of words.

    Both factors are Series indexed by word, so the multiplication aligns
    them on the word labels.
    """
    term_freq = tf(bag_of_words)
    inverse_doc_freq = idf(bag_of_words)
    return term_freq * inverse_doc_freq

# TF-IDF scores, highest first.
tf_idf(bag_of_words).sort_values(ascending = False)

house      0.059116
cat        0.059116
story      0.051917
had        0.051917
a          0.051917
tiny       0.051917
little     0.051917
of         0.051917
away       0.051917
ran        0.051917
from       0.051917
ate        0.051917
finally    0.051917
end        0.051917
saw        0.051917
the        0.000000
mouse      0.000000
dtype: float64