# TF-IDF

In [1]:
docs = ["The duck loves to eat the worm",
        "The worm doesn't like the early bird",
        "The bird loves to get up early to get the worm",
        "The bird gets the worm from the early duck",
        "The duck and the birds are so different from each other but one thing they have in common is that they both get the worm"
]

In [2]:
vocab = set()

for doc in docs:
    for w in doc.split():
        vocab.add(w)
        
vocab = sorted(list(vocab))
print(vocab)

['The', 'and', 'are', 'bird', 'birds', 'both', 'but', 'common', 'different', "doesn't", 'duck', 'each', 'early', 'eat', 'from', 'get', 'gets', 'have', 'in', 'is', 'like', 'loves', 'one', 'other', 'so', 'that', 'the', 'they', 'thing', 'to', 'up', 'worm']


In [3]:
import pandas as pd

In [4]:
from math import log

N = len(docs)

def tf(f,d):
    return d.count(f)

def idf(t):
    df = 0
    for doc in docs:
        df += t in doc
        
    return log(N/(df+1))

def tf_idf(f,d):
    return tf(f,d) * idf(t)
            

In [5]:
result = []

for i in range(N):
    result.append([])
    d = docs[i]
    for j in range(len(vocab)):
        t = vocab[j]
        result[-1].append(tf(t,d))
        
tf_ = pd.DataFrame(result, columns=vocab)
tf_

Unnamed: 0,The,and,are,bird,birds,both,but,common,different,doesn't,...,one,other,so,that,the,they,thing,to,up,worm
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,1
1,1,0,0,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,2,1,1
3,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,1
4,1,1,1,1,1,1,1,1,1,0,...,1,1,1,1,5,2,1,0,0,1


In [23]:
tf_.astype(bool).sum(axis=0)

The          5
and          1
are          1
bird         4
birds        1
both         1
but          1
common       1
different    1
doesn't      1
duck         3
each         1
early        3
eat          1
from         2
get          3
gets         1
have         1
in           1
is           1
like         1
loves        2
one          1
other        1
so           1
that         1
the          5
they         1
thing        1
to           2
up           1
worm         5
dtype: int64

In [6]:
result = []

for j in range(len(vocab)):
    t = vocab[j]
    result.append(idf(t))
    
idf_ = pd.DataFrame(result, columns=['IDF'], index=vocab)
idf_

Unnamed: 0,IDF
The,-0.182322
and,0.916291
are,0.916291
bird,0.0
birds,0.916291
both,0.916291
but,0.916291
common,0.916291
different,0.916291
doesn't,0.916291


### TDM, TF-IDF

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
vector = CountVectorizer()
print(vector.fit_transform(docs).toarray())
print(dict(sorted(vector.vocabulary_.items(), key=lambda x: x[1])))

[[0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 2 0 0 1 0 1]
 [0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 1]
 [0 0 1 0 0 0 0 0 0 0 0 1 0 0 2 0 0 0 0 0 1 0 0 0 0 2 0 0 2 1 1]
 [0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 3 0 0 0 0 1]
 [1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 0 1 1 1 0 0 1 1 1 1 3 2 1 0 0 1]]
{'and': 0, 'are': 1, 'bird': 2, 'birds': 3, 'both': 4, 'but': 5, 'common': 6, 'different': 7, 'doesn': 8, 'duck': 9, 'each': 10, 'early': 11, 'eat': 12, 'from': 13, 'get': 14, 'gets': 15, 'have': 16, 'in': 17, 'is': 18, 'like': 19, 'loves': 20, 'one': 21, 'other': 22, 'so': 23, 'that': 24, 'the': 25, 'they': 26, 'thing': 27, 'to': 28, 'up': 29, 'worm': 30}


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
tfidf = TfidfVectorizer().fit(docs)
print(tfidf.fit_transform(docs).toarray())
print(dict(sorted(tfidf.vocabulary_.items(), key=lambda x:x[1])))

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.33974806 0.         0.
  0.50730499 0.         0.         0.         0.         0.
  0.         0.         0.40929047 0.         0.         0.
  0.         0.48346709 0.         0.         0.40929047 0.
  0.24173354]
 [0.         0.         0.33351139 0.         0.         0.
  0.         0.         0.49799252 0.         0.         0.33351139
  0.         0.         0.         0.         0.         0.
  0.         0.49799252 0.         0.         0.         0.
  0.         0.47459221 0.         0.         0.         0.
  0.2372961 ]
 [0.         0.         0.22460702 0.         0.         0.
  0.         0.         0.         0.         0.         0.22460702
  0.         0.         0.54116284 0.         0.         0.
  0.         0.         0.27058142 0.         0.         0.
  0.         0.31961949 0.         0.         0.54116284 0.33537869
  0.15980974]
 [0.         0.         0.2918132 

----

## TF-IDF 기반 Doc similarity

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=1)

X = vectorizer.fit_transform(docs)
vectorizer.get_feature_names_out()

array(['and', 'are', 'bird', 'birds', 'both', 'but', 'common',
       'different', 'doesn', 'duck', 'each', 'early', 'eat', 'from',
       'get', 'gets', 'have', 'in', 'is', 'like', 'loves', 'one', 'other',
       'so', 'that', 'the', 'they', 'thing', 'to', 'up', 'worm'],
      dtype=object)

In [41]:
X.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 2, 0, 0, 1, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 2, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 2, 0, 0, 2, 1, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 3, 0, 0, 0, 0, 1],
       [1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1,
        1, 1, 1, 3, 2, 1, 0, 0, 1]], dtype=int64)

In [42]:
X.toarray().transpose()

array([[0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 1, 1, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [1, 0, 0, 1, 1],
       [0, 0, 0, 0, 1],
       [0, 1, 1, 1, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 2, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [1, 0, 1, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [2, 2, 2, 3, 3],
       [0, 0, 0, 0, 2],
       [0, 0, 0, 0, 1],
       [1, 0, 2, 0, 0],
       [0, 0, 1, 0, 0],
       [1, 1, 1, 1, 1]], dtype=int64)

In [43]:
new_doc= ["The early bird gets the worm"]
new_doc_vec = vectorizer.transform(new_doc)
new_doc_vec.toarray()

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 2, 0, 0, 0, 0, 1]], dtype=int64)

In [45]:
# cosine similarity  

import numpy as np
from numpy import dot
from numpy.linalg import norm

In [46]:
def cos_sim(A,B):
    return np.dot(A,B)/(norm(A)*norm(B))

In [47]:
post_vec.shape, new_doc_vec.shape

((1, 31), (1, 31))

In [58]:
best_sim,best_i = 0,0

for i in range(N):
    post_vec = X.getrow(i)
    sim = cos_sim(post_vec.toarray(), new_doc_vec.toarray().T)
    
    print(f" post_vec : {i+1}, sim = {sim}, doc{i+1} = {docs[i]}")
    
    if sim > best_sim:
        best_sim = sim
        best_i = i
        
# 가장 유사한 문장 
print(docs[best_i])
    

 post_vec : 1, sim = [[0.58925565]], doc1 = The duck loves to eat the worm
 post_vec : 2, sim = [[0.82495791]], doc2 = The worm doesn't like the early bird
 post_vec : 3, sim = [[0.60024505]], doc3 = The bird loves to get up early to get the worm
 post_vec : 4, sim = [[0.91287093]], doc4 = The bird gets the worm from the early duck
 post_vec : 5, sim = [[0.43082022]], doc5 = The duck and the birds are so different from each other but one thing they have in common is that they both get the worm
The bird gets the worm from the early duck


### 형태소 분석기를 통해 특정 단어로만 유사도 평가

In [69]:
token = ["bird", "duck", "worm", "early", "get", "love"]
token

['bird', 'duck', 'worm', 'early', 'get', 'love']

In [72]:
token_docs = []

for doc in docs:
    token_docs.append([])
    for w in doc.split():
        if w in token:
            token_docs[-1].append(w)
            
token_docs

[['duck', 'worm'],
 ['worm', 'early', 'bird'],
 ['bird', 'get', 'early', 'get', 'worm'],
 ['bird', 'worm', 'early', 'duck'],
 ['duck', 'get', 'worm']]

In [75]:
token_docs = [' '.join(t) for t in token_docs]
token_docs

['duck worm',
 'worm early bird',
 'bird get early get worm',
 'bird worm early duck',
 'duck get worm']

In [76]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(token_docs)
vectorizer.get_feature_names_out()

array(['bird', 'duck', 'early', 'get', 'worm'], dtype=object)

In [77]:
new_doc= ["The early bird gets the worm"]
new_doc_vec = vectorizer.transform(new_doc)
new_doc_vec.toarray()

array([[1, 0, 1, 0, 1]], dtype=int64)

In [78]:
best_sim,best_i = 0,0

for i in range(N):
    post_vec = X.getrow(i)
    sim = cos_sim(post_vec.toarray(), new_doc_vec.toarray().T)
    
    print(f" post_vec : {i+1}, sim = {sim}, doc{i+1} = {docs[i]}")
    
    if sim > best_sim:
        best_sim = sim
        best_i = i
        
# 가장 유사한 문장 
print(docs[best_i])

 post_vec : 1, sim = [[0.40824829]], doc1 = The duck loves to eat the worm
 post_vec : 2, sim = [[1.]], doc2 = The worm doesn't like the early bird
 post_vec : 3, sim = [[0.65465367]], doc3 = The bird loves to get up early to get the worm
 post_vec : 4, sim = [[0.8660254]], doc4 = The bird gets the worm from the early duck
 post_vec : 5, sim = [[0.33333333]], doc5 = The duck and the birds are so different from each other but one thing they have in common is that they both get the worm
The worm doesn't like the early bird
