In [2]:
# TF-IDF - CountVectorizer

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: four three-word documents.
documents = ['Lee tells Kim', 'Kim tells Lee', 'Lee eats lunch', 'Kim eats dinner']

# Raw term counts: one row per document, one column per vocabulary term.
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(documents)
dtm_array = dtm.toarray()
vocabulary = vectorizer.get_feature_names_out()
print(vocabulary, '=' * 30, sep='\n')

# tf: term-frequency table, labelled with the vocabulary terms.
tf = pd.DataFrame(dtm_array, columns=vocabulary)
print(tf, '=' * 30, sep='\n')

# df: document frequency, i.e. in how many documents each term appears
# (any non-zero count marks the term as present in that document).
df = tf.astype(bool).sum()
print(df, '=' * 30, sep='\n')

# idf: smoothed inverse document frequency, log((D + 1) / (df + 1)) + 1,
# where D is the number of documents. The +1 terms keep the ratio finite and
# the trailing +1 keeps terms that occur in every document from vanishing.
D = tf.shape[0]
idf = 1 + np.log((1 + D) / (1 + df))
print(idf, '=' * 30, sep='\n')

# Un-normalised TF-IDF: scale each term column by that term's idf weight.
tfidf = tf.mul(idf, axis='columns')
tfidf

['dinner' 'eats' 'kim' 'lee' 'lunch' 'tells']
   dinner  eats  kim  lee  lunch  tells
0       0     0    1    1      0      1
1       0     0    1    1      0      1
2       0     1    0    1      1      0
3       1     1    1    0      0      0
dinner    1
eats      2
kim       3
lee       3
lunch     1
tells     2
dtype: int64
dinner    1.916291
eats      1.510826
kim       1.223144
lee       1.223144
lunch     1.916291
tells     1.510826
dtype: float64


Unnamed: 0,dinner,eats,kim,lee,lunch,tells
0,0.0,0.0,1.223144,1.223144,0.0,1.510826
1,0.0,0.0,1.223144,1.223144,0.0,1.510826
2,0.0,1.510826,0.0,1.223144,1.916291,0.0
3,1.916291,1.510826,1.223144,0.0,0.0,0.0


In [3]:
# L2-normalise every document (row) of the TF-IDF table so each row has unit
# Euclidean length.
row_norms = np.linalg.norm(tfidf, axis=1, keepdims=True)

# A document containing no vocabulary terms has norm 0; dividing by it would
# turn the whole row into NaN. Dividing by 1 instead leaves such rows all-zero,
# which is also how scikit-learn's L2 normalisation treats zero-norm rows.
tfidf1 = tfidf / np.where(row_norms == 0, 1.0, row_norms)

tfidf1

Unnamed: 0,dinner,eats,kim,lee,lunch,tells
0,0.0,0.0,0.53257,0.53257,0.0,0.657829
1,0.0,0.0,0.53257,0.53257,0.0,0.657829
2,0.0,0.553492,0.0,0.4481,0.702035,0.0
3,0.702035,0.553492,0.4481,0.0,0.0,0.0


In [6]:
# TF-IDF library - TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
# Same toy corpus as in the manual computation.
documents = ['Lee tells Kim', 'Kim tells Lee', 'Lee eats lunch', 'Kim eats dinner']

# Learn the vocabulary and idf weights from the corpus.
tfidfv = TfidfVectorizer()
tfidfv.fit(documents)

# Term -> column-index mapping, then the TF-IDF matrix as a dense array
# for comparison with the hand-computed table.
print(tfidfv.vocabulary_, '=' * 30, sep='\n')
print(tfidfv.transform(documents).toarray())

{'lee': 3, 'tells': 5, 'kim': 2, 'eats': 1, 'lunch': 4, 'dinner': 0}
[[0.         0.         0.53256952 0.53256952 0.         0.65782931]
 [0.         0.         0.53256952 0.53256952 0.         0.65782931]
 [0.         0.55349232 0.         0.44809973 0.70203482 0.        ]
 [0.70203482 0.55349232 0.44809973 0.         0.         0.        ]]
