# NLP / TF-IDF

In [35]:
import numpy as np
import pandas as pd

In [36]:
# Toy corpus: one string per document.
corpus = [
    'data science is one of the most important fields of science',
    'this is one of the best data science courses',
    'data scientists analyze data',
    'I woke up this morning',
]

In [37]:
def clear_list(lst):
    """Remove every occurrence of the token 'I' from a collection of words.

    The collection is modified in place and the same object is returned,
    so callers may use either the return value or the argument afterwards.

    Parameters
    ----------
    lst : list or set of str
        Any mutable collection that supports ``in`` and ``.remove()``.

    Returns
    -------
    The same ``lst`` object, now containing no 'I' entries.
    """
    # list.remove() only drops the first match, so keep going until none
    # are left (a set needs at most one pass).
    while 'I' in lst:
        lst.remove('I')

    return lst


In [38]:
# Vocabulary: every distinct space-separated token that appears in the corpus.
words_set = set()

for doc in corpus:
    words = doc.split(' ')
    words_set |= set(words)

# Drop the token 'I' from the vocabulary.
words_set = clear_list(words_set)

print(words_set)
print(len(words_set))



{'up', 'scientists', 'one', 'science', 'morning', 'most', 'fields', 'analyze', 'this', 'of', 'data', 'important', 'is', 'best', 'courses', 'woke', 'the'}
17


## Term Frequency (TF)

In [39]:
n_docs = len(corpus)          # number of documents -> rows
n_words_set = len(words_set)  # vocabulary size -> columns

# Start from an all-zero (documents x vocabulary) table; it is filled in below.
vocabulary = list(words_set)
df_tf = pd.DataFrame(
    data=np.zeros(shape=(n_docs, n_words_set)),
    columns=vocabulary,
)

df_tf

Unnamed: 0,up,scientists,one,science,morning,most,fields,analyze,this,of,data,important,is,best,courses,woke,the
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# Calculate the TF
# TF(word, doc) = (occurrences of word in doc) / (number of tokens in doc),
# where the token 'I' is removed before counting.

# Reset first so that re-running this cell does not add on top of the
# values left behind by a previous run.
df_tf.loc[:, :] = 0.0

for i_doc in range(n_docs):
    # Same tokenization as the vocabulary (split on single spaces).
    words = clear_list(corpus[i_doc].split(' '))

    for word in words:
        # Use .loc[row, column] rather than df[column][row]: chained
        # assignment may write to a temporary copy (SettingWithCopyWarning)
        # and never updates the frame under pandas Copy-on-Write.
        df_tf.loc[i_doc, word] += 1 / len(words)

df_tf

Unnamed: 0,up,scientists,one,science,morning,most,fields,analyze,this,of,data,important,is,best,courses,woke,the
0,0.0,0.0,0.090909,0.181818,0.0,0.090909,0.090909,0.0,0.0,0.181818,0.090909,0.090909,0.090909,0.0,0.0,0.0,0.090909
1,0.0,0.0,0.111111,0.111111,0.0,0.0,0.0,0.0,0.111111,0.111111,0.111111,0.0,0.111111,0.111111,0.111111,0.0,0.111111
2,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
3,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0


## Inverse Document Frequency (IDF)

In [43]:
idf = {}

# Tokenize each document once, up front. Only membership matters for the
# document frequency, so a set per document is enough. Split on single
# spaces so the tokens match the ones used for the vocabulary and the TF.
doc_tokens = [set(clear_list(doc.split(' '))) for doc in corpus]

for word in words_set:
    # Document frequency: number of documents that contain `word`
    # (not the total number of occurrences).
    doc_freq = sum(word in tokens for tokens in doc_tokens)

    # IDF(word) = log10(N / df(word))
    idf[word] = np.log10(n_docs / doc_freq)

idf

{'up': 0.6020599913279624,
 'scientists': 0.6020599913279624,
 'one': 0.3010299956639812,
 'science': 0.3010299956639812,
 'morning': 0.6020599913279624,
 'most': 0.6020599913279624,
 'fields': 0.6020599913279624,
 'analyze': 0.6020599913279624,
 'this': 0.3010299956639812,
 'of': 0.3010299956639812,
 'data': 0.12493873660829993,
 'important': 0.6020599913279624,
 'is': 0.3010299956639812,
 'best': 0.6020599913279624,
 'courses': 0.6020599913279624,
 'woke': 0.6020599913279624,
 'the': 0.3010299956639812}

## TF-IDF

In [44]:
# TF-IDF(word, doc) = TF(word, doc) * IDF(word)
#
# Multiply each TF column by its term's IDF weight in one vectorized step.
# Element-wise chained assignment (df[word][doc] = ...) is avoided on
# purpose: it may write to a temporary copy and is a no-op under pandas
# Copy-on-Write.
idf_weights = pd.Series(idf, index=df_tf.columns)  # ordered like the TF columns
df_tf_idf = df_tf.mul(idf_weights, axis='columns')

df_tf_idf

Unnamed: 0,up,scientists,one,science,morning,most,fields,analyze,this,of,data,important,is,best,courses,woke,the
0,0.0,0.0,0.027366,0.054733,0.0,0.054733,0.054733,0.0,0.0,0.054733,0.011358,0.054733,0.027366,0.0,0.0,0.0,0.027366
1,0.0,0.0,0.033448,0.033448,0.0,0.0,0.0,0.0,0.033448,0.033448,0.013882,0.0,0.033448,0.066896,0.066896,0.0,0.033448
2,0.0,0.150515,0.0,0.0,0.0,0.0,0.0,0.150515,0.0,0.0,0.062469,0.0,0.0,0.0,0.0,0.0,0.0
3,0.150515,0.0,0.0,0.0,0.150515,0.0,0.0,0.0,0.075257,0.0,0.0,0.0,0.0,0.0,0.0,0.150515,0.0


## scikit-learn

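Reference: [Cosine similarity and TF-IDF (Medium)](https://medium.com/web-mining-is688-spring-2021/cosine-similarity-and-tfidf-c2a7079e13fa)

Note: with its default settings `TfidfVectorizer` uses a smoothed IDF, $\ln\frac{1 + n}{1 + \mathrm{df}(t)} + 1$, and L2-normalizes each document row, so its values differ from the hand-computed TF-IDF table above.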

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [46]:
# Build a TfidfVectorizer with default settings, then fit it on the corpus
# and transform the corpus in a single call.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# Feature names of the fitted vectorizer; used below as the column labels for X.
column_names = vectorizer.get_feature_names_out()

In [47]:
# Convert the vectorizer output to a dense array and label its columns with
# the feature names. Note: this rebinds `df_tf_idf`, replacing the
# hand-computed table from the TF-IDF section.
dense_scores = X.toarray()
df_tf_idf = pd.DataFrame(data=dense_scores, columns=column_names)
df_tf_idf

Unnamed: 0,analyze,best,courses,data,fields,important,is,morning,most,of,one,science,scientists,the,this,up,woke
0,0.0,0.0,0.0,0.199417,0.312425,0.312425,0.246319,0.0,0.312425,0.492639,0.246319,0.492639,0.0,0.246319,0.0,0.0,0.0
1,0.0,0.403667,0.403667,0.257655,0.0,0.0,0.318256,0.0,0.0,0.318256,0.318256,0.318256,0.0,0.318256,0.318256,0.0,0.0
2,0.52489,0.0,0.0,0.670061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.52489,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.525473,0.0,0.0,0.0,0.0,0.0,0.0,0.414289,0.525473,0.525473


## Cosine Similarities

In [51]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows (documents) of df_tf_idf;
# entry (i, j) compares document i with document j.
similarity_matrix = cosine_similarity(df_tf_idf, dense_output=True)
df_cos_sim = pd.DataFrame(similarity_matrix)
df_cos_sim

Unnamed: 0,0,1,2,3
0,1.0,0.600128,0.133621,0.0
1,0.600128,1.0,0.172645,0.13185
2,0.133621,0.172645,1.0,0.0
3,0.0,0.13185,0.0,1.0
