In [1]:
#==============================================================================
# CellStrat Hub Pack - Natural Language Processing
# Compatible tier : Free Tier or above 
# Kernel : conda_pytorch_latest_p36 
#==============================================================================

In [2]:
#======================================================================================
#CountVectorizer is a great tool provided by the scikit-learn library in Python.
#It is used to transform a given text into a vector on the basis of the frequency (count)
#of each word that occurs in the text.
#Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency.
#scikit-learn adds 1 to every idf value, so terms that occur in all documents in a training set
#are down-weighted but will not be entirely ignored.
#========================================================================================
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
#========================================================================================
# Toy corpus: five short sentences, each treated as one document
#========================================================================================
docs = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]

In [4]:
#========================================================================================
# Encode every document as a row of raw term counts with CountVectorizer.
# The corpus has 16 distinct words, so the vocabulary indices run from 0 to 15.
#========================================================================================

cv = CountVectorizer()

# Learn the vocabulary and build the document-term count matrix in one step
Count_vector = cv.fit_transform(docs)

# Inspect the learned mapping of term -> column index
cv.vocabulary_

{'the': 14,
 'house': 7,
 'had': 6,
 'tiny': 15,
 'little': 8,
 'mouse': 9,
 'cat': 2,
 'saw': 12,
 'ran': 11,
 'away': 1,
 'from': 5,
 'finally': 4,
 'ate': 0,
 'end': 3,
 'of': 10,
 'story': 13}

In [5]:
# Check the shape of the count matrix: one row per document, one column per vocabulary term
Count_vector.shape

(5, 16)

In [6]:
#========================================================================================
# Fit a TfidfTransformer on the count matrix. fit() only learns the IDF weight of each term;
# calling tfidf_transformer.transform(<count matrix>) afterwards computes the tf-idf scores.
# Internally the score is the tf * idf product, i.e. each term frequency weighted by its IDF value.
#========================================================================================
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
tfidf_transformer.fit(Count_vector)

TfidfTransformer()

In [7]:
# Collect the learned IDF weights in a table indexed by vocabulary term.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back to the old method only on older releases.
if hasattr(cv, "get_feature_names_out"):
    idf_index_terms = list(cv.get_feature_names_out())
else:
    idf_index_terms = cv.get_feature_names()
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=idf_index_terms, columns=["idf_weights"])

In [8]:
#========================================================================================
# Show the IDF weights in ascending order.
# 'mouse' and 'the' have the lowest IDF values, which is expected because
# both words appear in every document of the collection.
# The lower the IDF value of a word, the less unique it is to any particular document.
#========================================================================================
df_idf.sort_values("idf_weights")

Unnamed: 0,idf_weights
mouse,1.0
the,1.0
cat,1.693147
house,1.693147
ate,2.098612
away,2.098612
end,2.098612
finally,2.098612
from,2.098612
had,2.098612


In [9]:
# Get the vocabulary terms in column order; they label the tf-idf scores computed below.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back to the old method only on older releases.
if hasattr(cv, "get_feature_names_out"):
    # get_feature_names_out() returns an ndarray; keep a plain list as before
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = cv.get_feature_names()
# Display the feature names
feature_names

['ate',
 'away',
 'cat',
 'end',
 'finally',
 'from',
 'had',
 'house',
 'little',
 'mouse',
 'of',
 'ran',
 'saw',
 'story',
 'the',
 'tiny']

In [10]:
# Compute the TF-IDF scores for the documents.
# Reuse the CountVectorizer fitted earlier (cv) instead of fitting a new one: the
# TfidfTransformer learned its IDF weights for exactly that vocabulary/column order,
# so the counts passed to transform() must come from the same fitted vectorizer.
count_vector = cv.transform(docs)
# Weight the raw counts by the learned IDF values
tf_idf_vector = tfidf_transformer.transform(count_vector)

In [11]:
#========================================================================================
# Show the tf-idf scores of the first document.
# Take its row from the sparse result, transpose it to a column and convert it to a dense matrix,
# so that there is one tf-idf score per vocabulary term.
#=======================================================================================
first_tfidf_score = tf_idf_vector[0]
first_tfidf_score.transpose().todense()


matrix([[0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.49356209],
        [0.39820278],
        [0.49356209],
        [0.23518498],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.23518498],
        [0.49356209]])

In [12]:
#========================================================================================
# Tabulate the tf-idf scores of the first document, labelled by term.
# The more unique a word is to the document, the higher its score.
#========================================================================================
df = pd.DataFrame(
    first_tfidf_score.T.todense(),
    index=feature_names,
    columns=["tf-idf"],
)
# Display the scores from highest to lowest
df.sort_values("tf-idf", ascending=False)


Unnamed: 0,tf-idf
had,0.493562
little,0.493562
tiny,0.493562
house,0.398203
mouse,0.235185
the,0.235185
ate,0.0
away,0.0
cat,0.0
end,0.0
