In [1]:
import pandas as pd
import numpy as np

# NLP related 
import string
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

### TF-IDF stands for Term Frequency Inverse Document Frequency of records.
- Term Frequency (TF)
- Inverse Document Frequency (IDF): 


### The [sklearn.feature_extraction.text.TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) function is used for creating the TF-IDF features.


### Define the corpus

In [2]:
##### This is the corpus, including four document. 
corpus = ['This is the first document',
          'This document is the second document',
          'And this is the third one',
          'Is this the first document']

### The functions below calculate TF, IDF and TFIDF. 

In [3]:
def calculate_TF(document, word):
    tokenizer=ToktokTokenizer()
    tokens    = tokenizer.tokenize(document)
    occurance = len([tokens for token in tokens if token == word])
    TF        = occurance
    return TF

def calculate_IDF(word,corpus):
    tokenizer=ToktokTokenizer()
    n_document = len(corpus)
    n_document_include = 0
    for i in range(n_document):
        document_now = corpus[i].lower()
        token_now    = tokenizer.tokenize(document_now)
        if word in token_now:
            n_document_include+=1
    IDF = np.log((n_document+1)/(n_document_include+1))+1
    return(IDF)

def calculate_TFIDF(document,corpus,word):
    TF  = calculate_TF(document,word)
    IDF = calculate_IDF(word,corpus)
    TFIDF = TF*IDF
    return(TFIDF)

In [4]:
### All the words in the vocabulary of the corpus
tfidf = TfidfVectorizer()
_ = tfidf.fit_transform(corpus)
tfidf.vocabulary_

{'this': 8,
 'is': 3,
 'the': 6,
 'first': 2,
 'document': 1,
 'second': 5,
 'and': 0,
 'third': 7,
 'one': 4}

### Let's calculate the TF-IDF of each word in the list below (i.e., all the words appear in the corpus, alphabetically ordered).

In [5]:
words = ['and','document','first','is','one','second','the','third','this']
TFIDF = pd.DataFrame(columns=words)
for d in range(len(corpus)):
    for i in range(len(words)):
        TFIDF.loc[d,words[i]]=calculate_TFIDF(corpus[d],corpus,words[i])
TFIDF

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,1.223144,1.510826,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,2.446287,0.0,1.0,0.0,1.916291,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,1.916291,0.0,1.0,1.916291,1.0
3,0.0,1.223144,1.510826,0.0,0.0,0.0,1.0,0.0,1.0


### We can get the same results using [sklearn.feature_extraction.text.TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer)


In [6]:
# Create TFIDF object
# Here we use a parameter norm=None. 
# The default value for this parameter is 'l2', which is more recommended. See discussion below.
tfidf = TfidfVectorizer(norm=None)
# get tf-df values
result = tfidf.fit_transform(corpus)

### The result is a scipy.sparse.csr_matrix object. It prepresents a matrix.
- This matric has four rows, the i th row presents the i th document. 
- The matric has 9 columns, the j the column represents the j th word in the tfidf.vocabulary_.

### Transform the matrix to a DataFrame with column names with the following functions
- [get_feature_names_out()](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer.get_feature_names_out) function gets output feature names for transformation.
- [csr_matrix.toarray function](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.toarray.html) returns a dense ndarray representation of this sparse array.

In [7]:
tfidf_df = pd.DataFrame(result.toarray(), columns=tfidf.get_feature_names_out())
tfidf_df

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,1.223144,1.510826,1.0,0.0,0.0,1.0,0.0,1.0
1,0.0,2.446287,0.0,1.0,0.0,1.916291,1.0,0.0,1.0
2,1.916291,0.0,0.0,1.0,1.916291,0.0,1.0,1.916291,1.0
3,0.0,1.223144,1.510826,1.0,0.0,0.0,1.0,0.0,1.0


### To avoid large documents in the corpus dominating smaller ones, we can normalize each row in the sparse matrix to have the Euclidean norm.
- The parameter **norm** in TfidfVectorizer() can be use to guarantee that each output row will have unit norm. 
- When using **norm="l2"**, the sum of squares of vector elements is 1. The cosine similarity between two vectors is their dot product when l2 norm has been applied.

In [8]:
row_norm = np.sqrt(tfidf_df.pow(2).sum(axis=1))
row_norm

0    2.603589
1    3.557596
2    3.743863
3    2.603589
dtype: float64

In [9]:
l2_norm_tfidf_df = tfidf_df.divide(row_norm, axis=0)
l2_norm_tfidf_df

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085


### We can get the same results using TfidfVectorizer(). The default value of norm is 'l2'. 

In [10]:
tfidf    = TfidfVectorizer()
result   = tfidf.fit_transform(corpus)
tfidf_df = pd.DataFrame(result.toarray(), columns=tfidf.get_feature_names_out())
tfidf_df

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085


### If the parameter "max_features" in TfidfVectorizer() is specified, only the top max_features ordered by term frequency across the corpus will be considered. 

In [11]:
tfidf    = TfidfVectorizer(max_features=4)
result   = tfidf.fit_transform(corpus)
tfidf_df = pd.DataFrame(result.toarray(), columns=tfidf.get_feature_names_out())
tfidf_df

Unnamed: 0,document,is,the,this
0,0.576847,0.47161,0.47161,0.47161
1,0.81614,0.333624,0.333624,0.333624
2,0.0,0.57735,0.57735,0.57735
3,0.576847,0.47161,0.47161,0.47161


### If the parameter "ngram_range" in TfidfVectorizer() is specified, n-grams will be used. For example an ngram_range of (1, 1) means only unigrams, (1, 2) means unigrams and bigrams, and (2, 2) means only bigrams. 

In [12]:
tfidf    = TfidfVectorizer(ngram_range=(1,2))
result   = tfidf.fit_transform(corpus)
tfidf_df = pd.DataFrame(result.toarray(), columns=tfidf.get_feature_names_out())
tfidf_df

Unnamed: 0,and,and this,document,document is,first,first document,is,is the,is this,one,...,the,the first,the second,the third,third,third one,this,this document,this is,this the
0,0.0,0.0,0.314532,0.0,0.38851,0.38851,0.257151,0.314532,0.0,0.0,...,0.257151,0.38851,0.0,0.0,0.0,0.0,0.257151,0.0,0.38851,0.0
1,0.0,0.0,0.455513,0.356824,0.0,0.0,0.186206,0.227756,0.0,0.0,...,0.186206,0.0,0.356824,0.0,0.0,0.0,0.186206,0.356824,0.0,0.0
2,0.357007,0.357007,0.0,0.0,0.0,0.0,0.186301,0.227873,0.0,0.357007,...,0.186301,0.0,0.0,0.357007,0.357007,0.357007,0.186301,0.0,0.281469,0.0
3,0.0,0.0,0.28294,0.0,0.349487,0.349487,0.231322,0.0,0.443279,0.0,...,0.231322,0.349487,0.0,0.0,0.0,0.0,0.231322,0.0,0.0,0.443279
