In [2]:
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# (TFIDF) Term Frequency - Inverse Document Frequency

#### My goal is to replicate the formula that sklearn uses by default for tfidf.

#### 1. Calculate term frequency.

    * This is the number of times a term appears in a document. 
    * (NOT the number of times a term appears divided by the number of terms in document, which would be a normalized term frequency.)
    
#### 2. Calculate inverse document frequency.

    * Natural log of ( (Number of documents in corpus + 1)/(Number of documents where term appears + 1) ) + 1.
        * Note the add-one smoothing: 1 is added to both the numerator and the denominator. This is sklearn's default (`smooth_idf=True`).
        
#### 3. Calculate term frequency - inverse document frequency (tfidf).

    * Term frequency * Inverse document frequency = tfidf
    
#### 4. Calculate norm of tfidf.

    * Euclidean norm of the vector of tfidfs.
        * L2 (Euclidean) normalization is sklearn's default (`norm='l2'`).
        
#### 5. Calculate normalized tfidf of each term in document 1 (v1).

    * Divide each tfidf by the tfidf norm to get the normalized tfidf.

## Simple Example Corpus

In [130]:
# Corpus as pre-tokenized documents: each document is the ordered list of its terms.
v1 = 'cat hat bat splat cat bat hat mat cat'.split()
v2 = 'cat mat cat sat'.split()

In [20]:
# The corpus as raw, untokenized strings: one string per document.
a = 'cat hat bat splat cat bat hat mat cat'
b = 'cat mat cat sat'

# Using sklearn to calculate tfidfs

## Initialize sklearn's CountVectorizer() and TfidfVectorizer()

In [143]:
# CountVectorizer yields raw term counts; TfidfVectorizer yields the tfidf weights to reproduce.
CV = CountVectorizer()
TV = TfidfVectorizer()
# Note: TfidfVectorizer defaults to norm='l2'; TfidfVectorizer(norm=None) would return
# the un-normalized tf * idf values instead.

In [59]:
# learn the vocabulary from the two raw documents and count each term per document
# (sparse matrix: one row per document, one column per vocabulary term)
count = CV.fit_transform([a,b])
count

<2x6 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [60]:
# sklearn's tfidf weights for the same two documents (sparse matrix with the same layout
# as the counts: one row per document, one column per vocabulary term)
tfidf = TV.fit_transform([a,b])
tfidf

<2x6 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

## Vocab of corpus in alphabetical order, as learned by the fitted CountVectorizer

In [133]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() returns an ndarray, so .tolist() keeps the plain-list display.
CV.get_feature_names_out().tolist()

['bat', 'cat', 'hat', 'mat', 'sat', 'splat']

## Term frequencies of the terms in the same order as vocab

In [165]:
def matrix_to_list(matrix):
    """Convert a sparse matrix to a nested Python list.

    Intended for the compressed sparse matrices returned by
    CV.fit_transform() and TV.fit_transform().

    Parameters
    ----------
    matrix : object exposing ``toarray()``
        The sparse matrix to densify.

    Returns
    -------
    list
        The result of ``matrix.toarray().tolist()``.
    """
    return matrix.toarray().tolist()

In [134]:
# dense nested-list view of the counts: one inner list per document, columns in vocab order
count_lst = matrix_to_list(count)
count_lst

[[2, 3, 2, 1, 0, 1], [0, 2, 0, 1, 1, 0]]

## Tfidfs calculated by sklearn
Can I follow the steps of the formula to reproduce these values?

In [135]:
# dense nested-list view of sklearn's tfidfs: one inner list per document, columns in vocab order
tfidf_lst = matrix_to_list(tfidf)
tfidf_lst

[[0.5333344767907123,
  0.5692078092660131,
  0.5333344767907123,
  0.18973593642200434,
  0.0,
  0.26666723839535617],
 [0.0, 0.7572644142929534, 0.0, 0.3786322071464767, 0.5321543559503558, 0.0]]

# Calculating tfidfs from scratch

## Calculating Term Frequency for 'bat'

In [69]:
# raw term frequency: how many times 'bat' occurs among document 1's tokens (v1)
tf_bat_v1 = v1.count('bat')
tf_bat_v1

2

In [70]:
# raw term frequency: how many times 'bat' occurs among document 2's tokens (v2)
tf_bat_v2 = v2.count('bat')
tf_bat_v2

0

## Get rest of term frequency for terms in first document (v1)

In [113]:
# raw term frequencies in document 1 (v1) for every vocabulary term, in vocab order
tf_bat, tf_cat, tf_hat, tf_mat, tf_sat, tf_splat = (
    v1.count(term) for term in ('bat', 'cat', 'hat', 'mat', 'sat', 'splat')
)

#### Normalizing Term Frequency (term frequency/term count in document)

NOTE: sklearn doesn't use this in its formula. I've seen multiple examples online that incorporate this normalization. I'm not sure why one would normalize at this step instead of at the end, as sklearn does.

In [31]:
# length-normalized term frequency: count of 'bat' divided by the number of tokens in v1.
# Shown for comparison only; sklearn's tfidf uses the raw count instead.
norm_tf_bat_v1 = v1.count('bat')/len(v1)
norm_tf_bat_v1

0.2222222222222222

## Initialize variable for number of documents in corpus

In [136]:
# number of documents in the corpus (the numerator term of the idf formula)
num_doc = len([a,b])
num_doc

2

## Function to calculate number of documents where term appears

In [76]:
def doc_count_with_term(term, corpus):
    """Count the documents in `corpus` that contain `term` as a whole token.

    Parameters
    ----------
    term : str
        The term to look for.
    corpus : iterable
        Documents, each either a raw string or a sequence of tokens.
        Raw strings are split on whitespace before matching, so that
        'cat' does not match inside 'concatenate' (a plain ``term in doc``
        on a string is a substring test). This is a simplification of
        sklearn's tokenizer, which is sufficient for whitespace-separated,
        lowercase text like this notebook's corpus.

    Returns
    -------
    int
        Number of documents containing `term`; 0 for an empty corpus.
    """
    def _contains_term(doc):
        # tokenize raw strings; token sequences are used as given
        tokens = doc.split() if isinstance(doc, str) else doc
        return term in tokens

    return sum(1 for doc in corpus if _contains_term(doc))

In [137]:
# document frequency of 'bat': number of documents in the corpus [a, b] where it appears
num_doc_with_bat = doc_count_with_term('bat', [a,b])
num_doc_with_bat

1

## Function to calculate the inverse document frequency

In [142]:
def get_idf(term, corpus):
    """Smoothed inverse document frequency of `term` over `corpus`.

    Computes ln((N + 1) / (df + 1)) + 1, where N is the number of documents
    in `corpus` and df is the number of documents in which `term` appears
    (as reported by doc_count_with_term).
    """
    total_docs = len(corpus)
    docs_with_term = doc_count_with_term(term, corpus)
    smoothed_ratio = (total_docs + 1)/(docs_with_term + 1)
    return np.log(smoothed_ratio) + 1

In [146]:
# idf of every vocabulary term, all computed through get_idf so that no term
# relies on a hand-inlined copy of the formula or on earlier global variables
idf_bat = get_idf('bat', [a,b])
idf_cat = get_idf('cat', [a,b])
idf_hat = get_idf('hat', [a,b])
idf_mat = get_idf('mat', [a,b])
idf_sat = get_idf('sat', [a,b])
idf_splat = get_idf('splat', [a,b])

## Values

In [148]:
# idf per term, printed in vocabulary (alphabetical) order
print(f'bat: {idf_bat}')
print(f'cat: {idf_cat}')
print(f'hat: {idf_hat}')
print(f'mat: {idf_mat}')
print(f'sat: {idf_sat}')
print(f'splat: {idf_splat}')

bat: 1.4054651081081644
cat: 1.0
hat: 1.4054651081081644
mat: 1.0
sat: 1.4054651081081644
splat: 1.4054651081081644


## tfidf = tf * idf

In [115]:
# un-normalized tfidf of each term in document 1: raw term frequency times idf (vocab order)
tfidf_bat = tf_bat * idf_bat
tfidf_cat = tf_cat * idf_cat
tfidf_hat = tf_hat * idf_hat
tfidf_mat = tf_mat * idf_mat
tfidf_sat = tf_sat * idf_sat
tfidf_splat = tf_splat * idf_splat

In [149]:
# un-normalized tf * idf for document 1 (v1), printed in vocabulary order
print(f'bat: {tfidf_bat}')
print(f'cat: {tfidf_cat}')
print(f'hat: {tfidf_hat}')
print(f'mat: {tfidf_mat}')
print(f'sat: {tfidf_sat}')
print(f'splat: {tfidf_splat}')

bat: 2.8109302162163288
cat: 3.0
hat: 2.8109302162163288
mat: 1.0
sat: 0.0
splat: 1.4054651081081644


# Step back to understand how to calculate L2 Norm

In [123]:
# small test matrix (Matrix.shape == (3, 3)) for trying out row-wise norms
Matrix = np.array([
    [1, 2, 3],
    [2, 3, 4],
    [3, 4, 5],
])

In [124]:
# L2 norm of each row: square every element, sum along each row, take the square root
np.sqrt(np.sum(Matrix**2, axis=1))

array([3.74165739, 5.38516481, 7.07106781])

In [125]:
# verify row 1 by hand: sqrt(1^2 + 2^2 + 3^2)
print(np.sqrt(1**2+2**2+3**2))
# verify row 2 by hand: sqrt(2^2 + 3^2 + 4^2); bare last expression, so the notebook displays it
np.sqrt(2**2+3**2+4**2)

3.7416573867739413


5.385164807134504

In [169]:
# alternative using np.einsum: 'ij,ij->i' multiplies Matrix elementwise with itself and
# sums over the column index j, giving each row's sum of squares;
# the square root of that is the row's L2 norm (one norm per row)
norms = np.sqrt(np.einsum('ij,ij->i', Matrix, Matrix))
norms

array([3.74165739, 5.38516481, 7.07106781])

In [170]:
# third way: np.linalg.norm on a 1-D vector returns its L2 norm by default;
# applied here to the first row of Matrix
np.linalg.norm(Matrix[0,:])

3.7416573867739413

In [178]:
# L2-normalize the first row of Matrix: divide each element by the row's norm
norm_elements_row1 = Matrix[0,:]/norms[0]
norm_elements_row1

array([0.26726124, 0.53452248, 0.80178373])

In [176]:
# sanity check: the squares of the normalized elements should sum to 1 (unit L2 norm)
np.sum(norm_elements_row1**2)

1.0

## Calculate L2 Norm for tfidfs of document 1

In [119]:
# L2 (Euclidean) norm of document 1's tfidf vector, taken over every vocabulary term.
# tfidf_sat was previously left out; it is 0 for this document, so the value is unchanged,
# but the norm is only correct in general when no term is omitted.
v1_tfidf_norm = np.sqrt(
    tfidf_cat**2 + tfidf_bat**2 + tfidf_hat**2 + tfidf_mat**2 + tfidf_splat**2 + tfidf_sat**2
)
v1_tfidf_norm

5.270482855582157

## Use L2 Norm to calculate normalized tfidf for each term

In [153]:
# normalized tfidf of each term in document 1: its tf * idf divided by the document's L2 norm
norm_tfidf_bat = tfidf_bat / v1_tfidf_norm
norm_tfidf_cat = tfidf_cat / v1_tfidf_norm
norm_tfidf_hat = tfidf_hat / v1_tfidf_norm
norm_tfidf_mat = tfidf_mat / v1_tfidf_norm
norm_tfidf_sat = tfidf_sat / v1_tfidf_norm
norm_tfidf_splat = tfidf_splat / v1_tfidf_norm

In [179]:
# normalized tfidfs calculated from scratch for document 1,
# printed in vocabulary order (bat, cat, hat, mat, sat, splat)
print(norm_tfidf_bat)
print(norm_tfidf_cat)
print(norm_tfidf_hat)
print(norm_tfidf_mat)
print(norm_tfidf_sat)
print(norm_tfidf_splat)

0.5333344767907123
0.5692078092660131
0.5333344767907123
0.18973593642200434
0.0
0.26666723839535617


In [159]:
# sklearn's tfidfs for the first document (v1): first row of the nested list,
# shown for side-by-side comparison with the from-scratch values
tfidf_lst[0]

[0.5333344767907123,
 0.5692078092660131,
 0.5333344767907123,
 0.18973593642200434,
 0.0,
 0.26666723839535617]

## Test if tfidfs calculated from scratch are the same as sklearn's values

In [163]:
# from-scratch normalized tfidfs for document 1, in vocabulary order
# so that they line up position by position with tfidf_lst[0]
scratch_norm_tfidfs = [
    norm_tfidf_bat,
    norm_tfidf_cat,
    norm_tfidf_hat,
    norm_tfidf_mat,
    norm_tfidf_sat,
    norm_tfidf_splat,
]

In [164]:
# Check each from-scratch value against sklearn's value for the same term.
# np.isclose compares with a floating-point tolerance: exact `==` on floats can fail
# when two mathematically equal computations round differently.
# The loop variable is named `expected` so that it does not shadow the `sklearn` package name.
for scratch, expected in zip(scratch_norm_tfidfs, tfidf_lst[0]):
    print(np.isclose(scratch, expected))

True
True
True
True
True
True


## Success!