### Corpus

In [None]:
# Collection of string documents: the toy corpus used throughout this notebook.
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

### SkLearn Implementation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn the vocabulary and IDF weights from the corpus; fit() returns the
# fitted vectorizer itself, so it can be bound in the same statement.
vectorizer = TfidfVectorizer().fit(corpus)
# Encode the same corpus as a sparse TF-IDF matrix.
skl_output = vectorizer.transform(corpus)

In [None]:
# sklearn feature names; they are sorted in alphabetic order by default.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and only fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    # get_feature_names_out() returns a NumPy array; convert it to a plain
    # list so the printed output keeps the same list-of-strings form.
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

print(feature_names)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [None]:
# Print the IDF values that the sklearn TfidfVectorizer learned in fit().
# After fitting on the corpus the vocabulary has 9 words, and idf_ holds one IDF value per word.

print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [None]:
# Shape of the sklearn TfidfVectorizer output after applying transform(): (documents, vocabulary words).

skl_output.shape

(4, 9)

In [None]:
# sklearn TF-IDF values for the first document of the corpus above.
# The output is a sparse matrix, so only the non-zero (row, column) entries are printed.

print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [None]:
# sklearn TF-IDF values for the first document of the corpus above.
# Convert the sparse output row to a dense array and print it.
# The output is L2-normalized; sklearn does this by default.

print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Custom implementation

**Fit Method:**

In [None]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy

def fit(dataset):
    """Build a vocabulary from a corpus of documents.

    Every document is split on whitespace, tokens shorter than two characters
    are dropped, and the remaining unique tokens are sorted alphabetically and
    numbered consecutively from 0.

    Parameters
    ----------
    dataset : list of str
        The corpus, one string per document.

    Returns
    -------
    dict or None
        Mapping ``{word: column_index}``. If ``dataset`` is not a list, a
        message is printed and ``None`` is returned.
    """
    if not isinstance(dataset, (list,)):
        print("Please pass a list of sentence")
        return None

    # str.split() without an argument splits on any run of whitespace, which
    # is the same tokenization transform() uses. Splitting on a single " "
    # would glue tab/newline separated words into one token that transform()
    # could never look up.
    unique_words = {
        word
        for row in dataset
        for word in row.split()
        if len(word) >= 2    # keep only words of length two or more
    }

    # Sorted order fixes the column index of each word.
    return {word: index for index, word in enumerate(sorted(unique_words))}

In [None]:
# Build the vocabulary with the custom fit(), show the whole mapping,
# then list every vocabulary word on its own line.
vocabulary = fit(corpus)
print(vocabulary)
for word in vocabulary:
    print(word)

{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}
and
document
first
is
one
second
the
third
this


**Transform Method:**

In [None]:
def idf(dataset, vocab):
    """Compute the smoothed inverse document frequency of every vocabulary word.

    For a word occurring in ``df`` of the ``N`` documents the value is
    ``1 + ln((1 + N) / (1 + df))``, rounded to 8 decimal places.

    Parameters
    ----------
    dataset : list of str
        The corpus, one string per document.
    vocab : dict
        Vocabulary whose keys are the words to score.

    Returns
    -------
    list of float
        One IDF value per word, in the iteration order of ``vocab``.
        NOTE(review): transform() indexes this list by the vocabulary's column
        index, so ``vocab`` is expected to iterate in index order (as the
        dictionary returned by fit() does).
    """
    numDocs = len(dataset)

    # Tokenize each document once instead of once per vocabulary word, and
    # keep the tokens in a set for constant-time membership tests.
    # split() (any whitespace) matches the tokenization used by transform().
    docTokenSets = [set(doc.split()) for doc in dataset]

    vocabIDF = []
    for word in vocab.keys():
        # Document frequency: number of documents containing the word.
        cnt = sum(word in tokens for tokens in docTokenSets)
        vocabIDF.append(round((math.log((1 + numDocs) / (cnt + 1))) + 1, 8))
    return vocabIDF

# IDF weight of every vocabulary word, computed by the custom idf() above.
IDFs = idf(corpus, vocabulary)
# Heading, the values, then one blank line, each on its own line.
print("IDF values:", IDFs, "", sep="\n")

def transform(dataset, fitVocab, idfValues=None):
    """Encode documents as an L2-normalized sparse TF-IDF matrix.

    Parameters
    ----------
    dataset : list of str
        Documents to encode, one string per document.
    fitVocab : dict
        Vocabulary mapping ``{word: column_index}`` as returned by fit().
    idfValues : sequence of float, optional
        IDF weight for each column index. When omitted, the notebook-level
        ``IDFs`` list is used, so existing two-argument calls keep working.
        Pass it explicitly whenever ``fitVocab`` does not belong to the corpus
        that ``IDFs`` was computed from.

    Returns
    -------
    scipy.sparse matrix or None
        Matrix of shape ``(len(dataset), len(fitVocab))`` whose rows are
        L2-normalized TF-IDF vectors. If ``dataset`` is not a list, a message
        is printed and ``None`` is returned.
    """
    if not isinstance(dataset, (list,)):
        print("You need to pass list of strings")
        return None

    if idfValues is None:
        # Backward-compatible default: the global computed by the idf() cell.
        idfValues = IDFs

    rows = []       # document index of every stored entry
    columns = []    # vocabulary column of every stored entry
    values = []     # TF-IDF value of every stored entry

    for idx, row in enumerate(tqdm(dataset)):
        # Split once; reused for both the word counts and the document length.
        tokens = row.split()

        # Length of the document, the denominator of the term frequency (TF).
        docLen = len(tokens)

        # For every unique word in the document with its frequency.
        for word, freq in Counter(tokens).items():
            if len(word) < 2:
                continue

            # Column of the word in the vocabulary built by fit(); -1 if absent.
            col_index = fitVocab.get(word, -1)
            if col_index == -1:
                continue

            tf = freq / docLen
            rows.append(idx)
            columns.append(col_index)
            values.append(tf * idfValues[col_index])

    corpusMatrix = csr_matrix((values, (rows, columns)), shape=(len(dataset), len(fitVocab)))

    # L2 normalization of every row.
    normalizedCorpusMatrix = normalize(corpusMatrix, norm='l2', axis=1, copy=True, return_norm=False)
    return normalizedCorpusMatrix

# Rebuild the vocabulary and show it under a heading, followed by a blank line.
vocabulary = fit(corpus)
print("Vocab:", vocabulary, "", sep="\n")

# Encode the corpus with the custom transform().
tfidfMatrix = transform(corpus, vocabulary)

# First document as sparse (row, column) entries, preceded by a blank line.
print("", tfidfMatrix[0], sep="\n")

# Same row as a dense array, preceded by a blank line.
print("", tfidfMatrix[0].toarray(), sep="\n")

IDF values:
[1.91629073, 1.22314355, 1.51082562, 1.0, 1.91629073, 1.91629073, 1.0, 1.91629073, 1.0]

Vocab:
{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}



100%|██████████| 4/4 [00:00<00:00, 5445.38it/s]


  (0, 1)	0.46979138558088085
  (0, 2)	0.5802858228626505
  (0, 3)	0.3840852413282814
  (0, 6)	0.3840852413282814
  (0, 8)	0.3840852413282814

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]



