# Custom implementation of Tf-Idf vectorization

### Corpus

In [None]:
# Toy corpus: four short documents, one string per document.
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

### SkLearn Implementation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Reference implementation: learn the vocabulary and idf weights from the
# corpus, then encode that same corpus. fit() returns the fitted vectorizer,
# so the two steps can be chained.
vectorizer = TfidfVectorizer()
skl_output = vectorizer.fit(corpus).transform(corpus)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when the installed version provides it, and convert
# the returned array to a plain list so the printed form stays the same.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [None]:
# idf values learned for the vocabulary: one weight per term.
print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [None]:
# Shape of the sklearn tf-idf output after applying the transform method:
# one row per document in the corpus, one column per vocabulary term.
skl_output.shape

(4, 9)

In [None]:
# sklearn tf-idf values for the first document of the above corpus.
# The output is a sparse matrix, so only the non-zero entries are printed,
# each as a (row, column) pair followed by its value.

print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [None]:
# sklearn tf-idf values for the first document of the above corpus.
# To make the output easier to read, the sparse row is converted to a dense
# array before printing.
# The values are L2-normalized; sklearn does this by default.

print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Your custom implementation

In [65]:
# Write your code here.
# Try not to hardcode any values.
# Make sure it's well documented and readable, with appropriate comments.

from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy

#Fit Method
def fit(corpus):
  """Learn the vocabulary and smoothed idf weights from a corpus.

  Every document is split on whitespace and tokens shorter than two
  characters are ignored. The idf of a term is computed as
  ``1 + ln((1 + n_documents) / (1 + document_frequency))``.

  Parameters
  ----------
  corpus : list of str
      The documents to learn from.

  Returns
  -------
  dict or None
      Maps each term to a ``(column_index, idf)`` tuple; column indices are
      assigned in sorted term order. A corpus with no usable tokens
      (including an empty list) yields an empty dict. If ``corpus`` is not a
      list, a message is printed and None is returned.
  """
  if not isinstance(corpus, (list,)):
    print("you need to pass list of sentance")
    return None

  # Count, for every term, the number of documents it occurs in. Turning each
  # document into a set first means a repeated word is counted once per
  # document, and each document is tokenised only once.
  doc_freq = Counter()
  for row in corpus:
    doc_freq.update({word for word in row.split() if len(word) >= 2})

  n_docs = len(corpus)
  vocab = {}
  # Sorted order fixes the column index of every term.
  for index, term in enumerate(sorted(doc_freq)):
    idf = 1 + math.log((n_docs + 1) / (doc_freq[term] + 1))
    vocab[term] = (index, idf)
  return vocab

# Same four documents as in the corpus cell at the top of the notebook.
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

# Learn the term -> (column index, idf) mapping and show it.
vocab = fit(corpus)
print('Vocab = {}'.format(vocab))

Vocab = {'and': (0, 1.916290731874155), 'document': (1, 1.2231435513142097), 'first': (2, 1.5108256237659907), 'is': (3, 1.0), 'one': (4, 1.916290731874155), 'second': (5, 1.916290731874155), 'the': (6, 1.0), 'third': (7, 1.916290731874155), 'this': (8, 1.0)}


In [2]:
def transform(corpus,vocab):
  """Encode documents as L2-normalized tf-idf rows.

  Each document is split on whitespace. For every distinct token of at least
  two characters that is present in ``vocab``, the stored value is
  ``(count / number_of_tokens_in_document) * idf``. Tokens that are not in
  ``vocab`` are skipped. Each row is then scaled to unit L2 norm.

  Parameters
  ----------
  corpus : list of str
      The documents to encode.
  vocab : dict
      Maps a term to a ``(column_index, idf)`` tuple, as returned by ``fit``.

  Returns
  -------
  sparse matrix or None
      Matrix of shape ``(len(corpus), len(vocab))``. If ``corpus`` is not a
      list, a message is printed and None is returned.
  """
  if not isinstance(corpus, (list,)):   #check input
    print("you need to pass list of strings")
    return None

  rows = []
  columns = []
  values = []
  for idx, row in enumerate(tqdm(corpus)):
    tokens = row.split()
    n_tokens = len(tokens)    # term-frequency denominator, same for the whole row
    for word, freq in Counter(tokens).items():
      if len(word) < 2:
        continue
      entry = vocab.get(word)
      if entry is None:
        # Out-of-vocabulary word: it has no column, so it contributes nothing.
        continue
      col_index, idf = entry
      rows.append(idx)        #row index of output
      columns.append(col_index)   #column index of output
      values.append((freq / n_tokens) * idf)    #tf-idf value calculation
  return normalize(csr_matrix((values,(rows,columns)),shape=(len(corpus),len(vocab))),norm='l2')
# Apply the custom transform to the corpus and print the result as a dense array.
print(transform(corpus,vocab).toarray())

100%|██████████| 4/4 [00:00<00:00, 4257.10it/s]

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]





### This gives the same tf-idf values as the scikit-learn implementation, up to floating-point rounding (compare the first row with `skl_output[0].toarray()` above).