## tfidf implementation from scratch


$IDF(t) = \log_{e}\frac{\text{Total  number of documents}} {\text{Number of documents with term t in it}}.$

$IDF(t) = \log_{e}\frac{\text{Total  number of documents}} {\text{Number of documents with term t in it}+1}.$
</li>
</ul>
</font>

### Data Corpus

In [1]:
## SkLearn# Collection of string documents

corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

### SkLearn Implementation

In [560]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(data)
skl_output = vectorizer.transform(data)

In [561]:
skl_output[0].todense()

matrix([[0., 0., 0., ..., 0., 0., 0.]])

In [563]:
# sklearn feature names, they are sorted in alphabetic order by default.
print(vectorizer.get_feature_names())

In [565]:
# Here we will print the sklearn tfidf vectorizer idf values after applying the fit method
# After using the fit function on the corpus the vocab has 9 words in it, and each has its idf value.

print(vectorizer.idf_)

In [5]:
# shape of sklearn tfidf vectorizer output after applying transform method.

skl_output.shape

(4, 9)

In [6]:
# sklearn tfidf values for first line of the above corpus.
# Here the output is a sparse matrix

print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [7]:
# sklearn tfidf values for first line of the above corpus.
# To understand the output better, here we are converting the sparse output matrix to dense matrix and printing it.
# Notice that this output is normalized using L2 normalization. sklearn does this by default.

print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Our custom implementation

In [458]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import normalize
import pandas as pd
import numpy as np
import pickle

## Calculating tf for the entire dataset:

In [623]:
#calculating the term frequency
def termFrequency(data):
    tf = []
    for l in data:
        line = l.split(" ")
        v = Counter(line)
        tfr = {}
        for k in v.keys():
            tfr[k] = v[k]/len(line)
        tf.append(tfr)
    return tf    

In [627]:
import math
def calidf(data):
    cin ={} #it will store the in how many documents a particular word occures
    
    unique = []
    for line in data:
        for word in line.split(" "):
            if word not in unique:
                unique.append(word)
            else:
                continue
    idf = {}
    for word in unique:
        count = 0
        for line in data:
            for w in line.split(" "):
                if word in w:
                    count = count+1
        idf[word] = 1 + math.log((1+len(data))/(1+count),2.71) 
    return idf       

In [628]:
def transform(data,tf,idf):
    #tf = termFrequency(data)
    #idf = calidf(data)
    tfidf = []
    for i,line in enumerate(data):
        d = {}
        for word in line.split(" "):
            if word in idf.keys():
                d[word] = tf[i][word]*idf[word]
        for word in idf.keys():
            if word not in d.keys():
                d[word] = 0
        tfidf.append(d)
    vec = DictVectorizer(sparse=False)
    tfidf = vec.fit_transform(tfidf)
    return tfidf

In [629]:
def fit(data):
    ##unique word in every document
    unique = []
    for line in data:
        for word in line.split(" "):
            if word not in unique:
                unique.append(word)
            else:
                continue
    ##calculating the idf for each word
    idf = {}
    for word in unique:
        count = 0
        for line in data:
            for w in line.split(" "):
                if word in w:
                    count = count+1
        ##calculating the idf for the word given in the training set
        idf[word] = 1 + math.log((1+len(data))/(1+count),2.71)
    ##calculating the term frequency
    tf = []
    for l in data:
        line = l.split(" ")
        v = Counter(line)
        tfr = {}
        for k in v.keys():
            tfr[k] = v[k]/len(line)
        tf.append(tfr)
    return [tf,idf] 

In [630]:
corpus = ['this is the first document','this document is the second document','and this is the third one','is this the first document',]

In [631]:
tf,idf = fit(corpus)   #calculating the tf, idf from the training data
d = transform(corpus,tf,idf)  ##we need to provide the data corpus that we want to transform

In [632]:
d

array([[0.        , 0.2       , 0.30247782, 0.08208286, 0.        ,
        0.        , 0.2       , 0.        , 0.2       ],
       [0.        , 0.33333333, 0.        , 0.06840238, 0.        ,
        0.3198492 , 0.16666667, 0.        , 0.16666667],
       [0.3198492 , 0.        , 0.        , 0.06840238, 0.3198492 ,
        0.        , 0.16666667, 0.3198492 , 0.16666667],
       [0.        , 0.2       , 0.30247782, 0.08208286, 0.        ,
        0.        , 0.2       , 0.        , 0.2       ]])

In [573]:
## features in the tfidf
idf.keys()

dict_keys(['this', 'is', 'the', 'first', 'document', 'second', 'and', 'third', 'one'])

### tfidf with k number of features based on idf values 

In [539]:
def calidf(data,n_features):
    cin ={} #it will store the in how many documents a particular word occures
    
    unique = []
    for line in data:
        for word in line.split(" "):
            if word not in unique:
                unique.append(word)
            else:
                continue
    idf = {}
    for word in unique:
        count = 0
        for line in data:
            for w in line.split(" "):
                if word in w:
                    count = count+1
        idf[word] = math.log10((len(data))/(count))
    ### sorting the dictionary to extract the top features
    idf1 = {k: v for k, v in sorted(idf.items(), key=lambda item: item[1])}
    return idf1       

In [603]:
def fit(data,n_features):
    cin ={} #it will store the in how many documents a particular word occures
    ##finding the unique words in the each document
    unique = []
    for line in data:
        for word in line.split(" "):
            if word not in unique:
                unique.append(word)
            else:
                continue
                
    idf = {}
    #calculating the idf for each word that is available in the sentence as well as training set
    for word in unique:
        count = 0
        for line in data:
            for w in line.split(" "):
                if word in w:
                    count = count+1
        idf[word] =1 + math.log((1+len(data))/(1+count),2.71)
    ##sorting the keys according to idf value
    idf1 = {k: v for k, v in sorted(idf.items(), key=lambda item: item[1])}
    #print(idf1)
    ## extracting first n_features from the dictionary
    key,value = [],[]
    for k in idf1.keys():
        key.append(k)
        value.append(idf1[k])
    l = -1
    idf2 = {}
    for i in range(n_features):
        idf2[key[l]] = value[l]
        l=l-1

    #calcualting term-frequency    
    tf = []
    for l in data:
        line = l.split(" ")
        v = Counter(line)
        tfr = {}
        for k in v.keys():
            tfr[k] = v[k]/len(line)
        tf.append(tfr)
    ##returning the idf and tf for the feature transformation
    return [tf,idf2] 

In [604]:
def transform(data,tf,idf):
    #tf = termFrequency(data)
    #idf = calidf(data)
    tfidf = []
    for i,line in enumerate(data):
        d = {}
        for word in line.split(" "):
            if word in idf.keys():
                d[word] = tf[i][word]*idf[word]
        for word in idf.keys():
            if word not in d.keys():
                d[word] = 0
        tfidf.append(d)
    vec = DictVectorizer(sparse=True)
    tfidf = vec.fit_transform(tfidf)
    nor = normalize(tfidf)
    #tfidf = nor.fit_transform(tfidf)
    return nor

In [605]:
n_features = 5  #please provide how many features u want to use
tf,idf = fit(corpus,n_features)   #calculating the tf, idf from the training data
d = transform(corpus,tf,idf)  ##we need to provide the 

In [606]:
print("Top Features : ",idf.keys())

Top Features :  dict_keys(['one', 'third', 'and', 'second', 'first'])


In [607]:
idf.keys()

dict_keys(['one', 'third', 'and', 'second', 'first'])

In [608]:
d.todense()

matrix([[0.        , 1.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 1.        , 0.        ],
        [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027],
        [0.        , 1.        , 0.        , 0.        , 0.        ]])

## reading the data from the system

In [609]:
with (open("cleaned_strings", "rb")) as openfile:
            data = (pickle.load(openfile))

In [610]:
n_features = 50 #please provide how many features u want to use
tf,idf = fit(data,n_features)   #calculating the tf, idf from the training data
d = transform(data,tf,idf)  ##we need to provide the 

### top 50 features based on idf value

In [611]:
idf

{'exceptionally': 6.941046305978266,
 'regrettable': 6.941046305978266,
 'virtue': 6.941046305978266,
 'clothes': 6.941046305978266,
 'jessice': 6.941046305978266,
 'faster': 6.941046305978266,
 'bonding': 6.941046305978266,
 'smoothly': 6.941046305978266,
 'flowed': 6.941046305978266,
 'anyway': 6.941046305978266,
 'hollander': 6.941046305978266,
 'darren': 6.941046305978266,
 'flaming': 6.941046305978266,
 'trysts': 6.941046305978266,
 'houses': 6.941046305978266,
 'clients': 6.941046305978266,
 'salesman': 6.941046305978266,
 'estate': 6.941046305978266,
 'gay': 6.941046305978266,
 'obsessed': 6.941046305978266,
 'sex': 6.941046305978266,
 'weaving': 6.941046305978266,
 'hugo': 6.941046305978266,
 'confidence': 6.941046305978266,
 'cutie': 6.941046305978266,
 'sad': 6.941046305978266,
 'favorite': 6.941046305978266,
 'judith': 6.941046305978266,
 'march': 6.941046305978266,
 'sundays': 6.941046305978266,
 'babysitting': 6.941046305978266,
 'spoiled': 6.941046305978266,
 'fest': 6.94

In [612]:
print(data[734])
print(d[734].todense())

riot see hugo weaving play sex obsessed gay real estate salesman uses clients houses trysts flaming darren tom hollander
[[0.        0.        0.        0.        0.        0.        0.
  0.2773501 0.        0.        0.        0.        0.2773501 0.
  0.2773501 0.        0.        0.        0.        0.        0.
  0.2773501 0.        0.        0.2773501 0.2773501 0.2773501 0.2773501
  0.        0.        0.        0.        0.        0.2773501 0.
  0.        0.        0.2773501 0.2773501 0.        0.        0.
  0.        0.2773501 0.        0.        0.        0.        0.2773501
  0.       ]]
