# Task - 01

## 1 Build a TF-IDF Vectorizer ¶

### 1.1 Fit Function

In [1]:
from tqdm import tqdm 
def fit(data):
    unique_word = set() #as set will take only unique values
    if isinstance(data,list):
        for ele in data:
            for word in ele.split(" "):
                if len(word) < 2:        #single alpabhet words don't add any value for analysis
                    continue
                unique_word.add(word)
        unique_word = sorted(list(unique_word))   #sorting the unique values
        vocab = {j:i for i,j in enumerate(unique_word)} #creating a dictionary with word as key and index as value
        return vocab
    else:
        print("you need to pass list of sentance")  
data = ['this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document']
vocab = fit(data)
print(vocab)



{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}


### 1.2 Transform Function

In [4]:
from collections import Counter
from scipy.sparse import csr_matrix
import math
from sklearn.preprocessing import normalize as norm
def transform(data,vocab): 
    rows = [] #for creating matrix,we need to store index value of sentence as row number
    columns = [] #for creating matrix, we need to store index value of unique words as column value
    final_col = [] 
    tf_values = [] #list to store tf_values
    idf_values = {} #dictionar to store idf_values, where idf value is key and word is value
   
    if isinstance(data,(list,)):
        for idx,row in enumerate(data):
            word_dict = dict(Counter(row.split()))  
            for word,freq in word_dict.items():
                if len(word) < 2:
                    continue
                col_index = vocab.get(word,-1)
                if col_index != -1:
                    rows.append(idx)
                    columns.append(col_index)                    
                    tf = freq/len(row.split())
                    tf_values.append(tf)
                if col_index in idf_values:
                    idf_values[col_index] += 1
                else:
                    idf_values[col_index] = 1
    
        for key,values in idf_values.items():
            idf = 1 + math.log((1 +len(data))/(1 + values))
            idf_values[key] = idf
        
        tfidf_values = [0]*len(columns)
        for col in range(len(columns)):
            a = columns[col]
            tfidf_values[col] = tf_values[col]*idf_values[a] #final tf-idf values 
        matrix = csr_matrix((tfidf_values,(rows,columns)),shape = (len(data),len(vocab))) #sparse matrix
        return norm(matrix)  #L2-norm using sklearn normalization
    else:
        print("you need to pass list of sentance")
vocab = fit(data)
print(transform(data,vocab)[0].toarray())


[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


# Task - 02

### 2.1 Loading Data

In [3]:
import pickle
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus))

Number of documents in corpus =  746


### 2.2 Fit Function

In [5]:
from tqdm import tqdm 
import operator
import math
def fit(data):
    unique_word = set() #as set will take only unique values
    freq_vocab = {}
    idf_values = {}
    if isinstance(data,list):
        for ele in data:
            for word in ele.split(" "):
                if len(word) < 2:        #single alpabhet words don't add any value for analysis
                    continue
                if word in freq_vocab:
                    freq_vocab[word] += 1
                else:
                    freq_vocab.update({word : 1})
        for word,freq in freq_vocab.items():
            idf = 1 + math.log((1 +len(data))/(1 + freq))
            idf_values[word] = idf

        sorted_idf_values = sorted(idf_values.items(), key = operator.itemgetter(1),reverse = True)
        
        top_50_sorted_idf_values = sorted_idf_values[0:50]
        for i in top_50_sorted_idf_values:
            unique_word.add(i[0]) 
        vocab = {j:i for i,j in enumerate(unique_word)} #creating a dictionary with word as key and index as value
        return vocab
    else:
        print("you need to pass list of sentance")  
vocab = fit(corpus)
print(vocab)

{'screenplay': 0, 'dozen': 1, 'tightly': 2, 'overdue': 3, 'puzzle': 4, 'vitally': 5, 'aye': 6, 'solving': 7, 'th': 8, 'doomed': 9, 'drifting': 10, 'teacher': 11, 'florida': 12, 'practically': 13, 'occurs': 14, 'highest': 15, 'unlockable': 16, 'constructed': 17, 'existent': 18, 'content': 19, 'number': 20, 'person': 21, 'aimless': 22, 'buffet': 23, 'distressed': 24, 'conception': 25, 'pulls': 26, 'insane': 27, 'changing': 28, 'minor': 29, 'attempting': 30, 'artiness': 31, 'science': 32, 'structure': 33, 'post': 34, 'gerardo': 35, 'rocks': 36, 'punches': 37, 'properly': 38, 'nearly': 39, 'baby': 40, 'muppets': 41, 'owls': 42, 'tone': 43, 'fill': 44, 'superlative': 45, 'messages': 46, 'require': 47, 'emptiness': 48, 'effort': 49}


### 2.3 Transform Function

In [6]:
from collections import Counter
from scipy.sparse import csr_matrix
import math
from sklearn.preprocessing import normalize as norm
def transform(data,vocab): 
    rows = [] #for creating matrix,we need to store index value of sentence as row number
    columns = [] #for creating matrix, we need to store index value of unique words as column value
    tf_values = [] #list to store tf_values
    idf_values = {} #dictionar to store idf_values, where idf value is key and word is value
   
    if isinstance(data,(list,)):
        for idx,row in enumerate(data):
            
            word_dict = dict(Counter(row.split()))
            for word,freq in word_dict.items():
                if word in vocab:
                    col_index = vocab.get(word,-1)
                    if col_index != -1:
                        rows.append(idx)
                        columns.append(col_index)                    
                        tf = freq/len(row.split())
                        tf_values.append(tf)
                        if col_index in idf_values:
                            idf_values[col_index] += 1
                        else:
                            idf_values[col_index] = 1

        for key,values in idf_values.items():
            idf = 1 + math.log((1 +len(data))/(1 + values))
            idf_values[key] = idf
        print('idf_values',idf_values)
        tfidf_values = [0]*len(columns)
        for col in range(len(columns)):
            a = columns[col]
            tfidf_values[col] = tf_values[col]*idf_values[a] 
        matrix = csr_matrix((tfidf_values,(rows,columns)),shape = (len(data),len(vocab))) #sparse matrix
        return norm(matrix)
    else:
        print("you need to pass list of sentance")    
vocab = fit(corpus)
print(transform(corpus,vocab)[0].toarray())    

idf_values {22: 6.922918004572872, 24: 6.922918004572872, 10: 6.922918004572872, 39: 6.922918004572872, 30: 6.922918004572872, 31: 6.922918004572872, 18: 6.922918004572872, 35: 6.922918004572872, 48: 6.922918004572872, 49: 6.922918004572872, 46: 6.922918004572872, 23: 6.922918004572872, 32: 6.922918004572872, 11: 6.922918004572872, 40: 6.922918004572872, 42: 6.922918004572872, 12: 6.922918004572872, 41: 6.922918004572872, 21: 6.922918004572872, 3: 6.922918004572872, 0: 6.922918004572872, 34: 6.922918004572872, 13: 6.922918004572872, 33: 6.922918004572872, 2: 6.922918004572872, 17: 6.922918004572872, 5: 6.922918004572872, 14: 6.922918004572872, 19: 6.922918004572872, 44: 6.922918004572872, 1: 6.922918004572872, 15: 6.922918004572872, 45: 6.922918004572872, 47: 6.922918004572872, 4: 6.922918004572872, 7: 6.922918004572872, 26: 6.922918004572872, 37: 6.922918004572872, 20: 6.922918004572872, 8: 6.922918004572872, 27: 6.922918004572872, 16: 6.922918004572872, 38: 6.922918004572872, 6: 6.92