### **TASK 1**

In [7]:
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from collections import Counter
import numpy as np
from scipy.sparse import csr_matrix
import math

corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

documentsCount_1 = len(corpus)

def fit(dataset):
    unique_words = set() 
    if isinstance(dataset, (list,)):
        for row in dataset:                                   
            for word in row.split(" "):                              
                if len(word) < 2:
                    continue
                unique_words.add(word)
        unique_words = sorted(list(unique_words))
        vocab = {j:i for i,j in enumerate(unique_words)}
        return vocab
  
vocab = fit (corpus)

rows = []
columns = []
values = []
idf_values = {}
k = set()

def transform(dataset,vocab):
    if isinstance(dataset, (list,)):
        for idx, row in enumerate(tqdm(dataset)):              
            review = dict(Counter(row.split()))    
            for word, freq in review.items():                    
                if len(word) < 2:
                    continue
                tf = freq/(len(review))
                idf = computeIDF(word,vocab,dataset)
                if idf!=0:
                   idf_values[word]= idf
                tfidf = tf * idf                                                                     
                col_index = vocab.get(word, -1)                
                if col_index !=-1:                             
                   if tfidf!=0:
                      rows.append(idx)
                      columns.append(col_index)
                      values.append(tfidf) 
        return normalize(csr_matrix((values, (rows,columns)), shape=(len(dataset),len(vocab))))
    else:
        print("you need to pass list of strings")


def computeIDF(word,vocab,dataset):  
    for val in k: 
        if word==val:
           return 0
    k.add(word)
    z = 0 
    if isinstance(dataset, (list,)):
       for row in dataset:                        
           for word1 in row.split(" "):        
               if word1 == word:
                   z = z+1
                   break
               continue
    idf = 1 + math.log((1+documentsCount_1)/(1+z))
    return idf

y = transform(corpus,vocab)


print("\n\n\n                                 ----Custom implement tdidfVectorizer Results----")

print("\n*IDF VALUES AND FEATURE NAMES\n")
for x in idf_values.items(): 
        print(x, end =' ') 
        print() 
print("\n*OUTPUT CORRESPONDING TO FIRST DOCUMENT\n")
print(y[0].toarray())

print("\n\n\n                                     -----sklearn tdidfVectorizer Results-----")

vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

print("\n\n*IDF VALUES",vectorizer.idf_)
print("\n*FEATURE NAMES",vectorizer.get_feature_names())
print("\n*OUTPUT CORRESPONDING TO FIRST DOCUMENT\n")
print (skl_output[0].toarray())

100%|██████████| 4/4 [00:00<00:00, 1416.99it/s]




                                 ----Custom implement tdidfVectorizer Results----

*IDF VALUES AND FEATURE NAMES

('this', 1.0) 
('is', 1.0) 
('the', 1.0) 
('first', 1.5108256237659907) 
('document', 1.2231435513142097) 
('second', 1.916290731874155) 
('and', 1.916290731874155) 
('third', 1.916290731874155) 
('one', 1.916290731874155) 

*OUTPUT CORRESPONDING TO FIRST DOCUMENT

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]



                                     -----sklearn tdidfVectorizer Results-----


*IDF VALUES [1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]

*FEATURE NAMES ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

*OUTPUT CORRESPONDING TO FIRST DOCUMENT

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]





### **TASK 2**

In [8]:
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from collections import Counter
import numpy as np
from scipy.sparse import csr_matrix
import math

import pickle
with open('cleaned_strings', 'rb') as f:
    corpus_2 = pickle.load(f)

documentsCount = len(corpus_2)

q = set()
vocab_2 = {}
vocab_3 = {}
idf_values_2 = {}

def fit(dataset):
    vocab_idf_values= {}
    unique_words_2 = set() 
    if isinstance(dataset, (list,)):
        for row in dataset: 
            review = dict(Counter(row.split()))                                          # for each review in the dataset
            for word, freq in review.items():                                            # for each word in the review. 
                if len(word) < 2:
                    continue
                unique_words_2.add(word)
    
    for val in unique_words_2:
        idf = compute_IDF(val,dataset)
        if idf!=0:
           vocab_idf_values[val]= idf      
    idf_list = list(vocab_idf_values.values())
    idf_list.sort(reverse=True)
    
    for s in range(0,50):
        word = get_key(idf_list[s],vocab_idf_values)
        #print("word = ", word , "idf value = ", idf_list[s])
        vocab_2.update({word:s})
        vocab_3.update({word:idf_list[s]})
    return vocab_3

def get_key(val,my_dict): 
    for key, value in my_dict.items(): 
         if val == value:
            my_dict.pop(key) 
            return key 

def compute_IDF(word,dataset):  
    for val in q: 
        if word==val:
           return 0
    q.add(word)
    z = 0 
    if isinstance(dataset, (list,)):
       for row in dataset:                        # for each review in the dataset
           for word1 in row.split(" "):        
               if word1 == word:
                   z = z+1
                   break
               continue           
    idf = 1 + math.log((1+documentsCount)/(1+z))
    return idf
 
idf_50 = fit (corpus_2)
i = 1
print("\n\n Top 50 words and their idf values\n")
for s in idf_50.items():
  print(i,".",s)
  i = i+1

rows_2 = []
columns_2 = []
values_2 = []
idf_values_2 = {}
k = set()

def transform_2(dataset,vocab_2):
    if isinstance(dataset, (list,)):
        for idx, row in enumerate(tqdm(dataset)):              
            review = dict(Counter(row.split()))    
            for word, freq in review.items():                    
                if len(word) < 2:
                    continue
                tf = freq/(len(review))
                idf = computeIDF(word,vocab_2,dataset)
                if idf!=0:
                   idf_values_2[word]= idf
                tfidf = tf * idf                                                                     
                col_index = vocab_2.get(word, -1)                
                if col_index !=-1:                             
                   if tfidf!=0:
                      rows_2.append(idx)
                      columns_2.append(col_index)
                      values_2.append(tfidf) 
        my_array = np.array(rows_2)
        #print("\nrows_2", my_array)
        return normalize(csr_matrix((values_2, (rows_2,columns_2)), shape=(len(dataset),len(vocab_2))))
    else:
        print("you need to pass list of strings")


def computeIDF(word,vocab,dataset):  
    for val in k: 
        if word==val:
           return 0
    k.add(word)
    z = 0 
    if isinstance(dataset, (list,)):
       for row in dataset:                        
           for word1 in row.split(" "):        
               if word1 == word:
                   z = z+1
                   break
               continue
    idf = 1 + math.log((1+documentsCount)/(1+z))
    return idf

y = transform_2(corpus_2,vocab_2)
print("\n\nShape of Sparse Matrix",y.shape)
print("\nShape of dense Matrix for each document",y[0].shape)

print("\nFew dense representation are as follows: ")

print("\nDense Matrix for 11th document")
print(y[11].toarray())
print("\nDense Matrix for 19th document")
print(y[19].toarray())
print("\nDense Matrix for 350th document")
print(y[350].toarray())
print("\nDense Matrix for 644th document")
print(y[644].toarray())



  2%|▏         | 18/746 [00:00<00:04, 166.78it/s]



 Top 50 words and their idf values

1 . ('modest', 6.922918004572872)
2 . ('system', 6.922918004572872)
3 . ('locations', 6.922918004572872)
4 . ('impossible', 6.922918004572872)
5 . ('dialogs', 6.922918004572872)
6 . ('native', 6.922918004572872)
7 . ('outside', 6.922918004572872)
8 . ('trilogy', 6.922918004572872)
9 . ('executed', 6.922918004572872)
10 . ('critic', 6.922918004572872)
11 . ('massive', 6.922918004572872)
12 . ('syrupy', 6.922918004572872)
13 . ('amaze', 6.922918004572872)
14 . ('personally', 6.922918004572872)
15 . ('highlights', 6.922918004572872)
16 . ('owned', 6.922918004572872)
17 . ('masculine', 6.922918004572872)
18 . ('limitations', 6.922918004572872)
19 . ('jerry', 6.922918004572872)
20 . ('cheekbones', 6.922918004572872)
21 . ('celebration', 6.922918004572872)
22 . ('surroundings', 6.922918004572872)
23 . ('rips', 6.922918004572872)
24 . ('rita', 6.922918004572872)
25 . ('propaganda', 6.922918004572872)
26 . ('broke', 6.922918004572872)
27 . ('tanks', 6.9229

100%|██████████| 746/746 [00:02<00:00, 278.66it/s]



Shape of Sparse Matrix (746, 50)

Shape of dense Matrix for each document (1, 50)

Few dense representation are as follows: 

Dense Matrix for 11th document
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]]

Dense Matrix for 19th document
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.90453403 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.30151134 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.30151134 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]]

Dense Matrix for 350th document
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 


