<h3>Task 1: Build a TF-IDF Vectorizer </h3>

In [None]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np 

In [None]:
# small 4-sentence example corpus used to develop and check the Task 1 functions
corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

In [None]:
# define fit function to identify unique words in dataset
# from assignment reference: https://colab.research.google.com/drive/1Y_K1iQV_wv7Z7I63axwMQJp1XJzgoF1s#scrollTo=vWqqbym-gA9I

def fit(dataset):
    """Build a vocabulary mapping each unique word to a column index.

    Each document is split on whitespace; tokens shorter than two characters
    are ignored. The remaining unique words are sorted alphabetically and
    numbered from 0.

    Parameters
    ----------
    dataset : list of str
        The documents to scan.

    Returns
    -------
    dict or None
        ``{word: index}`` in alphabetical word order. If ``dataset`` is not a
        list, a message is printed and ``None`` is returned.
    """
    # check if its list data type
    if not isinstance(dataset, list):
        print("you need to pass list of sentence")
        return None

    unique_words = set()
    for row in dataset:  # for each doc in the dataset
        # split() (any whitespace) matches the tokenisation used when the IDF
        # values and TF-IDF matrix are computed; split(" ") would fuse words
        # separated by tabs or newlines into a single token
        for word in row.split():
            if len(word) < 2:  # skip one-character tokens
                continue
            unique_words.add(word)

    return {word: index for index, word in enumerate(sorted(unique_words))}

In [None]:
print(fit(corpus))

{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}


In [None]:
# define function to calculate IDF values for each word
# https://analyticsindiamag.com/hands-on-implementation-of-tf-idf-from-scratch-in-python/
# https://www.askpython.com/python/examples/tf-idf-model-from-scratch
# https://stackabuse.com/python-for-nlp-creating-tf-idf-model-from-scratch/

def IDF(corpus, unique_words):
    """Compute the smoothed inverse document frequency of each vocabulary word.

    For a word that occurs in ``count`` of the ``N`` documents the value is
    ``log((1 + N) / (1 + count)) + 1``.

    Parameters
    ----------
    corpus : list of str
        The documents; each is tokenised with ``str.split()``.
    unique_words : iterable of str
        The vocabulary (iterating a ``{word: index}`` dict yields its words).

    Returns
    -------
    dict
        ``{word: idf}`` with one entry per vocabulary word, in the iteration
        order of ``unique_words``.
    """
    N = len(corpus)  # no. of docs in corpus

    # Count, in a single pass over the corpus, how many documents contain each
    # token. set() makes a word count at most once per document.
    doc_frequency = {}
    for row in corpus:
        for token in set(row.split()):
            doc_frequency[token] = doc_frequency.get(token, 0) + 1

    # Words that never occur get count 0; every vocabulary word receives an
    # entry even when the corpus is empty.
    return {
        word: math.log((1 + N) / (doc_frequency.get(word, 0) + 1)) + 1
        for word in unique_words
    }

In [None]:
unique_words = fit(corpus)

IDF(corpus, unique_words)

{'and': 1.916290731874155,
 'document': 1.2231435513142097,
 'first': 1.5108256237659907,
 'is': 1.0,
 'one': 1.916290731874155,
 'second': 1.916290731874155,
 'the': 1.0,
 'third': 1.916290731874155,
 'this': 1.0}

In [None]:
# keep the word -> IDF mapping for the transform step below
idf_values = IDF(corpus, unique_words)
print(idf_values)

{'and': 1.916290731874155, 'document': 1.2231435513142097, 'first': 1.5108256237659907, 'is': 1.0, 'one': 1.916290731874155, 'second': 1.916290731874155, 'the': 1.0, 'third': 1.916290731874155, 'this': 1.0}


In [None]:
# define function to create a sparse matrix representation with TF-IDF values for each doc and unique word
# https://analyticsindiamag.com/hands-on-implementation-of-tf-idf-from-scratch-in-python/
# https://www.askpython.com/python/examples/tf-idf-model-from-scratch
# https://stackabuse.com/python-for-nlp-creating-tf-idf-model-from-scratch/

def transform(corpus,vocabulary,idf_values):
    """Return the L2-normalised TF-IDF matrix of ``corpus`` as a sparse matrix.

    For every word of a document that is present in ``vocabulary`` the entry
    is ``(occurrences of word in doc / number of words in doc) * idf``; each
    row is then scaled to unit L2 norm.

    Parameters
    ----------
    corpus : list of str
        The documents; each is tokenised with ``str.split()``.
    vocabulary : dict
        ``{word: column index}``; indices must lie in ``range(len(vocabulary))``.
    idf_values : dict
        ``{word: idf}``; must contain every vocabulary word that occurs in the
        corpus (a missing word raises ``KeyError``).

    Returns
    -------
    Sparse matrix of shape ``(len(corpus), len(vocabulary))`` with float64 data.
    """
    # Collect the non-zero entries as coordinate lists and build the matrix in
    # one go; assigning single elements into a csr_matrix is slow and makes
    # scipy emit a SparseEfficiencyWarning.
    rows, cols, values = [], [], []
    for row, document in enumerate(corpus):  # for each doc in corpus
        words = document.split()
        word_counts = Counter(words)  # word -> number of occurrences in this doc

        # Iterate over unique words so each (row, column) pair is emitted once;
        # duplicate coordinates would be summed when the matrix is built.
        for word, count in word_counts.items():
            if word in vocabulary:
                rows.append(row)
                cols.append(vocabulary[word])
                values.append((count / len(words)) * idf_values[word])

    sparse_matrix = csr_matrix(
        (
            np.asarray(values, dtype=np.float64),
            (np.asarray(rows, dtype=np.int64), np.asarray(cols, dtype=np.int64)),
        ),
        shape=(len(corpus), len(vocabulary)),
        dtype=np.float64,
    )

    # The matrix is returned rather than printed: printing here dumped every
    # stored element each time the function was called.
    return normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)

In [None]:
# note: `unique_words` is the word -> column-index dict returned by fit(), passed here as the vocabulary
output = transform(corpus,unique_words,idf_values)
output

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
  (1, 1)	0.6876235979836937
  (1, 3)	0.2810886740337529
  (1, 5)	0.5386476208856762
  (1, 6)	0.2810886740337529
  (1, 8)	0.2810886740337529
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.4697913855799205
  (3, 2)	0.580285823684436
  (3, 3)	0.3840852409148149
  (3, 6)	0.3840852409148149
  (3, 8)	0.3840852409148149


  self._set_intXint(row, col, x.flat[0])


<4x9 sparse matrix of type '<class 'numpy.float64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [None]:
output.shape # 4 documents and 9 unique words

(4, 9)

<h4>Compare to sklearn implementation </h4>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
v_matrix= vectorizer.fit_transform(corpus)
print(v_matrix)

  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 8)	0.38408524091481483
  (1, 5)	0.5386476208856763
  (1, 1)	0.6876235979836938
  (1, 6)	0.281088674033753
  (1, 3)	0.281088674033753
  (1, 8)	0.281088674033753
  (2, 4)	0.511848512707169
  (2, 7)	0.511848512707169
  (2, 0)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 3)	0.267103787642168
  (2, 8)	0.267103787642168
  (3, 1)	0.46979138557992045
  (3, 2)	0.5802858236844359
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 8)	0.38408524091481483


In [None]:
# converting sparse representation to dense matrix
v_matrix.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() returns an array, converted here to a plain list
vectorizer.get_feature_names_out().tolist()



['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

In [None]:
v_matrix.shape

(4, 9)

Note: The custom implementation reproduces the sklearn result: the vocabulary and matrix shape are identical, and the TF-IDF values agree up to floating-point rounding.

<h3>Task 2: Implement max features functionality for TF-IDF Vectorizer </h3>
The vocabulary will contain only the 50 terms with the highest IDF scores.

In [None]:
# identify top N words according to IDF values
# https://stackoverflow.com/questions/38218501/python-get-top-n-keys-with-value-as-dictionary

def get_top_words(data, n=3):
    """Return the ``n`` entries of ``data`` with the largest values.

    ``data`` is a ``{key: value}`` dict. The result is a new dict ordered from
    highest to lowest value; keys with equal values keep the relative order
    they have in ``data``.
    """
    ranked_keys = sorted(data, key=data.get, reverse=True)
    return {key: data[key] for key in ranked_keys[:n]}

In [None]:
# test on Task 1 vocab idf values
get_top_words(idf_values)

{'and': 1.916290731874155,
 'one': 1.916290731874155,
 'second': 1.916290731874155}

In [None]:
# Load the cleaned_strings pickle file provided
# NOTE(review): pickle.load can execute arbitrary code while unpickling - only load this file from a trusted source
# NOTE(review): this rebinds `corpus`, replacing the 4-sentence example used in Task 1

import pickle
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus))

Number of documents in corpus =  746


In [None]:
# run fit, IDF and get top words function as defined above
# fit() numbers the words in alphabetical order and get_top_words() uses a stable sort,
# so words tied on IDF stay in alphabetical order when the top 50 are selected
unique_words = fit(corpus)
idf_values = IDF(corpus, unique_words)
top50_idf = get_top_words(idf_values, n=50)

In [None]:
# the no. of words in top50_idf dict
len(top50_idf)

50

In [None]:
# the no. of words in vocab dict - there is a need to filter list of unique words
len(unique_words)

2886

In [None]:
# restrict the vocabulary to the 50 words with the highest IDF scores,
# keeping (for now) the column index each word had in the full vocabulary
# https://stackoverflow.com/questions/6827834/how-to-filter-a-dict-to-contain-only-keys-in-a-given-list
# https://stackoverflow.com/questions/30661990/how-to-filter-python-dictionary-by-another-dictionary

keys = list(top50_idf)
filtered_vocab = {word: unique_words[word] for word in keys if word in unique_words}
len(filtered_vocab)

50

In [None]:
# random check post filter
list(filtered_vocab.items())[:10]

[('aailiyah', 0),
 ('abandoned', 1),
 ('abroad', 3),
 ('abstruse', 5),
 ('academy', 7),
 ('accents', 8),
 ('accessible', 9),
 ('acclaimed', 10),
 ('accolades', 11),
 ('accurate', 12)]

In [None]:
# renumber the kept words 0..len-1 so they can be used as matrix column indices
filtered_vocab = dict(zip(filtered_vocab, range(len(filtered_vocab))))
list(filtered_vocab.items())[:10]

[('aailiyah', 0),
 ('abandoned', 1),
 ('abroad', 2),
 ('abstruse', 3),
 ('academy', 4),
 ('accents', 5),
 ('accessible', 6),
 ('acclaimed', 7),
 ('accolades', 8),
 ('accurate', 9)]

In [None]:
# build the TF-IDF matrix for the whole corpus using the 50-word vocabulary,
# then display the row for the first doc in corpus
output = transform(corpus,filtered_vocab,top50_idf)
output[0]

  (0, 30)	1.0
  (68, 24)	1.0
  (72, 29)	1.0
  (74, 31)	1.0
  (119, 33)	1.0
  (135, 3)	0.37796447300922725
  (135, 10)	0.37796447300922725
  (135, 18)	0.37796447300922725
  (135, 20)	0.37796447300922725
  (135, 36)	0.37796447300922725
  (135, 40)	0.37796447300922725
  (135, 41)	0.37796447300922725
  (176, 49)	1.0
  (181, 13)	1.0
  (192, 21)	1.0
  (193, 23)	1.0
  (216, 2)	1.0
  (222, 47)	1.0
  (225, 19)	1.0
  (227, 17)	1.0
  (241, 44)	1.0
  (270, 1)	1.0
  (290, 25)	1.0
  (333, 26)	1.0
  (334, 15)	1.0
  (341, 43)	1.0
  (344, 42)	1.0
  (348, 8)	1.0
  (377, 37)	1.0
  (409, 5)	1.0
  (430, 39)	1.0
  (457, 45)	1.0
  (461, 4)	1.0
  (465, 38)	1.0
  (475, 35)	1.0
  (493, 6)	1.0
  (500, 48)	1.0
  (548, 0)	0.7071067811865475
  (548, 32)	0.7071067811865475
  (608, 14)	1.0
  (612, 11)	1.0
  (620, 46)	1.0
  (632, 7)	1.0
  (644, 12)	0.7071067811865475
  (644, 27)	0.7071067811865475
  (664, 28)	1.0
  (667, 22)	1.0
  (691, 34)	1.0
  (697, 9)	1.0
  (722, 16)	1.0


  self._set_intXint(row, col, x.flat[0])


<1x50 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [None]:
output.shape # 746 docs in corpus and 50 words in vocab

(746, 50)

In [None]:
output[0].toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]])

In [None]:
output[0].toarray().shape

(1, 50)

Note: The result from custom implementation returns a sparse representation matrix of shape (746,50), matching the no. of docs in dataset and no. of words in vocabulary.