In [1]:
import os
import json
import string
import pickle 
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm,tnrange
from collections import Counter
#nltk.download("punkt")
#nltk.download("stopwords")
class InvertedIndex():
    """Inverted index that maps each normalised term to the files containing it.

    The index keeps two containers:
      * DFpostings  - dict: term -> list of file ids whose text contains the term
      * termsInFile - list: one token list per indexed file, in indexing order
    Both are JSON-serialisable and are written out by save().
    """

    def __init__(self):
        self.DFpostings={}                # term -> posting list (file ids) for that term
        self.termsInFile=[]               # one list of tokens per indexed file

    def stripSpecialChar(self,text):
        """Return `text` keeping only characters that are alphanumeric and not digits."""
        # Punctuation characters are never alphanumeric, so isalnum() already
        # filters them out; no separate string.punctuation test is needed.
        return ''.join(ch for ch in text if ch.isalnum() and not ch.isdigit())

    def preProcess(self,text):
        """Normalise raw text into index terms.

        Steps, in order: lower-case, tokenize, stem, drop stop words, strip
        non-letter characters, drop tokens of length <= 1.

        Returns:
            (tokens, unique_tokens): the surviving tokens as a list (duplicates
            kept, original order) and the same tokens as a set.
        """
        stemmer = SnowballStemmer("english")
        stopWords = set(stopwords.words('english'))

        text = text.lower()                                     # convert all text to lower case
        text_tokens = word_tokenize(text)                       # tokenizing the text

        # NOTE(review): stop words are filtered AFTER stemming, so a stop word
        # whose stem differs from the word itself would slip through. The order
        # is kept because the module-level preProcess used for queries applies
        # the same order, and both must agree for lookups to match.
        stemmedWords = [stemmer.stem(word) for word in text_tokens]         # Stemming
        validTokens = [i for i in stemmedWords if i not in stopWords]       # Removing Stop Words

        validTokens = [self.stripSpecialChar(x) for x in validTokens]   # stripping special characters
        validTokens = [x for x in validTokens if len(x) > 1]    # Choosing only words which has length > 1
        return validTokens, set(validTokens)

    def indexFile(self,file,fileId):
        '''
        Creates Index for each File

        `file` is the text to index and `fileId` the identifier recorded in the
        posting list of every distinct term found in that text.
        '''
        tokens, setTokens = self.preProcess(file)
        self.termsInFile.append(tokens)
        for term in setTokens:
            # Each file id is appended once per term because setTokens is a set.
            self.DFpostings.setdefault(term, []).append(fileId)

    def save(self,file):
        '''
        Save the index to a file locally

        Writes two JSON files in the current directory, named
        "DFPostings" + file and "TermsInfile" + file.
        '''
        # Context managers make sure both files are flushed and closed even if
        # serialisation fails part-way through.
        with open("DFPostings"+file, "w") as postings_out:
            json.dump(self.DFpostings, postings_out)
        with open("TermsInfile"+file, "w") as terms_out:
            json.dump(self.termsInFile, terms_out)

In [2]:
class tf_idfmatrices():
    """Dense documents x vocabulary TF-IDF matrix built from an inverted index.

    Args:
        DFpostings: dict mapping each term to the list of documents containing it.
        tokens: sequence whose i-th item is the token list of document i.
        docs_count: number of documents (rows of the matrix).

    Call generateIDF(), generateCounterLists() and generateTermFreq() in that
    order; the result is stored in `tf_idf_TermFreq`.
    """

    def __init__(self,DFpostings, tokens, docs_count):
        self.DFpostings = DFpostings
        self.tokens = tokens
        self.docs_count = docs_count
        self.vocab_count = len(DFpostings)
        self.vocabulary = list(DFpostings)      # column order of the matrix
        self.idf = dict()                       # term -> inverse document frequency
        self.counter_lists = []                 # per-document term counts
        self.tf_idf_TermFreq = np.zeros((docs_count, self.vocab_count))

    def generateIDF(self):
        """Fill `idf` with log10(docs_count / document_frequency + 1) per term."""
        for key, postings in self.DFpostings.items():
            doc_freq = len(postings)
            self.idf[key] = np.log10((self.docs_count/doc_freq) +1)

    def generateCounterLists(self):
        """Count term occurrences for each of the first `docs_count` documents."""
        # Rebuilt from scratch so that calling this twice does not duplicate entries.
        self.counter_lists = [Counter(self.tokens[i]) for i in tnrange(self.docs_count)]

    def generateTermFreq(self):
        """Fill `tf_idf_TermFreq[doc][term]` with (count / doc length) * idf.

        Requires generateIDF() and generateCounterLists() to have run first.
        Documents with no tokens keep an all-zero row instead of dividing by zero.
        """
        column_of = {term: col for col, term in enumerate(self.vocabulary)}
        self.tf_idf_TermFreq.fill(0.0)
        for doc in tnrange(self.docs_count):
            doc_length = len(self.tokens[doc])
            if doc_length == 0:
                continue
            row = self.tf_idf_TermFreq[doc]
            # Only terms that occur in the document can be non-zero, so visit
            # those instead of scanning the whole vocabulary for every document.
            for term, count in self.counter_lists[doc].items():
                col = column_of.get(term)
                if col is not None:          # ignore tokens outside the vocabulary
                    row[col] = (count/doc_length)*self.idf[term]


In [4]:
def generateScore(DFpostings, termsInFile, docs_count=None):
    """Build and fully populate a tf_idfmatrices object for one corpus.

    Args:
        DFpostings: dict mapping each term to the documents containing it.
        termsInFile: sequence of per-document token lists.
        docs_count: number of documents to score. Defaults to
            len(termsInFile); pass an explicit value (e.g. the previously
            hard-coded 19241) to override.

    Returns:
        The tf_idfmatrices instance after IDF, per-document counts and the
        TF-IDF matrix have been generated.
    """
    if docs_count is None:
        docs_count = len(termsInFile)
    work_objthis = tf_idfmatrices(DFpostings, termsInFile, docs_count)
    work_objthis.generateIDF()
    work_objthis.generateCounterLists()
    work_objthis.generateTermFreq()
    return work_objthis


# Restore the previously pickled index objects (document bodies and titles).
# NOTE: unpickling can execute arbitrary code; only load .obj files that were
# produced by this project.
with open('ProjectFile.obj', 'rb') as body_stream:
    file_obj = pickle.load(body_stream)

with open('ProjectTitle.obj', 'rb') as title_stream:
    title_obj = pickle.load(title_stream)

#fileScore = generateScore(file_obj.DFpostings, file_obj.termsInFile)
#titleScore = generateScore(title_obj.DFpostings, title_obj.termsInFile)


HBox(children=(FloatProgress(value=0.0, max=19241.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3423.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=19241.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5686.0), HTML(value='')))




In [73]:
#serialized = pickle.dumps(fileScore)
#filename = 'TF_IDF_Calculated_File.obj'
#with open(filename,'wb') as file_object:
#    file_object.write(serialized)
    
#serialized = pickle.dumps(titleScore)
#filename = 'TF_IDF_Calculated_Title.obj'
#with open(filename,'wb') as file_object:
#    file_object.write(serialized)
  
# Load the cached TF-IDF objects instead of recomputing them.
# NOTE: unpickling can execute arbitrary code; only load .obj files that were
# produced by this project.
with open('TF_IDF_Calculated_File.obj', 'rb') as body_stream:
    file_tf_idf_obj = pickle.load(body_stream)

with open('TF_IDF_Calculated_Title.obj', 'rb') as title_stream:
    title_tf_idf_obj = pickle.load(title_stream)

In [74]:
def stripSpecialChar(text):
    """Return `text` with digits, punctuation and other non-alphanumeric characters removed."""
    kept = []
    for ch in text:
        # Skip anything that is not alphanumeric, is a digit, or is punctuation.
        if not ch.isalnum() or ch.isdigit() or ch in string.punctuation:
            continue
        kept.append(ch)
    return ''.join(kept)

def preProcess(text):
    """Turn a query string into the set of normalised terms used for lookup.

    Pipeline, in order: lower-case, tokenize, stem, drop stop words, strip
    special characters, keep only terms longer than one character.
    """
    stemmer = SnowballStemmer("english")
    stop_words = set(stopwords.words('english'))

    terms = set()
    for token in word_tokenize(text.lower()):
        stem = stemmer.stem(token)
        if stem in stop_words:              # stop words are checked on the stemmed form
            continue
        cleaned = stripSpecialChar(stem)
        if len(cleaned) > 1:
            terms.add(cleaned)
    return terms

In [75]:
def getscore(objtype, listofwords, scores=None):
    """Add each document's TF-IDF weight for the query terms to a score column.

    Rather than building a query vector, the matrix column of each query term
    is looked up and added to the running score of every document.

    Args:
        objtype: object exposing `vocabulary` (list of terms, in matrix column
            order), `docs_count` and `tf_idf_TermFreq` (documents x terms).
        listofwords: iterable of pre-processed query terms.
        scores: 2-D array whose column 0 accumulates the scores, updated in
            place. Defaults to the module-level `query_eval`.

    Query terms that are not in `objtype.vocabulary` contribute nothing; they
    are skipped instead of raising ValueError, so the same query can be scored
    against vocabularies that differ (e.g. titles and bodies).
    """
    if scores is None:
        scores = query_eval
    doc_total = objtype.docs_count
    for word in listofwords:
        try:
            column = objtype.vocabulary.index(word)
        except ValueError:
            continue        # term unknown to this vocabulary
        # One column add per term; per document this is the same sequence of
        # additions as looping over documents and terms individually.
        scores[:doc_total, 0] += objtype.tf_idf_TermFreq[:doc_total, column]
        
def Top5(alist):
    """Return the indices of the (up to) five largest values, largest first.

    Ties keep their original relative order.
    """
    ranked = sorted(enumerate(alist), key=lambda pair: pair[1], reverse=True)
    return [position for position, _ in ranked[:5]]

def results():
    """Print a blank line and return the indices of the five best-scoring documents.

    Scores are read from column 0 of the module-level `query_eval` array.
    """
    print()
    totals = query_eval[:,0]
    return Top5(totals)

In [76]:
# Read a free-text query and reduce it to the set of normalised terms.
sentence_query = input("Enter Input:-")
listofwords = preProcess(sentence_query)

# One score per document; title and body weights are accumulated into the same
# column, so both objects are expected to describe the same documents in the
# same order.
query_eval = np.zeros((title_tf_idf_obj.docs_count,1))
getscore(title_tf_idf_obj, listofwords)
getscore(file_tf_idf_obj, listofwords)

ans = results()

# Final_mapping.json is indexed by document position and yields a file path.
with open('Final_mapping.json') as f:
    d = json.load(f)

for i in ans:
    # Close each result file as soon as its first line has been shown.
    with open(d[i],"r") as file1:
        # NOTE(review): the [15:-4] slice presumably strips a fixed-length
        # directory prefix and a 4-character extension - confirm against the
        # paths stored in Final_mapping.json.
        print(d[i][15:-4])
        print(file1.readline())


Enter Input:-vanilla mango

20011
Mango Cream

10072
Vanilla Kifli

13763
Vanilla Pudding

21751
Vanilla Sauce

8071
Mango Cake

