In [21]:
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,SnowballStemmer,WordNetLemmatizer
import json
import os
from tqdm.notebook import tqdm,tnrange
import string
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import defaultdict

nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to C:\Users\Sajeel
[nltk_data]     Khan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Sajeel
[nltk_data]     Khan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Query

In [22]:
class Response():
    def __init__(self,data):
        self.data=set(data)

    def getMapping(self,file):
        '''
         Prints out the corresponding document names from list of document IDs
        '''
        self.mapping=json.load(open(file))
        for i in self.data:
            print(self.mapping[i])
    
    def __str__(self):
        return str(len(self.data))

    def add(self,response):
        '''
        Performs the set union of given object and pass object
        '''
        return Response(set.union(self.data,response.data))
        
    def intersect(self,response):
        '''
        Performs the set intersection of given object with pass object
        '''
        return Response(set.intersection(self.data,response.data))

    def diff(self,response):
        '''
        Performs the set difference of given object with pass object
        '''
        return Response(set.difference(self.data,response.data))
        

In [33]:
class Query():
    def __init__(self,file):
        '''
        initializes the object with loading the index file
        '''
        self.db=json.load(open(file))
        self.db=defaultdict(lambda:[],self.db)        

    def OR(self,term1,term2):
        '''
        Finds the docs after applying OR operation on given list of documents
        '''
        return term1.add(term2)
    
    def AND(self,term1,term2):
        '''
        Finds the docs after applying AND operation on given list of documents
        '''
        return term1.intersect(term2)
    
    def ANDNOT(self,term1,term2):
        '''
        Finds the docs after applying AND NOT operation on given list of documents
        '''
        univ=Response(np.arange(467))
        not_term2=univ.diff(term2)
        return term1.intersect(not_term2)

    def ORNOT(self,term1,term2):
        '''
        Finds the docs after applying OR NOT operation on given list of documents
        '''
        univ=Response(np.arange(467))
        not_term2=univ.diff(term2)
        joint = term1.intersect(term2)
        return not_term2.add(joint)

    def count(self,first,second):
        i,j,count=0,0,0
        while(i<len(first) and j<len(second)):
            count+=1
            if(first[i]<second[j]):
                i+=1          
            elif(first[i]>second[j]):
                j+=1
            else:
                i+=1
                j+=1
        return count

    def no_comparisonsOR(self,term1, term2):
        '''
        To return the number of comparisons it will make in OR operations between two list of documents
        '''
        first = list(term1.data)
        first.sort()
        second =list(term2.data)
        second.sort()
        return self.count(first,second)    

    def no_comparisonsAND(self,term1, term2):
        '''
        To return the number of comparisons it will make in AND operations between two list of documents
        '''
        first = list(term1.data)
        first.sort()
        second =list(term2.data)
        second.sort()
        return self.count(first,second) 
    
    def no_comparisonsANDNOT(self,term1, term2):
        '''
        To return the number of comparisons it will make in AND NOT operations between two list of documents
        '''
        first = list(term1.data)
        first.sort()
        univ=Response(np.arange(467)) 
        not_term2=univ.diff(term2) 
        second=list(not_term2.data)
        second.sort()
        return self.count(first,second) 
    
    def no_comparisonsORNOT(self,term1, term2):
        '''
        To return the number of comparisons it will make in OR NOT operations between two list of documents
        '''
        first = list(term1.data)
        first.sort()
        univ=Response(np.arange(467)) 
        not_term2=univ.diff(term2) 
        second=list(not_term2.data)
        second.sort()
    
        return self.count(first,second)  

    def stripSpecialChar(self,text):
        return ''.join(ch for ch in text if ch.isalnum() and not ch.isdigit() and ch not in string.punctuation)

    def preProcess(self,text):
        stemmer = SnowballStemmer("english")
        stopWords = set(stopwords.words('english'))

        text = text.lower()                                     # convert all text to lower case
        text_tokens = word_tokenize(text)                       # tokenizing the text

        # stemmedWords = list([stemmer.stem(word) for word in text_tokens])
        # validTokens = [i for i in stemmedWords if i not in stopWords]

        validTokens = [i for i in text_tokens if i not in stopWords]    # removing stop words

        validTokens = [self.stripSpecialChar(x) for x in validTokens]   # stripping special characters
        validTokens = [x for x in validTokens if len(x) > 1]    # Choosing only words which has length > 1
        return validTokens
    
    def processQuery(self,inp,ops):
        '''
        Performs query with given string and list of operations
        '''
        terms=self.preProcess(inp)
        print(terms)
        output=Response(self.db[terms[0]])
        comparisons=0
        for i in tnrange(1,len(terms)):
            curr=Response(self.db[terms[i]])
            if(ops[i-1]=='OR'):
                output=self.OR(output, curr)
                comparisons+=self.no_comparisonsOR(output,curr)
            elif(ops[i-1]=='AND'):
                output=self.AND(output, curr)
                comparisons+=self.no_comparisonsAND(output,curr)
            elif(ops[i-1]=='OR NOT'):
                output=self.ORNOT(output, curr)
                comparisons+=self.no_comparisonsORNOT(output,curr)
            elif(ops[i-1]=='AND NOT'):
                output=self.ANDNOT(output, curr)
                comparisons+=self.no_comparisonsANDNOT(output,curr)
            else:
                raise Exception("Operand not Identified:"+ops[i-1])

        print("Number of documents matched:",output)
        print("No. of comparisons required:",comparisons)
        docs_list = list(output.data)
        docs_list.sort()
        print(docs_list)


In [34]:
n = int(input("Enter the number of queries:"))
for _ in range(n):
    sentence_query = input("Enter the sentence:-")
    
    # Performing preprocessing(splitting, uppercase, stripping space from endpoints) on the operand input
    operands_=list(map(str.strip,input("Enter the operands:-").upper().split(",")))

    query=Query("output.json")
    query.processQuery(sentence_query,operands_)
    
    
    
    

Enter the number of queries:1
Enter the sentence:-SNOWBANK, STUBBED, LEGIONS
Enter the operands:-and not, or
['snowbank', 'stubbed', 'legions']


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Number of documents matched: 2
No. of comparisons required: 2
[64, 127]
