In [None]:

from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,SnowballStemmer,WordNetLemmatizer
import json
import os
from tqdm.notebook import tqdm,tnrange
import string
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import defaultdict
from tabulate import tabulate
from functools import reduce
import copy

# Fetch the NLTK data used below: the Punkt sentence-tokenizer models needed by
# word_tokenize and the stop-word lists. NLTK 3.9 and later load the tokenizer
# from the "punkt_tab" package while older releases use "punkt", so both are
# requested to keep the notebook runnable on either.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

# Positional Index

In [None]:
class PositionalIndex():
    '''
    In-memory positional inverted index.

    self.db maps every term to a two-element list
    [total occurrences, {fileId: [positions]}]. A position is the offset of
    the term in the token list returned by preProcess for that file.
    '''

    # Stop-word set, built on first use and shared by all instances so the
    # corpus file is not re-read for every indexed document.
    _stopWords = None

    def __init__(self):
        self.db={}

    def __str__(self):
        return "Keys:"+str(len(self.db))

    def stripSpecialChar(self,text):
        '''
        Returns text with every character removed that is not alphanumeric
        or that is a digit.
        '''
        # Punctuation never satisfies isalnum(), so no separate check is needed.
        return ''.join(ch for ch in text if ch.isalnum() and not ch.isdigit())

    def preProcess(self,text):
        '''
        Lower-cases and tokenizes text, drops English stop words and strips
        special characters from the remaining tokens.

        Tokens that become empty after stripping (pure punctuation or digits)
        are kept as '' so they still occupy a position in the returned list.
        '''
        if PositionalIndex._stopWords is None:
            PositionalIndex._stopWords = set(stopwords.words('english'))
        stopWords = PositionalIndex._stopWords

        text_tokens = word_tokenize(text.lower())               # lower case, then tokenize
        validTokens = [i for i in text_tokens if i not in stopWords]    # removing stop words
        return [self.stripSpecialChar(x) for x in validTokens]  # stripping special characters

    def indexFile(self,file,fileId):
        '''
        Indexes the text passed as file under the given fileId.

        For every token the total count of its term is incremented and the
        token's position is appended to the term's position list for fileId.
        '''
        for pos,val in enumerate(self.preProcess(file)):
            entry = self.db.setdefault(val,[0,{}])
            entry[0]+=1
            entry[1].setdefault(fileId,[]).append(pos)

    def _termFrequencies(self):
        '''
        Returns {term: total occurrences} for every non-empty term.
        '''
        # The '' placeholder term is skipped: it is not a word and would
        # otherwise be drawn in the cloud.
        return {term: entry[0] for term,entry in self.db.items() if term}

    def generateWordcloud(self):
        '''
        Creates a word cloud to visualize the frequency of words in the index
        '''
        # The weight of a word is its occurrence count (entry[0]); the length
        # of the entry itself is always 2 and carries no information.
        wordcloud = WordCloud().generate_from_frequencies(self._termFrequencies())
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")

    def save(self,path='output.json'):
        '''
        Saves the index as JSON to path (default: output.json in the current
        working directory).

        JSON object keys are always strings, so fileIds are stored as strings.
        '''
        with open(path, "w") as outFile:    # closes the handle even if dump fails
            json.dump(self.db, outFile)
   



In [None]:
# Collect the path of every file below the dataset directory. A file's position
# in filePaths is the document id under which it is indexed.
filePaths = []
for dirPath, dirNames, fileNames in os.walk("../Dataset/stories"):
    for fileName in fileNames:
        filePaths.append(os.path.join(dirPath, fileName))

# Persist the id -> path mapping next to the index.
with open("mapping.json", "w") as mappingFile:
    json.dump(filePaths, mappingFile)

index = PositionalIndex()

for i,filePath in enumerate(tqdm(filePaths)):
    try:
        with open(filePath, encoding="utf8") as file:
            read = file.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to the unicode_escape codec.
        with open(filePath, encoding="unicode_escape") as file:
            read = file.read()
    index.indexFile(read.replace('\n', ' '), i)

# Write the index once, after all files are processed, instead of rewriting the
# whole (growing) index after every single file.
index.save()


## Query

In [None]:
class Query():
    '''
    Answers phrase queries against the positional index stored in output.json.

    self.db maps term -> [total occurrences, {docId: [positions]}]. After
    loading from JSON the docIds are strings; performQuery converts them to
    integers before returning.

    NOTE(review): preProcess here drops tokens of length <= 1, while
    PositionalIndex.preProcess in this notebook has that filter commented out.
    Positions in the index therefore count tokens that are removed from the
    query, so a phrase spanning punctuation or a one-letter word will not
    match -- confirm whether this is intended.
    '''

    def __init__(self):
        '''
        initializes the object with loading the index file
        '''
        self.db=defaultdict(list)
        self.load()

    def load(self):
        '''
        (Re)loads the index from output.json into self.db
        '''
        with open('output.json') as indexFile:
            self.db=defaultdict(list,json.load(indexFile))

    def stripSpecialChar(self,text):
        '''
        Returns text with every character removed that is not alphanumeric
        or that is a digit.
        '''
        # Punctuation never satisfies isalnum(), so no separate check is needed.
        return ''.join(ch for ch in text if ch.isalnum() and not ch.isdigit())

    def preProcess(self,text):
        '''
        Lower-cases and tokenizes text, drops English stop words, strips
        special characters and keeps only tokens longer than one character.
        '''
        stopWords = set(stopwords.words('english'))

        text_tokens = word_tokenize(text.lower())               # lower case, then tokenize
        validTokens = [i for i in text_tokens if i not in stopWords]    # removing stop words
        validTokens = [self.stripSpecialChar(x) for x in validTokens]   # stripping special characters
        return [x for x in validTokens if len(x) > 1]           # Choosing only words which has length > 1

    def intersectLists(self,lists):
        '''
        Takes a 2D list and returns the intersection between them

        The argument is not modified. The order of the returned elements is
        unspecified when more than one list is given.
        '''
        if len(lists)==0:
            return []

        ordered=sorted(lists,key=len)     # shortest first, without reordering the caller's list
        return list(reduce(lambda x,y: set(x)&set(y),ordered))

    def getPostings(self, terms):
        '''
        Takes list of terms and returns a 3D list specifying docID with positions for every term
        '''
        return [ [ [docId, positions] for docId,positions in self.db[term][1].items() ] for term in terms ]

    def getDocsFromPostings(self, postings):
        '''
        Takes list of postings and returns only the document id from that list for every term
        '''
        return [ [x[0] for x in p] for p in postings ]

    def _hasPhrase(self,termPostings,doc):
        '''
        Returns True when the terms occur at consecutive positions in doc.

        termPostings holds one {docId: [positions]} dict per query term, in
        query order; doc must be present in every one of them.
        '''
        # Shifting the positions of the k-th term back by k maps every
        # occurrence of the complete phrase onto one common start position.
        starts=[ [pos-offset for pos in positions[doc]] for offset,positions in enumerate(termPostings) ]
        return len(self.intersectLists(starts))>0

    def performQuery(self,phrase):
        '''
        Returns the ids (as integers) of all documents that contain phrase.

        phrase is the raw query text; a list/tuple of words is accepted as
        well and joined with spaces. An empty list is returned when nothing
        is left of the query after preprocessing or when any of its terms is
        missing from the index. Otherwise the matching documents are also
        printed together with their locations.
        '''
        if not isinstance(phrase,str):
            phrase=' '.join(phrase)

        phrase=self.preProcess(phrase)    # Preprocessing of query
        print("Query:",phrase)

        if(len(self.db)==0):              # if dataset is empty then loading it
            self.load()

        if(len(phrase)==0):               # Nothing left to search for (e.g. only stop words)
            return []

        for term in phrase:               # If any term does not exist in the dataset then return an empty list
            if(term not in self.db):
                return []

        if(len(phrase)==1):               # A single term matches every document it occurs in
            result=list(self.db[phrase[0]][1].keys())

        else:
            termPostings=[self.db[term][1] for term in phrase]
            # documents that contain all the terms
            commonDocs=set(self.intersectLists([list(p) for p in termPostings]))
            # Positions are looked up by document id, so the result does not
            # depend on every term listing its documents in the same order.
            result=[doc for doc in termPostings[0] if doc in commonDocs and self._hasPhrase(termPostings,doc)]

        result=list(map(int,result))    # convert list of strings to list of integers
        self.getMapping(result)         # Showing fileID with their locations
        return result

    def getMapping(self,files):
        '''
        Prints the number of documents in files and a table of each document
        id with its location, read from mapping.json.
        '''
        print('Total Documents:',len(files))

        with open('mapping.json') as mappingFile:
            self.mapping=json.load(mappingFile)
        data=sorted((i,self.mapping[i]) for i in files) # tuples of document id and location

        print(tabulate(data,headers=['Document ID','Location']))


          

In [None]:
query = Query()
# performQuery expects the raw query text and tokenizes it itself; passing the
# result of .split() (a list) fails on text.lower() inside preProcess.
phrase = input()
output = query.performQuery(phrase)
