In [57]:

from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,SnowballStemmer,WordNetLemmatizer
import json
import os
from tqdm.notebook import tqdm,tnrange
import string
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import defaultdict
from tabulate import tabulate
from functools import reduce
import copy

# Fetch the NLTK resources the notebook relies on: the "punkt" tokenizer
# models (for word_tokenize) and the "stopwords" corpus (for
# stopwords.words('english')). Both are used in the preProcess methods below.
nltk.download("punkt")
# Last expression of the cell, so its return value is what the cell displays.
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/sandeep/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sandeep/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Positional Index

In [23]:
class PositionalIndex():
    '''
    In-memory positional inverted index.

    self.db maps every normalised term to a two-element list
    [totalOccurrences, {fileId: [positions]}], where each position is the
    0-based offset of the term in the pre-processed token stream of that file.
    '''
    def __init__(self):
        self.db={}
        self._stopWords=None    # filled lazily by preProcess, then reused

    def __str__(self):
        return "Keys:"+str(len(self.db))

    def stripSpecialChar(self,text):
        '''
        Returns text with every character removed that is not alphanumeric,
        and with digits removed as well.
        '''
        # str.isalnum() is already False for every character in
        # string.punctuation, so no separate punctuation check is needed.
        return ''.join(ch for ch in text if ch.isalnum() and not ch.isdigit())

    def preProcess(self,text):
        '''
        Normalises raw text into the list of tokens that gets indexed.

        Steps, in order: lower-case, tokenize with word_tokenize, drop English
        stop words, strip special characters/digits from each token, and drop
        tokens shorter than two characters. Stop words are removed before
        stripping, so a token that only turns into a stop word after stripping
        is kept.
        '''
        if self._stopWords is None:
            # Build the set once instead of once per indexed file.
            self._stopWords = set(stopwords.words('english'))

        text = text.lower()                                     # convert all text to lower case
        text_tokens = word_tokenize(text)                       # tokenizing the text

        validTokens = [i for i in text_tokens if i not in self._stopWords]    # removing stop words

        validTokens = [self.stripSpecialChar(x) for x in validTokens]   # stripping special characters
        validTokens = [x for x in validTokens if len(x) > 1]    # Choosing only words which have length > 1
        return validTokens

    def indexFile(self,file,fileId):
        '''
        Indexes the text passed as `file` under the identifier `fileId`.

        For every token produced by preProcess, the term's total count is
        incremented and the token position is appended to the term's position
        list for this fileId.
        '''
        for pos,term in enumerate(self.preProcess(file)):
            entry = self.db.setdefault(term, [0, {}])
            entry[0] += 1
            entry[1].setdefault(fileId, []).append(pos)

    def _termFrequencies(self):
        '''
        Returns {term: total number of occurrences across all indexed files}.
        '''
        return {term: entry[0] for term, entry in self.db.items()}

    def generateWordcloud(self):
        '''
        Creates a wordcloud to visualize the frequency of words in the index
        '''
        # Weight each word by its stored occurrence count. len(self.db[key])
        # is always 2 (the [count, postings] pair) and would give every word
        # the same weight.
        frequencyDict = self._termFrequencies()
        wordcloud = WordCloud().generate_from_frequencies(frequencyDict)
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")

    def save(self, path='output.json'):
        '''
        Saves the index as JSON to `path` (default 'output.json').

        JSON object keys are strings, so non-string file ids (e.g. ints) are
        written as their string form.
        '''
        # The context manager guarantees the file is flushed and closed.
        with open(path, "w") as indexFile:
            json.dump(self.db, indexFile)
   



In [11]:
# Collect every file below the dataset directory. A file's position in
# filePaths is used as its document id in the index, and mapping.json stores
# that id -> path mapping.
filePaths = []
for dirPath, _dirNames, fileNames in os.walk("Dataset/stories"):
    for fileName in fileNames:
        filePaths.append(os.path.join(dirPath, fileName))

with open("mapping.json", "w") as mappingFile:
    json.dump(filePaths, mappingFile)

index = PositionalIndex()

for fileId, filePath in enumerate(tqdm(filePaths)):
    try:
        with open(filePath, encoding="utf8") as storyFile:
            text = storyFile.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: retry with the fallback codec. The `with` blocks
        # make sure neither handle is leaked when decoding fails.
        with open(filePath, encoding="unicode_escape") as storyFile:
            text = storyFile.read()
    index.indexFile(text.replace('\n', ' '), fileId)

# Write the index once, after all files are processed; saving inside the loop
# re-serialised the whole index for every file.
index.save()


  0%|          | 0/467 [00:00<?, ?it/s]

## Query

In [83]:
class Query():
    '''
    Looks up phrases in the positional index stored as JSON on disk.

    self.db maps each term to a list whose element at index 1 is a dict
    {docId: [positions]}. Because the index is read from JSON, the docIds
    are strings. self.db is a plain dict, so looking up an unknown term never
    inserts an entry.
    '''
    def __init__(self, indexPath='output.json'):
        '''
        initializes the object with loading the index file
        '''
        self.indexPath = indexPath
        self.db = {}
        self.load()

    def load(self, path=None):
        '''
        (Re)loads the index from `path`, or from the path given at
        construction when `path` is None.
        '''
        if path is not None:
            self.indexPath = path
        with open(self.indexPath) as indexFile:
            self.db = json.load(indexFile)

    def stripSpecialChar(self,text):
        '''
        Returns text with every character removed that is not alphanumeric,
        and with digits removed as well.
        '''
        # str.isalnum() is already False for every character in
        # string.punctuation, so no separate punctuation check is needed.
        return ''.join(ch for ch in text if ch.isalnum() and not ch.isdigit())

    def preProcess(self,text):
        '''
        Normalises a query string into index terms: lower-case, tokenize,
        drop English stop words, strip special characters/digits and drop
        tokens shorter than two characters.

        This has to stay identical to the pre-processing used when the index
        was built, otherwise query terms will not match indexed terms.
        '''
        stopWords = set(stopwords.words('english'))

        text = text.lower()                                     # convert all text to lower case
        text_tokens = word_tokenize(text)                       # tokenizing the text

        validTokens = [i for i in text_tokens if i not in stopWords]    # removing stop words

        validTokens = [self.stripSpecialChar(x) for x in validTokens]   # stripping special characters
        validTokens = [x for x in validTokens if len(x) > 1]    # Choosing only words which have length > 1
        return validTokens

    def intersectLists(self,lists):
        '''
        Returns the elements common to all the given lists, in no particular
        order. An empty input gives []; a single list is returned as a copy.
        The argument itself is left untouched.
        '''
        if len(lists)==0:
            return []
        if len(lists)==1:
            return list(lists[0])

        # Start from the shortest list so the running intersection stays small.
        ordered = sorted(lists, key=len)
        common = set(ordered[0])
        for other in ordered[1:]:
            common.intersection_update(other)
            if not common:
                break
        return list(common)

    def getPostings(self, terms):
        '''
        Returns, for each term, its {docId: [positions]} dict; unknown terms
        yield an empty dict.
        '''
        return [ self.db[term][1] if term in self.db else {} for term in terms ]

    def getDocsFromPostings(self, postings):
        '''
        Returns, for each postings dict, the list of its document ids.
        '''
        return [ list(p) for p in postings ]

    def _alignPostings(self, terms):
        '''
        Core of performQuery, working on already pre-processed terms.

        Returns [] if any term is missing from the index. Otherwise returns one
        list per term; each holds, for every document containing all the
        terms, that term's positions shifted left by the term's index in
        `terms`.

        Because of the shift, a document contains the exact phrase when one
        value occurs in every term's shifted list for that document. That
        final intersection is not performed here.
        '''
        if any(term not in self.db for term in terms):
            return []

        postings = self.getPostings(terms)
        # A set gives constant-time membership tests in the filter below.
        commonDocs = set(self.intersectLists(self.getDocsFromPostings(postings)))

        # The comprehensions build new lists, so the index itself is never
        # modified and no defensive copy is required.
        return [
            [[pos - offset for pos in positions]
             for docId, positions in posting.items() if docId in commonDocs]
            for offset, posting in enumerate(postings)
        ]

    def performQuery(self,phrase):
        '''
        Pre-processes `phrase` and returns the aligned position lists of its
        terms (see _alignPostings), or [] when a term is not in the index.
        '''
        terms=self.preProcess(phrase)

        if(len(self.db)==0):
            self.load()

        return self._alignPostings(terms)

In [84]:
# Load the saved index ('output.json') and run a sample two-term query.
query = Query()
# Alternative kept for reference: read the phrase interactively.
# phrase=list(input().split())
# The cell displays performQuery's return value: one list per query term,
# each holding position lists shifted left by the term's index in the phrase.
query.performQuery("aytori psychia")

['aytori', 'psychia']


[[[395,
   421,
   1200,
   1396,
   1402,
   1443,
   1684,
   1698,
   1716,
   1760,
   1851,
   1865,
   1871,
   2102,
   2194],
  [854,
   871,
   902,
   926,
   1078,
   1279,
   1305,
   1409,
   1592,
   2112,
   2226,
   2275,
   2670,
   2705,
   2714,
   2832,
   2983,
   3026,
   3035,
   3052]],
 [[384, 429, 1817, 2199], [89, 131, 1517, 1533, 1544, 2130, 2558]]]