# Imports

In [48]:
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
import json
import os
from tqdm.notebook import tqdm,tnrange
import string
import numpy as np

# Fetch the NLTK resources this notebook relies on: the "punkt" tokenizer
# models (needed by word_tokenize) and the "stopwords" corpus.
# NOTE(review): recent NLTK releases may look for "punkt_tab" instead of
# "punkt" - confirm against the installed NLTK version.
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/sandeep/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sandeep/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Inverted Index

In [13]:

class InvertedIndex():
	"""In-memory inverted index: normalised token -> ids of the files containing it.

	Files are added one at a time with ``indexFile`` and the whole index is
	written to disk as JSON with ``save``.
	"""

	def __init__(self):
		# token -> list of file ids, in the order the files were indexed
		self.db={}
		# English stop words; loaded from NLTK on first use and then cached
		self._stopWords=None

	def __str__(self):
		"""Return ``"Keys:<n>"`` where ``n`` is the number of distinct tokens indexed."""
		return "Keys:"+str(len(self.db))

	def stripSpecialChar(self,text):
		"""Return ``text`` keeping only characters that are alphanumeric and not digits.

		ASCII punctuation never passes ``isalnum()``, so no separate punctuation
		check is needed.
		"""
		return ''.join(ch for ch in text if ch.isalnum() and not ch.isdigit())

	def _loadStopWords(self):
		"""Return the English stop-word set, reading it from NLTK only once per instance."""
		# getattr keeps this working for instances built without __init__.
		cached = getattr(self, "_stopWords", None)
		if cached is None:
			cached = frozenset(stopwords.words('english'))
			self._stopWords = cached
		return cached

	def _normalizeTokens(self,tokens,stopWords):
		"""Turn raw tokens into the sorted list of unique index terms.

		Args:
			tokens: iterable of (already lower-cased) token strings.
			stopWords: collection of words that must not be indexed.

		Returns:
			Sorted list of distinct stripped tokens longer than one character.
		"""
		cleaned = set()
		# Stop words are removed on the raw token first, because the NLTK list
		# contains entries with apostrophes that stripping would destroy.
		for token in set(tokens) - set(stopWords):
			stripped = self.stripSpecialChar(token)
			# Check again after stripping: "the-" becomes "the", and two raw
			# tokens may collapse to the same term (the set removes those).
			if len(stripped) > 1 and stripped not in stopWords:
				cleaned.add(stripped)
		# Sorted so that the result does not depend on set iteration order.
		return sorted(cleaned)

	def preProcess(self,text):
		"""Lower-case and tokenize ``text`` and return its unique index terms.

		Args:
			text: raw document text.

		Returns:
			Sorted list of distinct terms with stop words, digits, special
			characters and one-character tokens removed.
		"""
		text_tokens = word_tokenize(text.lower())
		return self._normalizeTokens(text_tokens, self._loadStopWords())

	def indexFile(self,file,fileId):
		"""Add one document to the index.

		Args:
			file: the document's text (not a path or a file object).
			fileId: identifier appended to the posting list of every term
				found in the document.
		"""
		for token in self.preProcess(file):
			self.db.setdefault(token, []).append(fileId)

	def save(self,path="output.json"):
		"""Write the index to ``path`` as JSON (default ``output.json``)."""
		# The context manager guarantees the data is flushed and the handle closed.
		with open(path, "w") as handle:
			json.dump(self.db, handle)


# Creating Index

In [14]:
# Collect every file below the corpus directory. A file's position in
# filePaths is the id stored in the index, so the list itself is persisted
# as the id -> path mapping used when displaying query results.
allFiles = os.walk("stories")
filePaths = []
for dirPath, _dirNames, fileNames in allFiles:
    for fileName in fileNames:
        filePaths.append(os.path.join(dirPath, fileName))

with open("mapping.json", "w") as mappingFile:
    json.dump(filePaths, mappingFile)

index = InvertedIndex()

for fileId, filePath in enumerate(tqdm(filePaths)):
    try:
        with open(filePath, encoding="utf8") as handle:
            text = handle.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: re-read the SAME file as Latin-1, which accepts
        # any byte sequence, so the document is indexed under its own id.
        with open(filePath, encoding="Latin1") as handle:
            text = handle.read()
    index.indexFile(text.replace('\n', ' '), fileId)

# Write the index once, after all files are processed, instead of
# re-serialising the whole (growing) index after every single file.
index.save()


  0%|          | 0/467 [00:00<?, ?it/s]

# Query

In [54]:
class Query():
    """Boolean operators over an inverted index loaded from a JSON file.

    The operand objects passed to OR / AND / ANDNOT must provide ``add``,
    ``intersect`` and ``diff`` methods that each return a new result object.
    """

    def __init__(self):
        # token -> list of file ids; empty until loadIndex is called
        self.db={}

    def loadIndex(self,file):
        """Load the index stored as JSON at path ``file`` into ``self.db``."""
        with open(file) as handle:
            self.db=json.load(handle)

    def OR(self,term1,term2):
        """Return the files matching ``term1`` or ``term2`` (union)."""
        return term1.add(term2)

    def AND(self,term1,term2):
        """Return the files matching both ``term1`` and ``term2`` (intersection)."""
        return term1.intersect(term2)

    def ANDNOT(self,term1,term2):
        """Return the files matching ``term1`` but not ``term2``.

        ``A AND NOT B`` is the set difference ``A - B``, so no universe of
        all document ids is needed; the result is correct for any corpus size.
        """
        return term1.diff(term2)



In [55]:
class Response():
    """Set of file ids produced by a query, with set-algebra helpers."""

    def __init__(self,data):
        # Unique file ids; any iterable is accepted and duplicates are dropped.
        self.data=set(data)

    def getMapping(self,file):
        """Print the path of every file id in this response.

        Args:
            file: path of a JSON file holding the list of file paths, indexed
                by file id. The loaded list is kept in ``self.mapping``.
        """
        with open(file) as handle:
            self.mapping=json.load(handle)
        # Sorted so the printed order is stable rather than set-iteration order.
        for fileId in sorted(self.data):
            print(self.mapping[fileId])

    def __str__(self):
        """Return the number of file ids in the response, as a string."""
        return str(len(self.data))

    def add(self,response):
        """Return a new Response with the ids present in either operand."""
        return Response(self.data | response.data)

    def intersect(self,response):
        """Return a new Response with the ids present in both operands."""
        return Response(self.data & response.data)

    def diff(self,response):
        """Return a new Response with the ids of ``self`` that are not in ``response``."""
        return Response(self.data - response.data)
        

In [56]:
# Create the query engine and load the token -> file-id index from
# "output.json" (the file name InvertedIndex.save writes to).
query=Query()
query.loadIndex("output.json")


In [57]:
# Evaluate the query terms left to right, starting from the posting list of
# the first term and combining each later term with the running result.
queries = ["stink", "middle", "pronounced"]
firstTerm = queries[0]
output = Response(query.db[firstTerm])
for position in tnrange(1, len(queries)):
    term = queries[position]
    curr = Response(query.db[term])
    # NOTE(review): the new term is the LEFT operand, so each step keeps the
    # files containing this term that are NOT in the running result - confirm
    # this operand order is intended.
    output = query.ANDNOT(curr, output)

# Response.__str__ yields the number of matching files.
print(output)


  0%|          | 0/2 [00:00<?, ?it/s]

8


In [60]:
# Print the path of every matching file (looked up by id in mapping.json),
# then display the raw set of matching file ids as the cell result.
output.getMapping("mapping.json")
output.data

stories/grav
stories/forgotte
stories/wisteria.txt
stories/ltp
stories/abbey.txt
stories/arcadia.sty
stories/gulliver.txt
stories/darkness.txt


{0, 35, 107, 143, 161, 194, 258, 268}