# Imports

In [1]:
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
import json
import os
from tqdm.notebook import tqdm
import string

nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/sandeep/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sandeep/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Inverted Index

In [2]:

class InvertedIndex():
	def __init__(self):
		self.db={}

	def __str__(self):
		return "Keys:"+str(len(list(self.db.keys())))
	
	def stripSpecialChar(self,text):
		return ''.join(ch for ch in text if ch.isalnum() and not ch.isdigit() and ch not in string.punctuation)

	def preProcess(self,text):
		text = text.lower()
		text_tokens = word_tokenize(text)
		stopWords = set(stopwords.words('english'))
		validTokens = list(set(text_tokens) - set(stopWords))
		validTokens = [self.stripSpecialChar(x) for x in validTokens]
		validTokens = [x for x in validTokens if len(x) > 1]
		return validTokens

	def indexFile(self,file,fileId):
		tokens = self.preProcess(file)
		for i in tokens:
			if i in self.db:
				self.db[i].append(fileId)
			else:
				self.db[i] = [fileId]

	def save(self):
		json.dump(self.db, open("output.json", "w"))


# Creating Index

In [4]:
allFiles = os.walk("stories")
filePaths = []
for i in allFiles:
	for j in i[2]:
		filePath = i[0] + "/" + j
		filePaths.append(filePath)

json.dump(filePaths, open("mapping.json", "w"))

index = InvertedIndex()

for i,filePath in enumerate(tqdm(filePaths)):
    try:
        file = open(filePath, encoding="utf8")
        read = file.read().replace('\n', ' ')
        file.close()
        index.indexFile(read, i)
        index.save()
    
    except Exception as e:
        print("error", filePath,e)


  0%|          | 0/467 [00:00<?, ?it/s]

error stories/tctac.txt 'utf-8' codec can't decode byte 0xda in position 1217: invalid continuation byte
error stories/fred.txt 'utf-8' codec can't decode byte 0xfa in position 11818: invalid start byte
error stories/fea3 'utf-8' codec can't decode byte 0xc4 in position 1807: invalid continuation byte
error stories/mario.txt 'utf-8' codec can't decode byte 0xc4 in position 4557: invalid continuation byte
error stories/snow.txt 'utf-8' codec can't decode byte 0xb7 in position 6: invalid start byte
error stories/dskool.txt 'utf-8' codec can't decode byte 0xe9 in position 32406: invalid continuation byte
error stories/bureau.txt 'utf-8' codec can't decode byte 0xa0 in position 152695: invalid start byte
error stories/prince.art 'utf-8' codec can't decode byte 0xdc in position 1: invalid continuation byte
error stories/quot 'utf-8' codec can't decode byte 0xc9 in position 0: invalid continuation byte
error stories/girlclub.txt 'utf-8' codec can't decode byte 0xd4 in position 3952: invalid 

# Query

In [None]:
class Query():
    def __init__(self):
        pass

    def loadIndex(self,file):
        self.db=json.load(open(file))

    def OR(self,term1,term2):
        return term1.add(term2)
    
    def AND(self,term1,term2):
        return term1.intersect(term2)
    
    def ANDNOT(self,term1,term2):
        temp=Response(sum(list(self.db.copy().values()),[]))
        temp1=temp.diff(term2)
        return term1.intersect(temp1)



In [None]:
class Response():
    def __init__(self,data):
        print(data)
        self.data=set(data)
    
    def __str__(self):
        return str(len(self.data))

    def add(self,response):
        return Response(set.union(self.data,response.data))
        
    def intersect(self,response):
        return Response(set.intersection(self.data,response.data))

    def diff(self,response):
        return Response(set.difference(self.data,response.data))
        

In [None]:
query=Query()
query.loadIndex("output.json")


In [None]:
a=Response(query.db["stink"])
b=Response(query.db["pronounced"])
print(query.OR(a,b).data)

In [None]:
queries=["stink","middle","pronounced"]
output=Response(query.db[queries[0]])
for i in range(1,len(queries)):
    curr=Response(query.db[queries[i]])
    output=query.ANDNOT(curr,output)

print(output)

