In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#importing libraries
import os
import nltk
import string
import regex as re
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from natsort import natsorted

In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [4]:
posInd = {} #datastructure to store positional index 
mapping={} #maps fileno -> filename 
docNo = 0 #represents docId

**Pre-processing**

In [5]:
#function to preprocess the file and convert into token list
def preprocess_doc(raw_file):
    
    raw_file = raw_file.lower() #lower-casing
    token_list = TweetTokenizer().tokenize(raw_file) #tokenize
    stoppers = set(stopwords.words('english')) 
    token_list = {word for word in token_list if word not in stoppers} #removing stop-words
    token_list = [re.sub(r'[^\w\s]','',x) for x in token_list] #removing punctuations
    token_list = [word for word in token_list if word] #removing blank words
    

    return token_list

**Positional Indexing**

In [6]:
filesPath = '/content/drive/MyDrive/IR Assignment/Humor,Hist,Media,Food/Humor,Hist,Media,Food'
files = os.listdir(filesPath)
#for each file
for f in files:
  docPath = os.path.join(filesPath,f)
  myDoc = open(docPath,encoding='utf-8',errors='surrogateescape').read()
  myTokens = preprocess_doc(myDoc) #pre-process file

  #positional indexing :: Refer https://www.geeksforgeeks.org/python-positional-index/
  for pos,name in enumerate(myTokens):
    if name in posInd: #if token already present in ds
      posInd[name][0] = posInd[name][0]+1  #incr freq

      if docNo in posInd[name][1]: #if fileno already present then add into pos list
        posInd[name][1][docNo].append(pos)
      else:
        posInd[name][1][docNo] = [pos] #create new list
    else: #create new list and dict
      posInd[name] = []
      posInd[name].append(1)
      posInd[name].append({})
      posInd[name][1][docNo] = [pos]
  
  mapping[docNo]= f; #maps file no
  docNo=docNo+1




**Query Processing**

In [17]:
#function to merge tokens and find common docs for phrase tokens
def find_docs(w1,w2,dist):
  output=[]
  l1 = len(w1)
  l2 = len(w2)
  i1=j1=0
  while i1<l1 and j1<l2: #till length of doc no found
    if w1[i1][0] == w2[j1][0]: #matched doc-id
      
      ll1 = (w1[i1][1])
      ll2 = (w2[j1][1])
      i2=j2=0
      postlist=[]
      while i2<len(ll1) and j2<len(ll2): #if pos of first < sec then add to pos list
          if (ll2[j2]>ll1[i2] and (ll2[j2]-ll1[i2])<=dist):
            postlist.append(ll2[j2])
            i2=i2+1
            j2=j2+1
          elif ll2[j2]<ll1[i2]: #incr second pos list ptr
            j2=j2+1
          else:
            i2=i2+1
      
        
      output.append([w1[i1][0],postlist])
      i1=i1+1
      j1=j1+1
    elif w1[i1][0] < w2[j1][0]:
      i1=i1+1
    else:
      j1=j1+1
  return output 

In [37]:
query=input("Enter Query::")
tokens = preprocess_doc(query)
query = query.split()

Enter Query::tripped over the garden


In [38]:
d1=[]
dict1 = posInd[tokens[0]][1]
for k,v in dict1.items(): #convert dict into req ds
    temp = [k,v]
    d1.append(temp)
for i in range(1,len(tokens)): #for each token in query
  
  d2=[]
  dict2 = posInd[tokens[i]][1]
  for k,v in dict2.items(): #convert dict into req ds
    temp = [k,v]
    d2.append(temp)
  d1 = find_docs(d1,d2,5)

In [41]:
print("List of Document Names::")
total=0
for x in d1:
  total=total+1
  print(mapping[x[0]])

List of Document Names::
practica.txt
humor9.txt
insult.lst
cabbage.txt


In [42]:
print("Number of Documents Retrieved::")
print(total)

Number of Documents Retrieved::
4
