# Positional Index

**Imports**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import json
import string
import re
from tqdm import tqdm
from nltk.stem import PorterStemmer

In [None]:
# Fetch the NLTK stop-word corpus; preprocess() reads it through
# nltk.corpus.stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Fetch the 'punkt' tokenizer data; presumably required by the
# nltk.word_tokenize() call in preprocess() (confirm against the installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**Load data**

In [None]:
# Mount Google Drive at /content/drive so the JSON files under
# /content/drive/MyDrive can be opened with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the corpus (document name -> raw text) and the special-documents file.
# The `with` blocks guarantee both file handles are closed once parsing is done,
# instead of leaving them open until garbage collection.
with open("/content/drive/MyDrive/IR_Assignments/docs.json", "r") as docs_file:
    raw_data = json.load(docs_file)
# NOTE(review): file_types is not used anywhere in the visible code; confirm it is still needed.
with open("/content/drive/MyDrive/IR_Assignments/special_docs.json", "r") as special_docs_file:
    file_types = json.load(special_docs_file)

**Preprocessing**

In [None]:
def preprocess(input):
    """Turn raw text into a list of normalised tokens.

    Steps, in order:
      1. lower-case the text;
      2. replace control characters (\\a \\b \\f \\n \\r \\t \\v) and every
         punctuation character except the apostrophe with a space;
      3. delete apostrophes outright, so "don't" becomes "dont";
      4. tokenize with ``nltk.word_tokenize``;
      5. drop tokens containing anything outside printable ASCII
         (0x20-0x7e);
      6. drop English stop words.

    Args:
        input: The text to process. The name shadows the builtin ``input``
            inside this function; it is kept so existing callers keep working.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    control_chars = '\a\b\f\n\r\t\v'
    punctuations = string.punctuation.replace("'", '')
    to_space = control_chars + punctuations
    # A single translation table does all character-level clean-up in one pass:
    # listed characters become a space, the apostrophe is removed.
    table = str.maketrans(to_space, ' ' * len(to_space), "'")
    cleaned = input.lower().translate(table)

    tokens = nltk.word_tokenize(cleaned)

    # Load the stop-word list once per call and keep it in a set; the corpus
    # reader would otherwise be invoked again for every single token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    non_printable_ascii = re.compile(r'[^\x20-\x7e]')

    return [
        word.strip()
        for word in tokens
        if non_printable_ascii.search(word) is None and word not in stop_words
    ]

In [None]:
# Swap every document's raw text for its token list, updating the mapping in place.
for doc, text in raw_data.items():
    raw_data[doc] = preprocess(text)

**Creating doc to doc-id mapping**

In [None]:
def map_docs(raw_data):
    """Assign a sequential integer ID to every document.

    Args:
        raw_data: An iterable of document names (iterating a dict yields its
            keys).

    Returns:
        dict: Document name -> ID. IDs start at 1 and follow iteration order.
        If a name occurs more than once, the ID of its last occurrence wins.
    """
    return {doc: doc_id for doc_id, doc in enumerate(raw_data, start=1)}

# Module-level name -> ID mapping; passed to create_index() and read by process().
doc_ids = map_docs(raw_data)

**Creating positional index**

In [25]:
def create_index(doc_ids, docs=None):
    """Build a positional inverted index over tokenised documents.

    Args:
        doc_ids: Mapping from document name to document ID.
        docs: Mapping from document name to its list of tokens. When omitted,
            the module-level ``raw_data`` is used.

    Returns:
        dict: token -> {document ID -> list of zero-based positions at which
        the token occurs in that document, in ascending order}.

    Raises:
        KeyError: If a document that has at least one token is missing from
            ``doc_ids``.
    """
    if docs is None:
        docs = raw_data

    positional_index = {}
    for doc, tokens in docs.items():
        for position, token in enumerate(tokens):
            # setdefault creates the per-token and per-document containers on
            # first sight, so no explicit membership checks are needed.
            postings = positional_index.setdefault(token, {})
            postings.setdefault(doc_ids[doc], []).append(position)
    return positional_index
# Build the index once at module level; process() looks terms up in this name.
positional_index = create_index(doc_ids)

**Processing phrase queries**

In [29]:
def process(query, index=None, ids=None):
    """Run a phrase query against the positional index and print the matches.

    A document matches when the query terms occur at consecutive positions,
    in order. Scanning of a document stops at its first match, so the printed
    total is the number of matching documents, not of phrase occurrences.

    Prints the total, the list of matching document IDs, and then one
    document name per line.

    Args:
        query: List of already-preprocessed query terms. An empty list (for
            example, a query made only of stop words) matches nothing.
        index: Positional index of the form token -> {doc ID -> positions}.
            Defaults to the module-level ``positional_index``.
        ids: Mapping from document name to document ID. Defaults to the
            module-level ``doc_ids``.

    Returns:
        None. Results are only printed.
    """
    if index is None:
        index = positional_index
    if ids is None:
        ids = doc_ids

    doc_list = []
    # Guard against an empty query (query[0] would not exist) and against
    # terms that are absent from the index: neither can produce a match.
    if query and all(word in index for word in query):
        first_postings = index[query[0]]
        later_postings = [index[word] for word in query[1:]]
        for doc, positions in first_postings.items():
            # Every later term must occur in this document at all.
            if any(doc not in postings for postings in later_postings):
                continue
            # Sets give constant-time position probes for the later terms.
            later_positions = [set(postings[doc]) for postings in later_postings]
            for start in positions:
                # The i-th later term has to sit exactly i slots after `start`.
                if all(
                    start + offset in term_positions
                    for offset, term_positions in enumerate(later_positions, start=1)
                ):
                    doc_list.append(doc)
                    break

    print('Total Number of occurances =', len(doc_list))
    print('List of Documents =', doc_list)

    # Reverse map built once; setdefault keeps the first name seen for an ID,
    # matching a first-hit linear search over the values.
    names_by_id = {}
    for name, doc_id in ids.items():
        names_by_id.setdefault(doc_id, name)
    for doc in doc_list:
        print(names_by_id[doc])

In [30]:
# Read a phrase from the user, normalise it with the same preprocess() used
# for the documents, then print the matching documents.
query = input("Input sentence: ")
query = preprocess(query)
process(query)

Input sentence: good morning
Total Number of occurances = 14
List of Documents = [72, 231, 425, 481, 538, 638, 786, 840, 1002, 1004, 1005, 1035, 1058, 1110]
bad.jok
coffeebeerwomen.txt
gd_ql.txt
homermmm.txt
jason.fun
math.2
phorse.hum
pun.txt
teevee.hum
televisi.hum
televisi.txt
top10st1.txt
t_zone.jok
worldend.hum
