In [35]:
# importing required libraries

import os
import pickle
import spacy

In [36]:
# Directory whose files are read by create_pos to build the positional index.
folder = "preprocessed_text"

In [37]:
# Load the `en_core_web_sm` spaCy pipeline once at module level;
# pre_query calls it for every query string.
nlp = spacy.load("en_core_web_sm")
def pre_query(text):
    """Turn a raw query string into the list of terms used for index lookup.

    The text is lower-cased and passed through the module-level ``nlp``
    pipeline; tokens flagged as stop words, punctuation or whitespace are
    discarded.

    Args:
        text: Raw query string.

    Returns:
        list: The ``.text`` of every remaining token, in original order.
    """
    kept_terms = []
    for token in nlp(text.lower()):
        # Skip anything the pipeline marks as noise for phrase matching.
        if token.is_stop or token.is_punct or token.is_space:
            continue
        kept_terms.append(token.text)
    return kept_terms

In [38]:
# creating positional index
def create_pos(folder):
    """Build a positional index from the text files directly inside ``folder``.

    Every regular file is read as UTF-8 and split on whitespace; the
    zero-based offset of each token within its file is recorded.

    Args:
        folder: Path of the directory holding the documents.

    Returns:
        dict: ``{term: {filename: [pos, ...]}}`` where each position list is
        in ascending order. Empty when the folder contains no files.

    Raises:
        OSError: If ``folder`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    pos_index = {}

    # os.listdir returns entries in arbitrary order; sorting makes the
    # traversal (and therefore dict insertion order) reproducible.
    for filename in sorted(os.listdir(folder)):
        filepath = os.path.join(folder, filename)

        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints): calling
        # open() on one raises and would abort the whole build.
        if not os.path.isfile(filepath):
            continue

        with open(filepath, 'r', encoding='utf-8') as file:
            words = file.read().split()

        for pos, word in enumerate(words):
            pos_index.setdefault(word, {}).setdefault(filename, []).append(pos)

    return pos_index

In [39]:
# Build the positional index (term -> {filename -> [positions]}) from the files in `folder`.
positional_index = create_pos(folder)

In [40]:
# File name under which the pickled index is written and later read back.
# NOTE(review): the name is spelled "postional" (sic); it is left as-is here
# because it is the actual on-disk file name.
index_file = "postional_index.pkl"

In [41]:
# saving inverted index as pickle file
def save(inverted_index, file_name):
    """Serialise ``inverted_index`` to ``file_name`` using pickle.

    The target is opened in binary write mode, so an existing file with the
    same name is overwritten.

    Args:
        inverted_index: The object to persist.
        file_name: Path of the pickle file to write.
    """
    with open(file_name, 'wb') as sink:
        pickle.Pickler(sink).dump(inverted_index)

In [42]:
# loading saved inverted index using pickle
def load_inverted_index(file_name):
    """Read back an object previously written with pickle.

    Args:
        file_name: Path of the pickle file to read.

    Returns:
        The unpickled object.

    Note:
        Unpickling can execute arbitrary code; only load files you created
        yourself.
    """
    with open(file_name, 'rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored

In [43]:
# Persist the freshly built index to disk.
save(positional_index, index_file)

# Reload it from the pickle; the query cell below searches this loaded copy.
loaded_index = load_inverted_index(index_file)

In [44]:
def process(query, index):
    """Return the documents in which ``query`` occurs as an exact phrase.

    The query is tokenised with ``pre_query``; a document matches when the
    resulting terms appear at consecutive positions in ``index``.

    Args:
        query: Raw phrase query string.
        index: Positional index shaped ``{term: {doc: [positions]}}``.

    Returns:
        set: Names of the matching documents. Empty when the query yields no
        terms or when no document contains the phrase.
    """
    preprocessed_terms = pre_query(query)

    if not preprocessed_terms:
        return set()

    # doc -> positions at which the phrase matched so far ends. Sets are used
    # so the adjacency check below is O(1) instead of a scan of a list.
    first_postings = index.get(preprocessed_terms[0], {})
    matches = {doc: set(positions) for doc, positions in first_postings.items()}

    for term in preprocessed_terms[1:]:
        if not matches:
            # No candidate documents left; later terms cannot revive any.
            break

        term_docs = index.get(term, {})
        narrowed = {}
        # Only documents that matched every earlier term can still match.
        for doc, prev_ends in matches.items():
            ends = {p for p in term_docs.get(doc, ()) if p - 1 in prev_ends}
            if ends:
                narrowed[doc] = ends
        matches = narrowed

    return set(matches)


In [45]:
# Ask how many phrase queries to run, then answer each one interactively.
n = int(input("Enter number of queries: "))
for i in range(n):
    q = input("Enter your phrase query: ")
    # process query phrase using positional index
    res = process(q, loaded_index)
    # `res` is a set, whose iteration order is arbitrary and can differ between
    # runs; sort the names so the printed list is stable and reproducible.
    names = sorted(res)
    # print number of documents and names
    print(f"Number of documents retrieved for query {i+1} using positional index:", len(names))
    print(f"Names of documents retrieved for query {i+1} using positional index:", ", ".join(names))


Number of documents retrieved for query 1: 3
Names of documents retrieved for query 1: file279.txt, file6.txt, file854.txt
