In [62]:
# Importing required libraries
import os
import spacy
import pickle

In [63]:
# Directory whose files are read by inv_index() to build the inverted index
folder = "preprocessed_text"

In [64]:
# Load the spaCy English language model ("en_core_web_sm") once at module level;
# pre_query() reuses this pipeline to tokenize text and to flag stop words,
# punctuation and whitespace tokens.
nlp = spacy.load("en_core_web_sm")
def pre_query(text):
    """Return the content tokens of ``text`` as a list of strings.

    The text is lower-cased, run through the module-level spaCy pipeline
    ``nlp``, and every token flagged as a stop word, punctuation or
    whitespace is dropped. Remaining token texts are returned in order.
    """
    kept = []
    for token in nlp(text.lower()):
        # Discard anything that carries no search value.
        if token.is_stop or token.is_punct or token.is_space:
            continue
        kept.append(token.text)
    return kept

In [65]:
# creating inverted index
def inv_index(folder):
    """Build an inverted index for the text files directly inside ``folder``.

    Each regular file is read as UTF-8 and split on whitespace; every
    distinct word is mapped to the set of file names that contain it.

    Args:
        folder: Path of the directory to index (not traversed recursively).

    Returns:
        dict mapping word (str) -> set of file names (str) containing it.

    Raises:
        OSError: if ``folder`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    # initializing empty dictionary
    inv = {}

    # iterating over each entry in folder
    for filename in os.listdir(folder):
        path = os.path.join(folder, filename)

        # os.listdir also yields sub-directories (e.g. Jupyter's
        # .ipynb_checkpoints); open() on those would raise, so skip them.
        if not os.path.isfile(path):
            continue

        # reading the file and keeping only its unique words
        with open(path, 'r', encoding='utf-8') as file:
            words = set(file.read().split())

        # the file is already closed here; record it under each of its words
        for word in words:
            inv.setdefault(word, set()).add(filename)
    return inv

In [66]:
# saving inverted index as pickle file
def save(inverted_index, file_name):
    """Pickle ``inverted_index`` into ``file_name``.

    The target is opened in binary write mode, so an existing file with the
    same name is overwritten. Returns ``None``.
    """
    with open(file_name, mode='wb') as sink:
        pickler = pickle.Pickler(sink)
        pickler.dump(inverted_index)

In [67]:
def process_query(query, operations, inverted_index):
    """Evaluate a left-to-right boolean query against an inverted index.

    ``query`` is split on whitespace into terms and ``operations[i]`` combines
    the running result with term ``i + 1``. Each term is passed through
    ``pre_query`` and the postings of every token it yields are unioned, so a
    term that preprocesses to nothing (e.g. a stop word) contributes an empty
    set.

    Args:
        query: Whitespace-separated query terms.
        operations: Sequence of operator strings. 'AND', 'OR', 'AND NOT' and
            'OR NOT' are recognised regardless of letter case or extra
            whitespace. Unrecognised operators leave the result unchanged,
            and operators without a following term are ignored.
        inverted_index: Mapping of token -> iterable of document names.

    Returns:
        set of document names matching the query. An empty or
        whitespace-only query yields an empty set.
    """
    # splitting the query
    terms = query.split()
    if not terms:
        # Nothing to look up; previously this crashed on terms[0].
        return set()

    def postings(term):
        # Union of the posting lists of every token produced for one term.
        docs = set()
        for token in pre_query(term):
            docs.update(inverted_index.get(token, ()))
        return docs

    # initializing result set from the first term
    res = postings(terms[0])

    # Universe of documents, built lazily and only once, for 'OR NOT'.
    all_docs = None

    # pair each operation with the term that follows it
    for op, term in zip(operations, terms[1:]):
        next_set = postings(term)

        # Normalise so ' and  not ' is treated like 'AND NOT'.
        op = " ".join(str(op).split()).upper()

        if op == 'AND':
            res = res.intersection(next_set)
        elif op == 'OR':
            res = res.union(next_set)
        elif op == 'AND NOT':
            res = res.difference(next_set)
        elif op == 'OR NOT':
            if all_docs is None:
                # set().union(...) is also valid for an empty index,
                # unlike set.union(*[]) which raises TypeError.
                all_docs = set().union(*inverted_index.values())
            res = res.union(all_docs.difference(next_set))

    return res

In [68]:
# Path of the pickle file the inverted index is written to by save() and
# read back from by load_inverted_index()
index_file = "inverted_index.pkl"

In [69]:
# Build the inverted index from the files in `folder` and persist it to
# `index_file` with pickle so it can be reloaded without re-reading the corpus
inverted_index = inv_index(folder)
save(inverted_index, index_file)

In [70]:
# loading saved inverted index using pickle
def load_inverted_index(file_name):
    """Read a pickled object back from ``file_name`` and return it.

    Caution: unpickling can execute arbitrary code, so only use this on
    files you created yourself or otherwise trust.
    """
    with open(file_name, mode='rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored

In [71]:
import pickle  # NOTE(review): redundant - pickle is already imported in the first cell
# Reload the inverted index from the pickle file written earlier
loaded_index = load_inverted_index(index_file)

In [72]:
# processing user queries
n = int(input("Enter number of queries: "))

for i in range(n):

    query = input(f"Enter query {i+1}: ")
    raw_operations = input("Enter operations separated by comma: ")

    # Split on bare commas (so "AND,OR" and "AND, OR" both work), collapse
    # stray whitespace and upper-case each operator. This way the operators
    # that are evaluated are exactly the ones displayed in the formatted
    # query, and empty entries (e.g. no operations entered) are dropped.
    operations = [
        " ".join(op.split()).upper()
        for op in raw_operations.split(",")
        if op.strip()
    ]

    # preprocessed tokens are used only to display the query
    pre_q = pre_query(query)

    result = process_query(query, operations, loaded_index)

    formatted_query = pre_q[0] if pre_q else ""
    for op, term in zip(operations, pre_q[1:]):
        formatted_query += f" {op} {term}"

    print(f"Query {i+1}: {formatted_query}")
    print(f"Number of documents retrieved for query {i+1}: {len(result)}")
    # sets have no defined order; sort so the listing is reproducible
    print(f"Names of the documents retrieved for query {i+1}: {', '.join(sorted(result))}")

Query 1: car OR bag AND NOT canister
Number of documents retrieved for query 1: 31
Names of the documents retrieved for query 1: file863.txt, file363.txt, file174.txt, file930.txt, file746.txt, file981.txt, file699.txt, file956.txt, file892.txt, file118.txt, file686.txt, file698.txt, file73.txt, file860.txt, file797.txt, file864.txt, file264.txt, file780.txt, file886.txt, file166.txt, file466.txt, file313.txt, file459.txt, file942.txt, file404.txt, file665.txt, file682.txt, file542.txt, file573.txt, file3.txt, file738.txt
