In [8]:
# Importing required libraries
import os
import spacy
import pickle


In [9]:
folder = "F:\\IIITD\\Sem 8\\info_ret\\CSE508_Winter2024_A1_2020513\\preprocessed_text"

In [10]:
# Load the spaCy English language model for NLP tasks
nlp = spacy.load("en_core_web_sm")
def pre_query(text):
    doc = nlp(text.lower())
    return [tok.text for tok in doc if not tok.is_stop and not tok.is_punct and not tok.is_space]

In [11]:
# creating inverted index
def inv_index(folder):
    
    # initializing empty dictionary
    inv = {}  
    
    # iterating over each file in folder
    for filename in os.listdir(folder):  
        
        # reading each one
        with open(os.path.join(folder, filename), 'r', encoding='utf-8') as file: 
            
            # storing set of unique words from text file 
            words = set(file.read().split())  
            
            for word in words:
                
                # updating each word with filename
                inv.setdefault(word, set()).add(filename)  
    return inv

In [12]:
# saving inverted index as pickle file
def save(inverted_index, file_name):
    with open(file_name, 'wb') as file:
        pickle.dump(inverted_index, file)

In [13]:
def process_query(query, operations, inverted_index):
    # splitting the query
    terms = query.split()

    # preprocess the first term 
    pre_first_term = pre_query(terms[0])
    
    # initializing result set
    res = set()
    for term in pre_first_term:
        res = res.union(inverted_index.get(term, set()))

    # iterate over the operations and the remaining terms
    for i, op in enumerate(operations):
        if i + 1 < len(terms):
            preprocessed_term = pre_query(terms[i + 1])
            next_set = set()
            for term in preprocessed_term:
                next_set = next_set.union(inverted_index.get(term, set()))

            if op == 'AND':
                res = res.intersection(next_set)
            elif op == 'OR':
                res = res.union(next_set)
            elif op == 'AND NOT':
                res = res.difference(next_set)
            elif op == 'OR NOT':
                all_docs = set.union(*[set(v) for v in inverted_index.values()])
                res = res.union(all_docs.difference(next_set))

    return res

In [14]:
# name to save file
index_file = "inverted_index.pkl"

In [15]:
# creating and saving inv index
inverted_index = inv_index(folder)
save(inverted_index, index_file)

In [16]:
# loading saved inverted index using pickle
def load_inverted_index(file_name):
    with open(file_name, 'rb') as file:
        return pickle.load(file)

In [17]:
import pickle
# loading inv index
loaded_index = load_inverted_index(index_file)

In [19]:
# processing unser queries user queries
n = int(input("Enter number of queries: "))  

for i in range(n):
    
    query = input(f"Enter query {i+1}: ")
    operations = input("Enter operations separated by comma: ").split(", ")
    
    result = process_query(query, operations, loaded_index)
    
    print(f"Query {i+1}: {query}")
    print(f"Number of documents retrieved for query {i+1}: {len(result)}")
    print(f"Names of the documents retrieved for query {i+1}: {', '.join(result)}")

Query 1: Car bag in a canister
Number of documents retrieved for query 1: 31
Names of the documents retrieved for query 1: file466.txt, file860.txt, file3.txt, file118.txt, file981.txt, file699.txt, file864.txt, file738.txt, file797.txt, file863.txt, file665.txt, file459.txt, file780.txt, file892.txt, file942.txt, file542.txt, file682.txt, file686.txt, file166.txt, file930.txt, file746.txt, file698.txt, file73.txt, file264.txt, file174.txt, file363.txt, file956.txt, file404.txt, file886.txt, file313.txt, file573.txt
Query 2: Coffee brewing techniques in cookbook
Number of documents retrieved for query 2: 999
Names of the documents retrieved for query 2: file346.txt, file36.txt, file389.txt, file238.txt, file347.txt, file864.txt, file608.txt, file687.txt, file507.txt, file393.txt, file983.txt, file706.txt, file504.txt, file587.txt, file60.txt, file589.txt, file251.txt, file601.txt, file836.txt, file897.txt, file147.txt, file354.txt, file269.txt, file766.txt, file376.txt, file711.txt, fi