**Question 2: UNIGRAM INVERTED INDEX AND BOOLEAN QUERIES**

---



---



**DRIVE MOUNT**

In [None]:
# Mount Google Drive at /content/drive; the corpus folder and the pickled
# index used below both live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**LIBRARY / PACKAGE**

In [None]:
import os
import string
import re
import nltk
import pickle

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources this notebook relies on: WordNet data for
# WordNetLemmatizer, the punkt tokenizer models, and the stop-word lists.
# The calls are kept as separate statements so the return value of the last
# one is shown as the cell output.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

**INPUT FOLDER**

In [None]:
# Folder on the mounted Drive that holds the corpus text files.
data_folder = "/content/drive/MyDrive/text_files"
# Names of every entry in that folder; the index-building cell iterates over this list.
files_list = os.listdir(data_folder)

**UNIGRAM INVERTED INDEX CREATION**

In [None]:
# Read every corpus file fully into memory, keyed by its file name.
processed_data = {}
for doc_name in files_list:
    doc_path = os.path.join(data_folder, doc_name)
    with open(doc_path, "r") as handle:
        processed_data[doc_name] = handle.read()

# Build the inverted index: each whitespace-separated term maps to the set
# of file names in which it occurs.
unigram_inverted_index = {}
for doc_name, doc_text in processed_data.items():
    for term in doc_text.split():
        unigram_inverted_index.setdefault(term, set()).add(doc_name)

In [None]:
# Location on the mounted Drive where the pickled inverted index is written
# and later read back.
file_path = "/content/drive/MyDrive/unigram_inverted_index.pkl"

In [None]:
def dump_unigram_inverted_index(file_path, unigram_inverted_index):
    """Pickle the inverted index to ``file_path``.

    The file is opened in binary write mode, so any existing file at that
    path is replaced. Returns None.
    """
    with open(file_path, "wb") as sink:
        pickle.dump(unigram_inverted_index, file=sink)

# Persist the freshly built index to file_path.
dump_unigram_inverted_index(file_path, unigram_inverted_index)

In [None]:
def load_inverted_index(file_path):
    """Read a pickled inverted index from ``file_path`` and return it.

    NOTE: unpickling can execute arbitrary code, so only load files that
    this notebook (or another trusted source) produced.
    """
    with open(file_path, "rb") as source:
        return pickle.load(source)

# Reload the index from file_path; the cells below work with this loaded copy.
loaded_unigram_inverted_index = load_inverted_index(file_path)

**PRINT ALL UNIGRAM INVERTED INDEX**

In [None]:
# Print every term with its posting list, terms and file names both sorted.
# The per-term list has its own name so that the corpus listing held in
# `files_list` (which the index-building cell iterates over) is not
# overwritten when this cell runs.
for word in sorted(loaded_unigram_inverted_index):
    posting_files = sorted(loaded_unigram_inverted_index[word])
    print(f"{word} -> {posting_files}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
craft -> ['file744.txt']
craftmanship -> ['file143.txt', 'file599.txt']
craftsmanship -> ['file673.txt', 'file978.txt']
craigslist -> ['file932.txt']
crank -> ['file194.txt']
cranking -> ['file235.txt']
crap -> ['file338.txt', 'file365.txt', 'file544.txt', 'file728.txt', 'file879.txt', 'file910.txt']
crappy -> ['file214.txt', 'file440.txt', 'file654.txt', 'file674.txt']
crash -> ['file160.txt', 'file885.txt']
crashes -> ['file160.txt']
crashing -> ['file467.txt', 'file841.txt']
crate -> ['file556.txt', 'file813.txt']
cray -> ['file936.txt']
crazy -> ['file100.txt', 'file157.txt', 'file309.txt']
cream -> ['file265.txt', 'file606.txt', 'file612.txt', 'file737.txt', 'file855.txt']
creamish -> ['file457.txt']
creamoff -> ['file915.txt']
creamy -> ['file194.txt', 'file850.txt']
create -> ['file264.txt', 'file290.txt', 'file513.txt', 'file906.txt']
created -> ['file203.txt', 'file719.txt', 'file828.txt']
creates -> ['file107.tx

In [None]:
# Vocabulary size: the number of distinct terms (keys) in the loaded index.
total_terms = len(loaded_unigram_inverted_index)
print(f"\nTotal number of unique terms: {total_terms}")


Total number of unique terms: 6471


**FUNCTIONS FOR ALL THE OPERATIONS**

In [None]:
def intersect(S1, S2):
    """Return the items present in both S1 and S2 as a list.

    Duplicates are collapsed; the order of the returned list is unspecified.
    """
    return list(set(S1).intersection(set(S2)))

def union(S1, S2):
    """Return every item found in S1 or S2 as a list.

    Duplicates are collapsed; the order of the returned list is unspecified.
    """
    combined = set(S1).union(set(S2))
    return list(combined)

def complement(S1, all_docs):
    """Return the items of all_docs that do not appear in S1, as a list.

    Duplicates are collapsed; the order of the returned list is unspecified.
    """
    remaining = set(all_docs).difference(set(S1))
    return list(remaining)

In [None]:
def comparisons_and(S1, S2, num_comparisons):
    """Count the comparisons made by a merge-style AND of two posting lists.

    Both lists are walked in ascending order of document number, and every
    step of the walk (one comparison of the two current document numbers)
    adds one to the running counter.

    Args:
        S1, S2: iterables of file names. Names are assumed to have the form
            'file<N>.txt'; the number is taken from name[4:-4].
        num_comparisons: running total to which this merge's comparisons
            are added.

    Returns:
        The updated running total.

    Raises:
        ValueError: if a name does not contain an integer at [4:-4].

    The inputs are not modified: sorted copies of the document numbers are
    used instead of sorting the caller's lists in place.
    """
    def doc_number(name):
        # 'file123.txt' -> 123 (drop the 4-char prefix and 4-char suffix).
        return int(name[4:-4])

    # Order numerically. Sorting the names as strings would put 'file10.txt'
    # before 'file2.txt', which breaks the numeric merge walk below.
    left = sorted(doc_number(name) for name in S1)
    right = sorted(doc_number(name) for name in S2)

    x = y = 0
    while x < len(left) and y < len(right):
        # One comparison of the two current postings per step.
        num_comparisons += 1
        if left[x] == right[y]:
            x += 1
            y += 1
        elif left[x] < right[y]:
            x += 1
        else:
            y += 1
    return num_comparisons

def complment(S1, S2, all_files):
    """Return the items of S1 that are absent from S2 (S1 AND NOT S2) as a list.

    ``all_files`` is accepted but not used. Duplicates are collapsed and the
    order of the returned list is unspecified.
    """
    return list(set(S1).difference(set(S2)))

def comparisons_or(S1, S2, num_comparisons):
    """Count the comparisons made by a merge-style OR of two posting lists.

    Both lists are walked in ascending order of document number; each step
    taken while both lists still have postings left adds one comparison.
    Once either list is exhausted the walk stops, so the leftover tail of
    the other list adds nothing.

    Args:
        S1, S2: iterables of file names. Names are assumed to have the form
            'file<N>.txt'; the number is taken from name[4:-4].
        num_comparisons: running total to which this merge's comparisons
            are added.

    Returns:
        The updated running total.

    Raises:
        ValueError: if a name does not contain an integer at [4:-4].

    The inputs are not modified: sorted copies of the document numbers are
    used instead of sorting the caller's lists in place.
    """
    def doc_number(name):
        # 'file123.txt' -> 123 (drop the 4-char prefix and 4-char suffix).
        return int(name[4:-4])

    # Order numerically. Sorting the names as strings would put 'file10.txt'
    # before 'file2.txt', which breaks the numeric merge walk below.
    left = sorted(doc_number(name) for name in S1)
    right = sorted(doc_number(name) for name in S2)

    x = y = 0
    while x < len(left) and y < len(right):
        num_comparisons += 1
        if left[x] == right[y]:
            x += 1
            y += 1
        elif left[x] < right[y]:
            x += 1
        else:
            y += 1
    return num_comparisons

**QUERY HANDLING**

In [36]:
def retrieve_documents(query, operations, unigram_index, all_files):
    """Evaluate a boolean query left to right against the unigram index.

    The query text is lower-cased, stripped of non-letters, split on
    whitespace, filtered against the English stop-word list and lemmatized.
    The surviving terms t0, t1, ... are combined with the operators
    op0, op1, ... as (((t0 op0 t1) op1 t2) ...).

    Args:
        query: free-text query string.
        operations: sequence of operator strings; each must be one of
            "AND", "OR", "AND NOT", "OR NOT". Case and surrounding or
            repeated whitespace are ignored; blank entries are dropped.
        unigram_index: mapping term -> collection of file names.
        all_files: the full document universe, used to negate a term for
            "OR NOT". Names are expected to look like 'file<N>.txt' because
            the comparison counters parse the number out of the name.

    Returns:
        (result, num_comparisons, query_ans) where ``result`` is a list of
        matching file names (unordered), ``num_comparisons`` is the total
        reported by comparisons_and / comparisons_or, and ``query_ans`` is
        the processed terms interleaved with the normalized operators.

    Raises:
        ValueError: if an operator is not one of the four supported ones.
    """
    supported = ("AND", "OR", "AND NOT", "OR NOT")

    lem = WordNetLemmatizer()
    # Build the stop-word set once instead of once per token.
    stop_words = set(stopwords.words("english"))

    # NOTE(review): the printed index contains inflected forms such as
    # 'crashes', so the corpus may not have been lemmatized -- confirm that
    # this query pipeline matches the corpus preprocessing.
    tokens = []
    for raw in re.sub('[^A-Z a-z ]+', ' ', query.lower()).split():
        # Filter on the raw word first: lemmatizing can mangle a stop word
        # (e.g. "was" -> "wa") so that it no longer matches the list.
        if raw in stop_words:
            continue
        lemma = lem.lemmatize(raw)
        if lemma not in stop_words:
            tokens.append(lemma)

    # Normalize operators so that " AND NOT" or "and  not" are recognized.
    operations = [" ".join(operation.upper().split()) for operation in operations]
    operations = [operation for operation in operations if operation]
    for operation in operations:
        if operation not in supported:
            raise ValueError(f"Unsupported operation: {operation!r}")

    # Human-readable form of the evaluated query: t0 op0 t1 op1 t2 ...
    combined_elements = []
    for token, operation in zip(tokens, operations):
        combined_elements.extend([token, operation])
    combined_elements.extend(tokens[len(operations):] + operations[len(tokens):])
    query_ans = ' '.join(combined_elements)

    # One posting list per term. A term missing from the index contributes
    # an empty list so that terms and operators stay aligned.
    document_sets = [list(unigram_index.get(token, ())) for token in tokens]

    num_comparisons = 0
    if not document_sets:
        return [], num_comparisons, query_ans

    # Start from the first term; the k-th operator combines the running
    # result with term k + 1. Surplus operators or terms are ignored.
    result = document_sets[0]
    for operation, operand in zip(operations, document_sets[1:]):
        if operation == "AND":
            num_comparisons = comparisons_and(result, operand, num_comparisons)
            result = intersect(result, operand)
        elif operation == "OR":
            num_comparisons = comparisons_or(result, operand, num_comparisons)
            result = union(result, operand)
        elif operation == "AND NOT":
            # No comparisons are added for AND NOT.
            result = complment(result, operand, all_files)
        else:  # "OR NOT"
            negated = complement(operand, all_files)
            num_comparisons = comparisons_or(result, negated, num_comparisons)
            result = union(result, negated)

    return result, num_comparisons, query_ans

**USER QUERY INPUT**

In [39]:
# Interactive driver. Input format: the number of queries, then for each
# query one line of text followed by one line of comma-separated operators
# (e.g. "OR, AND NOT").
if __name__ == "__main__":
    n = int(input().strip())
    # Document universe handed to retrieve_documents: file1.txt .. file999.txt.
    files = [f"file{i + 1}.txt" for i in range(999)]
    for i in range(n):

        query = input().strip()
        # Strip each operator: splitting "OR, AND NOT" on "," alone yields
        # " AND NOT" with a leading space, which is not a recognized
        # operator. Blank entries are dropped.
        operations = [op.strip() for op in input().split(",") if op.strip()]

        result, number_of_comparisons, query_ans = retrieve_documents(query, operations, loaded_unigram_inverted_index, files)

        sorted_result = sorted(result)

        print(f"Query {i + 1}:", query_ans)
        print("Number of documents retrieved for query", i + 1, ":", len(result))
        print("Names of the documents retrieved for query", i + 1, ":", sorted_result)
        print()

1
car is bag in a canister
OR, AND NOT
Query 1: car OR bag  AND NOT canister
Number of documents retrieved for query 1 : 31
Names of the documents retrieved for query 1 : ['file118.txt', 'file166.txt', 'file174.txt', 'file264.txt', 'file3.txt', 'file313.txt', 'file363.txt', 'file404.txt', 'file459.txt', 'file466.txt', 'file542.txt', 'file573.txt', 'file665.txt', 'file682.txt', 'file686.txt', 'file698.txt', 'file699.txt', 'file73.txt', 'file738.txt', 'file746.txt', 'file780.txt', 'file797.txt', 'file860.txt', 'file863.txt', 'file864.txt', 'file886.txt', 'file892.txt', 'file930.txt', 'file942.txt', 'file956.txt', 'file981.txt']

