In [40]:
import pandas as pd
import numpy as np
import string
import os
import re
import pickle
from sortedcontainers import SortedDict, SortedList, SortedSet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

#  Q1


In [41]:
def fetchFilesInDirectory(folder_path):
   
    files_list = []
    
    for file_item in os.listdir(folder_path):
        full_file_path = os.path.join(folder_path, file_item)
        if os.path.isfile(full_file_path):
            files_list.append(full_file_path)
    
    return files_list

In [42]:
def convert_to_lowercase(input_string):
    return input_string.lower()

def tokenize_words(input_text):
    return word_tokenize(input_text)

def filter_stopwords_from_tokens(word_list, stopword_set):
    filtered_words = [word for word in word_list if word not in stopword_set]
    return filtered_words

def remove_punctuation_from_tokens(tokens):
    punctuation_pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    tokens_without_punctuation = [punctuation_pattern.sub('', token) for token in tokens]
    return tokens_without_punctuation

def filter_non_blank_tokens(input_tokens):
    non_blank_tokens = [token for token in input_tokens if token.strip()]
    return non_blank_tokens



In [43]:
def preprocess_text(input_text, stop_words_set):
    lowercase_text = convert_to_lowercase(input_text)
    word_tokens = tokenize_words(lowercase_text)
    tokens_without_stopwords = filter_stopwords_from_tokens(word_tokens, stop_words_set)
    tokens_without_punctuation = remove_punctuation_from_tokens(tokens_without_stopwords)
    non_blank_tokens = filter_non_blank_tokens(tokens_without_punctuation)
    final_tokens = sorted(list(dict.fromkeys(non_blank_tokens)))
    return final_tokens


In [44]:
def create_file_mapping(file_lists):
    file_mapping = {}
    for i in range(len(file_lists)):
        file_mapping[i] = file_lists[i]

    return file_mapping


In [45]:
def preprocess_text_files_folder(folder_path, stop_words_set):
    file_list = os.listdir(folder_path)
    preprocessed_tokens_list = []

    file_mapping = create_file_mapping(file_list)

    for file_name in file_list:
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        preprocessed_tokens = preprocess_text(content, stop_words_set)

        preprocessed_tokens_list.append(preprocessed_tokens)

    return preprocessed_tokens_list, file_mapping

stopwords_set = set(stopwords.words('english'))
text_files_folder_path = "text_files"
preprocessed_tokens_list, file_mapping = preprocess_text_files_folder(text_files_folder_path, stopwords_set)

for file_index, file_name in file_mapping.items():
    print(f"File Index: {file_index}, File Name: {file_name}")


File Index: 0, File Name: file1.txt
File Index: 1, File Name: file10.txt
File Index: 2, File Name: file100.txt
File Index: 3, File Name: file101.txt
File Index: 4, File Name: file102.txt
File Index: 5, File Name: file103.txt
File Index: 6, File Name: file104.txt
File Index: 7, File Name: file105.txt
File Index: 8, File Name: file106.txt
File Index: 9, File Name: file107.txt
File Index: 10, File Name: file108.txt
File Index: 11, File Name: file109.txt
File Index: 12, File Name: file11.txt
File Index: 13, File Name: file110.txt
File Index: 14, File Name: file111.txt
File Index: 15, File Name: file112.txt
File Index: 16, File Name: file113.txt
File Index: 17, File Name: file114.txt
File Index: 18, File Name: file115.txt
File Index: 19, File Name: file116.txt
File Index: 20, File Name: file117.txt
File Index: 21, File Name: file118.txt
File Index: 22, File Name: file119.txt
File Index: 23, File Name: file12.txt
File Index: 24, File Name: file120.txt
File Index: 25, File Name: file121.txt
F

In [46]:
for i in range(min(5, len(preprocessed_tokens_list))):
    print("\nFile:", f"{text_files_folder_path}/{os.listdir(text_files_folder_path)[i]}")
    print("Content before preprocessing:")
    with open(os.path.join(text_files_folder_path, os.listdir(text_files_folder_path)[i]), 'r', encoding='utf-8') as file:
        print(file.read())

    print("\nContent after preprocessing:")
    print(preprocessed_tokens_list[i])
    print("\n" + "-"*50)


File: text_files/file1.txt
Content before preprocessing:
Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.

Content after preprocessing:
['bridge', 'floating', 'go', 'good', 'great', 'loving', 'springs', 'stability', 'strat', 'tension', 'vintage', 'want', 'way']

--------------------------------------------------

File: text_files/file10.txt
Content before preprocessing:
Awesome stand!

Tip: The bottom part that supports the guitar had a weird angle when arrived, making the guitar slide back, becoming almost 100% on a vertical.
To solve this, I assembled the product and the put a some pressure on the support frame, making it bend a little. Now my guitar sits perfectly. Check photos!

Content after preprocessing:
['100', 'almost', 'angle', 'arrived', 'assembled', 'awesome', 'back', 'becoming', 'bend', 'bottom', 'check', 'frame', 'guitar', 'li

# Q2

In [47]:
def create_unigram_inverted_index(file_dictionary, stopwords_set):
    
    unigram_inverted_index = SortedDict()

    for doc_ID in range(len(file_dictionary)):
        file = open(file_dictionary[doc_ID], 'r', encoding='utf-8', errors='ignore')
        file_corpus = file.read()
        file.close()
        doc_tokens = preprocess_text(file_corpus, stopwords_set)
        for token in doc_tokens:
            if token in unigram_inverted_index:
                unigram_inverted_index[token][0] += 1
                unigram_inverted_index[token][1].add(doc_ID)
            else:
                unigram_inverted_index[token] = [1, SortedSet([doc_ID])]
 
    pickle_file_path = "unigram_inverted_index_pickle_file"
    with open(pickle_file_path, 'wb') as pickle_file:
        pickle.dump(unigram_inverted_index, pickle_file)
        
stopwords_set = set(stopwords.words('english'))
list_of_files = fetchFilesInDirectory('text_files')
file_dictionary = create_file_mapping(list_of_files)

create_unigram_inverted_index(file_dictionary, stopwords_set)

In [48]:
def xANDy(pos_list_1, pos_list_2):
    result = SortedSet()
    i,j = 0,0
    while(i<len(pos_list_1) and j<len(pos_list_2)):
        if pos_list_1[i]==pos_list_2[j]:
            result.add(pos_list_1[i])
            i+=1
            j+=1
        elif pos_list_1[i]<pos_list_2[j]:
            i+=1
        else:
            j+=1
    
    return result

In [49]:
def xORy(pos_list_1, pos_list_2):
    result = SortedSet()
    i,j = 0,0
    while(i<len(pos_list_1) and j<len(pos_list_2)):
        if pos_list_1[i]==pos_list_2[j]:
            result.add(pos_list_1[i])
            i+=1
            j+=1
        elif pos_list_1[i]<pos_list_2[j]:
            result.add(pos_list_1[i])
            i+=1
        else:
            result.add(pos_list_2[j])
            j+=1
    
    while(i<len(pos_list_1)):
        result.add(pos_list_1[i])
        i+=1
    while(j<len(pos_list_2)):
        result.add(pos_list_2[j])
        j+=1
    
    return result

In [50]:
def xANDNOTy(pos_list_1, pos_list_2):
    result = SortedSet()
    i,j = 0,0
    while(i<len(pos_list_1) and j<len(pos_list_2)):
        if pos_list_1[i]==pos_list_2[j]:
            i+=1
            j+=1
        elif pos_list_1[i]<pos_list_2[j]:
            result.add(pos_list_1[i])
            i+=1
        else:
            j+=1
    
    while(i<len(pos_list_1)):
        result.add(pos_list_1[i])
        i+=1
    
    return result


In [51]:
def xORNOTy(pos_list_1, pos_list_2, u_list):
    i,j = 0,0
    not_pos_list_2 = SortedSet()
    while(i<len(pos_list_2) and j<len(u_list)):
        if pos_list_2[i]<u_list[j]:
            i+=1
        elif pos_list_2[i]>u_list[j]:
            not_pos_list_2.add(u_list[j])
            j+=1
        else:
            i+=1
            j+=1
    while(j<len(u_list)):
        not_pos_list_2.add(u_list[j])
        j+=1
    
    result= xORy(pos_list_1, not_pos_list_2)
    return result

In [52]:
def main():
   
    stopwords_set = set(stopwords.words('english'))
    
    
    list_of_files = fetchFilesInDirectory('text_files')
    
    
    file_dictionary = create_file_mapping(list_of_files)
    
   
    uii_file = open('unigram_inverted_index_pickle_file', 'rb')
    unigram_inverted_index = pickle.load(uii_file)
    uii_file.close()
    
    u_list = SortedSet()
    for a in range(len(file_mapping)):
        u_list.add(a)
    
    N = int(input("Enter the number of queries: "))
    for q in range(N):
        input_sentence = input("Enter the query sentence: ")
        input_operation_sequence = input("Enter the operation sequence: ")
        
       
        sanitized_query = preprocess_text(input_sentence, stopwords_set)
        
        
        sanitized_operation_sequence = [op_seq.lower().strip() for op_seq in input_operation_sequence.split(',')]

        print(sanitized_query)
        print(sanitized_operation_sequence)
        
        ptr1, ptr2 = 0, 0
        result = unigram_inverted_index.get(sanitized_query[ptr1], (None, SortedSet()))[1]

        while ptr2 < len(sanitized_operation_sequence):
            next_word = unigram_inverted_index.get(sanitized_query[ptr1 + 1], (None, SortedSet()))[1]

            if next_word is None:
                print(f'Word "{sanitized_query[ptr1 + 1]}" not found in the index. Skipping this part of the query.')
                ptr1 += 1
                continue

            if sanitized_operation_sequence[ptr2] == 'and':
                res = xANDy(result, next_word)
            elif sanitized_operation_sequence[ptr2] == 'or':
                res= xORy(result, next_word)
            elif sanitized_operation_sequence[ptr2] == 'and not':
                res = xANDNOTy(result, next_word)
            elif sanitized_operation_sequence[ptr2] == 'or not':
                res = xORNOTy(result, next_word, u_list)
            
            ptr1 += 1
            ptr2 += 1
            result = res

        print('Number of documents matched: {}'.format(len(result)))
        print('Documents: \n')
        
        for res_doc in result:
            print(file_mapping[res_doc])

if __name__ == "__main__":
    main()


Enter the number of queries: 2
Enter the query sentence: Car bag in a canister
Enter the operation sequence: or, and not
['bag', 'canister', 'car']
['or', 'and not']
Number of documents matched: 25
Documents: 

file118.txt
file3.txt
file313.txt
file363.txt
file404.txt
file459.txt
file466.txt
file573.txt
file665.txt
file682.txt
file686.txt
file698.txt
file699.txt
file73.txt
file738.txt
file780.txt
file797.txt
file860.txt
file863.txt
file864.txt
file892.txt
file930.txt
file942.txt
file956.txt
file981.txt
Enter the query sentence: Coffee brewing techniques in cookbook
Enter the operation sequence: and, or not,or
['brewing', 'coffee', 'cookbook', 'techniques']
['and', 'or not', 'or']
Number of documents matched: 999
Documents: 

file1.txt
file10.txt
file100.txt
file101.txt
file102.txt
file103.txt
file104.txt
file105.txt
file106.txt
file107.txt
file108.txt
file109.txt
file11.txt
file110.txt
file111.txt
file112.txt
file113.txt
file114.txt
file115.txt
file116.txt
file117.txt
file118.txt
file1

# Q3

In [36]:
def create_positional_index(preprocessed_tokens_list):
    positional_index = {}

    for doc_id, tokens in enumerate(preprocessed_tokens_list):
        for position, token in enumerate(tokens):
            if token not in positional_index:
                positional_index[token] = {}
            if doc_id not in positional_index[token]:
                positional_index[token][doc_id] = []
            positional_index[token][doc_id].append(position)

    return positional_index

positional_index = create_positional_index(preprocessed_tokens_list)

pickle_file_path = "positional_index.pkl"
with open(pickle_file_path, 'wb') as pickle_file:
    pickle.dump(positional_index, pickle_file)

with open(pickle_file_path, 'rb') as pickle_file:
    loaded_positional_index = pickle.load(pickle_file)

In [37]:
def execute_phrase_query(index, query_tokens):
    matching_documents = set()

    for token in query_tokens:
        if token in index:
            if not matching_documents:
                matching_documents.update(index[token].keys())
            else:
                matching_documents.intersection_update(index[token].keys())

    return matching_documents


In [38]:
N = int(input("Enter the number of queries: "))
for q in range(N):
    input_query = input("Enter the phrase query: ")
    sanitized_query = preprocess_text(input_query, stopwords_set)
    
    if len(sanitized_query) > 5:
        print("Input sequence length exceeds 5. Please provide a shorter input.")
        continue
    
    result_docs = execute_phrase_query(loaded_positional_index, ' '.join(sanitized_query))
    
    if result_docs is None:
        print(f"Error: One or more words from the phrase query are not present in the positional index.")
        continue

    print(f"Number of documents retrieved for query {q + 1} using positional index: {len(result_docs)}")
    print(f"Names of documents retrieved for query {q + 1} using positional index: {[f'File {doc_id + 1}.txt' for doc_id in result_docs]}")


Enter the number of queries: 2
Enter the phrase query: Car bag in a canister
Number of documents retrieved for query 1 using positional index: 0
Names of documents retrieved for query 1 using positional index: []
Enter the phrase query: Coffee brewing techniques in cookbook
Number of documents retrieved for query 2 using positional index: 297
Names of documents retrieved for query 2 using positional index: ['File 3.txt', 'File 517.txt', 'File 520.txt', 'File 9.txt', 'File 11.txt', 'File 525.txt', 'File 527.txt', 'File 16.txt', 'File 529.txt', 'File 18.txt', 'File 19.txt', 'File 530.txt', 'File 532.txt', 'File 535.txt', 'File 25.txt', 'File 27.txt', 'File 539.txt', 'File 31.txt', 'File 543.txt', 'File 34.txt', 'File 38.txt', 'File 551.txt', 'File 555.txt', 'File 44.txt', 'File 556.txt', 'File 557.txt', 'File 47.txt', 'File 48.txt', 'File 49.txt', 'File 561.txt', 'File 51.txt', 'File 52.txt', 'File 53.txt', 'File 566.txt', 'File 55.txt', 'File 567.txt', 'File 58.txt', 'File 570.txt', 'Fi