Q1. Data Preprocessing

In [1]:
# Mount Google Drive (Colab only); the dataset and output paths used later in
# this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import nltk
# Fetch the NLTK resources that the preprocessing cell relies on: the
# stop-word corpus (stopwords.words) and the 'punkt' tokenizer models
# (word_tokenize).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
import os
import random
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text):
    """Normalise raw text into a space-separated string of index terms.

    Steps, in order:
      1. Strip HTML markup with BeautifulSoup and keep only the text.
      2. Lowercase the extracted text.
      3. Tokenize with NLTK's ``word_tokenize``.
      4. Drop English stop words.
      5. Drop every token that is not purely alphanumeric (punctuation,
         tokens containing apostrophes or hyphens, ...).

    Args:
        text: Raw document or query text (may contain HTML).

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    # Parse the markup *before* lowercasing: characters produced by entity
    # decoding (e.g. "&#65;" -> "A") are then lowercased as well, and
    # case-sensitive entity names are not altered before the parser sees them.
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = text.lower()

    stop_words = _english_stop_words()

    # Single pass over the tokens. isalnum() is False for empty or
    # whitespace-only strings, so no separate blank-token filter is needed.
    tokens = [
        token
        for token in word_tokenize(text)
        if token not in stop_words and token.isalnum()
    ]

    return ' '.join(tokens)

def preprocess_and_save_files(dataset_path, preprocessed_path):
    """Preprocess every file in ``dataset_path`` and write the results.

    Each regular file ``<name>`` in ``dataset_path`` is read as UTF-8, passed
    through ``preprocess_text`` and written to
    ``<preprocessed_path>/preprocessed_<name>``. Existing output files are
    overwritten.

    Args:
        dataset_path: Directory containing the raw text files.
        preprocessed_path: Directory that receives the preprocessed files;
            created if it does not exist yet.
    """
    # Callers in later cells invoke this without creating the directory first.
    os.makedirs(preprocessed_path, exist_ok=True)

    # sorted() only makes the processing order deterministic.
    for file_name in sorted(os.listdir(dataset_path)):
        file_path = os.path.join(dataset_path, file_name)
        # os.listdir also returns sub-directories; open() would fail on those.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            original_text = file.read()
        preprocessed_text = preprocess_text(original_text)

        preprocessed_file_path = os.path.join(preprocessed_path, f"preprocessed_{file_name}")
        with open(preprocessed_file_path, 'w', encoding='utf-8') as preprocessed_file:
            preprocessed_file.write(preprocessed_text)

def print_random_sample_files(preprocessed_path, num_samples=5):
    """Print the first 300 characters of randomly chosen preprocessed files.

    Args:
        preprocessed_path: Directory containing the preprocessed files.
        num_samples: Maximum number of files to show. If the directory holds
            fewer files, all of them are shown instead of raising.
    """
    # Only regular files can be opened and displayed.
    files = [
        name
        for name in os.listdir(preprocessed_path)
        if os.path.isfile(os.path.join(preprocessed_path, name))
    ]
    # random.sample raises ValueError when asked for more items than exist.
    sample_files = random.sample(files, min(num_samples, len(files)))

    for file_name in sample_files:
        file_path = os.path.join(preprocessed_path, file_name)

        with open(file_path, 'r', encoding='utf-8') as file:
            preprocessed_text = file.read()

        print(f"\nSample file: {file_name}\n")
        print("\nAfter Preprocessing:\n")
        print(preprocessed_text[:300])  # Printing the first 300 characters

if __name__ == "__main__":
    # Input corpus and output directory on the mounted Google Drive.
    dataset_path = "/content/drive/My Drive/CSE508_Winter2024_A1_2021532/text_files"
    preprocessed_path = "/content/drive/My Drive/CSE508_Winter2024_A1_2021532/Preprocessed_files"

    # exist_ok=True replaces the separate exists() check and cannot fail if the
    # directory appears between the check and the creation.
    os.makedirs(preprocessed_path, exist_ok=True)

    # Preprocess the whole corpus, then show a few results for inspection.
    preprocess_and_save_files(dataset_path, preprocessed_path)

    print_random_sample_files(preprocessed_path)


  soup = BeautifulSoup(text, 'html.parser')



Sample file: preprocessed_file161.txt


After Preprocessing:

works guitar looks like new one

Sample file: preprocessed_file390.txt


After Preprocessing:

xvive sweet leo o2 lead distortion pedal demo im great lead player even able dial great tones pedal xvive sweet leo o2 distortion pedal great budget lead lead pedal im super impressed build quality packaging xvive well tones looking good lead distortion pedal functions like tube amp works may want c

Sample file: preprocessed_file902.txt


After Preprocessing:

beautiful strap matches guitar nicely perfect

Sample file: preprocessed_file927.txt


After Preprocessing:

ebay buy flip bad bass guitar worth much used new bridge pickup noisy pickups active pickup active pickup without boom sound would say 4 star everything gsr200 far body neck bridge mine 79 buy new 119 flip ok bass guitar string change setup first time bass buyer learning play

Sample file: preprocessed_file678.txt


After Preprocessing:

mixer good build knobs slider

Q2. Unigram Inverted Index and Boolean Queries


In [5]:
import os
import pickle
from collections import defaultdict

def create_inverted_index(preprocessed_path):
    """Build a unigram inverted index over a directory of preprocessed files.

    Every entry of ``preprocessed_path`` is read as UTF-8 text and split on
    whitespace; each resulting token is mapped to the set of file names in
    which it occurs.

    Returns:
        A ``defaultdict(set)`` mapping token -> set of file names.
    """
    postings = defaultdict(set)
    for doc_name in os.listdir(preprocessed_path):
        doc_path = os.path.join(preprocessed_path, doc_name)
        with open(doc_path, 'r', encoding='utf-8') as handle:
            tokens = handle.read().split()
        for token in tokens:
            postings[token].add(doc_name)
    return postings

def save_inverted_index(inverted_index, file_name):
    """Serialise ``inverted_index`` with pickle into the file ``file_name``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(file_name, 'wb') as sink:
        pickle.Pickler(sink).dump(inverted_index)

def load_inverted_index(file_name):
    """Read and return the object pickled in the file ``file_name``.

    NOTE(review): unpickling can execute arbitrary code; only load index files
    that this notebook wrote itself.
    """
    with open(file_name, 'rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored

def perform_query(inverted_index, query_terms, operations, all_docs=None):
    """Evaluate a boolean query strictly left to right.

    ``operations[i]`` combines the running result with the posting set of
    ``query_terms[i + 1]``. Terms or operations without a counterpart are
    ignored (the two sequences are zipped).

    Supported operations (case and surrounding whitespace are ignored):
        AND      result ∩ next
        OR       result ∪ next
        AND NOT  result − next
        OR NOT   result ∪ (all documents − next)

    Args:
        inverted_index: Mapping term -> set of document names.
        query_terms: Preprocessed query terms.
        operations: Operators placed between consecutive terms.
        all_docs: Optional collection of every document name, used as the
            universe for ``OR NOT``. Defaults to the union of all posting
            sets, which omits documents that contain no indexed term.

    Returns:
        A new set of matching document names (empty if there are no terms).

    Raises:
        ValueError: If an operation that is actually applied is not one of
            the supported operators.
    """
    if not query_terms:
        return set()

    # Copy each posting set so the returned set never aliases the index.
    postings = [set(inverted_index.get(term, ())) for term in query_terms]

    universe = None  # built lazily; only OR NOT needs it

    result = postings[0]
    for op, next_set in zip(operations, postings[1:]):
        normalized = ' '.join(op.upper().split())
        if normalized == 'AND':
            result = result & next_set
        elif normalized == 'OR':
            result = result | next_set
        elif normalized == 'AND NOT':
            result = result - next_set
        elif normalized == 'OR NOT':
            # A OR NOT B needs the complement of B; (A | B) - B would just be
            # A AND NOT B.
            if universe is None:
                if all_docs is not None:
                    universe = set(all_docs)
                else:
                    universe = set().union(*inverted_index.values())
            result = result | (universe - next_set)
        else:
            raise ValueError(
                f"Unsupported operation {op!r}; expected AND, OR, AND NOT or OR NOT"
            )
    return result

def format_query(query_terms, operations):
    """Render terms and operators as ``t1 OP1 t2 OP2 t3 ...``.

    Only operators that sit between two terms are shown, so surplus operators
    never produce a dangling trailing operator. Terms that have no operator
    linking them to the previous term are omitted, and empty operator strings
    are skipped.

    Args:
        query_terms: Sequence of preprocessed query terms.
        operations: Sequence of operator strings (list or tuple).

    Returns:
        The formatted query string ('' when there are no terms).
    """
    query_terms = list(query_terms)
    if not query_terms:
        return ''

    # At most len(terms) - 1 operators can sit between terms.
    usable_ops = list(operations)[:len(query_terms) - 1]

    parts = [query_terms[0]]
    for op, term in zip(usable_ops, query_terms[1:]):
        if op:
            parts.append(op)
        parts.append(term)
    return ' '.join(parts)

def process_queries(preprocessed_path, inverted_index_file):
    """Interactively read boolean queries and print the matching documents.

    Prompts for the number of queries, then for each query its text and a
    comma-separated operator list. Each query text is run through
    ``preprocess_text`` before being evaluated against the pickled index.

    Args:
        preprocessed_path: Unused here; kept so existing calls keep working.
        inverted_index_file: Path of the pickled inverted index to load.
    """
    inverted_index = load_inverted_index(inverted_index_file)
    n = int(input("Enter the number of queries: "))
    queries = []
    for i in range(n):
        query_text = input(f"Enter query {i + 1}: ")
        raw_operations = input(f"Enter operations for query {i + 1}, separated by commas: ")
        # Split on ',' and normalise whitespace/case so inputs such as
        # "OR,AND NOT" or "or , and  not" are accepted; drop empty entries.
        operations = [
            ' '.join(op.split()).upper()
            for op in raw_operations.split(',')
            if op.strip()
        ]
        queries.append((query_text, operations))

    for i, (query_text, operations) in enumerate(queries):
        preprocessed_query = preprocess_text(query_text)
        query_terms = preprocessed_query.split()

        # Preprocessing can remove every term (e.g. a query of stop words).
        if query_terms:
            result_docs = perform_query(inverted_index, query_terms, operations)
        else:
            result_docs = set()

        formatted_query = format_query(query_terms, operations)
        print(f"\nQuery {i + 1}: {formatted_query}")
        print(f"Number of documents retrieved for query {i + 1}: {len(result_docs)}")
        # Sets have no stable order; sort so the output is reproducible.
        print(f"Names of the documents retrieved for query {i + 1}: {', '.join(sorted(result_docs))}")

if __name__ == "__main__":
    # Corpus, preprocessed output directory and index file on Google Drive.
    dataset_path = "/content/drive/My Drive/CSE508_Winter2024_A1_2021532/text_files"
    preprocessed_path = "/content/drive/My Drive/CSE508_Winter2024_A1_2021532/Preprocessed_files"
    inverted_index_file = "/content/drive/My Drive/CSE508_Winter2024_A1_2021532/inverted_index.pkl"

    # NOTE(review): this preprocesses the whole corpus again; the Q1 cell
    # already makes the same call with the same paths.
    preprocess_and_save_files(dataset_path, preprocessed_path)

    # Build the unigram inverted index from the preprocessed files and pickle it.
    inverted_index = create_inverted_index(preprocessed_path)
    save_inverted_index(inverted_index, inverted_index_file)

    # Reload the index from disk and answer interactive boolean queries.
    process_queries(preprocessed_path, inverted_index_file)

  soup = BeautifulSoup(text, 'html.parser')


Enter the number of queries: 2
Enter query 1: Car bag in a canister
Enter operations for query 1, separated by commas: OR, AND NOT
Enter query 2: Coffee brewing techniques in cookbook
Enter operations for query 2, separated by commas: AND, OR NOT, OR

Query 1: car OR bag AND NOT canister
Number of documents retrieved for query 1: 31
Names of the documents retrieved for query 1: preprocessed_file682.txt, preprocessed_file686.txt, preprocessed_file118.txt, preprocessed_file698.txt, preprocessed_file166.txt, preprocessed_file313.txt, preprocessed_file363.txt, preprocessed_file3.txt, preprocessed_file404.txt, preprocessed_file542.txt, preprocessed_file956.txt, preprocessed_file780.txt, preprocessed_file942.txt, preprocessed_file864.txt, preprocessed_file930.txt, preprocessed_file860.txt, preprocessed_file174.txt, preprocessed_file264.txt, preprocessed_file573.txt, preprocessed_file886.txt, preprocessed_file797.txt, preprocessed_file466.txt, preprocessed_file665.txt, preprocessed_file892.tx

Q3. Positional Index and Phrase Queries

In [6]:
import os
import pickle
from collections import defaultdict


def default_dict():
    """Factory for one term's postings: document name -> list of positions.

    Presumably a named module-level function (rather than a lambda) so that
    the index, which stores this factory, can be pickled.
    """
    return defaultdict(list)

def create_positional_index(preprocessed_path):
    """Build a positional index over a directory of preprocessed files.

    Every entry of ``preprocessed_path`` is read as UTF-8 text and split on
    whitespace. The zero-based offset of each token within its file is
    appended to ``index[token][file_name]``.

    Returns:
        A ``defaultdict`` mapping token -> {file name -> [positions]}.
    """
    index = defaultdict(default_dict)
    for doc_name in os.listdir(preprocessed_path):
        doc_path = os.path.join(preprocessed_path, doc_name)
        with open(doc_path, 'r', encoding='utf-8') as handle:
            tokens = handle.read().split()
        for offset, token in enumerate(tokens):
            index[token][doc_name].append(offset)
    return index


def save_positional_index(positional_index, file_name):
    """Serialise ``positional_index`` with pickle into the file ``file_name``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(file_name, 'wb') as sink:
        pickle.Pickler(sink).dump(positional_index)

def load_positional_index(file_name):
    """Read and return the object pickled in the file ``file_name``.

    NOTE(review): unpickling can execute arbitrary code; only load index files
    that this notebook wrote itself.
    """
    with open(file_name, 'rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored

def perform_phrase_query(positional_index, query_terms):
    """Return the documents in which ``query_terms`` occur consecutively.

    A document matches when, for some position ``p`` of the first term, term
    ``i`` of the query occurs at position ``p + i`` for every ``i``.

    Args:
        positional_index: Mapping term -> {document name -> [positions]}.
        query_terms: Preprocessed phrase terms, in order.

    Returns:
        Set of matching document names; empty when there are no terms or any
        term is absent from the index.
    """
    if not query_terms:
        return set()

    # .get() instead of indexing: looking up an unknown term must not insert
    # an empty entry into a defaultdict index, nor raise on a plain dict.
    postings = [positional_index.get(term, {}) for term in query_terms]

    # Only documents containing every term can contain the phrase.
    candidate_docs = set(postings[0]).intersection(*postings[1:])

    valid_docs = set()
    for doc in candidate_docs:
        first_positions = postings[0][doc]
        # Sets give constant-time membership checks for the follow-up terms.
        later_positions = [set(posting[doc]) for posting in postings[1:]]
        for start in first_positions:
            if all(
                start + offset in positions
                for offset, positions in enumerate(later_positions, start=1)
            ):
                valid_docs.add(doc)
                break

    return valid_docs

def process_queries(preprocessed_path, positional_index_file):
    """Interactively read phrase queries and print the matching documents.

    Prompts for the number of queries; each query text is run through
    ``preprocess_text`` and evaluated against the pickled positional index.

    NOTE(review): this shares its name with the boolean-query function defined
    in the Q2 cell and replaces it once this cell has run.

    Args:
        preprocessed_path: Unused here; kept so existing calls keep working.
        positional_index_file: Path of the pickled positional index to load.
    """
    positional_index = load_positional_index(positional_index_file)
    n = int(input("Enter the number of queries: "))
    for i in range(n):
        query_text = input(f"Enter phrase query {i + 1}: ")
        preprocessed_query = preprocess_text(query_text)
        query_terms = preprocessed_query.split()

        result_docs = perform_phrase_query(positional_index, query_terms)
        print(f"Number of documents retrieved for query {i + 1} using positional index: {len(result_docs)}")
        # Sets have no stable order; sort so the output is reproducible.
        print(f"Names of documents retrieved for query {i + 1} using positional index: {', '.join(sorted(result_docs))}")

if __name__ == "__main__":
    # Corpus, preprocessed output directory and index file on Google Drive.
    dataset_path = "/content/drive/My Drive/CSE508_Winter2024_A1_2021532/text_files"
    preprocessed_path = "/content/drive/My Drive/CSE508_Winter2024_A1_2021532/Preprocessed_files"
    positional_index_file = "/content/drive/My Drive/CSE508_Winter2024_A1_2021532/positional_index.pkl"

    # NOTE(review): this preprocesses the whole corpus again; the Q1 and Q2
    # cells already make the same call with the same paths.
    preprocess_and_save_files(dataset_path, preprocessed_path)

    # Build the positional index from the preprocessed files and pickle it.
    positional_index = create_positional_index(preprocessed_path)
    save_positional_index(positional_index, positional_index_file)

    # Reload the index from disk and answer interactive phrase queries.
    process_queries(preprocessed_path, positional_index_file)


  soup = BeautifulSoup(text, 'html.parser')


Enter the number of queries: 3
Enter phrase query 1: it is a good in front for poutch
Number of documents retrieved for query 1 using positional index: 0
Names of documents retrieved for query 1 using positional index: 
Enter phrase query 2: it is good in reliable for fit
Number of documents retrieved for query 2 using positional index: 1
Names of documents retrieved for query 2 using positional index: preprocessed_file9.txt
Enter phrase query 3: it is a fit front poutch
Number of documents retrieved for query 3 using positional index: 1
Names of documents retrieved for query 3 using positional index: preprocessed_file9.txt
