In [1]:
import nltk
# Fetch the NLTK data packages this notebook relies on: the stopword list,
# the punkt tokenizer models (needed by word_tokenize) and WordNet (needed by
# WordNetLemmatizer). The calls are no-ops when the data is already present.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import os
import string
import logging
import re  # Regular expressions, used to strip non-alphanumeric characters
from collections import defaultdict, Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/haddiphuel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/haddiphuel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/haddiphuel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Shared NLP resources, created once at module level and reused by clean_text.
LEMMATIZER = WordNetLemmatizer()
STOPWORDS = set(stopwords.words('english'))

In [5]:
def load_text_files(folder_path):
    """Read every ``.txt`` file in a folder and give each one a numeric document ID.

    Files are processed in sorted filename order so that the doc_id assigned to
    a given file is the same on every run and on every platform
    (``os.listdir`` itself returns entries in arbitrary order). Entries whose
    name ends in ``.txt`` but which are not regular files (e.g. directories)
    are skipped instead of crashing ``open``.

    Args:
        folder_path: Directory to scan. Only its direct children are considered.

    Returns:
        tuple: ``(data, doc_id_to_filename)`` where ``data`` maps
        doc_id -> file content and ``doc_id_to_filename`` maps
        doc_id -> filename. IDs are consecutive integers starting at 0.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    txt_files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.endswith(".txt") and os.path.isfile(os.path.join(folder_path, name))
    )

    data = {}
    doc_id_to_filename = {}

    for doc_id, filename in enumerate(txt_files):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            data[doc_id] = file.read()
        doc_id_to_filename[doc_id] = filename  # Map doc_id to filename
        logging.info(f"Loaded file: {filename} with doc_id: {doc_id}")

    return data, doc_id_to_filename

In [7]:
def clean_text(text):
    """Turn raw text into a list of lemmatized tokens with stopwords removed.

    The text is lowercased, every character that is not an ASCII letter, a
    digit or whitespace is deleted, the remainder is tokenized with NLTK, and
    each non-stopword token is lemmatized.
    """
    lowered = text.lower()

    # Characters are deleted (not replaced by a space), so "don't" becomes "dont".
    stripped = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)

    kept = []
    for word in word_tokenize(stripped):
        if word in STOPWORDS:
            continue
        kept.append(LEMMATIZER.lemmatize(word))

    return kept


In [9]:
def build_inverted_index(data):
    """Index a collection of documents by term and count term occurrences.

    Args:
        data: Mapping of doc_id -> raw document text.

    Returns:
        tuple: ``(inverted_index, term_frequencies)``. ``inverted_index`` is a
        ``defaultdict(set)`` mapping each cleaned token to the set of doc_ids
        it appears in; ``term_frequencies`` is a ``Counter`` of how many times
        each token occurs across all documents.
    """
    postings = defaultdict(set)
    counts = Counter()

    for doc_id, content in data.items():
        doc_tokens = clean_text(content)
        counts.update(doc_tokens)  # one increment per occurrence
        for token in doc_tokens:
            postings[token].add(doc_id)

    return postings, counts

In [11]:
def boolean_query(query, inverted_index, doc_id_to_filename):
    """Evaluate a Boolean query against the inverted index and return filenames.

    The query is lowercased, split on whitespace and evaluated strictly from
    left to right (no operator precedence, no parentheses):

    * ``and`` / ``or`` combine the running result with the next term;
    * ``not`` complements the next term against the full document collection;
    * two adjacent terms with no operator between them are combined with AND.

    Terms are looked up verbatim after lowercasing, so they must already be in
    the same form as the keys of ``inverted_index``.

    Args:
        query: Query string such as ``"bike AND helmet"``, ``"bike OR helmet"``,
            ``"NOT bike"`` or ``"bike AND NOT helmet"``.
        inverted_index: Mapping of term -> set of doc_ids containing the term.
            It is only read; looking up an unknown term does not add a key.
        doc_id_to_filename: Mapping of doc_id -> name to report. Its keys are
            treated as the complete set of documents when evaluating NOT.

    Returns:
        list: Names of the matching documents, in the iteration order of
        ``doc_id_to_filename``. Empty if nothing matches or the query contains
        no search term.
    """
    query = query.lower()
    tokens = query.split()

    # NOT must be taken relative to the documents, not to the index keys
    # (which are terms), so the universe comes from the doc_id mapping.
    all_doc_ids = set(doc_id_to_filename)

    result_set = None      # None until the first operand has been seen
    pending_op = 'and'     # operator to apply to the next operand
    negate_next = False    # True when the next operand is preceded by NOT

    for token in tokens:
        if token in ('and', 'or'):
            pending_op = token
            continue
        if token == 'not':
            negate_next = not negate_next
            continue

        # .get avoids inserting missing terms into a defaultdict index.
        postings = set(inverted_index.get(token, ()))
        if negate_next:
            postings = all_doc_ids - postings
            negate_next = False

        if result_set is None:
            result_set = postings
        elif pending_op == 'or':
            result_set = result_set | postings
        else:
            result_set = result_set & postings

        pending_op = 'and'  # adjacent terms default to AND

    if result_set is None:
        result_set = set()

    # Convert doc_ids to filenames; iterating the mapping (rather than the set)
    # keeps the output order deterministic and drops unknown doc_ids.
    result_filenames = [
        filename
        for doc_id, filename in doc_id_to_filename.items()
        if doc_id in result_set
    ]

    logging.info(f"Query '{query}' resulted in: {result_filenames}")

    return result_filenames

In [13]:
def generate_queries_file(term_frequencies, output_path="queries.txt"):
    """Write example Boolean queries built from the most frequent terms.

    Up to three lines are written, depending on how many distinct terms exist:

    * ``<1st> AND <2nd>``  (needs at least 2 terms)
    * ``<2nd> OR <3rd>``   (needs at least 3 terms)
    * ``NOT <4th>``        (needs at least 4 terms)

    Args:
        term_frequencies: ``collections.Counter`` of term -> occurrence count.
        output_path: File to (over)write. Defaults to ``"queries.txt"`` in the
            current working directory.

    Returns:
        None. The file is created even when no query can be generated.
    """
    # Only the four most common terms are ever used by the templates below.
    top_terms = [term for term, _ in term_frequencies.most_common(4)]

    queries = []
    if len(top_terms) >= 2:
        queries.append(f"{top_terms[0]} AND {top_terms[1]}")
    if len(top_terms) >= 3:
        queries.append(f"{top_terms[1]} OR {top_terms[2]}")
    if len(top_terms) >= 4:
        queries.append(f"NOT {top_terms[3]}")

    with open(output_path, "w") as file:
        file.writelines(f"{query}\n" for query in queries)

In [37]:
def main(file_path='/Users/haddiphuel/Desktop/week 1 sample and text /rec.motorcycles.txt'):
    """Run the whole pipeline: load, split, index, generate queries, answer them.

    Args:
        file_path: Path of the single text file that holds the corpus. The
            default is the location this notebook was originally run against;
            pass a different path to run it on another machine or dataset.

    Side effects:
        Prints a sample of the documents, a sample of the index and a
        truncated view of each query result; writes ``queries.txt`` (through
        ``generate_queries_file``) and ``query_results.txt`` in the current
        working directory.
    """
    # NOTE(review): load_text_file and split_documents are not defined in any
    # cell shown here; they must already exist in the kernel before this runs.

    # Load the content of the file
    data = load_text_file(file_path)

    # Split the file content into individual documents
    documents = split_documents(data)

    # Print only the first 5 documents for debugging
    print("Loaded documents (first 5):", list(documents.items())[:5])

    # Build the inverted index and term frequencies
    inverted_index, term_frequencies = build_inverted_index(documents)

    # Print only a sample of the inverted index (first 5 entries)
    print("Inverted Index (sample):", dict(list(inverted_index.items())[:5]))

    # Generate the queries.txt file
    generate_queries_file(term_frequencies)

    # Read the queries from the generated queries.txt file
    with open("queries.txt", "r") as query_file:
        queries = query_file.readlines()

    # Open a file to write the results
    with open("query_results.txt", "w") as result_file:
        for query in queries:
            query = query.strip()  # Remove any leading/trailing whitespace
            if query:  # Skip empty lines
                # `documents` is passed as the doc_id -> name mapping, so each
                # hit is reported as the value stored in `documents` for that ID.
                result = boolean_query(query, inverted_index, documents)
                result_str = f"Results for '{query}': {result}\n"
                print(result_str[:500])  # Print only the first 500 characters of each result
                result_file.write(result_str)  # Write full results to file

if __name__ == "__main__":
    main()


Inverted Index (sample): {'newsgroup': {1, 2, 3, 5, 7, 8, 9, 11, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 52, 53, 54, 55, 57, 58, 59, 60, 64, 65, 66, 67, 73, 74, 75, 77, 78, 83, 86, 87, 88, 90, 91, 92, 93, 94, 96, 97, 98, 99, 100, 101, 102, 103, 104, 106, 107, 109, 110, 111, 113, 116, 117, 118, 119, 120, 122, 124, 125, 126, 127, 128, 129, 130, 133, 135, 136, 137, 138, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 167, 168, 169, 171, 172, 173, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 199, 201, 203, 204, 206, 208, 211, 212, 213, 218, 219, 220, 222, 223, 225, 226, 227, 228, 231, 232, 233, 234, 235, 236, 238, 239, 240, 241, 243, 244, 245, 246, 247, 248, 250, 252, 253, 254, 255, 256, 257, 260, 261, 263, 264, 265, 266, 267, 268, 270, 272, 273, 276, 277, 278, 279, 280, 2