<a href="https://colab.research.google.com/github/katariaNandini/IR/blob/main/ir_ass2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import zipfile
import math
import nltk
from nltk.corpus import stopwords  # Importing stopwords from NLTK
from nltk.stem import WordNetLemmatizer  # Importing lemmatizer for word normalization
from collections import defaultdict  # Default dictionary for easier dictionary operations
from typing import List, Dict, Tuple  # Importing typing hints for better code understanding

# Download the NLTK resources needed by the text preprocessing step.
nltk.download('stopwords')  # English stop-word list
nltk.download('punkt')  # Punkt tokenizer models (used by word_tokenize on older NLTK)
# NLTK >= 3.8.2 loads the tokenizer from 'punkt_tab' instead of the pickled
# 'punkt' models; without it word_tokenize raises LookupError on current releases.
nltk.download('punkt_tab')
nltk.download('wordnet')  # WordNet data used by WordNetLemmatizer

# Preprocessing function to clean and normalize the text
def preprocess(text: str) -> List[str]:
    """
    Normalize raw text into a list of index terms.

    Steps, in order:
    - lowercase the text and tokenize it with NLTK's word tokenizer
    - keep only purely alphabetic tokens
    - drop English stop words
    - lemmatize each remaining token with the WordNet lemmatizer
    """
    tokens = nltk.word_tokenize(text.lower())
    english_stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # One pass: a token survives only if it is alphabetic and not a stop word,
    # and it is lemmatized as it is collected.
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in english_stop_words
    ]

# Function to extract documents from a zip file
def extract_documents(corpus_zip_path: str, corpus_dir: str) -> Tuple[Dict[int, str], Dict[int, str]]:
    """
    Extract the corpus archive and load every ``.txt`` document it contains.

    The archive at ``corpus_zip_path`` is unpacked into ``corpus_dir`` (created
    if it does not exist). Documents are then read as UTF-8 from the ``Corpus``
    subdirectory of ``corpus_dir``; files without a ``.txt`` suffix are ignored.

    - Returns ``(docs, file_to_doc_id)``: ``docs`` maps document ID -> file
      content, and ``file_to_doc_id`` maps document ID -> filename.
    - Document IDs start at 1, are contiguous, and follow sorted filename
      order, so the same corpus always yields the same IDs.
    - If the ``Corpus`` subdirectory is missing, a message is printed and two
      empty dictionaries are returned.
    """
    docs = {}
    file_to_doc_id = {}

    # Create the extraction directory if it does not exist
    os.makedirs(corpus_dir, exist_ok=True)

    # Extract zip file to the directory
    with zipfile.ZipFile(corpus_zip_path, 'r') as zip_ref:
        zip_ref.extractall(corpus_dir)

    # Documents are expected in the 'Corpus' subdirectory of the extraction dir
    docs_dir = os.path.join(corpus_dir, 'Corpus')
    if not os.path.isdir(docs_dir):
        print(f"Directory {docs_dir} does not exist.")
        return {}, {}

    # os.listdir() returns entries in arbitrary order, so sort the names to get
    # reproducible IDs, and number only the .txt files so IDs have no gaps.
    txt_filenames = sorted(name for name in os.listdir(docs_dir) if name.endswith('.txt'))
    for doc_id, filename in enumerate(txt_filenames, start=1):
        file_path = os.path.join(docs_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            docs[doc_id] = file.read()
        file_to_doc_id[doc_id] = filename

    return docs, file_to_doc_id

# Function to build the inverted index for ranked retrieval
def build_inverted_index(docs: Dict[int, str]) -> Tuple[Dict[str, Dict[int, int]], Dict[int, float]]:
    """
    Build the term -> postings index and the per-document vector lengths.

    - The index maps each preprocessed term to ``{doc_id: raw term count}``.
    - The length of a document is the Euclidean norm of its log-weighted term
      frequencies, ``sqrt(sum((1 + log10(tf)) ** 2))``; it is used later to
      normalize retrieval scores.
    - Returns ``(inverted_index, doc_lengths)``.
    """
    postings = defaultdict(lambda: defaultdict(int))
    norms = defaultdict(float)

    for doc_id, content in docs.items():
        # Count how often each term occurs in this document
        counts = {}
        for token in preprocess(content):
            counts[token] = counts.get(token, 0) + 1

        # Record the postings and accumulate the squared log-tf weights
        squared_weight_sum = 0
        for token, count in counts.items():
            postings[token][doc_id] = count
            log_tf = 1 + math.log10(count)
            squared_weight_sum += log_tf ** 2

        norms[doc_id] = math.sqrt(squared_weight_sum)

    return postings, norms

# Function to handle ranked retrieval
def ranked_retrieval(query: str, inverted_index: Dict[str, Dict[int, int]], doc_lengths: Dict[int, float], total_docs: int) -> Dict[int, float]:
    """
    Score documents against a free-text query.

    - The query is preprocessed the same way as the documents.
    - Each query term contributes ``(1 + log10(tf_doc)) * idf * (1 + log10(tf_query))``
      to every document that contains it, where ``idf = log10(total_docs / df)``.
    - Each accumulated score is divided by the document's length when that
      length is positive.
    - Returns a dictionary of document IDs mapped to scores, ordered from the
      highest score to the lowest. Documents sharing no term with the query
      are absent.
    """
    # Term counts of the preprocessed query
    term_counts = defaultdict(int)
    for term in preprocess(query):
        term_counts[term] += 1

    scores = defaultdict(float)
    for term, count in term_counts.items():
        # Terms that never occur in the collection contribute nothing
        if term not in inverted_index:
            continue

        postings = inverted_index[term]
        doc_frequency = len(postings)
        if doc_frequency > 0:
            idf = math.log10(total_docs / doc_frequency)
        else:
            idf = 0

        query_weight = 1 + math.log10(count)  # log-scaled query term frequency
        for doc_id, term_freq in postings.items():
            tf = 1 + math.log10(term_freq)  # log-scaled document term frequency
            scores[doc_id] += tf * idf * query_weight

    # Length-normalize every score whose document has a positive length
    for doc_id in scores:
        length = doc_lengths[doc_id]
        if length > 0:
            scores[doc_id] /= length

    # Highest score first
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

# Main function to run the ranked retrieval system
def main():
    """
    Run the interactive ranked-retrieval loop.

    Loads the corpus from 'Corpus.zip' (extracted into 'Corpus'), builds the
    inverted index, then repeatedly reads a query from standard input and
    prints the five best-scoring documents until the user types 'exit'.
    """
    # Load the corpus; give up early when nothing could be read
    docs, file_to_doc_id = extract_documents('Corpus.zip', 'Corpus')
    if not docs:
        print("No documents loaded. Please check the files and their content.")
        return

    # Index the corpus once, before entering the query loop
    inverted_index, doc_lengths = build_inverted_index(docs)
    total_docs = len(docs)

    while True:
        query = input("Enter your search query (or 'exit' to quit): ")
        if query.lower() == 'exit':
            break

        ranking = ranked_retrieval(query, inverted_index, doc_lengths, total_docs)

        print(f"\nQuery: {query}")
        if not ranking:
            print("No matching documents.")
        else:
            print("Top 5 documents:")
            # The ranking is already ordered best-first; show its first five entries
            for doc_id, score in list(ranking.items())[:5]:
                print(f"  {file_to_doc_id[doc_id]} (Score: {score:.4f})")
        print("-" * 40)

# Entry point for the program: start the interactive loop only when this file
# is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Enter your search query (or 'exit' to quit): Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation

Query: Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation
Top 5 documents:
  zomato.txt (Score: 0.5903)
  swiggy.txt (Score: 0.3498)
  instagram.txt (Score: 0.1611)
  messenger.txt (Score: 0.1592)
  youtube.txt (Score: 0.1315)
----------------------------------------
Enter your search query (or 'exit' to quit): exit
