<a href="https://colab.research.google.com/github/katariaNandini/IR/blob/main/vsm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import zipfile
import math
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from typing import List, Dict, Tuple

# Download the NLTK resources that preprocess() relies on:
#   - 'stopwords' : English stop-word list
#   - 'punkt'     : tokenizer models loaded by nltk.word_tokenize on older NLTK
#   - 'punkt_tab' : tokenizer tables loaded by nltk.word_tokenize on recent
#                   NLTK releases, which no longer read the pickled 'punkt'
#                   models; without it tokenization raises LookupError
#   - 'wordnet'   : lexical database used by WordNetLemmatizer
# nltk.download() reports (rather than raises) when a package id is unknown to
# the installed NLTK version, so requesting both tokenizer packages is safe.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Preprocessing function
def preprocess(text: str) -> List[str]:
    """Turn raw text into a list of normalized index terms.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``. Tokens
    that are not purely alphabetic, or that appear in NLTK's English
    stop-word list, are dropped; the survivors are lemmatized with
    ``WordNetLemmatizer`` (default settings).

    Args:
        text: Raw document or query text.

    Returns:
        The lemmatized tokens, in their original order (duplicates kept).
    """
    tokens = nltk.word_tokenize(text.lower())
    english_stop_words = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()
    # Single pass: filter (alphabetic, non-stop-word) and lemmatize together.
    return [
        wordnet.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in english_stop_words
    ]

# Function to extract documents
def extract_documents(corpus_zip_path: str, corpus_dir: str) -> Tuple[Dict[int, str], Dict[int, str]]:
    """Unpack the corpus archive and load every ``.txt`` document inside it.

    The archive is extracted into ``corpus_dir`` (created if missing) and the
    documents are read from its ``Corpus`` sub-directory. Only files whose
    name ends in ``.txt`` are loaded. They are processed in sorted filename
    order and numbered consecutively from 1, so IDs are reproducible across
    runs and contain no gaps when other files are present in the directory.

    Args:
        corpus_zip_path: Path to the zip archive holding the corpus.
        corpus_dir: Directory the archive is extracted into.

    Returns:
        A pair ``(docs, file_to_doc_id)`` where ``docs`` maps doc ID to the
        file's text and ``file_to_doc_id`` maps the same doc ID to the
        filename. Both are empty when ``<corpus_dir>/Corpus`` is not a
        directory after extraction.

    Raises:
        FileNotFoundError: If ``corpus_zip_path`` does not exist.
        zipfile.BadZipFile: If ``corpus_zip_path`` is not a valid zip archive.
        UnicodeDecodeError: If a document is not valid UTF-8.
    """
    os.makedirs(corpus_dir, exist_ok=True)

    with zipfile.ZipFile(corpus_zip_path, 'r') as zip_ref:
        zip_ref.extractall(corpus_dir)

    docs_dir = os.path.join(corpus_dir, 'Corpus')
    if not os.path.isdir(docs_dir):
        print(f"Directory {docs_dir} does not exist.")
        return {}, {}

    docs: Dict[int, str] = {}
    file_to_doc_id: Dict[int, str] = {}

    # os.listdir() returns entries in arbitrary order; filter first and sort
    # so that IDs are dense (1..N) and stable from run to run.
    txt_files = sorted(
        name for name in os.listdir(docs_dir) if name.endswith('.txt')
    )
    for doc_id, filename in enumerate(txt_files, start=1):
        file_path = os.path.join(docs_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            docs[doc_id] = file.read()
        file_to_doc_id[doc_id] = filename

    return docs, file_to_doc_id

# Function to build the inverted index for ranked retrieval
def build_inverted_index(docs: Dict[int, str]) -> Tuple[Dict[str, Dict[int, int]], Dict[int, float]]:
    """Build the term -> postings index and the per-document vector lengths.

    Each document is run through ``preprocess``. For every distinct term in a
    document the log-scaled term frequency ``1 + log10(count)`` is stored in
    the posting for that document. The document's length is the Euclidean
    norm of those weights.

    Note: the stored posting weights are floats, even though the return
    annotation declares them as ``int``.

    Args:
        docs: Mapping of doc ID to raw document text.

    Returns:
        A pair ``(inverted_index, doc_lengths)``: ``inverted_index[term][doc_id]``
        is the log-tf weight of ``term`` in that document, and
        ``doc_lengths[doc_id]`` is the norm of the document's weight vector
        (``0.0`` for a document with no surviving terms). Both are
        ``defaultdict`` instances.
    """
    doc_lengths = defaultdict(float)
    inverted_index = defaultdict(lambda: defaultdict(int))

    for doc_id in docs:
        # Raw occurrence counts of each term in this document.
        counts = defaultdict(int)
        for token in preprocess(docs[doc_id]):
            counts[token] += 1

        squared_norm = 0
        for token in counts:
            weight = 1 + math.log10(counts[token])  # log-scaled tf
            inverted_index[token][doc_id] = weight
            squared_norm += weight ** 2
        doc_lengths[doc_id] = math.sqrt(squared_norm)

    return inverted_index, doc_lengths

# Function to handle ranked retrieval
def ranked_retrieval(query: str, inverted_index: Dict[str, Dict[int, float]], doc_lengths: Dict[int, float], total_docs: int) -> Dict[int, float]:
    """Score documents against a free-text query and return them best-first.

    The query is run through ``preprocess``; each distinct query term gets the
    weight ``1 + log10(count)``. For every query term found in the index, each
    posting adds ``doc_weight * idf * query_weight`` to that document's score,
    with ``idf = log10(total_docs / document_frequency)``. Each accumulated
    score is then divided by the document's length (when it is positive) and
    by the query length. The query length is the Euclidean norm of the query's
    log-tf weights only; idf is not part of that norm.

    Args:
        query: Raw query text.
        inverted_index: Mapping ``term -> {doc_id: log-tf weight}``.
        doc_lengths: Mapping ``doc_id -> document vector length``.
        total_docs: Number of documents in the collection (numerator of idf).

    Returns:
        A dict mapping doc ID to score, ordered by descending score. Only
        documents containing at least one query term are present.
    """
    # Raw term counts for the query.
    term_counts = defaultdict(int)
    for term in preprocess(query):
        term_counts[term] += 1

    # Log-scaled query weights and their squared norm.
    query_weights = {}
    squared_norm = 0
    for term in term_counts:
        weight = 1 + math.log10(term_counts[term])
        query_weights[term] = weight
        squared_norm += weight ** 2

    # A query with no surviving terms has norm 0; fall back to 1 so the
    # division below cannot fail.
    if squared_norm > 0:
        query_length = math.sqrt(squared_norm)
    else:
        query_length = 1

    # Accumulate the raw dot product term by term.
    doc_scores = defaultdict(float)
    for term, query_weight in query_weights.items():
        if term not in inverted_index:
            continue
        postings = inverted_index[term]
        if len(postings) > 0:
            idf = math.log10(total_docs / len(postings))
        else:
            idf = 0
        for doc_id, term_weight in postings.items():
            doc_scores[doc_id] += term_weight * idf * query_weight

    # Normalize by document length (skipped when it is not positive), then by
    # query length.
    ranked = []
    for doc_id, score in doc_scores.items():
        length = doc_lengths[doc_id]
        if length > 0:
            score /= length
        ranked.append((doc_id, score / query_length))

    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

# Main function to run the ranked retrieval system
def main():
    """Run the interactive ranked-retrieval loop over the bundled corpus.

    Extracts ``Corpus.zip`` into ``Corpus``, builds the index once, then
    repeatedly prompts for a query and prints up to five best-scoring
    documents until the user types ``exit`` (case-insensitive).
    """
    zip_location = 'Corpus.zip'
    extraction_root = 'Corpus'

    # Load the collection; bail out early if nothing could be read.
    docs, file_to_doc_id = extract_documents(zip_location, extraction_root)
    if not docs:
        print("No documents loaded. Please check the files and their content.")
        return

    # Index once up front; every query reuses these structures.
    inverted_index, doc_lengths = build_inverted_index(docs)
    total_docs = len(docs)

    prompt = "Enter your search query (or 'exit' to quit): "
    separator = "-" * 40

    query = input(prompt)
    while query.lower() != 'exit':
        ranking = ranked_retrieval(query, inverted_index, doc_lengths, total_docs)

        print(f"\nQuery: {query}")
        if not ranking:
            print("No matching documents.")
        else:
            print("Top 5 documents:")
            # The ranking is already ordered best-first; keep the first five.
            for doc_id, score in list(ranking.items())[:5]:
                print(f"  {file_to_doc_id[doc_id]} (Score: {score:.4f})")
        print(separator)

        query = input(prompt)

# Entry point for the program
if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Query: Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation
Top 5 documents:
  zomato.txt (Score: 0.1755)
  swiggy.txt (Score: 0.1039)
  instagram.txt (Score: 0.0476)
  messenger.txt (Score: 0.0471)
  youtube.txt (Score: 0.0389)
----------------------------------------

Query: Warwickshire, came from an ancient family and was the heiress to some land
Top 5 documents:
  shakespeare.txt (Score: 0.1640)
  levis.txt (Score: 0.0360)
  nike.txt (Score: 0.0272)
  huawei.txt (Score: 0.0209)
  zomato.txt (Score: 0.0202)
----------------------------------------
