<a href="https://colab.research.google.com/github/katariaNandini/IR/blob/main/ir_ass2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import zipfile
import math
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from typing import List, Dict, Tuple

# Download the NLTK resources that preprocess() relies on: the English
# stop-word list, the tokenizer models used by nltk.word_tokenize, and WordNet
# for the lemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the 'punkt_tab' resource for word_tokenize while
# older ones use 'punkt', so both are requested to work on either.
nltk.download('punkt_tab')
nltk.download('wordnet')

class Document:
    """A single corpus entry: the name it is known by and its raw text."""

    def __init__(self, name: str, content: str):
        # Both values are stored as given; nothing is validated or normalised.
        self.name, self.content = name, content

# Preprocessing function
def preprocess(text: str) -> List[str]:
    """Convert raw text into a list of normalised index terms.

    The text is lower-cased and tokenised with NLTK; tokens that are not
    purely alphabetic or that appear in NLTK's English stop-word list are
    discarded, and every remaining token is lemmatised with WordNet.

    Args:
        text: The raw text of a document or query.

    Returns:
        The surviving lemmatised tokens, in their original order.
    """
    tokens = nltk.word_tokenize(text.lower())
    excluded = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()

    terms = []
    for token in tokens:
        # Drop punctuation/numbers first, then stop words, then lemmatise.
        if token.isalpha() and token not in excluded:
            terms.append(wordnet_lemmatizer.lemmatize(token))
    return terms

# Function to load documents from a zip file using filenames as keys
def load_documents(zip_path: str, extract_to: str) -> Dict[str, Document]:
    """Extract a zip archive and load every non-empty ``.txt`` file found.

    Args:
        zip_path: Path of the zip archive holding the corpus.
        extract_to: Directory the archive is extracted into; it is created
            when it does not exist yet.

    Returns:
        A mapping from file base name to its ``Document``. Every file under
        ``extract_to`` is considered, so files sharing a base name in
        different sub-directories overwrite one another (the last one wins).
    """
    if not os.path.exists(extract_to):
        os.makedirs(extract_to)

    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(extract_to)

    # Gather the full path of every file below the extraction directory
    # before any of them is opened.
    found_paths = [
        os.path.join(folder, entry)
        for folder, _subdirs, entries in os.walk(extract_to)
        for entry in entries
    ]

    loaded = {}
    for path in found_paths:
        if not path.endswith('.txt'):
            continue
        with open(path, 'r', encoding='utf-8') as handle:
            text = handle.read().strip()
        if not text:
            # Files that are empty after stripping whitespace are skipped.
            continue
        base_name = os.path.basename(path)
        loaded[base_name] = Document(name=base_name, content=text)

    return loaded

# Function to build the inverted index for ranked retrieval
def build_inverted_index(docs: Dict[str, Document]) -> Tuple[Dict[str, Dict[str, int]], Dict[str, float]]:
    """Build the term -> postings index and per-document vector lengths.

    Args:
        docs: Mapping from document name to ``Document``.

    Returns:
        A pair ``(inverted_index, doc_lengths)``:

        * ``inverted_index[term][doc_name]`` is the raw count of ``term`` in
          that document after preprocessing.
        * ``doc_lengths[doc_name]`` is the Euclidean length of the document's
          log-weighted term-frequency vector, i.e.
          ``sqrt(sum((1 + log10(tf)) ** 2))`` over its distinct terms.
    """
    postings = defaultdict(lambda: defaultdict(int))
    vector_lengths = defaultdict(float)

    for name, doc in docs.items():
        # Raw term counts for this document, in first-occurrence order.
        counts = {}
        for term in preprocess(doc.content):
            counts[term] = counts.get(term, 0) + 1

        squared_total = 0
        for term in counts:
            raw_tf = counts[term]
            postings[term][name] = raw_tf
            log_tf = 1 + math.log10(raw_tf)
            squared_total += log_tf ** 2
        vector_lengths[name] = math.sqrt(squared_total)

    return postings, vector_lengths

# Function to handle ranked retrieval
def ranked_retrieval(query: str,
                     inverted_index: Dict[str, Dict[str, int]],
                     doc_lengths: Dict[str, float],
                     total_docs: int,
                     top_n: int = 10) -> Dict[str, float]:
    """Rank documents against a free-text query using lnc.ltc cosine scoring.

    Documents are weighted with logarithmic term frequency and cosine
    normalisation (``doc_lengths`` supplies the per-document norm). The query
    is weighted with logarithmic term frequency times inverse document
    frequency, and is cosine-normalised as well, so each score is the cosine
    similarity between the two weighted vectors.

    Args:
        query: Raw query text; it is run through ``preprocess``.
        inverted_index: ``term -> {doc_name: raw term frequency}``.
        doc_lengths: ``doc_name -> Euclidean length`` of the document's
            log-tf vector. Documents missing from this mapping, or with a
            length of zero, are left un-normalised on the document side.
        total_docs: Number of documents in the collection (used for idf).
        top_n: Maximum number of results to return.

    Returns:
        A dict mapping document name to score, ordered from the highest
        score to the lowest and containing at most ``top_n`` entries. Only
        documents sharing at least one term with the query appear.
    """
    # Raw term frequencies of the preprocessed query.
    query_term_freq = defaultdict(int)
    for term in preprocess(query):
        query_term_freq[term] += 1

    # "lt" part of the query weighting: (1 + log10 tf) * idf. Terms that are
    # absent from the index have no document frequency and cannot match any
    # document, so they contribute nothing and are skipped. ``.get`` is used so
    # a defaultdict index is not grown by the lookup.
    query_weights = {}
    for term, freq in query_term_freq.items():
        postings = inverted_index.get(term)
        if not postings:
            continue
        idf = math.log10(total_docs / len(postings))
        query_weights[term] = (1 + math.log10(freq)) * idf

    # "c" part: Euclidean length of the query vector. It is zero when no query
    # term is indexed or every matching term has idf 0 (present in all
    # documents); the division is skipped in that case.
    query_norm = math.sqrt(sum(weight * weight for weight in query_weights.values()))

    # Accumulate the dot product between document and query weights.
    doc_scores = defaultdict(float)
    for term, query_weight in query_weights.items():
        for doc_name, term_freq in inverted_index[term].items():
            tf = 1 + math.log10(term_freq)
            doc_scores[doc_name] += tf * query_weight

    # Cosine-normalise by both vector lengths. ``.get`` avoids inserting keys
    # into a defaultdict and avoids a KeyError on a plain dict.
    for doc_name in doc_scores:
        doc_length = doc_lengths.get(doc_name, 0)
        if doc_length > 0:
            doc_scores[doc_name] /= doc_length
        if query_norm > 0:
            doc_scores[doc_name] /= query_norm

    # Return only the top N documents; the sort is stable, so ties keep the
    # order in which the documents were first scored.
    ranked = sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:top_n])

# Function to display document details
def display_document(doc_name: str, docs: Dict[str, Document], file_id_mapping: Dict[str, str], score: float):
    """Print a short summary of one ranked result.

    Prints the document's name, its first two lines as a preview, the score
    rounded to four decimal places, a Google Drive link (or a placeholder when
    no file ID is known for the document) and a separator line.

    Args:
        doc_name: Key of the document in ``docs``.
        docs: Mapping from document name to ``Document``.
        file_id_mapping: Mapping from document name to Google Drive file ID.
        score: Retrieval score to display.
    """
    document = docs[doc_name]
    preview = '\n'.join(document.content.split('\n')[:2])

    # A missing or empty file ID falls back to the placeholder text.
    drive_id = file_id_mapping.get(doc_name)
    link = (
        f"https://drive.google.com/file/d/{drive_id}/view?usp=sharing"
        if drive_id
        else "Link not available"
    )

    print(f"  Document Name: {document.name}")
    print(f"  Content Preview:\n{preview}")
    print(f"  Score: {score:.4f}")
    print(f"  Link to Document: {link}")
    print("-" * 50)

# Main function to run the ranked retrieval system
def main():
    """Run the interactive ranked-retrieval session.

    Loads the corpus from ``Corpus.zip`` into the ``Corpus`` directory, builds
    the inverted index once, and then repeatedly reads queries from standard
    input, printing the top ranked documents for each. The loop ends when the
    user enters ``exit`` (case-insensitive, surrounding whitespace ignored)
    or when standard input is closed.
    """
    corpus_zip_path = 'Corpus.zip'
    corpus_dir = 'Corpus'

    # Load documents
    docs = load_documents(corpus_zip_path, corpus_dir)

    if not docs:
        print("No documents loaded. Please check the files and their content.")
        return

    # Build inverted index and calculate document lengths
    inverted_index, doc_lengths = build_inverted_index(docs)
    total_docs = len(docs)

    # Google Drive file IDs keyed by document file name; used to build the
    # link shown next to each result.
    file_id_mapping = {
        'microsoft.txt': '17gtGk9isJIjMOwJUhrvScWIkBeIivQvk',
        'apple.txt': '1Bi7UM3yJfro4Tu3-g3fm5Y2XZ8YeZHg5',
        'nike.txt': '1RDdQqOcbVrmzPYIDQ6R1t7QXWrvzI68P',
        'flipkart.txt': '1PRdJlT5wixJd_FTe3moFqTlHbanAo_5_',
        'google.txt': '1EEjfP9vyh8uzq3-N-TF95i_T91ZOF1On',
        'Discord.txt': '1-zWgMQAwSOCsmQGYzkyAINNZdGZaG_9N',
        'zomato.txt': '1i_XUVOQDZOvH21vVDAzPMFe3Db-60jIX',
        'Amazon.txt': '1Vyp9dRXbByFNjHUCWjx6IOT5fBSNSPSu',
        'Binance.txt': '10Iz3xS6CMbbkFkwDuENYEKoHrGh47WHZ',
        'shakespeare.txt': '1RHmqiyJN3dsr0eLJwZaN_7cqJf44u4mQ',
        'paypal.txt': '1SU3qe7WPduaiOWQiEWKjVgWWNJJG2P8l',
        'instagram.txt': '1myzc16SQiItwv7Fu34xKPdzhEAg8WMf_',
        'volkswagen.txt': '1YdVx3j2U1QIUqjApzzu_7Vc0yhwdN4W0',
        'HP.txt': '1gym8q9fuilqyNOZ_MbdTriQKiF4fs21T',
        'spotify.txt': '1cfhbL9uugppmXFwqCtwpgvpbx7k8iQtC',
        'Uber.txt': '1uRFepmUklcGoydXm7CNOCbyApWmyJ3zu',
        'youtube.txt': '1u32cxm6JkQXMttqcgpKtOaW9JF_io2HZ',
        'motorola.txt': '1jTtyGhZ5Nt0r5H5P4q9SCMM4EKNocFbq',
        'yahoo.txt': '1L8cqu31UdqfUlSl2BUPZm2WgcA7KWXhW',
        'Lenovo.txt': '1Yf9ypzXtV0GVRzSROA0Rf5lDlzEZdurG',
        'Dell.txt': '1q628g8gPoXhQwQophtXpa1EOfLTia6Yr',
        'levis.txt': '1NXxLhKMWLjFZw-OHi2omCUHneE65MEKy',
        'whatsapp.txt': '1IKOAeE6R_lYM79KcCg1R69jaZqFESi4T',
        'bing.txt': '1eJefZuegIKYKtO_Ui6Ft613HjNy4gjNa',
        'hawai.txt': '1UM0HfKDquiYfQUYPM2w8zOhOodJ8mpXB',
        'samsung.txt': '1W-hLebBkUgVqyvpPxrei8todpzcj2Zsf',
        'swiggy.txt': '1z8tNchwcY5fZpu9X5PTN_brt3TKn8jEA',
        'skype.txt': '1M44nNmnO8tBORZJefBZDpsA4eRsAvK8u',
        'messenger.txt': '1Ow9GrppA1haCplAzpl8sQCwinar6Ynri',
        'telegram.txt': '1c_wKzP_Kx9Sbu58ewhPSf8MyeX7C0OvK',
        'steam.txt': '1FcptFYZQ7aLZYXl0kQXp5UluCXzLG-Q9',
        'reliance.txt': '1885pbtntJwW_FkqriJBi_3OP450H-4rA',
        'canva.txt': '1RAThPxqywa6QnsfOx7m1Ye12oD-FFb48',
        'puma.txt': '1vd-v2IEBm3HEtCmQgEzlQTUixZrBFU-g',
        'nokia.txt': '1aeyAcBDBROKq0-oodl8ozOF_lKN0uQlV',
        'reddit.txt': '1Mg_hDi642we5KCGW6ZL6flSiijWgXxLH',
        'sony.txt': '1Lg1DRlNH4JS2SMyysDMN60JyHWsi3Q6T',
        'Ola.txt': '1KWB-AIr2ThSdquUGLfKXf6V-6haWSWwV',
        'Adobe.txt': '1gu9NXN9jW4Kr7O8XNaxfdtG7XVt3pG2V',
        'blackberry.txt': '1c9-7OnQuQo9ZuRzPRhSlqaDJUI57Cjbb',
        'huawei.txt': '1KwpOmzf6rt6i7ef1mm1S8ddi8ZUmpt5Q',
        'operating.txt': '1maHBi5xRkntmTRGyyRWIQ3-IpPGhzykc',
    }

    while True:
        try:
            user_query = input("Enter your query (or 'exit' to quit): ")
        except EOFError:
            # Standard input was closed (e.g. piped input ran out); leave the
            # loop instead of crashing with a traceback.
            break

        # Ignore surrounding whitespace so inputs such as "exit " still quit.
        if user_query.strip().lower() == 'exit':
            break

        # Perform ranked retrieval
        doc_scores = ranked_retrieval(user_query, inverted_index, doc_lengths, total_docs)

        # Display results
        if doc_scores:
            print("Top ranked documents:")
            for doc_name, score in doc_scores.items():
                display_document(doc_name, docs, file_id_mapping, score)
        else:
            print("No documents found matching the query.")

# Start the interactive session only when this code runs as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Enter your query (or 'exit' to quit): Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation
Top ranked documents:
  Document ID: 28
  Document Name: zomato.txt
  Content Preview:
If you are a restaurant owner or marketing manager for a restaurant, you’ll love Zomato. But what is Zomato, exactly, and why would you love it? We’ll introduce you to the platform, how to set up your business account, and everything that Zomato has to offer to help you boost your business.

  Score: 0.5903
  Link to Document: https://drive.google.com/file/d/1qCxhqPCobg-TzTzcrlRtprlhvZRDjqy0/view?usp=sharing
--------------------------------------------------
  Document ID: 30
  Document Name: swiggy.txt
  Content Preview:
What Is Swiggy And How It’s Working?

  Score: 0.3498
  Link to Document: https://drive.google.com/file/d/1lTf2QiDCHu3syg6TqeGeTjupN4gtdWjI/view?usp=sharing
--------------------------------------------------
  Document ID: 16
  Documen