<a href="https://colab.research.google.com/github/katariaNandini/IR/blob/main/ir_ass2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import zipfile
import math
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from typing import List, Dict, Tuple

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

class Document:
    def __init__(self, name: str, content: str):
        self.name = name
        self.content = content

# Preprocessing function
def preprocess(text: str) -> List[str]:
    text = text.lower()
    words = nltk.word_tokenize(text)
    words = [word for word in words if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return words

# Function to load documents from a zip file
def load_documents(zip_path: str, extract_to: str) -> Dict[int, Document]:
    if not os.path.exists(extract_to):
        os.makedirs(extract_to)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    # Verify extraction
    extracted_files = []
    for root, dirs, files in os.walk(extract_to):
        for file in files:
            extracted_files.append(os.path.join(root, file))

    # Load the documents
    docs = {}
    for i, file_path in enumerate(extracted_files):
        if file_path.endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read().strip()
                if content:  # Only add non-empty files
                    docs[i + 1] = Document(name=os.path.basename(file_path), content=content)

    return docs

# Function to build the inverted index for ranked retrieval
def build_inverted_index(docs: Dict[int, Document]) -> Tuple[Dict[str, Dict[int, int]], Dict[int, float]]:
    inverted_index = defaultdict(lambda: defaultdict(int))
    doc_lengths = defaultdict(float)

    for doc_id, document in docs.items():
        words = preprocess(document.content)
        word_freq = defaultdict(int)
        for word in words:
            word_freq[word] += 1

        # Compute TF and accumulate document lengths for normalization
        doc_length = 0
        for word, freq in word_freq.items():
            inverted_index[word][doc_id] = freq
            doc_length += (1 + math.log10(freq)) ** 2
        doc_lengths[doc_id] = math.sqrt(doc_length)

    return inverted_index, doc_lengths

# Function to handle ranked retrieval
def ranked_retrieval(query: str,
                     inverted_index: Dict[str, Dict[int, int]],
                     doc_lengths: Dict[int, float],
                     total_docs: int,
                     top_n: int = 6) -> Dict[int, float]:

    # Preprocess the query
    query_terms = preprocess(query)
    query_term_freq = defaultdict(int)

    # Calculate term frequency for the query
    for term in query_terms:
        query_term_freq[term] += 1

    # Calculate query weights using the ltc scheme
    query_weights = {}
    for term, freq in query_term_freq.items():
        query_weights[term] = 1 + math.log10(freq)

    # Score the documents
    doc_scores = defaultdict(float)
    for term, query_weight in query_weights.items():
        if term in inverted_index:
            doc_freqs = inverted_index[term]
            idf = math.log10(total_docs / len(doc_freqs)) if len(doc_freqs) > 0 else 0
            for doc_id, term_freq in doc_freqs.items():
                tf = 1 + math.log10(term_freq)
                doc_scores[doc_id] += tf * idf * query_weight

    # Normalize the document scores
    for doc_id in doc_scores:
        if doc_lengths[doc_id] > 0:
            doc_scores[doc_id] /= doc_lengths[doc_id]

    # Return only the top N documents
    return dict(sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)[:top_n])

# Function to display document details
def display_document(doc_id: int, docs: Dict[int, Document], file_id_mapping: Dict[int, str], score: float):
    doc = docs[doc_id]
    lines = doc.content.split('\n')
    first_two_lines = '\n'.join(lines[:2])

    # Get the file ID for the document
    file_id = file_id_mapping.get(doc_id, None)
    if file_id:
        # Construct the Google Drive link for the individual file
        link = f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
    else:
        link = "Link not available"

    print(f"  Document ID: {doc_id}")
    print(f"  Document Name: {doc.name}")
    print(f"  Content Preview:\n{first_two_lines}")
    print(f"  Score: {score:.4f}")  # Display the score with four decimal places
    print(f"  Link to Document: {link}")
    print("-" * 50)

# Main function to run the ranked retrieval system
def main():
    corpus_zip_path = 'Corpus.zip'
    corpus_dir = 'Corpus'

    # Load documents
    docs = load_documents(corpus_zip_path, corpus_dir)

    if not docs:
        print("No documents loaded. Please check the files and their content.")
        return

    # Build inverted index and calculate document lengths
    inverted_index, doc_lengths = build_inverted_index(docs)
    total_docs = len(docs)

    # Example mapping for file IDs
    file_id_mapping = {
        1: '17gtGk9isJIjMOwJUhrvScWIkBeIivQvk',
        2: '1Bi7UM3yJfro4Tu3-g3fm5Y2XZ8YeZHg5',
        3: '1RDdQqOcbVrmzPYIDQ6R1t7QXWrvzI68P',
        4: '1PRdJlT5wixJd_FTe3moFqTlHbanAo_5_',
        5: '1EEjfP9vyh8uzq3-N-TF95i_T91ZOF1On',
        6: '1-zWgMQAwSOCsmQGYzkyAINNZdGZaG_9N',
        7: '1iiaWgK1Ic8jxQCFTDoAJwfEPOCpBcHyb',
        8: '1RHmqiyJN3dsr0eLJwZaN_7cqJf44u4mQ',
        9: '10Iz3xS6CMbbkFkwDuENYEKoHrGh47WHZ',
        10: '1i_XUVOQDZOvH21vVDAzPMFe3Db-60jIX',
        11: '1Vyp9dRXbByFNjHUCWjx6IOT5fBSNSPSu',
        12: '1SU3qe7WPduaiOWQiEWKjVgWWNJJG2P8l', #paypal
        13: '1myzc16SQiItwv7Fu34xKPdzhEAg8WMf_', #ig
        14: '1YdVx3j2U1QIUqjApzzu_7Vc0yhwdN4W0', #volkswagen
        15: '1gym8q9fuilqyNOZ_MbdTriQKiF4fs21T', #hp
        16: '1cfhbL9uugppmXFwqCtwpgvpbx7k8iQtC', #spotify
        17: '1uRFepmUklcGoydXm7CNOCbyApWmyJ3zu', #uber
        18: '1u32cxm6JkQXMttqcgpKtOaW9JF_io2HZ', #utube
        19: '1jTtyGhZ5Nt0r5H5P4q9SCMM4EKNocFbq', #motorola
        20: '1L8cqu31UdqfUlSl2BUPZm2WgcA7KWXhW', #yahoo
        21: '1Yf9ypzXtV0GVRzSROA0Rf5lDlzEZdurG', #lenevo
        22: '1q628g8gPoXhQwQophtXpa1EOfLTia6Yr', #dell
        23: '1NXxLhKMWLjFZw-OHi2omCUHneE65MEKy', #levis
        24: '1IKOAeE6R_lYM79KcCg1R69jaZqFESi4T', #whatsapp
        25: '1eJefZuegIKYKtO_Ui6Ft613HjNy4gjNa', #bing
        26: '1UM0HfKDquiYfQUYPMb0OefYXuO_pU-py', #calender
        27: '1P7xYaV89N-4Hrj_o2mh0VOZDbvDuKYKn', #ia
        28: '1qCxhqPCobg-TzTzcrlRtprlhvZRDjqy0', #mck
        29: '1RREKbmPEKwVh6A7A2FzI9oXw4eAIn4aG', #coca-cola
        30: '1lTf2QiDCHu3syg6TqeGeTjupN4gtdWjI', #red bull
        31: '1GyV9b5-dUwH0H5b7ED0Gy0RrGx0MKp2P', #pepsi
        32: '1ihkdXfowgn_G0X_15fC6H9fsC8pLN3NZ', #mtv
        33: '1tHcRPbDq2aQGAxqA-zOhBlqZlz4h1ZB5', #imdb
        34: '1sQXLYZ6g9DJHMBF-Fk97pbwAFWBNg0Id', #ebay
        35: '1RQZWV4HEmP7Y4FCu0QzWPKZ4HvGPdPvm', #twitch
        36: '1IBMFYSrMTcIRzeR-2msWr9tUV4opI6wI', #craigslist
        37: '1DUby7sNxMeDqJ_kKZFd3heJIV9U_a5I8', #google
        38: '1uCwNBkEKFl5dKzEKpZK0Ab0TPn53Bl7C', #yellow pages
        39: '1MYYGZRaTz9R2N-lDp0FhPU4D5YvOePf2', #twitter
        40: '1OfrYB87DEUPu5CAdX6lYcKcKAGuQ-D4D', #credit karma
        41: '1W9osaf5vDpk5I6gFtuTG16yD6J6Kbi2u', #monster
        42: '1f0kmftCR9xn3giGRqUmOEy7Z9_aHiNIi', #facebook
        43: '1LFwa-RZstduYgG3ZpAwAcH6pV2O0xDb5', #travelocity
        44: '1ue0TWp4Fb68W_XcELvBboGvGyOon8z4F', #yelp
        45: '1_nxJTIHQdeHjBE6btF5Fz4QQdJK0LqT8', #cars
        46: '1hyRV1nd_hOUop9pL23J5tt7G-xhiF1is', #expedia
        47: '1-uRRBpm8S61FWT3t9UJ-zU1klun9o5E', #cheap
        48: '1DhT0w8NYFx53b2OWac_X8izDOUu93e5x', #last fm
        49: '1L9Dqu4lMGLm89Sx49Fu-a7ZJg5gYtOnB', #foxnews
        50: '1ScWlCTk7cfZfjxi2C7cR9C99KpfuLV3Z', #fandom
    }

    while True:
        user_query = input("Enter your query (or 'exit' to quit): ")
        if user_query.lower() == 'exit':
            break

        # Perform ranked retrieval
        doc_scores = ranked_retrieval(user_query, inverted_index, doc_lengths, total_docs)

        # Display results
        if doc_scores:
            print("Top ranked documents:")
            for doc_id, score in doc_scores.items():
                display_document(doc_id, docs, file_id_mapping, score)
        else:
            print("No documents found matching the query.")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Enter your query (or 'exit' to quit): Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation
Top ranked documents:
  Document ID: 28
  Document Name: zomato.txt
  Content Preview:
If you are a restaurant owner or marketing manager for a restaurant, you’ll love Zomato. But what is Zomato, exactly, and why would you love it? We’ll introduce you to the platform, how to set up your business account, and everything that Zomato has to offer to help you boost your business.

  Score: 0.5903
  Link to Document: https://drive.google.com/file/d/1qCxhqPCobg-TzTzcrlRtprlhvZRDjqy0/view?usp=sharing
--------------------------------------------------
  Document ID: 30
  Document Name: swiggy.txt
  Content Preview:
What Is Swiggy And How It’s Working?

  Score: 0.3498
  Link to Document: https://drive.google.com/file/d/1lTf2QiDCHu3syg6TqeGeTjupN4gtdWjI/view?usp=sharing
--------------------------------------------------
  Document ID: 16
  Documen