In [61]:
import os
import math
from collections import defaultdict

# Initialize global variables
dictionary = defaultdict(list)
doc_lengths = {}
docID_to_filename = {}
N = 0  # Total number of documents

# Indexing phase
def index_corpus(corpus_path):
    global N
    for docID, filename in enumerate(os.listdir(corpus_path)):
        docID_to_filename[docID] = filename  # Store filename for later use
        N += 1
        with open(os.path.join(corpus_path, filename), 'r', encoding='utf-8') as file:
            content = file.read().lower().split()  # Basic tokenization
            term_freqs = defaultdict(int)
            # Calculate term frequencies
            for term in content:
                term_freqs[term] += 1
            
            # Update dictionary and postings
            for term, tf in term_freqs.items():
                dictionary[term].append((docID, tf))
                
            # Calculate and store document length
            length = math.sqrt(sum((1 + math.log10(tf)) ** 2 for tf in term_freqs.values()))
            doc_lengths[docID] = length

# Query phase
def process_query(query):
    query_terms = query.lower().split()  # Basic tokenization
    query_weights = defaultdict(float)
    
    # Calculate tf-idf for query with boosted weights
    query_length = 0
    for term in query_terms:
        tf = query_terms.count(term)
        df = len(dictionary[term]) if term in dictionary else 0
        idf = math.log10((N + 1) / (df + 0.5))  # Adjusted IDF for smoothing
        tf_idf = (1 + math.log10(tf)) * idf  # Log-normalized tf-idf
        query_weights[term] = tf_idf
        query_length += tf_idf ** 2  # Calculate query length for normalization
    
    query_length = math.sqrt(query_length)  # Normalize the query vector
    
    # Rank documents by cosine similarity
    scores = defaultdict(float)
    for term, query_weight in query_weights.items():
        if term in dictionary:
            for docID, tf in dictionary[term]:
                # Use log-normalized tf-idf for document weight
                doc_weight = (1 + math.log10(tf)) * math.log10((N + 1) / (len(dictionary[term]) + 0.5))
                scores[docID] += (query_weight * doc_weight)
    
    # Normalize the scores by document lengths and query length
    for docID in scores:
        if doc_lengths[docID] > 0 and query_length > 0:
            scores[docID] /= (doc_lengths[docID] * query_length)
    
    # Apply sigmoid function to boost lower scores while keeping them between 0 and 1
    for docID in scores:
        scores[docID] = 2 / (1 + math.exp(-10 * scores[docID])) - 1
    
    # Sort by score and then by docID
    ranked_docs = sorted(scores.items(), key=lambda x: (-x[1], x[0]))
    
    # Output top 10 results with actual filenames
    return [(docID_to_filename[docID], score) for docID, score in ranked_docs[:10]]

# Main Function
def main():
    corpus_path = 'C:\\Users\\Himanish\\Desktop\\IR Assignment 2\\Corpus'
    index_corpus(corpus_path)
    
    # Ask user for input query
    query = input("Enter your search query: ")
    
    # Process the query and get results
    results = process_query(query)
    
    # Display the results
    print("\nTop 10 relevant documents:")
    for doc, score in results:
        print(f"{doc}: {score:.15f}")  # Displaying scores with higher precision for better comparison

if __name__ == "__main__":
    main()

Enter your search query:  to see if your restaurant already has a listing on Zomato. If you find it, you’ll start with claiming your restaurant. Otherwise, you’ll begin by creating a new listing instead.  1. Claim or Start Your Listing  First, visit Zomato’s business homepage to search for your restaurant. If you can’t find your listing you can add a restaurant here. Make sure you’re on the business search page and not the user-facing search engine.  If your listing is in Zomato and unclaimed, you will see the option to claim it with a green button. If it’s already claimed, you will need to track down the owner of the page or contact Zomato for help.  Filippi's Pizza Grotto in Little Italy has not yet claimed their listing. If you were the restaurant owner, you would click the green outlined "Claim this restaurant" button.  In order to claim your listing, you’ need verification that you are the owner or manager of the business. Zomato asks for a document of proof, like your certificate


Top 10 relevant documents:
zomato.txt: 0.933279770682370
swiggy.txt: 0.272265286132856
reddit.txt: 0.211260876771811
google.txt: 0.189137133951502
bing.txt: 0.175068387563053
whatsapp.txt: 0.158757256384313
messenger.txt: 0.152372905486089
yahoo.txt: 0.147460471037912
skype.txt: 0.122323820323397
telegram.txt: 0.112197473772053
