# In [6]:  (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
from google.colab import drive
import os
import math
from collections import defaultdict

# Step 1: Mount Google Drive.
# NOTE: this call sits at module level, so the mount is attempted as soon as
# this cell/module is executed, before any function in this file is called.
drive.mount('/content/drive')

# Step 2: Specify the folder path in your Google Drive where your corpus files are located.
# This path is listed for .txt files by main() and is the directory that
# load_corpus() joins each file name onto.
corpus_folder = '/content/drive/My Drive/corpus_folder/'  # Replace with your folder name

# Dynamically fetch all .txt files from the specified folder
def get_corpus_files(folder_path):
    """Return the names of the entries in *folder_path* that end in ``.txt``.

    Only bare names (not full paths) are returned, in whatever order
    ``os.listdir`` yields them. Sub-folders are not traversed.
    """
    return [entry for entry in os.listdir(folder_path) if entry.endswith('.txt')]

# Load and process corpus from Google Drive
def load_corpus(corpus_files, folder_path=None):
    """Read and tokenize every listed corpus file.

    Args:
        corpus_files: Iterable of file names, relative to the corpus folder.
        folder_path: Directory that contains the files. When omitted, the
            module-level ``corpus_folder`` is used, which keeps existing
            single-argument callers working unchanged.

    Returns:
        A dict mapping each file name to its list of lower-cased tokens.

    Raises:
        OSError: If a listed file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    base_dir = corpus_folder if folder_path is None else folder_path
    corpus = {}
    for filename in corpus_files:
        file_path = os.path.join(base_dir, filename)
        # Decode explicitly as UTF-8 so the result does not depend on the
        # locale's default encoding of whichever machine runs this.
        with open(file_path, 'r', encoding='utf-8') as file:
            # split() with no argument tokenizes on any run of whitespace
            # (spaces, tabs, newlines), not only on single spaces.
            corpus[filename] = file.read().lower().split()
    return corpus

# Build index (dictionary and postings lists)
def build_index(corpus):
    """Build the inverted index and the per-document vector lengths.

    Args:
        corpus: Mapping of document ID to that document's list of tokens.

    Returns:
        A ``(dictionary, document_lengths, N)`` tuple:

        * ``dictionary`` maps each term to ``{'df': document frequency,
          'postings': [(docID, raw term frequency), ...]}``.
        * ``document_lengths`` maps each docID to the Euclidean norm of its
          ``1 + log10(tf)`` weight vector (``0.0`` for an empty document).
        * ``N`` is the number of documents in *corpus*.
    """
    dictionary = defaultdict(lambda: {'df': 0, 'postings': []})
    document_lengths = {}

    for doc_id, tokens in corpus.items():
        # Raw occurrence count of every distinct term in this document.
        counts = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1

        # One pass records the posting and accumulates the squared weight
        # needed for the document's normalization factor.
        squared_norm = 0
        for token, count in counts.items():
            entry = dictionary[token]
            entry['df'] += 1
            entry['postings'].append((doc_id, count))
            squared_norm += (1 + math.log10(count)) ** 2
        document_lengths[doc_id] = math.sqrt(squared_norm)

    return dictionary, document_lengths, len(corpus)

# Compute the tf-idf for query and documents
def compute_tfidf(term, freq, df, N, for_query=False):
    """Return the weight of a term for a document or for the query.

    Args:
        term: The term being weighted (not used in the computation).
        freq: Raw frequency of the term; must be positive.
        df: Number of documents containing the term.
        N: Total number of documents.
        for_query: When true, multiply the log-scaled tf by
            ``log10(N / df)``; otherwise return the log-scaled tf alone.

    Returns:
        ``1 + log10(freq)``, scaled by the idf only when *for_query* is set.
    """
    log_tf = 1 + math.log10(freq)
    if not for_query:
        # Document side: idf is deliberately left out (lnc scheme).
        return log_tf
    return log_tf * math.log10(N / df)

# Rank documents based on cosine similarity
def rank_documents(query, dictionary, document_lengths, N, top_k=10):
    """Rank documents against *query* by length-normalized tf-idf score.

    Args:
        query: Free-text query; it is lower-cased and split on whitespace.
        dictionary: Term index as produced by ``build_index`` (term ->
            ``{'df': ..., 'postings': [(docID, freq), ...]}``).
        document_lengths: Mapping of docID to its normalization length.
        N: Total number of documents in the collection.
        top_k: Maximum number of results to return (default 10, the
            previously hard-coded cutoff). ``None`` returns every match.

    Returns:
        A list of ``(docID, score)`` pairs sorted by descending score, with
        ties broken by ascending docID. Only documents containing at least
        one query term appear; an empty or fully unknown query yields ``[]``.
    """
    query_freqs = defaultdict(int)
    for term in query.lower().split():
        query_freqs[term] += 1

    # Weight each query term; terms missing from the index are dropped here,
    # so no further membership check is needed when scoring.
    query_vector = {
        term: compute_tfidf(term, freq, dictionary[term]['df'], N, for_query=True)
        for term, freq in query_freqs.items()
        if term in dictionary
    }

    # Accumulate the dot product between the query and each document vector
    # by walking the postings list of every query term.
    scores = defaultdict(float)
    for term, query_weight in query_vector.items():
        df = dictionary[term]['df']
        for docID, doc_freq in dictionary[term]['postings']:
            doc_weight = compute_tfidf(term, doc_freq, df, N, for_query=False)
            scores[docID] += query_weight * doc_weight

    # Normalize by document length. Every scored document holds at least one
    # term, so its length is strictly positive.
    for docID in scores:
        scores[docID] /= document_lengths[docID]

    # Highest score first, then docID to make ties deterministic.
    ranked_docs = sorted(scores.items(), key=lambda x: (-x[1], x[0]))
    return ranked_docs[:top_k]

# Display the results in a vertical list format
def display_results(results):
    """Print ranked results as a numbered vertical list.

    Args:
        results: Sequence of ``(document name, score)`` pairs, best first.
            Scores are shown rounded to five decimal places.
    """
    # Header text includes the space between "10" and "Relevant".
    print("Top 10 Relevant Documents:")
    if not results:
        # Make the no-match case explicit instead of printing a bare header.
        print("No relevant documents found.")
        return
    for rank, (doc, score) in enumerate(results, start=1):
        print(f"{rank}. {doc} (Relevance Score: {round(score, 5)})")

# Main function
def main():
    """Index the corpus folder, prompt for one query and print its ranking."""
    # Step 3: discover the .txt files, read them, and build the index.
    index_terms, doc_norms, doc_count = build_index(
        load_corpus(get_corpus_files(corpus_folder))
    )

    # Read the search query interactively from the user.
    user_query = input("Enter your search query: ")

    # Score the collection and show the best matches as a vertical list.
    display_results(rank_documents(user_query, index_terms, doc_norms, doc_count))


if __name__ == "__main__":
    main()

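# Notebook cell output captured with this export:
#   ModuleNotFoundError: No module named 'sklearn'
# NOTE(review): nothing in this cell imports sklearn, so this traceback line
# presumably comes from a different cell or an earlier run; confirm before relying on it.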