In [24]:
import os
import re
from collections import defaultdict


In [30]:

# Tokenizer shared by documents and queries
def preprocess(text):
    """Lower-case ``text`` and split it into word tokens.

    A token is a maximal run of ``\\w`` characters (letters, digits,
    underscore); everything else acts as a separator.

    Args:
        text: The raw string to tokenize.

    Returns:
        list[str]: The tokens in order of appearance (empty list if none).
    """
    lowered = text.lower()
    return [match.group() for match in re.finditer(r'\b\w+\b', lowered)]
# Function to load and tokenize every .txt document in a folder
def load_documents(folder_path):
    """Read each ``.txt`` file directly inside ``folder_path`` and tokenize it.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended into.

    Returns:
        dict: Maps each file name (not the full path) to the token list
        produced by ``preprocess``. Keys are inserted in sorted file-name
        order.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    docs = {}
    # os.listdir returns names in arbitrary order; sorting makes the resulting
    # dict order reproducible across runs and platforms.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        path = os.path.join(folder_path, filename)
        if not os.path.isfile(path):
            # A directory that merely happens to be named "*.txt" cannot be read.
            continue
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding, which fails on these files.
        with open(path, 'r', encoding='utf-8') as file:
            docs[filename] = preprocess(file.read())  # Store preprocessed words
    return docs


In [26]:

# Collection statistics: per-document term counts and per-term document counts
def compute_statistics(docs):
    """Count term occurrences per document and document occurrences per term.

    Args:
        docs: Mapping of document id -> iterable list of tokens.

    Returns:
        tuple: ``(term_freq, term_doc_freq, doc_count)`` where

        * ``term_freq`` is a nested defaultdict, ``term_freq[doc][term]`` being
          how many times ``term`` occurs in ``doc``. A document with no tokens
          gets no entry at all.
        * ``term_doc_freq`` is a defaultdict, ``term_doc_freq[term]`` being the
          number of documents that contain ``term`` at least once.
        * ``doc_count`` is ``len(docs)``, which includes token-less documents.
    """
    occurrences = defaultdict(lambda: defaultdict(int))
    containing_docs = defaultdict(int)

    for name, tokens in docs.items():
        # Each distinct token contributes exactly one to its document frequency.
        for token in set(tokens):
            containing_docs[token] += 1
        # Every occurrence contributes to the within-document count.
        for token in tokens:
            occurrences[name][token] += 1

    return occurrences, containing_docs, len(docs)


In [27]:

# Function to score every document against a query with smoothed probability ratios
def compute_relevance(query, term_freq, term_doc_freq, doc_count):
    """Score each document in ``term_freq`` against the query terms.

    For every query term the document's score is multiplied by
    ``p_rel / p_non`` where

    * ``p_rel = (tf + 1) / (doc_length + vocab_size)`` with ``tf`` the term's
      count in the document and ``doc_length`` the sum of all counts in it;
    * ``p_non = (df + 1) / (doc_count - df + vocab_size)`` with ``df`` the
      term's document frequency;
    * ``vocab_size = len(term_doc_freq)``.

    NOTE(review): the notebook labels this the Binary Independence Model, but
    the ratio above uses within-document term counts and is not the textbook
    BIM term weight. The formula is kept exactly as written so that scores are
    unchanged; confirm whether that is the intended model.

    Args:
        query: Iterable of query tokens; repeated tokens are applied repeatedly.
        term_freq: Mapping doc id -> mapping term -> count in that document.
        term_doc_freq: Mapping term -> number of documents containing it.
        doc_count: Total number of documents in the collection.

    Returns:
        dict: doc id -> score (float), one entry per key of ``term_freq``.
        An empty query gives every document a score of 1.0.
    """
    # Nothing to score; return early so no ratio is evaluated at all.
    if not term_freq:
        return {}

    terms = list(query)
    vocab_size = len(term_doc_freq)

    # The "not relevant" estimate depends only on the term, never on the
    # document, so compute it once per query term instead of once per
    # (document, term) pair.
    non_relevant = []
    for term in terms:
        df = term_doc_freq.get(term, 0)  # Document frequency of the term
        non_relevant.append((df + 1) / (doc_count - df + vocab_size))

    scores = {}
    for doc_id, counts in term_freq.items():
        # Document length is invariant across query terms; sum it once.
        smoothed_length = sum(counts.values()) + vocab_size
        score = 1.0  # Multiplicative identity: an empty query leaves it at 1
        for term, p_term_given_not_relevant in zip(terms, non_relevant):
            p_term_given_relevant = (counts.get(term, 0) + 1) / smoothed_length
            score *= (p_term_given_relevant / p_term_given_not_relevant)
        scores[doc_id] = score
    return scores


In [28]:

# Main retrieval function
def retrieve_documents(folder_path, queries):
    """Rank the documents in ``folder_path`` for each query and print the ranking.

    The collection is loaded and its statistics are computed once; each query
    is then tokenized with ``preprocess``, scored with ``compute_relevance``
    and printed from highest to lowest score. Documents with equal scores keep
    the order in which they appear in the score mapping.

    Args:
        folder_path: Directory passed to ``load_documents``.
        queries: Iterable of raw query strings.

    Returns:
        None. The rankings are written to standard output only.
    """
    term_freq, term_doc_freq, doc_count = compute_statistics(load_documents(folder_path))

    for query in queries:
        scores = compute_relevance(preprocess(query), term_freq, term_doc_freq, doc_count)

        # One header line, one line per document (best first), then a blank separator.
        print(f"Query: {query}")
        for doc_id in sorted(scores, key=scores.get, reverse=True):
            score = scores[doc_id]
            print(f"Document: {doc_id}, Score: {score:.4f}")
        print()


In [29]:
# Corpus location. Set the BOOKS_DIR environment variable to run the notebook
# on another machine; when it is unset the original local path is used.
folder_path = os.environ.get('BOOKS_DIR', r'D:\jupter notebook\books')
queries = ["beautiful new dress", "friends", "king who kills fish"]
retrieve_documents(folder_path, queries)


Query: beautiful new dress
Document: Beauty and the Beast Story.txt, Score: 0.0711
Document: Cinderella Story.txt, Score: 0.0597
Document: The honest woodcutter.txt, Score: 0.0283
Document: A Frog and two Fishes Story.txt, Score: 0.0176
Document: Friends Forever Story.txt, Score: 0.0160
Document: The King Of Forest Lion and The Mou.txt, Score: 0.0093

Query: friends
Document: A Frog and two Fishes Story.txt, Score: 0.6862
Document: Friends Forever Story.txt, Score: 0.4982
Document: The King Of Forest Lion and The Mou.txt, Score: 0.2770
Document: Cinderella Story.txt, Score: 0.1363
Document: The honest woodcutter.txt, Score: 0.1265
Document: Beauty and the Beast Story.txt, Score: 0.0786

Query: king who kills fish
Document: The King Of Forest Lion and The Mou.txt, Score: 0.1547
Document: Friends Forever Story.txt, Score: 0.0600
Document: A Frog and two Fishes Story.txt, Score: 0.0455
Document: The honest woodcutter.txt, Score: 0.0135
Document: Cinderella Story.txt, Score: 0.0034
Documen