### STEP 0: Imports and Setup

In [1]:
import pandas as pd
import math
import re
import time
import math
from collections import defaultdict, Counter


import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## STEP-1 (Load CSV from Google Drive)

---



In [2]:
from google.colab import drive
drive.mount('/content/drive')  # Connect Colab with Drive
file_path = "/content/drive/MyDrive/Articles.csv"
import pandas as pd
df = pd.read_csv(file_path, encoding="latin1")
print(df.head())
print(df.columns)
print("Total documents loaded:", len(df))


Mounted at /content/drive
                                             Article      Date  \
0  KARACHI: The Sindh government has decided to b...  1/1/2015   
1  HONG KONG: Asian markets started 2015 on an up...  1/2/2015   
2  HONG KONG:  Hong Kong shares opened 0.66 perce...  1/5/2015   
3  HONG KONG: Asian markets tumbled Tuesday follo...  1/6/2015   
4  NEW YORK: US oil prices Monday slipped below $...  1/6/2015   

                                             Heading  NewsType  
0  sindh govt decides to cut public transport far...  business  
1                    asia stocks up in new year trad  business  
2           hong kong stocks open 0.66 percent lower  business  
3             asian stocks sink euro near nine year   business  
4                 us oil prices slip below 50 a barr  business  
Index(['Article', 'Date', 'Heading', 'NewsType'], dtype='object')
Total documents loaded: 2692


### STEP 2: Preprocessing

In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if not isinstance(text, str):
        return ""
    # Lowercase
    text = text.lower()
    # Remove non-alphabetic characters (keep spaces)
    text = re.sub(r"[^a-z\s]", " ", text)
    # Collapse multiple spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text

def tokenize(text):
    # Simple whitespace-based tokenization after cleaning
    if not text:
        return []
    return text.split()

def preprocess_text(text):

    text = clean_text(text)
    tokens = tokenize(text)
    processed = []
    for tok in tokens:
        if tok in stop_words:
            continue
        lemma = lemmatizer.lemmatize(tok)
        processed.append(lemma)
    return processed


# STEP 3: Build Document List and Preprocessed Tokens

In [4]:

documents = []  # list of dicts: {id, title, body, date, news_type, tokens}

for idx, row in df.iterrows():
    doc_id = idx  # simple index as ID
    title = str(row.get("Heading", ""))
    body = str(row.get("Article", ""))
    date = row.get("Date", "")
    news_type = str(row.get("NewsType", ""))

    # Preprocess
    title_tokens = preprocess_text(title)
    body_tokens = preprocess_text(body)

    # Option: give some extra weight to title by repeating tokens
    combined_tokens = title_tokens * 2 + body_tokens  # title weighted 2x

    documents.append({
        "id": doc_id,
        "title": title,
        "body": body,
        "date": date,
        "news_type": news_type,
        "tokens": combined_tokens
    })

print("Total preprocessed documents:", len(documents))
print("Sample doc:", documents[0]["title"])
print("Tokens:", documents[0]["tokens"][:20])


Total preprocessed documents: 2692
Sample doc: sindh govt decides to cut public transport fares by 7pc kti rej
Tokens: ['sindh', 'govt', 'decides', 'cut', 'public', 'transport', 'fare', 'pc', 'kti', 'rej', 'sindh', 'govt', 'decides', 'cut', 'public', 'transport', 'fare', 'pc', 'kti', 'rej']


# STEP 4: Inverted Index and BM25 Stats

In [5]:
inverted_index = defaultdict(dict)  # term -> {doc_id: tf}
doc_length = {}                     # doc_id -> total tokens
df_term = defaultdict(int)          # term -> document frequency

N = len(documents)

for doc in documents:
    doc_id = doc["id"]
    tokens = doc["tokens"]
    doc_length[doc_id] = len(tokens)

    term_counts = Counter(tokens)  # term -> tf in this doc

    for term, tf in term_counts.items():
        inverted_index[term][doc_id] = tf
        df_term[term] += 1

avg_doc_length = sum(doc_length.values()) / N

print("Total terms in index:", len(inverted_index))
print("Average document length:", avg_doc_length)


Total terms in index: 23565
Average document length: 193.72362555720653


# STEP 5: BM25 Implementation

In [6]:
class BM25:
    def __init__(self, inverted_index, df_term, doc_length, N, avg_doc_length, k1=1.5, b=0.75):
        self.inverted_index = inverted_index
        self.df_term = df_term
        self.doc_length = doc_length
        self.N = N
        self.avg_doc_length = avg_doc_length
        self.k1 = k1
        self.b = b

    def idf(self, term):
        # BM25 IDF
        df = self.df_term.get(term, 0)
        if df == 0:
            return 0.0
        return math.log((self.N - df + 0.5) / (df + 0.5) + 1)  # +1 for numerical stability

    def score(self, query_terms, candidate_docs=None):
        scores = defaultdict(float)

        # If candidate_docs not given, we collect from all docs that contain any query term
        if candidate_docs is None:
            candidate_docs = set()
            for term in query_terms:
                postings = self.inverted_index.get(term, {})
                candidate_docs.update(postings.keys())

        for term in query_terms:
            postings = self.inverted_index.get(term, {})
            if not postings:
                continue
            idf = self.idf(term)
            for doc_id in candidate_docs:
                tf = postings.get(doc_id, 0)
                if tf == 0:
                    continue
                dl = self.doc_length[doc_id]
                denom = tf + self.k1 * (1 - self.b + self.b * dl / self.avg_doc_length)
                score_term = idf * (tf * (self.k1 + 1) / denom)
                scores[doc_id] += score_term

        return scores

bm25_model = BM25(inverted_index, df_term, doc_length, N, avg_doc_length)
print("BM25 model ready.")


BM25 model ready.


## Boolean Retrieval Implementation

In [7]:
def get_doc_set_for_term(term):

    postings = inverted_index.get(term, {})
    return set(postings.keys())

def boolean_retrieve(query_str):

    # Clean query and split by spaces
    parts = query_str.strip().split()
    if not parts:
        return set()

    # Preprocess each term and convert operators to uppercase
    processed_parts = []
    for part in parts:
        upper_part = part.upper()
        if upper_part in ["AND", "OR", "NOT"]:
            processed_parts.append(upper_part)
        else:
            # Preprocess each term
            tokens = preprocess_text(part)
            if tokens:
                processed_parts.append(tokens[0])  # Only use the first token after preprocessing

    # If no valid processed terms, return empty set
    if not processed_parts:
        return set()

    # Initialize current_set to None (it will store the documents matching the query)
    current_set = None
    current_op = None

    i = 0
    while i < len(processed_parts):
        token = processed_parts[i]

        if token in ["AND", "OR", "NOT"]:
            current_op = token
        else:
            # it's a term
            term_set = get_doc_set_for_term(token)

            if current_set is None:
                # First term initializes the set
                if current_op == "NOT":
                    current_set = set(range(N)) - term_set  # If NOT, return all docs except those with this term
                else:
                    current_set = term_set
            else:
                if current_op == "AND":
                    current_set = current_set & term_set  # Keep only docs containing all terms
                elif current_op == "OR":
                    current_set = current_set | term_set  # Keep docs that contain at least one term
                elif current_op == "NOT":
                    current_set = current_set - term_set  # Exclude docs containing this term

        i += 1

    return current_set if current_set is not None else set()


# STEP 7: Search Function( simple search ,Boolean Search,Category filter)




In [8]:
def search(query_str, top_k=10, use_boolean_filter=True, category=None):
    # Start timing the search process
    start_time = time.time()

    # 1) Preprocess query for BM25 terms
    query_terms = preprocess_text(query_str)
    if not query_terms:
        print("No valid query terms after preprocessing.")
        return []

    # 2) Boolean candidate set
    candidate_docs = None
    if use_boolean_filter:
        bool_set = boolean_retrieve(query_str)
        if bool_set:
            candidate_docs = bool_set

    # 3) Category filter
    if category is not None:
        category = category.lower().strip()
        category_docs = {doc["id"] for doc in documents if doc["news_type"].lower() == category}
        if candidate_docs is None:
            candidate_docs = category_docs
        else:
            candidate_docs = candidate_docs & category_docs

    # 4) BM25 scoring
    scores = bm25_model.score(query_terms, candidate_docs)

    if not scores:
        print("No documents matched the query.")
        return []

    # 5) Sort scores
    ranked_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]

    # 6) Prepare output
    results = []
    for rank, (doc_id, score) in enumerate(ranked_docs, start=1):
        doc = documents[doc_id]
        results.append({
            "rank": rank,
            "doc_id": doc_id,
            "score": score,
            "title": doc["title"],
            "date": doc["date"],
            "news_type": doc["news_type"],
            "snippet": doc["body"][:300].replace("\n", " ")
        })

    # Calculate and print time taken for the search query
    end_time = time.time()
    print(f"Time taken for query '{query_str}': {end_time - start_time:.4f} seconds")
    return results




def print_results(results):
    for r in results:
        print("=" * 80)
        print(f"Rank: {r['rank']} | DocID: {r['doc_id']} | Score: {r['score']:.4f}")
        print(f"Date: {r['date']} | Category: {r['news_type']}")
        print(f"Title: {r['title']}")
        print(f"Snippet: {r['snippet']}...")
    if not results:
        print("No results.")


# Evaluation Metrics Skeleton(Precision,Recall,F1score)

In [9]:
def precision_at_k(retrieved, relevant_set, k):
    if k == 0:
        return 0.0
    retrieved_k = retrieved[:k]
    rel_in_top_k = sum(1 for d in retrieved_k if d in relevant_set)
    return rel_in_top_k / k

def recall_at_k(retrieved, relevant_set, k):
    if not relevant_set:
        return 0.0
    retrieved_k = retrieved[:k]
    rel_in_top_k = sum(1 for d in retrieved_k if d in relevant_set)
    return rel_in_top_k / len(relevant_set)

def average_precision(retrieved, relevant_set):
    if not relevant_set:
        return 0.0
    ap_sum = 0.0
    hit_count = 0
    for i, doc_id in enumerate(retrieved, start=1):
        if doc_id in relevant_set:
            hit_count += 1
            ap_sum += hit_count / i
    if hit_count == 0:
        return 0.0
    return ap_sum / len(relevant_set)

def reciprocal_rank(retrieved, relevant_set):
    for i, doc_id in enumerate(retrieved, start=1):
        if doc_id in relevant_set:
            return 1.0 / i
    return 0.0


def f1_score(precision, recall):
    if precision + recall == 0:
        return 0
    return 2 * (precision * recall) / (precision + recall)


def evaluate_system(queries_judgments, k=10):

    precisions = []
    recalls = []
    f1_scores = []
    aps = []
    rrs = []

    for q, rel_docs in queries_judgments.items():
        rel_set = set(rel_docs)
        results = search(q, top_k=k, use_boolean_filter=False)
        retrieved_ids = [r["doc_id"] for r in results]

        p = precision_at_k(retrieved_ids, rel_set, k)
        r = recall_at_k(retrieved_ids, rel_set, k)
        f1 = f1_score(p, r)
        ap = average_precision(retrieved_ids, rel_set)
        rr = reciprocal_rank(retrieved_ids, rel_set)

        precisions.append(p)
        recalls.append(r)
        f1_scores.append(f1)
        aps.append(ap)
        rrs.append(rr)

        print(f"Query: {q}")
        print(f" Precision@{k}: {p:.4f}")
        print(f" Recall@{k}:    {r:.4f}")
        print(f" F1-Score:      {f1:.4f}")
        print(f" AP:            {ap:.4f}")
        print(f" RR:            {rr:.4f}")
        print("-" * 60)

    if not precisions:
        print("No queries provided for evaluation.")
        return

    print("\n=== Overall Metrics ===")
    print(f"Mean Precision@{k}: {sum(precisions)/len(precisions):.4f}")
    print(f"Mean Recall@{k}:    {sum(recalls)/len(recalls):.4f}")
    print(f"Mean F1-Score@{k}:  {sum(f1_scores)/len(f1_scores):.4f}")
    print(f"MAP:                {sum(aps)/len(aps):.4f}")
    print(f"MRR:                {sum(rrs)/len(rrs):.4f}")


# Hardcoded Main Menu

In [10]:


def MAIN():
    print("\n================= INFORMATION RETRIEVAL SYSTEM ================")
    print("Hybrid IR → Boolean + BM25 Ranking | Local | No Cloud DB Used")
    print("=================================================================")

    # Directly run the hardcoded input without asking for option
    print("\nRunning hardcoded input...")

    # Example 1: Simple Ranked Search
    q1 = "oil price crash"
    start_time = time.time()
    results1 = search(q1, top_k=5, use_boolean_filter=False)
    end_time = time.time()
    print(f"Time taken for query '{q1}': {end_time - start_time:.4f} seconds")
    print(f"--- Results for query: '{q1}' ---")
    print_results(results1)

    # Example 2: Boolean Search
    q2 = "karachi AND london"
    start_time = time.time()
    results2 = search(q2, top_k=5, use_boolean_filter=True)
    end_time = time.time()
    print(f"\nTime taken for query '{q2}': {end_time - start_time:.4f} seconds")
    print(f"--- Results for query: '{q2}' with Boolean filter ---")
    print_results(results2)

    # Example 3: Search with Category Filter
    q3 = "stock market"
    start_time = time.time()
    results3 = search(q3, top_k=5, use_boolean_filter=False, category="business")
    end_time = time.time()
    print(f"\nTime taken for query '{q3}': {end_time - start_time:.4f} seconds")
    print(f"--- Results for query: '{q3}' in category 'business' ---")
    print_results(results3)

    # After the hardcoded search examples, proceed to evaluation
    evaluate_search_results()


# Evaluation Function - This will appear after any search.
def evaluate_search_results():
    print("\n-------------------- SYSTEM EVALUATION --------------------")

    # Predefined evaluation queries (with relevant doc_ids)
    eval_queries = [
        ("oil price crash", [643, 424, 204, 123, 280]),  # Using actual document IDs for evaluation
        ("karachi AND london", [465, 1835, 2388, 2229,2520]),
        ("stock market", [745, 6, 253, 84, 817])
    ]

    queries_rel = {}

    for q, relevant_docs in eval_queries:
        print(f"\nEvaluating query: {q}")
        start_time = time.time()
        current_results = search(q, top_k=5, use_boolean_filter=True)
        end_time = time.time()
        print(f"Time taken for evaluation query '{q}': {end_time - start_time:.4f} seconds")

        print("\n--- TOP 5 RESULTS --- Mark Relevant Docs ---")
        print_results(current_results)

        # Using predefined relevant docs for evaluation
        queries_rel[q] = relevant_docs

    if queries_rel:
        evaluate_system(queries_rel, k=5)
    else:
        print("No data entered. Nothing to evaluate!")


# Evaluation Metrics with F1-Score
def evaluate_system(queries_judgments, k=10):
    """
    queries_judgments: dict
      key = query string
      value = set/list of relevant doc_ids
    """
    precisions = []
    recalls = []
    f1_scores = []
    aps = []
    rrs = []

    for q, rel_docs in queries_judgments.items():
        rel_set = set(rel_docs)
        results = search(q, top_k=k, use_boolean_filter=False)
        retrieved_ids = [r["doc_id"] for r in results]

        p = precision_at_k(retrieved_ids, rel_set, k)
        r = recall_at_k(retrieved_ids, rel_set, k)
        f1 = f1_score(p, r)
        ap = average_precision(retrieved_ids, rel_set)
        rr = reciprocal_rank(retrieved_ids, rel_set)

        precisions.append(p)
        recalls.append(r)
        f1_scores.append(f1)
        aps.append(ap)
        rrs.append(rr)

        print(f"Query: {q}")
        print(f" Precision@{k}: {p:.4f}")
        print(f" Recall@{k}:    {r:.4f}")
        print(f" F1-Score:      {f1:.4f}")
        print(f" AP:            {ap:.4f}")
        print(f" RR:            {rr:.4f}")
        print("-" * 60)

    if not precisions:
        print("No queries provided for evaluation.")
        return

    print("\n=== Overall Metrics ===")
    print(f"Mean Precision@{k}: {sum(precisions)/len(precisions):.4f}")
    print(f"Mean Recall@{k}:    {sum(recalls)/len(recalls):.4f}")
    print(f"Mean F1-Score@{k}:  {sum(f1_scores)/len(f1_scores):.4f}")
    print(f"MAP:                {sum(aps)/len(aps):.4f}")
    print(f"MRR:                {sum(rrs)/len(rrs):.4f}")


# ==== Running the System ====
def run_system():
    print("\nRunning hardcoded input...")
    # Run hardcoded input
    q1 = "oil price crash"
    search(q1, top_k=5, use_boolean_filter=False)

    # Run evaluation after hardcoded input
    evaluate_search_results()

# ==== Main Function Call ====
run_system()



Running hardcoded input...
Time taken for query 'oil price crash': 0.0011 seconds

-------------------- SYSTEM EVALUATION --------------------

Evaluating query: oil price crash
Time taken for query 'oil price crash': 0.0008 seconds
Time taken for evaluation query 'oil price crash': 0.0008 seconds

--- TOP 5 RESULTS --- Mark Relevant Docs ---
Rank: 1 | DocID: 643 | Score: 11.2039
Date: 5/17/2016 | Category: business
Title: Oil futures hold near six month hig
Snippet: strong>TOKYO: Crude oil futures held near six-month highs in early Asian trading as the market focused on supply disruptions that prompted long-time bear Goldman Sachs to issue a bullish assessment on near-term prices.</strongCrude futures have rallied for most of the past two weeks from a combinati...
Rank: 2 | DocID: 424 | Score: 10.5002
Date: 1/8/2016 | Category: business
Title: Oil moves away from 12 year lows as China shares ri
Snippet: strong>SEOUL: Global benchmark oil futures rallied more than 2 percent on Friday,

# User Input Main Menu

In [None]:


def MAIN():
    print("\n================= INFORMATION RETRIEVAL SYSTEM ================")
    print("Hybrid IR → Boolean + BM25 Ranking | Local | No Cloud DB Used")
    print("=================================================================")

    while True:
        print("\nChoose an option:")
        print("1) Search Query")
        print("2) Evaluate System (Precision / Recall / MAP / MRR)")
        print("3) Exit")

        main_choice = input("\nEnter choice (1/2/3): ").strip()

        # ======================= SEARCH ======================= #
        if main_choice == "1":
            while True:
                print("\nChoose search query type:")
                print("1) Simple Ranked Search")
                print("2) Boolean Search")
                print("3) Search with Category Filter")
                print("4) Back to Main Menu")

                search_type = input("\nEnter search type (1/2/3/4): ").strip()

                if search_type == "4":
                    break  # go back to main menu

                query = input("\nEnter search query: ").strip()

                # ===== Ask Top-K from user ===== #
                try:
                    top_k = int(input("How many top-K documents you want? → ").strip())
                except:
                    print("Invalid input. Defaulting top-K = 10")
                    top_k = 10

                cat = None
                use_boolean = False

                if search_type == "1":   # BM25
                    use_boolean = False
                elif search_type == "2": # BOOLEAN
                    use_boolean = True
                elif search_type == "3": # CATEGORY FILTER
                    use_boolean = False
                    cat = input("Enter category (business/sports/tech/...): ").strip()
                    if cat == "": cat = None
                else:
                    print("Invalid Search Type! Try Again.")
                    continue

                # Start timing the query execution
                start_time = time.time()
                results = search(query, top_k=top_k, use_boolean_filter=use_boolean, category=cat)
                end_time = time.time()

                # Print the time taken for the query execution
                print(f"Time taken for query '{query}': {end_time - start_time:.4f} seconds")

                print("\n-------------------- SEARCH RESULTS --------------------")
                print_results(results)

        # ======================= EVALUATION ======================= #
        elif main_choice == "2":
            try:
                n = int(input("\nHow many evaluation queries? → ").strip())
            except:
                print("Invalid number! Try again.")
                continue

            queries_rel = {}

            for i in range(n):
                print(f"\nQuery-{i+1}:")
                q = input("Enter query: ").strip()

                # Start timing the evaluation query
                start_time = time.time()
                current_results = search(q, top_k=10, use_boolean_filter=True)
                end_time = time.time()
                print(f"Time taken for evaluation query '{q}': {end_time - start_time:.4f} seconds")

                print("\n--- TOP 10 RESULTS --- Mark Relevant Docs ---")
                print_results(current_results)

                rel = input("Enter relevant doc IDs (comma-separated): ").strip()
                rel_docs = list(map(int, rel.split(","))) if rel else []

                queries_rel[q] = rel_docs

            if queries_rel:
                print("\n---------------- SYSTEM EVALUATION ----------------")
                evaluate_system(queries_rel, k=10)
            else:
                print("No data entered. Nothing to evaluate!")

        # ======================= EXIT ======================= #
        elif main_choice == "3":
            print("\nExiting IR System. Goodbye!")
            break

        else:
            print("\nInvalid Option! Try Again.")


# RUN
MAIN()



Hybrid IR → Boolean + BM25 Ranking | Local | No Cloud DB Used

Choose an option:
1) Search Query
2) Evaluate System (Precision / Recall / MAP / MRR)
3) Exit

Enter choice (1/2/3): 1

Choose search query type:
1) Simple Ranked Search
2) Boolean Search
3) Search with Category Filter
4) Back to Main Menu

Enter search type (1/2/3/4): 2

Enter search query: karachi AND london AND tokyo
How many top-K documents you want? → 40
Time taken for query 'karachi AND london AND tokyo': 0.0024 seconds
Time taken for query 'karachi AND london AND tokyo': 0.0026 seconds

-------------------- SEARCH RESULTS --------------------
Rank: 1 | DocID: 836 | Score: 7.3658
Date: 8/5/2016 | Category: business
Title: Tokyo stocks rise in early trade on weaker yen Toyota soar
Snippet: strong>TOKYO: Tokyo stocks climbed in early trading Friday as a weaker yen boosted exporters, while Toyota surged on forecast-beating quarterly net profits.</strongThe world´s biggest automaker said Thursday it posted a nearly 15 pe