<a href="https://colab.research.google.com/github/kenanmorani/TF-IDF--RAG--and-FAISS-Information-Retrieval/blob/main/Intial_study_for_comparison_of_two_system_performances_in_information_retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install the packages needed to load the dataset and run the experiments:

In [7]:
!pip install datasets scikit-learn faiss-cpu sentence-transformers



Importing the required libraries

In [8]:
import random
import json
import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import faiss
from sentence_transformers import SentenceTransformer

# Importing the SciFact dataset for the experiment

In [11]:
# Load SciFact dataset (corpus + claims)
# NOTE(review): trust_remote_code=True allows the dataset's own loading script
# to run on this machine — only keep it for sources you trust.
corpus = load_dataset("allenai/scifact", "corpus", trust_remote_code=True)
claims = load_dataset("allenai/scifact", "claims", trust_remote_code=True)

# Experiment configuration: one place to change the seed and the subset sizes.
SEED = 42
N_CORPUS_DOCS = 1000
N_CLAIMS = 1000

# Set seed for reproducibility
random.seed(SEED)

# Reduce dataset size: shuffle deterministically, then keep the first N rows.
# The requested size is clamped to the split length so that a split with fewer
# rows than requested is used in full instead of raising an IndexError.
corpus_shuffled = corpus["train"].shuffle(seed=SEED)
claims_shuffled = claims["train"].shuffle(seed=SEED)
corpus_subset = corpus_shuffled.select(range(min(N_CORPUS_DOCS, len(corpus_shuffled))))
claims_subset = claims_shuffled.select(range(min(N_CLAIMS, len(claims_shuffled))))

# Convert to lists of row dicts for easy handling
small_corpus = corpus_subset.to_list()
small_claims = claims_subset.to_list()

# Extract texts and IDs.
# corpus_texts[j] and corpus_ids[j] describe the same document, so a retrieval
# result expressed as a position j can be mapped back to its doc_id.
# NOTE(review): corpus and claims are sub-sampled independently, so the documents
# that support a sampled claim are not guaranteed to be in small_corpus — confirm
# before evaluating against ground truth.
corpus_texts = [" ".join(doc["abstract"]) for doc in small_corpus]  # Abstract parts joined into one string
corpus_ids = [doc["doc_id"] for doc in small_corpus]  # Corpus document IDs
claims_texts = [claim["claim"] for claim in small_claims]  # Claims

# Model1-trial: TF-IDF (RAG)

In [12]:
### **1️⃣ TF-IDF (RAG) Retrieval**
# Sparse lexical representation: unigrams and bigrams, English stop words removed,
# with document-frequency cut-offs (max_df / min_df) to prune the vocabulary.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_df=0.85,
    min_df=5,
)
# The vocabulary is fitted on the corpus only; claims are projected into that same space.
corpus_tfidf = vectorizer.fit_transform(corpus_texts)
claims_tfidf = vectorizer.transform(claims_texts)

# similarities has one row per claim and one column per corpus document.
similarities = cosine_similarity(claims_tfidf, corpus_tfidf)
# For each claim, sort documents by descending similarity and keep the
# positions (indices into corpus_texts / corpus_ids) of the best three.
top_k_rag = [list(np.argsort(-claim_scores)[:3]) for claim_scores in similarities]  # Top-3 matches

### **2️⃣ FAISS (Embedding-based Retrieval)**
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Dense embeddings for both sides, returned as NumPy arrays.
# normalize_embeddings=True is requested so that an inner product between two
# vectors can be used as their cosine similarity.
corpus_embeddings = model.encode(
    corpus_texts,
    convert_to_numpy=True,
    normalize_embeddings=True,
)
claims_embeddings = model.encode(
    claims_texts,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

# Model2-trial: FAISS

In [13]:
# Build a flat inner-product index over the corpus embeddings. The embeddings
# were requested with normalize_embeddings=True, so inner product is used here
# as the cosine similarity.
d = corpus_embeddings.shape[-1]  # Embedding dimensionality
index = faiss.IndexFlatIP(d)
index.add(corpus_embeddings)

# Query the index with every claim and keep the 3 best-scoring documents.
# D holds the scores, I the matching positions in the order the corpus was added.
D, I = index.search(claims_embeddings, 3)
top_k_faiss = [claim_hits.tolist() for claim_hits in I]  # One plain list of positions per claim

# Initial evaluation of the performance

In [14]:
### **3️⃣ Precision@K Evaluation**
def evaluate_precision_at_k(rag_results, faiss_results, k=3):
    """Return the share of queries whose two top-k result lists overlap.

    For each query the first ``k`` entries of both result lists are compared;
    the query counts as a match when the two lists have at least one document
    in common.

    Note: no ground-truth relevance labels are involved, so despite the name
    this is an agreement rate between the two systems, not a true precision.
    The comparison is symmetric: swapping the two arguments gives the same value.

    Args:
        rag_results: Per-query sequences of retrieved document identifiers
            from the first system, best match first.
        faiss_results: Per-query sequences from the second system, aligned
            with ``rag_results`` (same queries, same order).
        k: Number of leading results per query to take into account.

    Returns:
        float: Fraction of queries with at least one shared document in
        their top-k lists; ``0.0`` when there are no queries.

    Raises:
        ValueError: If ``k`` is not positive or the two inputs differ in length.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")
    if len(rag_results) != len(faiss_results):
        raise ValueError("rag_results and faiss_results must contain the same number of queries")
    if len(rag_results) == 0:
        # np.mean([]) would return NaN and emit a RuntimeWarning.
        return 0.0

    # Only the first k hits of each list take part in the comparison.
    matches = [
        0 if set(rag_hits[:k]).isdisjoint(faiss_hits[:k]) else 1
        for rag_hits, faiss_hits in zip(rag_results, faiss_results)
    ]
    return float(np.mean(matches))

# Score both call orders. The function only checks whether the two top-3 sets
# intersect, so the two values are expected to be equal.
precision_rag = evaluate_precision_at_k(top_k_rag, top_k_faiss)
precision_faiss = evaluate_precision_at_k(top_k_faiss, top_k_rag)

### **4️⃣ Print Results**
print(f"Precision at k=3 for TF-IDF (RAG): {precision_rag:.3f}")
print(f"Precision at k=3 for FAISS: {precision_faiss:.3f}")

# Debugging Output: show the first three claims with the doc_ids each system returned
print("\nSample Matches:")
for claim_pos in range(3):
    print(f"\nClaim {claim_pos + 1}: {claims_texts[claim_pos]}")
    # Retrieval results are positions in the corpus lists; translate them to doc_ids.
    tfidf_doc_ids = [corpus_ids[hit] for hit in top_k_rag[claim_pos]]
    print(f"  TF-IDF (RAG) Top-3: {tfidf_doc_ids}")
    faiss_doc_ids = [corpus_ids[hit] for hit in top_k_faiss[claim_pos]]
    print(f"  FAISS Top-3: {faiss_doc_ids}")


Precision at k=3 for TF-IDF (RAG): 0.543
Precision at k=3 for FAISS: 0.543

Sample Matches:

Claim 1: The risk of cancer rises with level of alcohol consumption.
  TF-IDF (RAG) Top-3: [17236106, 20526907, 38028419]
  FAISS Top-3: [20526907, 12794099, 1456068]

Claim 2: British male students are bullied more than British female students.
  TF-IDF (RAG) Top-3: [2867345, 103007, 6368017]
  FAISS Top-3: [3413083, 5850219, 2867345]

Claim 3: A deficiency of vitamin B12 decreases blood levels of homocysteine.
  TF-IDF (RAG) Top-3: [33409100, 42441846, 16252863]
  FAISS Top-3: [33409100, 16252863, 42441846]


In [15]:
from collections import defaultdict

def _precision_recall_f1(retrieved, relevant, k):
    """Return (precision@k, recall@k, F1@k) for one query given two sets of ids."""
    hits = len(retrieved & relevant)
    precision = hits / k
    recall = hits / len(relevant) if relevant else 0
    total = precision + recall
    f1 = (2 * precision * recall) / total if total else 0
    return precision, recall, f1


def compute_metrics(rag_results, faiss_results, k=3, relevant_docs=None):
    """Calculate mean Precision@K, Recall@K and F1-score@K for both systems.

    Only the first ``k`` results of every query are scored.

    Relevance is taken from ``relevant_docs`` when it is given. When it is
    omitted, the union of both systems' top-k results is treated as the
    relevant set for that query. With that fallback every retrieved document
    is relevant by construction, so Precision@K is 1.0 whenever a list holds
    ``k`` distinct documents, and both systems receive identical scores. The
    numbers then only describe how much the two systems agree; pass real
    relevance judgements to measure retrieval quality.

    Args:
        rag_results: Per-query sequences of retrieved document identifiers
            from the TF-IDF system, best match first.
        faiss_results: Per-query sequences from the FAISS system, aligned
            with ``rag_results``.
        k: Cut-off rank; also the denominator of Precision@K.
        relevant_docs: Optional per-query iterables of relevant document
            identifiers, expressed in the same id space as the results.

    Returns:
        dict: ``{"TF-IDF (RAG)": {...}, "FAISS": {...}}`` where each inner
        dict maps ``"Precision@K"``, ``"Recall@K"`` and ``"F1-score@K"`` to
        the mean over all queries (``0.0`` when there are no queries).

    Raises:
        ValueError: If ``k`` is not positive or the inputs differ in length.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")
    if len(rag_results) != len(faiss_results):
        raise ValueError("rag_results and faiss_results must contain the same number of queries")
    if relevant_docs is not None and len(relevant_docs) != len(rag_results):
        raise ValueError("relevant_docs must contain one entry per query")

    # One list of (precision, recall, f1) tuples per system.
    per_query = {"TF-IDF (RAG)": [], "FAISS": []}

    for i in range(len(rag_results)):
        rag_set = set(rag_results[i][:k])  # TF-IDF retrieved docs (top-k only)
        faiss_set = set(faiss_results[i][:k])  # FAISS retrieved docs (top-k only)

        if relevant_docs is None:
            # Fallback pseudo-relevance: whatever either system returned.
            relevant = rag_set | faiss_set
        else:
            relevant = set(relevant_docs[i])

        per_query["TF-IDF (RAG)"].append(_precision_recall_f1(rag_set, relevant, k))
        per_query["FAISS"].append(_precision_recall_f1(faiss_set, relevant, k))

    # Compute averages; an empty query list yields 0.0 rather than NaN.
    avg_metrics = {}
    for method, rows in per_query.items():
        if rows:
            precision, recall, f1 = (float(np.mean(column)) for column in zip(*rows))
        else:
            precision, recall, f1 = 0.0, 0.0, 0.0
        avg_metrics[method] = {
            "Precision@K": precision,
            "Recall@K": recall,
            "F1-score@K": f1,
        }

    return avg_metrics

# Score both systems at k=3, then print one block of metrics per system
metrics = compute_metrics(top_k_rag, top_k_faiss, k=3)

for system_name, system_scores in metrics.items():
    print(f"\n📌 {system_name} Metrics:")
    for score_name, score in system_scores.items():
        print(f"  {score_name}: {score:.3f}")



📌 TF-IDF (RAG) Metrics:
  Precision@K: 1.000
  Recall@K: 0.579
  F1-score@K: 0.730

📌 FAISS Metrics:
  Precision@K: 1.000
  Recall@K: 0.579
  F1-score@K: 0.730


# Including the dependencies

In [16]:
# Record the Python version reported by the `python` executable on the shell PATH.
!python --version

Python 3.11.11


In [17]:
# Write every installed package with its version to requirements.txt.
# NOTE(review): the following cell redirects into the same file and therefore overwrites this snapshot.
!pip freeze > requirements.txt

In [18]:
!pip list --format=freeze | grep -E 'datasets|scikit-learn|faiss-cpu|sentence-transformers|numpy' > requirements.txt

In [19]:
import sys
import subprocess

# Direct dependencies of this notebook.
libraries = [
    "datasets",
    "scikit-learn",
    "faiss-cpu",
    "sentence-transformers",
    "numpy"
]

# Install each package with the pip that belongs to the running interpreter
# (sys.executable), so the packages land in the kernel's own environment.
for lib in libraries:
    # check=True raises CalledProcessError when pip exits with an error, so the
    # success message below is only printed if every install actually worked.
    subprocess.run([sys.executable, "-m", "pip", "install", lib], check=True)

print("✅ All essential libraries installed.")


✅ All essential libraries installed.
