In [None]:
%pip install pymilvus[milvus_lite]
%pip install transformers
%pip install datasets
%pip install sentence-transformers
%pip install ragas
%pip install evaluate

In [None]:
# Load required Libraries
import pandas as pd
import numpy as np
import transformers, torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
import json
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
from sklearn.metrics import f1_score
import re
import string

from datetime import datetime

from pymilvus import MilvusClient, FieldSchema, CollectionSchema, DataType

from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_community.llms import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import pipeline

In [None]:
# Central experiment configuration: later cells read the model names, the
# vector size and the Milvus collection name from this dict.
config = {
    "llm": "google/flan-t5-large",
    "embedding_model": "all-MiniLM-L6-v2",
    # Kept as an int because it is passed as the `dim` of the vector field.
    "embedding_dim": 384,
    "prompting_style": "advanced",
    "rag_collection_name": "rag_mini"
}

# Read Passages from the Datasets and Drop rows if they are NA or empty

In [None]:
# Fetch the passage corpus from the Hugging Face hub.
passages = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet")
print(f"Original dataset shape: {passages.shape}")

# Clean data: keep rows whose passage is present and not blank once stripped.
passage_present = passages['passage'].notna()
passage_not_blank = passages['passage'].str.strip() != ''
passages = passages[passage_present & passage_not_blank]
print(f"Cleaned dataset shape: {passages.shape}")

passages.head()

# Dataset EDA


In [None]:
# Analyze passage lengths before indexing

print("Dataset EDA")
print(f"Total passages: {len(passages)}")
print(f"Dataset columns: {passages.columns.tolist()}")

# Calculate passage lengths (characters and whitespace-separated words).
# `assign` builds a new frame instead of writing columns into the frame
# produced by boolean filtering, which avoids SettingWithCopyWarning and keeps
# the cell safe to re-run.
passages = passages.assign(
    passage_length=passages['passage'].str.len(),
    word_count=passages['passage'].str.split().str.len(),
)

char_lengths = passages['passage_length']
word_counts = passages['word_count']

print("\nPassage Length Statistics - Characters:")
print(f"Min length: {char_lengths.min()}")
print(f"Max length: {char_lengths.max()}")
print(f"Mean length: {char_lengths.mean():.2f}")
print(f"Median length: {char_lengths.median():.2f}")
print(f"Std length: {char_lengths.std():.2f}")

print("\nWord Count Statistics:")
print(f"Min words: {word_counts.min()}")
print(f"Max words: {word_counts.max()}")
print(f"Mean words: {word_counts.mean():.2f}")
print(f"Median words: {word_counts.median():.2f}")

# Distribution analysis
print("\nPassage Length Distribution:")
print(char_lengths.describe())

In [None]:
# Plot distributions: character lengths on the left, word counts on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

histogram_specs = [
    (ax1, 'passage_length', 'Passage Length (characters)', 'Distribution of Passage Lengths'),
    (ax2, 'word_count', 'Word Count', 'Distribution of Word Counts'),
]

for axis, column, x_label, title in histogram_specs:
    axis.hist(passages[column], bins=50, alpha=0.7, edgecolor='black')
    axis.set_xlabel(x_label)
    axis.set_ylabel('Frequency')
    axis.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Sample passages
def _show_passage_example(header, mask, fallback):
    """Print the first passage selected by ``mask`` under ``header``.

    Args:
        header: Line printed before the example.
        mask: Boolean selector over the rows of ``passages``.
        fallback: Message printed (and returned) when no row matches.

    Returns:
        The matching passage text, or ``fallback`` when nothing matches.
    """
    print(header)
    matches = passages.loc[mask, 'passage']
    if matches.empty:
        # Report the absence directly rather than printing the placeholder's
        # own length and content as if it were a passage.
        print(fallback)
        return fallback
    example = matches.iloc[0]
    print(f"Length: {len(example)} chars")
    print(f"Content: {example[:200]}...")
    return example


print("\nSample passages:")
short_passage = _show_passage_example(
    "Short passage example:",
    passages['passage_length'] < 100,
    "No short passages found",
)

medium_mask = (passages['passage_length'] >= 200) & (passages['passage_length'] <= 400)
medium_passage = _show_passage_example(
    "\nMedium passage example:",
    medium_mask,
    "No medium passages found",
)

long_passage = _show_passage_example(
    "\nLong passage example:",
    passages['passage_length'] > 600,
    "No long passages found",
)

# Tokenize Text and Generate Embeddings using Sentence Transformers

In [None]:
# Load the sentence-embedding model named in the config; the same model encodes
# both the passages and the queries later in the notebook.
embedding_model = SentenceTransformer(config['embedding_model'])

In [None]:
# Encode Text: embed the passages in fixed-size batches and stack the vectors
# into one array, one row per passage in frame order.
batch_size = 64
passage_texts = passages['passage'].tolist()
batch_starts = range(0, len(passage_texts), batch_size)

embeddings = np.array([
    vector
    for start in tqdm(batch_starts, desc="Encoding passages")
    for vector in embedding_model.encode(
        passage_texts[start:start + batch_size], show_progress_bar=False
    )
])

print(f"Embeddings shape: {embeddings.shape}")
print(f"Embedding dimension: {embeddings.shape[1]}")

In [None]:
# Defining Schema
# Primary key: supplied explicitly by the insert cell (auto_id is disabled).
id_ = FieldSchema(
    name="id",
    dtype=DataType.INT64,
    is_primary=True,
    auto_id=False
)

# Raw passage text returned alongside search hits.
# NOTE(review): passages longer than max_length may be rejected on insert —
# confirm against the maximum length reported by the EDA cell.
passage = FieldSchema(
    name="passage",
    dtype=DataType.VARCHAR,
    max_length=1000
)

# Dense vector field. `dim` is an integer size, so coerce the config value in
# case it is stored as a string.
embedding = FieldSchema(
    name="embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=int(config['embedding_dim'])
)

In [None]:
# Bundle the three field definitions into the collection schema.
collection_fields = [id_, passage, embedding]
schema = CollectionSchema(fields=collection_fields, description="RAG Mini Wikipedia Collection")

# Create the client, backed by a local database file.
client = MilvusClient("rag_wikipedia_mini.db")

In [None]:
# Start from a clean collection so re-running the notebook does not insert
# duplicate rows. Checking for existence first (instead of a bare
# `except: pass`) lets real failures, such as a broken client, surface.
collection_name = config['rag_collection_name']

if client.has_collection(collection_name):
    # Drop the collection if it already exists
    client.drop_collection(collection_name)

client.create_collection(
    collection_name=collection_name,
    schema=schema
)

print("Collection created successfully")

In [None]:
# Renumber rows 0..n-1 so each row's position doubles as its primary key and
# as the index of its vector in `embeddings`.
passages_reset = passages.reset_index(drop=True)

rag_data = [
    {
        "id": row_id,
        "passage": text,
        "embedding": embeddings[row_id].tolist(),
    }
    for row_id, text in enumerate(passages_reset['passage'])
]

print(f"Prepared {len(rag_data)} records for insertion")

In [None]:
# Code to insert the data to your DB
print("Inserting data into Milvus...")
# Use the collection name from the config (as the creation cell does) so the
# insert keeps working if the name is changed there.
res = client.insert(collection_name=config['rag_collection_name'], data=rag_data)
print(f"Insert result: {res}")

- Do a Sanity Check on your database

**Do not delete the below line during your submission**

In [None]:
# Sanity check (must be kept for submission): print the stored row count and
# the collection description returned by the client.
print("Entity count:", client.get_collection_stats("rag_mini")["row_count"])
print("Collection schema:", client.describe_collection("rag_mini"))

In [None]:
# Fetch the evaluation questions and reference answers from the Hugging Face hub.
queries_path = "hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet"
queries = pd.read_parquet(queries_path)

print(f"Queries dataset shape: {queries.shape}")
print(f"Queries columns: {queries.columns.tolist()}")


In [None]:
# Clean the queries dataset: keep rows where both question and answer are
# present and non-blank after stripping whitespace.
both_present = queries[['question', 'answer']].notna().all(axis=1)
question_not_blank = queries['question'].str.strip() != ''
answer_not_blank = queries['answer'].str.strip() != ''
queries = queries[both_present & question_not_blank & answer_not_blank]

print(f"Cleaned queries shape: {queries.shape}")
print("\nSample queries:")
print(queries.head())

In [None]:
# Sample Question: take the first cleaned question and embed it as a
# one-element batch for the test search below.
query = queries['question'].iat[0]
print(f"Test query: {query}")

query_batch = [query]
query_embedding = embedding_model.encode(query_batch)
print(f"Query embedding shape: {query_embedding.shape}")

In [None]:
# Create Index on the embedding column
index_params = MilvusClient.prepare_index_params()

# Add an index on the embedding field: FLAT index type with cosine metric.
index_params.add_index(
    field_name="embedding",
    index_type="FLAT",
    metric_type="COSINE"
)

# Create the index on the collection named in the config, matching the
# creation cell, so a renamed collection is still indexed.
try:
    client.create_index(
        collection_name=config['rag_collection_name'],
        index_params=index_params
    )
    print("Index created successfully")
except Exception as e:
    # Best effort: report the failure and let the notebook continue.
    print(f"Index creation result: {e}")

In [None]:
#  Load collection into memory
# The collection name comes from the config so this cell targets the same
# collection that the creation cell built.
client.load_collection(config['rag_collection_name'])
print("Collection loaded into memory")

# Search the db
# NOTE(review): `nprobe` is normally an IVF-index parameter; with the FLAT
# index requested earlier it is presumably ignored — confirm.
search_params = {"metric_type": "COSINE", "params": {"nprobe": 10}}

output_ = client.search(
    collection_name=config['rag_collection_name'],
    data=query_embedding.tolist(),
    anns_field="embedding",
    search_params=search_params,
    limit=10,
    output_fields=["passage"]
)

# One query was submitted, so its hits are the first entry of the result.
print("Search results:")
for i, result in enumerate(output_[0]):
    print(f"Rank {i+1}: Score: {result['distance']:.4f}")
    print(f"Passage: {result['entity']['passage'][:200]}...")
    print("-" * 50)

In [None]:
# Load the LLM
model_name = config['llm']

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
print(f"Model loaded on device: {device}")


In [None]:
def _optimize_context_window(contexts, max_chars):
    """Join retrieved passages and trim the result to at most ``max_chars``.

    Passages are joined with single spaces. When the joined text is too long,
    whole sentences (split on ``'. '``) are kept in order until the next one
    would exceed the budget. If even the first sentence does not fit, the text
    is cut at ``max_chars`` so the prompt is never left without context.
    """
    combined = " ".join(ctx['entity']['passage'] for ctx in contexts)
    if len(combined) <= max_chars:
        return combined

    kept = ""
    for sentence in combined.split('. '):
        candidate = kept + sentence + ". "
        if len(candidate) > max_chars:
            break
        kept = candidate

    if not kept:
        # First sentence alone exceeds the budget: hard-truncate instead of
        # returning an empty context.
        return combined[:max_chars].strip()
    return kept.strip()


def get_rag_response(question, top_k=5, max_context_chars=800):
    """Answer ``question`` using passages retrieved from the Milvus collection.

    Args:
        question: Natural-language question to answer.
        top_k: Number of passages to retrieve. The mean score of these hits is
            the confidence; only the first 2 (confidence > 0.75) or first 4
            hits are passed on as context.
        max_context_chars: Character budget for the context put in the prompt.

    Returns:
        Tuple ``(generated_answer, context_list)`` where ``context_list`` holds
        the full text of the selected passages (not the trimmed prompt context).
    """
    # Get query embedding
    query_embedding = embedding_model.encode([question])

    # Search the collection named in the config, as the creation cell does.
    search_results = client.search(
        collection_name=config['rag_collection_name'],
        data=query_embedding.tolist(),
        anns_field="embedding",
        search_params={"metric_type": "COSINE", "params": {"nprobe": 10}},
        limit=top_k,
        output_fields=["passage"]
    )
    hits = search_results[0]

    # Enhancement 1: Confidence Scoring
    # With no hits, treat confidence as 0 instead of taking the mean of an
    # empty list (which yields NaN and a runtime warning).
    scores = [hit['distance'] for hit in hits]
    confidence = float(np.mean(scores)) if scores else 0.0

    # Adaptive context selection based on confidence
    if confidence > 0.75:
        selected_contexts = hits[:2]
        certainty_note = "Based on highly relevant information"
    else:
        selected_contexts = hits[:4]
        certainty_note = "Based on available information with moderate confidence"

    # Enhancement 2: Context Window Optimization
    optimized_context = _optimize_context_window(selected_contexts, max_context_chars)

    # Generate response with enhanced prompting
    system_prompt = f"""You are a extremely knowledgeable and helpful assistant. {certainty_note}, answer the question using the provided context. Be extremely concise, only respond with the answer to the question, with no other commentary or information required. If the context doesn't fully answer the question, acknowledge this limitation and say 'I don't know'."""

    prompt = f"""{system_prompt}

    Context: {optimized_context}

    Question: {question}

    Answer:"""

    # Generate answer; the prompt is truncated to 400 tokens before generation.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=400, truncation=True).to(device)

    # NOTE(review): do_sample=True makes answers vary between runs unless the
    # torch RNG is seeded before calling this function.
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_length=100,
            num_beams=4,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )

    generated_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    context_list = [ctx['entity']['passage'] for ctx in selected_contexts]
    return generated_answer, context_list

# Retrieval strategies to compare: label -> number of passages to retrieve.
strategies = {f"top{k}": k for k in (1, 3, 5)}

# Filled by the generation cell, keyed by strategy label.
results = dict()

In [None]:
# Generate responses for different strategies

# get_rag_response decodes with do_sample=True, so seed torch once here to make
# a re-run of this cell repeatable (up to backend nondeterminism).
GENERATION_SEED = 42
torch.manual_seed(GENERATION_SEED)

# Pair questions with reference answers once; every strategy sees the same
# pairs in the same order.
qa_pairs = list(zip(queries['question'], queries['answer']))

for strategy_name, top_k in strategies.items():
    print(f"\nGenerating responses with {strategy_name} strategy...")

    strategy_results = {
        'questions': [],
        'generated_answers': [],
        'ground_truth_answers': [],
        'contexts': []
    }

    for question, ground_truth in tqdm(qa_pairs, desc=f"Processing {strategy_name}"):
        generated_answer, contexts = get_rag_response(question, top_k=top_k)

        strategy_results['questions'].append(question)
        strategy_results['generated_answers'].append(generated_answer)
        strategy_results['ground_truth_answers'].append(ground_truth)
        strategy_results['contexts'].append(contexts)

    results[strategy_name] = strategy_results

print("\nResponse generation completed!")


In [None]:
# Display the first 15 answers generated under the "top3" strategy.
results['top3']['generated_answers'][:15]

In [None]:
def normalize_answer(s):
    """Canonicalize an answer string for comparison.

    Steps, in order: lowercase, delete ASCII punctuation, replace the
    standalone words a/an/the with a space, collapse whitespace runs.
    """
    lowered = s.lower()
    # Delete every character listed in string.punctuation.
    without_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punct)
    # split()/join trims the ends and squeezes internal whitespace.
    return ' '.join(without_articles.split())

def exact_match_score(prediction, ground_truth):
    """Return 1 if both strings are equal after normalization, otherwise 0."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return 1 if normalized_prediction == normalized_truth else 0

def f1_score_qa(prediction, ground_truth):
    """Token-level F1 between a prediction and a reference answer.

    Both strings are normalized and split on whitespace. Overlap is counted as
    a multiset intersection, so a token repeated in both strings counts once
    per shared occurrence; this keeps precision and recall consistent with
    their denominators, which count every token.

    Returns:
        A float in [0, 1]. If either side has no tokens, the score is 1.0 only
        when both are empty.
    """
    # Imported locally so the function does not depend on a top-level import.
    from collections import Counter

    pred_tokens = normalize_answer(prediction).split()
    truth_tokens = normalize_answer(ground_truth).split()

    if not pred_tokens or not truth_tokens:
        return float(pred_tokens == truth_tokens)

    shared = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(shared.values())

    if num_same == 0:
        return 0.0

    precision = num_same / len(pred_tokens)
    recall = num_same / len(truth_tokens)

    return 2 * (precision * recall) / (precision + recall)

print("QA METRICS EVALUATION")

# Per-strategy averages plus the per-question scores they were computed from.
evaluation_results = {}

for strategy_name, strategy_data in results.items():
    print(f"\nEvaluating {strategy_name} strategy:")

    answer_pairs = list(zip(strategy_data['generated_answers'], strategy_data['ground_truth_answers']))

    # Built as lists directly: the previous per-item local named `f1_score`
    # rebound the `f1_score` imported from sklearn.metrics.
    em_scores = [exact_match_score(pred, truth) for pred, truth in answer_pairs]
    f1_scores = [f1_score_qa(pred, truth) for pred, truth in answer_pairs]

    avg_em = np.mean(em_scores)
    avg_f1 = np.mean(f1_scores)

    evaluation_results[strategy_name] = {
        'exact_match': avg_em,
        'f1_score': avg_f1,
        'em_scores': em_scores,
        'f1_scores': f1_scores
    }

    print(f"Exact Match: {avg_em:.4f}")
    print(f"F1 Score: {avg_f1:.4f}")


In [None]:
# Results comparison: one row per strategy with its average EM and F1.
print("STRATEGY COMPARISON")

strategy_names = list(evaluation_results)
comparison_df = pd.DataFrame({
    'Strategy': strategy_names,
    'Exact Match': [evaluation_results[name]['exact_match'] for name in strategy_names],
    'F1 Score': [evaluation_results[name]['f1_score'] for name in strategy_names],
})

print(comparison_df)

In [None]:
# Best strategy identification: find the row with the highest score for each
# metric and read its strategy label.
best_f1_row = comparison_df['F1 Score'].idxmax()
best_em_row = comparison_df['Exact Match'].idxmax()

best_f1_strategy = comparison_df.at[best_f1_row, 'Strategy']
best_em_strategy = comparison_df.at[best_em_row, 'Strategy']

print(f"Best strategy by F1 Score: {best_f1_strategy}")
print(f"Best strategy by Exact Match: {best_em_strategy}")


In [None]:
# Keep the generation outputs of the strategy with the highest average F1 for
# the RAGAs evaluation below.
best_results = results[best_f1_strategy]

In [None]:
# OpenAI API for RAGAs
import os
from getpass import getpass

# Do not overwrite a key that is already set in the environment, and never
# hard-code one in the notebook: prompt for it only when it is missing.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass("OpenAI API key (used by RAGAs): ")

# Advanced Evaluation using RAGAs

In [None]:
# Number of leading samples scored by RAGAs.
RAGAS_SAMPLE_SIZE = 50

ragas_data = {
    "question": best_results['questions'][:RAGAS_SAMPLE_SIZE],
    "answer": best_results['generated_answers'][:RAGAS_SAMPLE_SIZE],
    # Score against every passage that was selected for the answer, not only
    # the top-ranked one, so the metrics see the same evidence as the generator.
    # Slicing first also avoids building entries that are then discarded.
    "contexts": [list(ctxs) for ctxs in best_results['contexts'][:RAGAS_SAMPLE_SIZE]],
    "ground_truth": best_results['ground_truth_answers'][:RAGAS_SAMPLE_SIZE]
}

# Convert dict to dataset
dataset = Dataset.from_dict(ragas_data)

print(f"Dataset prepared for RAGAs evaluation with {len(dataset)} samples")


In [None]:
# Score the prepared dataset with the four RAGAs metrics.
# NOTE(review): raise_exceptions=False presumably records failed samples
# instead of aborting the run — confirm in the ragas documentation.
ragas_metrics = [faithfulness, answer_relevancy, context_precision, context_recall]
result = evaluate(dataset, metrics=ragas_metrics, raise_exceptions=False)

In [None]:
# Run record to be serialized. Model names are read from `config` so the saved
# metadata always matches the models that were actually loaded.
results_data = {
    'metadata': {
        'date': datetime.now().isoformat(),
        'dataset': 'RAG Mini Wikipedia',
        'total_queries': len(queries),
        'embedding_model': config['embedding_model'],
        'llm_model': config['llm'],
        'embedding_dim': config['embedding_dim'],
        'prompting_style': config['prompting_style'],
    },
    'strategy_comparison': comparison_df.to_dict('records'),
    # Filled per strategy by the code that follows.
    'detailed_results': {}
}

# Add results for each strategy: averages, sample count and spread statistics
# of the per-question scores, all converted to plain floats for JSON.
for strategy_name, strategy_data in evaluation_results.items():
    em_values = strategy_data['em_scores']
    f1_values = strategy_data['f1_scores']

    score_statistics = {
        'em_std': float(np.std(em_values)),
        'f1_std': float(np.std(f1_values)),
        'em_min': float(np.min(em_values)),
        'em_max': float(np.max(em_values)),
        'f1_min': float(np.min(f1_values)),
        'f1_max': float(np.max(f1_values)),
    }

    results_data['detailed_results'][strategy_name] = {
        'exact_match': float(strategy_data['exact_match']),
        'f1_score': float(strategy_data['f1_score']),
        'num_samples': len(em_values),
        'statistics': score_statistics,
    }

# Add best performing strategy for each metric, with its winning score.
f1_column = comparison_df['F1 Score']
em_column = comparison_df['Exact Match']

best_f1_strategy = comparison_df.loc[f1_column.idxmax(), 'Strategy']
best_em_strategy = comparison_df.loc[em_column.idxmax(), 'Strategy']

results_data['best_strategies'] = dict(
    best_f1=best_f1_strategy,
    best_em=best_em_strategy,
    best_f1_score=float(f1_column.max()),
    best_em_score=float(em_column.max()),
)

# Average each RAGAs metric column over the evaluated samples.
ragas_df = result.to_pandas()
ragas_metric_names = ('faithfulness', 'answer_relevancy', 'context_precision', 'context_recall')
results_data['ragas'] = {
    metric_name: float(ragas_df[metric_name].mean())
    for metric_name in ragas_metric_names
}

# Save to JSON
# The file name is built with double quotes outside and single quotes inside:
# reusing the same quote type inside an f-string is a SyntaxError before
# Python 3.12.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f"results_advanced_{config['embedding_dim']}_{config['prompting_style']}_{timestamp}.json"
with open(output_filename, 'w') as f:
    json.dump(results_data, f, indent=2)

print(f"Results saved to {output_filename}")

# Print summary: comparison table, winning strategies, then the RAGAs averages.
best_summary = results_data['best_strategies']
ragas_summary = results_data['ragas']

print("\nResults Summary:")
print(comparison_df)
print(f"\nBest F1 Strategy: {best_f1_strategy} ({best_summary['best_f1_score']:.4f})")
print(f"Best EM Strategy: {best_em_strategy} ({best_summary['best_em_score']:.4f})")

print("\nRAGAS Results:")
ragas_labels = (
    ("Faithfulness", 'faithfulness'),
    ("Answer Relevancy", 'answer_relevancy'),
    ("Context Precision", 'context_precision'),
    ("Context Recall", 'context_recall'),
)
for label, metric_key in ragas_labels:
    print(f"{label}: {ragas_summary[metric_key]:.4f}")