In [None]:
# Install the third-party packages this notebook relies on into the kernel's environment.
# NOTE(review): no versions are pinned, so a fresh install may pull different releases — consider pinning.
# NOTE(review): langchain_community is imported in the next cell but not installed here;
# presumably it arrives as a dependency of another package — confirm.
%pip install pymilvus[milvus_lite]
%pip install transformers
%pip install datasets
%pip install sentence-transformers
%pip install ragas
%pip install evaluate

In [None]:
# Load required Libraries
import pandas as pd
import numpy as np
import transformers, torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
import json
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
from sklearn.metrics import f1_score
import re
import string

from datetime import datetime

from pymilvus import MilvusClient, FieldSchema, CollectionSchema, DataType

from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_community.llms import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import pipeline

In [None]:
# Experiment settings shared by the cells below.
# "embedding_dim" is intentionally kept as a string here: it is copied verbatim
# into the results metadata and interpolated into the output filename.
config = dict(
    llm="google/flan-t5-base",
    embedding_model="all-MiniLM-L6-v2",
    embedding_dim="384",
    prompting_style="persona",
    rag_collection_name="rag_mini",
)

In [None]:
# Instruction text placed at the very start of every prompt built by get_response.
system_prompt = "You are a helpful assistant. Answer the question  accurately and concisely."

In [None]:
import pandas as pd

# Read the question/answer parquet file from its hf:// dataset path.
queries = pd.read_parquet(
    "hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet"
)

# Quick sanity check of what was loaded: dimensions and column names.
print(
    f"Queries dataset shape: {queries.shape}",
    f"Queries columns: {queries.columns.tolist()}",
    sep="\n",
)


In [None]:
# Keep only rows with a usable question and answer:
# first drop missing values, then drop entries that are blank once whitespace is stripped.
queries = queries.dropna(subset=['question', 'answer'])
queries = queries[
    (queries['question'].str.strip() != '')
    & (queries['answer'].str.strip() != '')
]

# Report the size after cleaning and preview the first rows.
print(
    f"Cleaned queries shape: {queries.shape}",
    "\nSample queries:",
    queries.head(),
    sep="\n",
)

In [None]:
# Load the tokenizer and seq2seq model named in the config.
model_name = config['llm']
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
print(f"Model loaded on device: {device}")


In [None]:
def get_response(question, top_k=1, model=model, tokenizer=tokenizer):
    """Answer ``question`` with the LLM alone, without any retrieved context.

    Args:
        question: Question text inserted into the prompt.
        top_k: Accepted so the strategy loop can call every strategy the same
            way; it is not used by this no-retrieval baseline.
        model: Model whose ``generate`` method produces the answer. Defaults to
            the model loaded earlier (bound when this function is defined).
        tokenizer: Tokenizer used to encode the prompt and decode the output.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    # Create prompt (layout kept exactly as-is so generated answers stay comparable)

    prompt = f"""{system_prompt}

    Question: {question}

    Answer:"""

    # Encode the prompt and move it to the device of the model that is actually
    # being called, so passing a model that lives on another device still works.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(model.device)

    # Generate response
    # NOTE(review): do_sample=True makes the output vary between runs unless a
    # seed is set beforehand — confirm this is intended for EM/F1 evaluation.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            # Forward the tokenizer's mask explicitly instead of dropping it
            # (None when the tokenizer did not produce one).
            attention_mask=inputs.get("attention_mask"),
            max_length=150,
            num_beams=4,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )

    generated_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_answer

# Strategies to evaluate: display name -> top_k value handed to get_response.
strategies = {"NO RAG": 1}

# Filled by the generation loop: strategy name -> collected questions and answers.
results = dict()

In [None]:
# Run every configured strategy over the full query set and collect,
# per strategy, the questions, the model's answers and the reference answers.

for strategy_name, top_k in strategies.items():
    print(f"\nGenerating responses with {strategy_name} strategy...")

    asked, predicted, expected = [], [], []

    # Walk the two columns in lockstep; tqdm needs the total since zip has no length.
    qa_pairs = zip(queries['question'], queries['answer'])
    for question, ground_truth in tqdm(qa_pairs, total=len(queries), desc=f"Processing {strategy_name}"):
        generated_answer = get_response(question, top_k=top_k)

        asked.append(question)
        predicted.append(generated_answer)
        expected.append(ground_truth)

    results[strategy_name] = {
        'questions': asked,
        'generated_answers': predicted,
        'ground_truth_answers': expected,
    }

print("\nResponse generation completed!")


In [None]:
def normalize_answer(s):
    """Return ``s`` in canonical form for answer comparison.

    The text is lower-cased, stripped of ASCII punctuation, has the standalone
    articles "a", "an" and "the" removed, and has all whitespace runs collapsed
    to single spaces (with no leading or trailing space).
    """
    text = s.lower()

    # Delete every character listed in string.punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Replace whole-word articles with a space; the extra spaces vanish below.
    text = re.sub(r'\b(a|an|the)\b', ' ', text)

    # split() with no argument drops all whitespace runs, so join() normalizes them.
    return ' '.join(text.split())

def exact_match_score(prediction, ground_truth):
    """Return 1 if both answers are identical after normalization, otherwise 0."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return 1 if normalized_prediction == normalized_truth else 0

def f1_score_qa(prediction, ground_truth):
    """Compute the token-level F1 between a predicted and a reference answer.

    Both answers are normalized with ``normalize_answer`` and split on
    whitespace. The overlap is counted as a multiset intersection, so a token
    that occurs several times in both answers is credited each time it matches.
    (Counting distinct tokens only would give identical answers with repeated
    words an F1 below 1.)

    Args:
        prediction: Generated answer text.
        ground_truth: Reference answer text.

    Returns:
        A float in [0, 1]. If either answer has no tokens after normalization,
        the result is 1.0 when both are empty and 0.0 otherwise.
    """
    from collections import Counter

    pred_tokens = normalize_answer(prediction).split()
    truth_tokens = normalize_answer(ground_truth).split()

    # With an empty side, precision/recall are undefined; score by equality instead.
    if not pred_tokens or not truth_tokens:
        return float(pred_tokens == truth_tokens)

    # Counter & Counter keeps each token with the smaller of its two counts.
    overlap = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())

    if overlap == 0:
        return 0.0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(truth_tokens)

    return 2 * (precision * recall) / (precision + recall)

print("QA METRICS EVALUATION")

# strategy name -> mean scores plus the per-sample score lists
evaluation_results = {}

for strategy_name, strategy_data in results.items():
    print(f"\nEvaluating {strategy_name} strategy:")

    em_scores = []
    f1_scores = []

    for pred, truth in zip(strategy_data['generated_answers'], strategy_data['ground_truth_answers']):
        # Append directly rather than assigning to a per-sample variable called
        # `f1_score`: that name is the function imported from sklearn.metrics,
        # and rebinding it here would replace the import with a float.
        em_scores.append(exact_match_score(pred, truth))
        f1_scores.append(f1_score_qa(pred, truth))

    avg_em = np.mean(em_scores)
    avg_f1 = np.mean(f1_scores)

    evaluation_results[strategy_name] = {
        'exact_match': avg_em,
        'f1_score': avg_f1,
        'em_scores': em_scores,
        'f1_scores': f1_scores
    }

    print(f"Exact Match: {avg_em:.4f}")
    print(f"F1 Score: {avg_f1:.4f}")


In [None]:
# Side-by-side table of the mean scores: one row per evaluated strategy.
print("STRATEGY COMPARISON")
comparison_df = pd.DataFrame(
    [
        {
            'Strategy': name,
            'Exact Match': scores['exact_match'],
            'F1 Score': scores['f1_score'],
        }
        for name, scores in evaluation_results.items()
    ],
    # Fix the column order explicitly so it does not depend on the records.
    columns=['Strategy', 'Exact Match', 'F1 Score'],
)

print(comparison_df)

In [None]:
# Take one timestamp so the metadata date and the output filename agree.
run_time = datetime.now()

# Everything written to disk: run metadata, the summary table and per-strategy details.
results_data = {
    'metadata': {
        'date': run_time.isoformat(),
        'dataset': 'RAG Mini Wikipedia',
        'total_queries': len(queries),
        # Read the model names from config so the record matches the
        # settings this run actually used (they were hard-coded before and the
        # embedding model name disagreed with config).
        'embedding_model': config['embedding_model'],
        'llm_model': config['llm'],
        'embedding_dim': config['embedding_dim'],
        'prompting_style': config['prompting_style'],
    },
    'strategy_comparison': comparison_df.to_dict('records'),
    'detailed_results': {}
}

# Add results for each strategy
for strategy_name, strategy_data in evaluation_results.items():
    results_data['detailed_results'][strategy_name] = {
        # float(...) converts numpy scalars into plain floats that json can serialize.
        'exact_match': float(strategy_data['exact_match']),
        'f1_score': float(strategy_data['f1_score']),
        'num_samples': len(strategy_data['em_scores']),
        'statistics': {
            'em_std': float(np.std(strategy_data['em_scores'])),
            'f1_std': float(np.std(strategy_data['f1_scores'])),
            'em_min': float(np.min(strategy_data['em_scores'])),
            'em_max': float(np.max(strategy_data['em_scores'])),
            'f1_min': float(np.min(strategy_data['f1_scores'])),
            'f1_max': float(np.max(strategy_data['f1_scores']))
        }
    }


# Save to JSON
# The f-string uses double quotes outside and single quotes inside: reusing the
# same quote character inside an f-string is a syntax error before Python 3.12.
output_filename = (
    f"results_noRAG_{config['embedding_dim']}_{config['prompting_style']}_"
    f"{run_time.strftime('%Y%m%d_%H%M%S')}.json"
)
with open(output_filename, 'w') as f:
    json.dump(results_data, indent=2, fp=f)

print(f"Results saved to {output_filename}")

# Print summary
print("\nResults Summary:")
print(comparison_df)