# ROUGE-L

In [1]:
from rouge_score import rouge_scorer
import pandas as pd

# Shared scorer, created on first use. Its configuration never changes, so
# rebuilding it (tokenizer + stemmer) for every row is wasted work.
_ROUGE_L_SCORER = None


def calculate_rouge_l(expected, generated):
    """Return the ROUGE-L F1 score of ``generated`` against ``expected``.

    Args:
        expected: Reference text.
        generated: Candidate text to score against the reference.

    Returns:
        The ``fmeasure`` field of the ``rougeL`` score.
    """
    global _ROUGE_L_SCORER
    if _ROUGE_L_SCORER is None:
        _ROUGE_L_SCORER = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = _ROUGE_L_SCORER.score(expected, generated)
    return scores["rougeL"].fmeasure

def evaluate_rouge(csv_file):
    """Print the mean ROUGE-L score of each RAG output column in a CSV file.

    The file is read with ``pd.read_csv``. Every row's ``expected_output``
    is compared with the ``rag_v1_output``, ``rag_v2_output`` and
    ``rag_v3_output`` columns via ``calculate_rouge_l``.

    Args:
        csv_file: Path (or other ``pd.read_csv`` source) of the test set.

    Returns:
        None. The averages are written to standard output.
    """
    frame = pd.read_csv(csv_file)
    output_columns = ("rag_v1_output", "rag_v2_output", "rag_v3_output")

    def _row_score(row, column):
        # Score one row's generated text against its reference answer.
        return calculate_rouge_l(row["expected_output"], row[column])

    # Finish scoring every column before anything is printed.
    averages = {}
    for column in output_columns:
        score_column = f"rouge_l_{column}"
        frame[score_column] = frame.apply(_row_score, axis=1, args=(column,))
        averages[column] = frame[score_column].mean()

    for column, average in averages.items():
        print(f"📊 Average ROUGE-L Score for {column}: {average:.4f}")

# Run the ROUGE-L evaluation on the test set; the path is relative to the
# current working directory.
evaluate_rouge("test_queries.csv")

📊 Average ROUGE-L Score for rag_v1_output: 0.5830
📊 Average ROUGE-L Score for rag_v2_output: 0.5783
📊 Average ROUGE-L Score for rag_v3_output: 0.5716


# Cosine Similarity

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def calculate_cosine_similarity(expected, generated):
    """Return the TF-IDF cosine similarity between two texts.

    A fresh ``TfidfVectorizer`` is fitted on just this pair, so the
    vocabulary and weights are derived only from ``expected`` and
    ``generated``.

    Args:
        expected: Reference text.
        generated: Candidate text to compare with the reference.

    Returns:
        Similarity score between 0 and 1. ``0.0`` is returned when the
        vectorizer finds no usable term in either text.

    Raises:
        ValueError: Re-raised from the vectorizer for any failure other
            than an empty vocabulary.
    """
    try:
        # Learn the vocabulary and build both vectors in a single pass.
        vectors = TfidfVectorizer().fit_transform([expected, generated])
    except ValueError as err:
        # The vectorizer rejects a pair that yields no tokens at all (for
        # example two empty strings). There is nothing to compare in that
        # case, so score it as no similarity instead of aborting the whole
        # evaluation. Any other ValueError is a real problem and propagates.
        if "empty vocabulary" not in str(err):
            raise
        return 0.0
    return cosine_similarity(vectors)[0, 1]

def evaluate_cosine_similarity(csv_file):
    """Print the mean cosine similarity of each RAG output column in a CSV.

    The file is read with ``pd.read_csv``. Every row's ``expected_output``
    is compared with the ``rag_v1_output``, ``rag_v2_output`` and
    ``rag_v3_output`` columns via ``calculate_cosine_similarity``.

    Args:
        csv_file: Path (or other ``pd.read_csv`` source) of the test set.

    Returns:
        None. The averages are written to standard output.
    """
    frame = pd.read_csv(csv_file)
    output_columns = ("rag_v1_output", "rag_v2_output", "rag_v3_output")

    def _row_similarity(row, column):
        # Compare one row's generated text with its reference answer.
        return calculate_cosine_similarity(row["expected_output"], row[column])

    # Finish scoring every column before anything is printed.
    averages = {}
    for column in output_columns:
        score_column = f"cosine_similarity_{column}"
        frame[score_column] = frame.apply(_row_similarity, axis=1, args=(column,))
        averages[column] = frame[score_column].mean()

    for column, average in averages.items():
        print(f"📊 Average Cosine Similarity for {column}: {average:.4f}")

# Run the cosine-similarity evaluation on the test set; the path is relative
# to the current working directory.
evaluate_cosine_similarity("test_queries.csv")

📊 Average Cosine Similarity for rag_v1_output: 0.4180
📊 Average Cosine Similarity for rag_v2_output: 0.4308
📊 Average Cosine Similarity for rag_v3_output: 0.4359


# BLEU Score

In [3]:
import pandas  as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_bleu_score(expected, generated):
    """Return the smoothed sentence-level BLEU score of ``generated``.

    Both texts are tokenized by splitting on whitespace, and ``expected``
    is used as the single reference.

    Args:
        expected: Reference text.
        generated: Candidate text to score against the reference.

    Returns:
        The value returned by ``sentence_bleu``.
    """
    reference_tokens = expected.split()
    candidate_tokens = generated.split()
    # method1 smoothing handles zero n-gram overlap issues.
    smoother = SmoothingFunction()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=smoother.method1,
    )

def evaluate_bleu(csv_file):
    """Print the mean BLEU score of each RAG output column in a CSV file.

    The file is read with ``pd.read_csv``. Every row's ``expected_output``
    is compared with the ``rag_v1_output``, ``rag_v2_output`` and
    ``rag_v3_output`` columns via ``calculate_bleu_score``.

    Args:
        csv_file: Path (or other ``pd.read_csv`` source) of the test set.

    Returns:
        None. The averages are written to standard output.
    """
    frame = pd.read_csv(csv_file)
    output_columns = ("rag_v1_output", "rag_v2_output", "rag_v3_output")

    def _row_bleu(row, column):
        # Score one row's generated text against its reference answer.
        return calculate_bleu_score(row["expected_output"], row[column])

    # Finish scoring every column before anything is printed.
    averages = {}
    for column in output_columns:
        score_column = f"bleu_score_{column}"
        frame[score_column] = frame.apply(_row_bleu, axis=1, args=(column,))
        averages[column] = frame[score_column].mean()

    for column, average in averages.items():
        print(f"📊 Average BLEU Score for {column}: {average:.4f}")

# Run the BLEU evaluation on the test set; the path is relative to the
# current working directory.
evaluate_bleu("test_queries.csv")

📊 Average BLEU Score for rag_v1_output: 0.0127
📊 Average BLEU Score for rag_v2_output: 0.0197
📊 Average BLEU Score for rag_v3_output: 0.0141
