In [None]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import nltk
nltk.download('punkt')
from rouge_score import rouge_scorer
import torch
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize

In [None]:
# Load the evaluation file that holds, for every row, the question, the
# reference answer and the LLM-generated answer.
data_path = r"C:\Users\jithi\Downloads\RAG_evaluation.csv"

# Pick the reader from the file extension. The path above ends in ".csv", and
# pd.read_excel cannot parse a plain-text CSV file.
if data_path.lower().endswith(('.xls', '.xlsx', '.xlsm')):
    data = pd.read_excel(data_path)
else:
    try:
        data = pd.read_csv(data_path)
    except (UnicodeDecodeError, pd.errors.ParserError):
        # The file is not readable as text/CSV; fall back to the Excel reader
        # in case it is a workbook saved under a ".csv" name.
        data = pd.read_excel(data_path)

# The columns read below are 'Question', 'Answer' (the reference answer) and
# 'LLM_answer' (the generated answer).
questions = data['Question'].tolist()
true_answers = data['Answer'].tolist()
generated_answers = data['LLM_answer'].tolist()


In [None]:
# Function to calculate ROUGE scores
def calculate_rouge(generated, reference):
    """Return the ROUGE F-measures of ``generated`` against ``reference``.

    Args:
        generated: The generated text to evaluate.
        reference: The reference text to compare against.

    Returns:
        A dict keyed by the metric names produced by the scorer ('rouge1',
        'rouge2' and 'rougeL' are requested), each mapped to the
        ``fmeasure`` of that metric.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    rouge = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    # The reference text is passed first and the generated text second.
    per_metric = rouge.score(reference, generated)

    f_measures = {}
    for metric, result in per_metric.items():
        f_measures[metric] = result.fmeasure
    return f_measures

# Function to calculate BLEU score
def calculate_bleu(generated, reference):
    """Return the sentence-level BLEU score of ``generated`` against ``reference``.

    Both texts are lower-cased and tokenized with ``word_tokenize``; the
    reference is supplied to ``sentence_bleu`` as the only reference.

    Args:
        generated: The generated text to evaluate.
        reference: The reference text to compare against.

    Returns:
        The value returned by ``sentence_bleu`` with its default settings.
    """
    # Tokenize the reference first, then the generated text.
    target_tokens, hypothesis_tokens = [
        word_tokenize(text.lower()) for text in (reference, generated)
    ]
    return sentence_bleu([target_tokens], hypothesis_tokens)

# Function to calculate perplexity
def calculate_perplexity(text, model, tokenizer):
    """Return the perplexity of ``text`` under a causal language model.

    The text is encoded with ``tokenizer``, the token ids are passed to
    ``model`` as both inputs and labels, and the exponential of the returned
    loss is the perplexity.

    Args:
        text: The text to score.
        model: A callable model that accepts ``(input_ids, labels=...)`` and
            returns an object with a ``loss`` tensor (e.g. ``GPT2LMHeadModel``).
        tokenizer: A tokenizer whose ``encode(text, return_tensors='pt')``
            returns a tensor of token ids.

    Returns:
        The perplexity as a Python float, or ``nan`` when the text encodes to
        fewer than two tokens (there is no next-token prediction to score).
    """
    inputs = tokenizer.encode(text, return_tensors='pt')

    # Empty or single-token input has nothing to predict; report an undefined
    # perplexity rather than feeding a degenerate tensor to the model.
    if inputs.numel() < 2:
        return float('nan')

    # Clip to the model's maximum number of positions, when it exposes one, so
    # that long texts do not index past the position embeddings.
    max_positions = getattr(getattr(model, 'config', None), 'max_position_embeddings', None)
    if max_positions is not None:
        inputs = inputs[:, :max_positions]

    # Keep the ids on the same device as the model, when the model reports one.
    device = getattr(model, 'device', None)
    if device is not None:
        inputs = inputs.to(device)

    with torch.no_grad():
        outputs = model(inputs, labels=inputs)
    return torch.exp(outputs.loss).item()



In [None]:
# Load the pre-trained GPT-2 model and tokenizer used for perplexity scoring
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
# Explicitly select inference mode so dropout cannot affect the loss.
model.eval()


# Per-row metric values, collected in the same order as the rows of `data`
rouge_scores_list = []
bleu_scores_list = []
perplexity_list = []

# Score every generated answer against its reference answer
for generated, true in zip(generated_answers, true_answers):
    rouge_scores = calculate_rouge(generated, true)
    bleu_score = calculate_bleu(generated, true)
    perplexity = calculate_perplexity(generated, model, tokenizer)

    # Only the ROUGE-1 F-measure is kept; use 'rouge2' or 'rougeL' here to
    # store a different variant.
    rouge_scores_list.append(rouge_scores['rouge1'])
    bleu_scores_list.append(bleu_score)
    perplexity_list.append(perplexity)

# Attach the metrics as new columns of the DataFrame
data['ROUGE Score'] = rouge_scores_list
data['BLEU Score'] = bleu_scores_list
data['Perplexity'] = perplexity_list

# Write the scored rows to a CSV file. index=False keeps the DataFrame's row
# index out of the file, so re-reading it does not produce an extra unnamed
# column.
data.to_csv("rag_eval.csv", index=False)
# A sample evaluated file is provided alongside this notebook.