In [21]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from rouge_score import rouge_scorer
from nltk.corpus import wordnet
from bert_score import score as bert_score
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import nltk
nltk.download('wordnet')
import transformers
import warnings
warnings.filterwarnings("ignore")




[nltk_data] Downloading package wordnet to /home/uttam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




#### Evaluation is  done in the training scripts for all models , this script is to summarize the results using the prediction CSV files produced by the models


In [11]:

# Cosine similarity model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

In [12]:
#Exact match
def exact_match(y_true, y_pred):
    return [int(a.strip().lower() == b.strip().lower()) for a, b in zip(y_true, y_pred)]


In [13]:
# Synonym Match
def synonym_match(y_true, y_pred):
    def is_synonym(word1, word2):
        syns1 = set([lemma.name().lower() for syn in wordnet.synsets(word1) for lemma in syn.lemmas()])
        syns2 = set([lemma.name().lower() for syn in wordnet.synsets(word2) for lemma in syn.lemmas()])
        return len(syns1.intersection(syns2)) > 0

    result = []
    for a, b in zip(y_true, y_pred):
        a, b = a.strip().lower(), b.strip().lower()
        if a == b:
            result.append(1)
        elif is_synonym(a, b):
            result.append(1)
        else:
            result.append(0)
    return result

In [14]:
# ROUGE Score
def rouge_l_score(y_true, y_pred):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = [scorer.score(ref, pred)['rougeL'].fmeasure for ref, pred in zip(y_true, y_pred)]
    return sum(scores) / len(scores)

In [15]:
# Cosine similarity score
def cosine_similarity_score(y_true, y_pred):
    embeddings1 = sbert_model.encode(y_true, convert_to_tensor=True)
    embeddings2 = sbert_model.encode(y_pred, convert_to_tensor=True)
    cos_sim = util.cos_sim(embeddings1, embeddings2)
    return cos_sim.diag().mean().item()

In [16]:
def evaluate_file(file_path):
    df = pd.read_csv(file_path)
    y_true = df['true_answer'].astype(str).tolist()
    y_pred = df['predicted_answer'].astype(str).tolist()

    em = exact_match(y_true, y_pred)
    acc = accuracy_score(em, [1]*len(em))
    precision = precision_score(em, [1]*len(em), zero_division=0)
    recall = recall_score(em, [1]*len(em), zero_division=0)
    f1 = f1_score(em, [1]*len(em), zero_division=0)

    rouge = rouge_l_score(y_true, y_pred)

    syn_acc = accuracy_score(synonym_match(y_true, y_pred), [1]*len(y_true))

    # BERTScore
    bert_p, bert_r, bert_f1 = bert_score(y_pred, y_true, lang='en', rescale_with_baseline=True)

    cos_sim = cosine_similarity_score(y_true, y_pred)

    return {
        "Exact Match Accuracy":round(acc * 100, 2),
        # Proportion of predictions that exactly match the true answers.
        
        "Synonym Accuracy" :round(syn_acc * 100, 2),
        # Proportion of predicted answers that are synonyms of the true answers.

        "Exact Match Precision":round(precision * 100, 2),
        # Proportion of exact matches among all predicted matches.
        
        "Exact Match Recall":round(recall * 100, 2),
        # Proportion of exact matches among all true answers.

        "Exact Match F1":round(f1 * 100, 2),
        # Harmonic mean of exact match precision and recall.

        "ROUGE-L F1" :round(rouge * 100, 2),
        # Measures overlap based on the longest common subsequence between predicted and true answers.

        "BERTScore Precision" :round(bert_p.mean().item() * 100, 2),
        # Measures how much of the predicted answer’s meaning matches the true answer using contextual embeddings.

        "BERTScore Recall" :round(bert_r.mean().item() * 100, 2),
        # Measures how much of the true answer’s meaning is captured by the prediction using contextual embeddings.

        "BERTScore F1"  :round(bert_f1.mean().item() * 100, 2),
        # Harmonic mean of BERTScore precision and recall, indicating overall semantic similarity.

        "Cosine Similarity" : round(cos_sim * 100, 2),
        # Cosine of the angle between the embedding vectors of predicted and true answers, representing semantic closeness.
    }



In [27]:
import logging
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)


csv_files = [
    "Evaluation/predictions_BLIP_Baseline.csv",
    "Evaluation/predictions_Vilt_Baseline.csv",
    "Evaluation/predictions_BLIP_r8.csv",
    "Evaluation/predictions_BLIP_r16.csv",
    "Evaluation/predictions_BLIP_r32.csv",
   
    
]

# Evaluate and print results
for csv_file in csv_files:
    print(f"\n{csv_file}")
    scores = evaluate_file(csv_file)
    for metric, value in scores.items():
        print(f"{metric:<25}: {value}%")


Evaluation/predictions_BLIP_Baseline.csv
Exact Match Accuracy     : 46.57%
Synonym Accuracy         : 50.71%
Exact Match Precision    : 46.57%
Exact Match Recall       : 100.0%
Exact Match F1           : 63.54%
ROUGE-L F1               : 47.55%
BERTScore Precision      : 91.43%
BERTScore Recall         : 88.46%
BERTScore F1             : 89.78%
Cosine Similarity        : 74.36%

Evaluation/predictions_Vilt_Baseline.csv
Exact Match Accuracy     : 27.53%
Synonym Accuracy         : 29.61%
Exact Match Precision    : 27.53%
Exact Match Recall       : 100.0%
Exact Match F1           : 43.18%
ROUGE-L F1               : 28.02%
BERTScore Precision      : 90.73%
BERTScore Recall         : 88.68%
BERTScore F1             : 89.54%
Cosine Similarity        : 63.54%

Evaluation/predictions_BLIP_r8.csv
Exact Match Accuracy     : 62.32%
Synonym Accuracy         : 64.57%
Exact Match Precision    : 62.32%
Exact Match Recall       : 100.0%
Exact Match F1           : 76.78%
ROUGE-L F1               : 63.

In [35]:
csv_files = [
    "Evaluation/predictions_Blip_LoRa_r16_key.csv"
]

for csv_file in csv_files:
    print(f"\n{csv_file}")
    scores = evaluate_file(csv_file)
    for metric, value in scores.items():
        print(f"{metric:<25}: {value}%")



Evaluation/predictions_Blip_LoRa_r16_key.csv
Exact Match Accuracy     : 59.6%
Synonym Accuracy         : 62.11%
Exact Match Precision    : 59.6%
Exact Match Recall       : 100.0%
Exact Match F1           : 74.69%
ROUGE-L F1               : 61.19%
BERTScore Precision      : 93.87%
BERTScore Recall         : 92.48%
BERTScore F1             : 93.07%
Cosine Similarity        : 80.69%
