In [1]:
import pandas as pd



In [2]:
# Configuration: column names for the header-less prediction CSVs, the dataset
# under evaluation, and the two prediction files to compare -- one produced by
# the dataset-specific model, one by the "All" model.

columns = ["Source", "Target", "Prediction"]
dataset = "Sports"

path_specific_model = f"5_predictions/multi_label_true/{dataset}/predictions_{dataset}_dataset_with_{dataset}_model.csv"
path_full_model = f"5_predictions/multi_label_true/{dataset}/predictions_{dataset}_dataset_with_All_model.csv"

In [3]:
# Read both prediction files into DataFrames. The CSVs carry no header row
# (header=None), so the column names are supplied from `columns`.

df_prediction_specific, df_prediction_full = (
    pd.read_csv(csv_path, sep=',', header=None, names=columns)
    for csv_path in (path_specific_model, path_full_model)
)

In [4]:
# Sanity check: print the (rows, columns) shape of each frame, one per line.
print(df_prediction_specific.shape, df_prediction_full.shape, sep="\n")

(1167, 3)
(1167, 3)


In [5]:
# Calculate ROUGE metrics

from rouge import Rouge

In [6]:
rouge = Rouge()

# Averaged ROUGE scores (avg=True) of each model's predictions against the
# targets; the dataset-specific model is evaluated first, then the "All" model.
rouge_scores_specific, rouge_scores_full = (
    rouge.get_scores(frame["Prediction"], frame["Target"], avg=True)
    for frame in (df_prediction_specific, df_prediction_full)
)

In [7]:
import pprint

In [8]:
# Pretty-print the averaged ROUGE scores, first for the dataset-specific model
# and then for the "All" model, each under its own header.
for _model_label, _rouge_scores in ((dataset, rouge_scores_specific), ("All", rouge_scores_full)):
    print(f'\n\nROUGE scores for {dataset} dataset with {_model_label} model. \n')
    pprint.pprint(_rouge_scores)



ROUGE scores for Sports dataset with Sports model. 

{'rouge-1': {'f': 0.51617681826468,
             'p': 0.4064753462666721,
             'r': 0.7577614476222978},
 'rouge-2': {'f': 0.33486253611473193,
             'p': 0.2527580630777826,
             'r': 0.5592082049728957},
 'rouge-l': {'f': 0.5015817835285318,
             'p': 0.39494530810842793,
             'r': 0.7365048981024421}}


ROUGE scores for Sports dataset with All model. 

{'rouge-1': {'f': 0.5796625756783801,
             'p': 0.4637787273137406,
             'r': 0.8211833459237012},
 'rouge-2': {'f': 0.42592453198753416,
             'p': 0.3260831841898893,
             'r': 0.6883455632667943},
 'rouge-l': {'f': 0.5718549968595438,
             'p': 0.45763805858991374,
             'r': 0.8097400571958293}}


In [9]:
# Calculate BLEU metrics

from nltk.translate.bleu_score import sentence_bleu

In [10]:
def calculate_bleu(predictions, targets):
    """Average sentence-level BLEU unigram and bigram scores over a corpus.

    Each prediction is paired with the target at the same *position*, both are
    tokenised on whitespace, and ``sentence_bleu`` is evaluated twice per pair:
    once with weights ``(1, 0, 0, 0)`` and once with weights ``(0, 1, 0, 0)``.
    The per-pair scores are then averaged over all pairs.

    Args:
        predictions: Sized iterable of hypothesis strings (e.g. a list or a
            pandas Series).
        targets: Sized iterable of reference strings, same length as
            ``predictions``.

    Returns:
        A two-element list ``[score_bleu1, score_bleu2]`` holding the mean
        score for each weighting. ``[0.0, 0.0]`` when there are no pairs.

    Raises:
        ValueError: If ``predictions`` and ``targets`` differ in length.
    """
    n_pairs = len(predictions)
    if n_pairs != len(targets):
        raise ValueError(
            f"predictions and targets must have the same length "
            f"(got {n_pairs} and {len(targets)})"
        )
    if n_pairs == 0:
        # Nothing to score; avoid dividing by zero below.
        return [0.0, 0.0]

    score_bleu1 = 0.
    score_bleu2 = 0.

    # zip() pairs items by position, so a pandas Series whose index is not the
    # default 0..n-1 range is handled the same way as a plain list.
    for prediction, target in zip(predictions, targets):
        references = [target.split()]
        hypothesis = prediction.split()
        score_bleu1 += sentence_bleu(references, hypothesis, weights=(1, 0, 0, 0))
        score_bleu2 += sentence_bleu(references, hypothesis, weights=(0, 1, 0, 0))

    return [score_bleu1 / n_pairs, score_bleu2 / n_pairs]

In [11]:
# [BLEU-1, BLEU-2] averages for the dataset-specific model's predictions.
bleu_scores_specific = calculate_bleu(
    predictions=df_prediction_specific["Prediction"],
    targets=df_prediction_specific["Target"],
)

# [BLEU-1, BLEU-2] averages for the "All" model's predictions.
bleu_scores_full = calculate_bleu(
    predictions=df_prediction_full["Prediction"],
    targets=df_prediction_full["Target"],
)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [12]:
# Report the averaged [BLEU-1, BLEU-2] scores for both models.
# The headers start with two newlines ('\n\n'); writing '\n\B' instead is an
# invalid escape sequence that prints a literal backslash before "BLEU".
print(f'\n\nBLEU scores for {dataset} dataset with {dataset} model. \n')
pprint.pprint(bleu_scores_specific)

print(f'\n\nBLEU scores for {dataset} dataset with All model. \n')
pprint.pprint(bleu_scores_full)


\BLEU scores for Sports dataset with Sports model. 

[0.3228586900523488, 0.23442490817235054]

\BLEU scores for Sports dataset with All model. 

[0.3643774389837662, 0.2983391891572673]


In [13]:
# Calculate BERTScore metrics

from evaluate import load
from statistics import mean



In [14]:
# Name of the underlying model handed to BERTScore via `model_type`.
model = "microsoft/deberta-xlarge-mnli"

# Load the BERTScore metric through the `evaluate` library.
bertscore = load("bertscore")

In [15]:
# BERTScore for the predictions of the dataset-specific model.
predictions_specific = df_prediction_specific["Prediction"]
references_specific = df_prediction_specific["Target"]
results_specific = bertscore.compute(
    predictions=predictions_specific,
    references=references_specific,
    lang="en",
    model_type=model,
)

# Collapse the scores stored under each key into a single mean value.
f1_score_specific, precision_specific, recall_specific = (
    mean(results_specific[metric_key]) for metric_key in ("f1", "precision", "recall")
)

print(f'BERTScore for {dataset} dataset with {dataset} model. \n')
print(
    f'F1: {f1_score_specific}',
    f'Precision: {precision_specific}',
    f'Recall: {recall_specific}',
    sep='\n',
)

BERTScore for Sports dataset with Sports model. 

F1: 0.7121308270998254
Precision: 0.6302744090786936
Recall: 0.8267768141338737


In [16]:
# BERTScore for the predictions of the "All" model.
predictions_full = df_prediction_full["Prediction"]
references_full = df_prediction_full["Target"]
results_full = bertscore.compute(predictions=predictions_full, references=references_full, lang="en", model_type=model)

# Keep the means under *_full names so they do not overwrite the *_specific
# values computed for the dataset-specific model; both sets stay available
# for later comparison.
f1_score_full = mean(results_full["f1"])
precision_full = mean(results_full["precision"])
recall_full = mean(results_full["recall"])

print(f'BERTScore for {dataset} dataset with All model. \n')
print(f'F1: {f1_score_full}')
print(f'Precision: {precision_full}')
print(f'Recall: {recall_full}')

BERTScore for Sports dataset with All model. 

F1: 0.695192341632892
Precision: 0.5887148081745566
Recall: 0.8705070673551126
