# Evaluating Performance of Models


In [10]:
# gather all imports

import bert_score
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
from transformers import logging as transformers_logging
from collections import Counter
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import pandas as pd
import ast

Now we need to parse the results files to get the model responses. This will include all the necessary information for comparison.

In [123]:
def parse_results(file_path, model_type):
    """
    Function to parse all the result files. I was building these models over time and should have kept them all in the same form, but instead of re-running I will just process like this.
    
    @PARAMS:
        - file_path  -> results file
        - model_type -> flag to specify which model to parse
    """
    entries = []
    
    # establish some flags based on the model input
    if model_type == "gpt2_baseline":
        response_label = "Baseline Response:"
        single_response = True
    elif model_type == "gpt2_finetuned" or model_type == "ensemble":
        response_label = "Answer:"
        single_response = True
    elif model_type == "llama":
        baseline_label = "Baseline Response:"
        finetuned_label = "Fine-tuned Response:"
        single_response = False
    elif model_type == "openai":
        baseline_label = "Baseline Model Response:"
        finetuned_label = "Fine-tuned Model Response:"
        single_response = False
    else:
        raise ValueError(f"Unknown model type: {model_type}")
    
    # read and split file based on the = delimiter
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    raw_entries = content.split('=' * 80)
    
    for entry in raw_entries:
        if not entry.strip():
            continue
            
        lines = entry.strip().split('\n')
        query = ""
        baseline = ""
        finetuned = ""
        response = ""
        expected = ""
        
        # parse based on flags set above
        for line in lines:
            if line.startswith("Question:"):
                query = line.replace("Question:", "").strip()
            elif single_response and line.startswith(response_label):
                response = line.replace(response_label, "").strip()
            elif not single_response and line.startswith(baseline_label):
                baseline = line.replace(baseline_label, "").strip()
            elif not single_response and line.startswith(finetuned_label):
                finetuned = line.replace(finetuned_label, "").strip()
            elif line.startswith("Expected Response:"):
                expected = line.replace("Expected Response:", "").strip()
        
        # append the values
        if query and expected:
            if single_response and response:
                entries.append((query, response, expected))
            elif not single_response and baseline and finetuned:
                entries.append((query, baseline, finetuned, expected))
                
    return entries

Print the first example of reading through each file to make sure it worked properly.

In [126]:
print("GPT-2 Baseline Results First Example:")
gpt2_baseline_results = parse_results("results/gpt2_baseline_results.txt", "gpt2_baseline")
print("Question:", gpt2_baseline_results[0][0])
print("Response:", gpt2_baseline_results[0][1])
print("Expected Response:", gpt2_baseline_results[0][2])
print()

print("GPT-2 Fine-Tuned Results First Example:")
gpt2_fine_tuned_results = parse_results("results/gpt2_fine_tuned_results.txt", "gpt2_finetuned")
print("Question:", gpt2_fine_tuned_results[0][0])
print("Response:", gpt2_fine_tuned_results[0][1])
print("Expected Response:", gpt2_fine_tuned_results[0][2])
print()

print("LLAMA Results First Example:")
llama_results = parse_results("results/llama_results.txt", "llama")
question, baseline, finetuned, expected = llama_results[0]
print("Question:", question)
print("Baseline Response:", baseline)
print("Fine-tuned Response:", finetuned)
print("Expected Response:", expected)
print()

print("OpenAI Results First Example:")
openai_results = parse_results("results/openai_model_comparison.txt", "openai")
question, baseline, finetuned, expected = openai_results[0]
print("Question:", question)
print("Baseline Response:", baseline)
print("Fine-tuned Response:", finetuned)
print("Expected Response:", expected)
print()

print("Ensemble Results First Example:")
ensemble_results = parse_results("results/ensemble_results.txt", "ensemble")
print("Question:", ensemble_results[0][0])
print("Response:", ensemble_results[0][1])
print("Expected Response:", ensemble_results[0][2])
print()

GPT-2 Baseline Results First Example:
Question: can i be pregnant if i had unprotected sex the 4th day of being on the depo?
Response: i have a lot of other options.
Expected Response: yes you can. the depo will take about a month or two to take full effect. even then it is not 100% effective.

GPT-2 Fine-Tuned Results First Example:
Question: can i be pregnant if i had unprotected sex the 4th day of being on the depo? . i had my period the 4 th day of my period. i had a period the 5th day. i was supposed to have my period on the 4 and 5th of this month. i have a period on my 5th. i am not pregnant. i did not have a cramp or cramp-like feeling. i took a test the 4 days after i was on the pill. i'm not sure if it was a test or if i am pregnant.
Response: hi its not a good idea to take a test for a period. if you miss a period then your pregnant. but if you are having unprotected sex then you should be ok. good luck
Expected Response: yes you can. the depo will take about a month or two 

In [118]:
class Evaluator:
    """Class to evaluate a model response against an expected response using semantic and lexical metrics"""
    def __init__(self):
        # initialize BERT model for embeddings
        self.model_name = "emilyalsentzer/Bio_ClinicalBERT"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)
        
        # initialize ROUGE scorer
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def get_embedding(self, text):
        """
        Function to calculate the embeddings of a piece of text based off the bio bert model!

        @PARAMS:
            - text -> what to calculate embeddings of
        """
        inputs = self.tokenizer(
            text, 
            return_tensors="pt", 
            truncation=True,
            max_length=512,
            padding='max_length'
        )
        
        with torch.no_grad():
            outputs = self.model(**inputs)
        
        attention_mask = inputs['attention_mask']
        last_hidden_state = outputs.last_hidden_state
        
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        
        return mean_embeddings.numpy()

    def calculate_metrics(self, model_response, expected_response):
        """
        Calculate multiple metrics comparing model response to expected response.

        @PARAMS:
            - model_response    -> what we are comparing to the expected response
            - expected_response -> the testing example actual doctor response
        """
        metrics = {}
        
        # tokenize responses
        try:
            model_tokens = word_tokenize(model_response.lower())
            expected_tokens = word_tokenize(expected_response.lower())
        except Exception as e:
            print(f"Tokenization failed: {e}")
            model_tokens = model_response.lower().split()
            expected_tokens = expected_response.lower().split()
            
        # calculate METEOR score
        try:
            metrics['meteor'] = meteor_score([expected_tokens], model_tokens)
        except Exception as e:
            print(f"METEOR score calculation failed: {e}")
            metrics['meteor'] = 0.0
            
        # calculate BERTScore
        try:
            P, R, F1 = bert_score.score([model_response], [expected_response], lang='en')
            metrics['bertscore_precision'] = P.item()
            metrics['bertscore_recall'] = R.item()
            metrics['bertscore_f1'] = F1.item()
        except Exception as e:
            print(f"BERTScore calculation failed: {e}")
            metrics['bertscore_precision'] = 0.0
            metrics['bertscore_recall'] = 0.0
            metrics['bertscore_f1'] = 0.0
            
        # calculate ROUGE scores
        try:
            rouge_scores = self.rouge_scorer.score(model_response, expected_response)
            metrics['rouge1_precision'] = rouge_scores['rouge1'].precision
            metrics['rouge1_recall'] = rouge_scores['rouge1'].recall
            metrics['rouge1_f1'] = rouge_scores['rouge1'].fmeasure
            metrics['rouge2_precision'] = rouge_scores['rouge2'].precision
            metrics['rouge2_recall'] = rouge_scores['rouge2'].recall
            metrics['rouge2_f1'] = rouge_scores['rouge2'].fmeasure
            metrics['rougeL_precision'] = rouge_scores['rougeL'].precision
            metrics['rougeL_recall'] = rouge_scores['rougeL'].recall
            metrics['rougeL_f1'] = rouge_scores['rougeL'].fmeasure
        except Exception as e:
            print(f"ROUGE score calculation failed: {e}")
            for metric in ['rouge1', 'rouge2', 'rougeL']:
                metrics[f'{metric}_precision'] = 0.0
                metrics[f'{metric}_recall'] = 0.0
                metrics[f'{metric}_f1'] = 0.0
        
        # now calculate cosine similarity with the bio embeddings!!
        try:
            model_embedding = self.get_embedding(model_response)
            expected_embedding = self.get_embedding(expected_response)
            similarity = cosine_similarity(model_embedding, expected_embedding)
            metrics['cosine_similarity'] = similarity[0][0]
        except Exception as e:
            print(f"Cosine similarity calculation failed: {e}")
            metrics['cosine_similarity'] = 0.0
            
        return metrics

In [129]:
# initialize the evaluator to run on each of the testing examples!
evaluator = Evaluator()

########################################################################################################################################################

# GPT2 BASELINE!
gpt2_baseline_metrics = []
with open("GPT_BASELINE_METRICS.txt", "w") as f:
    for index in range(len(gpt2_baseline_results)):
        #                                                 model response             expected response
        result = evaluator.calculate_metrics(gpt2_baseline_results[index][1], gpt2_baseline_results[index][2])
        gpt2_baseline_metrics.append(result)
        f.write(f"{result}\n")
print("First 10 examples of GPT2 Baseline!")
for i in gpt2_baseline_metrics[:10]:
    print(i)
print()

########################################################################################################################################################

# GPT2 FINE-TUNED!
gpt2_fine_tuned_metrics = []
with open("GPT_FINETUNED_METRICS.txt", "w") as f:
    for index in range(len(gpt2_fine_tuned_results)):
        #                                                 model response             expected response
        result = evaluator.calculate_metrics(gpt2_fine_tuned_results[index][1], gpt2_fine_tuned_results[index][2])
        gpt2_fine_tuned_metrics.append(result)
        f.write(f"{result}\n")
print("First 10 examples of GPT2 Baseline!")
for i in gpt2_fine_tuned_metrics[:10]:
    print(i)
print()

########################################################################################################################################################

# LLAMA RESULTS!
llama_baseline_metrics = []
llama_fine_tuned_metrics = []


for index in range(len(llama_results)):
    question, baseline, finetuned, expected = llama_results[index]
    #                                                   model response  expected response
    llama_baseline_metrics.append(evaluator.calculate_metrics(baseline, expected))
    llama_fine_tuned_metrics.append(evaluator.calculate_metrics(finetuned, expected))
print("First 10 examples of LLAMA Baseline!")
with open("LLAMA_BASELINE_METRICS.txt", "w") as f:
    for index, value in enumerate(llama_baseline_metrics):
        if index < 11:
            print(value)
        f.write(f"{value}\n")
print("\nFirst 10 examples of LLAMA Fine-Tuned!")
with open("LLAMA_FINETUNED_METRICS.txt", "w") as f:
    for index, value in enumerate(llama_fine_tuned_metrics):
        if index < 11:
            print(value)
        f.write(f"{value}\n")
print()

########################################################################################################################################################

# OPENAI RESULTS!
openai_baseline_metrics = []
openai_fine_tuned_metrics = []

for index in range(len(openai_results)):
    question, baseline, finetuned, expected = openai_results[index]
    #                                                   model response  expected response
    openai_baseline_metrics.append(evaluator.calculate_metrics(baseline, expected))
    openai_fine_tuned_metrics.append(evaluator.calculate_metrics(finetuned, expected))
print("First 10 examples of OPENAI Baseline!")
with open("OPENAI_BASELINE_METRICS.txt", "w") as f:
    for index, value in enumerate(openai_baseline_metrics):
        if index < 11:
            print(value)
        f.write(f"{value}\n")
print("\nFirst 10 examples of OPENAI Fine-Tuned!")
with open("OPENAI_FINETUNED_METRICS.txt", "w") as f:
    for index, value in enumerate(openai_fine_tuned_metrics):
        if index < 11:
            print(value)
        f.write(f"{value}\n")
print()

########################################################################################################################################################

# ENSEMBLE RESULTS!
ensemble_metrics = []

with open("ENSEMBLE_METRICS.txt", "w") as f:
    for index in range(len(ensemble_results)):
        #                                         model response             expected response
        result = evaluator.calculate_metrics(ensemble_results[index][1], ensemble_results[index][2])
        ensemble_metrics.append(result)
        f.write(f"{result}\n")
print("First 10 examples of the Ensemble Model!")
for i in ensemble_metrics[:10]:
    print(i)

First 10 examples of GPT2 Baseline!
{'meteor': 0.05976095617529879, 'bertscore_precision': 0.8740603923797607, 'bertscore_recall': 0.8354758024215698, 'bertscore_f1': 0.854332685470581, 'rouge1_precision': 0.043478260869565216, 'rouge1_recall': 0.14285714285714285, 'rouge1_f1': 0.06666666666666667, 'rouge2_precision': 0.0, 'rouge2_recall': 0.0, 'rouge2_f1': 0.0, 'rougeL_precision': 0.043478260869565216, 'rougeL_recall': 0.14285714285714285, 'rougeL_f1': 0.06666666666666667, 'cosine_similarity': 0.8436091}
{'meteor': 0.07518535445866341, 'bertscore_precision': 0.8463988304138184, 'bertscore_recall': 0.8016108274459839, 'bertscore_f1': 0.8233962059020996, 'rouge1_precision': 0.10407239819004525, 'rouge1_recall': 0.3770491803278688, 'rouge1_f1': 0.1631205673758865, 'rouge2_precision': 0.04090909090909091, 'rouge2_recall': 0.15, 'rouge2_f1': 0.06428571428571428, 'rougeL_precision': 0.08597285067873303, 'rougeL_recall': 0.3114754098360656, 'rougeL_f1': 0.13475177304964536, 'cosine_similarit

Awesome! Now that we have all the metrics, lets write a quick function to parse it and compute some basic statsitics to get the important insights!

In [2]:
def analyze_metrics(data_list):
    """
    Function to take a list of dictionary data values and compute metrics for each of the keys.
    
    @PARAMS:
        - data -> all metric values
    """
    # convert list of dictionaries to DataFrame for easier analysis
    df = pd.DataFrame(data_list)
    
    # value to store results
    results = {}
    
    # compute statistics for each metric
    for column in df.columns:
        stats = {
            'mean': df[column].mean(),
            'median': df[column].median(),
            'std': df[column].std(),
            'min': df[column].min(),
            'max': df[column].max(),
            'q1': df[column].quantile(0.25),
            'q3': df[column].quantile(0.75),
            'iqr': df[column].quantile(0.75) - df[column].quantile(0.25),
            'count': df[column].count()
        }
        # append results of the column
        results[column] = stats
    
    return results

def print_analysis(results) -> None:
    """
    Function to print the results of the stats of the metrics!
    
    @PARAMS:
        - results -> data with the stats of the metrics
    """
    for metric, stats in results.items():
        print(f"\n=== {metric} ===")
        for stat_name, value in stats.items():
            # round to 4 decimal places
            print(f"{stat_name}: {round(value, 4)}")

In [11]:
# run on GPT2 baseline!
gpt_baseline_metrics = []
with open("GPT_BASELINE_METRICS.txt", 'r') as file:
    for line in file:
        gpt_baseline_metrics.append(ast.literal_eval(line.strip()))
print_analysis(analyze_metrics(gpt_baseline_metrics))


=== meteor ===
mean: 0.0933
median: 0.083
std: 0.069
min: 0.0
max: 0.605
q1: 0.0386
q3: 0.1335
iqr: 0.0948
count: 2340

=== bertscore_precision ===
mean: 0.8514
median: 0.8503
std: 0.0271
min: 0.6886
max: 0.99
q1: 0.8357
q3: 0.8648
iqr: 0.0291
count: 2340

=== bertscore_recall ===
mean: 0.8176
median: 0.8174
std: 0.028
min: 0.7043
max: 0.99
q1: 0.8001
q3: 0.8342
iqr: 0.0341
count: 2340

=== bertscore_f1 ===
mean: 0.8338
median: 0.834
std: 0.0208
min: 0.741
max: 0.99
q1: 0.8213
q3: 0.8462
iqr: 0.0249
count: 2340

=== rouge1_precision ===
mean: 0.1346
median: 0.1121
std: 0.1159
min: 0.0
max: 1.0
q1: 0.0465
q3: 0.197
iqr: 0.1505
count: 2340

=== rouge1_recall ===
mean: 0.3124
median: 0.2895
std: 0.2176
min: 0.0
max: 1.0
q1: 0.1479
q3: 0.4545
iqr: 0.3067
count: 2340

=== rouge1_f1 ===
mean: 0.1456
median: 0.1429
std: 0.094
min: 0.0
max: 0.5714
q1: 0.0725
q3: 0.2098
iqr: 0.1373
count: 2340

=== rouge2_precision ===
mean: 0.017
median: 0.0044
std: 0.0352
min: 0.0
max: 0.5
q1: 0.0
q3: 0.0225

In [12]:
# run on GPT2 finetuned!
gpt_finetuned_metrics = []
with open("GPT_FINETUNED_METRICS.txt", 'r') as file:
    for line in file:
        gpt_finetuned_metrics.append(ast.literal_eval(line.strip()))
print_analysis(analyze_metrics(gpt_finetuned_metrics))


=== meteor ===
mean: 0.1364
median: 0.1241
std: 0.0922
min: 0.0
max: 0.8837
q1: 0.0766
q3: 0.178
iqr: 0.1014
count: 2336

=== bertscore_precision ===
mean: 0.8503
median: 0.8504
std: 0.0262
min: 0.7552
max: 0.9808
q1: 0.8345
q3: 0.8661
iqr: 0.0315
count: 2336

=== bertscore_recall ===
mean: 0.83
median: 0.8293
std: 0.0288
min: 0.7434
max: 0.988
q1: 0.812
q3: 0.8466
iqr: 0.0346
count: 2336

=== bertscore_f1 ===
mean: 0.8397
median: 0.8392
std: 0.0222
min: 0.7662
max: 0.9803
q1: 0.8264
q3: 0.8523
iqr: 0.0259
count: 2336

=== rouge1_precision ===
mean: 0.1964
median: 0.1766
std: 0.1316
min: 0.0
max: 1.0
q1: 0.1
q3: 0.2667
iqr: 0.1667
count: 2336

=== rouge1_recall ===
mean: 0.3305
median: 0.318
std: 0.1897
min: 0.0
max: 1.0
q1: 0.1818
q3: 0.4615
iqr: 0.2797
count: 2336

=== rouge1_f1 ===
mean: 0.2028
median: 0.1956
std: 0.1071
min: 0.0
max: 0.8602
q1: 0.129
q3: 0.2674
iqr: 0.1384
count: 2336

=== rouge2_precision ===
mean: 0.0371
median: 0.02
std: 0.0727
min: 0.0
max: 0.9167
q1: 0.0
q3: 

In [13]:
# run on LLAMA baseline!
llama_baseline_metrics = []
with open("LLAMA_BASELINE_METRICS.txt", 'r') as file:
    for line in file:
        llama_baseline_metrics.append(ast.literal_eval(line.strip()))
print_analysis(analyze_metrics(llama_baseline_metrics))


=== meteor ===
mean: 0.1044
median: 0.0925
std: 0.0668
min: 0.0273
max: 0.2616
q1: 0.0529
q3: 0.1274
iqr: 0.0745
count: 20

=== bertscore_precision ===
mean: 0.8355
median: 0.8355
std: 0.0221
min: 0.7928
max: 0.8953
q1: 0.8242
q3: 0.8479
iqr: 0.0237
count: 20

=== bertscore_recall ===
mean: 0.8239
median: 0.8156
std: 0.0316
min: 0.781
max: 0.8793
q1: 0.7957
q3: 0.8469
iqr: 0.0512
count: 20

=== bertscore_f1 ===
mean: 0.8292
median: 0.8254
std: 0.0193
min: 0.803
max: 0.8816
q1: 0.8174
q3: 0.8358
iqr: 0.0184
count: 20

=== rouge1_precision ===
mean: 0.1634
median: 0.1474
std: 0.11
min: 0.0299
max: 0.3878
q1: 0.0753
q3: 0.2001
iqr: 0.1248
count: 20

=== rouge1_recall ===
mean: 0.3176
median: 0.2949
std: 0.1894
min: 0.0252
max: 0.7561
q1: 0.2345
q3: 0.3636
iqr: 0.1292
count: 20

=== rouge1_f1 ===
mean: 0.1601
median: 0.1663
std: 0.0825
min: 0.0468
max: 0.35
q1: 0.09
q3: 0.2051
iqr: 0.1152
count: 20

=== rouge2_precision ===
mean: 0.0163
median: 0.0093
std: 0.0221
min: 0.0
max: 0.0889
q1: 

In [14]:
# run on LLAMA finetuned!
llama_finetuned_metrics = []
with open("LLAMA_FINETUNED_METRICS.txt", 'r') as file:
    for line in file:
        llama_finetuned_metrics.append(ast.literal_eval(line.strip()))
print_analysis(analyze_metrics(llama_finetuned_metrics))


=== meteor ===
mean: 0.2252
median: 0.2114
std: 0.1521
min: 0.0766
max: 0.8233
q1: 0.1598
q3: 0.2276
iqr: 0.0678
count: 20

=== bertscore_precision ===
mean: 0.8397
median: 0.8379
std: 0.0234
min: 0.8033
max: 0.9017
q1: 0.8283
q3: 0.8495
iqr: 0.0212
count: 20

=== bertscore_recall ===
mean: 0.8453
median: 0.8414
std: 0.0381
min: 0.7892
max: 0.9751
q1: 0.8284
q3: 0.8561
iqr: 0.0277
count: 20

=== bertscore_f1 ===
mean: 0.8421
median: 0.8412
std: 0.0257
min: 0.8107
max: 0.937
q1: 0.8289
q3: 0.8459
iqr: 0.017
count: 20

=== rouge1_precision ===
mean: 0.3494
median: 0.3299
std: 0.1933
min: 0.0829
max: 0.9348
q1: 0.2456
q3: 0.4103
iqr: 0.1647
count: 20

=== rouge1_recall ===
mean: 0.312
median: 0.2849
std: 0.1833
min: 0.0549
max: 0.7661
q1: 0.2096
q3: 0.3795
iqr: 0.1699
count: 20

=== rouge1_f1 ===
mean: 0.2752
median: 0.2655
std: 0.1191
min: 0.0971
max: 0.637
q1: 0.1979
q3: 0.3234
iqr: 0.1255
count: 20

=== rouge2_precision ===
mean: 0.0935
median: 0.0435
std: 0.1864
min: 0.0
max: 0.8667


In [15]:
# run on OpenAI baseline!
openai_baseline_metrics = []
with open("OPENAI_BASELINE_METRICS.txt", 'r') as file:
    for line in file:
        openai_baseline_metrics.append(ast.literal_eval(line.strip()))
print_analysis(analyze_metrics(openai_baseline_metrics))


=== meteor ===
mean: 0.1612
median: 0.1533
std: 0.08
min: 0.0275
max: 0.3945
q1: 0.103
q3: 0.2122
iqr: 0.1092
count: 100

=== bertscore_precision ===
mean: 0.8525
median: 0.8536
std: 0.0232
min: 0.7839
max: 0.8972
q1: 0.8378
q3: 0.8715
iqr: 0.0337
count: 100

=== bertscore_recall ===
mean: 0.8407
median: 0.838
std: 0.0288
min: 0.786
max: 0.93
q1: 0.8221
q3: 0.86
iqr: 0.0379
count: 100

=== bertscore_f1 ===
mean: 0.8462
median: 0.8434
std: 0.0185
min: 0.8005
max: 0.8929
q1: 0.8351
q3: 0.8602
iqr: 0.0251
count: 100

=== rouge1_precision ===
mean: 0.2563
median: 0.222
std: 0.1558
min: 0.0
max: 0.8
q1: 0.1534
q3: 0.3333
iqr: 0.1799
count: 100

=== rouge1_recall ===
mean: 0.3232
median: 0.3031
std: 0.1828
min: 0.0
max: 0.8333
q1: 0.1775
q3: 0.4287
iqr: 0.2512
count: 100

=== rouge1_f1 ===
mean: 0.2259
median: 0.2313
std: 0.0904
min: 0.0
max: 0.4478
q1: 0.1681
q3: 0.279
iqr: 0.1109
count: 100

=== rouge2_precision ===
mean: 0.048
median: 0.0284
std: 0.0675
min: 0.0
max: 0.5
q1: 0.0077
q3: 0

In [16]:
# run on OpenAI finetuned!
openai_finetuned_metrics = []
with open("OPENAI_FINETUNED_METRICS.txt", 'r') as file:
    for line in file:
        openai_finetuned_metrics.append(ast.literal_eval(line.strip()))
print_analysis(analyze_metrics(openai_finetuned_metrics))


=== meteor ===
mean: 0.1855
median: 0.1795
std: 0.1093
min: 0.0165
max: 0.8582
q1: 0.1024
q3: 0.2546
iqr: 0.1522
count: 100

=== bertscore_precision ===
mean: 0.823
median: 0.8201
std: 0.0298
min: 0.7669
max: 0.9642
q1: 0.8054
q3: 0.8391
iqr: 0.0336
count: 100

=== bertscore_recall ===
mean: 0.8383
median: 0.8356
std: 0.0279
min: 0.7814
max: 0.9658
q1: 0.8184
q3: 0.8551
iqr: 0.0366
count: 100

=== bertscore_f1 ===
mean: 0.8302
median: 0.8301
std: 0.023
min: 0.7934
max: 0.965
q1: 0.8153
q3: 0.8403
iqr: 0.025
count: 100

=== rouge1_precision ===
mean: 0.3423
median: 0.3333
std: 0.1898
min: 0.0213
max: 0.84
q1: 0.1965
q3: 0.4874
iqr: 0.2909
count: 100

=== rouge1_recall ===
mean: 0.249
median: 0.206
std: 0.177
min: 0.0081
max: 0.8889
q1: 0.1275
q3: 0.328
iqr: 0.2006
count: 100

=== rouge1_f1 ===
mean: 0.214
median: 0.2173
std: 0.1128
min: 0.0157
max: 0.8571
q1: 0.1554
q3: 0.2622
iqr: 0.1068
count: 100

=== rouge2_precision ===
mean: 0.0549
median: 0.0256
std: 0.0998
min: 0.0
max: 0.8367


In [17]:
# run on Ensemble!
ensemble_metrics = []
with open("ENSEMBLE_METRICS.txt", 'r') as file:
    for line in file:
        ensemble_metrics.append(ast.literal_eval(line.strip()))
print_analysis(analyze_metrics(ensemble_metrics))


=== meteor ===
mean: 0.1778
median: 0.1894
std: 0.0982
min: 0.0
max: 0.6995
q1: 0.1077
q3: 0.2345
iqr: 0.1269
count: 100

=== bertscore_precision ===
mean: 0.8165
median: 0.8201
std: 0.0335
min: 0.6869
max: 0.9527
q1: 0.7983
q3: 0.8374
iqr: 0.0391
count: 100

=== bertscore_recall ===
mean: 0.8359
median: 0.8357
std: 0.0288
min: 0.75
max: 0.9512
q1: 0.8213
q3: 0.8531
iqr: 0.0318
count: 100

=== bertscore_f1 ===
mean: 0.8255
median: 0.8252
std: 0.0236
min: 0.7546
max: 0.9014
q1: 0.8137
q3: 0.8372
iqr: 0.0235
count: 100

=== rouge1_precision ===
mean: 0.3336
median: 0.3012
std: 0.2018
min: 0.0
max: 0.9474
q1: 0.1756
q3: 0.4743
iqr: 0.2987
count: 100

=== rouge1_recall ===
mean: 0.2368
median: 0.2157
std: 0.1615
min: 0.0
max: 1.0
q1: 0.1121
q3: 0.3412
iqr: 0.2291
count: 100

=== rouge1_f1 ===
mean: 0.206
median: 0.1997
std: 0.1047
min: 0.0
max: 0.4284
q1: 0.1283
q3: 0.284
iqr: 0.1558
count: 100

=== rouge2_precision ===
mean: 0.0601
median: 0.032
std: 0.1064
min: 0.0
max: 0.8571
q1: 0.005