# CS263 Final Project Notebook 3

## Automated Metrics for Evaluation

Please update the paths in the 4th cell before running this notebook. You can get the files here: https://github.com/iangalvez/cs263-vlm-radiology-eval/tree/main

In [40]:
!pip install bert-score
!pip install rouge-score



In [41]:
# Mount Google Drive so the files under /content/drive are readable.
# The google.colab package is not importable in a regular local Jupyter
# environment, so the mount is skipped there instead of aborting the
# notebook; update the paths in the configuration cell in that case.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print('google.colab is not available; skipping Google Drive mount.')
else:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
import json
import os

from bert_score import BERTScorer, score
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge_score import rouge_scorer

In [43]:
# Please update the paths here if running this notebook locally.
# Every path below is derived from `project_dir`, so in the common case only
# that one line needs to change.

# Lead-in text that is stripped from the start of a model response (when
# present) before scoring: one form for zero-shot runs, one for few-shot runs.
prefix_zero_shot = 'Findings: '
prefix_few_shot = 'Findings for sample x: '

project_dir = '/content/drive/My Drive/[CS 263 - Fall 2024] Final Project'
data_dir = f'{project_dir}/MIMIC-CXR-filtered'
results_dir = f'{project_dir}/Results'

# NOTE: the few-shot directory is a subdirectory of the zero-shot directory,
# so a recursive walk of `zero_shot_data_path` also visits the files in `test`.
zero_shot_data_path = data_dir
few_shot_data_path = f'{data_dir}/test'

zero_shot_4o_mini_responses_path = f'{results_dir}/263_zeroshot_gpt-4o-mini.json'
few_shot_4o_mini_responses_path = f'{results_dir}/263_fewshot_gpt-4o-mini.json'
zero_shot_4o_responses_path = f'{results_dir}/263_zeroshot_gpt-4o.json'
few_shot_4o_responses_path = f'{results_dir}/263_fewshot_gpt-4o.json'
few_shot_4_turbo_responses_path = f'{results_dir}/263_fewshot_gpt-4-turbo.json'

# The order matters: later cells treat positions 0 and 1 as zero-shot runs and
# every other position as a few-shot run.
response_paths = [
    zero_shot_4o_mini_responses_path,
    zero_shot_4o_responses_path,
    few_shot_4o_mini_responses_path,
    few_shot_4o_responses_path,
    few_shot_4_turbo_responses_path,
]

In [44]:
def load_ground_truth_findings(data_path):
    """Collect the reference findings stored in JSON files under `data_path`.

    The directory tree is walked recursively. Every file whose name ends in
    '.json' is parsed and the value stored under its 'FINDINGS' key is kept.

    Args:
        data_path: Root directory to search.

    Returns:
        A tuple `(json_files, findings)` where `json_files` is the sorted list
        of JSON file paths that were found and `findings` maps a sample id
        (the file's base name up to its first '.') to that file's 'FINDINGS'
        value.

    Raises:
        KeyError: If a JSON file has no 'FINDINGS' key.
    """
    json_files = []
    for root, _dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.json'):
                json_files.append(os.path.join(root, file))

    # os.walk gives no ordering guarantee; sorting makes the result (and which
    # file wins when two files share a base name) reproducible across runs.
    json_files.sort()

    findings = {}
    for path in json_files:
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        sample_id = os.path.basename(path).split('.')[0]
        findings[sample_id] = data['FINDINGS']

    return json_files, findings


ground_truth_json_files_zero_shot, ground_truth_zero_shot = load_ground_truth_findings(zero_shot_data_path)
ground_truth_json_files_few_shot, ground_truth_few_shot = load_ground_truth_findings(few_shot_data_path)

In [45]:
# Load each model's responses and drop the prompt-specific lead-in text from
# the start of a response when it is present.
generated_findings = []

for position, responses_path in enumerate(response_paths):
    # The first two response files are zero-shot runs; the rest are few-shot.
    lead_in = prefix_zero_shot if position in (0, 1) else prefix_few_shot

    with open(responses_path, 'r') as handle:
        responses = json.load(handle)

    generated_findings.append({
        sample_id: text[len(lead_in):] if text.startswith(lead_in) else text
        for sample_id, text in responses.items()
    })

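The indices used in `generated_findings` and in the score printouts below refer to the following runs:

*   Index 0 is zero-shot with GPT-4o mini
*   Index 1 is zero-shot with GPT-4o
*   Index 2 is few-shot with GPT-4o mini
*   Index 3 is few-shot with GPT-4o
*   Index 4 is few-shot with GPT-4 Turbo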

## BLEU Score

In [46]:
# Calculate BLEU score
# One corpus-level BLEU value per run, over whitespace-split tokens. Samples
# without a (non-empty) generated finding are left out of the corpus.
smoothing = SmoothingFunction().method1

for i, model_findings in enumerate(generated_findings):
    # Runs 0 and 1 are zero-shot; all later runs are few-shot.
    references = ground_truth_zero_shot if i in (0, 1) else ground_truth_few_shot

    scored_pairs = []
    for sample_id, reference_text in references.items():
        candidate_text = model_findings.get(sample_id, '')
        if not candidate_text:
            continue
        scored_pairs.append((reference_text, candidate_text))

    # corpus_bleu expects, per hypothesis, a list of tokenised references;
    # there is exactly one reference per sample here.
    reference_tokens = [[reference.split()] for reference, _ in scored_pairs]
    candidate_tokens = [candidate.split() for _, candidate in scored_pairs]

    bleu_score = corpus_bleu(reference_tokens, candidate_tokens, smoothing_function=smoothing)
    print(f'BLEU score for {i} is', bleu_score)

BLEU score for 0 is 0.012056382978123503
BLEU score for 1 is 0.019675341227100383
BLEU score for 2 is 0.02644367876401616
BLEU score for 3 is 0.0382984730561119
BLEU score for 4 is 0.023804718962793094


## ROUGE Score

In [47]:
# Calculate ROUGE scores
# For each run, average the per-sample F-measure of every ROUGE variant over
# the samples that have a (non-empty) generated finding.
rouge_types = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)

for i, model_findings in enumerate(generated_findings):
    # Runs 0 and 1 are zero-shot; all later runs are few-shot.
    references = ground_truth_zero_shot if i in (0, 1) else ground_truth_few_shot

    f_measures = {rouge_type: [] for rouge_type in rouge_types}
    for sample_id, reference_text in references.items():
        candidate_text = model_findings.get(sample_id, '')
        if not candidate_text:
            continue
        # The reference is passed first and the generated text second.
        sample_scores = scorer.score(reference_text, candidate_text)
        for rouge_type in rouge_types:
            f_measures[rouge_type].append(sample_scores[rouge_type].fmeasure)

    avg_rouge_scores = {}
    for rouge_type, values in f_measures.items():
        # Report 0 when no sample was scored, rather than dividing by zero.
        avg_rouge_scores[rouge_type] = sum(values) / len(values) if values else 0

    print(f'ROUGE score for {i} is', avg_rouge_scores)

ROUGE score for 0 is {'rouge1': 0.308901410218829, 'rouge2': 0.05169436905571883, 'rougeL': 0.18339997234180316}
ROUGE score for 1 is {'rouge1': 0.2991409868157244, 'rouge2': 0.0739724758717956, 'rougeL': 0.18322835100909798}
ROUGE score for 2 is {'rouge1': 0.3309814969516802, 'rouge2': 0.08903276479068586, 'rougeL': 0.2015086768718444}
ROUGE score for 3 is {'rouge1': 0.35121438128824, 'rouge2': 0.10175424544701732, 'rougeL': 0.21889902768404262}
ROUGE score for 4 is {'rouge1': 0.3181431994862831, 'rouge2': 0.08903435976653067, 'rougeL': 0.19240594025170984}


## BERTScore

In [48]:
# Calculate BERTScore
# Build the scorer once and reuse it for every run. Calling the module-level
# `score` function inside the loop loads the underlying model again on each
# iteration, which is by far the most expensive step of this cell.
bert_scorer = BERTScorer(lang='en')

for i in range(len(generated_findings)):
    ground_truth_list = []
    generated_finding_list = []

    # Runs 0 and 1 are zero-shot; all later runs are few-shot.
    if i == 0 or i == 1:
        ground_truth = ground_truth_zero_shot
    else:
        ground_truth = ground_truth_few_shot

    # Keep only samples that have a (non-empty) generated finding, so the
    # candidate and reference lists stay aligned element by element.
    for j, ground_truth_sample in ground_truth.items():
        generated_finding = generated_findings[i].get(j, '')
        if generated_finding:
            ground_truth_list.append(ground_truth_sample)
            generated_finding_list.append(generated_finding)

    if not generated_finding_list:
        # Nothing to score (e.g. sample ids do not match the ground truth);
        # report it instead of failing inside the scorer on empty input.
        print(f'BERTScore for {i} skipped: no generated findings matched the ground truth')
        continue

    # Candidates are passed first and references second.
    P, R, F1 = bert_scorer.score(generated_finding_list, ground_truth_list, verbose=True)
    bertscore = {
        'Precision': P.mean().item(),
        'Recall': R.mean().item(),
        'F1': F1.mean().item()
    }
    print(f'BERTScore for {i} is', bertscore)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 2.89 seconds, 20.10 sentences/sec
BERTScore for 0 is {'Precision': 0.857897937297821, 'Recall': 0.8597686290740967, 'F1': 0.8586655259132385}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 2.31 seconds, 22.90 sentences/sec
BERTScore for 1 is {'Precision': 0.8518829345703125, 'Recall': 0.8615147471427917, 'F1': 0.8564777970314026}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 2.22 seconds, 23.89 sentences/sec
BERTScore for 2 is {'Precision': 0.8612133860588074, 'Recall': 0.8697042465209961, 'F1': 0.8652008771896362}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 2.22 seconds, 23.84 sentences/sec
BERTScore for 3 is {'Precision': 0.86212158203125, 'Recall': 0.8723620772361755, 'F1': 0.8669986128807068}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 2.88 seconds, 18.41 sentences/sec
BERTScore for 4 is {'Precision': 0.8527664542198181, 'Recall': 0.8715744018554688, 'F1': 0.8618605136871338}
