In [1]:
!pip install bert-score
!pip install rouge-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=dd8fd7a1572d47b37c0fdc7f766cf7810f23f94daa9cb2f946e5c197bfb67ae2
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import json
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from bert_score import score
from rouge_score import rouge_scorer

In [6]:
prefix_zero_shot = 'Findings: '
prefix_few_shot = 'Findings for sample x: '

zero_shot_data_path = '/content/drive/My Drive/[CS 263 - Fall 2024] Final Project/MIMIC-CXR-filtered'
few_shot_data_path = '/content/drive/My Drive/[CS 263 - Fall 2024] Final Project/MIMIC-CXR-filtered/test'
zero_shot_4o_mini_responses_path = '/content/drive/My Drive/[CS 263 - Fall 2024] Final Project/Results/263_zeroshot_gpt-40-mini.json'
few_shot_4o_mini_responses_path = '/content/drive/My Drive/[CS 263 - Fall 2024] Final Project/Results/263_fewshot_gpt-40-mini.json'
zero_shot_4o_responses_path = '/content/drive/My Drive/[CS 263 - Fall 2024] Final Project/Results/263_zeroshot_gpt-4o.json'
few_shot_4o_responses_path = '/content/drive/My Drive/[CS 263 - Fall 2024] Final Project/Results/263_fewshot_gpt-4o.json'
few_shot_4_turbo_responses_path = '/content/drive/My Drive/[CS 263 - Fall 2024] Final Project/Results/263_fewshot_gpt-4-turbo.json'
response_paths = [zero_shot_4o_mini_responses_path, few_shot_4o_mini_responses_path, zero_shot_4o_responses_path, few_shot_4o_responses_path, few_shot_4_turbo_responses_path]

In [7]:
ground_truth_json_files = []
ground_truth = {}

for root, dirs, files in os.walk(zero_shot_data_path):
    for file in files:
        if file.endswith('.json'):
            ground_truth_json_files.append(os.path.join(root, file))

for path in ground_truth_json_files:
    with open(path, 'r') as f:
        data = json.load(f)
        sample_id = os.path.basename(path).split('.')[0]
        ground_truth[sample_id] = data['FINDINGS']

## BLEU Score

In [17]:
generated_findings = [{}]

for idx, val in enumerate(response_paths):
    with open(val, 'r') as f:
        generated_findings.append(json.load(f))

    for key in generated_findings[idx]:
        if generated_findings[idx][key].startswith(prefix_zero_shot):
            generated_findings[idx][key] = generated_findings[idx][key][len(prefix_zero_shot):]
        elif generated_findings[idx][key].startswith(prefix_few_shot):
            generated_findings[idx][key] = generated_findings[idx][key][len(prefix_few_shot):]

In [28]:
# Calculate BLEU score
for i in range(1, len(generated_findings)):
    ground_truth_split = []
    generated_finding_split = []

    for j, ground_truth_sample in ground_truth.items():
        generated_finding = generated_findings.get(idx, '')
        if generated_finding:
            ground_truth_split.append([ground_truth_sample.split()])
            generated_finding_split.append(generated_finding.split())
    smooth_fn = SmoothingFunction().method1
    bleu_score = corpus_bleu(ground_truth_split, generated_finding_split, smoothing_function=smooth_fn)
    print(f'BLEU score for {i} is', bleu_score)

BLEU score for 1 is 0.012056382978123503
BLEU score for 2 is 0.026403520498161217
BLEU score for 3 is 0.019648835692867808
BLEU score for 4 is 0.038234585359215374
BLEU score for 5 is 0.02291376942211067


## ROUGE Score

In [31]:
# Calculate ROUGE scores
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

for i in range(1, len(generated_findings)):
    for j, ground_truth_sample in ground_truth.items():
        generated_finding = generated_findings.get(idx, '')
        if generated_finding:
            scores = scorer.score(ground_truth_sample, generated_finding)

        rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
        rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
        rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)

    avg_rouge_scores = {
        key: sum(values) / len(values) if values else 0 for key, values in rouge_scores.items()
    }
    print(f'ROUGE score for {i} is', avg_rouge_scores)

ROUGE score for 1 is {'rouge1': 0.308901410218829, 'rouge2': 0.05169436905571883, 'rougeL': 0.18339997234180316}
ROUGE score for 2 is {'rouge1': 0.30567500975113043, 'rouge2': 0.06652594775119001, 'rougeL': 0.18376860577614074}
ROUGE score for 3 is {'rouge1': 0.2949009967377272, 'rouge2': 0.06688247793300695, 'rougeL': 0.1783233383535318}
ROUGE score for 4 is {'rouge1': 0.3014100674165571, 'rouge2': 0.0734074403837721, 'rougeL': 0.18374960922745165}
ROUGE score for 5 is {'rouge1': 0.29796464199926453, 'rouge2': 0.0745615767615147, 'rougeL': 0.1812424972875855}


## BERTScore

In [32]:
# Calculate BERTScore
ground_truth_list = []
generated_finding_list = []

for i in range(1, len(generated_findings)):
    for j, ground_truth_sample in ground_truth.items():
        generated_finding = generated_findings[i].get(idx, '')
        if generated_finding:
            ground_truth_list.append(ground_truth_sample)
            generated_finding_list.append(generated_finding)

    P, R, F1 = score(generated_finding_list, ground_truth_list, lang='en', verbose=True)
    bertscore = {
        'Precision': P.mean().item(),
        'Recall': R.mean().item(),
        'F1': F1.mean().item()
    }
    print(f'BERTScore for {i} is', bertscore)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 165.03 seconds, 0.35 sentences/sec
BERTScore for 1 is {'Precision': 0.8578980565071106, 'Recall': 0.8597686886787415, 'F1': 0.8586655259132385}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]



done in 207.26 seconds, 0.56 sentences/sec
BERTScore for 2 is {'Precision': 0.8224345445632935, 'Recall': 0.8272491693496704, 'F1': 0.824640154838562}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/3 [00:00<?, ?it/s]



done in 255.31 seconds, 0.68 sentences/sec
BERTScore for 3 is {'Precision': 0.8077712655067444, 'Recall': 0.813914954662323, 'F1': 0.8106412291526794}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/5 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]



done in 306.56 seconds, 0.76 sentences/sec
BERTScore for 4 is {'Precision': 0.8027786612510681, 'Recall': 0.809725821018219, 'F1': 0.8060453534126282}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/6 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/5 [00:00<?, ?it/s]



done in 359.05 seconds, 0.81 sentences/sec
BERTScore for 5 is {'Precision': 0.7974507212638855, 'Recall': 0.806881844997406, 'F1': 0.8019390106201172}
