In [9]:
# Mount Google Drive into the Colab runtime; the data and result paths used
# later in this notebook all live under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
!pip install bert-score
!pip install rouge-score



In [11]:
import os
import json
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from bert_score import score
from rouge_score import rouge_scorer

In [12]:
# Leading text that is removed from a generated response (when present)
# before it is scored.
prefix_zero_shot = 'Findings: '
prefix_few_shot = 'Findings for sample x: '

# Shared Google Drive folders for the project.
project_dir = '/content/drive/My Drive/[CS 263 - Fall 2024] Final Project'
dataset_dir = project_dir + '/MIMIC-CXR-filtered'
results_dir = project_dir + '/Results'

# Ground-truth report folders. Note that the few-shot folder is the `test`
# sub-folder of the zero-shot folder.
zero_shot_data_path = dataset_dir
few_shot_data_path = dataset_dir + '/test'

# Generated responses, one JSON file per (model, prompting strategy).
# NOTE(review): the two "mini" files are spelled `gpt-40-mini` (digit zero,
# not the letter o) -- presumably this matches the file names on Drive; confirm.
zero_shot_4o_mini_responses_path = results_dir + '/263_zeroshot_gpt-40-mini.json'
few_shot_4o_mini_responses_path = results_dir + '/263_fewshot_gpt-40-mini.json'
zero_shot_4o_responses_path = results_dir + '/263_zeroshot_gpt-4o.json'
few_shot_4o_responses_path = results_dir + '/263_fewshot_gpt-4o.json'
few_shot_4_turbo_responses_path = results_dir + '/263_fewshot_gpt-4-turbo.json'

# GPT-4o mini evaluation

## Zero-shot learning evaluation

In [13]:
# Collect every *.json file below the zero-shot data folder (os.walk also
# descends into sub-folders).
ground_truth_json_files = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk(zero_shot_data_path)
    for filename in filenames
    if filename.endswith('.json')
]

# Map sample id -> reference findings. The sample id is the file name up to
# its first '.'; the reference text is the file's 'FINDINGS' entry.
ground_truth = {}
for report_path in ground_truth_json_files:
    with open(report_path, 'r') as report_file:
        report = json.load(report_file)
    report_name = os.path.basename(report_path)
    ground_truth[report_name.split('.')[0]] = report['FINDINGS']

In [14]:
# Load the responses stored at `zero_shot_4o_mini_responses_path`
# (presumably a mapping of sample id -> generated text; confirm against the file).
generated_findings = {}
with open(zero_shot_4o_mini_responses_path, 'r') as responses_file:
    generated_findings = json.load(responses_file)

# Remove the zero-shot prefix from every response that starts with it;
# other responses are left untouched.
prefix_length = len(prefix_zero_shot)
for sample_id, response in generated_findings.items():
    if not response.startswith(prefix_zero_shot):
        continue
    generated_findings[sample_id] = response[prefix_length:]

In [15]:
# Calculate BLEU score
# Only samples with a non-empty generated finding are scored.
scored_ids = [sid for sid in ground_truth if generated_findings.get(sid, '')]

# Whitespace tokenisation; every hypothesis gets a one-element reference list.
ground_truth_split = [[ground_truth[sid].split()] for sid in scored_ids]
generated_finding_split = [generated_findings[sid].split() for sid in scored_ids]

smooth_fn = SmoothingFunction().method1
bleu_score = corpus_bleu(
    ground_truth_split,
    generated_finding_split,
    smoothing_function=smooth_fn,
)

In [16]:
# Calculate ROUGE scores
rouge_types = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
rouge_scores = {rouge_type: [] for rouge_type in rouge_types}

# Collect the per-sample F-measure of each ROUGE variant; samples without a
# generated finding are skipped.
for sample_id, reference in ground_truth.items():
    candidate = generated_findings.get(sample_id, '')
    if not candidate:
        continue
    scores = scorer.score(reference, candidate)
    for rouge_type in rouge_types:
        rouge_scores[rouge_type].append(scores[rouge_type].fmeasure)

# Average over the scored samples; 0 when nothing was scored.
avg_rouge_scores = {}
for rouge_type, values in rouge_scores.items():
    avg_rouge_scores[rouge_type] = sum(values) / len(values) if values else 0

In [17]:
# Calculate BERTScore
# Only samples with a non-empty generated finding are scored; both lists are
# built in the same order so they stay aligned.
scored_ids = [sid for sid in ground_truth if generated_findings.get(sid, '')]
ground_truth_list = [ground_truth[sid] for sid in scored_ids]
generated_finding_list = [generated_findings[sid] for sid in scored_ids]

# Generated texts are passed first, references second.
P, R, F1 = score(generated_finding_list, ground_truth_list, lang='en', verbose=True)

# Reduce each returned score collection to its mean as a plain Python number.
bertscore = dict(
    Precision=P.mean().item(),
    Recall=R.mean().item(),
    F1=F1.mean().item(),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 140.87 seconds, 0.41 sentences/sec


In [18]:
# Zero shot
# Print the three metrics held in bleu_score, avg_rouge_scores and bertscore.
print('Scores for zero-shot learning:')
for label, value in (
    ('BLEU score:', bleu_score),
    ('ROUGE score:', avg_rouge_scores),
    ('BERTScore:', bertscore),
):
    print(label, value)

Scores for zero-shot learning:
BLEU score: 0.012056382978123503
ROUGE score: {'rouge1': 0.308901410218829, 'rouge2': 0.05169436905571883, 'rougeL': 0.18339997234180316}
BERTScore: {'Precision': 0.8578980565071106, 'Recall': 0.8597686886787415, 'F1': 0.8586655259132385}


## Few-shot learning evaluation

In [19]:
# Collect every *.json file below the few-shot data folder (os.walk also
# descends into sub-folders).
ground_truth_json_files = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk(few_shot_data_path)
    for filename in filenames
    if filename.endswith('.json')
]

# Map sample id -> reference findings. The sample id is the file name up to
# its first '.'; the reference text is the file's 'FINDINGS' entry.
ground_truth = {}
for report_path in ground_truth_json_files:
    with open(report_path, 'r') as report_file:
        report = json.load(report_file)
    report_name = os.path.basename(report_path)
    ground_truth[report_name.split('.')[0]] = report['FINDINGS']

In [20]:
# Load the responses stored at `few_shot_4o_mini_responses_path`
# (presumably a mapping of sample id -> generated text; confirm against the file).
generated_findings = {}
with open(few_shot_4o_mini_responses_path, 'r') as responses_file:
    generated_findings = json.load(responses_file)

# Remove the few-shot prefix from every response that starts with it;
# other responses are left untouched.
prefix_length = len(prefix_few_shot)
for sample_id, response in generated_findings.items():
    if not response.startswith(prefix_few_shot):
        continue
    generated_findings[sample_id] = response[prefix_length:]

In [21]:
# Calculate BLEU score
# Only samples with a non-empty generated finding are scored.
scored_ids = [sid for sid in ground_truth if generated_findings.get(sid, '')]

# Whitespace tokenisation; every hypothesis gets a one-element reference list.
ground_truth_split = [[ground_truth[sid].split()] for sid in scored_ids]
generated_finding_split = [generated_findings[sid].split() for sid in scored_ids]

smooth_fn = SmoothingFunction().method1
bleu_score = corpus_bleu(
    ground_truth_split,
    generated_finding_split,
    smoothing_function=smooth_fn,
)

In [22]:
# Calculate ROUGE scores
rouge_types = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
rouge_scores = {rouge_type: [] for rouge_type in rouge_types}

# Collect the per-sample F-measure of each ROUGE variant; samples without a
# generated finding are skipped.
for sample_id, reference in ground_truth.items():
    candidate = generated_findings.get(sample_id, '')
    if not candidate:
        continue
    scores = scorer.score(reference, candidate)
    for rouge_type in rouge_types:
        rouge_scores[rouge_type].append(scores[rouge_type].fmeasure)

# Average over the scored samples; 0 when nothing was scored.
avg_rouge_scores = {}
for rouge_type, values in rouge_scores.items():
    avg_rouge_scores[rouge_type] = sum(values) / len(values) if values else 0

In [23]:
# Calculate BERTScore
# Only samples with a non-empty generated finding are scored; both lists are
# built in the same order so they stay aligned.
scored_ids = [sid for sid in ground_truth if generated_findings.get(sid, '')]
ground_truth_list = [ground_truth[sid] for sid in scored_ids]
generated_finding_list = [generated_findings[sid] for sid in scored_ids]

# Generated texts are passed first, references second.
P, R, F1 = score(generated_finding_list, ground_truth_list, lang='en', verbose=True)

# Reduce each returned score collection to its mean as a plain Python number.
bertscore = dict(
    Precision=P.mean().item(),
    Recall=R.mean().item(),
    F1=F1.mean().item(),
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 125.08 seconds, 0.42 sentences/sec


In [24]:
# Few shot
# Print the three metrics held in bleu_score, avg_rouge_scores and bertscore.
print('Scores for few-shot learning:')
for label, value in (
    ('BLEU score:', bleu_score),
    ('ROUGE score:', avg_rouge_scores),
    ('BERTScore:', bertscore),
):
    print(label, value)

Scores for few-shot learning:
BLEU score: 0.02644367876401616
ROUGE score: {'rouge1': 0.3309814969516802, 'rouge2': 0.08903276479068586, 'rougeL': 0.2015086768718444}
BERTScore: {'Precision': 0.8612134456634521, 'Recall': 0.8697042465209961, 'F1': 0.8652010560035706}


# GPT-4o evaluation

## Zero-shot learning evaluation

In [25]:
# Collect every *.json file below the zero-shot data folder (os.walk also
# descends into sub-folders).
ground_truth_json_files = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk(zero_shot_data_path)
    for filename in filenames
    if filename.endswith('.json')
]

# Map sample id -> reference findings. The sample id is the file name up to
# its first '.'; the reference text is the file's 'FINDINGS' entry.
ground_truth = {}
for report_path in ground_truth_json_files:
    with open(report_path, 'r') as report_file:
        report = json.load(report_file)
    report_name = os.path.basename(report_path)
    ground_truth[report_name.split('.')[0]] = report['FINDINGS']

In [26]:
# Load the responses stored at `zero_shot_4o_responses_path`
# (presumably a mapping of sample id -> generated text; confirm against the file).
generated_findings = {}
with open(zero_shot_4o_responses_path, 'r') as responses_file:
    generated_findings = json.load(responses_file)

# Remove the zero-shot prefix from every response that starts with it;
# other responses are left untouched.
prefix_length = len(prefix_zero_shot)
for sample_id, response in generated_findings.items():
    if not response.startswith(prefix_zero_shot):
        continue
    generated_findings[sample_id] = response[prefix_length:]

In [27]:
# Calculate BLEU score
# Only samples with a non-empty generated finding are scored.
scored_ids = [sid for sid in ground_truth if generated_findings.get(sid, '')]

# Whitespace tokenisation; every hypothesis gets a one-element reference list.
ground_truth_split = [[ground_truth[sid].split()] for sid in scored_ids]
generated_finding_split = [generated_findings[sid].split() for sid in scored_ids]

smooth_fn = SmoothingFunction().method1
bleu_score = corpus_bleu(
    ground_truth_split,
    generated_finding_split,
    smoothing_function=smooth_fn,
)

In [28]:
# Calculate ROUGE scores
rouge_types = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
rouge_scores = {rouge_type: [] for rouge_type in rouge_types}

# Collect the per-sample F-measure of each ROUGE variant; samples without a
# generated finding are skipped.
for sample_id, reference in ground_truth.items():
    candidate = generated_findings.get(sample_id, '')
    if not candidate:
        continue
    scores = scorer.score(reference, candidate)
    for rouge_type in rouge_types:
        rouge_scores[rouge_type].append(scores[rouge_type].fmeasure)

# Average over the scored samples; 0 when nothing was scored.
avg_rouge_scores = {}
for rouge_type, values in rouge_scores.items():
    avg_rouge_scores[rouge_type] = sum(values) / len(values) if values else 0

In [29]:
# Calculate BERTScore
# Only samples with a non-empty generated finding are scored; both lists are
# built in the same order so they stay aligned.
scored_ids = [sid for sid in ground_truth if generated_findings.get(sid, '')]
ground_truth_list = [ground_truth[sid] for sid in scored_ids]
generated_finding_list = [generated_findings[sid] for sid in scored_ids]

# Generated texts are passed first, references second.
P, R, F1 = score(generated_finding_list, ground_truth_list, lang='en', verbose=True)

# Reduce each returned score collection to its mean as a plain Python number.
bertscore = dict(
    Precision=P.mean().item(),
    Recall=R.mean().item(),
    F1=F1.mean().item(),
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 132.71 seconds, 0.40 sentences/sec


In [30]:
# Zero shot
# Print the three metrics held in bleu_score, avg_rouge_scores and bertscore.
print('Scores for zero-shot learning:')
for label, value in (
    ('BLEU score:', bleu_score),
    ('ROUGE score:', avg_rouge_scores),
    ('BERTScore:', bertscore),
):
    print(label, value)

Scores for zero-shot learning:
BLEU score: 0.019675341227100383
ROUGE score: {'rouge1': 0.2991409868157244, 'rouge2': 0.0739724758717956, 'rougeL': 0.18322835100909798}
BERTScore: {'Precision': 0.8518829941749573, 'Recall': 0.8615149259567261, 'F1': 0.8564777970314026}


## Few-shot learning evaluation

In [31]:
# Collect every *.json file below the few-shot data folder (os.walk also
# descends into sub-folders).
ground_truth_json_files = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk(few_shot_data_path)
    for filename in filenames
    if filename.endswith('.json')
]

# Map sample id -> reference findings. The sample id is the file name up to
# its first '.'; the reference text is the file's 'FINDINGS' entry.
ground_truth = {}
for report_path in ground_truth_json_files:
    with open(report_path, 'r') as report_file:
        report = json.load(report_file)
    report_name = os.path.basename(report_path)
    ground_truth[report_name.split('.')[0]] = report['FINDINGS']

In [37]:
# Load the responses stored at `few_shot_4o_responses_path`
# (presumably a mapping of sample id -> generated text; confirm against the file).
generated_findings = {}
with open(few_shot_4o_responses_path, 'r') as responses_file:
    generated_findings = json.load(responses_file)

# Remove the few-shot prefix from every response that starts with it;
# other responses are left untouched.
prefix_length = len(prefix_few_shot)
for sample_id, response in generated_findings.items():
    if not response.startswith(prefix_few_shot):
        continue
    generated_findings[sample_id] = response[prefix_length:]

In [38]:
# Calculate BLEU score
# Only samples with a non-empty generated finding are scored.
scored_ids = [sid for sid in ground_truth if generated_findings.get(sid, '')]

# Whitespace tokenisation; every hypothesis gets a one-element reference list.
ground_truth_split = [[ground_truth[sid].split()] for sid in scored_ids]
generated_finding_split = [generated_findings[sid].split() for sid in scored_ids]

smooth_fn = SmoothingFunction().method1
bleu_score = corpus_bleu(
    ground_truth_split,
    generated_finding_split,
    smoothing_function=smooth_fn,
)

In [39]:
# Calculate ROUGE scores
rouge_types = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
rouge_scores = {rouge_type: [] for rouge_type in rouge_types}

# Collect the per-sample F-measure of each ROUGE variant; samples without a
# generated finding are skipped.
for sample_id, reference in ground_truth.items():
    candidate = generated_findings.get(sample_id, '')
    if not candidate:
        continue
    scores = scorer.score(reference, candidate)
    for rouge_type in rouge_types:
        rouge_scores[rouge_type].append(scores[rouge_type].fmeasure)

# Average over the scored samples; 0 when nothing was scored.
avg_rouge_scores = {}
for rouge_type, values in rouge_scores.items():
    avg_rouge_scores[rouge_type] = sum(values) / len(values) if values else 0

In [40]:
# Calculate BERTScore
# Only samples with a non-empty generated finding are scored; both lists are
# built in the same order so they stay aligned.
scored_ids = [sid for sid in ground_truth if generated_findings.get(sid, '')]
ground_truth_list = [ground_truth[sid] for sid in scored_ids]
generated_finding_list = [generated_findings[sid] for sid in scored_ids]

# Generated texts are passed first, references second.
P, R, F1 = score(generated_finding_list, ground_truth_list, lang='en', verbose=True)

# Reduce each returned score collection to its mean as a plain Python number.
bertscore = dict(
    Precision=P.mean().item(),
    Recall=R.mean().item(),
    F1=F1.mean().item(),
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 143.78 seconds, 0.37 sentences/sec


In [41]:
# Few shot
# Print the three metrics held in bleu_score, avg_rouge_scores and bertscore.
print('Scores for few-shot learning:')
for label, value in (
    ('BLEU score:', bleu_score),
    ('ROUGE score:', avg_rouge_scores),
    ('BERTScore:', bertscore),
):
    print(label, value)

Scores for few-shot learning:
BLEU score: 0.0382984730561119
ROUGE score: {'rouge1': 0.35121438128824, 'rouge2': 0.10175424544701732, 'rougeL': 0.21889902768404262}
BERTScore: {'Precision': 0.8621216416358948, 'Recall': 0.8723620772361755, 'F1': 0.8669987916946411}


# GPT-4 Turbo evaluation

## Few-shot learning evaluation

In [42]:
# Load the few-shot ground truth for this section explicitly. The metric cells
# below read `ground_truth`; loading it here means the section no longer relies
# on whatever an earlier section happened to leave in that variable.
ground_truth_json_files = []
ground_truth = {}

for root, dirs, files in os.walk(few_shot_data_path):
    for file in files:
        if file.endswith('.json'):
            ground_truth_json_files.append(os.path.join(root, file))

for path in ground_truth_json_files:
    with open(path, 'r') as f:
        data = json.load(f)
        # Sample id is the file name up to its first '.'.
        sample_id = os.path.basename(path).split('.')[0]
        ground_truth[sample_id] = data['FINDINGS']

# Load the responses stored at `few_shot_4_turbo_responses_path`
# (presumably a mapping of sample id -> generated text; confirm against the file).
generated_findings = {}
with open(few_shot_4_turbo_responses_path, 'r') as f:
    generated_findings = json.load(f)

# Remove the few-shot prefix from every response that starts with it.
for key in generated_findings:
    if generated_findings[key].startswith(prefix_few_shot):
        generated_findings[key] = generated_findings[key][len(prefix_few_shot):]

In [43]:
# Calculate BLEU score
# Only samples with a non-empty generated finding are scored.
scored_ids = [sid for sid in ground_truth if generated_findings.get(sid, '')]

# Whitespace tokenisation; every hypothesis gets a one-element reference list.
ground_truth_split = [[ground_truth[sid].split()] for sid in scored_ids]
generated_finding_split = [generated_findings[sid].split() for sid in scored_ids]

smooth_fn = SmoothingFunction().method1
bleu_score = corpus_bleu(
    ground_truth_split,
    generated_finding_split,
    smoothing_function=smooth_fn,
)

In [44]:
# Calculate ROUGE scores
rouge_types = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
rouge_scores = {rouge_type: [] for rouge_type in rouge_types}

# Collect the per-sample F-measure of each ROUGE variant; samples without a
# generated finding are skipped.
for sample_id, reference in ground_truth.items():
    candidate = generated_findings.get(sample_id, '')
    if not candidate:
        continue
    scores = scorer.score(reference, candidate)
    for rouge_type in rouge_types:
        rouge_scores[rouge_type].append(scores[rouge_type].fmeasure)

# Average over the scored samples; 0 when nothing was scored.
avg_rouge_scores = {}
for rouge_type, values in rouge_scores.items():
    avg_rouge_scores[rouge_type] = sum(values) / len(values) if values else 0

In [45]:
# Calculate BERTScore
# Only samples with a non-empty generated finding are scored; both lists are
# built in the same order so they stay aligned.
scored_ids = [sid for sid in ground_truth if generated_findings.get(sid, '')]
ground_truth_list = [ground_truth[sid] for sid in scored_ids]
generated_finding_list = [generated_findings[sid] for sid in scored_ids]

# Generated texts are passed first, references second.
P, R, F1 = score(generated_finding_list, ground_truth_list, lang='en', verbose=True)

# Reduce each returned score collection to its mean as a plain Python number.
bertscore = dict(
    Precision=P.mean().item(),
    Recall=R.mean().item(),
    F1=F1.mean().item(),
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 133.03 seconds, 0.40 sentences/sec


In [46]:
# Few shot
# Print the three metrics held in bleu_score, avg_rouge_scores and bertscore.
print('Scores for few-shot learning:')
for label, value in (
    ('BLEU score:', bleu_score),
    ('ROUGE score:', avg_rouge_scores),
    ('BERTScore:', bertscore),
):
    print(label, value)

Scores for few-shot learning:
BLEU score: 0.023804718962793094
ROUGE score: {'rouge1': 0.3181431994862831, 'rouge2': 0.08903435976653067, 'rougeL': 0.19240594025170984}
BERTScore: {'Precision': 0.8527665138244629, 'Recall': 0.8715745210647583, 'F1': 0.8618605732917786}
