In [1]:
import json
import numpy as np
from collections import defaultdict
import evaluate
from rouge_score import rouge_scorer

In [None]:
# Force the whole notebook onto the CPU.
# The environment variables are written before `import torch` below; keep that
# order (presumably so torch/CUDA only ever see the overridden values -- confirm).
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''  # empty device list: no CUDA device is exposed to this process
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # NOTE(review): presumably lets ops unsupported on Apple MPS fall back to CPU -- confirm in the PyTorch docs

import torch
import warnings

# Belt and braces: besides the environment variable, monkey-patch torch so that
# any code asking torch about CUDA is told there is none.
if hasattr(torch, 'cuda'):
    # Reference to the real function is kept, but nothing in the visible cells restores it.
    original_is_available = torch.cuda.is_available
    torch.cuda.is_available = lambda: False
    
    if hasattr(torch.cuda, 'device_count'):
        torch.cuda.device_count = lambda: 0
    
    # NOTE(review): this message is emitted whenever torch has a `cuda` attribute;
    # the code does not actually check that the host is a Mac.
    warnings.warn("Running on Mac: CUDA support disabled, using CPU instead.")

# Device handle passed to compute_custom_metrics later in the notebook.
device = torch.device('cpu')
print(f"Using device: {device}")


Using device: cpu




'\nif hasattr(torch, \'backends\') and hasattr(torch.backends, \'mps\') and torch.backends.mps.is_available():\n    device = torch.device("mps")\n    print(f"Apple Silicon detected! Updated device: {device}")\n'

In [None]:
# ROUGE-3 and ROUGE-4 are not offered by the Hugging Face metric, so score with rouge_score directly
def get_rouge_score(gold, pred):
    """Score one prediction against one reference with several ROUGE variants.

    Args:
        gold: Reference summary text (passed to the scorer as the target).
        pred: Predicted summary text.

    Returns:
        dict mapping 'rouge1', 'rouge2', 'rouge3', 'rouge4' and 'rougeL' to the
        F-measure of that variant multiplied by 100.
    """
    variants = ['rouge1', 'rouge2', 'rouge3', 'rouge4', 'rougeL']
    result = rouge_scorer.RougeScorer(variants, use_stemmer=True).score(gold, pred)

    percentages = {}
    for variant in variants:
        percentages[variant] = result[variant].fmeasure * 100
    return percentages

def compute_custom_metrics(srcs, golds, preds, device):
    """Average per-example ROUGE scores and prediction length over a dataset.

    Args:
        srcs: Source documents. Not used by this function; kept so existing
            callers keep working.
        golds: Reference summaries (strings).
        preds: Predicted summaries (strings), index-aligned with ``golds``.
        device: Not used by this function; kept so existing callers keep working.

    Returns:
        Mapping from every key returned by ``get_rouge_score`` plus ``'words'``
        to the mean of that value over all examples. Empty when there are no
        examples.

    Raises:
        ValueError: If ``golds`` and ``preds`` differ in length. Previously the
            longer sequence was silently truncated by ``zip``, which yields
            metrics over a misaligned or partial dataset.
    """
    golds, preds = list(golds), list(preds)
    if len(golds) != len(preds):
        raise ValueError(
            f"golds and preds must have the same length, "
            f"got {len(golds)} references and {len(preds)} predictions"
        )

    scores = defaultdict(list)

    # ROUGE and length are computed per example and averaged afterwards
    for gold, pred in zip(golds, preds):
        for k, v in get_rouge_score(gold, pred).items():
            scores[k].append(v)
        # Kept as a plain single-space split so previously reported lengths stay
        # comparable; note that newlines/tabs do not separate words here and
        # repeated spaces count as extra (empty) words.
        scores['words'].append(len(pred.split(' ')))

    # Collapse each per-example list into its mean (keys are only overwritten, never added)
    for k in list(scores):
        scores[k] = np.mean(scores[k])

    return scores

def print_metrics_as_latex(metrics):
    """Print the ROUGE metrics as the cells of one LaTeX table row.

    Args:
        metrics: Mapping that contains the keys 'rouge1', 'rouge2', 'rouge3',
            'rouge4' and 'rougeL' with numeric values; other keys are ignored.

    Each value is formatted with two decimals, wrapped in ``$...$`` and the
    cells are joined with `` & ``. Nothing is returned.
    """
    cells = []
    for column in ('rouge1', 'rouge2', 'rouge3', 'rouge4', 'rougeL'):
        cells.append('${:.2f}$'.format(metrics[column]))
    print(' & '.join(cells))

In [None]:

# Experiment 1 and 2
# Every input/output file lives under one project directory. To run the notebook
# on another machine, change PROJECT_DIR only; the paths below are derived from it.
PROJECT_DIR = "/Users/kavyakarthi/Documents/MCS/CS598/Final_Project/Code/final_proj/gpt-4"
DATA_DIR = f"{PROJECT_DIR}/summarization_data"
RESULTS_DIR = f"{PROJECT_DIR}/summarization_results"

# Test set with source texts and reference summaries -- keep exactly one line active.
test_data_file = f"{DATA_DIR}/exp_1_test.json"
# test_data_file = f"{DATA_DIR}/exp_2_test.json"

# Generated summaries to score -- keep exactly one line active, matching the experiment above.
preds_data_file = f"{RESULTS_DIR}/gpt-4.1_exp1_results_prompt3_2shot.jsonl"
# preds_data_file = f"{RESULTS_DIR}/gpt-4.1_exp1_results_prompt3.1_0shot.jsonl"
# preds_data_file = f"{RESULTS_DIR}/gpt-4.1_exp2_results_prompt3_5shot.jsonl"
# preds_data_file = f"{RESULTS_DIR}/gpt-4.1_exp2_results_prompt3.1_0shot.jsonl"


# Reader for JSON Lines files (one JSON document per line)
def read_jsonl(file_name):
    """Parse a JSON Lines file into a list of Python objects.

    Args:
        file_name: Path of the file to read.

    Returns:
        list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # locale encoding, which may mis-decode non-ASCII text.
    with open(file_name, "r", encoding="utf-8") as f:
        # Blank / whitespace-only lines (e.g. a stray trailing newline) carry no
        # record and would otherwise make json.loads raise.
        return [json.loads(line) for line in f if line.strip()]
    
# Load the reference examples first, then the generated summaries, with the same reader
test_data, preds_data = (
    read_jsonl(path) for path in (test_data_file, preds_data_file)
)

In [34]:
# Preview the first few aligned examples: index, source text, reference summary,
# generated summary, then a blank separator line.
NUM_PREVIEW_EXAMPLES = 3

# Slicing + zip bounds the preview by the number of available pairs, so a file
# with fewer than NUM_PREVIEW_EXAMPLES records no longer raises IndexError.
for i, (example, prediction) in enumerate(
    zip(test_data[:NUM_PREVIEW_EXAMPLES], preds_data[:NUM_PREVIEW_EXAMPLES])
):
    print(i)
    print(example["text"])
    print(example["summary"])
    print(prediction["summary"])
    print()

0
Name:  ___                 Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   M
 
Service: MEDICINE
 
Allergies: 
No Known Allergies / Adverse Drug Reactions
 
Attending: ___
 
Chief Complaint:
Hematuria
 
Major Surgical or Invasive Procedure:
Right heart cardiac catheterization
 
History of Present Illness:
___ male with past medical history significant for 
chronic diastolic heart failure, atrial fibrillation on 
warfarin, pulmonary hypertension, chronic kidney disease 
presents with hyperkalemia from his PCPs office but also with a 
multitude of complaints. Per OMR he has been admitted multiple 
times in the last month for CHF exacerbations the most recent 
admission being just a few days ago. He was diuresed and sent 
home. Yesterday he was seen in PCP's office with cc of 
hematuria. PCP did basic labs including lytes and recommended if 
it continues to refer to urology. Today the ___ physician 
was called by the lab 

In [35]:
# Pull the parallel columns out of the loaded records: source texts and
# reference summaries from the test set, generated summaries from the predictions.
srcs = list(map(lambda record: record["text"], test_data))
golds = list(map(lambda record: record["summary"], test_data))
preds = list(map(lambda record: record["summary"], preds_data))

# Score the whole test set, then round every metric to two decimals for reporting.
metrics_test = compute_custom_metrics(srcs, golds, preds, device)
metrics_test = dict((name, round(score, 2)) for name, score in metrics_test.items())

# NOTE(review): the output saved with this cell lists bert_score / sari values and
# extra LaTeX columns that the functions defined in this notebook do not produce;
# it looks stale -- re-run the cell to refresh it.
print("Test metrics rounded:", metrics_test, sep="\n")
print_metrics_as_latex(metrics_test)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test metrics rounded:
{'rouge1': 27.56, 'rouge2': 6.24, 'rouge3': 1.83, 'rouge4': 0.47, 'rougeL': 16.39, 'words': 141.55, 'bert_score': 81.82, 'bert_score_deberta-large': 54.07, 'sari': 40.04}
$27.56$ & $6.24$ & $1.83$ & $0.47$ & $16.39$ & $81.82$ & $54.07$ & $40.04$ & $141.55$
