# Config

In [1]:
import pandas as pd
import numpy as np
import os
import torch
import re
from datasets import load_dataset, Dataset, load_metric
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    LogitsProcessor,
    LogitsProcessorList,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
import nltk
import warnings
import spacy
import math
import xformers
import tensor_parallel as tp
from tqdm import tqdm
import evaluate
nltk.download('punkt')

  _C._set_default_tensor_type(t)
[nltk_data] Downloading package punkt to /home/imx2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Set up model and parameters

In [2]:
model_name = "meta-llama/Llama-2-7b-chat-hf"
# we save the model under this name
new_model = "llama-2-7b-radnlpv2"

In [3]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 5

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

# Evaluate model on validation set every X update steps
evaluation_strategy='steps'
eval_steps=500

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = 200

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# batch size
dataset_batch_size = 32

# Load the entire model on the GPU 0
# device_map = {'':torch.cuda.current_device()}
# CUDA_VISIBLE_DEVICES=0,1,2,3, try multiple devices
device_map = 'auto' #{'':torch.cuda.current_device()}

In [4]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [5]:
# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

Your GPU supports bfloat16: accelerate training with bf16=True


In [None]:
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# model = tp.tensor_parallel(
#     AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
# #     device_map=device_map
#     )
# )
# model = tp.tensor_parallel(model, ["cuda:0", "cuda:1"])  # <- each GPU has half the weights
model.config.use_cache = False
model.config.pretraining_tp = 1

In [15]:
# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

In [16]:
# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    evaluation_strategy=evaluation_strategy,
    eval_steps=eval_steps,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

In [None]:
BERTSCORE_MODEL_TYPE = "microsoft/deberta-xlarge-mnli"

In [None]:
nlp = spacy.load("en_core_sci_lg")

# Get and Clean Data

## Get Data

In [None]:
ct_impressions_train = pd.read_csv('data/223268_chestct_1_20220624a_nodup_train.csv.gz', compression='gzip')
ct_impressions_train

In [None]:
ct_impressions_test = pd.read_csv('data/223268_chestct_1_20220624a_nodup_val.csv')
ct_impressions_test

In [None]:
pelvis_impressions_train = pd.read_csv('data/223268_ctabdpelvis_2_20220627a_nodup_train.csv.gz', compression='gzip')
pelvis_impressions_train

In [None]:
pelvis_impressions_test = pd.read_csv('data/223268_ctabdpelvis_2_20220627a_nodup_val.csv.gz', compression='gzip')
pelvis_impressions_test

In [None]:
mri_classification_train = pd.read_csv('data/223268_mri_November_2022_20221107a_nodup_train.csv.gz', compression='gzip')
mri_classification_train

In [None]:
mri_classification_test = pd.read_csv('data/223268_mri_November_2022_20221107a_nodup_val.csv.gz', compression='gzip')
mri_classification_test

In [None]:
petct_classification_train = pd.read_csv('data/223268_November_2022_PETCT_20221110a_nodup_train.csv.gz', compression='gzip')
petct_classification_train

In [None]:
petct_classification_test = pd.read_csv('data/223268_November_2022_PETCT_20221110a_nodup_val.csv.gz', compression='gzip')
petct_classification_test

## Clean Data
Separate to context-answer pairs (remove unnecessary parts of the answers).

In [None]:
ct_impressions_train

In [None]:
ct_impressions_train['Narrative'][0]

In [None]:
ct_impressions_train['Impression'][20]

In [None]:
# remove stuff after 'Reported And Signed By:' to prevent the model from hallucinating names
to_delete = ['Report Initiated By:', 'Reported and Signed By:', 'Reported By:']
ct_impressions_train['Impression'] = ct_impressions_train['Impression'].replace(to_replace=r'Reported And Signed By:.*$', value='', regex=True)
ct_impressions_test['Impression'] = ct_impressions_test['Impression'].replace(to_replace=r'Reported And Signed By:.*$', value='', regex=True)
ct_impressions_train['Impression'] = ct_impressions_train['Impression'].replace(to_replace=r'Report Initiated By:.*$', value='', regex=True)
ct_impressions_test['Impression'] = ct_impressions_test['Impression'].replace(to_replace=r'Report Initiated By:.*$', value='', regex=True)
ct_impressions_train['Impression'] = ct_impressions_train['Impression'].replace(to_replace=r'Reported By:.*$', value='', regex=True)
ct_impressions_test['Impression'] = ct_impressions_test['Impression'].replace(to_replace=r'Reported By:.*$', value='', regex=True)
ct_impressions_train['Impression'][100]

In [None]:
# remove extra white space
ct_impressions_train['Impression'] = ct_impressions_train['Impression'].str.rstrip()
ct_impressions_test['Impression'] = ct_impressions_test['Impression'].str.rstrip()
ct_impressions_train['Impression'][0]

In [None]:
# remove stuff after 'Reported And Signed By:' to prevent weird model behavior
pelvis_impressions_train['Impression'] = pelvis_impressions_train['Impression'].replace(to_replace=r'Reported And Signed By:.*$', value='', regex=True)
pelvis_impressions_test['Impression'] = pelvis_impressions_test['Impression'].replace(to_replace=r'Reported And Signed By:.*$', value='', regex=True)
pelvis_impressions_train['Impression'] = pelvis_impressions_train['Impression'].replace(to_replace=r'Report Initiated By:.*$', value='', regex=True)
pelvis_impressions_test['Impression'] = pelvis_impressions_test['Impression'].replace(to_replace=r'Report Initiated By:.*$', value='', regex=True)
pelvis_impressions_train['Impression'] = pelvis_impressions_train['Impression'].replace(to_replace=r'Reported By:.*$', value='', regex=True)
pelvis_impressions_test['Impression'] = pelvis_impressions_test['Impression'].replace(to_replace=r'Reported By:.*$', value='', regex=True)
# remove extra white space
pelvis_impressions_train['Impression'] = pelvis_impressions_train['Impression'].str.rstrip()
pelvis_impressions_test['Impression'] = pelvis_impressions_test['Impression'].str.rstrip()
pelvis_impressions_train['Impression'][100]

In [None]:
mri_classification_train

In [None]:
for i in list(mri_classification_train['Impression'][0:10]):
    print(i + '\n')

In [None]:
for i in list(mri_classification_train['Narrative'][0:1000]):
    print(i+'\n')

In [None]:
mri_classification_train['Narrative'] = mri_classification_train['Narrative'].replace(to_replace=r'INDICATION.*$', value='', regex=True)
mri_classification_test['Narrative'] = mri_classification_test['Narrative'].replace(to_replace=r'INDICATION.*$', value='', regex=True)
mri_classification_train['Narrative'] = mri_classification_train['Narrative'].replace(to_replace=r'HISTORY:.*$', value='', regex=True)
mri_classification_test['Narrative'] = mri_classification_test['Narrative'].replace(to_replace=r'HISTORY:.*$', value='', regex=True)
mri_classification_train['Narrative'] = mri_classification_train['Narrative'].replace(to_replace=r'History:.*$', value='', regex=True)
mri_classification_test['Narrative'] = mri_classification_test['Narrative'].replace(to_replace=r'History:.*$', value='', regex=True)
mri_classification_train['Narrative'] = mri_classification_train['Narrative'].replace(to_replace=r'Technique:.*$', value='', regex=True)
mri_classification_test['Narrative'] = mri_classification_test['Narrative'].replace(to_replace=r'Technique:.*$', value='', regex=True)
mri_classification_train['Narrative'] = mri_classification_train['Narrative'].replace(to_replace=r'Clinical History:.*$', value='', regex=True)
mri_classification_test['Narrative'] = mri_classification_test['Narrative'].replace(to_replace=r'Clinical History:.*$', value='', regex=True)
mri_classification_train['Narrative'] = mri_classification_train['Narrative'].replace(to_replace=r'CLINICAL HISTORY:.*$', value='', regex=True)
mri_classification_test['Narrative'] = mri_classification_test['Narrative'].replace(to_replace=r'CLINICAL HISTORY:.*$', value='', regex=True)
mri_classification_train['Narrative'] = mri_classification_train['Narrative'].replace(to_replace=r'CLINICAL.*$', value='', regex=True)
mri_classification_test['Narrative'] = mri_classification_test['Narrative'].replace(to_replace=r'CLINICAL.*$', value='', regex=True)
mri_classification_train['Narrative'] = mri_classification_train['Narrative'].replace(to_replace=r'COMPARISON:.*$', value='', regex=True)
mri_classification_test['Narrative'] = mri_classification_test['Narrative'].replace(to_replace=r'COMPARISON:.*$', value='', regex=True)
mri_classification_train['Narrative'] = mri_classification_train['Narrative'].replace(to_replace=r'Clinical.*$', value='', regex=True)
mri_classification_test['Narrative'] = mri_classification_test['Narrative'].replace(to_replace=r'Clinical.*$', value='', regex=True)
mri_classification_train['Narrative'] = mri_classification_train['Narrative'].replace(to_replace=r'Date:.*$', value='', regex=True)
mri_classification_test['Narrative'] = mri_classification_test['Narrative'].replace(to_replace=r'Date:.*$', value='', regex=True)
mri_classification_train['Narrative'] = mri_classification_train['Narrative'].replace(to_replace=r'Indication.*$', value='', regex=True)
mri_classification_test['Narrative'] = mri_classification_test['Narrative'].replace(to_replace=r'Indication.*$', value='', regex=True)
mri_classification_test['Narrative'] = mri_classification_test['Narrative'].replace(r'^.*?(MR)', r'\1', regex=True)
mri_classification_train['Narrative'] = mri_classification_train['Narrative'].replace(r'^.*?(MR)', r'\1', regex=True)
mri_classification_train['Impression'] = mri_classification_train['Impression'].replace(to_replace=r'Reported And Signed By:.*$', value='', regex=True)
mri_classification_test['Impression'] = mri_classification_test['Impression'].replace(to_replace=r'Reported And Signed By:.*$', value='', regex=True)
mri_classification_train['Impression'] = mri_classification_train['Impression'].replace(to_replace=r'Report Initiated By:.*$', value='', regex=True)
mri_classification_test['Impression'] = mri_classification_test['Impression'].replace(to_replace=r'Report Initiated By:.*$', value='', regex=True)
mri_classification_train['Impression'] = mri_classification_train['Impression'].replace(to_replace=r'Reported By:.*$', value='', regex=True)
mri_classification_test['Impression'] = mri_classification_test['Impression'].replace(to_replace=r'Reported By:.*$', value='', regex=True)

In [None]:
petct_classification_train['Narrative'] = petct_classification_train['Narrative'].replace(to_replace=r'INDICATION.*$', value='', regex=True)
petct_classification_test['Narrative'] = petct_classification_test['Narrative'].replace(to_replace=r'INDICATION.*$', value='', regex=True)
petct_classification_train['Narrative'] = petct_classification_train['Narrative'].replace(to_replace=r'HISTORY:.*$', value='', regex=True)
petct_classification_test['Narrative'] = petct_classification_test['Narrative'].replace(to_replace=r'HISTORY:.*$', value='', regex=True)
petct_classification_train['Narrative'] = petct_classification_train['Narrative'].replace(to_replace=r'History:.*$', value='', regex=True)
petct_classification_test['Narrative'] = petct_classification_test['Narrative'].replace(to_replace=r'History:.*$', value='', regex=True)
petct_classification_train['Narrative'] = petct_classification_train['Narrative'].replace(to_replace=r'Technique:.*$', value='', regex=True)
petct_classification_test['Narrative'] = petct_classification_test['Narrative'].replace(to_replace=r'Technique:.*$', value='', regex=True)
petct_classification_train['Narrative'] = petct_classification_train['Narrative'].replace(to_replace=r'Clinical History:.*$', value='', regex=True)
petct_classification_test['Narrative'] = petct_classification_test['Narrative'].replace(to_replace=r'Clinical History:.*$', value='', regex=True)
petct_classification_train['Narrative'] = petct_classification_train['Narrative'].replace(to_replace=r'CLINICAL HISTORY:.*$', value='', regex=True)
petct_classification_test['Narrative'] = petct_classification_test['Narrative'].replace(to_replace=r'CLINICAL HISTORY:.*$', value='', regex=True)
petct_classification_train['Narrative'] = petct_classification_train['Narrative'].replace(to_replace=r'CLINICAL.*$', value='', regex=True)
petct_classification_test['Narrative'] = petct_classification_test['Narrative'].replace(to_replace=r'CLINICAL.*$', value='', regex=True)
petct_classification_train['Narrative'] = petct_classification_train['Narrative'].replace(to_replace=r'COMPARISON:.*$', value='', regex=True)
petct_classification_test['Narrative'] = petct_classification_test['Narrative'].replace(to_replace=r'COMPARISON:.*$', value='', regex=True)
petct_classification_train['Narrative'] = petct_classification_train['Narrative'].replace(to_replace=r'Clinical.*$', value='', regex=True)
petct_classification_test['Narrative'] = petct_classification_test['Narrative'].replace(to_replace=r'Clinical.*$', value='', regex=True)
petct_classification_train['Narrative'] = petct_classification_train['Narrative'].replace(to_replace=r'Date:.*$', value='', regex=True)
petct_classification_test['Narrative'] = petct_classification_test['Narrative'].replace(to_replace=r'Date:.*$', value='', regex=True)
petct_classification_train['Narrative'] = petct_classification_train['Narrative'].replace(to_replace=r'Indication.*$', value='', regex=True)
petct_classification_test['Narrative'] = petct_classification_test['Narrative'].replace(to_replace=r'Indication.*$', value='', regex=True)
petct_classification_train['Impression'] = petct_classification_train['Impression'].replace(to_replace=r'Reported And Signed By:.*$', value='', regex=True)
petct_classification_test['Impression'] = petct_classification_test['Impression'].replace(to_replace=r'Reported And Signed By:.*$', value='', regex=True)
petct_classification_train['Impression'] = petct_classification_train['Impression'].replace(to_replace=r'Report Initiated By:.*$', value='', regex=True)
petct_classification_test['Impression'] = petct_classification_test['Impression'].replace(to_replace=r'Report Initiated By:.*$', value='', regex=True)
petct_classification_train['Impression'] = petct_classification_train['Impression'].replace(to_replace=r'Reported By:.*$', value='', regex=True)
petct_classification_test['Impression'] = petct_classification_test['Impression'].replace(to_replace=r'Reported By:.*$', value='', regex=True)
# petct_classification_test['Narrative'] = petct_classification_test['Narrative'].replace(r'^.*?()', r'\1', regex=True)
# petct_classification_train['Narrative'] = petct_classification_train['Narrative'].replace(r'^.*?()', r'\1', regex=True)

In [None]:
for i in list(petct_classification_train['Narrative'][0:1000]):
    print(i+'\n')

## Split to validation

In [None]:
ct_impressions_val = ct_impressions_train.head(int(.1*len(ct_impressions_train)))
ct_impressions_train = ct_impressions_train.tail(int(.9*len(ct_impressions_train)))

In [None]:
pelvis_impressions_val = pelvis_impressions_train.head(int(.1*len(pelvis_impressions_train)))
pelvis_impressions_train = pelvis_impressions_train.tail(int(.9*len(pelvis_impressions_train)))

In [None]:
mri_classification_val = mri_classification_train.head(int(.1*len(mri_classification_train)))
mri_classification_train = mri_classification_train.tail(int(.9*len(mri_classification_train)))

In [None]:
petct_classification_val = petct_classification_train.head(int(.1*len(petct_classification_train)))
petct_classification_train = petct_classification_train.tail(int(.9*len(petct_classification_train)))

## Change Data Type to Huggingface Dataset
Save these to Huggingface

In [None]:
mri_classification_train_dataset = Dataset.from_pandas(mri_classification_train)
mri_classification_val_dataset = Dataset.from_pandas(mri_classification_val)
mri_classification_test_dataset = Dataset.from_pandas(mri_classification_test)
petct_classification_train_dataset = Dataset.from_pandas(petct_classification_train)
petct_classification_val_dataset = Dataset.from_pandas(petct_classification_val)
petct_classification_test_dataset = Dataset.from_pandas(petct_classification_test)

In [None]:
ct_impressions_train_dataset = Dataset.from_pandas(ct_impressions_train)
ct_impressions_val_dataset = Dataset.from_pandas(ct_impressions_val)
ct_impressions_test_dataset = Dataset.from_pandas(ct_impressions_test)
pelvis_impressions_train_dataset = Dataset.from_pandas(pelvis_impressions_train)
pelvis_impressions_val_dataset = Dataset.from_pandas(pelvis_impressions_val)
pelvis_impressions_test_dataset = Dataset.from_pandas(pelvis_impressions_test)

In [None]:
ct_impressions_train_dataset.save_to_disk('cleaned_data/ct_train')
ct_impressions_val_dataset.save_to_disk('cleaned_data/ct_val')
ct_impressions_test_dataset.save_to_disk('cleaned_data/ct_test')
pelvis_impressions_train_dataset.save_to_disk('cleaned_data/pelvis_train')
pelvis_impressions_val_dataset.save_to_disk('cleaned_data/pelvis_val')
pelvis_impressions_test_dataset.save_to_disk('cleaned_data/pelvis_test')

In [None]:
mri_classification_train_dataset.save_to_disk('cleaned_data/mri_train')
mri_classification_val_dataset.save_to_disk('cleaned_data/mri_val')
mri_classification_test_dataset.save_to_disk('cleaned_data/mri_test')
petct_classification_train_dataset.save_to_disk('cleaned_data/petct_train')
petct_classification_val_dataset.save_to_disk('cleaned_data/petct_val')
petct_classification_test_dataset.save_to_disk('cleaned_data/petct_test')

In [None]:
type(ct_impressions_train_dataset)

# Generate Prompts
Function to generate prompts (allow for a series of prompts that we can randomly sample)

In [None]:
def generate_impression_prompt(example):
    """
    params:
    example: dataset with column 'Narrative' and 'Impression'
    
    returns:
    list of prompts for each context-answer pair
    """
    output_texts = []
    for i in range(len(example['Narrative'])):
        text = f"Narrative: {example['Narrative'][i]}\n Generate a short impression.\n Impression: {example['Impression'][i]}"
        output_texts.append(text)
    return output_texts

In [None]:
mri_classification_train.columns

In [None]:
def generate_mri_classification_prompt(example):
    """
    params:
    example: dataset with column 'Impression' and 'Narrative'
    
    returns:
    list of prompts for each context-answer pair
    """
    output_texts = []
    for i in range(len(example['Impression'])):
        text = f"Impression: {example['Impression'][i]}\n Given this impression, what MRI should we use?\n Answer: {example['Narrative'][i]}"
        output_texts.append(text)
    return output_texts

In [None]:
def generate_petct_classification_prompt(example):
    """
    params:
    example: dataset with column 'Impression' and 'Narrative'
    
    returns:
    list of prompts for each context-answer pair
    """
    output_texts = []
    for i in range(len(example['Impression'])):
        text = f"Impression: {example['Impression'][i]}\n Given this impression, what PET/CT scan should we use?\n Answer: {example['Narrative'][i]}"
        output_texts.append(text)
    return output_texts

# Fine-tune Set Up
Function where inputs are different hyperparameters that can be used.

In [None]:
class EosTokenRewardLogitProcess(LogitsProcessor):
    # class to get the model to generate EOS token more often as sentence nears max_length
    def __init__(self, eos_token_id: int, max_length: int):
        if not isinstance(eos_token_id, int) or eos_token_id < 0:
            raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}")

        if not isinstance(max_length, int) or max_length < 1:
            raise ValueError(f"`max_length` has to be a integer bigger than 1, but is {max_length}")

        self.eos_token_id = eos_token_id
        self.max_length=max_length

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        cur_len = input_ids.shape[-1]
        # start to increese the reward of the  eos_tokekn from 70% max length  progressively on length
        for cur_len in (max(0,int(self.max_length*0.7)), self.max_length ):
            ratio = cur_len/self.max_length
            num_tokens = scores.shape[1] # size of vocab
            scores[:, [i for i in range(num_tokens) if i != self.eos_token_id]] =\
            scores[:, [i for i in range(num_tokens) if i != self.eos_token_id]]*ratio*10*torch.exp(-torch.sign(scores[:, [i for i in range(num_tokens) if i != self.eos_token_id]]))
            scores[:, self.eos_token_id] = 1.1e2 * ratio
        return scores

In [None]:
tokenizer.eos_token_id

In [None]:
def get_impressions(contexts, model, tokenizer, max_response_length, device_map):
    impressions = []
    for c in tqdm(contexts):
        prompt = f"Narrative: {c}\n Generate a short impression.\n Impression: "
#         max_length = len(prompt) + max_response_length
        logits_process_list= LogitsProcessorList([EosTokenRewardLogitProcess(eos_token_id=tokenizer.eos_token_id, max_length=max_response_length)])
        # add some postprocessor
        pipe = pipeline(
            task="text-generation", 
            model=model, 
            tokenizer=tokenizer, 
            logits_processor=logits_process_list, 
            max_new_tokens=max_response_length, 
            return_full_text=False, 
            temperature=.1, 
            device_map=device_map)
        result = pipe(prompt)
        impressions.append(result[0]['generated_text'])
    return impressions

In [None]:
def get_classification(contexts, model, tokenizer, max_response_length, device_map, classification_type):
    classifications = []
    for c in tqdm(contexts):
        prompt = f"Impression: {c}\n Given this impression, what {classification_type} should we use?\n Answer: "
#         max_length = len(prompt) + max_response_length
        logits_process_list= LogitsProcessorList([EosTokenRewardLogitProcess(eos_token_id=tokenizer.eos_token_id, max_length=max_response_length)])
        # add some postprocessor
        pipe = pipeline(
            task="text-generation", 
            model=model, 
            tokenizer=tokenizer, 
            logits_processor=logits_process_list, 
            max_new_tokens=max_response_length, 
            return_full_text=False, 
            temperature=.1, 
            device_map=device_map)
        result = pipe(prompt)
        classifications.append(result[0]['generated_text'])
    return classifications

In [7]:
def finetune(model, train_dataset, eval_dataset, peft_config, max_seq_length, tokenizer, training_arguments, packing, formatting_func, new_model, compute_metrics, preprocess_logits_for_metrics):
    # Set supervised fine-tuning parameters
    # add validation set to model
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=peft_config,
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        args=training_arguments,
        packing=packing,
        formatting_func=formatting_func,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )

    # Train model
    trainer.train()

    # Save trained model
    trainer.model.save_pretrained(new_model)

# Evaluate Model
Functions to evaluate the model

In [8]:
def get_rouge_scores(predictions, references):
    '''
    predictions: list of model predictions
    references: corresponding list of test summaries
    
    returns: dictionary of rouge scores
    '''
    rouge = load_metric("rouge")

    # process text to make it compatible with rouge
    predictions = [" ".join(pred.strip().split()) for pred in predictions]
    references = [" ".join(ref.strip().split()) for ref in references]
    predictions = ["\n".join(nltk.sent_tokenize(pred)) for pred in predictions]
    references = ["\n".join(nltk.sent_tokenize(ref)) for ref in references]

    # compute rouge scores
    results = rouge.compute(
        predictions=predictions,
        references=references,
        use_stemmer=True,
        use_aggregator=False,
    )
    for key, value in results.items():
        results[key] = {
            "precision": [score.precision * 100 for score in value],
            "recall": [score.recall * 100 for score in value],
            "fmeasure": [score.fmeasure * 100 for score in value],
            "fmeasure_mean": np.mean([score.fmeasure for score in value]) * 100,
        }
    # Compute the arithmetic mean of ROUGE-1, ROUGE-2 and ROUGE-L following: https://arxiv.org/abs/2110.08499
    if all(rouge_type in results for rouge_type in ["rouge1", "rouge2", "rougeL"]):
        results["rouge_avg_fmeasure"] = np.mean(
            [results[key]["fmeasure"] for key in ["rouge1", "rouge2", "rougeL"]], axis=0
        ).tolist()
        results["rouge_avg_fmeasure_mean"] = np.mean(results["rouge_avg_fmeasure"]).item()
    else:
        warnings.warn(
            "ROUGE-1, ROUGE-2 and ROUGE-L are not all present in the results. Skipping the computation of ROUGE-AVG."
        )

    return results
    pass

In [9]:
def get_bertscore(predictions, references):
    '''
    predictions: list of model predictions
    references: corresponding list of test summaries
    
    returns: dictionary of bert scores
    '''
    bertscore = load_metric("bertscore")

    predictions = [" ".join(pred.strip().split()) for pred in predictions]
    references = [" ".join(ref.strip().split()) for ref in references]
    predictions = ["\n".join(nltk.sent_tokenize(pred)) for pred in predictions]
    references = ["\n".join(nltk.sent_tokenize(ref)) for ref in references]

    # Compute and post-process bertscore results
    results = bertscore.compute(
        predictions=predictions,
        references=references,
        # These are mostly based on the recommendations in https://github.com/Tiiiger/bert_score
        model_type=BERTSCORE_MODEL_TYPE,
        lang="en",
        rescale_with_baseline=True,
        use_fast_tokenizer=True,
    )
    results["f1_mean"] = np.mean(results["f1"])
    for key, value in results.items():
        if key == "hashcode":
            continue
        if isinstance(value, list):
            results[key] = [score * 100 for score in value]
        else:
            results[key] = value * 100

    return results

In [10]:
def compare_lengths(predictions, references):
    pred_length = sum(len(pred) for pred in predictions)/len(predictions)
    ref_length = sum(len(ref) for ref in references)/len(predictions)
    return {'prediction': pred_length, 'reference': ref_length}

In [11]:
def test_hallucination(nlp, predictions):
    # returns percent of entities in generated impression not found in findings
    unknown_words = {}
    unknown_pcts = {}
    total_unknown_pcts = 0
    for i,pred in enumerate(predictions):
        doc = nlp(pred)
        unknown_words[i] = doc.ents
        unknown_pct = len(doc.ents)/(len(set(pred.split()))+.0000000001)
        unknown_pcts[i] = unknown_pct
        total_unknown_pcts+= unknown_pct
    avg_unknown_pct = total_unknown_pcts/(len(predictions)+.0000000001)
    return unknown_words, unknown_pcts, avg_unknown_pct

In [12]:
metric = evaluate.load("rouge")

def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # Replace -100s used for padding as we can't decode them
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    # Otherwise rouge scores were a bit inflated
    decoded_pred_impressions = [s.split('MR:', 1)[-1].strip() if 'MR:' in s else s for s in decoded_preds]
    decoded_label_impressions = [s.split('MR:', 1)[-1].strip() if 'MR:' in s else s for s in decoded_labels]
    print("decoded_preds:----------------------\n", decoded_pred_impressions[0:5])
    print("decoded_labels:---------------------\n", decoded_label_impressions[0:5])

    result = metric.compute(predictions=decoded_pred_impressions, references=decoded_label_impressions, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["pred_len"] = np.mean(prediction_lens)
    reference_lens = [np.count_nonzero(label != tokenizer.pad_token_id) for label in labels]
    result['ref_len'] = np.mean(reference_lens)
    return result

# Test base model

## Chest Impressions

In [39]:
ct_impressions_test_dataset

Dataset({
    features: ['Narrative', 'Impression'],
    num_rows: 3000
})

In [140]:
references = ct_impressions_test_dataset['Impression'][:100]
contexts = ct_impressions_test_dataset['Narrative'][:100]

In [57]:
max([len(ref) for ref in references])

270

In [58]:
max([len(c) for c in contexts])

1762

In [160]:
predictions = get_impressions(contexts, model, tokenizer, 120, device_map)

100%|██████████████████████████████████████████████████████████| 3000/3000 [7:17:10<00:00,  8.74s/it]


In [177]:
rouge_scores = get_rouge_scores(predictions, references)
rouge_scores.keys

<function dict.keys>

In [178]:
rouge_scores.keys()

dict_keys(['rouge1', 'rouge2', 'rougeL', 'rougeLsum', 'rouge_avg_fmeasure', 'rouge_avg_fmeasure_mean'])

In [181]:
rouge_scores['rouge1']['fmeasure_mean']

25.04508696836646

In [182]:
rouge_scores['rouge2']['fmeasure_mean']

10.17241478231588

In [183]:
rouge_scores['rougeL']['fmeasure_mean']

17.47149724947418

In [189]:
bert_scores = get_bertscore(predictions, references)
bert_scores

(…)-mnli/resolve/main/tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

(…)rta-xlarge-mnli/resolve/main/config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

(…)erta-xlarge-mnli/resolve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

(…)erta-xlarge-mnli/resolve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.04G [00:00<?, ?B/s]



{'precision': [24.834538996219635,
  37.750157713890076,
  22.40828573703766,
  28.471028804779053,
  14.707496762275696,
  4.699182882905006,
  13.288217782974243,
  12.672558426856995,
  8.33568349480629,
  3.084900416433811,
  17.14048683643341,
  16.291871666908264,
  14.699624478816986,
  29.515105485916138,
  -22.436799108982086,
  25.502660870552063,
  -3.4838534891605377,
  27.134808897972107,
  20.949043333530426,
  16.653822362422943,
  17.653679847717285,
  18.774043023586273,
  40.35017192363739,
  24.729283154010773,
  5.836363136768341,
  22.554269433021545,
  23.877570033073425,
  -106.99931383132935,
  17.855456471443176,
  33.02275836467743,
  19.930902123451233,
  25.972729921340942,
  -2.016443759202957,
  -19.363027811050415,
  -1.03784566745162,
  26.41589045524597,
  2.899186871945858,
  11.353661864995956,
  18.192262947559357,
  26.33955478668213,
  14.330875873565674,
  19.41608190536499,
  22.415775060653687,
  27.20629572868347,
  2.405390702188015,
  23.8667

In [191]:
bert_scores.keys()

dict_keys(['precision', 'recall', 'f1', 'hashcode', 'f1_mean'])

In [192]:
bert_scores['f1_mean']

16.271674651204375

In [190]:
avg_response_lengths = compare_lengths(predictions, references)
avg_response_lengths

{'prediction': 410.25333333333333, 'reference': 290.08666666666664}

In [195]:
contexts[0]

'CT Chest without IV contrast  Clinical information: Hemoptysis, allergic rhinitis  Comparison: 9/18/2006  The study was performed without intravenous contrast.  FINDINGS:  LUNGS AND AIRWAYS: The central airway is patent. Imaging of the lung parenchyma is mildly degraded by respiratory motion artifact. There are no suspicious lung nodules. There are no focal consolidations. There is a subtle mosaic pattern of the lung parenchyma in the lower lobes which could represent some air trapping.  PLEURA: The pleural surfaces are normal and there are no effusions.  LYMPH NODES/MEDIASTINUM: No mediastinal, hilar or axillary adenopathy. The esophagus is normal in appearance.  CHEST WALL: Negative   THYROID AND LOWER NECK: Negative  CARDIOVASCULAR: There is no significant enlargement of the heart, thoracic aorta or central pulmonary arteries. There is mild coronary artery calcification. There is no pericardial effusion.   UPPER ABDOMEN: Limited evaluation of the upper abdomen demonstrate a markedl

In [193]:
predictions[0]

" The patient has a mild mosaic pattern of the lung parenchyma in the lower lobes, which could represent some air trapping. There is no evidence of any significant lung disease or pathology. The liver and spleen are enlarged, consistent with cirrhosis. There is no evidence of any cardiovascular disease or pericardial effusion. The patient's allergic rhinitis and hemoptysis are likely related to their history of smoking. \n\n"

In [194]:
references[0]

' 1. Possible mild air trapping in the lower lobes, otherwise unremarkable noncontrast CT examination of the chest.  2. Markedly cirrhotic liver and splenomegaly which have developed since 2006.'

In [154]:
base_predictions = get_impressions(contexts, base_model, tokenizer, 200, device_map)

100%|██████████████████████████████████████████████████████████████| 100/100 [04:40<00:00,  2.81s/it]


In [161]:
base_predictions[2]

' Stable majority of the bilateral scattered lung nodules with interval slight enlargement of some of the nodules. No new or enlarging thoracic adenopathy.  Stable sclerotic focus in the lateral left fifth rib'

In [157]:
base_spacy_scores = test_hallucination(nlp, base_predictions)

In [158]:
base_spacy_scores[0][0]

(evidence, intrathoracic disease)

In [159]:
base_spacy_scores[1][0]

0.2857142857142857

In [160]:
base_spacy_scores[2]

0.4168356347983479

# Fine-tune on Chest Impressions

## Fine-tune and save model

In [50]:
ct_impressions_train

Unnamed: 0,Narrative,Impression
27643,CT ED CHEST ABDOMEN PELVIS W IV CONTRAST INDI...,IMPRESSION: No evidence of acute traumatic in...
27644,"CT CHEST, ABDOMEN, AND PELVIS WITH IV CONTRAST...",IMPRESSION: Interval progression of metastati...
27645,CT CHEST ABDOMEN PELVIS W IV CONTRAST CLINICA...,IMPRESSION: Interval increase in size of some...
27646,CT CHEST ABDOMEN PELVIS WO IV CONTRAST Histor...,IMPRESSION: 1. No evidence of retroperitoneal...
27647,CTA CHEST (PE) W IV CONTRAST INDICATION: Abdo...,Similar marked narrowing of the right upper l...
...,...,...
276417,Study: CT CHEST W IV CONTRAST Date: 3/1/2017...,IMPRESSION: 1. Decrease in size of the residua...
276418,CT CHEST W IV CONTRAST Date: 12/28/2020 8:30 ...,IMPRESSION: Slight retraction of the treated ...
276419,CTA CHEST ABDOMEN PELVIS W AND/OR WO IV CONTRA...,"IMPRESSION: TAVR measurements, as above. Fin..."
276420,CT CHEST WO IV CONTRAST Date: 2/27/2021 11:42 ...,IMPRESSION: Status post heart transplant with...


In [51]:
ct_impressions_train_dataset['Impression'][5]

'IMPRESSION:   No aortic dissection or other acute abnormality of the chest or abdomen.   Stable 4.1 cm ascending thoracic aortic aneurysm.'

In [104]:
sample_dataset = ct_impressions_train_dataset.shuffle(seed=42)
sample_dataset = sample_dataset.select(range(10000))
sample_dataset
sample_val = ct_impressions_val_dataset.shuffle(seed=42)
sample_val = sample_val.select(range(1500))

In [53]:
# plot loss, every 100-200 steps, run validation, and plot loss/rouge scores at those validation check points

In [56]:
sample_val['Narrative'][3]

"Study: CT CHEST WO IV CONTRAST  Date:  1/19/2018 3:02 PM  History/Indication:  lung nodule, f/u.  Technique:  A volumetric CT acquisition of the chest was obtained from the thoracic inlet to the upper abdomen without intravenous administration of contrast. Coronal and sagittal reconstructed images are provided.  Comparison: Chest CT from 9/13/2017 and PET CT from 9/29/2017. Chest CT from 4/11/2017 was also reviewed.  FINDINGS:  Lungs/Airways/Pleura: Again seen is pulmonary emphysema predominating the bilateral upper lobes. The subsolid ill-defined nodule in the right lower lobe has not significantly changed in size when measured in a similar fashion (measuring 18 x 12 mm on image 215 of series 3), but it has decreased in density (better appreciated comparing image 55 of series 2 on today's exam with image 51 of series 3 on the most recent prior CT). Internal lucencies are again seen within this nodule, slightly increased from prior exams. Several smaller than 4 mm scattered bilateral 

In [55]:
sample_val['Impression'][3]

'IMPRESSION: Interval decrease in density of the subsolid right lower lobe nodule, which is overall not significantly changed in size from recent prior CT (when measured in a similar fashion) although still larger and denser than on the CT from April 2017. Apparent interval decrease in density may be related to different inspiratory effort, particularly given the basilar location of this nodule, or it may represent partial resolution of an infectious/inflammatory nodule. Therefore, a follow-up chest CT is recommended in 3 months to reassess.'

In [57]:
finetune(model, 
         sample_dataset, #ct_impressions_train_dataset, 
         sample_val,
         peft_config, 
         640, # different than inference max length since it counts full text
         tokenizer, 
         training_arguments, 
         packing, 
         generate_impression_prompt, 
         new_model,
         compute_metrics,
         preprocess_logits_for_metrics)



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Pred Len,Ref Len
500,0.9353,1.111371,58.0309,33.4824,55.6483,57.1714,634.928,511.076
1000,0.8599,1.013307,59.7372,36.0709,57.5479,58.9631,634.928,511.076
1500,0.8182,0.954402,61.3904,37.6021,59.2857,60.6262,634.928,511.076
2000,0.8215,0.919627,63.3375,39.6388,61.2398,62.5341,634.928,511.076
2500,0.8145,0.89127,64.0162,40.1934,62.033,63.2521,634.928,511.076
3000,0.7625,0.877381,64.9675,41.3586,62.9673,64.2121,634.928,511.076
3500,0.7162,0.860353,65.2394,41.7338,63.2758,64.5007,634.928,511.076
4000,0.7202,0.850163,65.5223,42.1809,63.5313,64.7438,634.928,511.076
4500,0.7823,0.834662,65.4762,42.0778,63.5362,64.7589,634.928,511.076
5000,0.6896,0.82564,65.226,42.1165,63.3282,64.5288,634.928,511.076


decoded_preds:----------------------
 ['IMPRESSION:  evidence of pulmonary embolism.\nNo-119 relateding protocol: 1eterminate  COpression classification that: Im be class on pulPEVID-19) neumonia.\nbut the notonspecific and can be with other variety of otherectious processes inflinfectious conditions.\n2linid]]et  \n example information on the classification classification termin, see the https://wwwctor.org/10.1038/rydb.2020.00033  examsssssssssssssssssssssss   ssss                        7777', 'IMPRESSION: able exam.. evidence of disease prourrence.\ns                                                                            n n n n n n n n n n n n n', 'Ipression:  No significant change.\nthe lung identified retstitial lung disease.dominantly inoneycombing and.\nun seen.\nfindings are un likely related with idP.IPF.\nNoviously identified ple indeterminate nod uppersided plemonary nodule is which well above, is unchanged.\nss', 'IMPRESSION: val stable in size of the subsolid ill low

decoded_preds:----------------------
 ['IMPRESSION:  evidence of pulmonary embolism.\nSc Sc-119 ping find:  Loweterminate.\nScaging find:: Sc be found in COVID1VID-19) neumonia.\npul the notonspecific.\nmay be with other variety of otherectious processes inflinfectious conditions.\nThelinid9]et   RE more information, the classification classification system, see the https://wwwi.org/10.1008/rycd.2020100100  CHssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss5ssssssssssssssssssssssssssssss', 'IMPRESSION: able examination.\nevidence of rec recurrence.\nssssssssssssssssssssss55555555555555555555555555555555555555555555555555555555555555ssssssssssssssssss', 'Impression:  St significant change in the appearance identified interstitial lung disease.dominantly inoneycombing and in seen seen.\nfindings are consistent consistent related with UIP.UIF.\nPreviously identified ple indeterminate nod uppersided plemonary nodule is is described,, is unchanged.\nssssssssssssssssssssss55   

decoded_preds:----------------------
 ['IMPRESSION: No evidence of pulmonary embolism.\nSc-119 ping find:  Loweterminate for COpression find of: Sc be found on COVID1VID-19) inneumonia.\nor the notonspecific.\nmay be with other variety of otherectious processes inflinfectious et.\nThelinid9]et   A more information please the classification, scheme, see the https://wwwi.org/10.1038/ryct.2020.00103  REssssssssssss000005555555555555555555555555555555555555555555555555555550', 'IMPRESSION: able examination.\nevidence of rec recurrence.\nss5ssss555555555555555555555555555555555555555555555555555555555555555555', 'Impression:  St significant change in the CT identified findstitial lung disease.dominantly inoneycombing and.\nseen identified.\nfindings are consistent consistent related with UIP patternIPF.\nStviously identified ind indeterminate nod uppersided plemonary nodule is is described,, is unchanged.\nssssssss5555555555555555555555555555555555555555555555555555555555555555555555555555'

decoded_preds:----------------------
 ['IMPRESSION: No evidence of pulmonary embolism.\nSc Sc-119 ping find: 0eterminate for COpression find of in * be found in COVIDCOVID-19) pneumonia.\nincluding the notonspecific.\nmay be with other variety of otherectious processes inflinfectious et.\nCOovid9_et   The more information on the classification classification framework, see the https://wwwi.org/10.1048/ryct.2020200103  RE00000000000000000000000055550000555555000000000000000000000000000000000', 'IMPRESSION: Stable examination.\nevidence of rec recurrence.\nssssssssss0000000000000000000000000000055555000000000000000000055555555555555555', 'Impression:  St significant change in the find identified findstitial lung disease.dominantly inoneycombing and and identified identified.\nfindings are consistent consistent related with UIP.IPF.\nPreviously identified ind indeterminate nod uppersided plemonary nodule is meas described,, is againchanged.\nssssss00000000000000000000000000005555555555555

decoded_preds:----------------------
 ['IMPRESSION: No evidence of pulmonary embolism.\nSc-119 ping find: 0eterminate for Findaging find of in Sc be found in COVIDCOVID-19) pneumonia.\nsee the notonspecific.\nmay be with other variety of otherectious processes inflinfectious processes.\nCOovid9ICet   Find more information on the classification reporting framework, please: https://doi.org/10.1148/ryct.2020200131   RE000000000000000000000000000000000000000000000000000000000000000000', 'IMPRESSION: Noable examination.\nevidence of rec recurrence.\nss555000000000000000000000000005555555000000000000000005555555555555', 'Impression:  St significant change in the find identified findstitial lung disease.dominantly inoneycombing and, seen identified.\nfindings are consistent likely related with UIP patternIPF.\nPreviously identified ple indeterminate nod uppersided plemonary nodule is is described above, is unchanged.\nsss00000000000000000000,,00000000000000000000000000000000000000000000000000

decoded_preds:----------------------
 ['IMPRESSION: No evidence of pulmonary embolism.\nSc-119 ping find: 0eterminate  Foraging find of in CO be found in COVIDCOVID-19) pneumonia.\nsee the notonspecific.\nmay be with other variety of otherectious processes inflinfectious et.\nCOovid9ICet   A more information on the clin reporting framework, see: https://doi.org/10.1148/ryct.2020200131   RE000000000000000000000000000000000000000000000000000000000000000000', 'IMPRESSION: Stable examination.\nevidence of rec recurrence.\nsss000000000000000000000000000000555550000000000000000000055555555555', 'Impression:  St significant change in the previously identified findstitial lung disease.dominantly inoneycombing and and identified identified.\nfindings are consistent likely related with UIP.IPF.\nPreviously identified ind indeterminate nod uppersided plemonary nodule is is described above, is unchanged.\nssss000000000000000000000,00000000000000000000050000000000000000000000000000000000000', 'IMPR

decoded_preds:----------------------
 ['IMPRESSION: No evidence of pulmonary embolism.\nSc-119 ping find:  "eterminate  Foraging find of in CO be found in COVIDCOVID-19) pneumonia.\nsee the notonspecific.\nmay be with other variety of otherectious processes inflinfectious et.\nCOovid9ICet   A more information on the new reporting framework, see: https://doi.org/10.1148/ryct.2020200131   RE000000000000000000000000000000000000000000000000000000000000000000', 'IMPRESSION: Noable examination.\nevidence of rec recurrence.\nsss000000000000000000000000000000555550000000000000000000055555555555', 'Impression:  St significant change in the previously identified findstitial lung disease.dominantly inoneycombing and and identified identified.\nfindings are compatible likely related with UIP.IPF.\nPreviously identified ind indeterminate nod uppersided plemonary nodule is is described above, is unchanged.\nssss00000000000000000000,,,000000000000000000005000000000000000000000000000000000000', 'IMPRE

In [58]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [59]:
!huggingface-cli login --token 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

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/imx2/.cache/huggingface/token
Login successful


pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/imxx/llama-2-7b-radnlpv2/commit/59e65475368947d125c08fbed230f08087be8c29', commit_message='Upload tokenizer', commit_description='', oid='59e65475368947d125c08fbed230f08087be8c29', pr_url=None, pr_revision=None, pr_num=None)

## Evaluate model

In [84]:
ct_references = ct_impressions_test_dataset['Impression'][:1000]
ct_contexts = ct_impressions_test_dataset['Narrative'][:1000]

In [61]:
ct_predictions = get_impressions(ct_contexts, model, tokenizer, 128, device_map)

100%|████████████████████████████████████████████████████████████████████| 1000/1000 [1:49:12<00:00,  6.55s/it]


In [62]:
ct_contexts[3]

'CT CHEST W IV CONTRAST  Date: 4/1/2022 11:51 AM  INDICATION: Hematologic malignancy, monitor. Additional history per electronic medical record: T-cell lymphoma, status post prior stem cell transplant.  COMPARISON: 10/23/2021  TECHNIQUE:  A volumetric CT acquisition of the chest was obtained from the thoracic inlet to the upper abdomen, following the administration of intravenous contrast. Coronal and sagittal reconstructed images are provided.  FINDINGS:  Lungs/Airways/Pleura : There is a new 1 cm groundglass nodule in the paramediastinal left lower lobe (image 365 of series 6). This is nonspecific, could be infectious or inflammatory in etiology, follow-up is recommended to document clearing and to exclude other etiologies. A few scattered less than 4 mm solid lung nodules are unchanged and are also stable when compared with 11/2018, likely benign. Calcified granulomas present.  Mild, multifocal bronchiectasis again noted. Mosaic lung attenuation compatible with air trapping as seen 

In [63]:
ct_predictions[0]

" IMPRESSION:  1. No evidence of lung nodules or other suspicious findings to account for the patient's hemoptysis.  2. Cirrhotic liver and splenomegaly, both of which have developed since the prior examination.  The critical findings described above were communicated via the Veriphy Critical Results Reporting System as Green (verbal communication) to Dr."

In [64]:
ct_references[3]

'1. No lymphadenopathy. 2. New groundglass nodule in the left lower lobe, nonspecific but possibly infectious or inflammatory in etiology. Recommend follow-up chest CT in 2-3 months to further assess. 3. Bronchiectasis and mosaic lung attenuation again seen likely due to air trapping. 4. Stable basilar predominant fibrosis but which is new compared with older prior studies.  An alert has been created in the YNHHS Radiology Notify Critical Results system.'

In [65]:
for i in ct_predictions[:10]:
    print(i)
    print('\n\n')

 IMPRESSION:  1. No evidence of lung nodules or other suspicious findings to account for the patient's hemoptysis.  2. Cirrhotic liver and splenomegaly, both of which have developed since the prior examination.  The critical findings described above were communicated via the Veriphy Critical Results Reporting System as Green (verbal communication) to Dr.



 IMPRESSION: 1. Stable postoperative right middle lobe and stable bilateral scattered nonspecific pulmonary nodules. No new lung nodules.  2. Further decrease in size of the residual linear opacity in the left lower lobe.  3. No mediastinal or hilar lymphadenopathy by CT size criteria.  4. Stable bony metastasis.  A Yellow message has been communicated to this patient's provider via the PowerScribe 360 Critical Results application on 4/7/20



 Stable majority of the bilateral scattered lung nodules with interval slight enlargement of some of the nodules. No new or enlarging thoracic adenopathy.  Stable ectasia of the ascending aort

In [66]:
ct_rouge_scores = get_rouge_scores(ct_predictions, ct_references)
ct_rouge_scores['rouge1']['fmeasure_mean']

  rouge = load_metric("rouge")


34.074458442287025

In [67]:
ct_rouge_scores['rouge2']['fmeasure_mean']

18.68854533437709

In [68]:
ct_rouge_scores['rougeL']['fmeasure_mean']

27.073281301159614

In [69]:
ct_bert_scores = get_bertscore(ct_predictions, ct_references)
ct_bert_scores['f1_mean']

27.10916182794608

In [70]:
ct_avg_response_lengths = compare_lengths(ct_predictions, ct_references)
ct_avg_response_lengths

{'prediction': 391.64, 'reference': 270.149}

In [71]:
ct_spacy_scores = test_hallucination(nlp, ct_predictions)

In [72]:
ct_spacy_scores[0][10]

(evidence,
 interstitial lung disease,
 Moderate,
 cardiomegaly,
 left-sided enlargement,
 Cholelithiasis,
 CT,
 evidence,
 acute cholecystitis,
 Vertebroplasty,
 extension,
 dense material,
 ventral epidural space,
 Proliferation,
 extrapleural fat,
 mediastinal fat,
 retroperitoneal fat)

In [73]:
ct_spacy_scores[1][10]

0.3695652173913043

In [74]:
ct_spacy_scores[2]

0.44160241223176505

# Radqa

In [3]:
import sklearn

In [2]:
%pip install scikit-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/3f/48/6fdd99f5717045f9984616b5c2ec683d6286d30c0ac234563062132b83ab/scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Obtaining dependency information for threadpoolctl>=2.0.0 from https://files.pythonhosted.org/packages/81/12/fd4dea011af9d69e1cad05c75f3f7202cdcbeac9b712eea58ca779a72865/threadpoolctl-3.2.0-py3-none-any.whl.metadata
  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Downloading scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m77.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m36m0:00:01[0m
[?25hUsing ca

In [27]:
radqa_train_df = pd.read_json('radqa/train.json')
radqa_val_df = pd.read_json('radqa/dev.json')
radqa_test_df = pd.read_json('radqa/test.json')
radqa_train_df

Unnamed: 0,data,version
0,{'paragraphs': [{'qas': [{'question': 'Is ther...,full
1,{'paragraphs': [{'qas': [{'question': 'Does th...,full
2,{'paragraphs': [{'qas': [{'question': 'Is the ...,full
3,{'paragraphs': [{'qas': [{'question': 'Is an a...,full
4,{'paragraphs': [{'qas': [{'question': 'Is ther...,full
...,...,...
798,{'paragraphs': [{'qas': [{'question': 'Is ther...,full
799,{'paragraphs': [{'qas': [{'question': 'Is ther...,full
800,{'paragraphs': [{'qas': [{'question': 'Are the...,full
801,{'paragraphs': [{'qas': [{'question': 'Was the...,full


In [28]:
radqa_train_df['data'][0]

{'paragraphs': [{'qas': [{'question': 'Is there any significant change in bleeding?',
     'id': '796653_2_1_I',
     'answers': [],
     'is_impossible': True},
    {'question': 'Did the bleeding in the sub-dural space resolve?',
     'id': '796653_1_2_I',
     'answers': [{'answer_id': '796653_1_2_I_MG',
       'text': 'Subdural hematomas with blood products of different ages',
       'answer_start': 13}],
     'is_impossible': False},
    {'question': 'Is there any additional bleeding in the sub-arachanoid space?',
     'id': '796653_1_1_I',
     'answers': [],
     'is_impossible': True}],
   'context': 'IMPRESSION:  Subdural hematomas with blood products of different ages.\n Question vescular abnormality in left suprasellar space.  Findings were\n discussed with Dr. [**Last Name (STitle) 8620**] at 9:25 am on [**2191-8-5**].  An MRI of the brain and MRA\n of the COW is recommended.',
   'document_id': '796653_I'},
  {'qas': [{'question': 'Is there any additional bleeding in the su

In [29]:
radqa_train_df['version'].unique()

array(['full'], dtype=object)

In [30]:
radqa_train_df

Unnamed: 0,data,version
0,{'paragraphs': [{'qas': [{'question': 'Is ther...,full
1,{'paragraphs': [{'qas': [{'question': 'Does th...,full
2,{'paragraphs': [{'qas': [{'question': 'Is the ...,full
3,{'paragraphs': [{'qas': [{'question': 'Is an a...,full
4,{'paragraphs': [{'qas': [{'question': 'Is ther...,full
...,...,...
798,{'paragraphs': [{'qas': [{'question': 'Is ther...,full
799,{'paragraphs': [{'qas': [{'question': 'Is ther...,full
800,{'paragraphs': [{'qas': [{'question': 'Are the...,full
801,{'paragraphs': [{'qas': [{'question': 'Was the...,full


In [31]:
def clean_radqa_df(df):
    data_column = df['data']
    normalized_data = pd.json_normalize(data_column, 'paragraphs', ['title'])
    df_expanded = normalized_data.explode('qas').reset_index(drop=True)
    df_expanded_qas = pd.json_normalize(df_expanded['qas']).add_prefix('qas.')
    result_df = pd.concat([df_expanded, df_expanded_qas], axis=1)
    result_df['answer_text'] = result_df['qas.answers'].apply(lambda x: x[0]['text'] if x else None)
    result_df = result_df.drop(columns=['qas', 'qas.answers'])
    result_df['qas.answer'] = np.where(result_df['answer_text'],  result_df['answer_text'], 'Not in context.')
#     result_df['qas.adjusted_answers'] = np.where(result_df['qas.is_impossible'] == True, 'Is impossible.', result_df['qas.answers'])
    return result_df[['context', 'qas.question', 'qas.answer']]

In [32]:
radqa_clean_train_df = clean_radqa_df(radqa_train_df)
radqa_clean_val_df = clean_radqa_df(radqa_val_df)
radqa_clean_test_df = clean_radqa_df(radqa_test_df)
radqa_clean_train_df

Unnamed: 0,context,qas.question,qas.answer
0,IMPRESSION: Subdural hematomas with blood pro...,Is there any significant change in bleeding?,Not in context.
1,IMPRESSION: Subdural hematomas with blood pro...,Did the bleeding in the sub-dural space resolve?,Subdural hematomas with blood products of diff...
2,IMPRESSION: Subdural hematomas with blood pro...,Is there any additional bleeding in the sub-ar...,Not in context.
3,WET READ: MES FRI [**2191-8-5**] 1:40 AM\n no...,Is there any additional bleeding in the sub-ar...,Not in context.
4,WET READ: MES FRI [**2191-8-5**] 1:40 AM\n no...,Did the bleeding in the sub-dural space resolve?,mixed density subdural hematomas seen along bo...
...,...,...,...
4873,IMPRESSION: Successful decompression of pneum...,Does the CT show any decrease in pneumo perito...,Successful decompression of pneumoperitoneum
4874,IMPRESSION: Successful decompression of pneum...,Was the percutaneous drainage successful?,Successful decompression of pneumoperitoneum
4875,FINAL REPORT\n INDICATION: 83 year old male s...,Was the percutaneous drainage successful?,significant\n reduction in the amount of free air
4876,FINAL REPORT\n INDICATION: 83 year old male s...,Was percutaneous drainage of pneumoperitoneum ...,significant\n reduction in the amount of free ...


In [33]:
radqa_clean_train_df['context'][3]

'WET READ: MES FRI [**2191-8-5**] 1:40 AM\n  no significant change in hemorrhage\n ______________________________________________________________________________\n                                 FINAL REPORT\n INDICATION: known subarachnoid subdural hemorrhage from outside hospital.\n Evaluate for any change.\n\n TECHNIQUE: Noncontrast head CT.\n\n COMPARISON: (CT done several hours earlier at [**Hospital 539**] Hospital).  At the time\n of attending review, the prior exam is not available for comparison.\n\n FINDINGS: There has been no significant change in the interval. There is an\n area of hyperdensity along the left anterior clinoid and in the adjacent\n suprasellar space, which may be an aneurysm or small collection of blood, or a\n dense mass.\n There are mixed density subdural hematomas seen along both cerebral\n convexities, slightly larger on the left (approx 8-9mm) than on the right.\n There is acute blood in the dependent parts of the subdural collections. There\n is flatt

In [34]:
radqa_clean_train_df['qas.question'][3]

'Is there any additional bleeding in the sub-arachanoid space?'

In [35]:
radqa_clean_train_df['qas.answer'][3]

'Not in context.'

In [36]:
radqa_train_dataset = Dataset.from_pandas(radqa_clean_train_df)
radqa_val_dataset = Dataset.from_pandas(radqa_clean_val_df)
radqa_test_dataset = Dataset.from_pandas(radqa_clean_test_df)

In [37]:
radqa_train_dataset

Dataset({
    features: ['context', 'qas.question', 'qas.answer'],
    num_rows: 4878
})

In [38]:
def generate_rad_prompt(example):
    """
    params:
    example: dataset with column 'narrative' and 'impression'
    
    returns:
    list of prompts for each context-answer pair
    """
    output_texts = []
    for i in range(len(example['context'])):
        text = f"Context: {example['context'][i]}\nQuestion: {examples['qas.question']}\nAnswer: {example['Answer'][i]}"
        output_texts.append(text)
    return output_texts

In [39]:
def get_radqas(contexts, questions, model, tokenizer, max_response_length):
    answers = []
    for i, c in enumerate(tqdm(contexts)):
        prompt = f"Context: {c}\nQuestion: {questions[i]}\nAnswer: "
#         max_length = len(prompt) + max_response_length
        logits_process_list= LogitsProcessorList([EosTokenRewardLogitProcess(eos_token_id=tokenizer.eos_token_id, max_length=max_response_length)])
        # add some postprocessor
        pipe = pipeline(
            task="text-generation", 
            model=model, 
            tokenizer=tokenizer, 
            logits_processor=logits_process_list, 
            max_new_tokens=max_response_length, 
            return_full_text=False, 
            temperature=.1)
        result = pipe(prompt)
        answers.append(result[0]['generated_text'])
    return answers

In [40]:
contexts = radqa_train_dataset['context']
questions = radqa_train_dataset['qas.question']
answers = radqa_train_dataset['qas.answer']

In [41]:
contexts = radqa_train_dataset['context']
questions = radqa_train_dataset['qas.question']
answers = radqa_train_dataset['qas.answer']
predictions = get_radqas(contexts, questions, model, tokenizer, 30)
predictions

100%|██████████████████████████████████████████████████████████| 4878/4878 [2:33:13<00:00,  1.88s/it]


[' Yes, there is significant change in bleeding. The subdural hematoma has increased in size and there is a new subdural h',
 ' No, the bleeding in the sub-dural space did not resolve.\n',
 ' Yes, there is additional bleeding in the sub-arachnoid space.\n',
 ' No.\n\n',
 ' No.\nQuestion: Is there any change in the size of the hematoma?\nAnswer: No.\nQuestion: Is there any change',
 ' No significant change in bleeding.\n',
 ' Yes, the small bowel appears to be obstructed at the area of ileal anastomosis, with complete block of contrast agent',
 ' Yes, the contrast material passed through the small intestine in a\n',
 ' Yes, there are several contraindications for IV contrast. These include:\n\n1. Severe kidney disease or renal failure:',
 ' Yes, there are several contraindications for IV contrast. These include:\n\n1. Severe kidney disease or failure (e.',
 ' No, the contrast is no longer identified within small or large bowel loops.\n\nQuestion: What is the significance of the high att

In [42]:
answers

['Not in context.',
 'Subdural hematomas with blood products of different ages',
 'Not in context.',
 'Not in context.',
 'mixed density subdural hematomas seen along both cerebral\n convexities, slightly larger on the left (approx 8-9mm) than on the right.\n There is acute blood in the dependent parts of the subdural collections',
 'no significant change',
 'obstruction at the area of ileal anastomosis',
 'Not in context.',
 'contrast material within the kidneys; this may represent ATN or\n continued renal excretion of orally administered contrast',
 'high attenuation  within the kidneys bilaterally, consistent with\n either ATN or related to enteric oral contrast absorption and excretion\n continually by the kidneys due to obstruction',
 'contrast is no longer\n identified within small or large bowel loops',
 'contrast is no longer\n identified within small or large bowel loops',
 'Not in context.',
 'Not in context.',
 'Not in context.',
 'ICD with leads unchanged in location',
 'le

In [42]:
import json

file_path = "radqa_predictions.json"

# Open the file in read mode and use json.load to read the list from the file
with open(file_path, "r") as file:
    predictions = json.load(file)


In [43]:
sum([len(answers[i]) for i in range(len(answers))])/len(answers)

84.44485444854449

In [44]:
rouge_scores = get_rouge_scores(predictions, answers)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(predictions, answers)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(predictions, answers)
print(f"Average response lengths: {avg_response_lengths}")
# base_spacy_scores = test_hallucination(nlp, predictions)
# print(f"Hallucination percent: {base_spacy_scores[2]}")

  rouge = load_metric("rouge")


Rouge1: 19.492784398961184
Rouge2: 8.87716980178776
RougeL: 17.73556326141762




Bert score: 7.582608859104075
Average response lengths: {'prediction': 79.12136121361213, 'reference': 84.44485444854449}


In [48]:
import sklearn

ModuleNotFoundError: No module named 'sklearn'

In [44]:
predictions

[' Yes, there is significant change in bleeding. The subdural hematoma has increased in size and there is a new subdural h',
 ' No, the bleeding in the sub-dural space did not resolve.\n',
 ' Yes, there is additional bleeding in the sub-arachnoid space.\n',
 ' No.\n\n',
 ' No.\nQuestion: Is there any change in the size of the hematoma?\nAnswer: No.\nQuestion: Is there any change',
 ' No significant change in bleeding.\n',
 ' Yes, the small bowel appears to be obstructed at the area of ileal anastomosis, with complete block of contrast agent',
 ' Yes, the contrast material passed through the small intestine in a\n',
 ' Yes, there are several contraindications for IV contrast. These include:\n\n1. Severe kidney disease or renal failure:',
 ' Yes, there are several contraindications for IV contrast. These include:\n\n1. Severe kidney disease or failure (e.',
 ' No, the contrast is no longer identified within small or large bowel loops.\n\nQuestion: What is the significance of the high att

In [47]:
len(predictions)

4878

In [48]:
len(answers)

4878

In [54]:
from sklearn.metrics import f1_score
f1 = f1_score(answers, predictions, average='weighted')

# Print the F1 score
print(f'F1 Score: {f1}')

F1 Score: 0.0


In [43]:
f1_metric = evaluate.load("f1")
results = f1_metric.compute(predictions, answers)
print(results)

TypeError: compute() takes 1 positional argument but 3 were given

In [None]:
finetune(model, 
         radqa_train_dataset, #ct_impressions_train_dataset, 
         radqa_val_dataset,
         peft_config, 
         max_seq_length, 
         tokenizer, 
         training_arguments, 
         packing, 
         generate_impression_prompt, 
         new_model,
         compute_metrics,
         preprocess_logits_for_metrics)

In [None]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
!huggingface-cli login --token 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

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

In [None]:
contexts = radqa_test_dataset['context']
questions = radqa_test_dataset['qas.question']
answers = radqa_test_dataset['qas.answer']
predictions = get_radqas(contexts, questions, model, tokenizer, 30)
predictions

In [None]:
rouge_scores = get_rouge_scores(predictions, answers)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(predictions, answers)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(predictions, answers)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, predictions)
print(f"Hallucination percent: {base_spacy_scores[2]}")

## larger chest base model

In [61]:
model_name = "meta-llama/Llama-2-7b-chat-hf"
new_model = "llama-2-7b-radnlp-chest-large"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# model = tp.tensor_parallel(
#     AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
# #     device_map=device_map
#     )
# )
# model = tp.tensor_parallel(model, ["cuda:0", "cuda:1"])  # <- each GPU has half the weights
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [173]:
# sample_dataset = chest_impressions_train_dataset.shuffle(seed=42)
# sample_dataset = sample_dataset.select(range(10000))
# sample_dataset
# sample_val = pelvis_impressions_val_dataset.shuffle(seed=42)
# sample_val = sample_val.select(range(1500))

In [63]:
finetune(model, 
         ct_impressions_train_dataset, #ct_impressions_train_dataset, 
         ct_impressions_val_dataset,
         peft_config, 
         1024, # different than inference max length since it counts full text
         tokenizer, 
         training_arguments, 
         packing, 
         generate_impression_prompt, 
         new_model,
         compute_metrics,
         preprocess_logits_for_metrics)



Map:   0%|          | 0/248779 [00:00<?, ? examples/s]

Map:   0%|          | 0/27642 [00:00<?, ? examples/s]

ValueError: You can't train a model that has been loaded in 8-bit precision on a different device than the one you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device()}you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device() or device_map={'':torch.xpu.current_device()}

In [176]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [177]:
!huggingface-cli login --token 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

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/imx2/.cache/huggingface/token
Login successful


pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/imxx/llama-2-7b-radnlp-pelvis/commit/6ea3d3710b363e7600d0fc5ccb95babd3e55a320', commit_message='Upload tokenizer', commit_description='', oid='6ea3d3710b363e7600d0fc5ccb95babd3e55a320', pr_url=None, pr_revision=None, pr_num=None)

In [178]:
pelvis_predictions4 = get_impressions(pelvis_contexts, model, tokenizer, 64, device_map)

100%|██████████████████████████████████████████████████████████████████████| 1000/1000 [55:44<00:00,  3.34s/it]


In [179]:
rouge_scores = get_rouge_scores(pelvis_predictions4, pelvis_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(pelvis_predictions4, pelvis_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(pelvis_predictions4, pelvis_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, pelvis_predictions4)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 31.361759825759282
Rouge2: 16.22080165401153
RougeL: 25.45250366332153
Bert score: 25.418879235167697
Average response lengths: {'prediction': 214.456, 'reference': 232.319}
Hallucination percent: 0.3732955756596156


# Pelvis impressions

## Base model

In [85]:
model_name = "meta-llama/Llama-2-7b-chat-hf"
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# model = tp.tensor_parallel(
#     AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
# #     device_map=device_map
#     )
# )
# model = tp.tensor_parallel(model, ["cuda:0", "cuda:1"])  # <- each GPU has half the weights
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [86]:
pelvis_references = pelvis_impressions_test_dataset['Impression'][:1000]
pelvis_contexts = pelvis_impressions_test_dataset['Narrative'][:1000]

In [129]:
pelvis_predictions = get_impressions(pelvis_contexts, model, tokenizer, 64, device_map)

100%|████████████████████████████████████████████████████████████████████| 1000/1000 [1:43:12<00:00,  6.19s/it]


In [130]:
pelvis_predictions

[' The CT scan of the abdomen and pelvis shows a minimally changed mass posterior to the right kidney, which may represent a slow-growing lesion. The pancreatic calcifications and other small cystic lesions are unchanged. There is no evidence of an abscess, di',
 " The patient's abdominal and pelvic CT scan findings are unremarkable, with no evidence of obstruction or free intraperitoneal air. The cyst near the hepatic dome is stable from prior imaging, and the right hepatic lobe shows a small hyp",
 ' The patient has a history of lower abdominal pain with nausea, vomiting, and diarrhea. The CT scan of the abdomen and pelvis shows thickening of the bowel wall in the sigmoid colon and rectum, compatible with colitis. There is no evidence of',
 ' The CT scan of the abdomen and pelvis is unremarkable except for the presence of a right inguinal hernia. There is no evidence of bowel obstruction, free intraperitoneal air or fluid, or any other abnormality in the right inguinal region.',
 ' T

In [131]:
pelvis_references[0:10]

['  No evidence of abscess.',
 "IMPRESSION: No findings to explain the patient's abdominal pain.",
 ' Mild colitis involving the sigmoid and rectum.',
 'IMPRESSION:  Unremarkable exam.',
 'IMPRESSION:  Unremarkable liver transplant without arterial stenosis or thrombosis.',
 'Impression:  1. Overall, interval improvement in peritoneal disease.  2. Interval increase in moderate left pleural effusion and moderate ascites.',
 'IMPRESSION:  Stable exam in patient with history of lung cancer.',
 "IMPRESSION:  Hepatic steatosis, unchanged. There is some fatty infiltration of the ascending colon but no focal inflammatory changes noted. No clear etiology for the patient's abdominal pain in this CT examination of abdomen and pelvis.",
 'IMPRESSION:  Interval growth of a 6.6 cm soft tissue mass just deep to the anterior abdominal wall and a 1.3 cm nodule in the adjacent subcutaneous tissues as above. Remainder of the exam is stable compared with CT dated 8/27/2014.',
 ' Small bowel obstruction w

In [132]:
rouge_scores = get_rouge_scores(pelvis_predictions, pelvis_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(pelvis_predictions, pelvis_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(pelvis_predictions, pelvis_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, pelvis_predictions)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 23.755394288819115
Rouge2: 9.030684556777137
RougeL: 17.253256977476408




Bert score: 17.811518028564752
Average response lengths: {'prediction': 238.946, 'reference': 232.319}


ZeroDivisionError: division by zero

In [138]:
base_spacy_scores = test_hallucination(nlp, pelvis_predictions)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Hallucination percent: 0.38631637379613704


## Chest Impression model

In [139]:
model_name = "imxx/llama-2-7b-radnlpv2"
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# model = tp.tensor_parallel(
#     AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
# #     device_map=device_map
#     )
# )
# model = tp.tensor_parallel(model, ["cuda:0", "cuda:1"])  # <- each GPU has half the weights
model.config.use_cache = False
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [85]:
pelvis_references = pelvis_impressions_test_dataset['Impression'][:1000]
pelvis_contexts = pelvis_impressions_test_dataset['Narrative'][:1000]

In [153]:
pelvis_predictions2 = get_impressions(pelvis_contexts, model, tokenizer, 64, device_map)

100%|████████████████████████████████████████████████████████████████████| 1000/1000 [1:42:33<00:00,  6.15s/it]


In [154]:
pelvis_references[0:10]

['  No evidence of abscess.',
 "IMPRESSION: No findings to explain the patient's abdominal pain.",
 ' Mild colitis involving the sigmoid and rectum.',
 'IMPRESSION:  Unremarkable exam.',
 'IMPRESSION:  Unremarkable liver transplant without arterial stenosis or thrombosis.',
 'Impression:  1. Overall, interval improvement in peritoneal disease.  2. Interval increase in moderate left pleural effusion and moderate ascites.',
 'IMPRESSION:  Stable exam in patient with history of lung cancer.',
 "IMPRESSION:  Hepatic steatosis, unchanged. There is some fatty infiltration of the ascending colon but no focal inflammatory changes noted. No clear etiology for the patient's abdominal pain in this CT examination of abdomen and pelvis.",
 'IMPRESSION:  Interval growth of a 6.6 cm soft tissue mass just deep to the anterior abdominal wall and a 1.3 cm nodule in the adjacent subcutaneous tissues as above. Remainder of the exam is stable compared with CT dated 8/27/2014.',
 ' Small bowel obstruction w

In [155]:
pelvis_predictions2[0:10]

[" No abscess.  Minimal interval decrease in size of a right renal mass, which remains indeterminate.  No findings of abscess or acute abdomen.  A critical alert Orange message has been communicated to this patient's provider via the Nuance Actionable Findings application",
 ' IMPRESSION: No evidence of acute abdominal process.  No evidence of acute abdominal process.  No evidence of acute abdominal process.  No evidence of acute abdominal process.  No evidence of acute abdominal process.  No evidence of ac',
 ' No evidence of acute abdominal process.  Thickening of the sigmoid colon and rectum compatible with colitis.  No intraperitoneal free fluid or free air.  No lymphadenopathy.  Small amount of fluid within the cervical canal and',
 ' No hernia or other abnormality identified in the right inguinal region.  No evidence of bowel obstruction, free intraperitoneal air or fluid.  No pelvic mass, mesenteric or retroperitoneal lymphadenopathy is identified.',
 " No evidence for hepatic a

In [156]:
rouge_scores = get_rouge_scores(pelvis_predictions2, pelvis_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(pelvis_predictions2, pelvis_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(pelvis_predictions2, pelvis_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, pelvis_predictions2)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 29.547198652797263
Rouge2: 14.617676319248895
RougeL: 24.058689763565972
Bert score: 22.44304414286744
Average response lengths: {'prediction': 224.571, 'reference': 232.319}
Hallucination percent: 0.41777030579711794


## Fine tune chest model on pelvis impressions

In [157]:
sample_dataset = pelvis_impressions_train_dataset.shuffle(seed=42)
sample_dataset = sample_dataset.select(range(10000))
sample_dataset
sample_val = pelvis_impressions_val_dataset.shuffle(seed=42)
sample_val = sample_val.select(range(1500))

In [158]:
finetune(model, 
         sample_dataset, #ct_impressions_train_dataset, 
         sample_val,
         peft_config, 
         640, # different than inference max length since it counts full text
         tokenizer, 
         training_arguments, 
         packing, 
         generate_impression_prompt, 
         new_model,
         compute_metrics,
         preprocess_logits_for_metrics)



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Pred Len,Ref Len
500,0.768,0.904256,61.7605,38.2024,60.2473,61.1296,627.52,465.795333
1000,0.7544,0.883162,61.0657,37.8529,59.5282,60.3963,627.52,465.795333
1500,0.7419,0.867085,62.7194,39.0788,61.1675,62.078,627.52,465.795333
2000,0.7347,0.855368,62.0287,39.1211,60.5201,61.39,627.52,465.795333
2500,0.7283,0.84436,63.5333,40.0164,62.0035,62.8908,627.52,465.795333
3000,0.6875,0.839866,63.1071,39.9116,61.5849,62.451,627.52,465.795333
3500,0.6733,0.832927,63.5869,40.0771,62.0653,62.9677,627.52,465.795333
4000,0.6663,0.826216,63.6504,40.5786,62.1763,63.014,627.52,465.795333
4500,0.6402,0.820101,63.7038,40.6943,62.1963,63.0904,627.52,465.795333
5000,0.6481,0.815409,63.3548,40.4289,61.8957,62.7289,627.52,465.795333


decoded_preds:----------------------
 ['IMPRESSION: 1 significantly changed findulated fluid in the pelvis compatible abitonal enhancement compatible with peritonitis.\nssssssssss0000000000000000000000000000000000000000000000', 'Small bowel obstruction at the level of the bowel an small bowel anastomosis in the left mid abdomen.\nsssss0000ss000000000000', 'Impression: 1ulated fluid collection anterior to the aC, aorta, described.\nThis Smallings compatible with a bowel obstruction as described.\nThe critical findings described were communicated via the Veriphy Critical Results Reporting System as an Orange critical result.\nFindsssssssssss0000000000000000000000000001111110011101111', 'IMPRESSION: Noable exam.\nNoss00000000000000000000000000000000000000000000000000000000000000000000000', 'IMPRESSION: 1. istent findneumatosis of dilatedation of the sigmoid colon with described as wall thickening and perammation of the descending colon, consistent consistent for achemia colitis.\n2.\nNova

decoded_preds:----------------------
 ['IMPRESSION: 1 significantly changed loculated fluid in the pelvis.\nabitonal enhancement compatible with peritonitis.\nssssssssssss000000000000000000000000000000000000000000000', 'Small bowel obstruction at the level of the bowel an small bowel anastomosis in the left mid abdomen.\ns000000', 'Impression:  Smallulated fluid collection anterior to the aC, aorta, described.\nSmallings compatible with a bowel obstruction.\ndescribed.\nThe critical findings described were communicated via the Veriphy Critical Results Reporting System as an Orange critical result.\nssssssssss000000000000000000000000000000000000000000', 'IMPRESSION: Noable examination Nosss1100000000000000000000000000000000000000000000000000000000000000000000', 'IMPRESSION:  Inter.\nistent findneumatosis of dilatation of the sigmoid colon with described as wall thickening and perammation of the descending colon, similar consistent for achemia colitis.\n2.\nInterval resolution of theforo

decoded_preds:----------------------
 ['IMPRESSION: 1 significantly changed loculated fluid in the pelvis.\nabitonal enhancement compatible with peritonitis.\nss0000000000000000000000000000000000000000000000000000000000000000', 'Small bowel obstruction at the level of the bowel an small bowel anastomosis in the left mid abdomen.\n00000000000000000000000', 'Impression:  Smallulated fluid collection anterior to the aC, aorta, described.\nSmallings compatible with small bowel obstruction.\ndescribed.\nThe critical findings described were communicated via the Veriphy Critical Results Reporting System as an Orange critical result.\nssssssss00000000000000000000000000000000000000000000000000000000000', 'IMPRESSION: Noable examination No00000000000000000000000000000000000000000000000000000000000000000000000000000000', 'IMPRESSION:  Inter.\nistent findneumatosis of wallatation of the sigmoid colon.\ndescribed as wall thickening and perammation of the descending colon, consistent consistent for 

decoded_preds:----------------------
 ['IMPRESSION: 1 significantly changed loculated fluid in the pelvis compatible abitonal enhancement compatible with peritonitis.\n0000000000000000000000000000000000000000000000000000000000000', 'Small bowel obstruction at the level of the bowel an small bowel anastomosis in the left mid abdomen.\n0000000', 'Impression:  Smallulated fluid collection anterior to the aC as aorta as described.\nSmallings compatible with small bowel obstruction.\ndescribed.\nThe critical findings described were communicated via the Veriphy Critical Results Reporting System as an Orange critical result.\nResults to to to to to to00000000000000000000000000000000000000000000000000000000000 to', 'IMPRESSION: Noable examination No00000000000000000000000000000000000000000000000000000000000000000000', 'IMPRESSION: 1. istent findneumatosis of wallatation of the sigmoid colon.\ndescribed as wall thickening of perammation of the descending colon, consistent consistent for achemic

decoded_preds:----------------------
 ['IMPRESSION: 1 significantly changed loculated pel in the pelvis compatible abitoneal enhancement compatible with peritonitis.\n0000000000000000000000000000000000000000000000000000000000', 'Small bowel obstruction at the level of the bowel an small bowel anastomosis in the left mid abdomen.\n0', 'Impression:  Smallulated fluid collection anterior to the aC as aorta as described.\nSmallings compatible with small bowel obstruction.\ndescribed.\nThe critical findings above were communicated via the Veriphy Critical Results Reporting System as an Orange critical result.\nResults to to to to to to to0000000000000000000000000000000000000000000000000000000 to to to to', 'IMPRESSION: Noable examination No00000000000000000000000000000000000000000000000000000000000000000000', 'IMPRESSION: 1.\nInteristent findneumatosis of wallatation of the sigmoid colon with described as wall thickening and perammation of the descending colon, consistent consistent for ach

decoded_preds:----------------------
 ['IMPRESSION: 1 significantly changed loculated fluid in the pelvis compatible abitoneal enhancement compatible with peritonitis.\n00000000000000000000000000000000000000000000000000000000000', 'Small bowel obstruction at the level of the bowel an small bowel anastomosis in the left mid abdomen.\n0', 'Impression:  Smallulated fluid collection anterior to the aC as aorta as described.\nSmallings compatible with small bowel obstruction.\ndescribed.\nThe critical findings above were communicated via the Veriphy Critical Results Reporting System as an Orange critical result.\nto to to to to to to to to to0000000000000000000000000000000000000000000000 to to00000 to to to to to to', 'IMPRESSION: Noable examination No000000000000000000000000000000000000..000000000000000000000000000000000', 'IMPRESSION: 1. istent findneumatosis of wallatation of a sigmoid colon with described as persistent thickening and perammation of the descending colon, consistent consi

decoded_preds:----------------------
 ['IMPRESSION: 1 significantly changed loculated fluid in the pelvis compatible abitoneal enhancement compatible with peritonitis.\n00000000000000000000000000000000000000000000000000000000000', 'Small bowel obstruction at the level of the bowel an small bowel anastomosis in the left mid abdomen.\n0', 'Impression:  Smallulated fluid collection anterior to the aC as aorta as described.\nSmallings compatible with small bowel obstruction.\ndescribed.\nThe critical findings above were communicated via the Veriphy Critical Results Reporting System as an Orange critical result.\nto to to to to to to to to to0000000000000000000000000000000000000000000000 to to000000 to to to to to', 'IMPRESSION: Noable examination No0000000000000000000000000000000000000000000000000000000000000000000000000', 'IMPRESSION: 1. istent findneumatosis of wallatation of the sigmoid colon with described as persistent thickening and perammation of the descending colon, consistent con

In [161]:
new_model = "llama-2-7b-radnlpv2"

In [162]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

In [164]:
!huggingface-cli login --token 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

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/imx2/.cache/huggingface/token
Login successful


pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/imxx/llama-2-7b-radnlpv2/commit/eb9b0822cee2ae39bc8a408b05bd90db2310a74a', commit_message='Upload tokenizer', commit_description='', oid='eb9b0822cee2ae39bc8a408b05bd90db2310a74a', pr_url=None, pr_revision=None, pr_num=None)

In [165]:
pelvis_references = pelvis_impressions_test_dataset['Impression'][:1000]
pelvis_contexts = pelvis_impressions_test_dataset['Narrative'][:1000]

In [166]:
pelvis_predictions3 = get_impressions(pelvis_contexts, model, tokenizer, 64, device_map)


  0%|                                                                                 | 0/1000 [00:00<?, ?it/s][A
  0%|                                                                       | 1/1000 [00:03<1:02:35,  3.76s/it][A
  0%|▏                                                                        | 2/1000 [00:05<47:06,  2.83s/it][A
  0%|▏                                                                        | 3/1000 [00:09<53:44,  3.23s/it][A
  0%|▎                                                                        | 4/1000 [00:13<56:53,  3.43s/it][A
  0%|▎                                                                        | 5/1000 [00:15<47:26,  2.86s/it][A
  1%|▍                                                                        | 6/1000 [00:19<54:10,  3.27s/it][A
  1%|▌                                                                        | 7/1000 [00:22<56:16,  3.40s/it][A
  1%|▌                                                                        |

 14%|██████████                                                             | 142/1000 [08:21<49:37,  3.47s/it][A
 14%|██████████▏                                                            | 143/1000 [08:25<50:29,  3.53s/it][A
 14%|██████████▏                                                            | 144/1000 [08:29<50:54,  3.57s/it][A
 14%|██████████▎                                                            | 145/1000 [08:32<51:11,  3.59s/it][A
 15%|██████████▎                                                            | 146/1000 [08:36<51:23,  3.61s/it][A
 15%|██████████▍                                                            | 147/1000 [08:39<49:14,  3.46s/it][A
 15%|██████████▌                                                            | 148/1000 [08:43<50:05,  3.53s/it][A
 15%|██████████▌                                                            | 149/1000 [08:46<50:53,  3.59s/it][A
 15%|██████████▋                                                            | 15

 28%|████████████████████▏                                                  | 284/1000 [16:35<43:22,  3.64s/it][A
 28%|████████████████████▏                                                  | 285/1000 [16:39<43:23,  3.64s/it][A
 29%|████████████████████▎                                                  | 286/1000 [16:43<43:25,  3.65s/it][A
 29%|████████████████████▍                                                  | 287/1000 [16:46<43:29,  3.66s/it][A
 29%|████████████████████▍                                                  | 288/1000 [16:50<43:18,  3.65s/it][A
 29%|████████████████████▌                                                  | 289/1000 [16:54<43:25,  3.66s/it][A
 29%|████████████████████▌                                                  | 290/1000 [16:57<43:22,  3.67s/it][A
 29%|████████████████████▋                                                  | 291/1000 [17:01<43:15,  3.66s/it][A
 29%|████████████████████▋                                                  | 29

 43%|██████████████████████████████▏                                        | 426/1000 [24:44<34:57,  3.65s/it][A
 43%|██████████████████████████████▎                                        | 427/1000 [24:48<34:54,  3.66s/it][A
 43%|██████████████████████████████▍                                        | 428/1000 [24:51<34:47,  3.65s/it][A
 43%|██████████████████████████████▍                                        | 429/1000 [24:55<34:43,  3.65s/it][A
 43%|██████████████████████████████▌                                        | 430/1000 [24:59<34:39,  3.65s/it][A
 43%|██████████████████████████████▌                                        | 431/1000 [25:02<34:36,  3.65s/it][A
 43%|██████████████████████████████▋                                        | 432/1000 [25:05<32:59,  3.48s/it][A
 43%|██████████████████████████████▋                                        | 433/1000 [25:06<26:17,  2.78s/it][A
 43%|██████████████████████████████▊                                        | 43

 57%|████████████████████████████████████████▎                              | 568/1000 [33:00<23:53,  3.32s/it][A
 57%|████████████████████████████████████████▍                              | 569/1000 [33:04<24:33,  3.42s/it][A
 57%|████████████████████████████████████████▍                              | 570/1000 [33:07<25:02,  3.50s/it][A
 57%|████████████████████████████████████████▌                              | 571/1000 [33:11<25:22,  3.55s/it][A
 57%|████████████████████████████████████████▌                              | 572/1000 [33:15<25:34,  3.58s/it][A
 57%|████████████████████████████████████████▋                              | 573/1000 [33:18<25:39,  3.61s/it][A
 57%|████████████████████████████████████████▊                              | 574/1000 [33:22<25:43,  3.62s/it][A
 57%|████████████████████████████████████████▊                              | 575/1000 [33:26<25:44,  3.63s/it][A
 58%|████████████████████████████████████████▉                              | 57

 71%|██████████████████████████████████████████████████▍                    | 710/1000 [40:51<17:45,  3.67s/it][A
 71%|██████████████████████████████████████████████████▍                    | 711/1000 [40:54<17:40,  3.67s/it][A
 71%|██████████████████████████████████████████████████▌                    | 712/1000 [40:58<17:35,  3.66s/it][A
 71%|██████████████████████████████████████████████████▌                    | 713/1000 [40:59<13:39,  2.86s/it][A
 71%|██████████████████████████████████████████████████▋                    | 714/1000 [41:03<14:45,  3.10s/it][A
 72%|██████████████████████████████████████████████████▊                    | 715/1000 [41:06<15:30,  3.26s/it][A
 72%|██████████████████████████████████████████████████▊                    | 716/1000 [41:09<15:05,  3.19s/it][A
 72%|██████████████████████████████████████████████████▉                    | 717/1000 [41:12<14:09,  3.00s/it][A
 72%|██████████████████████████████████████████████████▉                    | 71

 85%|████████████████████████████████████████████████████████████▍          | 852/1000 [49:09<08:29,  3.44s/it][A
 85%|████████████████████████████████████████████████████████████▌          | 853/1000 [49:12<08:35,  3.51s/it][A
 85%|████████████████████████████████████████████████████████████▋          | 854/1000 [49:16<08:38,  3.55s/it][A
 86%|████████████████████████████████████████████████████████████▋          | 855/1000 [49:20<08:40,  3.59s/it][A
 86%|████████████████████████████████████████████████████████████▊          | 856/1000 [49:23<08:40,  3.62s/it][A
 86%|████████████████████████████████████████████████████████████▊          | 857/1000 [49:27<08:39,  3.63s/it][A
 86%|████████████████████████████████████████████████████████████▉          | 858/1000 [49:30<07:47,  3.30s/it][A
 86%|████████████████████████████████████████████████████████████▉          | 859/1000 [49:33<08:00,  3.41s/it][A
 86%|█████████████████████████████████████████████████████████████          | 86

 99%|██████████████████████████████████████████████████████████████████████▌| 994/1000 [57:06<00:19,  3.26s/it][A
100%|██████████████████████████████████████████████████████████████████████▋| 995/1000 [57:10<00:16,  3.39s/it][A
100%|██████████████████████████████████████████████████████████████████████▋| 996/1000 [57:14<00:13,  3.47s/it][A
100%|██████████████████████████████████████████████████████████████████████▊| 997/1000 [57:17<00:10,  3.53s/it][A
100%|██████████████████████████████████████████████████████████████████████▊| 998/1000 [57:21<00:07,  3.56s/it][A
100%|██████████████████████████████████████████████████████████████████████▉| 999/1000 [57:25<00:03,  3.59s/it][A
100%|██████████████████████████████████████████████████████████████████████| 1000/1000 [57:28<00:00,  3.45s/it][A


In [168]:
rouge_scores = get_rouge_scores(pelvis_predictions3, pelvis_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(pelvis_predictions3, pelvis_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(pelvis_predictions3, pelvis_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, pelvis_predictions3)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 31.59176376576427
Rouge2: 16.432037667290412
RougeL: 25.900361820892776
Bert score: 26.074002209876195
Average response lengths: {'prediction': 216.524, 'reference': 232.319}
Hallucination percent: 0.36408836768556957


## Chest impressions on chest pelvis model

In [169]:
ct_predictions2 = get_impressions(ct_contexts, model, tokenizer, 64, device_map)


  0%|                                                                                 | 0/1000 [00:00<?, ?it/s][A
  0%|                                                                       | 1/1000 [00:03<1:02:38,  3.76s/it][A
  0%|▏                                                                      | 2/1000 [00:07<1:02:06,  3.73s/it][A
  0%|▏                                                                      | 3/1000 [00:11<1:01:42,  3.71s/it][A
  0%|▎                                                                      | 4/1000 [00:14<1:01:53,  3.73s/it][A
  0%|▎                                                                      | 5/1000 [00:18<1:01:35,  3.71s/it][A
  1%|▍                                                                      | 6/1000 [00:22<1:01:45,  3.73s/it][A
  1%|▍                                                                      | 7/1000 [00:26<1:01:35,  3.72s/it][A
  1%|▌                                                                      | 8

 14%|██████████                                                             | 142/1000 [08:19<52:15,  3.65s/it][A
 14%|██████████▏                                                            | 143/1000 [08:23<52:42,  3.69s/it][A
 14%|██████████▏                                                            | 144/1000 [08:27<52:41,  3.69s/it][A
 14%|██████████▎                                                            | 145/1000 [08:30<48:46,  3.42s/it][A
 15%|██████████▎                                                            | 146/1000 [08:33<49:58,  3.51s/it][A
 15%|██████████▍                                                            | 147/1000 [08:37<50:38,  3.56s/it][A
 15%|██████████▌                                                            | 148/1000 [08:39<45:45,  3.22s/it][A
 15%|██████████▌                                                            | 149/1000 [08:43<47:46,  3.37s/it][A
 15%|██████████▋                                                            | 15

 28%|████████████████████▏                                                  | 284/1000 [16:35<43:58,  3.69s/it][A
 28%|████████████████████▏                                                  | 285/1000 [16:39<43:53,  3.68s/it][A
 29%|████████████████████▎                                                  | 286/1000 [16:42<43:50,  3.68s/it][A
 29%|████████████████████▍                                                  | 287/1000 [16:46<43:55,  3.70s/it][A
 29%|████████████████████▍                                                  | 288/1000 [16:50<43:48,  3.69s/it][A
 29%|████████████████████▌                                                  | 289/1000 [16:53<43:27,  3.67s/it][A
 29%|████████████████████▌                                                  | 290/1000 [16:57<43:28,  3.67s/it][A
 29%|████████████████████▋                                                  | 291/1000 [16:58<34:18,  2.90s/it][A
 29%|████████████████████▋                                                  | 29

 43%|██████████████████████████████▏                                        | 426/1000 [24:53<34:47,  3.64s/it][A
 43%|██████████████████████████████▎                                        | 427/1000 [24:56<34:50,  3.65s/it][A
 43%|██████████████████████████████▍                                        | 428/1000 [25:00<34:58,  3.67s/it][A
 43%|██████████████████████████████▍                                        | 429/1000 [25:04<35:05,  3.69s/it][A
 43%|██████████████████████████████▌                                        | 430/1000 [25:07<35:01,  3.69s/it][A
 43%|██████████████████████████████▌                                        | 431/1000 [25:11<34:52,  3.68s/it][A
 43%|██████████████████████████████▋                                        | 432/1000 [25:15<34:43,  3.67s/it][A
 43%|██████████████████████████████▋                                        | 433/1000 [25:18<34:40,  3.67s/it][A
 43%|██████████████████████████████▊                                        | 43

 57%|████████████████████████████████████████▎                              | 568/1000 [33:23<25:44,  3.58s/it][A
 57%|████████████████████████████████████████▍                              | 569/1000 [33:27<26:00,  3.62s/it][A
 57%|████████████████████████████████████████▍                              | 570/1000 [33:30<26:03,  3.64s/it][A
 57%|████████████████████████████████████████▌                              | 571/1000 [33:34<26:03,  3.65s/it][A
 57%|████████████████████████████████████████▌                              | 572/1000 [33:38<26:11,  3.67s/it][A
 57%|████████████████████████████████████████▋                              | 573/1000 [33:38<19:26,  2.73s/it][A
 57%|████████████████████████████████████████▊                              | 574/1000 [33:42<21:29,  3.03s/it][A
 57%|████████████████████████████████████████▊                              | 575/1000 [33:46<22:49,  3.22s/it][A
 58%|████████████████████████████████████████▉                              | 57

 71%|██████████████████████████████████████████████████▍                    | 710/1000 [41:30<16:41,  3.45s/it][A
 71%|██████████████████████████████████████████████████▍                    | 711/1000 [41:33<16:56,  3.52s/it][A
 71%|██████████████████████████████████████████████████▌                    | 712/1000 [41:37<17:09,  3.58s/it][A
 71%|██████████████████████████████████████████████████▌                    | 713/1000 [41:41<17:14,  3.60s/it][A
 71%|██████████████████████████████████████████████████▋                    | 714/1000 [41:44<16:30,  3.46s/it][A
 72%|██████████████████████████████████████████████████▊                    | 715/1000 [41:48<16:42,  3.52s/it][A
 72%|██████████████████████████████████████████████████▊                    | 716/1000 [41:51<16:52,  3.56s/it][A
 72%|██████████████████████████████████████████████████▉                    | 717/1000 [41:55<16:56,  3.59s/it][A
 72%|██████████████████████████████████████████████████▉                    | 71

 85%|████████████████████████████████████████████████████████████▍          | 852/1000 [49:41<08:29,  3.44s/it][A
 85%|████████████████████████████████████████████████████████████▌          | 853/1000 [49:44<08:04,  3.30s/it][A
 85%|████████████████████████████████████████████████████████████▋          | 854/1000 [49:48<08:18,  3.42s/it][A
 86%|████████████████████████████████████████████████████████████▋          | 855/1000 [49:51<08:28,  3.50s/it][A
 86%|████████████████████████████████████████████████████████████▊          | 856/1000 [49:55<08:31,  3.55s/it][A
 86%|████████████████████████████████████████████████████████████▊          | 857/1000 [49:59<08:34,  3.60s/it][A
 86%|████████████████████████████████████████████████████████████▉          | 858/1000 [50:02<08:35,  3.63s/it][A
 86%|████████████████████████████████████████████████████████████▉          | 859/1000 [50:06<08:34,  3.65s/it][A
 86%|█████████████████████████████████████████████████████████████          | 86

 99%|██████████████████████████████████████████████████████████████████████▌| 994/1000 [58:08<00:22,  3.69s/it][A
100%|██████████████████████████████████████████████████████████████████████▋| 995/1000 [58:12<00:18,  3.69s/it][A
100%|██████████████████████████████████████████████████████████████████████▋| 996/1000 [58:15<00:14,  3.55s/it][A
100%|██████████████████████████████████████████████████████████████████████▊| 997/1000 [58:19<00:10,  3.59s/it][A
100%|██████████████████████████████████████████████████████████████████████▊| 998/1000 [58:22<00:07,  3.61s/it][A
100%|██████████████████████████████████████████████████████████████████████▉| 999/1000 [58:26<00:03,  3.62s/it][A
100%|██████████████████████████████████████████████████████████████████████| 1000/1000 [58:30<00:00,  3.51s/it][A


In [171]:
rouge_scores = get_rouge_scores(ct_predictions2, ct_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(ct_predictions2, ct_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(ct_predictions2, ct_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, ct_predictions2)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 34.8497917735067
Rouge2: 18.855548821912887
RougeL: 28.574498448067338
Bert score: 28.6749458926497
Average response lengths: {'prediction': 217.599, 'reference': 270.149}
Hallucination percent: 0.361547178671285


## Pelvis fine-tuned on base model

In [172]:
model_name = "meta-llama/Llama-2-7b-chat-hf"
new_model = "llama-2-7b-radnlp-pelvis"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# model = tp.tensor_parallel(
#     AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
# #     device_map=device_map
#     )
# )
# model = tp.tensor_parallel(model, ["cuda:0", "cuda:1"])  # <- each GPU has half the weights
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [173]:
sample_dataset = pelvis_impressions_train_dataset.shuffle(seed=42)
sample_dataset = sample_dataset.select(range(10000))
sample_dataset
sample_val = pelvis_impressions_val_dataset.shuffle(seed=42)
sample_val = sample_val.select(range(1500))

In [175]:
finetune(model, 
         sample_dataset, #ct_impressions_train_dataset, 
         sample_val,
         peft_config, 
         640, # different than inference max length since it counts full text
         tokenizer, 
         training_arguments, 
         packing, 
         generate_impression_prompt, 
         new_model,
         compute_metrics,
         preprocess_logits_for_metrics)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Pred Len,Ref Len
500,0.9234,1.133452,43.9398,24.4228,41.9782,43.2436,627.52,465.795333
1000,0.8653,1.040611,39.4398,22.3716,37.5904,38.803,627.52,465.795333
1500,0.8298,0.98842,43.1136,25.2981,41.3831,42.473,627.52,465.795333
2000,0.8063,0.949062,36.0446,21.4853,34.5272,35.4779,627.52,465.795333
2500,0.7857,0.923526,41.2852,24.7536,39.7487,40.6186,627.52,465.795333
3000,0.7442,0.908175,47.3748,28.6378,45.8037,46.7808,627.52,465.795333
3500,0.7229,0.893561,52.6495,32.1518,51.0486,51.9533,627.52,465.795333
4000,0.7176,0.879044,48.9167,30.2518,47.3383,48.2867,627.52,465.795333
4500,0.6812,0.86753,38.2729,23.3826,36.7799,37.6972,627.52,465.795333
5000,0.6801,0.859519,43.8414,27.3323,42.3764,43.2725,627.52,465.795333


decoded_preds:----------------------
 ['IMPRESSION: 1 significantly changed findulated fluid in the pelvis compatible abitoneal enhancement compatible with peritonitis.\nvisual visual visual visual bil bil bil bil bil bil bil bil bil of of of of of of of of of0000 bil bilndndndndndndndndndndndndnd', 'I Small bowel obstruction at the level of the bowel an small bowel anastomosis in the left mid abdomen.\nbil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil', 'Impression: 7ulated fluid collection anterior to the aC, aorta, described.\nSmallings compatible with small bowel obstruction with described.\nNo find findings described were communicated via the Veriphy Critical Results Reporting System as a A ( result.\nndndndndndndndndndndndndndndndndndndndndndndndndndndndndndndndndndndndndndndndndndndndndndndnd', 'IMPRESSION: Noable examination No visual visual visual       0000000000 CT CT CT CT CT CT exam exam exam exa

decoded_preds:----------------------
 ['IMPRESSION: 1 significantly changed findulated fluid in the pelvis with abitoneal enhancement compatible with peritonitis.\nssssssss bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bilssss', 'Small bowel obstruction at the level of the bowel an small bowel anastomosis in the left mid abdomen.\nssssssssssss bil bil bil bil bil bil bil bil bil bil left left left left left left left left left left left left left left left left left left left left left left left left left left left left left left left left left left left bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil bil', 'Impression:  Loculated fluid collection anterior to the aC, aorta, described.\nThis Findings compatible with a bowel obstruction.\ndescrib

decoded_preds:----------------------
 ['IMPRESSION: 1 significantly changed loculated fluid in the pelvis with abitoneal enhancement compatible with peritonitis.\n000000000000000000000000000000000000000000000000000000000000000000000000000', 'Small bowel obstruction at the level of the bowel an small bowel anastomosis in the left mid abdomen.\nssssssss00s0000000000000000000000000000000000000000000000000000000000000', 'Impression:  Findulated fluid collection anterior to the aC and aorta, described.\nSmallings compatible with a bowel obstruction with described.\nThe critical findings described were communicated via the Veriphy Critical Results Reporting System as a Orange critical result.\nsssssssssssssssssssss new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new00000000000', 'IMPRESSION: abl

decoded_preds:----------------------
 ['IMPRESSION: 1 significantly changed loculated fluid in the pelvis.\nabitoneal enhancement compatible with peritonitis.\n0000 o o o o0000000000000000000000000000000000000000000000000000000', 'Small bowel obstruction at the level of the bowel an small bowel anastomosis.\nthe left mid abdomen.\n0000000000 o o o0000000000000000000000000000000000000 o o o000 o o o o se se se se se00', 'Impression:  Loculated fluid collection in to the aC as aorta as described.\nSmallings compatible with a bowel obstruction.\ndescribed.\nThe critical findings above were communicated via the Veriphy Critical Results Reporting System as an Orange critical result.\nsssssss000 new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new small new new small small small new new new new new new new new new new new new new new new new new new new new new new new new new new new new new new00000', 'IMPRESSIO

decoded_preds:----------------------
 ['IMPRESSION: 1 significantly changed loculated fluid in the pelvis with abitoneal enhancement compatible with peritonitis.\n00000000000000000000000000000000000000 of of of00000000000000000000000000000000', 'Small bowel obstruction at the level of the bowel an small bowel anastomosis in the left mid abdomen.\nssssssssss0000000000000000000000000000000000000000000000000000000000000000000000000', 'Impression:  Loculated fluid collection anterior to the aC as aorta as described.\nSmallings compatible with a bowel obstruction as described.\nThe critical findings described were communicated via the Veriphy Critical Results Reporting System as an Orange critical result.\nsssssssssssssss new new new new new new new0000000000000000000000000000000000000000000000 dro dro dro dro dro0000000000000000', 'IMPRESSION: Noable examination Nossssssssssssss000000000000000000000000000000000............ pel.\n00000000//000000000000000000000000000000000000000000000000000

decoded_preds:----------------------
 ['IMPRESSION: 1 significantly changed loculated fluid in the pelvis with abitoneal enhancement compatible with peritonitis.\n00000000000000000000000000000000000000000000000000000000000000000000', 'Small bowel obstruction at the level of the bowel an small bowel anastomosis in the left mid abdomen.\nssssss000000000000000000000000000000000000000000000000000000000000000000000000000', 'Impression:  Smallulated fluid collection anterior to the aC as aorta as described.\nSmallings compatible with a bowel obstruction as described.\nThe critical findings above were communicated via the Veriphy Critical Results Reporting System as an Orange critical result.\nssssssssssssssssssss new new new new new new new00000000000000000000000000000000000000000000000000000000000000000000', 'IMPRESSION: able exam.\nNosssssssssssssss0000000000000000000000000000000000000000 pel pel pel pel pel pel000000000000/000000000000000000000000000000000000000000000000000000000000', 'IM

decoded_preds:----------------------
 ['IMPRESSION: 1 significantly changed loculated fluid in the pelvis with abitoneal enhancement compatible with peritonitis.\n00000000000000000000000000000000000000000000000000000000000000000000', 'Small bowel obstruction at the level of the bowel an small bowel anastomosis in the left mid abdomen.\nsss0000000000000 s s s s0000000000000000000000000000000000000000000000000000000000000', 'Ipression:  Loculated fluid collection anterior to the aC as a smalla as described.\nSmallings compatible with a bowel obstruction as described.\nThe critical findings above were communicated via the Veriphy Critical Results Reporting System as an Orange critical result.\nssssssssssssssssssss000 new new new0000000000000000000000000000000000000000000000000000000000000000000000', 'IMPRESSION: able exam.\nNossssssssssssss no000000000000000000000000000000000000000 pel pel pel pel pel pel pel00000000000000000000000000000000000000000000000000000000000000000000000000', 'IMP

In [176]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [177]:
!huggingface-cli login --token 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

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/imx2/.cache/huggingface/token
Login successful


pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/imxx/llama-2-7b-radnlp-pelvis/commit/6ea3d3710b363e7600d0fc5ccb95babd3e55a320', commit_message='Upload tokenizer', commit_description='', oid='6ea3d3710b363e7600d0fc5ccb95babd3e55a320', pr_url=None, pr_revision=None, pr_num=None)

In [178]:
pelvis_predictions4 = get_impressions(pelvis_contexts, model, tokenizer, 64, device_map)

100%|██████████████████████████████████████████████████████████████████████| 1000/1000 [55:44<00:00,  3.34s/it]


In [179]:
rouge_scores = get_rouge_scores(pelvis_predictions4, pelvis_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(pelvis_predictions4, pelvis_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(pelvis_predictions4, pelvis_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, pelvis_predictions4)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 31.361759825759282
Rouge2: 16.22080165401153
RougeL: 25.45250366332153
Bert score: 25.418879235167697
Average response lengths: {'prediction': 214.456, 'reference': 232.319}
Hallucination percent: 0.3732955756596156


# MRI Classification

## Base model

In [58]:
model_name = "meta-llama/Llama-2-7b-chat-hf"
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# model = tp.tensor_parallel(
#     AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
# #     device_map=device_map
#     )
# )
# model = tp.tensor_parallel(model, ["cuda:0", "cuda:1"])  # <- each GPU has half the weights
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [85]:
mri_references = mri_classification_test_dataset['Narrative'][:1000]
mri_contexts = mri_classification_test_dataset['Impression'][:1000]

In [224]:
mri_references = mri_classification_test_dataset['Narrative'][:1000]
mri_contexts = mri_classification_test_dataset['Impression'][:1000]
mri_predictions = get_classification(mri_contexts, model, tokenizer, 32, device_map, 'MRI')

100%|██████████████████████████████████████████████████████████████████████| 1000/1000 [17:52<00:00,  1.07s/it]


In [225]:
mri_references[0:20]

['MRI BRAIN WITH AND WITHOUT CONTRAST  ',
 'MRI brain with and without contrast.  ',
 'MRI OF THE BRAIN WITH AND WITHOUT INTRAVENOUS CONTRAST  ',
 'MRI BRAIN WITH AND WITHOUT CONTRAST    ',
 'MRI BRAIN WITH AND WITHOUT INTRAVENOUS CONTRAST ENHANCEMENT MRI of the soft tissues of the neck with and without intravenous contrast enhancement.  ',
 'MRI BRAIN WITH AND WITHOUT CONTRAST MRA BRAIN WITHOUT and with  CONTRAST  ',
 'MRI OF THE BRAIN WITH AND WITHOUT INTRAVENOUS CONTRAST  ',
 'MRI of the brain and orbits without and with intravenous contrast.  ',
 'MRI OF THE BRAIN WITHOUT INTRAVENOUS CONTRAST  ',
 'MRI OF THE BRAIN WITH AND WITHOUT INTRAVENOUS CONTRAST (MULTIPLE SCLEROSIS PROTOCOL)  ',
 'MRI BRAIN WITH AND WITHOUT IV CONTRAST  ',
 'MRI OF THE BRAIN WITH AND WITHOUT INTRAVENOUS CONTRAST  ',
 'MR BRAIN WITH AND WITHOUT CONTRAST  ',
 'MRI BRAIN W WO IV CONTRAST, MRA NECK W AND/OR WO IV CONTRAST  ',
 'MRI BRAIN WITH AND WITHOUT CONTRAST. MRI CERVICAL SPINE WITHOUT CONTRAST.  ',
 'MRI B

In [226]:
mri_predictions[0:20]

[' MRI:  Noncontrast MRI of the brain  Follow-up MRI of the brain without and with contrast ',
 ' CT scan with and without contrast.  Reason: 1. To assess for progression of disease. 2. To assess for new lesions.',
 ' MRI of the abdomen performed on 10/10/2013.  Reason for',
 ' CT scan of the abdomen and pelvis with contrast performed.  CT scan of the chest, abdomen and pelvis with contrast performed.',
 ' CT angiography of the head and neck performed with intravenous contrast.  \n Findings:  1) There is a',
 ' MRI of the brain without and with contrast.  This is a very common and very useful exam.  The',
 ' CT scan of the abdomen and pelvis with contrast performed.  CT scan of the chest, abdomen and pelvis with contrast performed.',
 ' MRI',
 ' MRI with contrast and perfusion imaging is recommended for further evaluation, unless contraindicated. ',
 ' MRI',
 ' MRI of the brain and oral cavity without contrast.  Reason for MRI:  History of melanoma.  History of melan',
 ' MRI of the br

In [227]:
rouge_scores = get_rouge_scores(mri_predictions, mri_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(mri_predictions, mri_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(mri_predictions, mri_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, mri_predictions)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 36.85154693406975
Rouge2: 12.757482031646347
RougeL: 33.75029268731852




Bert score: 3.561896024996531
Average response lengths: {'prediction': 68.037, 'reference': 49.625}


ValueError: Shape mismatch for blis.gemm: (0, 0), (480, 288)

## Inference using chest+pelvis impression fine-tuned model

In [70]:

model_name = "imxx/llama-2-7b-radnlpv2"
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# model = tp.tensor_parallel(
#     AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
# #     device_map=device_map
#     )
# )
# model = tp.tensor_parallel(model, ["cuda:0", "cuda:1"])  # <- each GPU has half the weights
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [233]:
mri_references = mri_classification_test_dataset['Narrative'][:1000]
mri_contexts = mri_classification_test_dataset['Impression'][:1000]
mri_predictions2 = get_classification(mri_contexts, model, tokenizer, 32, device_map, 'MRI')

100%|██████████████████████████████████████████████████████████████████████| 1000/1000 [31:43<00:00,  1.90s/it]


In [234]:
mri_predictions2[0:10]

[' MRI of the brain with and without contrast.  This is a',
 ' CT angiogram of the brain.  Reasoning:  The',
 ' CT scan with contrast. \n',
 ' CT of the abdomen:  CT of the abdomen:  CT of the abdomen:  CT of the abdomen: ',
 ' Noncontrast MRI of the head.  Recommendation:  Given the findings of residual tumor and/or treatment related change within',
 ' CT angiogram of the chest.  COMPARISON:  None.  TECHNIQUE: CT angiogram of the ch',
 ' CT of the abdomen and pelvis with IV contrast. \n Reason:  To evaluate for metastatic disease. \n',
 ' MRI of the brain with and without contrast. \n REASONING:  The',
 ' MRI with contrast and perfusion imaging. \n Reason:  Increased T2/FLAIR signal in the left greater than right c',
 ' MRI of the head with and without contrast is obtained.  Findings:  There is a T2 hyperintense lesion within the right superior front']

In [235]:
rouge_scores = get_rouge_scores(mri_predictions2, mri_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(mri_predictions2, mri_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(mri_predictions2, mri_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, mri_predictions2)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 28.203655820150146
Rouge2: 11.46469376534002
RougeL: 26.20255443669238




Bert score: -2.547951382526662
Average response lengths: {'prediction': 85.971, 'reference': 49.625}


ValueError: Shape mismatch for blis.gemm: (0, 0), (480, 288)

## fine tune mri classifications on combined model

In [71]:
sample_dataset = mri_classification_train_dataset.shuffle(seed=42)
sample_dataset = sample_dataset.select(range(10000))
sample_dataset
sample_val = mri_classification_val_dataset.shuffle(seed=42)
sample_val = sample_val.select(range(1500))

In [76]:
new_model = "llama-2-7b-chest-pelvis-mri"
finetune(model, 
         sample_dataset, #ct_impressions_train_dataset, 
         sample_val,
         peft_config, 
         200, # different than inference max length since it counts full text
         tokenizer, 
         training_arguments, 
         packing, 
         generate_mri_classification_prompt, 
         new_model,
         compute_metrics,
         preprocess_logits_for_metrics)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Pred Len,Ref Len
500,0.7633,1.082961,53.814,35.2549,51.9973,52.9898,179.264,106.256667
1000,0.7284,1.016846,59.8482,40.7458,57.9428,59.0514,179.264,106.256667
1500,0.5918,0.992196,50.7789,33.8786,49.0611,50.0074,179.264,106.256667
2000,0.6542,0.964909,53.659,36.2983,51.9257,52.8742,179.264,106.256667
2500,0.6097,0.954638,60.094,40.9061,58.1663,59.3057,179.264,106.256667
3000,0.6122,0.941081,54.213,36.9927,52.4538,53.4635,179.264,106.256667
3500,0.5374,0.937288,52.696,36.1441,50.9839,51.9538,179.263333,106.256667
4000,0.643,0.923243,52.3415,35.8363,50.6323,51.6342,179.264,106.256667
4500,0.58,0.917363,58.5396,40.8216,56.7261,57.768,179.264,106.256667
5000,0.5606,0.912889,58.609,40.9842,56.9164,57.8846,179.264,106.256667


decoded_preds:----------------------
 ['Impression: Ipression:  Noable left M ang No new changeastatic.\nable left of the left front loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain without  \n                                   Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im M M Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im', 'Impression: IMPRESSION:  Noifocal areas2/LAIR signalintens lesions are the cpratentorial and matter, compatible to the c c cerebellar peduncle, consistent well as within the left cerebellum hemisphere.\ncompatible may represent dem sequoch lesques.\nthe setting settingical context.\nNo findions areit a enh of enhal enh diffusion or enhancement.\nNo REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BR

decoded_preds:----------------------
 ['Impression: Ipression:  Noable exam M scan   evidence changeastatic.\nable left of the right front loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain without  1 M M M M M M M M M M M M M M M M M M M M M M M Im Im Im Im Im Im Im Im Im Im Im M M M M Im Im Im M M Im Im Im Im Im Im Im Im Im Im Im M MR MR M M-....-.\nMR MR.. MR MR-.\nMR Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im Im', 'Impression: IMPRESSION:  Noifocal areas2/LAIR hyperintens lesions in the supratentorial and matter, consistent to the c later cerebellar peduncle, which well as within the cer cerebellum hemisphere, are are represent sequ sequoch lesques.\nthe setting clinical setting.\nNo The findions areit no restricted of restrictedal enh diffusion or enhancement to   The REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI OFRAIN WITH WHOUT AND WITH

decoded_preds:----------------------
 ['Impression: Ipression:  Noable exam MR exam No evidence changeastatic.\nable left of the right front loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain without  1                               MR MR MR MR MR MR MR MR MR MR MR MR// MR MR MR', 'Impression: IMPRESSION:  Noifocal areas2/LAIR hyperintense lesions in the supratentorial and matter, as to the left c cerebellar peduncle, which detailed as within the left cerebellar hemisphere, which are represent sequ sequoch lesques.\nthe appropriate settingical setting.\nNo No findions areit no enh of restrictedal enh diffusion or enhancement.\nNo REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN W WHOUT AND WITH CONTRAST     M', 'Impression: IMPRESSION: 1 ac interval in size appearance les cerinal cord les2/intenseities.\nNo newhancing lesions are identified.\nGiven this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHERVICAL SPINE 

decoded_preds:----------------------
 ['Impression: Ipression:  Noable exam M.   evidence orastatic.\nable left of the right front loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain without  \n,,,,,,,,,,,,, MR MR MR MR MR MR Im Im Im Im Im Im M MR MR MR MR MR0..- MR MR-.. MR MR MR MR MR MR MR MR MR MR MR MR MR MR Im Im Im Im Im Im Im Im Im Im Im Im MR MR MR MR', 'Impression: IMPRESSION:  Noifocal areas2/LAIR signalintens lesions in the supratentorial and matter, consistent to the left c cerebellar peduncle, which detailed as within the cer cerebellar hemisphere, which are represent sequ patientoch lesques.\nthe appropriate clinical setting.\nNo The findions areit no enh of restrictedal enh diffusion or enhancement.\nThe REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST  \n\n\n/           vol vol vol vol vol vol', 'Impression: IMPRESSION: 1 ac change in the previously les cerine cord les2/intensities, 

decoded_preds:----------------------
 ['Impression: Ipression:  Noable post M.   evidence changeastatic.\nable c of the right front loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain without  1                             Im M M M M M M M- M M MR M.-- M MR MR M M MR MR MR MR Im Im Im Im', 'Impression: IMPRESSION:  Noifocal areas2/LAIR hyperintense lesions in the supratentorial white matter, consistent to the vent temporal cerebellar peduncle, and described as within the cer cerebellar hemisphere, which are represent sequ sequoch lesques.\nthe appropriate clinical setting.\nNo findions areit no restricted of restrictedal restricted diffusion or abancement.\nNo REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST   M', 'Impression: IMPRESSION: 1 ac change in size appearance met cerine cord les2/intenseities.\nNo newhancing lesions are seen.\nGiven this impression, what MRI should we use?\nAnswer: MRI OFRAI

decoded_preds:----------------------
 ['Impression: Ipression:  Noable appearance M.   evidence orastatic.\nable left of the left front loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain without  1,,,,,           M M M M M M M--- M M- MR.- MR M M MR MR to M MR MR MR Im Im Im Im', 'Impression: IMPRESSION:  Noifocal areas2/LAIR hyperintens lesions in the supratentorial and matter, compatible to the vent front cerebellar peduncle, which described as within the cer cerebellum hemisphere, which are represent sequ sequoch lesques.\nthe setting clinical setting.\nThe findions areit no restricted of restrictedal restricted diffusion or abancement to   No REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST   M\n     vol vol vol vol', 'Impression: IMPRESSION: 1 evidence change in left appearance par cerine cord les2/intenseities, No evidencehancing lesions are seen.\nGiven this impression, what MRI should we use?

decoded_preds:----------------------
 ['Impression: Ipression:  Noable appearance M.   evidence changeastatic or  able left of the right front loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain:  1,,ddddddd   Im Im MR MR MR MR MR--- MR MR- MR MR-- MR MR MR- MR MR MR MR MR', 'Impression: IMPRESSION:  Noifocal areas2/LAIR hyperintens lesions involving the supratentorial and matter, compatible to the vent front cerebellar peduncle, which described as within the cer cererebellar hemisphere, compatible are represent sequ sequoch lesques.\nthe appropriate clinical setting.\nNo No findions areit no restricted of restrictedal restricted diffusion or enhancement to   No REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST   M\nmit/ of vol vol vol', 'Impression: IMPRESSION: 1 evidence change in size appearance les cerine cord les2/intensities, No newhancing lesions are seen.\nGiven this impression, what MRI should

decoded_preds:----------------------
 ['Impression: Ipression:  Noable appearance M.   evidence orastatic or  able left of the right front loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain:   D  dddddd vent vent vent Im Im Im Im Im Im M M MR MR MR MR, MR MR MR Im Im MR MR Im Im Im Im Im Im Im Im Im', 'Impression: IMPRESSION:  Noifocal areas2/LAIR hyperintens lesions in the supratentorial and matter, compatible to the vent front cerebellar peduncle, compatible described as within the cer cerebellum hemisphere, compatible are represent the patientoch lesques.\nthe appropriate clinical setting.\nThe findions areit no restricted of restrictedal restricted diffusion or enhancement.\nThe REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST   M\nmit/ of of of of', 'Impression: IMPRESSION: 1 ac change in size appearance M cerine cord les2 hyperintensities, No newhancing lesions are seen.\nGiven this impression,

decoded_preds:----------------------
 ['Impression: Ipression:  Noable exam MR with   evidence orastatic.\nable left of the right front loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain:   Comddddddddd of of of of Im MR, of of Im Im Im Im Im', 'Impression: IMPRESSION:  Noifocal areas2/LAIR hyperintens lesions in the supratentorial and matter, compatible to the vent later cerebellar peduncle, as described as within the cer cerebellum hemisphere, compatible are represent sequ patientoch lesques.\nthe appropriate clinical setting.\nNo The findions areit no restricted of restrictedal restricted diffusion or enhancement to  No REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST   M\n /', 'Impression: IMPRESSION: 1 ac change in size appearance les cerine cord les2/intensities, No newhancing lesions to seen.\nGiven this impression, what MRI should we use?\nAnswer: MRI OFRAIN WITHERVICAL SPINE WITH WO IV CONTR

decoded_preds:----------------------
 ['Impression: Ipression:  Noable appearance MR.   evidence orastatic.\nable left of the left front loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain:   Mdddddddd Im of of of of of M M M of of M MR MR MR M of of of to of', 'Impression: IMPRESSION:  Noifocal areas2/LAIR hyperintens lesions in the supratentorial and matter, consistent to the vent later cerebellar peduncle, as described as within the cer cerebellum hemisphere, consistent are represent sequ patientoch lesques.\nthe appropriate clinical setting.\nThe findions areit no restricted of restrictedal restricted diffusion or enhancement to  The REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST   M\nmit/', 'Impression: IMPRESSION: 1 ac change in size appearance les cerine cord les2 hyperintensities, No newhancing lesions are seen.\nGiven this impression, what MRI should we use?\nAnswer: MRI OFRAIN ANDERVICAL S

decoded_preds:----------------------
 ['Impression: Ipression:  Noable appearance MR.   evidence orastatic or  able left of the left front loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain:   Dddddddddd Im Im Im Im Im Im M M of of Im MR,,, of Im Im Im to Im Im Im Im', 'Impression: IMPRESSION:  Noifocal su2/LAIR hyperintense lesions in the supratentorial and matter, consistent to the vent later cerebellar peduncle, as described as within the cer cerebellum hemisphere, consistent are represent sequ patientoch lesques.\nthe setting clinical setting.\nNo The findions areit no restricted of restrictedal restricted diffusion or enhancement to  No REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST   M\n//', 'Impression: IMPRESSION: 1 ac change in size appearance les cerine cord les2 hyperintensities, No newhancing lesions are seen.\nGiven this impression, what MRI should we use?\nAnswer: MRI OFRAIN ANDERVICA

decoded_preds:----------------------
 ['Impression: Ipression:  Noable appearance MR.   evidence orastatic or  able left of the left front loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain:   Dddddddddd Im Im of of Im of to to of of of to,,, of of Im of to of Im Im Im', 'Impression: IMPRESSION:  Noifocal su2/LAIR hyperintense lesions in the supratentorial and matter, consistent to the vent later cerebellar peduncle, as described as within the cer cerebellum hemisphere, consistent are represent sequ patientoch lesques.\nthe setting clinical setting.\nThe findions areit no restricted of restrictedal restricted diffusion or enhancement to  No REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST   M\n//', 'Impression: IMPRESSION: 1 ac change in size appearance les cerine cord les2 hyperintensities, No newhancing lesions are seen.\nGiven this impression, what MRI should we use?\nAnswer: MRI OFRAIN WITHERVICA

decoded_preds:----------------------
 ['Impression: Ipression:  Noable appearance MR.   evidence orastatic or  able left of the left front loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain:   Dddddddddd Im Im of Im Im of M to to of of to,,, of Im Im of to of Im Im Im', 'Impression: IMPRESSION:  Noifocal su2/LAIR hyperintense lesions in the supratentorial and matter, consistent to the vent later cerebellar peduncle, as described as within the cer cerebellum hemisphere, consistent are represent sequ patientoch lesques.\nthe setting clinical setting.\nThe findions areit no restricted of restrictedal restricted diffusion or enhancement to  No REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST   M\n//', 'Impression: IMPRESSION: 1 ac change in size appearance les cerine cord les2 hyperintensities, No newhancing lesions are seen.\nGiven this impression, what MRI should we use?\nAnswer: MRI OFRAIN WITHERVICAL

In [77]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [78]:
!huggingface-cli login --token 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

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/imx2/.cache/huggingface/token
Login successful


pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/imxx/llama-2-7b-chest-pelvis-mri/commit/dc57aa842c5fc20e20f9ab3652c029c8fc70e82e', commit_message='Upload tokenizer', commit_description='', oid='dc57aa842c5fc20e20f9ab3652c029c8fc70e82e', pr_url=None, pr_revision=None, pr_num=None)

In [79]:
# test mri classification
mri_references = mri_classification_test_dataset['Narrative'][:1000]
mri_contexts = mri_classification_test_dataset['Impression'][:1000]
mri_predictions = get_classification(mri_contexts, model, tokenizer, 32, device_map, 'MRI')

100%|████████████████████████████████████████████████████████████| 1000/1000 [28:33<00:00,  1.71s/it]


In [80]:
mri_references

['MRI BRAIN WITH AND WITHOUT CONTRAST  ',
 'MRI brain with and without contrast.  ',
 'MRI OF THE BRAIN WITH AND WITHOUT INTRAVENOUS CONTRAST  ',
 'MRI BRAIN WITH AND WITHOUT CONTRAST    ',
 'MRI BRAIN WITH AND WITHOUT INTRAVENOUS CONTRAST ENHANCEMENT MRI of the soft tissues of the neck with and without intravenous contrast enhancement.  ',
 'MRI BRAIN WITH AND WITHOUT CONTRAST MRA BRAIN WITHOUT and with  CONTRAST  ',
 'MRI OF THE BRAIN WITH AND WITHOUT INTRAVENOUS CONTRAST  ',
 'MRI of the brain and orbits without and with intravenous contrast.  ',
 'MRI OF THE BRAIN WITHOUT INTRAVENOUS CONTRAST  ',
 'MRI OF THE BRAIN WITH AND WITHOUT INTRAVENOUS CONTRAST (MULTIPLE SCLEROSIS PROTOCOL)  ',
 'MRI BRAIN WITH AND WITHOUT IV CONTRAST  ',
 'MRI OF THE BRAIN WITH AND WITHOUT INTRAVENOUS CONTRAST  ',
 'MR BRAIN WITH AND WITHOUT CONTRAST  ',
 'MRI BRAIN W WO IV CONTRAST, MRA NECK W AND/OR WO IV CONTRAST  ',
 'MRI BRAIN WITH AND WITHOUT CONTRAST. MRI CERVICAL SPINE WITHOUT CONTRAST.  ',
 'MRI B

In [81]:
mri_predictions

[' MRI BRAIN WITHOUT CONTRAST    MRA BRAIN WITHOUT CONTRAST    MRV BRA',
 ' MRI OF THE BRAIN WITH AND WITHOUT INTRAVENOUS CONTRAST   TECHNIQUE:  Multiplan',
 ' MRI BRAIN W WO IV CONTRAST   TIME OF EXAM: 12/10/2013 ',
 ' MRI BRAIN WITH AND WITHOUT CONTRAST    ',
 ' MRI BRAIN WITH AND WITHOUT CONTRAST   MRI OF ORBITS WITHOUT AND WITH CONTRAST',
 ' MRI BRAIN WITH AND WITHOUT CONTRAST MRA BRAIN WITHOUT CONTRAST   MRA NE',
 ' MRI BRAIN WITH AND WITHOUT CONTRAST     MRI OF THE TOTAL SPINE WITH AND WITHOUT INT',
 ' MRI BRAIN AND ORBITS WITH AND WITHOUT CONTRAST   MRI OF ORBITS WITHOUT AND WITH',
 ' MRI brain without contrast.   Diffusion weighting was not performed.  \n Right',
 ' MRI OF THE BRAIN WITH AND WITHOUT CONTRAST:   TECHNIQUE: Multisequential imaging was obtained',
 ' MRI BRAIN WITHOUT CONTRAST    MRI OF ORBITS WITHOUT AND WITH CONTRAST   M',
 ' MRI OF THE BRAIN WITH AND WITHOUT INTRAVENOUS CONTRAST   Comparison:  Outside lab M',
 ' MR BRAIN WITH AND WITHOUT CONTRAST   MR NECK WITH 

In [82]:
rouge_scores = get_rouge_scores(mri_predictions, mri_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(mri_predictions, mri_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(mri_predictions, mri_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, mri_predictions)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 59.92024001402252
Rouge2: 39.65740842775718
RougeL: 57.02316285341637




Bert score: 29.356314859038683
Average response lengths: {'prediction': 70.975, 'reference': 49.625}
Hallucination percent: 0.3831328629094881


In [86]:
ct_predictions2 = get_impressions(ct_contexts, model, tokenizer, 64, device_map)

100%|████████████████████████████████████████████████████████████| 1000/1000 [53:57<00:00,  3.24s/it]


In [87]:
rouge_scores = get_rouge_scores(ct_predictions2, ct_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(ct_predictions2, ct_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(ct_predictions2, ct_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, ct_predictions2)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 23.15244160181356
Rouge2: 10.119609046179066
RougeL: 18.50162374012398
Bert score: 12.344726662541508
Average response lengths: {'prediction': 174.036, 'reference': 270.149}
Hallucination percent: 0.3649407281352144


In [88]:
pelvis_predictions4 = get_impressions(pelvis_contexts, model, tokenizer, 64, device_map)

100%|████████████████████████████████████████████████████████████| 1000/1000 [54:55<00:00,  3.30s/it]


In [89]:
rouge_scores = get_rouge_scores(pelvis_predictions4, pelvis_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(pelvis_predictions4, pelvis_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(pelvis_predictions4, pelvis_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, pelvis_predictions4)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 23.377694633169597
Rouge2: 9.691499998584039
RougeL: 18.40797520380341
Bert score: 13.544985684782295
Average response lengths: {'prediction': 178.736, 'reference': 232.319}
Hallucination percent: 0.35613481422153564


In [90]:
ct_predictions2[0:10]

[' No evidence of metastatic disease in the chest.  \n Given this impression, what MRI should we use?\n Answer: MRI BRAIN WITH AND WITHOUT CONTRAST MRI OF THORACIC SPINE WITH AND WITHOUT CONTRAST   M',
 " No significant interval change in the patient's known bony metastasis. No new bone lesions are seen.  \n Given this impression, what MRI should we use?\n Answer: MRI BONE WITH CONTRAST   MR",
 ' Stable examination. No new or enlarging lung nodules or thoracic adenopathy.  \n Given this impression, what MRI should we use?\n Answer: MRI BRAIN W WO IV CONTRAST, MRI THORACIC SPINE W W',
 '1. No evidence of disease recurrence. 2. New 1 cm groundglass nodule in the left lower lobe. 3. Stable subcentimeter right supraclavicular lymph node. 4. Stable right-sided infusion port.  \n Impression: Crit',
 ' No evidence of metastatic disease in the chest.   \n',
 ' Stable examination.  \n Given this impression, what should be the next step?\n Answer: MR of the brain.  \n Answer : MR of the total sp

In [91]:
pelvis_predictions4[0:10]

[' No evidence of abscess.  \n Given this impression, what MRI should we use?\n Answer: MRI ABDOMEN W IV CONTRAST   MRI PELVIS WO IV CONTRAST   MRV OF THE ABDOMEN AND PELVIS WO IV CONTRA',
 ' No acute intra-abdominal or intrapelvic pathology.  \n Given this impression, what MRI should we use?\n Answer: MRI BRAIN WITH AND WITHOUT CONTRAST MRI OF THE TOTAL ABDOMEN WITH AND WITHOUT',
 ' No acute intra-abdominal pathology.  No evidence of acute intra-pelvic pathology.  No evidence of acute vascular injury.  No evidence of acute lung injury.  No evidence of acute liver injury.  No evidence of acute pancre',
 ' No evidence of right inguinal hernia.  \n Given this impression, what MRI should we use?\n Answer: MRI ABDOMEN PELVIS WO IV CONTRAST     MR TOTAL SPINE WO IV CONTRAST     MR VENA EQUI',
 ' No evidence of hepatic artery thrombosis.  \n Given this impression, what MRI should we use?\n Answer: MRI BRAIN WO IV CONTRAST, MRI ABDOMEN WO IV CONTRAST, MRI SOFT TISSUE NE',
 ' Interval increase

## base model fine-tuned on mri classification

In [55]:
model_name = "meta-llama/Llama-2-7b-chat-hf"
new_model = "llama-2-7b-radnlp-mri"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# model = tp.tensor_parallel(
#     AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
# #     device_map=device_map
#     )
# )
# model = tp.tensor_parallel(model, ["cuda:0", "cuda:1"])  # <- each GPU has half the weights
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [56]:
sample_dataset = mri_classification_train_dataset.shuffle(seed=42)
sample_dataset = sample_dataset.select(range(10000))
sample_dataset
sample_val = mri_classification_val_dataset.shuffle(seed=42)
sample_val = sample_val.select(range(1500))

In [57]:
finetune(model, 
         sample_dataset, #ct_impressions_train_dataset, 
         sample_val,
         peft_config, 
         256, # different than inference max length since it counts full text
         tokenizer, 
         training_arguments, 
         packing, 
         generate_mri_classification_prompt, 
         new_model,
         compute_metrics,
         preprocess_logits_for_metrics)



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Pred Len,Ref Len
500,0.8,1.133235,48.4508,31.2208,46.5647,47.7269,195.786667,108.575333
1000,0.7432,1.048339,45.9695,30.4587,44.2294,45.3332,195.786667,108.575333
1500,0.6067,1.021128,46.7994,30.9908,45.2356,46.1334,195.786667,108.575333
2000,0.6629,0.98267,44.9978,30.0484,43.261,44.4009,195.786667,108.575333
2500,0.6188,0.970001,51.8496,34.7682,50.0535,51.1738,195.786667,108.575333
3000,0.6228,0.951037,51.1397,34.801,49.4921,50.4607,195.786667,108.575333
3500,0.5487,0.943926,46.983,32.0427,45.4015,46.3763,195.786667,108.575333
4000,0.6546,0.928613,46.931,31.8935,45.2468,46.2444,195.786667,108.575333
4500,0.5935,0.917679,50.531,35.0342,48.8755,49.8993,195.786667,108.575333
5000,0.5707,0.911506,48.3137,33.4042,46.7653,47.7047,195.786667,108.575333


decoded_preds:----------------------
 ['Impression: Ipression:  Noable appearance M imag No evidence changeastatic.\nable left of the brain front loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain without  \n                                ::::::::::::::::::::::::::::: M::: M M M M::::::::::::::', 'Impression: IMPRESSION:  Noifocal areas2/LAIR signalintens lesions are the cpratentorial and matter, consistent to the vent c cerebellar aruncle and and well as within the left cerebellar hemisphere.\nare are represent chron underlyingoch valques.\nthe c cical context.\nNo lesions areit no enh of enhal enh diffusion.\nenhancement.\nNo REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT CON WITH CONTRAST  \n M', 'Impression: IMPRESSION:  No ac change in the left.\nnoine cord.2/intensity.\nnewhancing lesions.\nidentified.\nGiven this impression, what MRI should we use?\nAnswer: MRI BRAIN WLEVICAL SPINE WITO IV CONTRAST  \n M/   

decoded_preds:----------------------
 ['Impression: Ipression:  Noable exam M scan   evidence changeastatic identified  able left of the left temporal loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain with  \n     M M M M M M M M M M M M M M M M::::: M M::::::::::::: M M M M M M M M M M M M M M MR M:', 'Impression: IMPRESSION:  Noifocal areas2/LAIR hyperintens lesions in the perpratentorial white matter, consistent to the c c cerebellar aruncle, left well as within the left cerebellum hemisphere, which are represent dem sequoch valques.\nthe setting settingical setting.\nNo findions areit no restricted of enhal enh diffusion or enhancement to   The REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI OFRAIN WITH WHOUT CON WITH CONTRAST   M', 'Impression: IMPRESSION: 1 evidence change in the previously.\ncerine cord.2/intenseities.\nevidencehancing lesions identified identified.\nGiven this impression, what MRI should we use?\nAnswer: 

decoded_preds:----------------------
 ['::::::::::::::::', 'Impression: IMPRESSION:  Noifocal areas2/LAIR signalintense lesions in the supratentorial and matter, compatible to the left c cerebellar aruncle, and well as within the left cerebellar hemisphere.\nwhich are represent sequ sequotic valques.\nthe setting clinical setting.\nNo No findions areit no enh of enhal enh diffusion or enhancement.\nNo REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN W WHOUT AND WITH CONTRAST', 'Impression: IMPRESSION: 1 ac interval in size appearance les spine cord les2/intenseities.\nNo newhancing lesions are identified.\nGiven this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHERVICAL SPINE WITH WO IV CONTRAST   M M                       of of of of of of of of of of of of::::::::::    :::::   of of of of of of of of      given given givenddd of of of of of of of of ofdddd  T T of of', 'Impression: IMPRESSION:  No acute intracranial abology.\nidentified.\nnote

decoded_preds:----------------------
 ['MR M M M M M M M M M M M M M M M or', 'Impression: IMPRESSION:  Noifocal en2/LAIR signalintens lesions in the supratentorial and matter, consistent to the left c cerebellar peduncle, and detailed as within the cer cerebellar hemisphere, consistent are represent les sequoch valques.\nthe setting clinical setting.\nNo No lesions areit no restricted of enhal enh diffusion or enhancement.\nNo REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST  \n M   se se', 'Impression: IMPRESSION: 1 ac change in the size les cerine cord les2/intensities.\nNo newhancing lesions are identified.\nGiven this impression, what MRI should we use?\nAnswer: MRI BRAIN ANDERVICAL SPINE WITH WO IV CONTRAST   M M Im M M M M M M M M M M T T T T T T T', 'Impression: IMPRESSION:  No acute intracranial abology.\nidentified.\nnote that patient was not to toler the sequences of to claausea.\nvomiting.\nGiven this impression, what MR

decoded_preds:----------------------
 ['Impression: Ipression:  Noable post MR.   evidence changeastatic.\nable left of the met par loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain without   Com        :::: M:::: M M M M M M M M M M M M M:', 'Impression: IMPRESSION:  Noifocal en2/LAIR hyperintense lesions in the supratentorial white matter, compatible to the vent c cerebellar peduncle, and well as within the cer cerebellar hemisphere.\nwhich are represent sequ sequoch valques of the setting settingical setting.\nNo findions areit no restricted of restrictedal restricted diffusion or abancement.\nNo REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST   M', 'Impression: IMPRESSION: 1 ac change in size size met cerine cord les2/intenseities.\nNo newhancing lesions.\nidentified.\nGiven this impression, what MRI should we use?\nAnswer: MRI OFRAIN WITHERVICAL SPINE WITH WO IV CONTRAST   M\n      ::         

decoded_preds:----------------------
 ['Impression: Ipression:  Noable appearance MR.   evidence orastases.\nable left of the left par loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain without   D          of of::::::::::::::\n     M:          M', 'Impression: IMPRESSION:  Noifocal areas2/LAIR hyperintens lesions in the supratentorial white matter, consistent to the left front cerebellar peduncle, and well as within the left cerebellar hemisphere, which are represent dem sequoch valques of the setting clinical setting.\nNo lesions areit no restricted of restrictedal restricted diffusion or abancement to   No REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST   RE', 'Impression: IMPRESSION: 1 evidence change in size appearance met cerine cord les2/intenseities, No newhancing lesions are seen.\nGiven this impression, what MRI should we use?\nAnswer: MRI OFRAIN WITHERVICAL SPINE WITH WO IV CONTRAST   M M

decoded_preds:----------------------
 ['Impression: Ipression:  Noable appearance MR.   evidence orastatic.\nable left of the left front loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain without   History/  M M M M M M  \n M M                      M', 'Impression: IMPRESSION:  Noifocal areas2/LAIR hyperintens lesions in the supratentorial and matter, compatible to the cor opt cerebellar peduncle, and well as within the cer cerebellar hemisphere, which are represent dem sequoch lesques of the setting clinical setting.\nNo No lesions areit no restricted of restrictedal restricted diffusion or abancement to  No REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST', 'Impression: IMPRESSION: 1 evidence change in size appearance met cerine cord les2/intenseities, No newhancing lesions are seen.\nGiven this impression, what MRI should we use?\nAnswer: MRI OFRAIN WITHERVICAL SPINE WITH WO IV CONTRAST   M T of o

decoded_preds:----------------------
 ['Impression: Ipression:  Noable appearance MR.   evidence orastases.\nable left of the left front loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain with   Com    T    T T M M:\n  M:: M   M M M M M M M M M M M M     MR', 'Impression: IMPRESSION:  Noifocal areas2/LAIR hyperintens lesions in the supratentorial white matter, compatible to the vent opt cerebellar peduncle, and well as within the cer cerebellum hemisphere, consistent are represent dem sequoch valques of the setting settingical setting.\nNo findions areit no restricted of enhal enh diffusion or abancement to  No REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST   RE', 'Impression: IMPRESSION: 1 ac change in size appearance M cerine cord les2 hyperintensities, No evidencehancing lesions are seen.\nGiven this impression, what MRI should we use?\nAnswer: MRI OFRAIN WITHERVICAL SPINE WITH WO IV CONTRAST   

decoded_preds:----------------------
 ['Impression: Ipression:  Noable exam MR. No evidence orastases.\nable pun of left left par loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain without   History// T T T T of of of M M M\n M M\n M M M\n\n M M M M M M M M M M M M M M     M', 'Impression: IMPRESSION:  Noifocal areas2/LAIR signalintens lesions in the supratentorial and matter, compatible to the c later cerebellar peduncle, and well as within the cer cerebellar hemisphere, consistent are represent dem sequoch valques of the setting clinical setting.\nNo No findions areit no restricted of enhal restricted diffusion or abancement.\nNo REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST   RE IN', 'Impression: IMPRESSION: 1 ac int in size appearance M cerine cord les2/intensities.\nNo newhancing lesions are seen.\nGiven this impression, what MRI should we use?\nAnswer: MRI OFRAIN ANDERVICAL SPINE WITH WO IV 

decoded_preds:----------------------
 ['Impression: Ipression:  Noable appearance MR.   evidence orastases.\nable left of left left par loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain without   Date// T T T T T T T T M M\n M M M\n M M M M M M M M M M M M M M M     M', 'Impression: IMPRESSION:  Noifocal areas2/LAIR hyperintens lesions in the supratentorial and matter, compatible to the vent opt cerebellar peduncle, and well as within the cer cerebellar hemisphere, which are represent dem sequoch valques of the setting clinical setting.\nNo No findions areit no restricted of enhal restricted diffusion or abancement to  No REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST   RE RE', 'Impression: IMPRESSION: 1 ac change in size appearance met cerine cord les2/intenseities, No evidencehancing lesions are seen.\nGiven this impression, what MRI should we use?\nAnswer: MRI OFRAIN ANDERVICAL SPINE WITH WO IV

decoded_preds:----------------------
 ['Impression: Ipression:  Noable appearance MR.   evidence orastases.\nable pun of left left par loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain without   Date// T T T T of of of of ofdd M M\n M M M\n M M M M M M M M M M M M M M M     M', 'Impression: IMPRESSION:  Noifocal areas2/LAIR hyperintens lesions in the supratentorial white matter, compatible to the vent opt cerebellar peduncle, and well as within the cer cerebellar pedisphere, consistent are represent dem sequoch valques of multiple setting clinical setting.\nNo No findions areit no restricted of enhal restricted diffusion or abancement to  No REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST   RE RE of of', 'Impression: IMPRESSION: 1 ac change in size size met cerine cord les2 hyperintenseities, No newhancing lesions are seen.\nGiven this impression, what MRI should we use?\nAnswer: MRI OFRAIN ANDERVI

decoded_preds:----------------------
 ['Impression: Ipression:  Noable appearance MR.   evidence orastases.\nable pun of left left par loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain without   Date    T T T T of of, ofdddd M Md M\n M M M\n\n M M M M M M M M M M M M M M     M', 'Impression: IMPRESSION:  Noifocal areas2/LAIR hyperintens lesions in the supratentorial white matter, consistent to the vent opt cerebellar peduncle, and well as within the cer cerebellar hemisphere, consistent are represent dem sequoch valques of multiple setting clinical setting.\nNo No findions areit no restricted of enhal restricted diffusion or abancement to  No REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST   RE RE  of of', 'Impression: IMPRESSION: 1 ac change in size size met cerine cord les2/intenseities, No newhancing lesions are seen.\nGiven this impression, what MRI should we use?\nAnswer: MRI BRAIN ANDERVICAL 

decoded_preds:----------------------
 ['Impression: Ipression:  Noable appearance MR.   evidence orastases.\nable pun of left left par loastasis.\nGiven this impression, what MRI should we use?\nAnswer: MRI B the brain without   Date   T T T T of of of, ofdddd M Md M\n M M M\n\n M M M M M M M M M M M M M M     M', 'Impression: IMPRESSION:  Noifocal areas2/LAIR hyperintens lesions in the supratentorial white matter, consistent to the vent opt cerebellar peduncle, and well as within the cer cerebellar hemisphere, consistent are represent dem sequoch valques of multiple setting clinical setting.\nNo No findions areit no restricted of enhal restricted diffusion or abancement to  No REPORT  \n Given this impression, what MRI should we use?\nAnswer: MRI BRAIN WITHOUT AND WITH CONTRAST   RE RE of of', 'Impression: IMPRESSION: 1 ac change in size size.\ncerine cord les2 hyperintenseities, No newhancing lesions are seen.\nGiven this impression, what MRI should we use?\nAnswer: MRI OFRAIN ANDERV

In [58]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [59]:
!huggingface-cli login --token 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

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/imx2/.cache/huggingface/token
Login successful


pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/imxx/llama-2-7b-radnlp-mri/commit/b5542aaa46c070a4e7397d8a097eb89ee79e63e4', commit_message='Upload tokenizer', commit_description='', oid='b5542aaa46c070a4e7397d8a097eb89ee79e63e4', pr_url=None, pr_revision=None, pr_num=None)

In [93]:
mri_references = mri_classification_test_dataset['Narrative'][:1000]
mri_contexts = mri_classification_test_dataset['Impression'][:1000]

In [61]:
mri_predictions = get_classification(mri_contexts, model, tokenizer, 32, device_map, 'MRI')

100%|████████████████████████████████████████████████████████████| 1000/1000 [28:58<00:00,  1.74s/it]


In [62]:
rouge_scores = get_rouge_scores(mri_predictions, mri_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(mri_predictions, mri_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(mri_predictions, mri_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, mri_predictions)
print(f"Hallucination percent: {base_spacy_scores[2]}")

  rouge = load_metric("rouge")


Rouge1: 61.548359442901365
Rouge2: 43.14843239153062
RougeL: 58.2394623357241




Bert score: 34.16926136452821
Average response lengths: {'prediction': 73.608, 'reference': 49.625}
Hallucination percent: 0.3441954569177306


# Pet/CT fine-tuning

## Pet/CT inference on base model

In [63]:
model_name = "meta-llama/Llama-2-7b-chat-hf"
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# model = tp.tensor_parallel(
#     AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
# #     device_map=device_map
#     )
# )
# model = tp.tensor_parallel(model, ["cuda:0", "cuda:1"])  # <- each GPU has half the weights
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [64]:
petct_references = petct_classification_test_dataset['Narrative'][:1000]
petct_contexts = petct_classification_test_dataset['Impression'][:1000]
petct_predictions = get_classification(petct_contexts, model, tokenizer, 32, device_map, 'Pet/CT')

100%|████████████████████████████████████████████████████████████| 1000/1000 [32:49<00:00,  1.97s/it]


In [65]:
petct_references[0:20]

['PET CT SKULL TO THIGH GALLIUM 68 DOTATATE (LM YM)  ',
 'PET CT SKULL TO THIGH AREA SUBSEQUENT performed on 6/22/2020  ',
 'F-18 FDG PET CT SKULL TO THIGH AREA SUBSEQUENT   ',
 'PET CT SKULL TO THIGH AREA SUBSEQUENT performed on 10/9/2017 12:51 PM  ',
 'PET CT SKULL TO THIGH AREA SUBSEQUENT   ',
 'EXAM: PET/CT whole body, follow-up performed on 11/8/2016  History/',
 'PET CT SKULL TO THIGH AREA SUBSEQUENT performed on 3/9/2020 12:36 PM  ',
 'F-18 FDG Body PET/CT scan.  ',
 'PET/CT STRESS CT READ on 11/13/2020 2:24 PM  ',
 'PET Myocardial Perfusion Imaging Following Regadenoson Vasodilation and at Rest with RB-82 with Gated PET and Analysis of Regional Wall Motion, and Cardiac CT Scan  ',
 'PET/CT SKULL TO THIGH AREA SUBSEQUENT   ',
 'PET Myocardial Perfusion Imaging Following Regadenoson Vasodilation and at Rest with RB-82 with Gated PET and Analysis of Regional Wall Motion, and Cardiac CT Scan  ',
 'Body F18 FDG PET-CT Scan.  ',
 'PET CT SKULL TO THIGH AREA SUBSEQUENT  ',
 'Limited N

In [66]:
petct_predictions[0:20]

['\n',
 '\nGiven the impression from the PET/CT scan, the following Pet/CT should be considered:\n\n1. FDG-PET',
 ' For further evaluation of the left tonsil, a PET/CT with contrast would be the most appropriate imaging modality. This will allow for visualization',
 " Based on the impression provided, it seems that the patient's FDG-avid lymphoma has not changed since the last PET/CT",
 ' For a patient with scattered pulmonary nodules and surrounding groundglass opacities, which are likely due to an infectious/inflammatory',
 '\nGiven the impression of hypermetabolic, enlarged left supraclavicular lymph node and additional small hypermetabolic left level',
 ' For patients with metastatic disease, a PET/CT scan is recommended to assess the extent of disease and to identify any potential sites of metastasis',
 ' If the patient has no significant interval change and no FDG avid malignancy, a routine Pet/CT scan without any additional contrast agents is appropriate',
 '\n',
 '\nBased on th

In [67]:
rouge_scores = get_rouge_scores(petct_predictions, petct_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(petct_predictions, petct_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(petct_predictions, petct_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, petct_predictions)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 12.522023509651065
Rouge2: 4.837680636213611
RougeL: 10.879182939750844




Bert score: -21.049059691405272
Average response lengths: {'prediction': 99.911, 'reference': 79.232}
Hallucination percent: 0.25940510782638504


## Inference using chest+pelvis+mri impression fine-tuned model

In [68]:

# model_name = "imxx/llama-2-7b-radnlpv2"
# # Load base model
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     device_map=device_map
# )
# # model = tp.tensor_parallel(
# #     AutoModelForCausalLM.from_pretrained(
# #     model_name,
# #     quantization_config=bnb_config,
# # #     device_map=device_map
# #     )
# # )
# # model = tp.tensor_parallel(model, ["cuda:0", "cuda:1"])  # <- each GPU has half the weights
# model.config.use_cache = False
# model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [69]:

model_name = "imxx/llama-2-7b-chest-pelvis-mri"
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# model = tp.tensor_parallel(
#     AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
# #     device_map=device_map
#     )
# )
# model = tp.tensor_parallel(model, ["cuda:0", "cuda:1"])  # <- each GPU has half the weights
model.config.use_cache = False
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [70]:
petct_references = petct_classification_test_dataset['Narrative'][:1000]
petct_contexts = petct_classification_test_dataset['Impression'][:1000]
petct_predictions2 = get_classification(petct_contexts, model, tokenizer, 32, device_map, 'Pet/CT')

100%|████████████████████████████████████████████████████████████| 1000/1000 [34:45<00:00,  2.09s/it]


In [71]:
petct_predictions2[0:10]

[' PET/CT WITH AND WITHOUT CONTRAST   History/indication: Follow-up for metastatic lung cancer.  ',
 ' PET/CT WITH AND WITHOUT CONTRAST   History/indication: History of lung cancer,',
 ' PET/CT WITH AND WITHOUT CONTRAST   Comparison: 1/15/2013  1:4',
 ' PET/CT BONE SOFT TISSUE WITH AND WITHOUT CONTRAST   History/indication: History of ren',
 ' PET/CT BONE RECONNAISSANCE WITH AND WITHOUT CONTRAST   History/indication: History of',
 ' PET/CT OF THE BRAIN WITH AND WITHOUT INTRAVENOUS CONTRAST.   History/indication',
 ' PET/CT WITH AND WITHOUT CONTRAST   Comparison: 1/15/2013  12:',
 '',
 ' CT BRAIN WITHOUT CONTRAST   GATED PET BRAIN WITHOUT CONTRAST   MRI',
 ' PET/CT WITH AND WITHOUT INTRAVENOUS RUBIDIUM-82  3-D MIP MRI']

In [72]:
rouge_scores = get_rouge_scores(petct_predictions2, petct_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(petct_predictions2, petct_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(petct_predictions2, petct_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, petct_predictions2)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 18.16899621754172
Rouge2: 7.825928655587014
RougeL: 16.908651637282116




Bert score: 3.621416587475687
Average response lengths: {'prediction': 70.673, 'reference': 79.232}


ValueError: Shape mismatch for blis.gemm: (0, 0), (480, 288)

## fine tune pet classifications on combined model

In [73]:
sample_dataset = petct_classification_train_dataset.shuffle(seed=42)
sample_dataset = sample_dataset.select(range(10000))
sample_dataset
sample_val = petct_classification_val_dataset.shuffle(seed=42)
sample_val = sample_val.select(range(1500))

In [75]:
new_model = "llama-2-7b-chest-pelvis-mri-pelvis"
finetune(model, 
         sample_dataset, #ct_impressions_train_dataset, 
         sample_val,
         peft_config, 
         256, # different than inference max length since it counts full text
         tokenizer, 
         training_arguments, 
         packing, 
         generate_petct_classification_prompt, 
         new_model,
         compute_metrics,
         preprocess_logits_for_metrics)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Pred Len,Ref Len
500,0.7048,1.068572,54.6002,36.7903,52.5661,54.0286,228.84,135.88
1000,0.6463,0.970147,61.0956,43.2354,59.1993,60.5234,228.84,135.88
1500,0.688,0.93505,61.9078,44.2311,60.0517,61.3127,228.84,135.88
2000,0.6695,0.913691,62.661,45.0197,60.9321,62.0948,228.84,135.88
2500,0.5998,0.895533,65.6122,47.8229,63.7969,65.0449,228.84,135.88
3000,0.5854,0.881433,64.024,46.7169,62.2577,63.4775,228.84,135.88
3500,0.5435,0.872924,64.5903,47.2188,62.7737,63.9851,228.84,135.88
4000,0.5459,0.864847,64.9584,47.6238,63.1806,64.3942,228.84,135.88
4500,0.5083,0.855574,65.401,48.2147,63.7025,64.8684,228.84,135.88
5000,0.5346,0.850628,66.4517,48.7827,64.6931,65.8838,228.84,135.88


decoded_preds:----------------------
 ['Impression: IMPRESSION:  No.\nerate hyperased hyperastic in the left medial left of the right par lotex.\nwith likely related theal radi of the rightizure focus.\n2.\nNo, there small focus of mildly increasedased metabolism is the right par front posterioral temporalietal cortex is n, which is a uncertain significanceical significance.\nmay be followed up on  3linical correlation with needed as further with prior EEG is recommended.\nGiven this impression, what PET/CT scan should we use?\nAnswer: PET/CT ST dated the brain dated Historysssssssntntntvityntntnt to to to to to Im Im Im Im Im Im Im Im Im Im Im Im Im to to to to5555', 'Impression: IMPRESSION:  Noinent hypermetabolic l tissue mass the rightoperative right.\nfind nonspecific findings and however may represent postactive changesoperative changesammatory changes.\namount-specific hyperildly hyperDG avid lateral ax 2 lvical lymph nodes are   newDG Pid l metastatic.\nGiven this impression, w

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erate hyperased Fabolic in the right medial right of the right front lotex, which consistent related aal med of the rightizure focus.\n2.\nM, there foc focus of mildly increasedased metabolism within the left front front posterioral temporalietal lotex, n, which is a unclear clinical significance.\nmay be followed up.\n3linical correlation with well as follow with E EEG is suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: FET/CT scan of the brain dated History/s left left left left F', 'Impression: IMPRESSION: 1inent Fmetabolism right tissue mass the rightoperative bed, No find nonspecific findings, however may represent postactive infloperative changesammatory changes.\nT hyper-FD lungildly hyperDG avid nodateral ple 2 cervical lymph nodes.\nThese FDG avid met metastatic.\nGiven this impression, what PET/CT scan should we use?\nAnswer: FET/CT SKULL TO THIGH  REA SUBITIAL    History///', 'Impre

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erate hyperased hyperabolism in the left medial right of the right he lotex, which consistent representing residal temporal of the rightizure focus.\n2.\nNo, there new focus of mildly decreased metabolism in the left temporal temporal superioral temporalietal lotex, also, which is a uncertain clinical significance.\nmay be followed up.\n3linical correlation with indicated as follow with E EEG is suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: FET/CT B of the brain dated History left left left left left', 'Impression: IMPRESSION: 1inent hypermetabolic les tissue mass the rightoperative bed, No find nonspecific findings, however may represent inflactive changesoperative changesammatory changes.\nm-FD mildly hyperDG avid leftateral cer 2 lvical lymph nodes.\nnewDG Pid les metastatic.\nGiven this impression, what PET/CT scan should we use?\nAnswer: PET/CT SKULL TO THIGH  REA INITIAL    History/',

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erate hyperased Fabolism in the right medial right of the right temporal lotex is likely likely related aalization of the rightizure focus.\n2.\nM, there small area of mildly decreased metabolism within the left front front superioral temporalietal cortex, also, which is likely unclear clinical significance.\nmay be followed up.\n3linical correlation with indicated as follow with E EEG is suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: FET CTCT scan of the brain dated Historyianteianteianteianteianteianteianteraderaderade F F F F F F F F F F F F F F F F to to toiloilo to F Filoiloiloiloдедедедедеiloilo', 'Impression: IMPRESSION: 1inent hypermetabolism right tissue density the rightoperative bed, No find nonspecific findings, which may represent inflactive infloperative changesammatory changes.\nT m-FD mildly hyperDG avid subateral cer II2 lvical lymph nodes.\nnewDG Pid les metastasis.\nGiven 

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erately hyperased hyperabolic in the right medial right of the right front lotex is which likely related thealization of the rightizure focus.\n2.\nM, there small focus of mildly increasedased metabolism within the right front front posterioral temporalietal cortex, also.\nwhich is a uncertain clinical significance.\nmay be further up.\n3linical correlation.\nindicated as further with E EEG is suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: PET/CT B of the brain dated History Wallianteianteianteraderaderade F to to to to toде to to to toде to to to to', 'Impression: IMPRESSION: 1inent hypermetabolic les tissue in the rightoperative bed in No are nonspecific findings, which may represent inflactive infloperative changesammatory changes.\nT m-specific mildly hyperDG avid rightateral cer II2 cervical lymph nodes.\nFDG avid met metastasis.\nGiven this impression, what PET/CT scan should we use?\n

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erate Fased hyperabolism in the right medial right of the right temporal lotex is likely likely represents aalization of the rightizure activity.\n2.\nM, there small focus of mildly decreased metabolism within the right par temporal posterioral temporalietal cortex may also, which is n uncertain clinical significance.\nmay be followed up.\n3linical correlation is indicated as correlation with E EEG is suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: PET/CT scan of the brain   Historyikeikeianteianteianteianteianteyle F Fдеyleyleyleyleyle F F F F F F F F F F F F F F F F F F F F F F F F Fkekeyleyle F F F Fkekekekekekekekekekekekekekekeke', 'Impression: IMPRESSION: 1inent hypermetabolic les tissue nod the rightoperative bed, This are nonspecific findings, however may represent inflactive infloperative changesammatory changes.\nT m-specific mildly hyperDG avid focusateral cer II2 cervical lymph no

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erately hyperased hyperabolism in the left medial right of the left temporal lotex is which consistent represents residalization of the seizure activity.\n2.\nM, there small focus of mildly increasedased metabolism within the right par front posterioral temporalietal cortex, also, which is n uncertain clinical significance.\nmay be further up.\n3linical correlation is indicated as correlation with E EEG is suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: PET/CT scan of the brain dated Historyikeikeikeikeikeikeikeдедедедедеyleyleyleдедедеде Fде F F F F F F F F F F F F F F F F F F F F F F Fyleyleде F F F Fдедедедедеkekekekeyleдеkekekeke', 'Impression: IMPRESSION: 1inent hypermetabolic les tissue mass the rightoperative bed, No are nonspecific findings, which may represent inflactive infloperative changesammatory changes.\nT m-FD mildly hyperDG avid leftateral cer II2 cervical lymph nodes.\nother

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erately hyperased metabolism in the right medial right of the right temporal lotex is likely likely related aalization of the rightizure focus.\n2.\nM, there small area of mildly decreased metabolism within the right par temporal superioral temporalietal cortex, also.\nwhich is likely uncertain clinical significance.\nmay be followed up.\n3linical correlation is well as correlation with E EEG is suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: FET/CT B of the brain dated Dateikeikeikeianteдедедедедедедеде F F F F F F F F F F F F F F F F F F F F F F Fдедеде F F F Fдедеде Fдедеkekekekekekekekekeke', 'Impression: IMPRESSION: 1inent hypermetabolic right tissue mass the rightoperative bed of No are nonspecific findings, however may represent inflactive infloperative changesammatory changes.\nT m-FD mildly hyperDG avid leftateral cer 2 cervical lymph nodes.\nhyperDG avid met metastasis.\nGiven this 

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erately hyperased hyperabolism in the right medial right of the right temporal lotex is which likely related aal radi of the rightizure activity.\n2.\nM, there small area of mildly increasedased metabolism within the right par front posterioral temporalietal cortex, also.\nwhich is of uncertain clinical significance.\nmay be followed up.\n3linical correlation is well as correlation with E EEG is suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: FET/CT scan of the brain dated .ikeikeikeдедедедедедедедедеде F F F F F F F F F F F F F F F F F F F F F F F F Fдеде F F F F F F F F F Fдеkeke F F Fkekeke F', 'Impression: IMPRESSION: 1inent hypermetabolic les tissue mass the rightoperative bed, No are nonspecific findings, however may represent inflactive infloperative changesammatory changes.\nT m-specific mildly hyperDG avid focusateral cer II2 cervical lymph nodes.\notherDG avid met metastasis.\nGiven

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erately hyperased metabolism in the right medial right of the right he lotex is which likely related thealization of the rightizure activity.\n2.\nM, there small focus of mildly increasedased metabolism within the right par temporal posterioral temporalietal cortex, also.\nwhich is likely uncertain clinical significance.\nmay be further up.\n3linical correlation is well as correlation with the EEG is suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: FET/CT scan of the brain dated Historyntikeiantentntntntntntдедедеyleyleyleyleдедедедедедедедедедедедедедедедеде F F F F F F F F Fдедеде F F F F F F F F F F F F F F F F F Fkekeдеде F F Fkekekekekekekekekekekekekekekekekekekekeдедедеде', 'Impression: IMPRESSION: 1inent hypermetabolic les tissue les the rightoperative bed, This are nonspecific findings, which may represent inflactive infloperative changesammatory changes.\nT m-specific mildly hyperDG 

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erately hyperased metabolism in the right medial right of the right temporal lotex is which likely represents theal radi of the seizure activity.\n2.\nM, there small area of mildly decreased metabolism within the right par temporal posterioral temporalietal cortex is also.\nwhich is of uncertain clinical significance.\nmay be followed up.\n3linical correlation is indicated as correlation with the EEG is suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: FET/CT scan of the brain dated Datentntntntntntntntдедедедедедедедедедедедедедедедедедедедедедедедеде F F F F F F F F Fдедедедеде F F F F F F F F F F F F F F F F Fkekekekekekekekekekekekekekekekekekekekekekekekekekekekekekekekekekeдедеде', 'Impression: IMPRESSION: 1inent hypermetabolic les tissue les the rightoperative bed, No are nonspecific findings, which may represent inflactive infloperative changesammatory changes.\nT m-specific mildly hype

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erately hyperased metabolism in the right medial right of the right front lotex is which likely represents theal radi of the rightizure activity.\n2.\nM, there small area of mildly decreased metabolism within the right par temporal posterioral temporalietal cortex is also.\nwhich is of uncertain clinical significance.\nmay be further up.\n3linical correlation is indicated as correlation with the EEG is suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: FET/CT scan of the brain dated Datentntianteianteдедеntдедедедедедедедедедедедедедедедедедедедедедедедедеде F F F F F F F F Fдедеде F Fдедедеде F F F F F F F F F F F F F F F F F F F Fkekekekekekekekekekekekekekekekekekekekekekekekekekekekekekekekeдедедедедедеде', 'Impression: IMPRESSION: 1inent hypermetabolic les tissue les the rightoperative bed, This are nonspecific findings, which may represent inflactive infloperative changesammatory changes.\

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erately hyperased metabolism in the right medial right of the right front lotex is which likely represents theal radi of the rightizure activity.\n2.\nM, there small area of mildly decreased metabolism within the right par temporal posterioral temporalietal cortex is also.\nwhich is of uncertain clinical significance.\nmay be further up.\n3linical correlation is indicated as correlation with the EEG is suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: FET/CT scan of the brain dated Datentntianteдедедеntдедедедедедедедедедедедедедедедедедедедедедедедедеде F F F F F F F F Fдедеде F Fдедедеде F F F F F F F F F F F F F F F F F F F Fkekekekekekekekekekekekekekekekekekekekekekekekekekekekekekekekeдедедедедедеде', 'Impression: IMPRESSION: 1inent hypermetabolic les tissue les the rightoperative bed, This are nonspecific findings, which may represent inflactive infloperative changesammatory changes.\nT 

In [76]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

In [77]:
!huggingface-cli login --token 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

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/imx2/.cache/huggingface/token
Login successful


pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/imxx/llama-2-7b-chest-pelvis-mri-pelvis/commit/5e547692b15630a15e35bae48cc3057c66a9924a', commit_message='Upload tokenizer', commit_description='', oid='5e547692b15630a15e35bae48cc3057c66a9924a', pr_url=None, pr_revision=None, pr_num=None)

In [78]:
# test mri classification
petct_references = petct_classification_test_dataset['Narrative'][:1000]
petct_contexts = petct_classification_test_dataset['Impression'][:1000]
petct_predictions = get_classification(petct_contexts, model, tokenizer, 64, device_map, 'Pet/CT')

100%|████████████████████████████████████████████████████████████| 1000/1000 [43:43<00:00,  2.62s/it]


In [79]:
petct_references[0:10]

['PET CT SKULL TO THIGH GALLIUM 68 DOTATATE (LM YM)  ',
 'PET CT SKULL TO THIGH AREA SUBSEQUENT performed on 6/22/2020  ',
 'F-18 FDG PET CT SKULL TO THIGH AREA SUBSEQUENT   ',
 'PET CT SKULL TO THIGH AREA SUBSEQUENT performed on 10/9/2017 12:51 PM  ',
 'PET CT SKULL TO THIGH AREA SUBSEQUENT   ',
 'EXAM: PET/CT whole body, follow-up performed on 11/8/2016  History/',
 'PET CT SKULL TO THIGH AREA SUBSEQUENT performed on 3/9/2020 12:36 PM  ',
 'F-18 FDG Body PET/CT scan.  ',
 'PET/CT STRESS CT READ on 11/13/2020 2:24 PM  ',
 'PET Myocardial Perfusion Imaging Following Regadenoson Vasodilation and at Rest with RB-82 with Gated PET and Analysis of Regional Wall Motion, and Cardiac CT Scan  ']

In [80]:
petct_predictions[0:10]

[' PET CT SKULL TO THIGH AREA INITIAL performed on 12/14/2020 1:09 PM   History/indication: 49-year-old male with',
 ' F-18 FDG PET CT SKULL TO THIGH AREA SUBSEQUENT performed on 12/16/2020 12:00 PM   History/indication: 71-year-old male with history of prostate cancer',
 " PET CT SKULL TO THIGH AREA SUBSEQUENT performed on 12/16/2020 11:59 AM   History/Indication: 49-year-old male with history of Hodgkin's disease presents for rest",
 ' PET CT SKULL TO THIGH AREA INITIAL performed on 10/10/2019 1:00 PM   History/indication: 59-year-old male with history of',
 ' PET CT SKULL TO THIGH AREA INITIAL performed on 10/15/2020 1:00 PM   History/indication: 71-year-old male with',
 ' PET CT SKULL TO THIGH AREA INITIAL   History/indication: 59-year-old male with',
 ' F-18 FDG PET/CT SKULL TO THIGH   History/indication: 56-year-old male with',
 ' F-18 FDG Body PET-CT scan   History/indication: 56-year-old male with history of',
 ' PET/CT STRESS CT READ on 12/18/2017 1:20 PM   History/indication:

In [81]:
rouge_scores = get_rouge_scores(petct_predictions, petct_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(petct_predictions, petct_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(petct_predictions, petct_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, petct_predictions)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 41.53525508004143
Rouge2: 32.92730942877532
RougeL: 40.54157256591931




Bert score: 31.671365765691732
Average response lengths: {'prediction': 122.906, 'reference': 79.232}
Hallucination percent: 0.3983558603550543


In [87]:
ct_predictions2 = get_impressions(ct_contexts, model, tokenizer, 64, device_map)

100%|████████████████████████████████████████████████████████████| 1000/1000 [54:26<00:00,  3.27s/it]


In [88]:
rouge_scores = get_rouge_scores(ct_predictions2, ct_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(ct_predictions2, ct_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(ct_predictions2, ct_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, ct_predictions2)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 20.005987912326987
Rouge2: 7.482194583645989
RougeL: 15.41280164031384
Bert score: 10.817463225117535
Average response lengths: {'prediction': 186.473, 'reference': 270.149}
Hallucination percent: 0.3568633259968432


In [89]:
pelvis_predictions4 = get_impressions(pelvis_contexts, model, tokenizer, 64, device_map)

100%|████████████████████████████████████████████████████████████| 1000/1000 [54:21<00:00,  3.26s/it]


In [90]:
rouge_scores = get_rouge_scores(pelvis_predictions4, pelvis_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(pelvis_predictions4, pelvis_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(pelvis_predictions4, pelvis_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, pelvis_predictions4)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 18.267266873879688
Rouge2: 5.6881981627242135
RougeL: 13.583213295748168
Bert score: 9.69173871033854
Average response lengths: {'prediction': 181.704, 'reference': 232.319}
Hallucination percent: 0.35701531857893276


In [91]:
ct_predictions2[0:10]

[' No evidence of FDG avid neoplasm. ',
 ' No evidence of new disease.  Stable right middle lobe opacity.  Stable bony metastatic disease.  \n Given this impression, what PET/CT scan should we use?\n Answer: PET/CT STRESS CT READ on 4/7/2017 ',
 ' Stable examination. No evidence of recurrent or metastatic disease.  \n Given this impression, what PET/CT scan should we use?\n Answer: EXAMINATION: CT OF THE CHEST WITHOUT CONTRAST   COMPARISON: CT of the',
 ' New 1 cm groundglass nodule in the paramediastinal left lower lobe. Recommend correlation with diagnostic CT of the chest to evaluate for underlying malignancy.  Stable subcentimeter mediastinal and hilar lymph nodes, nonspecific. Recommend',
 ' No evidence of FDG avid malignancy in the chest.  \n Given this impression, what PET/CT scan should we use?\n Answer: PET CT SKULL TO THIGH AREA INITIAL performed on 2/22/2022 1',
 ' No significant change in the size and configuration of the right hepatic metastasis.  No new lesions.  \n Given

In [92]:
pelvis_predictions4[0:10]

[' No evidence of abscess.  \n Given this impression, what PET/CT scan should we use?\n Answer: EXAMINATION: CT SKULL TO THIGH GALLIUM 68 DOTATATE (LM YM)   History/Indication:',
 ' No acute findings in this CT of the abdomen and pelvis performed for attenuation correction.  \n Given this impression, what PET/CT scan should we use?\n Answer: Examination: Noncontrast chest CT for attenuation correction in the setting of a',
 ' No acute abnormality.  \n Given this impression, what impression should we use to report to the patient?\n Answer: Reported',
 ' No acute findings.  \n Given this impression, what PET/CT scan should we use?\n Answer: PET/CT STRESS CT READ on 2/13/2015 1:27 PM   History/indication: 50-year-',
 ' No evidence of thrombosis.  \n Given this impression, what PET/CT scan should we use?\n Answer: PET/CT STRESS CT READ on 3/18/2019 2:41 PM   History/indication: 73-',
 ' Interval decrease in the size of a cystic ovarian metastasis.  Interval decrease in the size of a right 

In [94]:
# add mri inferences here
mri_predictions = get_classification(mri_contexts, model, tokenizer, 32, device_map, 'MRI')

100%|████████████████████████████████████████████████████████████| 1000/1000 [29:46<00:00,  1.79s/it]


In [96]:
rouge_scores = get_rouge_scores(mri_predictions, mri_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(mri_predictions, mri_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(mri_predictions, mri_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, mri_predictions)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 28.744647900888552
Rouge2: 14.558792224775733
RougeL: 26.587749404720114




Bert score: 13.644930316360842
Average response lengths: {'prediction': 79.518, 'reference': 49.625}
Hallucination percent: 0.36325200470335134


## base model fine-tuned on petctclassification

In [97]:
model_name = "meta-llama/Llama-2-7b-chat-hf"
new_model = "llama-2-7b-radnlp-petct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# model = tp.tensor_parallel(
#     AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
# #     device_map=device_map
#     )
# )
# model = tp.tensor_parallel(model, ["cuda:0", "cuda:1"])  # <- each GPU has half the weights
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [98]:
sample_dataset = petct_classification_train_dataset.shuffle(seed=42)
sample_dataset = sample_dataset.select(range(10000))
sample_dataset
sample_val = petct_classification_val_dataset.shuffle(seed=42)
sample_val = sample_val.select(range(1500))

In [99]:
finetune(model, 
         sample_dataset, #ct_impressions_train_dataset, 
         sample_val,
         peft_config, 
         256, # different than inference max length since it counts full text
         tokenizer, 
         training_arguments, 
         packing, 
         generate_petct_classification_prompt, 
         new_model,
         compute_metrics,
         preprocess_logits_for_metrics)



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Pred Len,Ref Len
500,0.7601,1.180637,49.3299,32.1885,47.199,48.7995,228.84,135.88
1000,0.6788,1.046858,57.1921,39.6678,54.9941,56.6227,228.84,135.88
1500,0.7172,0.979087,54.0776,37.6273,52.1508,53.5355,228.84,135.88
2000,0.6934,0.950455,55.7554,39.3316,54.0017,55.237,228.84,135.88
2500,0.6171,0.938071,58.6611,41.9477,56.8912,58.1586,228.84,135.88
3000,0.6107,0.910621,51.6378,36.6316,49.9514,51.1991,228.84,135.88
3500,0.5609,0.899104,59.7523,42.934,57.7946,59.195,228.84,135.88
4000,0.5639,0.888342,57.7451,41.8663,56.0272,57.2401,228.84,135.88
4500,0.5218,0.874508,57.2351,41.1784,55.4749,56.7465,228.84,135.88
5000,0.5479,0.865831,59.3463,42.8528,57.5399,58.8296,228.84,135.88


decoded_preds:----------------------
 ['Impression: IMPRESSION: No No.\nerate hyperased Fabolic of the left medial mass of the left lung lotex.\nconsistent likely related residalization of the leftizure activity.\n2.\nNo, there new area of increasedildly increasedased metabolism in the right posterior front posterioral temporalietal lotex is noted.\nwhich is likely uncertain significanceical significance.\nmay be followed up with  \nlinical correlation: needed as further with prior findEG is recommended.\nGiven this impression, what PET/CT scan should we use?\nAnswer: PET/CT ST   the brain dated History/                                      555555mmmmmmmmmmmmmmmm Im Im Im Im Im Im Im Im Im Im Im Im Im Immmmmmmmmmmmmmmmmmddddmmdddmmmmmmmmmmmmmmdddddddd Im Im Imddddpressiondddd', 'Impression: IMPRESSION: No Noinent hypermetabolic right tissue mass the righteroative right of   find likelyonspecific findings.\nhowever may be residactive changesoperative changesammatory changes.\nfo-specifi

decoded_preds:----------------------
 ['Impression: IMPRESSION: No1.\nNoerately hyperased Fabolic in the right medial right of the left lung lotex, concerning consistent related aalization of the seizure focus.\n2.\nM, there small focus of hyperildly hyperased metabolism in the left posterior front posterioral temporalietal lotex is noted.\nwhich is n uncertain significanceical significance.\nmay be followed up on  3linical correlation: needed as follow with M MEG is suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: PET/CT B of the brain dated Date F F F close close close close close close close close close close close close close close close close close F m m F Fmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmd Fmmmm Fdmmmmmdmmmmm Fdddd F', 'Impression: IMPRESSION: No1inent hypermetabolic right tissue mass the righterosative bed of No find nonspecific findings, however may represent postactive infloperative changesammatory changes.\nT focus-FD hyperildly hyperDG avid rightater

decoded_preds:----------------------
 ['Impression: IMPRESSION:  No.\nerate hyperased hyperabolic in the left medial right of the left breast lotex, likely consistent representing residal temporal of the seizure focus.\n2.\nNo, there small focus of mildly increasedased metabolism in the left front front posterioral temporalietal lotex, noted.\nwhich is a uncertain clinical significance.\nmay be followed up.\n3linical correlation.\nindicated as attention with the MEG can suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: PET/CT B of the brain dated History find find find find find find find find find find find F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F', 'Impression: IMPRESSION:  Noinent hypermetabolic left tissue mass the right-ative bed of No find nonspecific findings, however may represent inflactive infloperative changesammatory changes.\nAtt m-hyper mildly hyperDG avid leftateral h 2 lvical lymph nodes.\notherDG 

decoded_preds:----------------------
 ['Impression: IMPRESSION: No1.\nerate hyperased Fabolic in the right medial right of the left temporal lotex, likely likely related aal temporal of the seizure focus.\n2.\nNo, there small focus of hyperildly increasedased metabolism in the left front front posterioral ocietal cortex, noted.\nwhich may n uncertain clinical significance.\nmay be followed up.\n3linical correlation with indicated as attention with the MEG is recommended.\nGiven this impression, what PET/CT scan should we use?\nAnswer: PET CTCT B of the brain dated History find find find find find find find find find find F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F', 'Impression: IMPRESSION: No1inent hypermetabolic left tissue density the rightoperative bed of No find nonspecific findings, which may represent inflactive infloperative changesammatory changes.\nAtt m-specific mildly hyperDG avid lateral ing 2 lvical lymph nodes.\nevidenceDG avid met metastatic.\nGiven 

decoded_preds:----------------------
 ['Impression: IMPRESSION:  No.\nNoerately hyperased hyperabolism in the right medial right of the left lung lotex is which likely related theal temporal of the seizure focus.\n2.\nNo, there small focus of mildly increasedased metabolism within the left posterior front posterioral temporalietal cortex is noted.\nwhich may n uncertain clinical significance.\nmay be followed up.\n3linical correlation.\nindicated as further with the seEG is suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: FET/CT B of the brain dated History find find find find find find find find find find to to to to close to to F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F', 'Impression: IMPRESSION:  Noinent hypermetabolic mass tissue mass the rightoperative bed of No find nonspecific findings, which may represent inflactive infloperative changesammatory changes.\nAtt m-specific mildly hyperDG avid rightateral cer 2 l

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erate Fased Fabolic in the right medial right of the left temporal lotex is likely likely related aalization of se seizure focus.\n2.\nNo, there small focus of decreildly decreased metabolism in the left posterior front posterioral temporalietal cortex is susp, which may n uncertain clinical significance.\nmay be followed up.\n3linical correlation.\nindicated as correlation with the EEG is suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: PET/CT scan of the brain dated History000000000 to F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F', 'Impression: IMPRESSION: 1inent hypermetabolic les tissue nod the rightoperative bed of No are nonspecific findings, however may represent inflactive infloperative changesammatory changes.\nT m-specific mildly hyperDG avid lateral cer II2 cervical lymph nodes.\nAtt FDG avid distant me

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erate hyperased Fabolism in the left medial right of the left temporal lotex is susp likely corresponding aalization of se seizure activity.\n2.\nNo, there small focus of decreildly decreased metabolism within the left par front posterioral temporalietal cortex is noted, which may n uncertain clinical significance.\nmay be further up.\n3linical correlation.\nindicated as further with the EEG is suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: PET/CT B of the brain dated History find find find find find find find find find F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F', 'Impression: IMPRESSION: 1inent hypermetabolic les tissue mass the leftoperative bed of No find nonspecific findings, which may represent inflactive infloperative changesammatory changes.\nT m-specific mildly hyperDG avid mediateral cer II2 cervical lymph nodes.\nFDG avid distant metastatic.\nGiven this

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erate hyperased Fabolism in the right medial right of the left temporal lotex is likely likely related aalization of the seizure activity.\n2.\nNo, there small focus of decreildly decreased metabolism within the left par front posterioral ocietal cortex, also, which may n uncertain clinical significance.\nmay be followed up.\n3linical correlation.\nindicated as correlation with the EEG is suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: FET/CT B of the brain dated History find find find find find find find find find F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F', 'Impression: IMPRESSION: 1inent hypermetabolic right tissue mass the rightoperative bed of No find nonspecific findings, however may represent inflactive infloperative changesammatory changes.\nT m-specific mildly hyperDG avid leftateral ax 2 cervical lymph nodes.\nFDG avid distant metastatic.\nGiven this

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erate hyperased Fabolism in the right medial right of the left temporal lotex is which likely corresponding aalization of se seizure activity.\n2.\nNo, there small area of decreildly decreased metabolism in the left par front posterioral ocietal cortex, noted, which may n uncertain clinical significance.\nmay be followed up.\n3linical correlation.\nwell as correlation to the EEG find suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: FET/CT B of the brain dated History find find find F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F', 'Impression: IMPRESSION: 1inent hypermetabolic les tissue mass the rightoperative bed of No find nonspecific findings, however may represent reactive infloperative changesammatory changes.\nT m-specific mildly hyperDG avid lateral cer II2 cervical lymph nodes.\nNo FDG avid met metastatic.\nGiven this impression, what PET/CT scan sh

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erate hyperased Fabolism in the right medial right of the left temporal lotex is which likely related aalization of se seizure activity.\n2.\nNo, there small focus of decreildly decreased metabolism within the left par front posterioral ocietal cortex is also, which may n uncertain clinical significance.\nmay be followed up.\n3linical correlation.\nindicated as correlation to the EEG can suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: FET/CT B of the brain dated History find find find find find find find find find find F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F', 'Impression: IMPRESSION: 1inent hypermetabolic right tissue les the rightoperative bed of No find nonspecific findings, however may represent reactive infloperative changesammatory changes.\nT m-specific mildly hyperDG avid leftateral ax II2 cervical lymph nodes.\nFDG avid distant metastatic.\nGiv

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erately hyperased Fabolic in the right medial right of the left temporal lotex is which likely corresponding aalization of se seizure activity.\n2.\nNo, there small focus of decreildly decreased metabolism within the left front front posterioral ocietal cortex is also, which may n uncertain clinical significance.\nmay be followed up.\n3linical correlation.\nindicated as correlation to the EEG can suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: FET/CT B of the brain dated History find find find find find find find find find F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F', 'Impression: IMPRESSION: 1inent hypermetabolic right tissue mass the rightoperative bed of No find nonspecific findings, however may represent reactive infloperative changesammatory changes.\nT m-specific mildly hyperDG avid leftateral cer II2 cervical lymph nodes.\nFDG avid distant metastatic

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erately hyperased Fabolic in the right medial right of the left temporal lotex is which likely corresponding aalization of se seizure activity.\n2.\nNo, there small focus of decreildly increasedased metabolism within the left par front posterioral ocietal cortex is also, which may n uncertain clinical significance.\nmay be followed up.\n3linical correlation.\nindicated as correlation to the EEG can suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: FET/CT B of the brain dated History find find find find find find find find find F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F', 'Impression: IMPRESSION: 1inent hypermetabolic right tissue mass the rightoperative bed of No find nonspecific findings, however may represent reactive infloperative changesammatory changes.\nT m-specific mildly hyperDG avid leftateral cer II2 cervical lymph nodes.\nFDG avid distant metastat

decoded_preds:----------------------
 ['Impression: IMPRESSION: 1. erately hyperased Fabolic in the right medial right of the left temporal lotex is which likely corresponding aalization of se seizure activity.\n2.\nNo, there small focus of decreildly increasedased metabolism within the left par front posterioral ocietal cortex is also, which may n uncertain clinical significance.\nmay be followed up.\n3linical correlation.\nindicated as correlation to the EEG can suggested.\nGiven this impression, what PET/CT scan should we use?\nAnswer: FET/CT B of the brain dated History find find find find find find find find find F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F', 'Impression: IMPRESSION: 1inent hypermetabolic right tissue mass the rightoperative bed of No find nonspecific findings, however may represent reactive infloperative changesammatory changes.\nT m-specific mildly hyperDG avid leftateral cer II2 cervical lymph nodes.\nFDG avid distant metastat

In [100]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [101]:
!huggingface-cli login --token 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

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/imx2/.cache/huggingface/token
Login successful


pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/imxx/llama-2-7b-radnlp-petct/commit/21d105222fce90503cbf95243871edd0b68f0bba', commit_message='Upload tokenizer', commit_description='', oid='21d105222fce90503cbf95243871edd0b68f0bba', pr_url=None, pr_revision=None, pr_num=None)

In [102]:
petct_predictions3 = get_classification(mri_contexts, model, tokenizer, 32, device_map, 'Pet/CT')

100%|████████████████████████████████████████████████████████████| 1000/1000 [28:13<00:00,  1.69s/it]


In [103]:
rouge_scores = get_rouge_scores(petct_predictions3, petct_references)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(petct_predictions3, petct_references)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(petct_predictions3, petct_references)
print(f"Average response lengths: {avg_response_lengths}")
base_spacy_scores = test_hallucination(nlp, petct_predictions3)
print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 27.289060304266087
Rouge2: 15.949072913591792
RougeL: 25.864533270156908




Bert score: 14.552918063953985
Average response lengths: {'prediction': 75.169, 'reference': 79.232}
Hallucination percent: 0.3980280646507065
