# Config

In [1]:
import pandas as pd
import numpy as np
import os
import torch
import re
from datasets import load_dataset, Dataset, load_metric
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    LogitsProcessor,
    LogitsProcessorList,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
import nltk
import warnings
import spacy
import math
import xformers
import tensor_parallel as tp
from tqdm import tqdm
import evaluate
nltk.download('punkt')

  _C._set_default_tensor_type(t)
[nltk_data] Downloading package punkt to /home/imx2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Set up model and parameters

In [2]:
model_name = "meta-llama/Llama-2-7b-chat-hf"
# we save the model under this name
new_model = "llama-2-7b-radnlpv2"

In [35]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.25

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 3

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

# Evaluate model on validation set every X update steps
evaluation_strategy='steps'
eval_steps=500

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = 200

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# batch size
dataset_batch_size = 32

# Load the entire model on the GPU 0
# device_map = {'':torch.cuda.current_device()}
# CUDA_VISIBLE_DEVICES=0,1,2,3, try multiple devices
device_map = 'auto' #{'':torch.cuda.current_device()}

In [36]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [37]:
# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

Your GPU supports bfloat16: accelerate training with bf16=True


In [38]:
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# model = tp.tensor_parallel(
#     AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
# #     device_map=device_map
#     )
# )
# model = tp.tensor_parallel(model, ["cuda:0", "cuda:1"])  # <- each GPU has half the weights
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [39]:
# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

In [40]:
# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

In [41]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    evaluation_strategy=evaluation_strategy,
    eval_steps=eval_steps,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

In [10]:
BERTSCORE_MODEL_TYPE = "microsoft/deberta-xlarge-mnli"

In [11]:
nlp = spacy.load("en_core_sci_lg")

# Fine-tune Set Up
Function where inputs are different hyperparameters that can be used.

In [12]:
class EosTokenRewardLogitProcess(LogitsProcessor):
    # class to get the model to generate EOS token more often as sentence nears max_length
    def __init__(self, eos_token_id: int, max_length: int):
        if not isinstance(eos_token_id, int) or eos_token_id < 0:
            raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}")

        if not isinstance(max_length, int) or max_length < 1:
            raise ValueError(f"`max_length` has to be a integer bigger than 1, but is {max_length}")

        self.eos_token_id = eos_token_id
        self.max_length=max_length

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        cur_len = input_ids.shape[-1]
        # start to increese the reward of the  eos_tokekn from 70% max length  progressively on length
        for cur_len in (max(0,int(self.max_length*0.7)), self.max_length ):
            ratio = cur_len/self.max_length
            num_tokens = scores.shape[1] # size of vocab
            scores[:, [i for i in range(num_tokens) if i != self.eos_token_id]] =\
            scores[:, [i for i in range(num_tokens) if i != self.eos_token_id]]*ratio*10*torch.exp(-torch.sign(scores[:, [i for i in range(num_tokens) if i != self.eos_token_id]]))
            scores[:, self.eos_token_id] = 1.1e2 * ratio
        return scores

In [13]:
tokenizer.eos_token_id

2

In [14]:
def finetune(model, train_dataset, eval_dataset, peft_config, max_seq_length, tokenizer, training_arguments, packing, formatting_func, new_model, compute_metrics, preprocess_logits_for_metrics):
    # Set supervised fine-tuning parameters
    # add validation set to model
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=peft_config,
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        args=training_arguments,
        packing=packing,
        formatting_func=formatting_func,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )

    # Train model
    trainer.train()

    # Save trained model
    trainer.model.save_pretrained(new_model)

# Evaluate Model
Functions to evaluate the model

In [15]:
def get_rouge_scores(predictions, references):
    '''
    predictions: list of model predictions
    references: corresponding list of test summaries
    
    returns: dictionary of rouge scores
    '''
    rouge = load_metric("rouge")

    # process text to make it compatible with rouge
    predictions = [" ".join(pred.strip().split()) for pred in predictions]
    references = [" ".join(ref.strip().split()) for ref in references]
    predictions = ["\n".join(nltk.sent_tokenize(pred)) for pred in predictions]
    references = ["\n".join(nltk.sent_tokenize(ref)) for ref in references]

    # compute rouge scores
    results = rouge.compute(
        predictions=predictions,
        references=references,
        use_stemmer=True,
        use_aggregator=False,
    )
    for key, value in results.items():
        results[key] = {
            "precision": [score.precision * 100 for score in value],
            "recall": [score.recall * 100 for score in value],
            "fmeasure": [score.fmeasure * 100 for score in value],
            "fmeasure_mean": np.mean([score.fmeasure for score in value]) * 100,
        }
    # Compute the arithmetic mean of ROUGE-1, ROUGE-2 and ROUGE-L following: https://arxiv.org/abs/2110.08499
    if all(rouge_type in results for rouge_type in ["rouge1", "rouge2", "rougeL"]):
        results["rouge_avg_fmeasure"] = np.mean(
            [results[key]["fmeasure"] for key in ["rouge1", "rouge2", "rougeL"]], axis=0
        ).tolist()
        results["rouge_avg_fmeasure_mean"] = np.mean(results["rouge_avg_fmeasure"]).item()
    else:
        warnings.warn(
            "ROUGE-1, ROUGE-2 and ROUGE-L are not all present in the results. Skipping the computation of ROUGE-AVG."
        )

    return results
    pass

In [16]:
def get_bertscore(predictions, references, device):
    '''
    predictions: list of model predictions
    references: corresponding list of test summaries
    
    returns: dictionary of bert scores
    '''
    bertscore = load_metric("bertscore")

    predictions = [" ".join(pred.strip().split()) for pred in predictions]
    references = [" ".join(ref.strip().split()) for ref in references]
    predictions = ["\n".join(nltk.sent_tokenize(pred)) for pred in predictions]
    references = ["\n".join(nltk.sent_tokenize(ref)) for ref in references]

    # Compute and post-process bertscore results
    results = bertscore.compute(
        predictions=predictions,
        references=references,
        # These are mostly based on the recommendations in https://github.com/Tiiiger/bert_score
        model_type=BERTSCORE_MODEL_TYPE,
        lang="en",
        rescale_with_baseline=True,
        use_fast_tokenizer=True,
        device = device
    )
    results["f1_mean"] = np.mean(results["f1"])
    for key, value in results.items():
        if key == "hashcode":
            continue
        if isinstance(value, list):
            results[key] = [score * 100 for score in value]
        else:
            results[key] = value * 100

    return results

In [17]:
def compare_lengths(predictions, references):
    pred_length = sum(len(pred) for pred in predictions)/len(predictions)
    ref_length = sum(len(ref) for ref in references)/len(predictions)
    return {'prediction': pred_length, 'reference': ref_length}

In [18]:
def test_hallucination(nlp, predictions):
    # returns percent of entities in generated impression not found in findings
    unknown_words = {}
    unknown_pcts = {}
    total_unknown_pcts = 0
    for i,pred in enumerate(predictions):
        doc = nlp(pred)
        unknown_words[i] = doc.ents
        unknown_pct = len(doc.ents)/(len(set(pred.split()))+.0000000001)
        unknown_pcts[i] = unknown_pct
        total_unknown_pcts+= unknown_pct
    avg_unknown_pct = total_unknown_pcts/(len(predictions)+.0000000001)
    return unknown_words, unknown_pcts, avg_unknown_pct

In [19]:
metric = evaluate.load("rouge")

def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # Replace -100s used for padding as we can't decode them
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    # Otherwise rouge scores were a bit inflated
    decoded_pred_radqas = [s.split('Answer:', 1)[-1].strip() if 'Answer:' in s else s for s in decoded_preds]
    decoded_label_radqas = [s.split('Answer:', 1)[-1].strip() if 'Answer:' in s else s for s in decoded_labels]
    print("decoded_preds:----------------------\n", decoded_pred_radqas[0:5])
    print("decoded_labels:---------------------\n", decoded_label_radqas[0:5])

    result = metric.compute(predictions=decoded_pred_radqas, references=decoded_label_radqas, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["pred_len"] = np.mean(prediction_lens)
    reference_lens = [np.count_nonzero(label != tokenizer.pad_token_id) for label in labels]
    result['ref_len'] = np.mean(reference_lens)
    return result

# RadQA

## Data cleaning

In [20]:
radqa_train_df = pd.read_json('radqa/train.json')
radqa_val_df = pd.read_json('radqa/dev.json')
radqa_test_df = pd.read_json('radqa/test.json')
radqa_train_df

Unnamed: 0,data,version
0,{'paragraphs': [{'qas': [{'question': 'Is ther...,full
1,{'paragraphs': [{'qas': [{'question': 'Does th...,full
2,{'paragraphs': [{'qas': [{'question': 'Is the ...,full
3,{'paragraphs': [{'qas': [{'question': 'Is an a...,full
4,{'paragraphs': [{'qas': [{'question': 'Is ther...,full
...,...,...
798,{'paragraphs': [{'qas': [{'question': 'Is ther...,full
799,{'paragraphs': [{'qas': [{'question': 'Is ther...,full
800,{'paragraphs': [{'qas': [{'question': 'Are the...,full
801,{'paragraphs': [{'qas': [{'question': 'Was the...,full


In [21]:
radqa_train_df['data'][0]

{'paragraphs': [{'qas': [{'question': 'Is there any significant change in bleeding?',
     'id': '796653_2_1_I',
     'answers': [],
     'is_impossible': True},
    {'question': 'Did the bleeding in the sub-dural space resolve?',
     'id': '796653_1_2_I',
     'answers': [{'answer_id': '796653_1_2_I_MG',
       'text': 'Subdural hematomas with blood products of different ages',
       'answer_start': 13}],
     'is_impossible': False},
    {'question': 'Is there any additional bleeding in the sub-arachanoid space?',
     'id': '796653_1_1_I',
     'answers': [],
     'is_impossible': True}],
   'context': 'IMPRESSION:  Subdural hematomas with blood products of different ages.\n Question vescular abnormality in left suprasellar space.  Findings were\n discussed with Dr. [**Last Name (STitle) 8620**] at 9:25 am on [**2191-8-5**].  An MRI of the brain and MRA\n of the COW is recommended.',
   'document_id': '796653_I'},
  {'qas': [{'question': 'Is there any additional bleeding in the su

In [22]:
radqa_train_df['version'].unique()

array(['full'], dtype=object)

In [23]:
def clean_radqa_df(df):
    data_column = df['data']
    normalized_data = pd.json_normalize(data_column, 'paragraphs', ['title'])
    df_expanded = normalized_data.explode('qas').reset_index(drop=True)
    df_expanded_qas = pd.json_normalize(df_expanded['qas']).add_prefix('qas.')
    result_df = pd.concat([df_expanded, df_expanded_qas], axis=1)
    result_df['answer_text'] = result_df['qas.answers'].apply(lambda x: x[0]['text'] if x else None)
    result_df['answer_start'] = result_df['qas.answers'].apply(lambda x: str(x[0]['answer_start']) if x else 'Not in context.')
    result_df = result_df.drop(columns=['qas', 'qas.answers'])
    result_df['qas.answer'] = np.where(result_df['answer_text'],  result_df['answer_text'], 'Not in context.')
#     result_df['qas.adjusted_answers'] = np.where(result_df['qas.is_impossible'] == True, 'Is impossible.', result_df['qas.answers'])
    return result_df[['context', 'qas.question', 'qas.answer', 'answer_start']]

In [24]:
radqa_clean_train_df = clean_radqa_df(radqa_train_df)
radqa_clean_val_df = clean_radqa_df(radqa_val_df)
radqa_clean_test_df = clean_radqa_df(radqa_test_df)
radqa_clean_train_df.head()

Unnamed: 0,context,qas.question,qas.answer,answer_start
0,IMPRESSION: Subdural hematomas with blood pro...,Is there any significant change in bleeding?,Not in context.,Not in context.
1,IMPRESSION: Subdural hematomas with blood pro...,Did the bleeding in the sub-dural space resolve?,Subdural hematomas with blood products of diff...,13
2,IMPRESSION: Subdural hematomas with blood pro...,Is there any additional bleeding in the sub-ar...,Not in context.,Not in context.
3,WET READ: MES FRI [**2191-8-5**] 1:40 AM\n no...,Is there any additional bleeding in the sub-ar...,Not in context.,Not in context.
4,WET READ: MES FRI [**2191-8-5**] 1:40 AM\n no...,Did the bleeding in the sub-dural space resolve?,mixed density subdural hematomas seen along bo...,757


In [25]:
radqa_clean_train_df['context'][3]

'WET READ: MES FRI [**2191-8-5**] 1:40 AM\n  no significant change in hemorrhage\n ______________________________________________________________________________\n                                 FINAL REPORT\n INDICATION: known subarachnoid subdural hemorrhage from outside hospital.\n Evaluate for any change.\n\n TECHNIQUE: Noncontrast head CT.\n\n COMPARISON: (CT done several hours earlier at [**Hospital 539**] Hospital).  At the time\n of attending review, the prior exam is not available for comparison.\n\n FINDINGS: There has been no significant change in the interval. There is an\n area of hyperdensity along the left anterior clinoid and in the adjacent\n suprasellar space, which may be an aneurysm or small collection of blood, or a\n dense mass.\n There are mixed density subdural hematomas seen along both cerebral\n convexities, slightly larger on the left (approx 8-9mm) than on the right.\n There is acute blood in the dependent parts of the subdural collections. There\n is flatt

In [26]:
radqa_clean_train_df['qas.question'][3]

'Is there any additional bleeding in the sub-arachanoid space?'

In [27]:
radqa_clean_train_df['qas.answer'][3]

'Not in context.'

In [28]:
radqa_train_dataset = Dataset.from_pandas(radqa_clean_train_df)
radqa_val_dataset = Dataset.from_pandas(radqa_clean_val_df)
radqa_test_dataset = Dataset.from_pandas(radqa_clean_test_df)

In [29]:
radqa_train_dataset

Dataset({
    features: ['context', 'qas.question', 'qas.answer', 'answer_start'],
    num_rows: 4878
})

## Prompt and inference set up 

In [30]:
def generate_radqa_prompt(example):
    """
    params:
    example: dataset with columns context, question, and answer
    Ask model to return a sentence from the context that contains the answer
    
    returns:
    list of prompts for each context-question-answer trio
    """
    output_texts = []
    for i in range(len(example['context'])):
        text = f"Context: {example['context'][i]}\nQuestion: {example['qas.question'][i]}\nWhat text from the context answers this question? If none, answer Not in context. Answer: {example['qas.answer'][i]}"
        output_texts.append(text)
    return output_texts

In [31]:
def get_radqas(contexts, questions, model, tokenizer, max_response_length):
    answers = []
    for i, c in enumerate(tqdm(contexts)):
#         prompt = f"Context: {c}\nQuestion: {questions[i]}\nReturn 'Not in Context' if the answer is not in the context. Answer: "
#         prompt = f"Context: {c}\nQuestion: {questions[i]}\nWhat word in the context does the answer start? Answer Start Position: "
        prompt = f"Context: {c}\nQuestion: {questions[i]}\nWhat text from the context answers this question? If none, answer Not in context. Answer: "
#         max_length = len(prompt) + max_response_length
        logits_process_list= LogitsProcessorList([EosTokenRewardLogitProcess(eos_token_id=tokenizer.eos_token_id, max_length=max_response_length)])
        # add some postprocessor
        pipe = pipeline(
            task="text-generation", 
            model=model, 
            tokenizer=tokenizer, 
            logits_processor=logits_process_list, 
            max_new_tokens=max_response_length, 
            return_full_text=False, 
            temperature=.1)
        result = pipe(prompt)
        answers.append(result[0]['generated_text'])
    return answers

# Inference on base model

In [49]:
contexts = radqa_train_dataset['context']
questions = radqa_train_dataset['qas.question']
answers = radqa_train_dataset['qas.answer']
answer_start = radqa_train_dataset['answer_start']

In [65]:
predictions = get_radqas(contexts[:5], questions[:5], model, tokenizer, 32)
predictions

100%|██████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.23s/it]


[' No, there is no significant change in bleeding. The subdural hematoma is still present, and the blood products are of different ages.\n',
 ' No.\n',
 ' No.\n',
 ' No.\n',
 ' Not in Context.\n']

In [60]:
answers[:5]

['Not in context.',
 'Subdural hematomas with blood products of different ages',
 'Not in context.',
 'Not in context.',
 'mixed density subdural hematomas seen along both cerebral\n convexities, slightly larger on the left (approx 8-9mm) than on the right.\n There is acute blood in the dependent parts of the subdural collections']

In [None]:
predictions = get_radqas(contexts, questions, model, tokenizer, 30)
predictions

In [None]:
answers

In [42]:
import json

file_path = "radqa_predictions.json"

# Open the file in read mode and use json.load to read the list from the file
with open(file_path, "r") as file:
    predictions = json.load(file)


In [43]:
sum([len(answers[i]) for i in range(len(answers))])/len(answers)

84.44485444854449

In [44]:
rouge_scores = get_rouge_scores(predictions, answers)
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(predictions, answers)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(predictions, answers)
print(f"Average response lengths: {avg_response_lengths}")
# base_spacy_scores = test_hallucination(nlp, predictions)
# print(f"Hallucination percent: {base_spacy_scores[2]}")

  rouge = load_metric("rouge")


Rouge1: 19.492784398961184
Rouge2: 8.87716980178776
RougeL: 17.73556326141762




Bert score: 7.582608859104075
Average response lengths: {'prediction': 79.12136121361213, 'reference': 84.44485444854449}


In [None]:
predictions

In [54]:
from sklearn.metrics import f1_score
f1 = f1_score(answers, predictions, average='weighted')

# Print the F1 score
print(f'F1 Score: {f1}')

F1 Score: 0.0


## Attempt to get answer start position

Oops F1 score of 0 makes sense because it's not meant for text to text models. We will adjust the prompt.

In [86]:
predictions = get_radqas(contexts[:5], questions[:5], model, tokenizer, 32)
predictions

100%|██████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.18it/s]


['10\n', '10\n\n', '10\n', '2191-8-5\n', '2191-8-5\n']

In [82]:
answer_start[:5]

['Not in context.', '13', 'Not in context.', 'Not in context.', '757']

## Attempt to get answer form context - promising compromise

In [111]:
predictions = get_radqas(contexts[:500], questions[:500], model, tokenizer, 32)
predictions[:10]

100%|██████████████████████████████████████████████████████████████| 500/500 [22:49<00:00,  2.74s/it]


[' Yes, there is a significant change in bleeding.\n',
 ' Not in context. The text does not provide any information about the resolution of the bleeding in the sub-dural space.\n\nAnswer: Not in',
 ' Yes, there is additional bleeding in the sub-arachnoid space.\n\n',
 ' Not in context.\n\n',
 ' Not in context. There is no text in the provided context that directly answers the question of whether the bleeding in the sub-dural space resolved. The',
 ' Not in context. There is no text in the provided context that directly answers the question about significant change in bleeding. The report only mentions the initial finding of',
 '2) There is contrast material within the kidneys; this may represent ATN or continued renal excretion of orally administered contrast.',
 '2) There is contrast material within the kidneys; this may represent ATN or continued renal excretion of orally administered contrast.',
 '2) There is contrast material within the kidneys; this may represent ATN or continued 

In [112]:
answers[:10]

['Not in context.',
 'Subdural hematomas with blood products of different ages',
 'Not in context.',
 'Not in context.',
 'mixed density subdural hematomas seen along both cerebral\n convexities, slightly larger on the left (approx 8-9mm) than on the right.\n There is acute blood in the dependent parts of the subdural collections',
 'no significant change',
 'obstruction at the area of ileal anastomosis',
 'Not in context.',
 'contrast material within the kidneys; this may represent ATN or\n continued renal excretion of orally administered contrast',
 'high attenuation  within the kidneys bilaterally, consistent with\n either ATN or related to enteric oral contrast absorption and excretion\n continually by the kidneys due to obstruction']

In [117]:
rouge_scores = get_rouge_scores(predictions, answers[:500])
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(predictions, answers[:500], 7)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(predictions, answers[:500])
print(f"Average response lengths: {avg_response_lengths}")

Rouge1: 21.317079660752867
Rouge2: 14.844763851352388
RougeL: 20.568796939284567




Bert score: 6.075675392569974
Average response lengths: {'prediction': 71.112, 'reference': 59.504}




# Inference on radiology model

In [119]:
!huggingface-cli login --token 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

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/imx2/.cache/huggingface/token
Login successful


In [121]:
model_name = "imxx/llama-2-7b-chest-pelvis-mri-pelvis"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/633 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [123]:
predictions = get_radqas(contexts[:500], questions[:500], model, tokenizer, 32)
predictions[:10]

100%|██████████████████████████████████████████████████████████████| 500/500 [14:28<00:00,  1.74s/it]


[' No',
 ' No',
 ' No',
 ' No.\n\n The critical findings described above were communicated via the Veriphy Critical Results Reporting System as a Yellow critical result.  ',
 ' No.\n\n The critical findings described above were communicated via the Veriphy Critical Results Reporting System as a Yellow critical result.  ',
 ' No.\n The critical findings described above were communicated via the Veriphy Critical Results Reporting System as a Yellow critical result.  \n',
 '1. Small bowel obstruction at the area of ileal anastomosis. 2. Focal bowel wall thickening and hypermet',
 '1. Small bowel obstruction at the area of ileal anastomosis. 2. Fecal obstruction.  \n Given this',
 ' No',
 ' No.\n\n']

Results don't seem very promising :(

In [124]:
rouge_scores = get_rouge_scores(predictions, answers[:500])
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(predictions, answers[:500], 7)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(predictions, answers[:500])
print(f"Average response lengths: {avg_response_lengths}")

Rouge1: 12.411193973482773
Rouge2: 6.132884068738024
RougeL: 12.338329193119382




Bert score: -12.673906405025628
Average response lengths: {'prediction': 45.058, 'reference': 59.504}




# Fine tune RadQA on base model

In [32]:
model_name = "meta-llama/Llama-2-7b-chat-hf"
new_model = "llama-2-7b-radnlp-radqa"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

 Evidence of overfitting, so we increase lora dropout and rerun the model.

In [33]:
finetune(model, 
         radqa_train_dataset, #ct_impressions_train_dataset, 
         radqa_val_dataset,
         peft_config, 
         1024, 
         tokenizer, 
         training_arguments, 
         packing, 
         generate_radqa_prompt, 
         new_model,
         compute_metrics,
         preprocess_logits_for_metrics)



Map:   0%|          | 0/4878 [00:00<?, ? examples/s]

Map:   0%|          | 0/656 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Pred Len,Ref Len
500,0.8667,1.161797,57.063,51.7701,57.1129,57.1212,335.585366,190.60061
1000,0.8523,1.103447,57.9868,52.5347,58.0172,58.0629,335.585366,190.60061
1500,0.896,1.081809,74.9096,68.9406,74.8596,74.9403,335.585366,190.60061
2000,0.875,1.103495,64.4323,59.0861,64.4745,64.5917,335.585366,190.60061
2500,0.5854,1.142377,64.9798,59.7701,65.004,65.0705,335.585366,190.60061
3000,0.5503,1.187377,67.0944,61.4125,67.1242,67.2337,335.585366,190.60061
3500,0.5542,1.211889,73.3898,67.1768,73.3778,73.4113,335.585366,190.60061
4000,0.3865,1.266108,77.0488,71.2676,77.0642,77.0967,335.585366,190.60061
4500,0.3203,1.316458,76.3133,70.1386,76.316,76.3364,335.585366,190.60061
5000,0.4277,1.348567,73.8466,67.6277,73.8223,73.8649,335.585366,190.60061


decoded_preds:----------------------
 ['Right mid lung opacity is concerning for early pneumonia.\nin in in in in concerning concerning in in in in concerning concerning concerning concerning concerning in in is is concerning concerning concerning concerning concerning concerning concerning concerning in in in in in in in in in answer answer question in in in question answer question in', 'Not mid lung opacity is in is::: is is:::::::::::::::::::::::: is as concerning in in', 'small right mid-lung\n opacity, concerning for pneumonia.', 'small right mid-lung\n opacity,\n\n question question question question question question question question', 'no in context.\n[']
decoded_labels:---------------------
 ['Right mid lung opacity is concerning for early pneumonia', 'Right mid lung opacity', 'small right mid-lung\n opacity, concerning for pneumonia', 'small right mid-lung\n opacity', 'Not in context.']
decoded_preds:----------------------
 ['Right mid lung opacity is concerning for early p

Experiment: increase dropout rate to .25 and reduce number of epochs to 3 to prevent overfitting.

In [42]:
finetune(model, 
         radqa_train_dataset, #ct_impressions_train_dataset, 
         radqa_val_dataset,
         peft_config, 
         1024, 
         tokenizer, 
         training_arguments, 
         packing, 
         generate_radqa_prompt, 
         new_model,
         compute_metrics,
         preprocess_logits_for_metrics)

Map:   0%|          | 0/4878 [00:00<?, ? examples/s]

Map:   0%|          | 0/656 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Pred Len,Ref Len
500,0.8626,1.152484,52.9583,48.7004,52.9118,52.96,335.585366,190.60061
1000,0.8469,1.096629,51.282,46.6282,51.3133,51.3642,335.585366,190.60061
1500,0.8948,1.077609,55.0821,50.4339,55.0725,55.1999,335.585366,190.60061
2000,0.8817,1.083837,54.9959,50.268,54.9836,55.0213,335.585366,190.60061
2500,0.6102,1.124918,55.2738,50.9057,55.2396,55.3399,335.585366,190.60061
3000,0.5852,1.135888,55.6487,50.865,55.5744,55.6905,335.585366,190.60061
3500,0.6265,1.137062,55.9362,51.3202,55.8544,55.9624,335.585366,190.60061


decoded_preds:----------------------
 ['Right mid lung opacity is concerning for early pneumonia.\nin in in in in in00 in in in in0000\n\n0000 in in in in in in in in in in in in in in in context context context in in in question question question in in in in', 'Not mid lung opacity is in9:::99::::::::::::::::::::::::::: in context context in context', 'small right mid-lung\n opacity, concerning for pneumonia.', 'small right mid-lung\n opacity,\n question question question question question question question question', 'Not in context.']
decoded_labels:---------------------
 ['Right mid lung opacity is concerning for early pneumonia', 'Right mid lung opacity', 'small right mid-lung\n opacity, concerning for pneumonia', 'small right mid-lung\n opacity', 'Not in context.']
decoded_preds:----------------------
 ['Right mid lung opacity is concerning for early pneumonia.\nin in in in in in in in in in in in in in in in in in in in', 'Right mid lung opacity is in in in in in:::::::::: in in

In [43]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [44]:
!huggingface-cli login --token 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

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/imx2/.cache/huggingface/token
Login successful


pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/imxx/llama-2-7b-radnlp-radqa/commit/e427c4b68a2766c0f3e9a6d9bd2aa648da4acfbe', commit_message='Upload tokenizer', commit_description='', oid='e427c4b68a2766c0f3e9a6d9bd2aa648da4acfbe', pr_url=None, pr_revision=None, pr_num=None)

In [54]:
model_name = "imxx/llama-2-7b-radnlp-radqa"
new_model = "llama-2-7b-radnlp-chest-pelvis-mri-petct-radqa"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [55]:
contexts = radqa_test_dataset['context']
questions = radqa_test_dataset['qas.question']
answers = radqa_test_dataset['qas.answer']
predictions = get_radqas(contexts[:500], questions[:500], model, tokenizer, 30)
predictions[:10]

100%|██████████████████████████████████████████████████████████████| 500/500 [29:27<00:00,  3.53s/it]


[' Patent celiac and SMA, there is no ultrasound evidence of\n stenosis. [**Female First Name (un',
 ' Patent celiac and SMA, there is no ultrasound evidence of\n stenosis. [**Female First Name (un',
 ' Patent celiac and SMA, there is no ultrasound evidence of\n stenosis. [**Female First Name (un',
 ' the celiac and SMA are widely patent, Doppler\n assessment shows normal spectral flow and velocities within normal limits.\n There',
 ' the celiac and SMA are widely patent, Doppler\n assessment shows normal spectral flow and velocities within normal limits.\n There',
 ' the celiac and SMA are widely patent, Doppler\n assessment shows normal spectral flow and velocities within normal limits.\n There',
 '1) ET tube tip in good position, 2 cm above the carina.\n\n 2) Nasogatric tube',
 '1) ET tube tip in good position, 2 cm above the carina.\n\n 2) Nasogatric tube',
 '2 cm above the carina.  There is no\n change in the position of the left IJ central venous catheter; the nas',
 '2 cm above 

In [56]:
answers[:10]

['no ultrasound evidence of\n stenosis',
 'no ultrasound evidence of\n stenosis',
 'Patent celiac and SMA',
 'SMA are widely patent',
 'SMA are widely patent',
 'Doppler\n assessment shows normal spectral flow and velocities within normal limits',
 'tube tip in good position, 2 cm above the carina',
 '2 cm above the carina',
 '2 cm above the carina',
 'tube tip is in satisfactory position, 2 cm above the carina']

In [59]:
rouge_scores = get_rouge_scores(predictions, answers[:500])
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(predictions, answers[:500], 7)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(predictions, answers[:500])
print(f"Average response lengths: {avg_response_lengths}")
# base_spacy_scores = test_hallucination(nlp, predictions)
# print(f"Hallucination percent: {base_spacy_scores[2]}")

Rouge1: 36.097263516484794
Rouge2: 30.084301002803933
RougeL: 35.193033008240356




Bert score: 22.836305712512694
Average response lengths: {'prediction': 110.63, 'reference': 55.984}


# Fine-tune on general model

In [60]:
model_name = "imxx/llama-2-7b-chest-pelvis-mri-pelvis"
new_model = "llama-2-7b-radnlp-chest-pelvis-mri-petct-radqa"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [61]:
finetune(model, 
         radqa_train_dataset, #ct_impressions_train_dataset, 
         radqa_val_dataset,
         peft_config, 
         1024, 
         tokenizer, 
         training_arguments, 
         packing, 
         generate_radqa_prompt, 
         new_model,
         compute_metrics,
         preprocess_logits_for_metrics)



Map:   0%|          | 0/4878 [00:00<?, ? examples/s]

Map:   0%|          | 0/656 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Pred Len,Ref Len
500,0.8407,1.115452,52.0922,47.3875,52.194,52.2619,335.585366,190.60061
1000,0.8317,1.075856,52.5916,48.7573,52.6406,52.7331,335.585366,190.60061
1500,0.8665,1.072088,53.8067,49.2771,53.8294,53.9419,335.585366,190.60061
2000,0.8593,1.07927,55.7377,51.4784,55.7431,55.9398,335.585366,190.60061
2500,0.5985,1.111609,57.1815,52.4661,57.1348,57.3148,335.585366,190.60061
3000,0.5735,1.134341,56.4382,51.6486,56.3853,56.5872,335.585366,190.60061
3500,0.609,1.133206,56.9732,52.3976,56.9267,57.1154,335.585366,190.60061


decoded_preds:----------------------
 ['Right mid lung opacity is concerning for early pneumonia.\nin in in in in\n\n in in in\n\n\n and and and and\n\n\n text text text text text text ch ch ch ch text ch ch ch ch ch ch ch ch text text text text text text text text question in question question question question question question question question question question question question question question question', 'Not mid lung opacity is Context Context Context Context Context Context Context Context Context Context Context Context Context Context Context and and and and and is is is text text is,,::::::::::::::QuestionQuestionQuestion:::: and and and text: text text text text text question question questionlylylylyly Context Context Context Context Context Context Context Context Context Context Context Context Context Context Context Context Context Context Context Context Context', 'small right mid-lung\n opacity, concerning for pneumonia.', 'small right mid-lung\n opacity,', 'Lar in 

In [62]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

In [63]:
!huggingface-cli login --token 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

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/imx2/.cache/huggingface/token
Login successful


pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/imxx/llama-2-7b-radnlp-chest-pelvis-mri-petct-radqa/commit/1b1523f10e4b4a86df82591b7607a3cac9d53c09', commit_message='Upload tokenizer', commit_description='', oid='1b1523f10e4b4a86df82591b7607a3cac9d53c09', pr_url=None, pr_revision=None, pr_num=None)

In [64]:
predictions = get_radqas(contexts[:500], questions[:500], model, tokenizer, 30)
predictions[:10]

100%|██████████████████████████████████████████████████████████████| 500/500 [20:09<00:00,  2.42s/it]


[' Patent celiac and SMA, there is no ultrasound evidence of\n stenosis. [**Female First Name (un',
 ' Patent celiac and SMA, there is no ultrasound evidence of\n stenosis. [**Female First Name (un',
 ' Patent celiac and SMA, there is no ultrasound evidence of\n stenosis. [**Female First Name (un',
 ' liver is normal in size and echogenicity without focal lesions.\n There is no biliary duct dilatation.',
 '10.5 cm in length, the\n left kidney measures 10.1 cm.  There is no splenomegaly',
 ' liver is normal in size and echogenicity without focal lesions.\n There is no biliary duct dilatation.',
 '1) ET tube tip in good position, 2 cm above the carina.\n\n 2) Nasogatric tube',
 '1) ET tube tip in good position, 2 cm above the carina.\n\n 2) Nasogatric tube',
 '2 cm above the carina.  There is no\n change in the position of the left IJ central venous catheter; the nas',
 '2 cm above the carina.  There is no\n change in the position of the left IJ central venous catheter; the nas']

In [65]:
rouge_scores = get_rouge_scores(predictions, answers[:500])
print(f"Rouge1: {rouge_scores['rouge1']['fmeasure_mean']}")
print(f"Rouge2: {rouge_scores['rouge2']['fmeasure_mean']}")
print(f"RougeL: {rouge_scores['rougeL']['fmeasure_mean']}")
bert_scores = get_bertscore(predictions, answers[:500], 7)
print(f"Bert score: {bert_scores['f1_mean']}")
avg_response_lengths = compare_lengths(predictions, answers[:500])
print(f"Average response lengths: {avg_response_lengths}")

Rouge1: 37.60160939194189
Rouge2: 31.79404675210386
RougeL: 36.74360415276978




Bert score: 24.885057040434912
Average response lengths: {'prediction': 108.49, 'reference': 55.984}
