In [1]:
import pandas as pd
import numpy as np

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from rouge_score import rouge_scorer

# Analyze datasets

In [2]:
# Load the chest CT report table (file name suggests the validation split);
# the previews below show 'Narrative' and 'Impression' text columns.
impressions = pd.read_csv('data/223268_chestct_1_20220624a_nodup_val.csv')

In [10]:
# Preview the first rows of the chest CT table
impressions.head()

Unnamed: 0,Narrative,Impression
0,CT Chest without IV contrast Clinical informa...,1. Possible mild air trapping in the lower lo...
1,Study: CT CHEST W IV CONTRAST Date: 4/7/2017...,IMPRESSION: 1. Further decrease in size of the...
2,CT CHEST WO IV CONTRAST Date: 12/15/2021 12:03...,Stable majority of bilateral scattered lung ...
3,CT CHEST W IV CONTRAST Date: 4/1/2022 11:51 A...,1. No lymphadenopathy. 2. New groundglass nodu...
4,CT CHEST WO IV CONTRAST Date: 2/22/2022 11:08...,No abnormal findings in the left lung. Th...


In [104]:
# Display the whole frame; the rendered output elides the middle rows
impressions

Unnamed: 0,Narrative,Impression
0,CT Chest without IV contrast Clinical informa...,1. Possible mild air trapping in the lower lo...
1,Study: CT CHEST W IV CONTRAST Date: 4/7/2017...,IMPRESSION: 1. Further decrease in size of the...
2,CT CHEST WO IV CONTRAST Date: 12/15/2021 12:03...,Stable majority of bilateral scattered lung ...
3,CT CHEST W IV CONTRAST Date: 4/1/2022 11:51 A...,1. No lymphadenopathy. 2. New groundglass nodu...
4,CT CHEST WO IV CONTRAST Date: 2/22/2022 11:08...,No abnormal findings in the left lung. Th...
...,...,...
2995,CT CHEST W IV CONTRAST Date: 6/18/2020 10:55 P...,IMPRESSION: No findings to suggest source of ...
2996,CTA CHEST (PE) W IV CONTRAST Performed On 10/2...,IMPRESSION: 1. No evidence of pulmonary embolu...
2997,"CT chest, abdomen and pelvis with IV contrast....",IMPRESSION: Possible gluteal soft tissue cont...
2998,CT scan of the chest with contras for pulmona...,IMPRESSION: 1. No evidence of pulmonary embol...


In [3]:
# Read the full narrative text of the row labelled 0
impressions['Narrative'][0]

'CT Chest without IV contrast  Clinical information: Hemoptysis, allergic rhinitis  Comparison: 9/18/2006  The study was performed without intravenous contrast.  FINDINGS:  LUNGS AND AIRWAYS: The central airway is patent. Imaging of the lung parenchyma is mildly degraded by respiratory motion artifact. There are no suspicious lung nodules. There are no focal consolidations. There is a subtle mosaic pattern of the lung parenchyma in the lower lobes which could represent some air trapping.  PLEURA: The pleural surfaces are normal and there are no effusions.  LYMPH NODES/MEDIASTINUM: No mediastinal, hilar or axillary adenopathy. The esophagus is normal in appearance.  CHEST WALL: Negative   THYROID AND LOWER NECK: Negative  CARDIOVASCULAR: There is no significant enlargement of the heart, thoracic aorta or central pulmonary arteries. There is mild coronary artery calcification. There is no pericardial effusion.   UPPER ABDOMEN: Limited evaluation of the upper abdomen demonstrate a markedl

In [11]:
# Read the impression text that belongs to the same row (label 0)
impressions['Impression'][0]

' 1. Possible mild air trapping in the lower lobes, otherwise unremarkable noncontrast CT examination of the chest.  2. Markedly cirrhotic liver and splenomegaly which have developed since 2006.  Reported And Signed By: William Friedman, MD'

In [14]:
# Another impression/narrative report table, read from a gzip-compressed CSV
# (file name suggests CT abdomen/pelvis, training split).
pelvis = pd.read_csv('data/223268_ctabdpelvis_2_20220627a_nodup_train.csv.gz', compression='gzip')

In [103]:
# Display the frame loaded from the CT abdomen/pelvis file.
# NOTE(review): the saved output of this cell lists lower-extremity ultrasound
# reports, identical to the `feb` frame shown further down — re-run from a fresh
# kernel to confirm `pelvis` really holds the CT abdomen/pelvis data.
pelvis

Unnamed: 0,Impression,Narrative
0,IMPRESSION: Limited study as the patient was ...,US DUPLEX LOWER EXTREMITY VENOUS BILATERAL per...
1,IMPRESSION: No evidence of deep venous thrombo...,STUDY: US DUPLEX LOWER EXTREMITY VENOUS BILATE...
2,IMPRESSION: No evidence of deep venous thrombo...,US DUPLEX LOWER EXTREMITY VENOUS LEFT INDICAT...
3,IMPRESSION: No ultrasound evidence of deep ve...,Right lower extremity venous duplex ultrasound...
4,IMPRESSION: No evidence of deep venous thrombo...,STUDY: Left lower extremity venous Doppler ult...
...,...,...
183561,Impression: 1. Nonocclusive thrombus at the l...,Examination: Bilateral lower extremity venous ...
183562,IMPRESSION: 1. Bilateral lower extremity fem...,US DUPLEX LOWER EXTREMITY VENOUS BILATERAL H...
183563,Impression: Nonvisualization of the calf vein...,Study: US DUPLEX LOWER EXTREMITY VENOUS BILAT...
183564,IMPRESSION: 1.No evidence of deep venous throm...,STUDY: Right lower extremity venous Doppler ul...


In [None]:
# Same display as the previous cell (this copy has no execution count or saved output)
pelvis

In [17]:
# Load the February 2023 ultrasound ("US") report table from a gzip-compressed CSV
feb = pd.read_csv('data/223268_February_2023_US_20230206a_nodup_train.csv.gz', compression='gzip')

In [18]:
# Display the ultrasound frame ('Impression' and 'Narrative' columns)
feb

Unnamed: 0,Impression,Narrative
0,IMPRESSION: Limited study as the patient was ...,US DUPLEX LOWER EXTREMITY VENOUS BILATERAL per...
1,IMPRESSION: No evidence of deep venous thrombo...,STUDY: US DUPLEX LOWER EXTREMITY VENOUS BILATE...
2,IMPRESSION: No evidence of deep venous thrombo...,US DUPLEX LOWER EXTREMITY VENOUS LEFT INDICAT...
3,IMPRESSION: No ultrasound evidence of deep ve...,Right lower extremity venous duplex ultrasound...
4,IMPRESSION: No evidence of deep venous thrombo...,STUDY: Left lower extremity venous Doppler ult...
...,...,...
183561,Impression: 1. Nonocclusive thrombus at the l...,Examination: Bilateral lower extremity venous ...
183562,IMPRESSION: 1. Bilateral lower extremity fem...,US DUPLEX LOWER EXTREMITY VENOUS BILATERAL H...
183563,Impression: Nonvisualization of the calf vein...,Study: US DUPLEX LOWER EXTREMITY VENOUS BILAT...
183564,IMPRESSION: 1.No evidence of deep venous throm...,STUDY: Right lower extremity venous Doppler ul...


In [22]:
# Read the full impression text of the row labelled 0
feb['Impression'][0]

'IMPRESSION:  Limited study as the patient was unable to tolerate compressibility in the thighs bilaterally and right leg, therefore nonocclusive venous thrombosis is difficult to exclude. No evidence of occlusive deep venous thrombosis in the bilateral femoropopliteal venous systems.  If clinical concern persists, re-evaluation can be performed after 5-7 days to evaluate for propagation of clot.  Report Initiated By:  Khalid Al-Dasuqi, MD  Reported And Signed By: Joseph Cavallo, MD'

In [21]:
# TODO: figure out what this dataset is for (i.e. how the impression relates to /
# could predict the narrative)
# Read the full narrative text of the row labelled 0
feb['Narrative'][0]

'US DUPLEX LOWER EXTREMITY VENOUS BILATERAL performed on 9/21/2020 11:53 PM  INDICATION: BLE edema and pain.  COMPARISON: Ultrasound duplex lower extremity venous left dated 11/17/2019.  TECHNIQUE: Gray scale, color, and pulsed Doppler were done.   FINDINGS: Bilateral external iliac, common femoral, and left popliteal veins as well as origins of the profunda femoris and great saphenous veins are patent on gray scale and color Doppler and are fully compressible throughout their course. No evidence for deep venous thrombosis at the level of the left trifurcation.   Compressibility was not assessed in the right femoral and popliteal veins as well as the left femoral vein and right tibioperoneal trunk. No evidence of occlusive deep venous thrombosis.   There is normal respiratory variability within both external iliac veins. Augmentation to flow was not assessed in the popliteal veins.   The evaluation of the bilateral posterior tibial and peroneal veins in the calf is limited due to limit

In [19]:
# MRI report table (gzip-compressed CSV).
# Open question from the author: which MRI to use given the impression.
# Maybe the narrative should be cut off before the FINDINGS section?
mris = pd.read_csv('data/223268_mri_November_2022_20221107a_nodup_train.csv.gz', compression='gzip')
mris.head()

Unnamed: 0,Impression,Narrative
0,IMPRESSION: Acute/subacute infarct extending ...,STUDY: MRI of the brain without and with intra...
1,IMPRESSION: 1. Stable positioning of left-side...,MRI OF THE BRAIN WITHOUT CONTRAST: INDICATION...
2,IMPRESSION: 1. Focal encephalomalacia in the ...,MRI BRAIN WITH AND WITHOUT CONTRAST MRV BRAIN ...
3,Impression: Few scattered very small T2 hyper...,Study: MRI brain without and with IV contrast ...
4,Impression: Stable exam with no evidence of ne...,MRI BRAIN WITH AND WITHOUT CONTRAST INDICATI...


In [23]:
# Read the full narrative text of the row labelled 0
mris['Narrative'][0]

'STUDY: MRI of the brain without and with intravenous contrast.  INDICATION: Status post TPA for stroke causing right arm and face weakness.  COMPARISON: CT and CTA dated 6/6/2014.  TECHNIQUE: Multiplanar and multisequence MR imaging of the brain was performed without contrast.  FINDINGS: There is a small area of restricted diffusion extending craniad from the posterior left basal ganglia into the right posterior periventricular white matter. There is corresponding mild T2 prolongation and minimal magnetic susceptibility. There is no acute intracranial hematoma or mass effect. Additional scattered foci of T2 prolongation in the periventricular subcortical white matter of both cerebral hemispheres is identified, most consistent with chronic small vessel ischemic disease. There is no other evidence of intracranial hemorrhage.   The ventricles are symmetric and normal in size.  There is no extra-axial collection.   '

In [24]:
# PET/CT report table (gzip-compressed CSV).
# Shortening 'Narrative' might also be needed here — unsure; it may make sense
# for the model to predict the findings instead.
petcts = pd.read_csv('data/223268_November_2022_PETCT_20221110a_nodup_train.csv.gz', compression='gzip')
petcts.head()

Unnamed: 0,Impression,Narrative
0,IMPRESSION: Since 3/11/2020: Within limits of ...,PET CT SKULL TO THIGH AREA SUBSEQUENT performe...
1,IMPRESSION: Moderate right and small left pl...,NONCONTRAST CT OF THE CHEST FOR ATTENUATION CO...
2,IMPRESSION: Mild hypermetabolism correspondin...,PET/CT SKULL TO THIGH CLINICAL INFORMATION: B...
3,IMPRESSION: Virtually complete resolution of h...,Body F18 FDG PET-CT Scan. CLINICAL INDICATION...
4,IMPRESSION: 1. Hypermetabolic left upper lun...,PET/CT SKULL TO THIGH AREA INITIAL INDICATIO...


In [26]:
# Read the full impression text of the row labelled 0
petcts['Impression'][0]

'IMPRESSION: Since 3/11/2020: Within limits of PET and low dose CT, excellent response to multimodal therapy with no definite residual tumor, concordant with low CA-125 (although slightly increased from nadir).  1. Mild vaginal activity probably a contamination. 2. New indeterminant mildly avid subcentimeter right obturator nodes, early recurrence not excluded, attention on follow-up. 3. Avid probably inflammatory nonneoplastic changes in the anterior abdominal and pelvic wall. 4. Complete response of prior extensive metastatic disease in the peritoneum and chest. 5. Other findings as above. Continued close follow-up including PET/CT in 3-6 months is suggested.  Report Initiated By:  Jannatun Sikder, MD  Reported And Signed By: Darko Pucar, MD'

In [25]:
# Read the full narrative text of the same row (label 0)
petcts['Narrative'][0]

"PET CT SKULL TO THIGH AREA SUBSEQUENT performed on 8/17/2020 11:48 AM  INDICATION: 61 yo female with malignant mixed mullerian of the corpus uteri and left adnexa, squamous cervical cancer.  3/16/2020, TAH/BSO; in cervix uteri right invasive poorly differentiated squamous cell carcinoma in the background of high-grade squamous intraepithelial lesion CIN3 of cervix, pT1b1Nx; in corpus uteri, left ovarian tissue, omentum, liver, diaphragm, appendix, cul-de-sac, bladder serosa, sigmoid, paracolic gutter, malignant mixed mullerian tumor, pT3aNxM1. Carbo/Taxol.  CA 125, 333 on 3/2/2020, 4 on 6/2/2020, 8 on 8/11/2020.   PMH: 2011, L triple negative breast cancer, surgery, chemotherapy, RT. CA-27-29, 11 on 2/14/2020. Other PMH: Graves' disease with thyroid surgery. Diverticulitis. Never smoker.  Comparison: PET/CT 3/11/2020.  Correlation: CT chest abdomen pelvis June 18, 2020.  TECHNIQUE: Oral contrast was given approximately 60 minutes prior to the CT scan. Approximately 60 minutes followin

# Try fine-tuning Llama 2 (practice run on CNN/DailyMail before moving to the chest CT dataframe)

In [61]:
# Check what kind of object `dataset` is.
# NOTE(review): `dataset` is only assigned in cells further down this notebook,
# so this cell fails on a fresh kernel when run top to bottom.
type(dataset)

datasets.arrow_dataset.Dataset

In [64]:
# Draw a fixed 5000-example practice subset: shuffle reproducibly (seed 42),
# keep the first 5000 rows, then display the resulting dataset summary.
mini_dailymail_dataset = dataset.shuffle(seed=42).select(range(5000))
mini_dailymail_dataset

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 5000
})

In [67]:
# Prompt formatter handed to the trainer as `formatting_func`
def formatting_prompts_func_summarize(example):
    """Turn a batch of articles and highlights into training prompts.

    Args:
        example: mapping with parallel sequences under 'article' and
            'highlights' (one entry per sample in the batch).

    Returns:
        A list with one string per article, laid out as
        "### Question: Summarize this article: <article>\\n ### Answer: <highlights>".
    """
    return [
        f"### Question: Summarize this article: {article}\n ### Answer: {example['highlights'][idx]}"
        for idx, article in enumerate(example['article'])
    ]

In [32]:
# Download/load the CNN/DailyMail summarization corpus, config '3.0.0'.
# The progress output below shows train, validation and test splits being generated.
dataset_name = "cnn_dailymail"
dataset = load_dataset(dataset_name, '3.0.0')

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
# Open problem: how to turn our own data into a `datasets` Dataset if it cannot
# be uploaded to the Hugging Face hub.
# Keep only the train split from here on.
# NOTE(review): this rebinds `dataset` to a different kind of object, so the
# cell is not safe to run twice in a row — consider a separate name.
dataset = dataset['train']

In [39]:
# Show the features and row count of the train split
dataset

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 287113
})

In [40]:
# Load a second dataset for comparison; its summary below shows a single
# 'text' column with 1000 rows.
# NOTE(review): this overwrites `dataset_name`, which earlier held "cnn_dailymail".
dataset_name = "mlabonne/guanaco-llama2-1k"
dataset2 = load_dataset(dataset_name, split='train')
dataset2

Dataset({
    features: ['text'],
    num_rows: 1000
})

In [None]:
# Inspect the raw 'text' column (all 1000 rows — slice it to keep the output short)
dataset2['text']

In [36]:
# Configuration cell: every tunable value for the QLoRA fine-tuning run lives
# here as a plain module-level variable that later cells read by name.

# The model that you want to train from the Hugging Face hub
model_name = "meta-llama/Llama-2-7b-chat-hf"

# # The instruction dataset to use
# dataset_name = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model name (also used as the local save directory and hub repo name)
new_model = "llama-2-7b-cnndailymail-practice"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models (name of a torch dtype, resolved with getattr later)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use (None leaves the choice to the trainer)
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [None]:
# Build everything the trainer needs except the data: the 4-bit quantized base
# model, its tokenizer, the LoRA adapter config and the TrainingArguments.
# Every hyperparameter referenced here is defined in the configuration cell.
# TODO: reformat impressions to dataset

# Resolve the dtype name from the config (e.g. "float16") to the torch dtype
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings applied while the base model weights are loaded
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16 (informational only: prints a hint)
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False  # turn the generation cache off for training
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters.
# per_device_eval_batch_size and gradient_checkpointing are set in the
# configuration cell and are now forwarded here; before, they were silently
# ignored and the TrainingArguments defaults were used instead.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

In [68]:
# Set supervised fine-tuning parameters.
# Each batch is turned into prompt strings by formatting_prompts_func_summarize,
# so no dataset text field is passed.
# NOTE(review): max_seq_length is None in the config cell, so the trainer's own
# default length limit applies — confirm that long articles are not truncated
# before their "### Answer:" part.
trainer = SFTTrainer(
    model=model,
    train_dataset=mini_dailymail_dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
    formatting_func=formatting_prompts_func_summarize,
)

# Train model
trainer.train()

# Save trained model (written to the directory named by new_model)
trainer.model.save_pretrained(new_model)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Step,Training Loss
25,2.0092
50,1.8181
75,1.7756
100,1.8155
125,1.7452
150,1.809


In [97]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Pick the first article of the full train split that is NOT part of the
# practice training subset, so the demo prompt was never seen in training.
# The subset's articles are collected once into a set; the previous version
# re-read both full 'article' columns on every loop iteration.
train_articles = set(mini_dailymail_dataset['article'])
example_article = ''
example_summary = ''
for example in dataset:
    if example['article'] not in train_articles:
        example_article = example['article']
        example_summary = example['highlights']
        break

# Run text generation pipeline with our new model, using the same
# "### Question: ... ### Answer:" layout as the training prompts.
# NOTE(review): max_length presumably counts the prompt tokens too, so a long
# article may leave little or no room for the generated summary — confirm, and
# consider max_new_tokens instead.
prompt = f"Summarize this article: {example_article}"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=760)
result = pipe(f"### Question: {prompt}\n ### Answer:")
print(result[0]['generated_text'])

### Question: Summarize this article: LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on t

In [95]:
# Print the reference highlights of the held-out article for comparison with the generation above
print(example_summary)

Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .


In [98]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
# Attach the adapter saved under `new_model`, then fold it into the base weights
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it.
# EOS doubles as the padding token, exactly as in the training setup. No extra
# '[PAD]' token is registered: the model's embeddings are never resized in this
# notebook, and the pad token was overwritten with EOS right afterwards anyway,
# so the extra vocabulary entry only made the saved tokenizer disagree with the
# model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [102]:
!huggingface-cli login --token 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

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/imx2/.cache/huggingface/token
Login successful


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/imxx/llama-2-7b-cnndailymail-practice/commit/3053413221d05f16a79ea41ffa6a7ebcdb701586', commit_message='Upload tokenizer', commit_description='', oid='3053413221d05f16a79ea41ffa6a7ebcdb701586', pr_url=None, pr_revision=None, pr_num=None)