In [3]:
# empty gpu cache
import torch
torch.cuda.empty_cache()
import gc
gc.collect()
import json
from tqdm import tqdm

In [4]:
torch.cuda.memory_summary()



In [5]:
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
from transformers import default_data_collator
import peft
import transformers


In [6]:
notebook_dir = os.getcwd()
model_dir = os.path.join(notebook_dir, "..", "..", "..", "..", "local-models/Llama-3.2-1B")
adapter_dir = "/xdisk/bethard/kbozler/repositories/discharge-summarization/output/adapters/Llama-3.2-1B"

device = "cuda" if torch.cuda.is_available() else "cpu"


In [7]:
def load_model_and_tokenizer(model_path):
    ''' load model and tokenizer '''

    # set quantization configs if using qlora
    quantization_config = transformers.BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16
            )

    # define model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                    quantization_config=quantization_config)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id

    return model, tokenizer

def get_lora_model(model, device):
    ''' add peft adapter to model '''

    task_type = peft.TaskType.CAUSAL_LM
    
    # prepare for k-bit training
    model = peft.prepare_model_for_kbit_training(model) 
    
    # get peft configs based on architecture (task_type) and fine-tuning method
    config = peft.LoraConfig(   
                                task_type=task_type, 
                                inference_mode=False,
                                r=8, 
                                lora_alpha=32,
                                lora_dropout=0.1
                            )

    # wrap model w peft configs
    model = peft.get_peft_model(model, config).to(device)
    model.print_trainable_parameters()

    return model

In [8]:
base_model, tokenizer = load_model_and_tokenizer(model_dir)
model = peft.PeftModel.from_pretrained(
    base_model,
    adapter_dir,
    is_trainable=False
    )
model.to(device)
model.eval()

`low_cpu_mem_usage` was None, now default to True since model is quantized.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128257, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Li

: 

In [44]:
test_df = pd.read_csv("/xdisk/bethard/kbozler/repositories/discharge-summarization/data/processed/test.csv")

In [10]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [57]:
def single_inference_local(model, tokenizer, test_sample):
    """
    Generates a single summary using the local model
    Args:
        model (transformers.AutoModelForCausalLM): model to use
        test_sample (str): sample to generate a summary for

    Returns:
        str: generated summary
    """
    # generate summary
    device = model.device
    inputs = tokenizer(test_sample, return_tensors="pt")
    inputs = {key: value.to(device) for key, value in inputs.items()}
    print("device", inputs["input_ids"].device)
    outputs = model.generate(inputs["input_ids"],
                            tokenizer=tokenizer,
                            min_new_tokens=250,
                            max_new_tokens=500,
                            temperature=0.8,
                            top_p=0.9,
                            top_k=40,
                            repetition_penalty=1.2,
                            stop_strings=[tokenizer.eos_token]
                            ) #check stop strings class 
    
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(result)
    return result[len(test_sample):]

In [58]:
generated_summaries = []
for i, trial in tqdm(test_df.iterrows(), total=test_df.shape[0], desc=f"Generating summaries with Llama-3.2-1B"):
    if (i+1)/test_df.shape[0] % 0.1 == 0:
        print(f"test sample {i+1} of {test_df.shape[0]}")
    text = trial["text"]
    generated_summaries.append(single_inference_local(model, tokenizer, text))

Generating summaries with Llama-3.2-1B:   0%|          | 0/20 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


device cuda:0


Generating summaries with Llama-3.2-1B:   5%|▌         | 1/20 [00:11<03:41, 11.68s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


age: 46. gender: M.  
Name:  ___             Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   M
 
Service: MEDICINE
 
Allergies: 
No Known Allergies / Adverse Drug Reactions
 
Attending: ___.
 
Chief Complaint:
Alcohol withdrawal
 
Major Surgical or Invasive Procedure:
None

 
History of Present Illness:
From nightfloat admitting resident ___ presents w/ concern 
for alcohol withdrawal. Reports a h/o prior withdrawal seizure. 
Drinks 16 drinks of vodka daily, today has had 6, last at ___. Also complaining of insomnia, seeing flashes of "demonic 
faces" and dreams of being levitated by devils. No auditory 
hallucinations. No SI or HI. No h/o prior psych diagnoses. He 
has poor sleep hygiene at baseline and has not been sleeping 
well recently. He noted after beginning to drink today that he 
was having dry heaves, relieved with Pepto Bismol. He continued 
to not tolerate PO and his case worker was concerned that he 
might 

Generating summaries with Llama-3.2-1B:  10%|█         | 2/20 [00:29<04:33, 15.19s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


age: 79. gender: F.  
Name:  ___                   Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Service: PLASTIC
 
Allergies: 
Penicillins / trees / rye grass
 
Attending: ___
 
___ Complaint:
Breast cancer
 
Major Surgical or Invasive Procedure:
Left simple mastectomy and sentinel lymph node biopsy, with 
immediate tissue expander insertion

 
History of Present Illness:
___ year old female with history of previous bilateral breast 
cancer, BCT on left, MRM on right. Now with recurrent left 
breast cancer.

 
Past Medical History:
HTN
macular degeneration
breast cancer
Compression fracture T7 ___ s/p fall
Thyroid disease
reflux 
 
Social History:
Non-smoker, occasional ETOH, denies recreational drug use.
 
Physical Exam:
Pre-procedure physical exam as documented in anesthesia record 
___:
pulse: 65/min
BP: 135/71
O2sat: 98% RA
.
General: NAD
Mental/psych: AxOx3
Airway: as documented in detail in anesthesia record

Generating summaries with Llama-3.2-1B:  10%|█         | 2/20 [00:35<05:16, 17.59s/it]


KeyboardInterrupt: 

: 

In [None]:
stories = [
    "A lonely robot found a broken music box in an abandoned city. As he fixed it, the melody attracted other robots. Together, they created the city's first robot orchestra.",
    "The last seed on Earth was planted by a child in her grandmother's garden. Against all odds, it grew into a magical tree that produced seeds of every plant that had been lost.",
    "In a world where dreams were visible as floating bubbles, a young girl discovered she could weave them into blankets. Her creations brought comfort to those suffering from nightmares.",
    "An old lighthouse keeper discovered that his beacon didn't guide ships, but rather lost stars back to their constellations. Each night, he helped rebuild the night sky.",
    "Deep in the digital forest, a virus learned to heal corrupted files instead of destroying them. Other programs began calling it the Digital Doctor.",
    "A time-traveling mailman accidentally delivered letters to the wrong centuries. The resulting mix-ups created unexpected friendships across time.",
    "The last bookstore on Mars housed a librarian who could bring characters to life by reading aloud. She used this gift to help homesick colonists feel less alone."
]

summaries = [
    "A robot repairs a music box and builds community through music.",
    "A child's last seed miraculously restores Earth's lost plants.",
    "A girl turns visible dreams into comforting blankets for nightmare sufferers.",
    "A lighthouse keeper helps lost stars find their way back to constellations.",
    "A benevolent virus becomes known for healing corrupted files.",
    "A mailman's time-travel mistakes lead to cross-century friendships.",
    "A Martian librarian uses her power to comfort colonists with living stories."
]

alternative_stories = [
    "In a forgotten city, a solitary robot stumbled upon an ancient piano. As it played, the harmonious notes summoned other robots, and together they formed a symphony that echoed through the empty streets.",
    "A young girl planted a mysterious seed in a barren land. To everyone's amazement, it sprouted into a tree that bore fruits of every extinct plant, reviving the world's lost flora.",
    "In a realm where dreams floated like clouds, a girl learned to capture them in jars. Her bottled dreams provided solace to those haunted by restless nights.",
    "An old lighthouse keeper discovered that his beacon didn't guide ships, but rather lost stars back to their constellations. Each night, he helped rebuild the night sky.",

]

alternative_summaries = [
    "A robot finds a piano and unites others through music.",
    "A girl's seed grows into a tree that revives extinct plants.",
    "A girl captures dreams to comfort those with nightmares.",
    "A lighthouse keeper helps lost stars find their way back to constellations.",

]
pairs_train = pd.DataFrame({"text": stories, "target": summaries})
pairs_val = pd.DataFrame({"text": alternative_stories, "target": alternative_summaries})
pairs_train = Dataset.from_pandas(pairs_train)
pairs_val = Dataset.from_pandas(pairs_val)
pairs = DatasetDict({"train": pairs_train, "validation": pairs_val})


In [None]:
pairs

In [None]:

def tokenize_function(example, tokenizer):
    inputs = tokenizer(example["text"], add_special_tokens=True) # only gonna add bos
    targets = tokenizer(example["target"], add_special_tokens=False) # we will manually add eos
    for i in range(len(inputs["input_ids"])):
        sample_input_ids =  inputs["input_ids"][i] 
        sample_label_input_ids = targets["input_ids"][i] + [tokenizer.eos_token_id]
        inputs["input_ids"][i] = sample_input_ids + sample_label_input_ids
        targets["input_ids"][i] = [-100] * len(sample_input_ids) + sample_label_input_ids
        inputs["attention_mask"][i] = [1] * len(inputs["input_ids"][i])
    inputs["labels"] = targets["input_ids"]
    # input_ids, attention_mask, and labels are all the same length for a given sample, but not across samples
    # so we need to pad to max length from left side

    max_length = max([len(x) for x in inputs["input_ids"]])
    # add padding tokens to the left side of the input ids, attention mask, and labels
    for i in range(len(inputs["input_ids"])):
        inputs["input_ids"][i] = ([tokenizer.pad_token_id] * 
                                (max_length - len(inputs["input_ids"][i])) + 
                                inputs["input_ids"][i])
        inputs["attention_mask"][i] = ([0] * (max_length - len(inputs["attention_mask"][i])) +
                                    inputs["attention_mask"][i])
        inputs["labels"][i] = ([-100] * (max_length - len(inputs["labels"][i])) +
                                inputs["labels"][i])
        
    return inputs


In [None]:
pairs_dataset = pairs.map(
        lambda x: tokenize_function(x, tokenizer=tokenizer),
        batched=True,
        remove_columns=pairs['train'].column_names,
        load_from_cache_file=False,
        desc="Running tokenizer on train dataset",
    )

In [None]:
for split in pairs_dataset:
    print(split)
    for i in range(len(pairs_dataset[split]["input_ids"])):
        print(pairs_dataset[split]["input_ids"][i])   
        print(len(pairs_dataset[split]  ["input_ids"][i]))
        print(pairs_dataset[split]["attention_mask"][i])  
        print(len(pairs_dataset[split]["attention_mask"][i]))
        print(pairs_dataset[split]["labels"][i])
        print(len(pairs_dataset[split]["labels"][i]))
        print("-"*100)


In [None]:
train_dataloader = DataLoader(
    pairs_dataset['train'], shuffle=True, collate_fn=default_data_collator, batch_size=2
)
dev_dataloader = DataLoader(
    pairs_dataset['validation'], shuffle=True, collate_fn=default_data_collator, batch_size=2
)

In [None]:
for batch in dev_dataloader:
    print("-"*100)
    print("train batch")
    for i in range(len(batch["input_ids"])):
        print(f"number {i}")
        print(batch["input_ids"][i])   
        print(len(batch["input_ids"][i]))
        print(batch["attention_mask"][i])  
        print(len(batch["attention_mask"][i]))
        print(batch["labels"][i])
        print(len(batch["labels"][i]))
        print("-"*100)
