In [1]:
!pip install transformers datasets accelerate peft



In [2]:
!pip uninstall bitsandbytes -y

[0m

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # Replace with the appropriate Llama 3.1 model name

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set up LoRA configuration
lora_config = LoraConfig(
    r=16,  # Low-rank dimension
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout probability
    bias="none",  # Don't add bias to the LoRA adapters
    target_modules=["q_proj", "v_proj"],  # Apply LoRA to specific attention layers
)

# Wrap the model with LoRA
model = get_peft_model(model, lora_config)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
import torch
print(f'MPS available: {torch.backends.mps.is_available()}')

device = "mps" if torch.backends.mps.is_available() else "cpu"
model = model.to(device)

MPS available: True


In [5]:
from datasets import load_dataset

dataset = load_dataset("ruslanmv/ai-medical-chatbot", split='all')
dataset

Dataset({
    features: ['Description', 'Patient', 'Doctor'],
    num_rows: 256916
})

In [6]:
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}


In [7]:
# Function to transform dataset by merging context (Patient) and Description into instruction
def format_example(example):
    patient_info = example['Patient']
    description = example['Description']
    doctor_response = example['Doctor']
    
    # Construct the instruction with the Patient as context and Description as the question
    instruction = f"Patient: {patient_info}. Description: {description}"
    
    return {
        'instruction': instruction,
        'response': doctor_response
    }

# Apply the transformation to the entire dataset
formatted_dataset = dataset.map(format_example, remove_columns=dataset.column_names)
formatted_dataset


Dataset({
    features: ['instruction', 'response'],
    num_rows: 256916
})

In [8]:
# Check if [PAD] token exists, add it if missing
if tokenizer.pad_token is None:
   print(f'Added pad_token {tokenizer.eos_token}')
   tokenizer.pad_token = tokenizer.eos_token

tokenizer.padding_side = "right"

# Update the model's token embeddings to accommodate the new [PAD] token
model.resize_token_embeddings(len(tokenizer))

Added pad_token <|eot_id|>


Embedding(128256, 4096)

In [9]:
from copy import deepcopy

IGNORE_INDEX = -100  # The default setting in CrossEntropyLoss

# Tokenize the dataset
def tokenize_function(example):

    if example.get("Patient", "") == "":
        prompt = PROMPT_DICT["prompt_no_input"].format(instruction=example['Description'])
    else:
        prompt = PROMPT_DICT["prompt_input"].format(instruction=example['Description'], input=example['Patient'])

    formatted_example = prompt + example['Doctor']

    encoded_prompt = torch.tensor(
            tokenizer.encode(prompt), dtype=torch.int64
        )
        
    example = tokenizer.encode(formatted_example)
    example.append(tokenizer.eos_token_id)
    example = torch.tensor(
        example, dtype=torch.int64
    )
    labels = deepcopy(example)
    labels[: len(encoded_prompt)] = -1
    example_mask = example.ge(0)
    label_mask = labels.ge(0)
    example[~example_mask] = 0
    labels[~label_mask] = IGNORE_INDEX

    return {
        "input_ids": example.tolist(),
        "labels": labels.tolist(),
        "attention_mask":example_mask.tolist(),
    }


tokenized_dataset = dataset.map(tokenize_function, remove_columns=['Description', 'Patient', 'Doctor'], batched=False)

In [10]:
tokenized_dataset

Dataset({
    features: ['input_ids', 'labels', 'attention_mask'],
    num_rows: 256916
})

In [11]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [12]:
train_test_split = tokenized_dataset.train_test_split(test_size=0.001)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

In [13]:
train_dataset[0]

{'input_ids': [128000,
  39314,
  374,
  459,
  7754,
  430,
  16964,
  264,
  3465,
  11,
  35526,
  449,
  459,
  1988,
  430,
  5825,
  4726,
  2317,
  13,
  9842,
  264,
  2077,
  430,
  36001,
  45695,
  279,
  1715,
  382,
  14711,
  30151,
  512,
  48,
  13,
  8595,
  656,
  18852,
  636,
  23540,
  1306,
  1176,
  892,
  1877,
  1980,
  14711,
  5688,
  512,
  13347,
  10896,
  11,
  358,
  1047,
  1877,
  369,
  279,
  1176,
  892,
  13,
  4194,
  6153,
  430,
  11,
  358,
  1550,
  539,
  636,
  856,
  18852,
  3686,
  13,
  358,
  4024,
  311,
  264,
  342,
  75030,
  16549,
  13,
  3005,
  1047,
  4529,
  856,
  6680,
  6205,
  323,
  433,
  574,
  8389,
  13,
  2030,
  11,
  2103,
  358,
  617,
  539,
  17454,
  856,
  18852,
  323,
  433,
  706,
  1027,
  264,
  2046,
  8469,
  13,
  3639,
  1436,
  387,
  279,
  2944,
  369,
  856,
  3389,
  18852,
  1980,
  14711,
  6075,
  25,
  9906,
  13,
  1666,
  279,
  41529,
  13746,
  305,
  8974,
  5990,
  527,
  2753,
  1109,


In [14]:
from transformers import TrainingArguments, Trainer
from torch.optim import AdamW

training_args = TrainingArguments(
    output_dir="./llama_lora_finetuned",
    eval_strategy="steps",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    save_steps=1000,
    logging_steps=50,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=10,
    load_best_model_at_end=True,
)

optimizer = AdamW(model.parameters(), lr=5e-5)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset[:100],
    data_collator=data_collator,
    optimizers=(optimizer, None),  # No need for a scheduler here
)


In [15]:
trainer.train()

  0%|          | 0/256659 [00:00<?, ?it/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


{'loss': 3.8686, 'grad_norm': 52.04057693481445, 'learning_rate': 4.99922072558241e-05, 'epoch': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_input_ids_runtime': 258.6841, 'eval_input_ids_samples_per_second': 0.387, 'eval_input_ids_steps_per_second': 0.387, 'epoch': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_labels_runtime': 155.8769, 'eval_labels_samples_per_second': 0.642, 'eval_labels_steps_per_second': 0.642, 'epoch': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_attention_mask_runtime': 224.0191, 'eval_attention_mask_samples_per_second': 0.446, 'eval_attention_mask_steps_per_second': 0.446, 'epoch': 0.0}
{'loss': 7.0583, 'grad_norm': 19.55243682861328, 'learning_rate': 4.9982466325604234e-05, 'epoch': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_input_ids_runtime': 410.3618, 'eval_input_ids_samples_per_second': 0.244, 'eval_input_ids_steps_per_second': 0.244, 'epoch': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_labels_runtime': 397.7662, 'eval_labels_samples_per_second': 0.251, 'eval_labels_steps_per_second': 0.251, 'epoch': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
model.save_pretrained("./llama3_lora_model_finetuned")
tokenizer.save_pretrained("./llama3_lora_tokenizer_finetuned")