<a href="https://colab.research.google.com/github/ashishpatel26/LLM-Finetuning/blob/main/20.Alpaca_%2B_Gemma2_9b_Unsloth_2x_faster_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
import pandas as pd

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading configuration reused by later cells.
max_seq_length = 2048  # Sequence-length limit handed to the loader and the trainer.
dtype = None  # None = auto-detect; float16 for Tesla T4/V100, bfloat16 for Ampere+.
load_in_4bit = True  # 4-bit quantization to reduce memory usage; can be False.

# Pre-quantized 4-bit checkpoints, listed for reference only
# (the list is not read anywhere else in the visible notebook).
fourbit_models = [
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
]

# Load the base model together with its tokenizer.
# For gated checkpoints such as meta-llama/Llama-2-7b-hf, also pass token="hf_...".
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Attach LoRA adapters to the attention and MLP projection layers of the model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank; any value > 0 works (8, 16, 32, 64, 128 are suggested).
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # Any value is supported, but 0 is the optimized setting.
    bias="none",  # Any value is supported, but "none" is the optimized setting.
    # "unsloth" checkpointing is described upstream as using 30% less VRAM;
    # True is the other accepted value.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,  # Rank-stabilized LoRA is supported but switched off here.
    loftq_config=None,  # LoftQ is supported but not configured here.
)

In [None]:
# Prompt template with three positional "{}" slots, filled in this order:
# instruction, input, response. Pass "" for the response slot when generating.
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token of the loaded tokenizer; formatting_prompts_func appends
# it to every formatted example.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of records into complete prompt strings.

    Args:
        examples: A batch in column form (as supplied by ``Dataset.map`` with
            ``batched=True``) exposing the ``"instruction"``, ``"input"`` and
            ``"output"`` columns.

    Returns:
        A dict with the single key ``"text"`` holding one string per record:
        the module-level ``prompt`` template filled with the record's
        instruction, input and output, followed by ``EOS_TOKEN``.
    """
    records = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            # EOS_TOKEN marks where the answer ends; without it generation
            # would not learn to stop.
            prompt.format(instruction, context, answer) + EOS_TOKEN
            for instruction, context, answer in records
        ],
    }

from datasets import load_dataset
from datasets import Dataset
# Download the training split of the adjuvant-extraction dataset.
dataset = load_dataset(
    'RehanaHasin/vaccine_adjuvant_trial_97',
    split="train",
)
# Add the "text" column produced by formatting_prompts_func, one batch at a time.
dataset = dataset.map(formatting_prompts_func, batched=True)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Probe bf16 support once: train in bf16 where available, otherwise in fp16.
use_bf16 = is_bfloat16_supported()

# Optimisation hyper-parameters for a short 50-step run.
training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=50,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    weight_decay=0.01,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    seed=3407,
)

# Supervised fine-tuning on the pre-rendered "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Off here; noted upstream as a possible speed-up for short sequences.
    args=training_args,
)

In [None]:
#@title Show current memory stats
# Properties of CUDA device 0 (name and total memory are used below).
gpu_stats = torch.cuda.get_device_properties(0)
# Total device memory, converted from bytes by dividing by 1024**3.
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
# Baseline of reserved memory before training; the final-stats cell subtracts it.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

In [None]:
# Run fine-tuning. The result is kept because the final-stats cell reads
# trainer_stats.metrics['train_runtime'] from it.
trainer_stats = trainer.train()

In [None]:
#@title Show final memory and time stats
# Peak reserved memory so far, converted from bytes by dividing by 1024**3.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
# Growth over the baseline recorded before training started.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# Both figures as a share of the device's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_runtime = trainer_stats.metrics['train_runtime']
print(
    f"{train_runtime} seconds used for training.",
    f"{round(train_runtime / 60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
    sep="\n",
)

In [None]:
# Single-example smoke test; reuses the `prompt` template defined earlier.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# Build one prompt (instruction + input, empty response slot), tokenize it as a
# batch of one and move the tensors to the GPU.
inputs = tokenizer(
[
    prompt.format(
        """Task: Extract vaccine adjuvant names from the provided clinical trial data. Each line of the input includes a unique identifier (NCT Number) and a study summary of the clinical trial, separated by a tab. Your task is to identify any mention of adjuvants in the data and pair it with the corresponding NCT Number. The output format is provided below.

```
Output Format:
Produce a TSV (tab-separated values) output file with the two following columns:
* NCT Number: The unique identifier for each trial.
* Adjuvant Name: The name of the adjuvant mentioned in the provided clinical trial information.
If multiple adjuvants are mentioned, list each adjuvant in a separate output line. The output should not contain any blank columns. The output should include a line with 'Done' at the end.
```""", # instruction
        """NCT Number	Study Summary
NCT01479244	Study Title: Efficacy and Safety Study of NeuVax (Nelipepimut-S or E75) Vaccine to Prevent Breast Cancer Recurrence. Brief Summary: Purpose of this trial: 1. To assess the efficacy and safety of NeuVax™ administered with adjuvant Leukine (sargramostim, GM-CSF). 2. To evaluate and compare the disease free survival (DFS) in the vaccinated and control subjects. Interventions: BIOLOGICAL: NeuVax vaccine | BIOLOGICAL: Leukine (sargramostim, GM-CSF) and water for injection.""", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

# Generate at most 64 new tokens for the prompt.
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# Decode with special tokens kept; as the cell's last expression the result is displayed.
tokenizer.batch_decode(outputs)

In [None]:
# Decode the generated ids; special tokens are kept so the end of the answer
# can be located below.
output = tokenizer.batch_decode(outputs)

# Keep only the text that follows the response marker of the prompt template.
response = output[0].split("### Response:\n")[1]

# Cut the answer at the end-of-sequence marker. Training appended
# tokenizer.eos_token to every example, so that is the marker the model was
# taught to emit; the literal "<|end_of_text|>" is kept as a fallback so the
# previous behaviour is unchanged when the two are the same token.
for stop_token in (tokenizer.eos_token, "<|end_of_text|>"):
    if stop_token:
        response = response.strip().split(stop_token)[0]

# Parse the text into a list of lines
lines = response.strip().split("\n")

# Split each line into tab-separated columns
data = [line.split("\t") for line in lines]

# Create a dataframe: the first line is used as the header, the rest as rows
df = pd.DataFrame(data[1:], columns=data[0])

# Display the dataframe
df

In [None]:
# Load the test split and render it with the same formatter as the training data.
test_dataset = load_dataset('RehanaHasin/vaccine_adjuvant_trial_97', split = "test")
# NOTE(review): formatting_prompts_func fills the response slot with each
# record's "output" and appends EOS_TOKEN, so the resulting "text" column
# contains the reference answer. Confirm that is intended before using "text"
# as a generation prompt.
test_dataset = test_dataset.map(formatting_prompts_func, batched = True,)
# Display the dataset as the cell output.
test_dataset

In [None]:
# Function to generate predictions
def generate_predictions(examples):
    """Generate a completion for every record of a batch.

    The prompts are rebuilt from the ``"instruction"`` and ``"input"`` columns
    with an empty response slot. The pre-rendered ``"text"`` column is
    deliberately not used: it was produced by ``formatting_prompts_func`` and
    therefore already contains the reference output followed by the EOS token,
    which would leak the answer into the prompt being evaluated.

    Args:
        examples: A batch in column form (``Dataset.map`` with
            ``batched=True``) exposing ``"instruction"`` and ``"input"``.

    Returns:
        A dict with the single key ``"predictions"``: one decoded string per
        record (prompt followed by the generated continuation, special tokens
        removed).
    """
    # Answer-free prompts: the response slot is left empty for the model to fill.
    prompts = [
        prompt.format(instruction, context, "")
        for instruction, context in zip(examples["instruction"], examples["input"])
    ]

    # Batched generation with a decoder-only model needs left padding so that
    # every prompt ends right where generation starts. The previous setting is
    # restored afterwards so other cells are unaffected.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to("cuda")
    finally:
        tokenizer.padding_side = previous_padding_side

    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
    predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return {"predictions": predictions}

# Run generate_predictions over the test set in batches of 8 records; the
# returned "predictions" values become a column of the resulting dataset.
predictions_dataset = test_dataset.map(generate_predictions, batched=True, batch_size=8)


In [None]:
# Display predictions_dataset as the cell output.
predictions_dataset

In [None]:
# Convert the dataset to a pandas DataFrame.
# Note: this rebinds `df`, replacing the single-example table built earlier.
df = pd.DataFrame(predictions_dataset)
# Display the DataFrame as the cell output.
df

In [None]:
# Print the prediction stored at index label 4 (not the first row).
print(df.loc[4, 'predictions'])

In [None]:
import os

# Create a new DataFrame with only the 'predictions' column
prediction_df = df[['predictions']]

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
prediction_csv_path = "llama3_output/llama3_8b_Instruct_bnb_4bit/prediction_df.csv"
os.makedirs(os.path.dirname(prediction_csv_path), exist_ok=True)
prediction_df.to_csv(prediction_csv_path, index=False)

# Display the DataFrame as the cell output.
prediction_df

In [None]:
# Function to process each prediction
def process_prediction(prediction):
    """Parse one decoded model output into a table.

    The text after the ``"### Response:\\n"`` marker is read as tab-separated
    lines: the first non-blank line is the header, the remaining lines are
    rows. Blank lines and the ``Done`` terminator line requested by the prompt
    are not data and are dropped.

    Args:
        prediction: Decoded model output (prompt followed by the completion).

    Returns:
        A ``pandas.DataFrame`` with the parsed rows. An empty DataFrame is
        returned (after printing a message for genuine errors) when the value
        is not a string, the marker is missing, the response is blank, or a
        row has more fields than the header.
    """
    marker = "### Response:\n"
    if not isinstance(prediction, str) or marker not in prediction:
        print("Error processing prediction: response marker not found")
        return pd.DataFrame()

    # Text between the first marker and any later repetition of it.
    response = prediction.split(marker)[1]

    rows = [
        line.split("\t")
        for line in response.strip().split("\n")
        # Skip blank lines and the 'Done' sentinel that closes the output.
        if line.strip() and line.strip() != "Done"
    ]
    if not rows:
        # Nothing to parse; avoid creating a frame with a single "" column.
        return pd.DataFrame()

    header, *records = rows
    try:
        return pd.DataFrame(records, columns=header)
    except (ValueError, TypeError) as e:
        # Raised when the rows do not fit the header (e.g. too many fields).
        print(f"Error processing prediction: {e}")
        return pd.DataFrame()

import os

# Parse every raw prediction into its own table.
processed_dfs = [process_prediction(prediction) for prediction in prediction_df['predictions']]

# Stack the per-prediction tables. pd.concat raises ValueError on an empty
# list, so fall back to an empty frame when there are no predictions at all.
predicted_adjuvant_df = (
    pd.concat(processed_dfs, ignore_index=True) if processed_dfs else pd.DataFrame()
)

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
predicted_csv_path = "llama3_output/llama3_8b_Instruct_bnb_4bit/predicted_adjuvant_df.csv"
os.makedirs(os.path.dirname(predicted_csv_path), exist_ok=True)
predicted_adjuvant_df.to_csv(predicted_csv_path, index=False)

# Display the combined table as the cell output.
predicted_adjuvant_df

In [None]:
# Columns of the gold-standard file needed for the comparison.
columns_to_keep = ['NCT Number', 'Adjuvant Name']

# Load the gold-standard annotations and narrow them to those columns.
gold_standard_df = pd.read_csv("Dataset/AdjuvareDB104_Standard/11_folds_preprocessed_merged_file.csv")
gold_standard_df = gold_standard_df.loc[:, columns_to_keep]

# Preview the first two rows as the cell output.
gold_standard_df.head(2)

In [None]:
import os

# Inner join on the shared 'NCT Number' column: only trials present in both
# the predictions and the gold standard are kept. Any other column name that
# occurs in both frames (e.g. 'Adjuvant Name' when the model emits the
# requested header) gets pandas' default suffixes: `_x` for the predicted
# frame and `_y` for the gold standard.
df_merged = pd.merge(predicted_adjuvant_df, gold_standard_df, on='NCT Number', how='inner')

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
merged_csv_path = "llama3_output/llama3_8b_Instruct_bnb_4bit/predicted_merged_with_goldstandard.csv"
os.makedirs(os.path.dirname(merged_csv_path), exist_ok=True)
df_merged.to_csv(merged_csv_path, index=False)

# Display the result
df_merged

In [None]:
# Streaming demo on a second trial; reuses the `prompt` template defined earlier.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# Build one prompt (instruction + input, empty response slot), tokenize it as a
# batch of one and move the tensors to the GPU.
inputs = tokenizer(
[
    prompt.format(
        """Task: Extract vaccine adjuvant names from the provided clinical trial data. Each line of the input includes a unique identifier (NCT Number) and a study summary of the clinical trial, separated by a tab. Your task is to identify any mention of adjuvants in the data and pair it with the corresponding NCT Number. The output format is provided below.

```
Output Format:
Produce a TSV (tab-separated values) output file with the two following columns:
* NCT Number: The unique identifier for each trial.
* Adjuvant Name: The name of the adjuvant mentioned in the provided clinical trial information.
If multiple adjuvants are mentioned, list each adjuvant in a separate output line. The output should not contain any blank columns. The output should include a line with 'Done' at the end.
```""", # instruction
        """NCT Number	Study Summary
NCT03359239	Study Title: Atezolizumab Given in Combination With a Personalized Vaccine in Patients With Urothelial Cancer. Brief Summary: The purpose of this study is to determine the good and bad effects of atezolizumab given in combination with a personalized cancer vaccine in patients with urothelial cancer either after surgery to remove organ where the tumor arose (for example, removal of the bladder) or for urothelial cancer that has spread to other organs. Interventions: DRUG: Atezolizumab | BIOLOGICAL: PGV001 | DRUG: Poly ICLC | DRUG: Normal saline.""", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
# The streamer is handed to generate() so the text is emitted while it is being
# produced; the returned token ids are discarded (assigned to `_`).
text_streamer = TextStreamer(tokenizer)
# Allow up to 128 new tokens for this example.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)