In [1]:
!pip install wandb -q
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mahmadhakimiadnan[0m ([33mahmadhakimiadnan-other[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
import bitsandbytes as bnb
print(bnb.__version__)  # Should be 0.41.3

0.46.0


In [2]:
import os
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig
import wandb

wandb.init(project="fine-tuning-llms")

#available device
print(f"Available devices: {torch.cuda.device_count()} GPUs")
print(f"Current device: {torch.cuda.current_device()}")

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",  
    torch_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
print("model is loaded")

Available devices: 2 GPUs
Current device: 0


2025-08-21 22:59:05.854567: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-21 22:59:06.162795: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755788346.303128 3370446 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755788346.335605 3370446 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-21 22:59:06.591575: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

model is loaded


In [3]:
for param in model.parameters():
    param.requires_grad = False
    if param.ndim == 1:
        param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x.to(torch.bfloat16)).to(torch.float32)

model.lm_head = CastOutputToFloat(model.lm_head)

def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params}  all params: {all_param}  trainable%: {100 * trainable_params / all_param}")
    wandb.log({
        "trainable_params": trainable_params,
        "total_params": all_param,
        "trainable_percentage": 100 * trainable_params / all_param
    })
    
print_trainable_parameters(model)

trainable params: 0  all params: 3212749824  trainable%: 0.0


In [4]:
#check layers in the model
for name, module in model.named_modules():
    if 'attn' in name or 'attention' in name:
        print(name)
        for sub_name, sub_module in module.named_modules():
            print(f"  - {sub_name}")

model.layers.0.self_attn
  - 
  - q_proj
  - k_proj
  - v_proj
  - o_proj
model.layers.0.self_attn.q_proj
  - 
model.layers.0.self_attn.k_proj
  - 
model.layers.0.self_attn.v_proj
  - 
model.layers.0.self_attn.o_proj
  - 
model.layers.0.post_attention_layernorm
  - 
model.layers.1.self_attn
  - 
  - q_proj
  - k_proj
  - v_proj
  - o_proj
model.layers.1.self_attn.q_proj
  - 
model.layers.1.self_attn.k_proj
  - 
model.layers.1.self_attn.v_proj
  - 
model.layers.1.self_attn.o_proj
  - 
model.layers.1.post_attention_layernorm
  - 
model.layers.2.self_attn
  - 
  - q_proj
  - k_proj
  - v_proj
  - o_proj
model.layers.2.self_attn.q_proj
  - 
model.layers.2.self_attn.k_proj
  - 
model.layers.2.self_attn.v_proj
  - 
model.layers.2.self_attn.o_proj
  - 
model.layers.2.post_attention_layernorm
  - 
model.layers.3.self_attn
  - 
  - q_proj
  - k_proj
  - v_proj
  - o_proj
model.layers.3.self_attn.q_proj
  - 
model.layers.3.self_attn.k_proj
  - 
model.layers.3.self_attn.v_proj
  - 
model.layers.3

In [5]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r = 32,
    lora_alpha = 64,
    target_modules = ["q_proj","k_proj", "v_proj", "o_proj"],
    lora_dropout = 0.05,
    bias = "none",
    task_type = "CAUSAL_LM"
)

lora_model = get_peft_model(model, config)
print_trainable_parameters(lora_model)

trainable params: 18350080  all params: 3231099904  trainable%: 0.56792053929633


In [4]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

df = pd.read_csv("CMNEE.csv")
df = df[df['instruction'].notnull() & df['new_output'].notnull()]

if 'input' not in df.columns:
    df['input'] = ''

#split
train_df, val_df = train_test_split(df, test_size=0.15, random_state=42)

#HF DatasetDict
data = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "validation": Dataset.from_pandas(val_df, preserve_index=False),
    "test": Dataset.from_pandas(val_df.copy(), preserve_index=False)
})

In [5]:
#prompt
def create_prompt(example):
    prompt = f"Instruction:\n{example['instruction']}\n\n"
    if example['input']:
        prompt += f"Input:\n{example['input']}\n\n"
    prompt += f"Response:\n{example['new_output']}"
    example["prompt"] = prompt
    return example

data = data.map(create_prompt)

Map:   0%|          | 0/2547 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

In [6]:
from transformers import AutoTokenizer
#tokenization
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example):
    tokens = tokenizer(
        example["prompt"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_data = data.map(tokenize_function, batched=True)
print(tokenized_data)

Map:   0%|          | 0/2547 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'new_output', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2547
    })
    validation: Dataset({
        features: ['instruction', 'input', 'output', 'new_output', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 450
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'new_output', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 450
    })
})


In [9]:
#training setup
from datetime import datetime
import transformers

base_model_name = "Llama3.23B"
run_name = "QLoRA"
output_dir = "./" + run_name

trainer = transformers.Trainer(
    model=lora_model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2,
        num_train_epochs=4,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        warmup_steps=100,
        max_steps=-1,
        learning_rate=1e-4,
        bf16=True,
        logging_steps=10,
        output_dir=output_dir, 
        logging_dir="./logs",
        save_strategy="steps",
        save_steps=50,
        eval_strategy="steps",
        eval_steps=50,
        do_eval=True,
        report_to='wandb',
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

model.config.use_cache = False

In [10]:
trainer.train()
wandb.finish()



Step,Training Loss,Validation Loss
50,2.6929,2.67823
100,2.5896,2.549751
150,2.3422,2.493921
200,2.4498,2.427639
250,2.3365,2.364314
300,2.2969,2.319451
350,2.0846,2.286154
400,2.0535,2.247504
450,2.0353,2.212166
500,2.0978,2.181168




0,1
eval/loss,█▇▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁
eval/runtime,▁▇▇▇▇██▇█▇▇█▇▇▇▇▇▇▇▇▇▇▇▇▇
eval/samples_per_second,█▂▂▂▂▁▁▂▁▂▂▁▂▂▂▂▂▂▂▂▂▂▂▂▂
eval/steps_per_second,█▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁
total_params,▁█
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
train/grad_norm,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▄▄▃▃▄▄▄▄▆▅▅▅▇▆▆▆▇▇▇▇█▇▇▇█
train/learning_rate,▂▄▇████▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▁▁▁▁
train/loss,█▇▆▆▆▅▅▅▅▅▄▄▄▄▄▄▄▄▃▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁

0,1
eval/loss,1.82867
eval/runtime,45.5205
eval/samples_per_second,9.886
eval/steps_per_second,1.252
total_flos,8.8794160265429e+16
total_params,3231099904.0
train/epoch,4.0
train/global_step,1276.0
train/grad_norm,3.00452
train/learning_rate,0.0


In [4]:
#human evaluation
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import gradio as gr

base_model_name = "meta-llama/Llama-3.2-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

base = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

#adapter
model = PeftModel.from_pretrained(base, "QLoRA/checkpoint-1276")
model.eval()

def build_prompt(instruction, input_text=""):
    prompt = f"Instruction:\n{instruction.strip()}\n"
    if input_text.strip():
        prompt += f"\nInput:\n{input_text.strip()}\n"
    prompt += "\nResponse:\n"
    return prompt

def generate_text(instruction, input_text):
    prompt = build_prompt(instruction, input_text)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            do_sample=True
        )
    # Only return the part after "Response:"
    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return full_output.split("Response:")[-1].strip()

interface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Instruction", lines=2, placeholder="e.g. What is the best team in London?"),
        gr.Textbox(label="Input (optional)", lines=2, placeholder="Add context if needed"),
    ],
    outputs=gr.Textbox(label="Model Response"),
    title="Llama 3.2 3B Fined Tune",
    description="Interact with the baseline LLaMA 3.2 3B model after fine-tuning by QLoRA."
)

interface.launch(share=True)

2025-08-19 13:13:03.211366: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-19 13:13:03.217862: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755580383.224611 3334930 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755580383.226339 3334930 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-19 13:13:03.232681: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://c8111a508218f64501.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [7]:
#numerical evaluation
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
import torch
import logging
from transformers import logging as hf_logging

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel


from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import nltk
nltk.download('punkt')

base_model = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(base_model)
base = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.float16, device_map="auto")
model = PeftModel.from_pretrained(base, "QLoRA/checkpoint-1276")
model.eval()

logging.getLogger().setLevel(logging.ERROR)
hf_logging.set_verbosity_error()

#evaluation
true_answers = []
predicted_answers = []

for sample in tqdm(data["test"]):
    instruction = sample["instruction"]
    input_text = sample["input"]
    true_output = sample["new_output"]

    prompt = f"Instruction:\n{instruction.strip()}\n"
    if input_text is not None and input_text.strip():
        prompt += f"\nInput:\n{input_text.strip()}\n"
    prompt += "\nResponse:\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=128,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            do_sample=False
        )
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    predicted_response = generated_text.split("Response:")[-1].strip()

    predicted_answers.append(predicted_response)
    true_answers.append(true_output.strip())
    
#f1
true_token_sets = [set(ans.lower().split()) for ans in true_answers]
pred_token_sets = [set(ans.lower().split()) for ans in predicted_answers]

all_tokens = list(set().union(*true_token_sets, *pred_token_sets))
mlb = MultiLabelBinarizer(classes=all_tokens)
y_true_bin = mlb.fit_transform(true_token_sets)
y_pred_bin = mlb.transform(pred_token_sets)

f1 = f1_score(y_true_bin, y_pred_bin, average="macro")
print(f"\nMacro F1 Score on validation set: {f1:.4f}")

#BERTScore
from bert_score import score as bert_score
print("\nCalculating BERTScore...")

P = predicted_answers
R = true_answers

P_scores, R_scores, F1_scores = bert_score(P, R, lang='en', verbose=True)
average_bert_f1 = F1_scores.mean().item()
print(f"Average BERTScore F1 on validation set: {average_bert_f1:.4f}")

#BLEU
print("\nCalculating BLEU Score...")

smoothie = SmoothingFunction().method4
bleu_scores = []
for ref, pred in zip(true_answers, predicted_answers):
    ref_tokens = [nltk.word_tokenize(ref.lower())]
    pred_tokens = nltk.word_tokenize(pred.lower())
    score = sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoothie)
    bleu_scores.append(score)

average_bleu = sum(bleu_scores) / len(bleu_scores)
print(f"Average BLEU Score on validation set: {average_bleu:.4f}")

#ROUGE
print("\nCalculating ROUGE Scores...")

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge1_list, rouge2_list, rougeL_list = [], [], []

for ref, pred in zip(true_answers, predicted_answers):
    scores = scorer.score(ref, pred)
    rouge1_list.append(scores['rouge1'].fmeasure)
    rouge2_list.append(scores['rouge2'].fmeasure)
    rougeL_list.append(scores['rougeL'].fmeasure)

avg_rouge1 = sum(rouge1_list) / len(rouge1_list)
avg_rouge2 = sum(rouge2_list) / len(rouge2_list)
avg_rougeL = sum(rougeL_list) / len(rougeL_list)

print(f"Average ROUGE-1 F1: {avg_rouge1:.4f}")
print(f"Average ROUGE-2 F1: {avg_rouge2:.4f}")
print(f"Average ROUGE-L F1: {avg_rougeL:.4f}")

2025-08-22 00:49:03.737545: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-22 00:49:03.743393: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755794943.749495  295744 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755794943.751219  295744 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-22 00:49:03.757350: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 450/450 [12:25<00:00,  1.66s/it]



Macro F1 Score on validation set: 0.0797

Calculating BERTScore...
calculating scores...
computing bert embedding.


  0%|          | 0/14 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/8 [00:00<?, ?it/s]

done in 1.57 seconds, 286.07 sentences/sec
Average BERTScore F1 on validation set: 0.8587

Calculating BLEU Score...
Average BLEU Score on validation set: 0.0953

Calculating ROUGE Scores...
Average ROUGE-1 F1: 0.2686
Average ROUGE-2 F1: 0.1993
Average ROUGE-L F1: 0.2680
