In [1]:
!pip install wandb -q
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mahmadhakimiadnan[0m ([33mahmadhakimiadnan-other[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
import torch
print(torch.cuda.get_device_name(0))  # Should show "RTX 4070"
print(torch.cuda.get_device_properties(0).total_memory / 1e9)  # Should show ~12GB

NVIDIA GeForce RTX 4070 Ti
12.479430656


In [3]:
torch.cuda.empty_cache()

In [1]:
import os
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig
import wandb

wandb.init(project="fine-tuning-llms")

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
print("model is loaded")

[34m[1mwandb[0m: Currently logged in as: [33mahmadhakimiadnan[0m ([33mahmadhakimiadnan-other[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


2025-08-20 11:49:13.527363: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-20 11:49:13.533746: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755661753.541266 3763559 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755661753.543471 3763559 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-20 11:49:13.551921: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

model is loaded


In [2]:
for name, param in model.named_parameters():
    print(name, param.shape)

model.embed_tokens.weight torch.Size([128256, 2048])
model.layers.0.self_attn.q_proj.weight torch.Size([2048, 2048])
model.layers.0.self_attn.k_proj.weight torch.Size([512, 2048])
model.layers.0.self_attn.v_proj.weight torch.Size([512, 2048])
model.layers.0.self_attn.o_proj.weight torch.Size([2048, 2048])
model.layers.0.mlp.gate_proj.weight torch.Size([8192, 2048])
model.layers.0.mlp.up_proj.weight torch.Size([8192, 2048])
model.layers.0.mlp.down_proj.weight torch.Size([2048, 8192])
model.layers.0.input_layernorm.weight torch.Size([2048])
model.layers.0.post_attention_layernorm.weight torch.Size([2048])
model.layers.1.self_attn.q_proj.weight torch.Size([2048, 2048])
model.layers.1.self_attn.k_proj.weight torch.Size([512, 2048])
model.layers.1.self_attn.v_proj.weight torch.Size([512, 2048])
model.layers.1.self_attn.o_proj.weight torch.Size([2048, 2048])
model.layers.1.mlp.gate_proj.weight torch.Size([8192, 2048])
model.layers.1.mlp.up_proj.weight torch.Size([8192, 2048])
model.layers.1.

In [3]:
#freeze all parameters first
for param in model.parameters():
    param.requires_grad = False

#unfreeze top n layers + lm_head
n_layers = 2  
layers = model.model.layers

for layer in layers[-n_layers:]:
    for param in layer.parameters():
        param.requires_grad = True

#unfreeze final lm_head
for param in model.lm_head.parameters():
    param.requires_grad = True

#print trainable parameters
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,} | Total params: {total:,} ({100 * trainable/total:.4f}%)")
wandb.log({"trainable_params": trainable, "total_params": total})


Trainable params: 384,311,296 | Total params: 1,235,814,400 (31.0978%)


In [4]:
for name, param in model.lm_head.named_parameters():
    print(name, param.shape, param.dtype, param.requires_grad)

weight torch.Size([128256, 2048]) torch.float32 True


In [5]:
class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} | all params: {all_param} | trainable%: {100 * trainable_params / all_param:.2f}")

print_trainable_parameters(model)

trainable params: 384311296 | all params: 1235814400 | trainable%: 31.10


In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

df = pd.read_csv("CMNEE.csv")
df = df[df['instruction'].notnull() & df['new_output'].notnull()]

if 'input' not in df.columns:
    df['input'] = ''

#split
train_df, val_df = train_test_split(df, test_size=0.15, random_state=42)

#HF DatasetDict
data = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "validation": Dataset.from_pandas(val_df, preserve_index=False),
    "test": Dataset.from_pandas(val_df.copy(), preserve_index=False)
})

In [2]:
#prompt
def create_prompt(example):
    prompt = f"Instruction:\n{example['instruction']}\n\n"
    if example['input']:
        prompt += f"Input:\n{example['input']}\n\n"
    prompt += f"Response:\n{example['new_output']}"
    example["prompt"] = prompt
    return example

data = data.map(create_prompt)

Map:   0%|          | 0/2547 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

In [3]:
from transformers import AutoTokenizer
#tokenization
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example):
    tokens = tokenizer(
        example["prompt"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_data = data.map(tokenize_function, batched=True)
print(tokenized_data)

Map:   0%|          | 0/2547 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'new_output', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2547
    })
    validation: Dataset({
        features: ['instruction', 'input', 'output', 'new_output', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 450
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'new_output', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 450
    })
})


In [9]:
from datetime import datetime
import transformers
import os
import gc

base_model_name = "Llama3.21B"
run_name = "TopLayer"
output_dir = "./" + run_name

#training Arguments with Efficient Checkpoint Management
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    prediction_loss_only=True,
    num_train_epochs=3,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    output_dir=output_dir,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=50,
    save_total_limit=1,
    eval_strategy="steps",
    eval_steps=50,
    do_eval=True,
    report_to='wandb',
    run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
)

#custom Callback to Clean Up Old Checkpoints
class CleanupCallback(transformers.TrainerCallback):
    def on_save(self, args, state, control, **kwargs):
        # Keep only the 3 most recent checkpoints (adjust as needed)
        checkpoints = []
        for entry in os.listdir(args.output_dir):
            if entry.startswith("checkpoint-"):
                checkpoints.append(entry)
        checkpoints.sort(key=lambda x: int(x.split("-")[1]))
        
        # Delete all but the newest checkpoint
        for checkpoint in checkpoints[:-1]:
            os.system(f"rm -rf {os.path.join(args.output_dir, checkpoint)}")

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    callbacks=[CleanupCallback()]  
)

model.config.use_cache = False
torch.backends.cuda.enable_flash_sdp(True)

In [10]:
trainer.train()
del trainer
del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
torch.cuda.reset_accumulated_memory_stats()
wandb.finish()

Step,Training Loss,Validation Loss
50,2.7121,2.71711
100,2.5104,2.627136
150,2.2645,2.522175
200,2.3491,2.31909
250,2.1518,2.181108
300,2.0608,2.076246
350,2.0221,1.889973
400,1.6942,1.771827
450,1.5693,1.644646
500,1.4763,1.506786


0,1
eval/loss,██▇▇▆▆▅▅▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▂▃▄▃▄▄▄▄▄▄▅▅▅▅▄▄█▄▅▅▄▅▆▅▅▆▆▆▆▆▆▅▆▆█▆▅
eval/samples_per_second,█▇▆▅▆▅▅▅▅▅▅▄▄▄▄▅▅▁▅▄▄▅▄▃▄▄▃▃▃▃▃▃▄▃▃▁▃▄
eval/steps_per_second,█▇▆▅▆▅▅▅▅▅▅▄▄▄▄▅▅▁▅▄▄▅▄▃▄▄▃▃▃▃▃▃▄▃▃▁▃▄
total_params,▁
train/epoch,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇██
train/global_step,▁▁▁▁▁▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇███
train/grad_norm,▅▄▅▄▄▃▃▃▃▃▃▃▃▂▃▃▃▂▂▂▂▃▂▂▂▁█▁▁▁▁▂▂▂▁▁▁▁▂▁
train/learning_rate,▂████▇▇▇▇▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,███▇▇▇▇▆▆▆▅▅▄▅▄▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁

0,1
eval/loss,0.4181
eval/runtime,24.6601
eval/samples_per_second,18.248
eval/steps_per_second,18.248
total_flos,2.2842806605185024e+16
total_params,1235814400.0
train/epoch,3.0
train/global_step,1911.0
train/grad_norm,2.34682
train/learning_rate,0.0


In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr

base_model_name = "meta-llama/Llama-3.2-1B"
fine_tuned_checkpoint = "TopLayer/checkpoint-480"  #fine-tuned model checkpoint folder

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

#load your fine-tuned model checkpoint
model = AutoModelForCausalLM.from_pretrained(
    fine_tuned_checkpoint,
    torch_dtype=torch.float16,
    device_map="auto"
)

model.eval()

def build_prompt(instruction, input_text=""):
    prompt = f"Instruction:\n{instruction.strip()}\n"
    if input_text.strip():
        prompt += f"\nInput:\n{input_text.strip()}\n"
    prompt += "\nResponse:\n"
    return prompt

def generate_text(instruction, input_text):
    prompt = build_prompt(instruction, input_text)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            do_sample=True
        )
    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return full_output.split("Response:")[-1].strip()

interface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Instruction", lines=2, placeholder="e.g. What is the best team in London?"),
        gr.Textbox(label="Input (optional)", lines=2, placeholder="Add context if needed"),
    ],
    outputs=gr.Textbox(label="Model Response"),
    title="Llama 3.2 1B Top Layer Fine-tuned",
    description="Interact with the LLaMA 3.2 1B model fine-tuned by updating only the top layer weights."
)

interface.launch(share=True)

Some weights of the model checkpoint at TopLayer/checkpoint-480 were not used when initializing LlamaForCausalLM: ['lm_head.0.weight']
- This IS expected if you are initializing LlamaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://2ebade82089960b531.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [4]:
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
import torch
import logging
from transformers import logging as hf_logging

from transformers import AutoTokenizer, AutoModelForCausalLM

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import nltk
nltk.download('punkt')

base_model = "meta-llama/Llama-3.2-1B"
fine_tuned_checkpoint = "TopLayer/checkpoint-1911"  # path to your fine-tuned model folder

tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    fine_tuned_checkpoint,
    torch_dtype=torch.float16,
    device_map="auto"
)
model.eval()

logging.getLogger().setLevel(logging.ERROR)
hf_logging.set_verbosity_error()

# evaluation lists
true_answers = []
predicted_answers = []

for sample in tqdm(data["test"]):
    instruction = sample["instruction"]
    input_text = sample.get("input", "")
    true_output = sample["new_output"]

    prompt = f"Instruction:\n{instruction.strip()}\n"
    if input_text and input_text.strip():
        prompt += f"\nInput:\n{input_text.strip()}\n"
    prompt += "\nResponse:\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=128,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            do_sample=False
        )
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    predicted_response = generated_text.split("Response:")[-1].strip()

    predicted_answers.append(predicted_response)
    true_answers.append(true_output.strip())

# F1 Score
true_token_sets = [set(ans.lower().split()) for ans in true_answers]
pred_token_sets = [set(ans.lower().split()) for ans in predicted_answers]

all_tokens = list(set().union(*true_token_sets, *pred_token_sets))
mlb = MultiLabelBinarizer(classes=all_tokens)
y_true_bin = mlb.fit_transform(true_token_sets)
y_pred_bin = mlb.transform(pred_token_sets)

f1 = f1_score(y_true_bin, y_pred_bin, average="macro")
print(f"\nMacro F1 Score on validation set: {f1:.4f}")

# BERTScore
from bert_score import score as bert_score
print("\nCalculating BERTScore...")

P = predicted_answers
R = true_answers

P_scores, R_scores, F1_scores = bert_score(P, R, lang='en', verbose=True)
average_bert_f1 = F1_scores.mean().item()
print(f"Average BERTScore F1 on validation set: {average_bert_f1:.4f}")

# BLEU Score
print("\nCalculating BLEU Score...")

smoothie = SmoothingFunction().method4
bleu_scores = []
for ref, pred in zip(true_answers, predicted_answers):
    ref_tokens = [nltk.word_tokenize(ref.lower())]
    pred_tokens = nltk.word_tokenize(pred.lower())
    score = sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoothie)
    bleu_scores.append(score)

average_bleu = sum(bleu_scores) / len(bleu_scores)
print(f"Average BLEU Score on validation set: {average_bleu:.4f}")

# ROUGE Scores
print("\nCalculating ROUGE Scores...")

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge1_list, rouge2_list, rougeL_list = [], [], []

for ref, pred in zip(true_answers, predicted_answers):
    scores = scorer.score(ref, pred)
    rouge1_list.append(scores['rouge1'].fmeasure)
    rouge2_list.append(scores['rouge2'].fmeasure)
    rougeL_list.append(scores['rougeL'].fmeasure)

avg_rouge1 = sum(rouge1_list) / len(rouge1_list)
avg_rouge2 = sum(rouge2_list) / len(rouge2_list)
avg_rougeL = sum(rougeL_list) / len(rougeL_list)

print(f"Average ROUGE-1 F1: {avg_rouge1:.4f}")
print(f"Average ROUGE-2 F1: {avg_rouge2:.4f}")
print(f"Average ROUGE-L F1: {avg_rougeL:.4f}")

[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2025-08-20 12:32:48.711745: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-20 12:32:48.718285: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755664368.725143 3776972 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755664368.727130 3776972 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-20 12:32:48.734123:


Macro F1 Score on validation set: 0.0628

Calculating BERTScore...
calculating scores...
computing bert embedding.


  0%|          | 0/14 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/8 [00:00<?, ?it/s]

done in 1.91 seconds, 235.63 sentences/sec
Average BERTScore F1 on validation set: 0.8230

Calculating BLEU Score...
Average BLEU Score on validation set: 0.0429

Calculating ROUGE Scores...




Average ROUGE-1 F1: 0.1360
Average ROUGE-2 F1: 0.0797
Average ROUGE-L F1: 0.1343
