In [None]:
!pip install -q transformers accelerate peft trl datasets evaluate tensorboard bitsandbytes

In [17]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig
import torch
import evaluate
from datetime import datetime
import os
from sklearn.metrics import accuracy_score
import numpy as np
from huggingface_hub import login, create_repo


In [18]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## 🔐 Load Tokenizer and Model

In [20]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # Requires HF access

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="cuda", # auto,
    use_auth_token=True,
)
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 50.12 MiB is free. Process 6609 has 14.69 GiB memory in use. Of the allocated memory 14.43 GiB is allocated by PyTorch, and 126.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
model.gradient_checkpointing_enable()

## 📦 Load and Tokenize Dataset

In [None]:
dataset = load_dataset("kuyesu22/multilingual_instruction_tuning")
dataset = dataset["train"].train_test_split(test_size=0.05, seed=42)

def generate_prompt(example):
    return f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n{example['output']}"

def tokenize(example):
    full_prompt = generate_prompt(example)
    tokenized = tokenizer(full_prompt, truncation=True, padding="max_length", max_length=512)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_dataset = dataset.map(tokenize, remove_columns=dataset["train"].column_names)

README.md:   0%|          | 0.00/391 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/8.59M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/149658 [00:00<?, ? examples/s]

Map:   0%|          | 0/142175 [00:00<?, ? examples/s]

Map:   0%|          | 0/7483 [00:00<?, ? examples/s]

## 🧠 Training Setup

In [None]:
report_to="tensorboard",
output_dir = "mldod-multilingual-translation"
log_dir = os.path.join(output_dir, "logs", datetime.now().strftime("%Y%m%d-%H%M%S"))


In [None]:
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Reduce to 1
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,  # Simulates batch size of 8
    # per_device_train_batch_size=4,
    # per_device_eval_batch_size=4,
    eval_strategy="steps",  # <-- switch to steps for frequent evals
    eval_steps=100,               # <-- evaluate every 100 steps
    save_strategy="steps",
    save_steps=100,
    logging_dir=log_dir,
    logging_strategy="steps",
    logging_steps=20,             # <-- log every 20 steps
    load_best_model_at_end=True,
    report_to=["tensorboard"],
    warmup_steps=10,
    save_total_limit=2,
    fp16=True,
    push_to_hub=True,
    hub_model_id="kuyesu22/mldod-multilingual-translation",
    hub_strategy="every_save"
)


In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

## 👨‍🏫 Compute Custom Metrics

In [None]:

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    accuracy = accuracy_score(decoded_labels, decoded_preds)
    return {"accuracy": accuracy}


## 🏋️ Start Training

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics  # <-- add this
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.94 GiB. GPU 0 has a total capacity of 14.74 GiB of which 2.51 GiB is free. Process 6609 has 12.23 GiB memory in use. Of the allocated memory 10.69 GiB is allocated by PyTorch, and 1.41 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## 📊 Evaluation

In [None]:
predictions = trainer.predict(tokenized_dataset["test"])
preds = tokenizer.batch_decode(torch.argmax(torch.tensor(predictions.predictions), dim=-1), skip_special_tokens=True)
refs = tokenizer.batch_decode(predictions.label_ids, skip_special_tokens=True)

## 🧪 Compute Metrics

In [None]:
bleu = evaluate.load("bleu")
roc_auc = evaluate.load("roc_auc")
accuracy = evaluate.load("accuracy")

# Clean labels and predictions
clean_preds = [pred.strip() for pred in preds]
clean_refs = [ref.strip() for ref in refs]

# BLEU
bleu_score = bleu.compute(predictions=clean_preds, references=[[ref] for ref in clean_refs])
print("BLEU:", bleu_score)

# Accuracy (simplified)
accuracy_score = accuracy.compute(predictions=clean_preds, references=clean_refs)
print("Accuracy:", accuracy_score)

# ROCAUC (Note: works better for binary classification tasks)
# This step will only work if your output is binary (e.g., 0 or 1). Skip if not relevant.
try:
    bin_preds = [1 if "yes" in pred.lower() else 0 for pred in clean_preds]
    bin_refs = [1 if "yes" in ref.lower() else 0 for ref in clean_refs]
    auc_score = roc_auc.compute(prediction_scores=bin_preds, references=bin_refs)
    print("ROC-AUC:", auc_score)
except Exception as e:
    print("ROC-AUC could not be computed:", str(e))

{'instruction': 'Translate to English',
 'input': 'Jani la mahindi lina manjano hata hivo rangi yake na hudhururungi kwa mbingi',
 'output': 'The maize leaf is yellow however its colour tends to brown at the base.',
 'language': 'swahili'}

## 📈 View TensorBoard

In [None]:
# Run in terminal or notebook:
!tensorboard --logdir=llama3-multilingual-finetune/logs

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…