In [1]:
# %% [colab] Install libraries
!pip -q install -U "transformers>=4.41.0" "datasets>=2.19.0" "peft>=0.11.0" "bitsandbytes>=0.43.1" accelerate sentencepiece


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m92.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m69.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# %% [colab] Upload good.jsonl from your computer
from google.colab import files
uploaded = files.upload()  # choose good.jsonl
DATA_FILE = next(iter(uploaded.keys()))
print("Using:", DATA_FILE)



Saving good.jsonl to good (1).jsonl
Using: good (1).jsonl


In [3]:
# %% [colab] Config & helpers
import os, torch
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer,
    BitsAndBytesConfig, DataCollatorForLanguageModeling
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

# Reduce CUDA fragmentation on Colab
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MODEL_SAVE_PATH = "/content/tinyllama-finetuned-qlora"

# Keep memory in check on 16 GB GPUs
MAX_LENGTH = 2048         # try 512 first; go 768/1024 if VRAM allows
BATCH_SIZE = 1            # keep at 1 for T4; use 2+ only if you have headroom
GRAD_ACCUM = 16           # effective batch size = BATCH_SIZE * GRAD_ACCUM

# DType selection: bf16 on L4/A100 etc., else fp16
bf16_ok = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
compute_dtype = torch.bfloat16 if bf16_ok else torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU",
      "| bf16:", bf16_ok, "| compute dtype:", compute_dtype)


GPU: Tesla T4 | bf16: False | compute dtype: torch.float16


In [4]:
# %% [colab] Load tokenizer & dataset
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# make sure we have a pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.truncation_side = "left"  # keep the tail (usually includes the answer)

raw = load_dataset("json", data_files=DATA_FILE)

def preprocess(ex):
    # simple concat (matches your original script)
    text = (ex.get("system", "") + "\n" +
            ex.get("input", "")  + "\n" +
            ex.get("output", ""))
    enc = tokenizer(
        text,
        truncation=True,
        max_length=MAX_LENGTH,
        add_special_tokens=True,   # adds BOS/EOS for llama-style tokenizers
        padding=False              # dynamic padding via collator
    )
    # causal LM labels (next-token prediction over whole sequence)
    enc["labels"] = enc["input_ids"].copy()
    return enc

print("Tokenizing…")
ds = raw.map(preprocess, remove_columns=raw["train"].column_names)
print(ds)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Tokenizing…


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})


In [5]:
# %% [colab] QLoRA model
print("Loading 4-bit base model…")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)

# prepare for k-bit training and add LoRA adapters
base_model = prepare_model_for_kbit_training(base_model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],  # good set for LLaMA
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

# During training, cache should be off to save VRAM
model.config.use_cache = False


Loading 4-bit base model…


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

trainable params: 6,307,840 || all params: 1,106,356,224 || trainable%: 0.5701


In [6]:
# %% [colab] Trainer setup
from transformers import set_seed
set_seed(42)

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir=MODEL_SAVE_PATH,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=3,                # start small; increase if needed
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=20,
    save_strategy="epoch",
    report_to="none",
    fp16=not bf16_ok,
    bf16=bf16_ok,
    optim="paged_adamw_8bit",          # bitsandbytes optimizer
    dataloader_pin_memory=False,       # helps with Colab sometimes
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    data_collator=collator,
)

print("Starting training…")
trainer.train()
print("✅ Training finished.")


Starting training…


  return fn(*args, **kwargs)


Step,Training Loss
20,0.4342
40,0.1071
60,0.0939
80,0.0916
100,0.0923
120,0.0921
140,0.0905
160,0.0885
180,0.0895
200,0.0905


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


✅ Training finished.


In [1]:
# %% [colab] Save LoRA adapter & tokenizer
model.save_pretrained(MODEL_SAVE_PATH, safe_serialization=True)
tokenizer.save_pretrained(MODEL_SAVE_PATH)
print("Saved at:", MODEL_SAVE_PATH)


NameError: name 'model' is not defined