In [None]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
FOLDERNAME = "Colab\ Notebooks/fetch-data"
%cd drive/MyDrive/$FOLDERNAME

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/WeHelp/Final Project/Datasets


In [None]:
# Dependencies: uncomment on a fresh runtime (%pip installs into the kernel's environment).
# %pip install -q transformers peft bitsandbytes accelerate datasets

In [None]:
%%bash
# Log in to the Hugging Face Hub from a shell subprocess.
# An HF_TOKEN that is already present in the environment (e.g. exported via
# os.environ in an earlier cell) takes precedence, so the real token never has
# to be written into the notebook. Otherwise the "MY_TOKEN" placeholder is
# used and must be replaced with a valid token.
export HF_TOKEN="${HF_TOKEN:-MY_TOKEN}"
# Quote the expansion so the token is always passed as a single argument.
huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential

[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m


Token is valid (permission: fineGrained).
The token `llama-2` has been saved to /root/.cache/huggingface/stored_tokens
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
import numpy as np
import numpy.core.multiarray as _multiarray

# Resolve the genuine array constructor. If this cell has already been run,
# `_multiarray.array` is our wrapper, so unwrap it via `__wrapped__`; binding
# `_carray` to the wrapper itself would make the wrapper call itself and end
# in a RecursionError on the first np.array() call after a re-run.
_carray = getattr(_multiarray.array, "__wrapped__", _multiarray.array)


def _patched_array(*args, **kwargs):
    """Call the original array constructor, ignoring `copy` and `subok`.

    All positional arguments and every other keyword argument are forwarded
    unchanged; the `copy` and `subok` keywords are silently discarded.

    Returns:
        Whatever the original `array` constructor returns for the remaining
        arguments.
    """
    kwargs.pop("copy", None)
    kwargs.pop("subok", None)
    return _carray(*args, **kwargs)


# Remember the wrapped original so a later re-execution can find it again.
_patched_array.__wrapped__ = _carray

# Install the wrapper in both places the original cell patched.
np.array = _patched_array
_multiarray.array = _patched_array

In [None]:
import json
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Load the JSONL training records (one JSON document per non-blank line)
# and wrap them in a Hugging Face Dataset.
with open("lora_data.jsonl", "r", encoding="utf-8") as f:
    stripped_lines = (raw.strip() for raw in f)
    data = [json.loads(entry) for entry in stripped_lines if entry]
dataset = Dataset.from_list(data)

# Hold out 10% of the records; the fixed seed keeps the split reproducible.
ds = dataset.train_test_split(test_size=0.1, seed=42)

# Load the tokenizer and the 4-bit quantized Gemma 2B base model
MODEL_NAME = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# Only borrow EOS as the padding token when the tokenizer has no pad token of
# its own. Overwriting an existing pad token with EOS makes
# DataCollatorForLanguageModeling (which sets labels to -100 wherever
# input_ids == pad_token_id) mask every genuine EOS token out of the loss too.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# 4-bit NF4 weights with double quantization; matmuls are computed in bfloat16.
# NOTE(review): TrainingArguments below uses fp16=True while the compute dtype
# here is bfloat16 -- confirm the mix is intended for the target GPU.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# device_map="auto" leaves device placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=quant_config
)

def tokenize_fn(batch):
    """Tokenize a batch of prompt/completion pairs for causal-LM training.

    Each prompt and its completion are stripped of surrounding whitespace and
    joined with a single space. The joined text is tokenized with truncation
    and padding to a fixed length of 512, and a copy of each `input_ids` row
    is stored under `labels`.

    Args:
        batch: mapping with parallel "prompt" and "completion" sequences of
            strings.

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    joined = []
    for prompt, completion in zip(batch["prompt"], batch["completion"]):
        joined.append(f"{prompt.strip()} {completion.strip()}")

    encoded = tokenizer(
        joined,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    # Independent copies, so later edits to labels never alias input_ids.
    encoded["labels"] = [list(row) for row in encoded["input_ids"]]
    return encoded

# Tokenize both splits in batches, dropping the raw text columns, then have
# the result return torch tensors.
mapped_splits = ds.map(
    tokenize_fn,
    batched=True,
    remove_columns=["prompt", "completion"],
)
tokenized = mapped_splits.with_format("torch")

# Describe the LoRA adapter: rank-16 updates on the q_proj / v_proj modules,
# with no bias parameters trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)
# Prepare the quantized base model for k-bit training, then attach the adapter.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

# Data collator & TrainingArguments
# mlm=False: no masked-language-model masking is applied by the collator;
# batches are padded to a multiple of 8.
data_collator = DataCollatorForLanguageModeling(
    tokenizer, mlm=False, pad_to_multiple_of=8
)
training_args = TrainingArguments(
    output_dir="lora_gemma2_resume",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 sequences per optimizer step per device
    num_train_epochs=3,
    learning_rate=4e-4,
    logging_steps=10,
    save_steps=50,
    eval_strategy="no",  # no evaluation is scheduled during training
    fp16=True,
    optim="paged_adamw_8bit",
    save_total_limit=2,  # keep only the two most recent checkpoints
    report_to="none",
    # Trainer cannot infer the label columns through the PEFT wrapper (it warns
    # "No label_names provided for model class `PeftModelForCausalLM`" and falls
    # back to an empty list), so name them explicitly.
    label_names=["labels"],
)

# Build the Trainer and start training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    data_collator=data_collator,
    # `tokenizer=` is deprecated for Trainer.__init__ and triggers a
    # FutureWarning; `processing_class=` is its replacement.
    processing_class=tokenizer,
)

trainer.train()
# Write the trained model into the same directory used for checkpoints.
trainer.save_model(training_args.output_dir)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/317 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,3.874
20,2.8964
30,2.5108
40,2.3652
50,2.3637
60,2.292


  return fn(*args, **kwargs)
