In [1]:
import os
import sys

sys.path.append(
    os.path.abspath(
        os.path.join(os.path.dirname(os.path.abspath("__file__")), "../../..")
    )
)

In [2]:
import json
import torch
from functools import partial
from textwrap import dedent
from typing import Dict, Literal

from datasets import DatasetDict, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

from peft import LoraConfig, get_peft_model

from src.shared.config import settings

DTYPE = torch.bfloat16

## Utils

In [3]:
HF_GEMMA_ID = "google/gemma-3-1b-pt"


def tokenizer_init():
    tokenizer = AutoTokenizer.from_pretrained(HF_GEMMA_ID)
    return tokenizer


def model_init():
    config = LoraConfig()

    model = get_peft_model(
        AutoModelForCausalLM.from_pretrained(HF_GEMMA_ID),
        config
    )
    return model.to(settings.device)


def format_dataset_row(
    row, for_test: bool, unique_from_accounts: set, eos_token: str
) -> Dict[Literal["text"], str]:
    row = dict(row)
    from_account = row.pop("from_account")
    formatted = dedent(
        f"""
        <possible accounts>{', '.join(unique_from_accounts)}</possible accounts>
        <transaction>{json.dumps(row)}</transaction>
        which account did this transaction come from?
        answer: {'' if for_test else f"{from_account}{eos_token}"}
    """
    ).strip()
    return {"text": formatted}


def training_dataset_init(tokenizer) -> DatasetDict:
    dataset = load_dataset(f"{settings.hf_user_name}/{settings.hf_dataset_repo_name}")

    unique_from_accounts = set(
        dataset["train"]["from_account"] + dataset["test"]["from_account"]
    )
    eos_token = tokenizer.eos_token
    format_for_train = partial(
        format_dataset_row,
        for_test=False,
        unique_from_accounts=unique_from_accounts,
        eos_token=eos_token,
    )

    dataset["train"] = dataset["train"].map(format_for_train)
    dataset["validation"] = dataset["test"].map(format_for_train)
    del dataset["test"]

    remove_columns = [
        "amount",
        "month",
        "day",
        "year",
        "vendor",
        "from_account",
        "text",
    ]
    dataset = dataset.map(
        lambda batch: tokenizer(batch["text"]),
        batched=True,
        remove_columns=remove_columns,
    )
    dataset.set_format("pt")
    return dataset

In [4]:
dataset = training_dataset_init(tokenizer_init())

In [14]:
def get_model_size_gb(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()  # size in bytes
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
    
    size_in_gb = (param_size + buffer_size) / (1024 ** 3)
    
    print(f'Model size: {size_in_gb:.2f} GB')
    return size_in_gb


base_gemma = AutoModelForCausalLM.from_pretrained(HF_GEMMA_ID)
get_model_size_gb(base_gemma)
del base_gemma

Model size: 3.72 GB


In [10]:
base_gemma.print_trainable_parameters()  # would be nice to see

AttributeError: 'Gemma3ForCausalLM' object has no attribute 'print_trainable_parameters'

In [20]:
def model_init():
    config = LoraConfig(
        target_modules=["q_proj", "v_proj"],
    )

    model = get_peft_model(
        AutoModelForCausalLM.from_pretrained(HF_GEMMA_ID, attn_implementation='eager', torch_dtype=DTYPE),
        config,
        autocast_adapter_dtype=True
    )
    return model.to(settings.device)

# model = model_init()

## Identify Best Learning Rate - having problems here.

In [17]:
def identify_best_learning_rate():
    learning_rate_train_dataset = dataset["train"].select(range(64))
    learning_rate_validation_dataset = dataset["validation"].select(range(64))

    training_args = TrainingArguments(
        output_dir="/tmp/lr_search",
        num_train_epochs=1,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        weight_decay=0.01,
        eval_strategy="epoch",
        logging_strategy="epoch",
        disable_tqdm=False,
        push_to_hub=False,
        log_level="error",
        save_strategy="no",
        report_to="none",
    )

    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        data_collator=DataCollatorForLanguageModeling(
            tokenizer=tokenizer_init(), mlm=False
        ),
        train_dataset=learning_rate_train_dataset,
        eval_dataset=learning_rate_validation_dataset,
    )

    best_run = trainer.hyperparameter_search(
        hp_space=lambda trial: {
            "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-3, log=True)
        },
        n_trials=2,
        direction="minimize",
    )

    return best_run


identify_best_learning_rate()

[I 2025-03-18 17:55:10,206] A new study created in memory with name: no-name-516b25f3-eeeb-40b5-9731-c1998e63ee24


Epoch,Training Loss,Validation Loss
1,2.7169,No log


[W 2025-03-18 17:55:19,504] Trial 0 failed with parameters: {'learning_rate': 5.544182349128145e-06} because of the following error: TypeError("The `value` argument is of type '<class 'NoneType'>' but supposed to be a float.").
Traceback (most recent call last):
  File "/Users/jacobdanner/Repos/transaction-lm-fine-tuning/.venv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/Users/jacobdanner/Repos/transaction-lm-fine-tuning/.venv/lib/python3.12/site-packages/transformers/integrations/integration_utils.py", line 254, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/Users/jacobdanner/Repos/transaction-lm-fine-tuning/.venv/lib/python3.12/site-packages/transformers/trainer.py", line 2251, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "/Users/jacobdanner/Repos/transaction-lm-fine-tuning/.venv/lib/python3.12/si

TypeError: The `value` argument is of type '<class 'NoneType'>' but supposed to be a float.

## Training Run

In [5]:
def model_init():
    config = LoraConfig(
        target_modules=["q_proj", "v_proj"],
    )

    model = get_peft_model(
        AutoModelForCausalLM.from_pretrained(HF_GEMMA_ID, attn_implementation='eager', torch_dtype=DTYPE),
        config,
        autocast_adapter_dtype=True
    )
    return model.to(settings.device)



In [6]:
model = model_init()

In [9]:
model

PeftModel(
  (base_model): LoraModel(
    (model): Gemma3ForCausalLM(
      (model): Gemma3TextModel(
        (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 1152, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma3DecoderLayer(
            (self_attn): Gemma3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=1152, out_features=1024, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1152, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=1024, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): L

In [30]:
learning_rate_train_dataset = dataset["train"].select(range(64))
learning_rate_validation_dataset = dataset["validation"].select(range(64))

learning_rate_train_dataset, learning_rate_validation_dataset

(Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 64
 }),
 Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 64
 }))

In [None]:
learning_rate_train_dataset = dataset["train"].select(range(64))
learning_rate_validation_dataset = dataset["validation"].select(range(64))

n_epochs = 1
best_learning_rate = 1e-4

tokenizer = tokenizer_init()

training_args = TrainingArguments(
    output_dir="/tmp/gemma_3_1b_causal_finetune_lora",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=n_epochs,
    learning_rate=best_learning_rate,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    # hub_model_id=f"{settings.hf_user_name}/{settings.gemma_3_1b_causal_finetune_lora}",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    bf16=True
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=learning_rate_train_dataset,
    eval_dataset=learning_rate_validation_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

#  todo, add labels for peft model, and maybe they are not showing up on validation dataset too?
#  

# trainer.train()

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,2.4995,No log


early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


KeyError: "The `metric_for_best_model` training argument is set to 'eval_loss', which is not found in the evaluation metrics. The available evaluation metrics are: []. Consider changing the `metric_for_best_model` via the TrainingArguments."

In [34]:
model = model_init()

TypeError: model_init() takes 0 positional arguments but 1 was given

In [48]:
inputs = {k: v.unsqueeze(0).to('mps') for k, v in learning_rate_train_dataset[0].items()}
inputs

model(**inputs)

CausalLMOutputWithPast(loss=None, logits=tensor([[[-19.8750,   3.0000,  -2.1719,  ..., -19.8750, -20.0000, -19.7500],
         [-17.7500,   2.6562,  -1.8906,  ..., -17.7500, -18.0000, -17.7500],
         [-17.3750,   0.8672,  -2.2344,  ..., -17.5000, -17.6250, -17.3750],
         ...,
         [-15.9375,   9.3750,  -3.0938,  ..., -16.0000, -16.0000, -15.9375],
         [-20.0000,  15.3750,  -3.5156,  ..., -20.0000, -20.2500, -20.0000],
         [-21.7500,   9.0625,  -0.9727,  ..., -21.7500, -22.0000, -21.7500]]],
       device='mps:0', dtype=torch.bfloat16, grad_fn=<LinearBackward0>), past_key_values=<transformers.cache_utils.HybridCache object at 0x35af58830>, hidden_states=None, attentions=None)

In [41]:
learning_rate_train_dataset[0]

{'input_ids': tensor([     2, 236820,  10082,  10789, 236813,  20194, 236787,  64801, 236787,
          44410, 236764,  51347, 236787,  64801, 236787,   7765, 236787, 115500,
         236787,  96579, 236764,  51347, 236787,  64801, 236787,  36177, 236824,
           1639, 236764,  51347, 236787,  64801, 236787, 197756, 236764,  51347,
         236787,  64801, 236787,   7765, 236787, 236824,   1639, 236787,  96579,
         236764,  51347, 236787,  64801, 236787,   7765, 236787, 115500, 236787,
          47609, 236764,  51347, 236787,  64801, 236787,   7765, 236787, 115500,
         236787,  69201,  63018, 236764,  51347, 236787,  64801, 236787,   7765,
         236787, 236824,   1639, 236787,  13517, 236764,  51347, 236787,  64801,
         236787,   7765, 236787, 115500, 236787,  13517,    954,  10082,  10789,
         236813,    107, 236820,  31408, 236813,  14937,  13304,   1083, 236743,
         236770, 236819, 236771, 236761, 236812, 236778, 236764,    623,  11283,
           1083

In [11]:
# model only takes up a few gb of memory. training is memory hungry. storing those gradient updates i guess.

for name, param in model.named_parameters():
    print(f"{name}: {param.dtype} on {param.device}")
    break 

base_model.model.model.embed_tokens.weight: torch.bfloat16 on mps:0


In [9]:
import torch
torch.cuda.empty_cache()

In [29]:
model.base_model.model.model.layers[0].self_attn.k_proj.weight

Parameter containing:
tensor([[-1.9897e-02,  1.0193e-02,  4.4189e-02,  ...,  2.4292e-02,
         -3.8757e-03,  2.4796e-04],
        [ 6.1646e-03, -3.4912e-02, -1.8677e-02,  ..., -9.3994e-03,
         -5.9204e-03, -9.2163e-03],
        [-4.7119e-02, -6.4392e-03, -2.1118e-02,  ..., -9.1553e-03,
          4.2236e-02, -9.5825e-03],
        ...,
        [ 2.9053e-02,  2.7710e-02, -2.6367e-02,  ..., -2.4414e-03,
          3.2471e-02, -2.4048e-02],
        [ 2.7222e-02,  2.8839e-03, -1.5747e-02,  ...,  6.3419e-05,
         -1.6632e-03,  7.3242e-03],
        [-1.1230e-02, -7.0801e-03,  4.8523e-03,  ...,  1.5503e-02,
          5.2490e-02, -1.1658e-02]], device='mps:0', dtype=torch.bfloat16)

In [16]:
# del trainer.model
# del trainer.optimiz
import gc
gc.collect()

2818

This was bumping up to around 120 GB of ram.

In [10]:
import sys
from collections import defaultdict
import gc

def get_size(obj):
    """Get size of object and its members in bytes"""
    memory_size = sys.getsizeof(obj)
    if hasattr(obj, '__dict__'):
        memory_size += sys.getsizeof(obj.__dict__)
        for key, value in obj.__dict__.items():
            memory_size += get_size(value)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        for item in obj:
            memory_size += get_size(item)
    return memory_size

def show_memory_usage():
    """Show all objects in memory sorted by size"""
    # Collect all objects
    gc.collect()
    objects = defaultdict(int)
    for obj in gc.get_objects():
        try:
            objects[type(obj).__name__] += get_size(obj)
        except:
            # Skip objects that can't be sized
            continue
    
    # Sort and print
    sorted_objects = sorted(objects.items(), key=lambda x: x[1], reverse=True)
    print("\nMemory usage by type:")
    print("-" * 40)
    print(f"{'Type':<30} {'Size (MB)':<10}")
    print("-" * 40)
    for type_name, size in sorted_objects[:20]:  # Show top 20
        size_mb = size / (1024 * 1024)  # Convert to MB
        if size_mb >= 0.1:  # Only show objects using more than 0.1 MB
            print(f"{type_name:<30} {size_mb:>8.2f}")

show_memory_usage()

In [6]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/jacob-danner/gemma_3_1b_causal_finetune/commit/d57dbe313855fe90e612cf39f5b4185a570e88b8', commit_message='End of training', commit_description='', oid='d57dbe313855fe90e612cf39f5b4185a570e88b8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jacob-danner/gemma_3_1b_causal_finetune', endpoint='https://huggingface.co', repo_type='model', repo_id='jacob-danner/gemma_3_1b_causal_finetune'), pr_revision=None, pr_num=None)