In [1]:
import os
import sys

# Make the directory three levels above the working directory importable, so
# project packages (e.g. `src`, imported in the next cell) can be found.
# "__file__" is a literal string here, so abspath() resolves it against the
# current working directory rather than against a real module path.
PROJECT_ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath("__file__")), "../../..")
)

# Guard the append so re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
import json
from functools import partial
from textwrap import dedent
from typing import Dict, Literal

from datasets import DatasetDict, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

from src.shared.config import settings

## Utils

In [6]:
# Model id handed to `from_pretrained` for both the tokenizer and the model.
HF_GPT_ID = "openai-gpt"


def tokenizer_init():
    """Load the pretrained tokenizer for ``HF_GPT_ID`` and register pad/eos tokens.

    Returns:
        The tokenizer with ``<pad>`` and ``<eos>`` added as special tokens.
    """
    # gpt-1 tokenizer lacks these by default
    extra_special_tokens = {"pad_token": "<pad>", "eos_token": "<eos>"}

    gpt_tokenizer = AutoTokenizer.from_pretrained(HF_GPT_ID)
    gpt_tokenizer.add_special_tokens(extra_special_tokens)
    return gpt_tokenizer


def model_init():
    """Create a fresh causal LM sized for the tokenizer from ``tokenizer_init``.

    Returns:
        The model after it has been moved to ``settings.device``.
    """
    vocab = tokenizer_init()
    lm = AutoModelForCausalLM.from_pretrained(HF_GPT_ID)

    # extend the embedding layer to handle padding and eos tokens
    vocab_size = len(vocab)
    lm.resize_token_embeddings(vocab_size, mean_resizing=False)

    # Mirror the tokenizer's special-token ids on the model config.
    lm_config = lm.config
    lm_config.pad_token_id = vocab.pad_token_id
    lm_config.eos_token_id = vocab.eos_token_id

    return lm.to(settings.device)


def format_dataset_row(
    row, for_test: bool, unique_from_accounts: set, eos_token: str
) -> Dict[Literal["text"], str]:
    """Render one transaction row as a question/answer prompt.

    Args:
        row: Mapping with the keys ``description``, ``amount``, ``category``,
            ``category_source``, ``transaction_date``, ``day_of_week`` and
            ``card``. ``from_account`` is additionally required unless
            ``for_test`` is true. The mapping is not modified.
        for_test: When true the answer is left blank, so the text ends with
            ``Answer:``. When false the answer is ``from_account`` followed
            by ``eos_token``.
        unique_from_accounts: Currently unused; kept for interface stability.
        eos_token: Marker appended after the answer when ``for_test`` is false.

    Returns:
        A dict with the single key ``"text"`` holding the formatted prompt,
        stripped of leading and trailing whitespace.

    Raises:
        KeyError: If a required key is missing from ``row``.
    """
    # Only labeled rows need `from_account`; prompts built for inference can
    # omit it.
    answer = "" if for_test else f"{row['from_account']}{eos_token}"

    # The prompt is assembled line by line instead of dedenting an indented
    # template after interpolation: a field value that contains a newline
    # would otherwise remove the common indentation and leave every template
    # line indented.
    lines = [
        "Transaction",
        "-----------",
        f"Description: {row['description']}",
        f"Amount: {row['amount']}",
        f"Category: {row['category']} (Source: {row['category_source']})",
        f"Transaction Date: {row['transaction_date']}",
        f"Day of Week: {row['day_of_week']}",
        f"Card: {row['card']}",
        "",
        "Question: Which account initiated this transaction?",
        f"Answer: {answer}",
    ]
    return {"text": "\n".join(lines).strip()}


def training_dataset_init(tokenizer) -> DatasetDict:
    """Load the dataset from the Hub and return tokenized train/validation splits.

    The loaded dataset is shuffled, each row is rendered to text with
    ``format_dataset_row`` (answers included), the ``test`` split is exposed
    as ``validation``, and the text is tokenized with ``tokenizer``. The
    listed source columns and the intermediate ``text`` column are dropped
    and the result is formatted as ``"pt"``.

    Args:
        tokenizer: Callable tokenizer that also exposes ``eos_token``.

    Returns:
        The tokenized ``DatasetDict``.
    """
    repo_id = f"{settings.hf_user_name}/{settings.hf_dataset_repo_name}"
    dataset = load_dataset(repo_id).shuffle(0)

    train_accounts = dataset["train"]["from_account"]
    test_accounts = dataset["test"]["from_account"]
    unique_from_accounts = set(train_accounts + test_accounts)
    eos_token = tokenizer.eos_token

    def to_training_text(row):
        # Labeled prompt: the answer and eos token are part of the text.
        return format_dataset_row(
            row,
            for_test=False,
            unique_from_accounts=unique_from_accounts,
            eos_token=eos_token,
        )

    def tokenize_batch(batch):
        return tokenizer(batch["text"])

    dataset["train"] = dataset["train"].map(to_training_text)
    # The held-out split is formatted like training data and renamed.
    dataset["validation"] = dataset.pop("test").map(to_training_text)

    raw_columns = [
        "transaction_date",
        "description",
        "amount",
        "category",
        "category_source",
        "card",
        "day_of_week",
        "from_account",
        "text",
    ]

    tokenized = dataset.map(
        tokenize_batch,
        batched=True,
        remove_columns=raw_columns,
    )
    tokenized.set_format("pt")
    return tokenized

In [7]:
# Build the tokenized train/validation splits consumed by the learning-rate search below.
dataset = training_dataset_init(tokenizer_init())

Map:   0%|          | 0/764 [00:00<?, ? examples/s]

Map:   0%|          | 0/193 [00:00<?, ? examples/s]

Map:   0%|          | 0/764 [00:00<?, ? examples/s]

Map:   0%|          | 0/193 [00:00<?, ? examples/s]

## Identify Best Learning Rate

In [8]:
# Each trial trains for a single epoch; checkpoint saving, hub pushes and
# reporting are all switched off for these runs.
training_args = TrainingArguments(
    output_dir="/tmp/lr_search",
    # optimisation
    num_train_epochs=1,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # evaluate and log once per epoch
    eval_strategy="epoch",
    logging_strategy="epoch",
    log_level="error",
    disable_tqdm=False,
    # nothing is persisted
    save_strategy="no",
    push_to_hub=False,
    report_to="none",
)

lr_search_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer_init(), mlm=False
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=lr_search_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)


def learning_rate_space(trial):
    """Search space: one ``learning_rate`` between 1e-5 and 5e-4, ``log=True``."""
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    }


best_run = trainer.hyperparameter_search(
    hp_space=learning_rate_space,
    n_trials=5,
    direction="minimize",
)

best_run

[I 2025-03-23 15:53:23,101] A new study created in memory with name: no-name-bd64df81-bb2a-4e8d-b804-f98650ef9a6c


Epoch,Training Loss,Validation Loss
1,4.4079,3.595229


[I 2025-03-23 15:53:33,766] Trial 0 finished with value: 3.595229387283325 and parameters: {'learning_rate': 1.1652186096275915e-05}. Best is trial 0 with value: 3.595229387283325.


Epoch,Training Loss,Validation Loss
1,2.7376,1.257106


[I 2025-03-23 15:53:44,377] Trial 1 finished with value: 1.2571061849594116 and parameters: {'learning_rate': 0.0002511643502833988}. Best is trial 1 with value: 1.2571061849594116.


Epoch,Training Loss,Validation Loss
1,2.9026,1.652341


[I 2025-03-23 15:53:55,655] Trial 2 finished with value: 1.652341365814209 and parameters: {'learning_rate': 5.404211968714059e-05}. Best is trial 1 with value: 1.2571061849594116.


Epoch,Training Loss,Validation Loss
1,3.4334,2.222654


[I 2025-03-23 15:54:07,010] Trial 3 finished with value: 2.22265362739563 and parameters: {'learning_rate': 3.0452338929671888e-05}. Best is trial 1 with value: 1.2571061849594116.


Epoch,Training Loss,Validation Loss
1,2.6881,1.465267


[I 2025-03-23 15:54:18,410] Trial 4 finished with value: 1.4652667045593262 and parameters: {'learning_rate': 8.479060049005736e-05}. Best is trial 1 with value: 1.2571061849594116.


BestRun(run_id='1', objective=1.2571061849594116, hyperparameters={'learning_rate': 0.0002511643502833988}, run_summary=None)

## Training Run

In [9]:
n_epochs = 25  # will likely stop early
# Value reported as best by the learning-rate search above.
best_learning_rate = 0.0002511643502833988

tokenizer = tokenizer_init()
dataset = training_dataset_init(tokenizer)

training_args = TrainingArguments(
    output_dir="/tmp/gpt_1_causal_finetune",
    # optimisation
    num_train_epochs=n_epochs,
    learning_rate=best_learning_rate,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # evaluate, log and checkpoint once per epoch
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    disable_tqdm=False,
    # best-checkpoint selection: lower eval_loss is better
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    # hub destination
    push_to_hub=True,
    hub_model_id=f"{settings.hf_user_name}/{settings.gpt_1_causal_finetune}",
)

causal_lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=causal_lm_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    callbacks=[early_stopping],
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,2.7166,1.091051
2,0.9172,0.822129
3,0.6982,0.717173
4,0.5758,0.666996
5,0.504,0.651167
6,0.4566,0.651174
7,0.4187,0.650319
8,0.3829,0.664107
9,0.3589,0.66983


TrainOutput(global_step=108, training_loss=0.781011042771516, metrics={'train_runtime': 108.1923, 'train_samples_per_second': 176.538, 'train_steps_per_second': 2.773, 'total_flos': 354528099459072.0, 'train_loss': 0.781011042771516, 'epoch': 9.0})

In [10]:
# Push the trainer's model to the Hub with a descriptive commit message.
trainer.push_to_hub(
    commit_message=(
        "feat: train on improved dataset. "
        "changed prompt to seperate transaction details rather than a dump"
    )
)

CommitInfo(commit_url='https://huggingface.co/jacob-danner/gpt_1_causual_finetune/commit/8409d991ad4857354b47e70c9c17fc54905253f5', commit_message='feat: train on improved dataset. changed prompt to seperate transaction details rather than a dump', commit_description='', oid='8409d991ad4857354b47e70c9c17fc54905253f5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jacob-danner/gpt_1_causual_finetune', endpoint='https://huggingface.co', repo_type='model', repo_id='jacob-danner/gpt_1_causual_finetune'), pr_revision=None, pr_num=None)