In [6]:
import os
import sys

sys.path.append(
    os.path.abspath(
        os.path.join(os.path.dirname(os.path.abspath("__file__")), "../../..")
    )
)

In [7]:
from functools import partial
from textwrap import dedent
from typing import Dict, Literal

from datasets import DatasetDict, load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

from src.shared.config import settings

## Utils

In [8]:
HF_GPT_ID = "openai-gpt"

def tokenizer_init():
    tokenizer = AutoTokenizer.from_pretrained(HF_GPT_ID)
    tokenizer.add_special_tokens(
        {"pad_token": "<pad>"}
    )  # gpt-1 tokenizer lacks this by default
    return tokenizer


def model_init():
    tokenizer = tokenizer_init()
    model = AutoModelForSequenceClassification.from_pretrained(HF_GPT_ID, num_labels=8)
    model.resize_token_embeddings(
        len(tokenizer), mean_resizing=False
    )  # extend the embedding layer to handle padding
    model.config.pad_token_id = tokenizer.pad_token_id
    return model.to(settings.device)


def format_dataset_row(
    row, label_mapping: Dict[str, int]
) -> Dict[Literal["text"], str]:
    row = dict(row)
    from_account = row.pop("from_account")
    formatted = dedent(
        f"""
        Transaction
        -----------
        Description: {row['description']}
        Amount: {row['amount']}
        Category: {row['category']} (Source: {row['category_source']})
        Transaction Date: {row['transaction_date']}
        Day of Week: {row['day_of_week']}
        Card: {row['card']}

        Question: Which account initiated this transaction?
        Answer:
        """
    ).strip()
    return {"text": formatted, "labels": label_mapping[from_account]}


def training_dataset_init(tokenizer) -> DatasetDict:
    dataset = load_dataset(f"{settings.hf_user_name}/{settings.hf_dataset_repo_name}").shuffle(0)

    unique_from_accounts = sorted(set(
        dataset["train"]["from_account"] + dataset["test"]["from_account"]
    ))
    label_mapping = {account: idx for idx, account in enumerate(unique_from_accounts)}
    print(f'label_mapping: {label_mapping}')
    format_dataset_row_partial = partial(
        format_dataset_row,
        label_mapping=label_mapping,
    )

    dataset["train"] = dataset["train"].map(format_dataset_row_partial)
    dataset["validation"] = dataset["test"].map(format_dataset_row_partial)
    del dataset["test"]

    remove_columns = ['transaction_date', 'description', 'amount', 'category', 'category_source', 'card', 'day_of_week', 'from_account', 'text']

    dataset = dataset.map(
        lambda batch: tokenizer(batch["text"]),
        batched=True,
        remove_columns=remove_columns,
    )
    dataset.set_format("pt")
    return dataset

## Identify Best Learning Rate

In [9]:
dataset = training_dataset_init(tokenizer_init())

training_args = TrainingArguments(
    output_dir="/tmp/lr_search",
    num_train_epochs=1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_strategy="epoch",
    disable_tqdm=False,
    push_to_hub=False,
    log_level="error",
    save_strategy="no",
    report_to="none",
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer_init()),
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)

best_run = trainer.hyperparameter_search(
    hp_space=lambda trial: {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    },
    n_trials=5,
    direction="minimize",
)

best_run

label_mapping: {'Assets:Discover:Furniture': 0, 'Assets:Discover:FutureWants': 1, 'Assets:Discover:Main:Needs:Gas': 2, 'Assets:Discover:Main:Needs:Groceries': 3, 'Assets:Discover:Main:Needs:Monthly': 4, 'Assets:Discover:Main:Needs:Other': 5, 'Assets:Discover:Main:Wants:Monthly': 6, 'Assets:Discover:Main:Wants:Other': 7}


Map:   0%|          | 0/764 [00:00<?, ? examples/s]

Map:   0%|          | 0/193 [00:00<?, ? examples/s]

Map:   0%|          | 0/764 [00:00<?, ? examples/s]

Map:   0%|          | 0/193 [00:00<?, ? examples/s]

[I 2025-03-23 16:25:03,535] A new study created in memory with name: no-name-f30295bb-3aa7-46c9-8207-d17380973048


Epoch,Training Loss,Validation Loss
1,1.7611,1.525469


[I 2025-03-23 16:25:11,736] Trial 0 finished with value: 1.525469422340393 and parameters: {'learning_rate': 1.2431684939654512e-05}. Best is trial 0 with value: 1.525469422340393.


Epoch,Training Loss,Validation Loss
1,1.7151,1.504723


[I 2025-03-23 16:25:19,079] Trial 1 finished with value: 1.5047225952148438 and parameters: {'learning_rate': 0.000283964877953884}. Best is trial 1 with value: 1.5047225952148438.


Epoch,Training Loss,Validation Loss
1,1.7159,1.515454


[I 2025-03-23 16:25:26,888] Trial 2 finished with value: 1.5154544115066528 and parameters: {'learning_rate': 0.0002713673779161299}. Best is trial 1 with value: 1.5047225952148438.


Epoch,Training Loss,Validation Loss
1,1.6232,1.521107


[I 2025-03-23 16:25:34,853] Trial 3 finished with value: 1.5211067199707031 and parameters: {'learning_rate': 8.985463055747293e-05}. Best is trial 1 with value: 1.5047225952148438.


Epoch,Training Loss,Validation Loss
1,1.7131,1.497422


[I 2025-03-23 16:25:42,911] Trial 4 finished with value: 1.4974219799041748 and parameters: {'learning_rate': 1.5703603902088657e-05}. Best is trial 4 with value: 1.4974219799041748.


BestRun(run_id='4', objective=1.4974219799041748, hyperparameters={'learning_rate': 1.5703603902088657e-05}, run_summary=None)

## Training Run

In [10]:
n_epochs = 15  # will likely stop early
best_learning_rate = 5.38312346311055e-05

tokenizer = tokenizer_init()
dataset = training_dataset_init(tokenizer)

training_args = TrainingArguments(
    output_dir="/tmp/gpt_1_sequence_classification",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=n_epochs,
    learning_rate=best_learning_rate,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=True,
    hub_model_id=f"{settings.hf_user_name}/{settings.gpt_1_sequence_classification}",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    disable_tqdm=False,
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer.train()

label_mapping: {'Assets:Discover:Furniture': 0, 'Assets:Discover:FutureWants': 1, 'Assets:Discover:Main:Needs:Gas': 2, 'Assets:Discover:Main:Needs:Groceries': 3, 'Assets:Discover:Main:Needs:Monthly': 4, 'Assets:Discover:Main:Needs:Other': 5, 'Assets:Discover:Main:Wants:Monthly': 6, 'Assets:Discover:Main:Wants:Other': 7}


Epoch,Training Loss,Validation Loss
1,1.6661,1.457538
2,1.3696,1.002303
3,0.6977,0.556376
4,0.3751,0.457325
5,0.2528,0.380322
6,0.1503,0.269912
7,0.0863,0.288816
8,0.0547,0.329497


TrainOutput(global_step=96, training_loss=0.5815815708289543, metrics={'train_runtime': 63.3679, 'train_samples_per_second': 180.849, 'train_steps_per_second': 2.841, 'total_flos': 284921734496256.0, 'train_loss': 0.5815815708289543, 'epoch': 8.0})

In [11]:
trainer.push_to_hub(commit_message="feat: train with sorted label to make training behavior reproducible at evaluation time")

model.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jacob-danner/gpt_1_sequence_classification_finetune/commit/ee6d2713c0019a163e8e90b46b603934e66b2f93', commit_message='feat: train with sorted label to make training behavior reproducible at evaluation time', commit_description='', oid='ee6d2713c0019a163e8e90b46b603934e66b2f93', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jacob-danner/gpt_1_sequence_classification_finetune', endpoint='https://huggingface.co', repo_type='model', repo_id='jacob-danner/gpt_1_sequence_classification_finetune'), pr_revision=None, pr_num=None)