In [2]:
import os
import sys

# Make the directory three levels above the working directory importable
# (presumably the repository root that holds the `src` package -- confirm).
# "__file__" is intentionally a string literal: abspath() resolves it against
# the current working directory, so dirname() yields that directory.
REPO_ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath("__file__")), "../../..")
)
if REPO_ROOT not in sys.path:  # re-running the cell must not add duplicates
    sys.path.append(REPO_ROOT)

In [3]:
import json
from functools import partial
from textwrap import dedent
from typing import Dict, Literal

from datasets import DatasetDict, load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

from src.shared.config import settings

## Utils

In [7]:
HF_GPT_ID = "openai-gpt"  # checkpoint id shared by the tokenizer and the model


def tokenizer_init():
    """Load the ``HF_GPT_ID`` tokenizer and register ``<pad>`` as its pad token.

    Returns:
        The tokenizer with the pad token added.
    """
    gpt_tokenizer = AutoTokenizer.from_pretrained(HF_GPT_ID)
    # gpt-1 tokenizer lacks this by default
    pad_token_spec = {"pad_token": "<pad>"}
    gpt_tokenizer.add_special_tokens(pad_token_spec)
    return gpt_tokenizer


def model_init():
    """Build the 9-class sequence classifier on top of ``HF_GPT_ID``.

    Zero-argument factory (it is passed to ``Trainer`` as ``model_init``).
    The embedding matrix is resized to the pad-extended vocabulary and the
    model config is told which id is the pad token.

    Returns:
        The model moved to ``settings.device``.
    """
    pad_aware_tokenizer = tokenizer_init()
    vocab_size = len(pad_aware_tokenizer)

    classifier = AutoModelForSequenceClassification.from_pretrained(HF_GPT_ID, num_labels=9)
    # extend the embedding layer to handle padding
    classifier.resize_token_embeddings(vocab_size, mean_resizing=False)
    classifier.config.pad_token_id = pad_aware_tokenizer.pad_token_id

    return classifier.to(settings.device)


def format_dataset_row(
    row, label_mapping: Dict[str, int]
) -> Dict[Literal["text", "labels"], object]:
    """Turn one raw dataset row into a classification example.

    The ``from_account`` field is the prediction target: it is removed from
    the row and translated to its integer class id. Every remaining field is
    serialised as a single-line JSON object that becomes the model input.

    Args:
        row: Mapping of column name to value; must contain ``"from_account"``.
            The caller's object is not mutated (a copy is taken).
        label_mapping: Maps each ``from_account`` value to its class id.

    Returns:
        ``{"text": <JSON string>, "labels": <class id>}``.

    Raises:
        KeyError: If the row has no ``"from_account"`` key, or its value has
            no entry in ``label_mapping``.
    """
    features = dict(row)  # copy so the caller's row is left untouched
    from_account = features.pop("from_account")
    if from_account not in label_mapping:
        raise KeyError(
            f"from_account {from_account!r} has no entry in label_mapping"
        )
    # json.dumps (default settings) emits a single line with no surrounding
    # whitespace, so no dedent/strip post-processing is needed.
    return {"text": json.dumps(features), "labels": label_mapping[from_account]}


def training_dataset_init(tokenizer) -> DatasetDict:
    """Load the dataset from the Hub and prepare it for the classifier.

    Steps:
      1. Load ``{settings.hf_user_name}/{settings.hf_dataset_repo_name}``.
      2. Assign an integer class id to every distinct ``from_account`` found
         in the ``train`` and ``test`` splits.
      3. Format every row with ``format_dataset_row`` and rename the ``test``
         split to ``validation``.
      4. Tokenize the ``text`` column, drop the listed raw columns and
         ``text``, and switch the output format to ``"pt"``.

    Args:
        tokenizer: Callable tokenizer applied to batches of ``text``.

    Returns:
        A ``DatasetDict`` with ``train`` and ``validation`` splits.
    """
    dataset = load_dataset(f"{settings.hf_user_name}/{settings.hf_dataset_repo_name}")

    # Sort before numbering: iterating a bare set of strings follows hash
    # order, which changes between Python processes, so the same account could
    # otherwise get a different class id after every kernel restart.
    # key=str keeps the sort well-defined even if a value is not a string.
    unique_from_accounts = sorted(
        set(dataset["train"]["from_account"]) | set(dataset["test"]["from_account"]),
        key=str,
    )
    label_mapping = {account: idx for idx, account in enumerate(unique_from_accounts)}
    format_dataset_row_partial = partial(
        format_dataset_row,
        label_mapping=label_mapping,
    )

    dataset["train"] = dataset["train"].map(format_dataset_row_partial)
    dataset["validation"] = dataset["test"].map(format_dataset_row_partial)
    del dataset["test"]

    # Raw feature columns plus the intermediate text are removed once the
    # tokenizer output has been added.
    remove_columns = [
        "amount",
        "month",
        "day",
        "year",
        "vendor",
        "from_account",
        "text",
    ]
    dataset = dataset.map(
        lambda batch: tokenizer(batch["text"]),
        batched=True,
        remove_columns=remove_columns,
    )
    dataset.set_format("pt")
    return dataset

## Identify Best Learning Rate

In [8]:
# Learning-rate search: several one-epoch runs with sampled learning rates.
# The captured trial values equal the validation loss, hence direction="minimize".

# One tokenizer instance is shared by tokenization and padding, so both are
# guaranteed to agree on the vocabulary and the pad token.
tokenizer = tokenizer_init()
dataset = training_dataset_init(tokenizer)

training_args = TrainingArguments(
    output_dir="/tmp/lr_search",
    num_train_epochs=1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_strategy="epoch",
    disable_tqdm=False,
    push_to_hub=False,
    log_level="error",
    save_strategy="no",  # search runs are throwaway; keep no checkpoints
    report_to="none",
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)

best_run = trainer.hyperparameter_search(
    # log=True samples the rate on a logarithmic scale between 1e-7 and 1e-3
    hp_space=lambda trial: {
        "learning_rate": trial.suggest_float("learning_rate", 1e-7, 1e-3, log=True)
    },
    n_trials=5,
    direction="minimize",
)

best_run

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

[I 2025-03-19 21:05:52,127] A new study created in memory with name: no-name-d9bc123a-871e-4dee-b731-e4b9d5e1ae89


Epoch,Training Loss,Validation Loss
1,1.6154,1.361363


[I 2025-03-19 21:05:56,470] Trial 0 finished with value: 1.3613629341125488 and parameters: {'learning_rate': 2.6385847657862185e-05}. Best is trial 0 with value: 1.3613629341125488.


Epoch,Training Loss,Validation Loss
1,1.7062,1.216077


[I 2025-03-19 21:06:08,551] Trial 1 finished with value: 1.2160768508911133 and parameters: {'learning_rate': 0.00017654440874111848}. Best is trial 1 with value: 1.2160768508911133.


Epoch,Training Loss,Validation Loss
1,1.5708,1.026987


[I 2025-03-19 21:06:17,255] Trial 2 finished with value: 1.0269871950149536 and parameters: {'learning_rate': 0.00011761323975949382}. Best is trial 2 with value: 1.0269871950149536.


Epoch,Training Loss,Validation Loss
1,1.7154,1.560079


[I 2025-03-19 21:06:21,264] Trial 3 finished with value: 1.5600794553756714 and parameters: {'learning_rate': 8.25688321383896e-06}. Best is trial 2 with value: 1.0269871950149536.


Epoch,Training Loss,Validation Loss
1,2.05,2.010101


[I 2025-03-19 21:06:25,222] Trial 4 finished with value: 2.010101079940796 and parameters: {'learning_rate': 2.956816030763976e-07}. Best is trial 2 with value: 1.0269871950149536.


BestRun(run_id='2', objective=1.0269871950149536, hyperparameters={'learning_rate': 0.00011761323975949382}, run_summary=None)

## Training Run

In [9]:
# Results of earlier learning-rate searches, kept for reference:
#   with prompting:    objective=1.50540292263031, hyperparameters={'learning_rate': 8.232913715704246e-05}
#   without prompting: BestRun(run_id='2', objective=1.0269871950149536, hyperparameters={'learning_rate': 0.00011761323975949382}, run_summary=None)

best_learning_rate = 0.00011761323975949382  # the "without prompting" result above
n_epochs = 15  # will likely stop early

tokenizer = tokenizer_init()
dataset = training_dataset_init(tokenizer)

training_args = TrainingArguments(
    # optimisation
    num_train_epochs=n_epochs,
    learning_rate=best_learning_rate,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # evaluate, log and checkpoint once per epoch
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # best checkpoint = lowest eval_loss, reloaded when training ends
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    # output locations
    output_dir="/tmp/gpt_1_sequence_classification",
    push_to_hub=True,
    hub_model_id=f"{settings.hf_user_name}/{settings.gpt_1_sequence_classification}",
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    # early stopping with a patience of 3
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()

{'loss': 1.5062, 'grad_norm': 3.570878028869629, 'learning_rate': 0.0001097723571088609, 'epoch': 1.0}
{'eval_loss': 0.8068403005599976, 'eval_runtime': 0.1887, 'eval_samples_per_second': 991.08, 'eval_steps_per_second': 15.9, 'epoch': 1.0}
{'loss': 0.7222, 'grad_norm': 3.5703277587890625, 'learning_rate': 0.00010193147445822798, 'epoch': 2.0}
{'eval_loss': 0.6196367740631104, 'eval_runtime': 0.1882, 'eval_samples_per_second': 993.422, 'eval_steps_per_second': 15.937, 'epoch': 2.0}
{'loss': 0.4747, 'grad_norm': 3.3774116039276123, 'learning_rate': 9.409059180759506e-05, 'epoch': 3.0}
{'eval_loss': 0.6139162182807922, 'eval_runtime': 0.1907, 'eval_samples_per_second': 980.464, 'eval_steps_per_second': 15.729, 'epoch': 3.0}
{'loss': 0.487, 'grad_norm': 2.792229413986206, 'learning_rate': 8.624970915696212e-05, 'epoch': 4.0}
{'eval_loss': 0.6617082953453064, 'eval_runtime': 0.1935, 'eval_samples_per_second': 966.532, 'eval_steps_per_second': 15.506, 'epoch': 4.0}
{'loss': 0.2917, 'grad_no

TrainOutput(global_step=72, training_loss=0.6231741938326094, metrics={'train_runtime': 24.5994, 'train_samples_per_second': 453.059, 'train_steps_per_second': 7.317, 'train_loss': 0.6231741938326094, 'epoch': 6.0})

In [5]:
# Upload the trainer's model to the Hub; the target repository is presumably
# the `hub_model_id` set in the training arguments -- confirm.
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jacob-danner/gpt_1_sequence_classification_finetune/commit/d7de8e056288fe9546ebf1b2106d113d9db26f9a', commit_message='End of training', commit_description='', oid='d7de8e056288fe9546ebf1b2106d113d9db26f9a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jacob-danner/gpt_1_sequence_classification_finetune', endpoint='https://huggingface.co', repo_type='model', repo_id='jacob-danner/gpt_1_sequence_classification_finetune'), pr_revision=None, pr_num=None)