In [1]:
import os
import sys

# Make the directory three levels above the working directory importable
# (presumably the repository root, so that `src.*` imports resolve — adjust the
# relative hop if the notebook moves). `__file__` is not defined in a notebook
# kernel, so the path is anchored on the kernel's working directory instead.
_project_root = os.path.abspath(os.path.join(os.getcwd(), "../../.."))
if _project_root not in sys.path:  # re-running the cell must not stack duplicates
    sys.path.append(_project_root)

In [2]:
import json
from functools import partial
from textwrap import dedent
from typing import Dict, Literal

from datasets import DatasetDict, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

from src.shared.config import settings

## Utils

In [3]:
# Hugging Face Hub id of the pretrained GPT-1 checkpoint; tokenizer_init and
# model_init both load from it.
HF_GPT_ID = "openai-gpt"


def tokenizer_init():
    """Load the pretrained tokenizer for ``HF_GPT_ID`` and register pad/eos tokens.

    Returns:
        The tokenizer with ``<pad>`` and ``<eos>`` added as special tokens.
    """
    # gpt-1 tokenizer lacks these by default
    extra_special_tokens = {"pad_token": "<pad>", "eos_token": "<eos>"}
    gpt_tokenizer = AutoTokenizer.from_pretrained(HF_GPT_ID)
    gpt_tokenizer.add_special_tokens(extra_special_tokens)
    return gpt_tokenizer


def model_init():
    """Build a fresh causal LM from ``HF_GPT_ID`` sized for the extended vocabulary.

    Takes no arguments, which lets it be passed as ``Trainer(model_init=...)``.

    Returns:
        The model moved to ``settings.device``.
    """
    vocab = tokenizer_init()
    causal_lm = AutoModelForCausalLM.from_pretrained(HF_GPT_ID)

    # extend the embedding layer to handle padding and eos tokens
    causal_lm.resize_token_embeddings(len(vocab), mean_resizing=False)

    # mirror the tokenizer's special-token ids onto the model config
    for id_attr in ("pad_token_id", "eos_token_id"):
        setattr(causal_lm.config, id_attr, getattr(vocab, id_attr))

    return causal_lm.to(settings.device)


def format_dataset_row(
    row, for_test: bool, unique_from_accounts: set, eos_token: str
) -> Dict[Literal["text"], str]:
    """Render one transaction row as a prompt and, for training, its answer.

    Args:
        row: Mapping of column name to value; must contain ``"from_account"``.
            The mapping is copied, not mutated.
        for_test: When true the answer is left blank, so the prompt ends at
            ``answer:``; otherwise the account and ``eos_token`` are appended.
        unique_from_accounts: Candidate account names listed in the prompt.
        eos_token: End-of-sequence marker appended after the answer.

    Returns:
        ``{"text": prompt}``.

    Raises:
        KeyError: If ``row`` has no ``"from_account"`` entry.
    """
    row = dict(row)
    from_account = row.pop("from_account")
    # Iteration order of a set of strings changes between interpreter sessions
    # (hash randomisation); sort so the same accounts always give the same prompt.
    possible_accounts = ", ".join(sorted(unique_from_accounts))
    answer = "" if for_test else f"{from_account}{eos_token}"
    formatted = dedent(
        f"""
        <possible accounts>{possible_accounts}</possible accounts>
        <transaction>{json.dumps(row)}</transaction>
        which account did this transaction come from?
        answer: {answer}
    """
    ).strip()
    return {"text": formatted}


def training_dataset_init(tokenizer) -> DatasetDict:
    """Load the transactions dataset from the Hub and tokenize it for training.

    Every row of the ``train`` and ``test`` splits is rendered with
    ``format_dataset_row`` (answers included), the ``test`` split is renamed to
    ``validation``, and the resulting text is tokenized.

    Args:
        tokenizer: Tokenizer used to encode the prompts; its ``eos_token`` is
            appended to each answer.

    Returns:
        A ``DatasetDict`` with ``train`` and ``validation`` splits in ``"pt"``
        format.
    """
    dataset = load_dataset(f"{settings.hf_user_name}/{settings.hf_dataset_repo_name}")

    # Union per-split sets instead of concatenating the columns with `+`:
    # concatenation only works when column access returns a plain list.
    unique_from_accounts = set(dataset["train"]["from_account"]) | set(
        dataset["test"]["from_account"]
    )
    format_for_train = partial(
        format_dataset_row,
        for_test=False,
        unique_from_accounts=unique_from_accounts,
        eos_token=tokenizer.eos_token,
    )

    # The Hub "test" split is used as the validation split during fine-tuning.
    dataset["train"] = dataset["train"].map(format_for_train)
    dataset["validation"] = dataset["test"].map(format_for_train)
    del dataset["test"]

    # Raw columns (and the intermediate prompt text) are dropped after tokenizing.
    remove_columns = [
        "amount",
        "month",
        "day",
        "year",
        "vendor",
        "from_account",
        "text",
    ]
    # NOTE(review): no truncation is requested here — confirm the prompts fit the
    # model's context window.
    dataset = dataset.map(
        lambda batch: tokenizer(batch["text"]),
        batched=True,
        remove_columns=remove_columns,
    )
    dataset.set_format("pt")
    return dataset

In [4]:
# Tokenized train/validation splits consumed by the learning-rate search below.
dataset = training_dataset_init(tokenizer_init())

## Identify Best Learning Rate

In [None]:
# Short single-epoch runs used only to rank learning rates: nothing is
# checkpointed (`save_strategy="no"`) or pushed to the Hub.
training_args = TrainingArguments(
    output_dir="/tmp/lr_search",
    num_train_epochs=1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_strategy="epoch",
    disable_tqdm=False,
    push_to_hub=False,
    log_level="error",
    save_strategy="no",
    report_to="none",
)

# `model_init` (rather than a model instance) lets the Trainer build a fresh
# model for every trial.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer_init(), mlm=False
    ),
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)

# `hp_space` is written against the Optuna trial API (`trial.suggest_float`), so
# pin the backend instead of letting the Trainer pick whichever HPO library is
# installed. No `compute_objective` is given, so trials are ranked by the
# Trainer's default objective.
best_run = trainer.hyperparameter_search(
    hp_space=lambda trial: {
        "learning_rate": trial.suggest_float("learning_rate", 1e-7, 1e-3, log=True)
    },
    backend="optuna",
    n_trials=10,
    direction="minimize",
)

best_run

## Training Run

In [5]:
n_epochs = 15  # will likely stop early
# Presumably copied from the learning-rate search above — re-run the search if
# the data or batch size changes.
best_learning_rate = 0.00012694864775774771

tokenizer = tokenizer_init()
dataset = training_dataset_init(tokenizer)

# Evaluate and checkpoint every epoch so the checkpoint with the lowest
# validation loss can be restored when training ends.
training_args = TrainingArguments(
    output_dir="/tmp/gpt_1_causal_finetune",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=n_epochs,
    learning_rate=best_learning_rate,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=True,
    hub_model_id=f"{settings.hf_user_name}/{settings.gpt_1_causal_finetune}",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    # Save the tokenizer (with the added <pad>/<eos> tokens) next to the model in
    # checkpoints and Hub pushes, so the resized embeddings are usable from the
    # repo alone.
    processing_class=tokenizer,
    # Stop once validation loss has failed to improve for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,1.8855,0.375536
2,0.2855,0.203367
3,0.1896,0.185649
4,0.1706,0.179815
5,0.1606,0.175083
6,0.1536,0.173203
7,0.1483,0.1706
8,0.1451,0.170205
9,0.1424,0.170229
10,0.1403,0.169337


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=180, training_loss=0.2735820220576392, metrics={'train_runtime': 301.9658, 'train_samples_per_second': 36.908, 'train_steps_per_second': 0.596, 'total_flos': 896033298327552.0, 'train_loss': 0.2735820220576392, 'epoch': 15.0})

In [6]:
# Final upload of the trained model to the Hub repo configured via `hub_model_id`.
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/jacob-danner/gpt_1_causual_finetune/commit/9c174f68a07045c93abaa3e6d0438e591dd9e194', commit_message='End of training', commit_description='', oid='9c174f68a07045c93abaa3e6d0438e591dd9e194', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jacob-danner/gpt_1_causual_finetune', endpoint='https://huggingface.co', repo_type='model', repo_id='jacob-danner/gpt_1_causual_finetune'), pr_revision=None, pr_num=None)