In [1]:
import os
import sys

sys.path.append(
    os.path.abspath(
        os.path.join(os.path.dirname(os.path.abspath("__file__")), "../../..")
    )
)

In [2]:
import json
from functools import partial
from textwrap import dedent
from typing import Dict, Literal

from datasets import DatasetDict, load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, EarlyStoppingCallback, Trainer, TrainingArguments

from src.shared.config import settings

## Utils

In [3]:
HF_GPT_ID = "openai-gpt"
FINETUNED_HF_GPT_ID = f"{settings.hf_user_name}/{settings.gpt_1_sequence_classification}"


def tokenizer_init():
    tokenizer = AutoTokenizer.from_pretrained(HF_GPT_ID)
    tokenizer.add_special_tokens(
        {"pad_token": "<pad>"}
    )  # gpt-1 tokenizer lacks this by default
    return tokenizer


def model_init():
    tokenizer = tokenizer_init()
    model = AutoModelForSequenceClassification.from_pretrained(FINETUNED_HF_GPT_ID, num_labels=9)
    model.resize_token_embeddings(
        len(tokenizer), mean_resizing=False
    )  # extend the embedding layer to handle padding and eos tokens
    model.config.pad_token_id = tokenizer.pad_token_id
    return model.to(settings.device)


def format_dataset_row(
    row, unique_from_accounts: set, label_mapping: Dict[str, int]
) -> Dict[Literal["text"], str]:
    row = dict(row)
    from_account = row.pop("from_account")
    formatted = dedent(
    f"""
        <possible accounts>{', '.join(unique_from_accounts)}</possible accounts>
        <transaction>{json.dumps(row)}</transaction>
        which account did this transaction come from?
    """
    ).strip()
    return {"text": formatted, "labels": label_mapping[from_account]}


def test_dataset_init(tokenizer) -> DatasetDict:
    dataset = load_dataset(f"{settings.hf_user_name}/{settings.hf_dataset_repo_name}")

    unique_from_accounts = set(
        dataset["train"]["from_account"] + dataset["test"]["from_account"]
    )
    label_mapping = {account: idx for idx, account in enumerate(unique_from_accounts)}
    format_dataset_row_partial = partial(
        format_dataset_row,
        unique_from_accounts=unique_from_accounts,
        label_mapping=label_mapping,
    )

    dataset["test"] = dataset["test"].map(format_dataset_row_partial)
    del dataset["train"]

    remove_columns = [
        "amount",
        "month",
        "day",
        "year",
        "vendor",
        "from_account",
        "text"
    ]
    dataset = dataset.map(
        lambda batch: tokenizer(batch["text"]),
        batched=True,
        remove_columns=remove_columns,
    )
    dataset.set_format("pt")
    return dataset

## Score

In [4]:
training_args = TrainingArguments(
    output_dir="/tmp/eval",
    per_device_eval_batch_size=64,
    report_to="none",
)

trainer = Trainer(
    model=model_init(),
    args=training_args,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer_init()),
)

dataset = test_dataset_init(tokenizer_init())

predictions = trainer.predict(dataset["test"])
actual = predictions.label_ids
predictions = predictions.predictions.argmax(-1)

accuracy = sum(1 for pred, act in zip(predictions, actual) if pred == act) / len(predictions)
accuracy

config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/466M [00:00<?, ?B/s]

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

0.18181818181818182

## Example

In [5]:
predictions

array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5])

In [6]:
example = dataset["test"]["text"][21]
actual = dataset["test"]["from_account"][21]
prediction = pipeline(example)

print(example, actual, prediction, sep="\n\n")

<possible accounts>Assets:Discover:Main:Needs:Groceries, Assets:Discover:Main:Needs:Monthly, Assets:Discover:Main:Wants:Monthly, Assets:Discover:Main:Needs:Other, Assets:Discover:Travel, Assets:Discover:FutureWants, Assets:Discover:Main:Wants:Other, Assets:Discover:Main:Needs:Gas, Assets:Discover:Furniture</possible accounts>
<transaction>{"amount": 89.99, "month": 6, "day": 4, "year": 2023, "vendor": "WALMART"}</transaction>
which account did this transaction come from?
answer:

Assets:Discover:Main:Needs:Groceries

Assets:Discover:Main:Needs:Groceries


In [15]:
model = model_init()

In [16]:
model

OpenAIGPTForSequenceClassification(
  (transformer): OpenAIGPTModel(
    (tokens_embed): Embedding(40479, 768)
    (positions_embed): Embedding(512, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (attn): Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)