In [1]:
import os

import torch
from datasets import load_dataset
from huggingface_hub import login
from peft import LoraConfig
from torch import tensor
from torcheval.metrics.functional import multiclass_f1_score, multiclass_accuracy
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments, DataCollatorWithPadding
)
from trl import SFTTrainer

In [2]:
# prevent env load failed
%load_ext dotenv
%dotenv

In [3]:
login(token=os.environ.get("HF_TOKEN", ""), add_to_git_credential=True)

Token is valid (permission: read).
Your token has been saved in your configured git credential helpers (osxkeychain).
Your token has been saved to /Users/hermeschen/.cache/huggingface/token
Login successful


# Load Dataset

In [4]:
dataset = load_dataset("benjaminbeilharz/better_daily_dialog", num_proc=16, trust_remote_code=True).remove_columns(["dialog_id", "turn_type"])

In [5]:
emotion_labels: list = ["neutral", "anger", "disgust", "fear", "happiness", "sadness", "surprise"]

In [6]:
dataset = dataset.map(lambda samples: {
    "label": [emotion_labels[sample] for sample in samples["emotion"]],
    "text": [sample.strip() for sample in samples["utterance"]]
}, remove_columns=["utterance", "emotion"], batched=True, num_proc=16)

In [7]:
dataset["train"][0]

{'label': 'neutral',
 'text': 'Say , Jim , how about going for a few beers after dinner ?'}

## Load Tokenizer

In [8]:
base_model_name: str = "michellejieli/emotion_text_classifier"

In [9]:
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.truncation_side = "right"
tokenizer.padding_side = "right"

In [10]:
def formatting_function(sample: str) -> tensor:
    return tokenizer.encode(sample, truncation=True)

In [11]:
dataset = dataset.map(lambda samples: {
    "tokenized_text": [formatting_function(sample) for sample in samples]
}, input_columns="text", batched=True, num_proc=16)

## Configurations

In [12]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

In [13]:
peft_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="SEQ_CLS"
)

In [14]:
num_train_epochs: int = 3

In [15]:
def compute_metrics(prediction):
    outputs, labels = prediction
    predictions = torch.argmax(outputs, dim=-1)
    
    f1_score = multiclass_f1_score(labels, predictions, num_classes=len(emotion_labels), average="micro"),
    accuracy = multiclass_accuracy(labels, predictions, num_classes=len(emotion_labels))
    
    return {"accuracy": accuracy, "f1": f1_score}

In [16]:
trainer_arguments = TrainingArguments(
    output_dir="./checkpoints",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    eval_accumulation_steps=1,
    eval_delay=100,
    learning_rate=2e-5,
    weight_decay=0.001,
    # max_grad_norm=0.3,
    num_train_epochs=num_train_epochs,
    # lr_scheduler_type="constant",
    warmup_ratio=0.03,
    max_steps=-1,
    logging_steps=25,
    save_steps=25,
    save_total_limit=5,
    bf16=False,
    fp16=False,
    eval_steps=25,
    dataloader_num_workers=16,
    optim="paged_adamw_32bit",
    group_by_length=True,
    # report_to=["wandb"],
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": True},
    auto_find_batch_size=True,
    torch_compile=False,
    resume_from_checkpoint=True
)

## Load Model

In [17]:
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    # quantization_config=quantization_config if torch.cuda.is_available() else None,
    num_labels=len(emotion_labels),
    # is_decoder=True,
    device_map="auto",
    low_cpu_mem_usage=True
)

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of RobertaForCausalLM were not initialized from the model checkpoint at michellejieli/emotion_text_classifier and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

In [None]:
base_model.tra

## Setup Tuner

In [19]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length", return_tensors="pt")

In [20]:
tuner = SFTTrainer(
    model=base_model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    dataset_text_field="tokenized_text",
    data_collator=data_collator,
    tokenizer=tokenizer,
    # formatting_func=formatting_function,
    peft_config=peft_parameters,
    args=trainer_arguments,
    max_seq_length=512,
    dataset_num_proc=16,
)

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


Map (num_proc=16):   0%|          | 0/87170 [00:00<?, ? examples/s]

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
tuner.predict(dataset["test"])

In [None]:
tuner.train()

In [None]:
tuner.model = torch.compile(tuner.model)
tuner.save_model("./model")