In [1]:
# Set TOKENIZERS_PARALLELISM=false for this kernel before any tokenizer is created.
# NOTE(review): presumably silences the Hugging Face tokenizers parallelism/fork warning — confirm.
%set_env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [2]:
from datasets import load_dataset
import evaluate
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer,
    Trainer, TrainingArguments
)
import numpy as np
import random
import torch
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
)

# Seed applied to Python's `random`, NumPy and PyTorch in this notebook.
SEED = 42
# NOTE(review): not referenced by any code visible in this notebook.
MAX_LENGTH = 8192
# Names of the eight target labels. Their order is used as the column order when
# metrics are reported; it is assumed to match the dataset's `labels` field.
LABELS = [
    f'label_{aspect}'
    for aspect in (
        'recommended', 'story', 'gameplay', 'visual',
        'audio', 'technical', 'price', 'suggestion',
    )
]

2024-06-11 21:15:06.966188: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Seed Python's `random`, NumPy's global RNG and PyTorch with the same value.
random.seed(SEED)
np.random.seed(SEED)
# Last expression of the cell, so its return value (a torch Generator) is displayed.
torch.manual_seed(SEED)

<torch._C.Generator at 0x79f725c049b0>

In [4]:
def encode(examples, tokenizer, max_length=None):
    """Tokenize the ``cleaned_review`` field of a batch of examples.

    Intended as the mapping function for ``Dataset.map(..., batched=True)``,
    with the tokenizer supplied through ``fn_kwargs``.

    Args:
        examples: Mapping with a ``'cleaned_review'`` entry holding the text(s)
            to tokenize.
        tokenizer: Callable tokenizer. It is always invoked with
            ``truncation=True``.
        max_length: Optional explicit cap on the tokenized length. When
            ``None`` (the default) no ``max_length`` is forwarded and the
            tokenizer applies its own limit, exactly as before this parameter
            existed.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    tokenizer_kwargs = {'truncation': True}
    # Only forward an explicit limit when one was requested, so the default
    # call stays identical to ``tokenizer(texts, truncation=True)``.
    if max_length is not None:
        tokenizer_kwargs['max_length'] = max_length
    return tokenizer(examples['cleaned_review'], **tokenizer_kwargs)


def evaluate(y_test, y_pred, labels):
    """Print multi-label classification metrics for a set of predictions.

    Printed, in order: the overall accuracy across all labels, the accuracy of
    each individual label column, the macro-averaged F1 score, and
    scikit-learn's per-label classification report.

    NOTE: this function has the same name as the ``evaluate`` package imported
    at the top of the notebook; once this cell has run, the name ``evaluate``
    refers to this function, not to the package.

    Args:
        y_test: 2-D array-like of ground-truth indicators with one row per
            sample and one column per label.
        y_pred: 2-D array-like of predicted indicators with the same layout.
        labels: Label names, one per column, in column order.

    Returns:
        None. All results are written to stdout.
    """
    # Coerce up front so nested lists also support the column slicing below.
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)

    accuracy = accuracy_score(y_test, y_pred)
    print(f'Overall accuracy: {accuracy}')
    for idx, label in enumerate(labels):
        label_accuracy = accuracy_score(y_test[:, idx], y_pred[:, idx])
        print(f'Accuracy {label}: {label_accuracy}')

    # zero_division=0 keeps the macro F1 consistent with the report below and
    # avoids undefined-metric warnings for labels that are never predicted.
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
    print(f'F1 macro: {f1}')
    print(
        classification_report(y_test, y_pred, target_names=labels, digits=4, zero_division=0)
    )


def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    Works element-wise on scalars and on array-likes of any shape, so no
    ``np.vectorize`` wrapper is required.

    Args:
        x: Scalar or array-like of real numbers (e.g. raw logits).

    Returns:
        A NumPy float64 scalar for scalar input, otherwise a float64 array
        with the same shape as ``x``.
    """
    x = np.asarray(x, dtype=np.float64)
    # exp() only ever receives non-positive arguments here, so it cannot
    # overflow no matter how large the magnitude of the input is.
    decay = np.exp(-np.abs(x))
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # ``[()]`` turns a 0-d result back into a scalar and leaves arrays as-is.
    return result[()]


# Kept for backward compatibility: ``sigmoid`` is already element-wise, which
# avoids np.vectorize's per-element Python loop and its failure on empty input.
sigmoid_v = sigmoid

In [5]:
def run(
    model_name, gradient_accumulation_steps, num_train_epochs,
    learning_rate, weight_decay, warmup_ratio
):
    """Fine-tune ``model_name`` on the Steam review aspect dataset and print test metrics.

    The model is loaded as a multi-label sequence classifier with one output
    per entry in ``LABELS``. It is trained on the ``train`` split with no
    evaluation during training, then used to predict the ``test`` split.
    Logits are passed through a sigmoid and thresholded at 0.5 before the
    metrics are printed by ``evaluate``.

    Args:
        model_name: Hugging Face Hub id (or path) of the pretrained checkpoint.
            Its last path component is used in the output directory name.
        gradient_accumulation_steps: Forwarded to ``TrainingArguments``; also
            used as ``eval_accumulation_steps``.
        num_train_epochs: Number of training epochs.
        learning_rate: Initial learning rate.
        weight_decay: Weight decay coefficient.
        warmup_ratio: Fraction of training steps used for warm-up.

    Returns:
        None. Metrics are printed; training artefacts are written under
        ``final_<model name>``.
    """
    # Re-seed inside the function so newly initialised weights (e.g. the
    # classification head) do not depend on how much randomness the kernel
    # consumed before this call; repeated calls therefore start identically.
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    # NOTE: trust_remote_code=True runs modelling code shipped with the
    # checkpoint repository; only use it with repositories you trust.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, trust_remote_code=True,
        # Derived from LABELS so the head size cannot drift from the label list.
        num_labels=len(LABELS), problem_type='multi_label_classification'
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    ds_all = load_dataset('ilos-vigil/steam-review-aspect-dataset')
    ds_all = ds_all.map(encode, batched=True, fn_kwargs={'tokenizer': tokenizer})

    training_args = TrainingArguments(
        output_dir=f'final_{model_name.split("/")[-1]}',
        eval_strategy='no',
        bf16=True,
        dataloader_drop_last=False,
        report_to='tensorboard',
        # Batch size 1 plus gradient checkpointing; the effective batch size
        # comes from gradient accumulation.
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_checkpointing=True,
        # Tie the Trainer's own seeding to the notebook-wide seed.
        seed=SEED,
        # param from ray tune
        gradient_accumulation_steps=gradient_accumulation_steps,
        eval_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_ratio=warmup_ratio
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        train_dataset=ds_all['train']
    )
    trainer.train()

    test_output = trainer.predict(ds_all['test'])
    # Convert logits to probabilities, then to hard 0/1 decisions per label.
    probabilities = sigmoid_v(test_output.predictions)
    y_pred = (probabilities > 0.5).astype(np.int32)

    evaluate(np.array(ds_all['test']['labels']), y_pred, LABELS)

In [6]:
# Best hyperparameters from the 16 trials run before the hyperparameter search stopped halfway:
# ╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
# │ Trial name            status         ...ccumulation_steps     num_train_epochs     learning_rate     weight_decay     warmup_ratio     iter     total time (s)     eval_loss     eval_precision     eval_recall     eval_f1 │
# ├─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
# │ _objective_47d46dc3   TERMINATED                       16                    5       4.95377e-05      0.000603811        0.0214077        5           441.176       0.348702           0.860371        0.854684    0.857066 │
# ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
# Final training run using the unrounded hyperparameter values of trial
# _objective_47d46dc3 from the table above.
run(
    model_name='jinaai/jina-embeddings-v2-base-en',
    gradient_accumulation_steps=16,
    num_train_epochs=5,
    learning_rate=4.9537713401096075e-05,
    weight_decay=0.0006038110820661773,
    warmup_ratio=0.021407687322013313
)

Some weights of JinaBertForSequenceClassification were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


  0%|          | 0/280 [00:00<?, ?it/s]



{'train_runtime': 466.6251, 'train_samples_per_second': 9.644, 'train_steps_per_second': 0.6, 'train_loss': 0.34732862200055803, 'epoch': 4.98}


  0%|          | 0/200 [00:00<?, ?it/s]

Overall accuracy: 0.335
Accuracy label_recommended: 0.89
Accuracy label_story: 0.855
Accuracy label_gameplay: 0.87
Accuracy label_visual: 0.84
Accuracy label_audio: 0.955
Accuracy label_technical: 0.87
Accuracy label_price: 0.88
Accuracy label_suggestion: 0.885
F1 macro: 0.7354135529174992
                   precision    recall  f1-score   support

label_recommended     0.9091    0.9459    0.9272       148
      label_story     0.8488    0.8202    0.8343        89
   label_gameplay     0.8765    0.9675    0.9198       154
     label_visual     0.8235    0.8046    0.8140        87
      label_audio     0.9200    0.9020    0.9109        51
  label_technical     0.8298    0.6842    0.7500        57
      label_price     0.7805    0.6809    0.7273        47
 label_suggestion     0.0000    0.0000    0.0000        21

        micro avg     0.8646    0.8394    0.8518       654
        macro avg     0.7485    0.7257    0.7354       654
     weighted avg     0.8373    0.8394    0.8369       654