In [1]:
# Set TOKENIZERS_PARALLELISM=false for this kernel before any tokenizer is created.
# NOTE(review): presumably this silences the tokenizers parallelism/fork warning — confirm.
%set_env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [2]:
from datasets import load_dataset
import evaluate
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer,
    Trainer, TrainingArguments
)
import numpy as np
import random
import torch
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
)

# Seed value handed to the Python, NumPy and PyTorch random number generators.
SEED = 42
# Intended maximum input length — presumably in tokens; TODO confirm it is
# actually applied wherever the reviews are tokenized.
MAX_LENGTH = 8192
# Names of the eight aspect labels. Their order is used to name the columns of
# the label/prediction matrices when metrics are reported.
# NOTE(review): assumed to match the column order of the dataset's `labels`
# field — confirm against the dataset card.
LABELS = [
    'label_recommended', 'label_story', 'label_gameplay', 'label_visual',
    'label_audio', 'label_technical', 'label_price', 'label_suggestion'
]

2024-06-04 20:09:47.915549: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Seed Python's `random`, NumPy's global RNG and PyTorch with the same value.
random.seed(SEED)
np.random.seed(SEED)
# `torch.manual_seed` returns the torch Generator; as the last expression of
# the cell, that object is what the notebook echoes as the cell output.
torch.manual_seed(SEED)

<torch._C.Generator at 0x712da29eca50>

In [4]:
def encode(examples, tokenizer, max_length=None):
    """Tokenize the ``cleaned_review`` entry of a batch of examples.

    Args:
        examples: Batch mapping that contains a ``'cleaned_review'`` entry.
        tokenizer: Callable tokenizer. It is invoked with the reviews plus
            ``truncation=True`` and ``max_length``.
        max_length: Truncation length handed to the tokenizer. When ``None``
            (the default) the notebook-level ``MAX_LENGTH`` is used, capped
            by ``tokenizer.model_max_length`` when the tokenizer exposes one.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    if max_length is None:
        # MAX_LENGTH is declared in the configuration cell; without passing it
        # here, truncation silently falls back to the tokenizer's own limit.
        # Capping by that limit keeps models with a shorter context working.
        tokenizer_limit = getattr(tokenizer, 'model_max_length', None)
        if tokenizer_limit is None:
            max_length = MAX_LENGTH
        else:
            max_length = min(MAX_LENGTH, tokenizer_limit)
    return tokenizer(
        examples['cleaned_review'], truncation=True, max_length=max_length
    )


def evaluate(y_test, y_pred, labels):
    """Print classification metrics for a matrix of multi-label predictions.

    Printed, in order: ``accuracy_score`` over the full label matrices (for
    multi-label input scikit-learn counts a row as correct only when every
    label matches), the accuracy of each individual label column, the
    macro-averaged F1 score, and scikit-learn's classification report.

    Note: this definition rebinds the name ``evaluate``, which the import
    cell also binds to the ``evaluate`` package.

    Args:
        y_test: 2-D array of true labels, one column per entry of ``labels``.
        y_pred: 2-D array of predicted labels with the same layout.
        labels: Label names, in column order.
    """
    print(f'Overall accuracy: {accuracy_score(y_test, y_pred)}')

    # One accuracy line per label, comparing matching columns.
    for column, name in enumerate(labels):
        column_accuracy = accuracy_score(y_test[:, column], y_pred[:, column])
        print(f'Accuracy {name}: {column_accuracy}')

    macro_f1 = f1_score(y_test, y_pred, average='macro')
    print(f'F1 macro: {macro_f1}')

    report = classification_report(
        y_test,
        y_pred,
        target_names=labels,
        digits=4,
        zero_division=0,
    )
    print(report)


def sigmoid(x):
    """Return the element-wise logistic sigmoid ``1 / (1 + exp(-x))``.

    Works directly on scalars and on arrays of any shape (including empty
    ones). The computation is carried out in float64 regardless of the input
    dtype, and is arranged so that it cannot overflow for inputs of large
    magnitude.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        A NumPy float64 scalar for scalar input, otherwise a float64 array
        with the same shape as ``x``.
    """
    x = np.asarray(x, dtype=np.float64)
    # exp(-|x|) always lies in (0, 1], so neither branch below can overflow.
    decay = np.exp(-np.abs(x))
    # For x >= 0 this is exactly 1 / (1 + exp(-x)); for x < 0 the equivalent
    # form exp(x) / (1 + exp(x)) avoids evaluating exp of a large positive value.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with an empty tuple unwraps a 0-d result to a NumPy scalar and
    # leaves arrays with one or more dimensions unchanged.
    return result[()]


# `sigmoid` already operates element-wise on arrays, so the vectorised name is
# kept only as an alias for existing callers (no per-element Python loop).
sigmoid_v = sigmoid

In [5]:
def run(
    model_name, gradient_accumulation_steps, num_train_epochs,
    learning_rate, weight_decay, warmup_ratio
):
    """Fine-tune a sequence classifier on the train split and print test metrics.

    Loads ``model_name`` as a multi-label sequence-classification model,
    tokenizes the ``ilos-vigil/steam-review-aspect-dataset`` dataset with
    ``encode``, trains on its ``train`` split without intermediate
    evaluation, predicts on the ``test`` split, thresholds the sigmoid of the
    predictions at 0.5 and prints the metrics via ``evaluate``.

    Args:
        model_name: Hub identifier of the checkpoint; its last path component
            also names the output directory (``final_<name>``).
        gradient_accumulation_steps: Forwarded to ``TrainingArguments`` as
            both ``gradient_accumulation_steps`` and ``eval_accumulation_steps``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        learning_rate: Forwarded to ``TrainingArguments``.
        weight_decay: Forwarded to ``TrainingArguments``.
        warmup_ratio: Forwarded to ``TrainingArguments``.

    Returns:
        None. Metrics are printed.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, trust_remote_code=True,
        # Derived from LABELS so the classifier head cannot drift out of sync
        # with the label list used when reporting metrics.
        num_labels=len(LABELS), problem_type='multi_label_classification'
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    ds_all = load_dataset('ilos-vigil/steam-review-aspect-dataset')
    ds_all = ds_all.map(encode, batched=True, fn_kwargs={'tokenizer': tokenizer})

    training_args = TrainingArguments(
        output_dir=f'final_{model_name.split("/")[-1]}',
        eval_strategy='no',
        bf16=True,
        dataloader_drop_last=False,
        report_to='tensorboard',
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_checkpointing=True,
        # Trainer re-seeds the RNGs from this argument, so tie it to the
        # notebook-level SEED instead of relying on the library default.
        seed=SEED,
        # Hyperparameters below come from the Ray Tune search.
        gradient_accumulation_steps=gradient_accumulation_steps,
        eval_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_ratio=warmup_ratio
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        train_dataset=ds_all['train']
    )
    trainer.train()

    test_output = trainer.predict(ds_all['test'])
    # A label is predicted positive when its sigmoid probability exceeds 0.5.
    y_pred = (sigmoid_v(test_output.predictions) > 0.5).astype(np.int32)

    evaluate(np.array(ds_all['test']['labels']), y_pred, LABELS)

In [6]:
# Best hyperparameters among the 16 trials that ran before the search stopped halfway.
# ╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
# │ Trial name            status         ...ccumulation_steps     num_train_epochs     learning_rate     weight_decay     warmup_ratio     iter     total time (s)     eval_loss     eval_precision     eval_recall     eval_f1 │
# ├─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
# │ _objective_c2d7bce8   TERMINATED                       16                    5       3.03063e-05      0.00312782         0.0196097        5            863.847      0.223566           0.913893        0.912368    0.913099 │
# ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
# Final training run on Alibaba-NLP/gte-large-en-v1.5; the keyword values are
# copied from the trial row in the table above.
run(
    model_name='Alibaba-NLP/gte-large-en-v1.5',
    gradient_accumulation_steps=16,
    num_train_epochs=5,
    learning_rate=3.03063e-05,
    weight_decay=0.00312782,
    warmup_ratio=0.0196097
)

Some weights of NewForSequenceClassification were not initialized from the model checkpoint at Alibaba-NLP/gte-large-en-v1.5 and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/280 [00:00<?, ?it/s]



{'train_runtime': 1009.686, 'train_samples_per_second': 4.457, 'train_steps_per_second': 0.277, 'train_loss': 0.23595480237688338, 'epoch': 4.98}


  0%|          | 0/200 [00:00<?, ?it/s]

Overall accuracy: 0.475
Accuracy label_recommended: 0.94
Accuracy label_story: 0.895
Accuracy label_gameplay: 0.895
Accuracy label_visual: 0.91
Accuracy label_audio: 0.97
Accuracy label_technical: 0.875
Accuracy label_price: 0.895
Accuracy label_suggestion: 0.905
F1 macro: 0.8231023676252327
                   precision    recall  f1-score   support

label_recommended     0.9474    0.9730    0.9600       148
      label_story     0.8864    0.8764    0.8814        89
   label_gameplay     0.9182    0.9481    0.9329       154
     label_visual     0.8710    0.9310    0.9000        87
      label_audio     0.9245    0.9608    0.9423        51
  label_technical     0.7963    0.7544    0.7748        57
      label_price     0.7955    0.7447    0.7692        47
 label_suggestion     0.5833    0.3333    0.4242        21

        micro avg     0.8901    0.8914    0.8908       654
        macro avg     0.8403    0.8152    0.8231       654
     weighted avg     0.8845    0.8914    0.8865       6