In [1]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [2]:
from datasets import load_dataset
import evaluate
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer,
    Trainer, TrainingArguments,
    BitsAndBytesConfig
)
from peft import (
    prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType
)
import numpy as np
import random
import torch
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
)

2024-06-05 23:52:08.953567: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
SEED = 42
MAX_LENGTH = 32768
INSTRUCTION = 'Classify the aspect mentioned in the given Steam Review into up to of the eight aspects: recommended, story, gameplay, visual, audio, technical, price, and suggestion.'  # This mimic paper's string instruction
LABELS = [
    'label_recommended', 'label_story', 'label_gameplay', 'label_visual',
    'label_audio', 'label_technical', 'label_price', 'label_suggestion'
]

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x79145f200a10>

In [4]:
def encode(examples, tokenizer):
    outputs = tokenizer(
        [INSTRUCTION + s for s in examples['cleaned_review']],
        truncation=True, max_length=MAX_LENGTH
    )
    return outputs


def load_model():
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    lora_config = LoraConfig(
        #
        r=16,
        lora_alpha=8,
        lora_dropout=0.05,
        bias='none',
        # use_rslora=True,
        task_type='CAUSAL_LM',
        target_modules=[
            'q_proj',
            # 'k_proj',
            'v_proj',
            # 'o_proj',
            # 'gate_proj',
            # 'up_proj',
            # 'down_proj',
            # 'embed_tokens',
            # 'lm_head',
        ],
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        'intfloat/e5-mistral-7b-instruct', trust_remote_code=True,
        num_labels=8, problem_type='multi_label_classification',
        quantization_config=quantization_config,
        # token='HF_XXX'
    )
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    return model


def sigmoid(x):
    return 1/(1 + np.exp(-x))


metric = evaluate.combine(['precision', 'recall', 'f1'])
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = sigmoid(predictions)
    predictions = (predictions > 0.5).astype(int).reshape(-1)
    return metric.compute(predictions=predictions, references=labels.astype(int).reshape(-1), average='macro')


def sigmoid(x):
    return 1 / (1 + np.exp(-x)) 
sigmoid_v = np.vectorize(sigmoid)


def evaluate(y_test, y_pred, labels):
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Overall accuracy: {accuracy}')
    for idx, label in enumerate(labels):
        label_accuracy = accuracy_score(y_test[:, idx], y_pred[:, idx])
        print(f'Accuracy {label}: {label_accuracy}')

    f1 = f1_score(y_test, y_pred, average='macro')
    print(f'F1 macro: {f1}')
    print(
        classification_report(y_test, y_pred, target_names=labels, digits=4, zero_division=0)
    )

In [5]:
model = load_model()
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-mistral-7b-instruct')
ds_all = load_dataset('ilos-vigil/steam-review-aspect-dataset')
ds_all = ds_all.map(encode, batched=True, fn_kwargs={'tokenizer': tokenizer})

training_args = TrainingArguments(
    #
    output_dir='final',
    logging_steps=5,
    report_to='tensorboard',
    #
    dataloader_drop_last=False,
    eval_strategy='no',
    #
    bf16=True,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    eval_accumulation_steps=16,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    # alternative approach
    num_train_epochs=2,
    learning_rate=0.0002,
    weight_decay=0.0,
    warmup_steps=5,
    adam_beta1=0.9,
    adam_beta2=0.95,
)
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=ds_all['train'],
    compute_metrics=compute_metrics,
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-mistral-7b-instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,815,744 || all params: 7,117,508,608 || trainable%: 0.0958


In [6]:
import gc
gc.collect()
with torch.no_grad():
    torch.cuda.empty_cache()
trainer.train()

  0%|          | 0/112 [00:00<?, ?it/s]



{'loss': 2.7943, 'grad_norm': 5.335930347442627, 'learning_rate': 0.0002, 'epoch': 0.09}
{'loss': 1.7291, 'grad_norm': 5.490852355957031, 'learning_rate': 0.00019065420560747664, 'epoch': 0.18}
{'loss': 1.4007, 'grad_norm': 5.1768951416015625, 'learning_rate': 0.0001813084112149533, 'epoch': 0.27}
{'loss': 1.0765, 'grad_norm': 3.4315829277038574, 'learning_rate': 0.00017196261682242992, 'epoch': 0.36}
{'loss': 0.8738, 'grad_norm': 3.458270788192749, 'learning_rate': 0.00016261682242990654, 'epoch': 0.44}
{'loss': 0.8445, 'grad_norm': 4.763637542724609, 'learning_rate': 0.00015327102803738317, 'epoch': 0.53}
{'loss': 0.7836, 'grad_norm': 3.2394015789031982, 'learning_rate': 0.00014392523364485982, 'epoch': 0.62}
{'loss': 0.6852, 'grad_norm': 3.784304618835449, 'learning_rate': 0.00013457943925233645, 'epoch': 0.71}
{'loss': 0.6499, 'grad_norm': 4.912858486175537, 'learning_rate': 0.00012523364485981308, 'epoch': 0.8}
{'loss': 0.5499, 'grad_norm': 2.4500560760498047, 'learning_rate': 0.0

TrainOutput(global_step=112, training_loss=0.7603069229849747, metrics={'train_runtime': 3867.0998, 'train_samples_per_second': 0.465, 'train_steps_per_second': 0.029, 'total_flos': 2.887580881900339e+16, 'train_loss': 0.7603069229849747, 'epoch': 1.991111111111111})

In [7]:
y_pred = trainer.predict(ds_all['test'])
y_pred = np.where(
    sigmoid_v(y_pred.predictions) > 0.5, 1, 0
).astype(np.int32)

evaluate(np.array(ds_all['test']['labels']), y_pred, LABELS)

  0%|          | 0/200 [00:00<?, ?it/s]

Overall accuracy: 0.175
Accuracy label_recommended: 0.85
Accuracy label_story: 0.78
Accuracy label_gameplay: 0.83
Accuracy label_visual: 0.64
Accuracy label_audio: 0.76
Accuracy label_technical: 0.795
Accuracy label_price: 0.83
Accuracy label_suggestion: 0.895
F1 macro: 0.602655223199517
                   precision    recall  f1-score   support

label_recommended     0.8782    0.9257    0.9013       148
      label_story     0.7778    0.7079    0.7412        89
   label_gameplay     0.8333    0.9740    0.8982       154
     label_visual     0.5843    0.5977    0.5909        87
      label_audio     0.5455    0.3529    0.4286        51
  label_technical     0.6481    0.6140    0.6306        57
      label_price     0.6444    0.6170    0.6304        47
 label_suggestion     0.0000    0.0000    0.0000        21

        micro avg     0.7586    0.7401    0.7492       654
        macro avg     0.6140    0.5987    0.6027       654
     weighted avg     0.7239    0.7401    0.7286       654
 