In [1]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [2]:
from datasets import load_dataset
import evaluate
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer,
    Trainer, TrainingArguments,
    BitsAndBytesConfig
)
from peft import (
    prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType
)
import numpy as np
import random
import torch
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
)

2024-06-05 22:57:49.254295: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
SEED = 42
MAX_LENGTH = 32768
INSTRUCTION = 'Classify the aspect mentioned in the given Steam Review into up to of the eight aspects: recommended, story, gameplay, visual, audio, technical, price, and suggestion.'  # This mimic paper's string instruction
LABELS = [
    'label_recommended', 'label_story', 'label_gameplay', 'label_visual',
    'label_audio', 'label_technical', 'label_price', 'label_suggestion'
]

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7e95107dc9d0>

In [4]:
def encode(examples, tokenizer):
    outputs = tokenizer(
        [INSTRUCTION + s for s in examples['cleaned_review']],
        truncation=True, max_length=MAX_LENGTH
    )
    return outputs


def load_model():
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        bias='none',
        use_rslora=True,
        task_type='CAUSAL_LM',
        target_modules=[
            'q_proj',
            'k_proj',
            'v_proj',
            'o_proj',
            'gate_proj',
            'up_proj',
            'down_proj',
            'embed_tokens',
            'lm_head',
        ],
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        'intfloat/e5-mistral-7b-instruct', trust_remote_code=True,
        num_labels=8, problem_type='multi_label_classification',
        quantization_config=quantization_config,
        # token='HF_XXX'
    )
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    return model


def sigmoid(x):
    return 1/(1 + np.exp(-x))


metric = evaluate.combine(['precision', 'recall', 'f1'])
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = sigmoid(predictions)
    predictions = (predictions > 0.5).astype(int).reshape(-1)
    return metric.compute(predictions=predictions, references=labels.astype(int).reshape(-1), average='macro')


def sigmoid(x):
    return 1 / (1 + np.exp(-x)) 
sigmoid_v = np.vectorize(sigmoid)


def evaluate(y_test, y_pred, labels):
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Overall accuracy: {accuracy}')
    for idx, label in enumerate(labels):
        label_accuracy = accuracy_score(y_test[:, idx], y_pred[:, idx])
        print(f'Accuracy {label}: {label_accuracy}')

    f1 = f1_score(y_test, y_pred, average='macro')
    print(f'F1 macro: {f1}')
    print(
        classification_report(y_test, y_pred, target_names=labels, digits=4, zero_division=0)
    )

In [5]:
model = load_model()
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-mistral-7b-instruct')
ds_all = load_dataset('ilos-vigil/steam-review-aspect-dataset')
ds_all = ds_all.map(encode, batched=True, fn_kwargs={'tokenizer': tokenizer})

training_args = TrainingArguments(
    #
    output_dir='final',
    logging_steps=5,
    report_to='tensorboard',
    #
    dataloader_drop_last=False,
    eval_strategy='no',
    #
    bf16=True,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    eval_accumulation_steps=8,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    num_train_epochs=1,
    # from few ray tune trial
    learning_rate=0.00005,
    weight_decay=0.0003,
    warmup_ratio=0.05,
)
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=ds_all['train'],
    compute_metrics=compute_metrics,
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-mistral-7b-instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 21,260,288 || all params: 7,131,953,152 || trainable%: 0.2981


In [6]:
import gc
gc.collect()
with torch.no_grad():
    torch.cuda.empty_cache()
trainer.train()

  0%|          | 0/112 [00:00<?, ?it/s]



{'loss': 2.6339, 'grad_norm': 937.6458740234375, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.04}
{'loss': 1.6996, 'grad_norm': 522.1160888671875, 'learning_rate': 4.811320754716982e-05, 'epoch': 0.09}
{'loss': 1.2485, 'grad_norm': 666.1519775390625, 'learning_rate': 4.575471698113208e-05, 'epoch': 0.13}
{'loss': 1.2215, 'grad_norm': 811.1981201171875, 'learning_rate': 4.3396226415094345e-05, 'epoch': 0.18}
{'loss': 1.0486, 'grad_norm': 772.4200439453125, 'learning_rate': 4.103773584905661e-05, 'epoch': 0.22}
{'loss': 0.8087, 'grad_norm': 690.603271484375, 'learning_rate': 3.867924528301887e-05, 'epoch': 0.27}
{'loss': 0.8281, 'grad_norm': 495.5755920410156, 'learning_rate': 3.632075471698113e-05, 'epoch': 0.31}
{'loss': 0.875, 'grad_norm': 661.9572143554688, 'learning_rate': 3.39622641509434e-05, 'epoch': 0.36}
{'loss': 0.8121, 'grad_norm': 905.1956787109375, 'learning_rate': 3.160377358490566e-05, 'epoch': 0.4}
{'loss': 0.7801, 'grad_norm': 287.1793212890625, 'learning_rate': 2

TrainOutput(global_step=112, training_loss=0.8731554592294353, metrics={'train_runtime': 2299.8602, 'train_samples_per_second': 0.391, 'train_steps_per_second': 0.049, 'total_flos': 1.4470443293601792e+16, 'train_loss': 0.8731554592294353, 'epoch': 0.9955555555555555})

In [7]:
y_pred = trainer.predict(ds_all['test'])
y_pred = np.where(
    sigmoid_v(y_pred.predictions) > 0.5, 1, 0
).astype(np.int32)

evaluate(np.array(ds_all['test']['labels']), y_pred, LABELS)

  0%|          | 0/200 [00:00<?, ?it/s]

Overall accuracy: 0.13
Accuracy label_recommended: 0.835
Accuracy label_story: 0.725
Accuracy label_gameplay: 0.81
Accuracy label_visual: 0.635
Accuracy label_audio: 0.78
Accuracy label_technical: 0.77
Accuracy label_price: 0.72
Accuracy label_suggestion: 0.89
F1 macro: 0.49433687427810113
                   precision    recall  f1-score   support

label_recommended     0.8324    0.9730    0.8972       148
      label_story     0.6932    0.6854    0.6893        89
   label_gameplay     0.8295    0.9481    0.8848       154
     label_visual     0.6207    0.4138    0.4966        87
      label_audio     0.7333    0.2157    0.3333        51
  label_technical     0.7619    0.2807    0.4103        57
      label_price     0.3333    0.1915    0.2432        47
 label_suggestion     0.0000    0.0000    0.0000        21

        micro avg     0.7567    0.6468    0.6974       654
        macro avg     0.6005    0.4635    0.4943       654
     weighted avg     0.7081    0.6468    0.6505       654