In [None]:
%set_env TOKENIZERS_PARALLELISM=false

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig, BitsAndBytesConfig
from llm2vec import LLM2Vec
from peft import PeftModel

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
)
from sklearn.multiclass import OneVsRestClassifier

SEED = 42
HF_TOKEN = 'hf_XXX'
BATCH_SIZE = 8
MODEL_NAME = 'McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp'
MODEL_ID = 'McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised'
INSTRUCTION = 'Classify the aspect mentioned in the given Steam Review into up to of the eight aspects: recommended, story, gameplay, visual, audio, technical, price, and suggestion.'  # This mimic paper's string instruction

In [3]:
df_train = pd.read_csv('../../dataset/v1/train.csv')
df_test = pd.read_csv('../../dataset/v1/test.csv')

labels = df_train.columns[3:].to_list()
y_train = df_train[labels].to_numpy()
y_test = df_test[labels].to_numpy()

In [4]:
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME
)
config = AutoConfig.from_pretrained(
    MODEL_NAME, trust_remote_code=True
)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModel.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    config=config,
    torch_dtype=torch.bfloat16,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
    token=HF_TOKEN,
    quantization_config=quantization_config
)
model = PeftModel.from_pretrained(
    model,
    MODEL_NAME,
)
model = model.merge_and_unload()
model = PeftModel.from_pretrained(model, MODEL_ID)

# Wrapper for encoding and pooling operations
l2v = LLM2Vec(model, tokenizer, pooling_mode='mean', max_length=512)




Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at mistralai/Mistral-7B-Instruct-v0.2 were not used when initializing MistralEncoderModel: ['lm_head.weight']
- This IS expected if you are initializing MistralEncoderModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MistralEncoderModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
def evaluate(X_train, y_train, X_test, y_test, labels):
    clf = LogisticRegression(
        random_state=SEED,
        max_iter=100
    )
    ovr = OneVsRestClassifier(clf, n_jobs=-1)

    ovr.fit(X_train, y_train)
    y_pred = ovr.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f'Overall accuracy: {accuracy}')
    for idx, label in enumerate(labels):
        label_accuracy = accuracy_score(y_test[:, idx], y_pred[:, idx])
        print(f'Accuracy {label}: {label_accuracy}')

    f1 = f1_score(y_test, y_pred, average='macro')
    print(f'F1 macro: {f1}')
    print(
        classification_report(y_test, y_pred, target_names=labels, digits=4, zero_division=0)
    )


In [6]:
X_train = l2v.encode(
    # df_train['cleaned_review'].to_list(),
    [
        INSTRUCTION + s for s in
        df_train['cleaned_review'].to_list()
    ],
    batch_size=BATCH_SIZE
)
# X_train = torch.nn.functional.normalize(X_train, p=2, dim=1).numpy()
X_test = l2v.encode(
    # df_test['cleaned_review'].to_list(),
    [
        INSTRUCTION + s for s in
        df_test['cleaned_review'].to_list()
    ],
    batch_size=BATCH_SIZE
)
# X_test = torch.nn.functional.normalize(X_test, p=2, dim=1).numpy()

Batches:   0%|          | 0/113 [00:00<?, ?it/s]

2024-06-01 00:03:52.419214: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

In [7]:
evaluate(X_train, y_train, X_test, y_test, labels)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Overall accuracy: 0.295
Accuracy label_recommended: 0.895
Accuracy label_story: 0.79
Accuracy label_gameplay: 0.89
Accuracy label_visual: 0.795
Accuracy label_audio: 0.83
Accuracy label_technical: 0.865
Accuracy label_price: 0.84
Accuracy label_suggestion: 0.85
F1 macro: 0.6927673032099368
                   precision    recall  f1-score   support

label_recommended     0.9441    0.9122    0.9278       148
      label_story     0.7831    0.7303    0.7558        89
   label_gameplay     0.9074    0.9545    0.9304       154
     label_visual     0.7738    0.7471    0.7602        87
      label_audio     0.7179    0.5490    0.6222        51
  label_technical     0.7500    0.7895    0.7692        57
      label_price     0.7143    0.5319    0.6098        47
 label_suggestion     0.2000    0.1429    0.1667        21

        micro avg     0.8261    0.7844    0.8047       654
        macro avg     0.7238    0.6697    0.6928       654
     weighted avg     0.8159    0.7844    0.7978       654