# Testing fine tuning on emotions

got from [Simple training with transformers trainer](https://www.youtube.com/watch?v=u--UVvH-LIQ)

https://www.youtube.com/watch?v=u--UVvH-LIQ)

In [1]:
from datasets import load_dataset

emotion_dataset = load_dataset("emotion")
emotion_dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [2]:
features = emotion_dataset["train"].features
features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}

In [3]:
emotion_df = emotion_dataset["train"].to_pandas()

In [4]:
id2label = {label: features["label"].int2str(label) for label in range(6)}
label2id = {v: k for k, v in id2label.items()}


In [5]:
emotion_df["label"].value_counts(normalize=True).sort_index()

label
0    0.291625
1    0.335125
2    0.081500
3    0.134937
4    0.121063
5    0.035750
Name: proportion, dtype: float64

In [6]:
from transformers import AutoTokenizer

model_ckpt = "microsoft/MiniLM-L12-H384-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [7]:
def tokenize_text(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512) # truncate the longest sentences


In [8]:
emotion_dataset = emotion_dataset.map(tokenize_text, batched=True)

In [9]:
class_weights = (1 - (emotion_df["label"].value_counts().sort_index() / len(emotion_df))).values
class_weights

array([0.708375 , 0.664875 , 0.9185   , 0.8650625, 0.8789375, 0.96425  ])

In [10]:
import torch

class_weights =  torch.from_numpy(class_weights).float().to("cuda")

RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.

In [None]:
emotion_dataset = emotion_dataset.rename_column("label", "labels")

In [None]:
from torch import nn
import torch
from transformers import Trainer

class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        logits = outputs.get("logits")
        labels = inputs.get("labels")
        loss_func = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_func(logits, labels)
        
        return (loss, outputs) if return_outputs else loss


In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=6, id2label=id2label, label2id=label2id)


Downloading pytorch_model.bin: 100%|██████████| 133M/133M [00:01<00:00, 108MB/s]  
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    return {"f1": f1}




In [None]:
from transformers import TrainingArguments

batch_size = 64
logging_steps = len(emotion_dataset["train"]) // batch_size
output_dir = "minilm-finetuned-emotion"
training_args = TrainingArguments(output_dir=output_dir,
                                  num_train_epochs=5,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  logging_steps=logging_steps,
                                  fp16=True)

In [None]:
trainer = WeightedLossTrainer(model=model,
                              args=training_args,
                              compute_metrics=compute_metrics,
                              train_dataset=emotion_dataset["train"],
                              eval_dataset=emotion_dataset["validation"],
                              tokenizer=tokenizer)

In [None]:
trainer.train()

RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
