In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset

In [None]:
emotion_dataset = load_dataset("emotion")
emotion_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [None]:
emotion_dataset['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 16000
})

In [None]:
type(emotion_dataset['train']['text'])

list

In [None]:
emotion_dataset['train'][1]

{'text': 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
 'label': 0}

In [None]:
emotion_df = emotion_dataset['train'].to_pandas()
emotion_df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [None]:
features = emotion_dataset['train'].features
features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}

In [None]:
features['label'].int2str(0)

'sadness'

In [None]:
# Build the id -> name mapping from the ClassLabel feature itself instead of
# hard-coding the number of classes, so it stays correct if the label set changes.
id2label = dict(enumerate(features['label'].names))
id2label

{0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}

In [None]:
# Inverse lookup: emotion name -> integer class id.
label2id = {name: idx for idx, name in id2label.items()}
label2id

{'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}

In [None]:
emotion_df["label"].value_counts(normalize=True).sort_index()

0    0.291625
1    0.335125
2    0.081500
3    0.134937
4    0.121063
5    0.035750
Name: label, dtype: float64

## Tokenize all the things

In [None]:
!pip install transformers



In [None]:
from transformers import AutoTokenizer

In [None]:
model_ckpt = "microsoft/MiniLM-L12-H384-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
emotion_dataset['train']['text'][:1]

['i didnt feel humiliated']

In [None]:
tokenizer(emotion_dataset['train']['text'][:1])

{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}

In [None]:
len(emotion_dataset['train']['text'])

16000

In [None]:
def tokenize_text(examples):
  """Tokenize the ``"text"`` entry of ``examples`` with the notebook's tokenizer.

  Inputs are truncated to at most 512 tokens. The tokenizer's output is
  returned unchanged.
  """
  texts = examples["text"]
  encoded = tokenizer(texts, truncation=True, max_length=512)
  return encoded

In [None]:
emotion_dataset = emotion_dataset.map(tokenize_text, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
type(emotion_dataset['train']['attention_mask'])

list

### Dealing with imbalanced classes

In [None]:
# Weight each class by (1 - its share of the training rows), ordered by label id,
# so that rarer emotions contribute more to the loss.
label_counts = emotion_df['label'].value_counts().sort_index()
class_weights = (1 - label_counts / len(emotion_df)).values

In [None]:
class_weights

array([0.708375 , 0.664875 , 0.9185   , 0.8650625, 0.8789375, 0.96425  ])

In [None]:
import torch


In [None]:
# torch.as_tensor accepts either a NumPy array or an existing tensor, so this
# cell can be re-run safely (torch.from_numpy raises if given a tensor).
# float32 matches the dtype the original `.float()` call produced.
class_weights = torch.as_tensor(class_weights, dtype=torch.float32)

In [None]:
class_weights

tensor([0.7084, 0.6649, 0.9185, 0.8651, 0.8789, 0.9643])

In [None]:
emotion_dataset = emotion_dataset.rename_column("label", "labels")

In [None]:
emotion_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [None]:
from torch import nn
import torch
from transformers import Trainer

In [None]:
class WeightedLossTrainer(Trainer):
  """Trainer that scores batches with class-weighted cross-entropy.

  The per-class weights are read from the notebook-level ``class_weights`` tensor.
  """

  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    """Compute the class-weighted cross-entropy loss for one batch.

    Args:
      model: The model being trained; called as ``model(**inputs)``.
      inputs: Mapping of batch tensors; expected to contain ``"labels"``.
      return_outputs: If True, return ``(loss, outputs)`` instead of only the loss.
      num_items_in_batch: Accepted so that Trainer versions which pass this
        keyword do not raise a TypeError; it is not used here.

    Returns:
      The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.
    """
    # Feed inputs to model and extract logits
    outputs = model(**inputs)
    logits = outputs.get("logits")
    # Extract labels
    labels = inputs.get("labels")
    # The weight tensor must be on the same device as the logits; without this
    # move, a CPU weight tensor and GPU logits raise a device-mismatch error.
    weights = class_weights.to(logits.device)
    # Define loss function with class weights
    loss_func = nn.CrossEntropyLoss(weight=weights)
    # Compute loss
    loss = loss_func(logits, labels)
    return (loss, outputs) if return_outputs else loss

### Putting it all together

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Size the classification head from the label mapping instead of repeating a
# literal, so num_labels always agrees with id2label / label2id.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import f1_score

In [None]:
def compute_metrics(pred):
  """Compute the weighted F1 score for a set of predictions.

  Args:
    pred: Object exposing ``label_ids`` (true labels) and ``predictions``
      (per-class scores; the argmax over the last axis is the predicted class).

  Returns:
    A dict ``{"f1": <weighted F1 score>}``.
  """
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds, average="weighted")
  # The Trainer expects a dict of metric name -> value; returning None makes
  # evaluation fail when it tries to add the eval loss to the metrics.
  return {"f1": f1}

In [None]:
from transformers import TrainingArguments

In [None]:
# Per-device batch size used for both training and evaluation.
batch_size = 64
# Log the training loss about once per epoch: this is the number of full
# batches in the training split (assumes a single device and no gradient
# accumulation -- TODO confirm for multi-GPU runs).
logging_steps = len(emotion_dataset['train'])//batch_size

In [None]:
!mkdir /emotion

mkdir: cannot create directory ‘/emotion’: File exists


In [None]:
!pip install transformers[torch]
!pip install accelerate -U



In [None]:
# Training configuration: 5 epochs at lr 2e-5 with weight decay 0.01, the same
# batch size for training and evaluation, evaluation once per epoch, and
# training-loss logging every `logging_steps` steps. Outputs go to /emotion.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(output_dir = "/emotion",
                                  num_train_epochs=5,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  logging_steps=logging_steps,
                                  # fp16=True, # Make train fast
                                  # push_to_hub=True
                                  )

In [None]:
# Wire the model, training arguments, data splits, metric function and
# tokenizer into the weighted-loss trainer.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=emotion_dataset['train'],
    eval_dataset=emotion_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

## Train

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss


TypeError: ignored