In [1]:
import os

import torch
from datasets import load_dataset
from huggingface_hub import login
from sklearn.metrics import f1_score, accuracy_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
# Load the `emotion` dataset; the result is indexed by split name below.
emotions = load_dataset('emotion')
# The classifier needs one output per class name declared on the
# training split's `label` feature.
num_labels = len(
    emotions['train']
    .features['label']
    .names
)

In [4]:
# Checkpoint name shared by the model and its tokenizer.
model_ckpt = 'distilbert-base-uncased'
# Build the sequence classifier with `num_labels` outputs, then move it
# onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
model = model.to(device)
# Tokenizer loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.we

In [5]:
# encoding the dataset with tokenizer
def tokenize(batch):
    """Tokenize the ``'text'`` entries of ``batch`` with the notebook's tokenizer.

    Args:
        batch: Mapping with a ``'text'`` key holding the raw text(s) to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        ``padding=True`` and ``truncation=True``.
    """
    texts = batch['text']
    return tokenizer(texts, padding=True, truncation=True)

In [6]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (scores whose last axis is arg-maxed to get the predicted class).

    Returns:
        dict with keys ``'accuracy'`` and ``'f1'``.
    """
    y_true = pred.label_ids
    # The predicted class is the index of the highest score along the last axis.
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

In [7]:
# Apply `tokenize` to every split in batched mode (batch_size=None is passed
# through to `map` unchanged).
emotions_encoded = emotions.map(
    tokenize,
    batched=True,
    batch_size=None,
)
# Return the listed columns as torch tensors when the dataset is indexed.
emotions_encoded.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label'],
)

In [8]:
# Authenticate with the Hugging Face Hub.
#
# SECURITY: never hardcode an access token in a notebook. A write-scoped token
# was previously committed in this cell and must be revoked at
# https://huggingface.co/settings/tokens.
#
# The token is read from the HF_TOKEN environment variable. When the variable
# is unset, `token=None` makes `login` prompt for the token interactively.
login(token=os.environ.get('HF_TOKEN'))

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/karvsmech/.cache/huggingface/token
Login successful


In [9]:
# Per-device batch size, used for both training and evaluation.
batch_size = 256
# Number of full batches in the training split, used as the logging interval.
logging_steps = len(emotions_encoded['train']) // batch_size
# Used as the output directory for the run.
model_name = f'{model_ckpt}-finetuned-emotion'

training_args = TrainingArguments(
    # Where to write, and whether to publish
    output_dir=model_name,
    push_to_hub=True,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation and logging
    evaluation_strategy='epoch',
    logging_steps=logging_steps,
    log_level='error',
    disable_tqdm=False,
)

In [10]:
# Wire the model, arguments, data splits, tokenizer and metric function into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded['train'],
    eval_dataset=emotions_encoded['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/home/karvsmech/Projects/Pytorch_projects/NLPT/text_classification/distilbert-base-uncased-finetuned-emotion is already a clone of https://huggingface.co/karvsmech/distilbert-base-uncased-finetuned-emotion. Make sure you pull the latest changes with `repo.git_pull()`.


In [11]:
# Run fine-tuning; the return value is left as the cell's last expression so it is displayed.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.3116,0.897228,0.702,0.636911
2,0.6831,0.485033,0.842,0.823085
3,0.4345,0.393735,0.889,0.881384


TrainOutput(global_step=189, training_loss=0.8042330584198079, metrics={'train_runtime': 199.7549, 'train_samples_per_second': 240.294, 'train_steps_per_second': 0.946, 'total_flos': 1080514292544000.0, 'train_loss': 0.8042330584198079, 'epoch': 3.0})