# Sentiment Analysis on DistilRoBERTa
[michellejieli/emotion_text_classifier](https://huggingface.co/michellejieli/emotion_text_classifier)

# Fine-tuning
new_model will be the name of your fine-tuned model (saved)

In [1]:
import os
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
from huggingface_hub import login

In [2]:
# prevent env load failed
%load_ext dotenv
%dotenv

In [3]:
login(token=os.environ.get("HF_TOKEN", ""), add_to_git_credential=True)

Token is valid (permission: read).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/hermeschen/.cache/huggingface/token
Login successful


In [4]:
# Model from Hugging Face hub
base_model = "michellejieli/emotion_text_classifier"

# Saved fine-tuned model 
new_model = "emotion_text_classifier_on_dd_v1"

In [5]:
def preprocessing(data):
    data = data.rename_column("utterance", "text")
    data = data.rename_column("emotion", "label")
    data = data.remove_columns(["dialog_id", "turn_type"])
    return data

In [6]:
data_name = "benjaminbeilharz/better_daily_dialog"
data = load_dataset(data_name, num_proc=16)
data = preprocessing(data)
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 87170
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 8069
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7740
    })
})

In [7]:
tokenizer = AutoTokenizer.from_pretrained(base_model)


def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True)

In [8]:
emotions = data
print(tokenize(emotions["train"][:2]))

{'input_ids': [[0, 34673, 2156, 2488, 2156, 141, 59, 164, 13, 10, 367, 16328, 71, 3630, 17487, 1437, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [9]:
#hide_input
#not very sure what did he do here
tokens2ids = list(zip(tokenizer.all_special_tokens, tokenizer.all_special_ids))
data = sorted(tokens2ids, key=lambda x: x[-1])
df = pd.DataFrame(data, columns=["Special Token", "Special Token ID"])
df.T

Unnamed: 0,0,1,2,3,4
Special Token,<s>,<pad>,</s>,<unk>,<mask>
Special Token ID,0,1,2,3,50264


In [10]:
 # hide_output
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)
print(emotions_encoded["train"].column_names)

['text', 'label', 'input_ids', 'attention_mask']


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# hide_output
num_labels = 7
id2label = {
    0: "neutral",
    1: "anger",
    2: "disgust",
    3: "fear",
    4: "joy",
    5: "sadness",
    6: "surprise"
}

label2id = {
    "neutral": 0,
    "anger": 1,
    "disgust": 2,
    "fear": 3,
    "joy": 4,
    "sadness": 5,
    "surprise": 6
}

model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=num_labels, id2label=id2label, label2id=label2id)

In [12]:
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

## Train
on my 3090, it seems to use 23 out of 24 GB of my vram

In [13]:
batch_size = 64
logging_steps = len(emotions_encoded["train"]) // batch_size
training_args = TrainingArguments(output_dir=new_model,
                                  overwrite_output_dir=True,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  group_by_length=True,
                                  evaluation_strategy="epoch",
                                  logging_steps=logging_steps,
                                  report_to=["tensorboard"],
                                  log_level="error",
                                  gradient_checkpointing=True,
                                  gradient_checkpointing_kwargs={"use_reentrant": True})

In [14]:
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=emotions_encoded["train"],
                  eval_dataset=emotions_encoded["validation"],
                  tokenizer=tokenizer)
trainer.train();

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4146,0.275982,0.902466,0.897819
2,0.3549,0.257709,0.902466,0.898937


In [22]:
trainer.model.save_pretrained(new_model)

# Inference
I use first five percent of train data to predict

In [24]:
classifier_model = AutoModelForSequenceClassification.from_pretrained(new_model, num_labels=num_labels, id2label=id2label, label2id=label2id)

In [26]:
classifier = pipeline("sentiment-analysis", model=classifier_model, tokenizer=tokenizer, device=0)
data_name = "benjaminbeilharz/better_daily_dialog"
data = load_dataset(data_name, split='test', num_proc=8)
data = preprocessing(data)


def predict(row):
    text = row['text']
    true_label = row['label']
    predicted_result = classifier(text)[0]
    predicted_label = predicted_result["label"]

    print(f"Predicted: {predicted_label}, True: {true_label},        ##Text: {text}")
    return {"predicted_label": predicted_label, "true_label": true_label}


predictions = data.map(predict)

correct_predictions = sum(1 for p in predictions if p["predicted_label"] == "neutral")
total_predictions = len(predictions)
accuracy = correct_predictions / total_predictions

Map:   0%|          | 0/7740 [00:00<?, ? examples/s]

Predicted: neutral, True: 0,        ##Text: Hey man , you wanna buy some weed ? 
Predicted: neutral, True: 6,        ##Text:  Some what ? 
Predicted: neutral, True: 0,        ##Text:  Weed ! You know ? Pot , Ganja , Mary Jane some chronic ! 
Predicted: neutral, True: 0,        ##Text:  Oh , umm , no thanks . 
Predicted: neutral, True: 0,        ##Text:  I also have blow if you prefer to do a few lines . 
Predicted: neutral, True: 0,        ##Text:  No , I am ok , really . 
Predicted: anger, True: 0,        ##Text:  Come on man ! I even got dope and acid ! Try some ! 
Predicted: neutral, True: 0,        ##Text:  Do you really have all of these drugs ? Where do you get them from ? 
Predicted: neutral, True: 0,        ##Text:  I got my connections ! Just tell me what you want and I ’ ll even give you one ounce for free . 
Predicted: joy, True: 0,        ##Text:  Sounds good ! Let ’ s see , I want . 
Predicted: neutral, True: 3,        ##Text:  Yeah ? 
Predicted: anger, True: 0,        ##T



Predicted: neutral, True: 0,        ##Text: Sun-set hotel . May I help you ? 
Predicted: neutral, True: 0,        ##Text:  Yes , I have booked a room for 24th . It's a double room . 
Predicted: neutral, True: 0,        ##Text:  Hold on , please . Let me check it for you . Yes , you're right . You will keep it for 3 days . 
Predicted: neutral, True: 0,        ##Text:  Well , now I want to change the date from 24th to 28th . 
Predicted: neutral, True: 0,        ##Text:  OK , that shall be arranged . 
Predicted: neutral, True: 0,        ##Text: Are you busy tomorrow morning ? 
Predicted: neutral, True: 0,        ##Text:  I'm free . What's up ? 
Predicted: neutral, True: 0,        ##Text:  Someone has to pick up the boss at the airport . 
Predicted: neutral, True: 0,        ##Text:  Oh , I just remembered I've got a report to write . 
Predicted: neutral, True: 5,        ##Text: I'm sorry I'm so late . I had a really bad day . 
Predicted: neutral, True: 0,        ##Text:  It's ten after six

In [27]:
print(f"acc: {accuracy}")

acc: 0.8574935400516795
