# Sentiment Analysis on DistilRoBERTa
[michellejieli/emotion_text_classifier](https://huggingface.co/michellejieli/emotion_text_classifier)

# Fine-tuning
new_model will be the name of your fine-tuned model (saved)

In [1]:
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
import pandas as pd
import torch


In [2]:
from huggingface_hub import login

login(token="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", add_to_git_credential=True)

Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /home/user/.cache/huggingface/token
Login successful


In [3]:
# Model from Hugging Face hub
base_model = "michellejieli/emotion_text_classifier"

# Saved fine-tuned model 
new_model = "emotion_text_classifier_on_dd_v1"

In [4]:
def preprocessing(data):
    data = data.rename_column("utterance", "text")
    data = data.rename_column("emotion", "label")
    data = data.remove_columns(["dialog_id", "turn_type"])
    return data

In [5]:
data_name = "benjaminbeilharz/better_daily_dialog"
data = load_dataset(data_name, num_proc=8)
data = preprocessing(data)
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 87170
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 8069
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7740
    })
})

In [6]:
tokenizer = AutoTokenizer.from_pretrained(base_model)
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True)

In [7]:
emotions = data
print(tokenize(emotions["train"][:2]))

{'input_ids': [[0, 34673, 2156, 2488, 2156, 141, 59, 164, 13, 10, 367, 16328, 71, 3630, 17487, 1437, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [8]:
#hide_input
#not very sure what did he do here
tokens2ids = list(zip(tokenizer.all_special_tokens, tokenizer.all_special_ids))
data = sorted(tokens2ids, key=lambda x : x[-1])
df = pd.DataFrame(data, columns=["Special Token", "Special Token ID"])
df.T

Unnamed: 0,0,1,2,3,4
Special Token,<s>,<pad>,</s>,<unk>,<mask>
Special Token ID,0,1,2,3,50264


In [9]:
 # hide_output
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)
print(emotions_encoded["train"].column_names)

['text', 'label', 'input_ids', 'attention_mask']


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# hide_output
num_labels = 7
id2label = {
    0 : "neutral",
    1 : "anger",
    2 : "disgust",
    3 : "fear",
    4 : "joy",
    5 : "sadness",
    6 : "surprise"
}

label2id = {
    "neutral" : 0,
    "anger" : 1,
    "disgust" : 2,
    "fear" : 3,
    "joy" : 4,
    "sadness" : 5,
    "surprise" : 6
}

model = (AutoModelForSequenceClassification
         .from_pretrained(base_model, num_labels=num_labels, id2label=id2label, label2id=label2id)
         .to(device))

In [11]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

## Train
on my 3090, it seems to use 23 out of 24 GB of my vram

In [12]:
batch_size = 64
logging_steps = len(emotions_encoded["train"]) // batch_size
training_args = TrainingArguments(output_dir=new_model,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=True,
                                  log_level="error")

In [13]:
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=emotions_encoded["train"],
                  eval_dataset=emotions_encoded["validation"],
                  tokenizer=tokenizer)
trainer.train();

  0%|          | 0/2726 [00:00<?, ?it/s]

{'loss': 0.4144, 'learning_rate': 1.000733675715334e-05, 'epoch': 1.0}


  0%|          | 0/127 [00:00<?, ?it/s]

{'eval_loss': 0.25377553701400757, 'eval_accuracy': 0.9060602305118354, 'eval_f1': 0.8972645200485583, 'eval_runtime': 35.7614, 'eval_samples_per_second': 225.634, 'eval_steps_per_second': 3.551, 'epoch': 1.0}
{'loss': 0.3553, 'learning_rate': 1.4673514306676451e-08, 'epoch': 2.0}


  0%|          | 0/127 [00:00<?, ?it/s]

{'eval_loss': 0.25728917121887207, 'eval_accuracy': 0.9008551245507498, 'eval_f1': 0.8972971948347446, 'eval_runtime': 35.8892, 'eval_samples_per_second': 224.831, 'eval_steps_per_second': 3.539, 'epoch': 2.0}
{'train_runtime': 2172.5561, 'train_samples_per_second': 80.246, 'train_steps_per_second': 1.255, 'train_loss': 0.3846907970862448, 'epoch': 2.0}


In [14]:
preds_output = trainer.predict(emotions_encoded["validation"])
preds_output.metrics

  0%|          | 0/127 [00:00<?, ?it/s]

{'test_loss': 0.25728917121887207,
 'test_accuracy': 0.9008551245507498,
 'test_f1': 0.8972971948347446,
 'test_runtime': 35.5041,
 'test_samples_per_second': 227.27,
 'test_steps_per_second': 3.577}

# Inference
I use first five percent of train data to predict

In [15]:


classifier = pipeline("sentiment-analysis", model=new_model, device=0)
data_name = "benjaminbeilharz/better_daily_dialog"
data = load_dataset(data_name, split='train[:5%]', num_proc=8)
data = preprocessing(data)

def predict(row):
    text = row['text']
    true_label = row['label']
    predicted_result = classifier(text)[0]
    predicted_label = predicted_result["label"]
    
    print(f"Predicted: {predicted_label}, True: {true_label},        ##Text: {text}")
    return {"predicted_label": predicted_label, "true_label": true_label}

predictions = data.map(predict)

correct_predictions = sum(1 for p in predictions if p["predicted_label"] == "neutral")
total_predictions = len(predictions)
accuracy = correct_predictions / total_predictions

print(f"acc: {accuracy}")




Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Predicted: neutral, True: 0,        ##Text: Say , Jim , how about going for a few beers after dinner ? 
Predicted: neutral, True: 0,        ##Text:  You know that is tempting but is really not good for our fitness . 
Predicted: neutral, True: 0,        ##Text:  What do you mean ? It will help us to relax . 
Predicted: neutral, True: 0,        ##Text:  Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? 
Predicted: neutral, True: 0,        ##Text:  I guess you are right.But what shall we do ? I don't feel like sitting at home . 
Predicted: neutral, True: 0,        ##Text:  I suggest a walk over to the gym where we can play singsong and meet some of our friends . 
Predicted: joy, True: 4,        ##Text:  That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . 
Predicted: joy, True: 4,        ##Text:  Sounds great to me ! If they are willing , we could ask them to go dancing with us.



Predicted: joy, True: 4,        ##Text:  Yes . I like cooking very much . I got this hobby when I was 12 years sold . 
Predicted: neutral, True: 0,        ##Text:  Why do you like it ? 
Predicted: neutral, True: 0,        ##Text:  I have no idea . I like cooking by myself . I like to taste delicious food . 
Predicted: joy, True: 4,        ##Text:  That's wonderful ! 
Predicted: joy, True: 0,        ##Text:  And I love trying new recipes , which I usually test with my friends . You can come , too . 
Predicted: surprise, True: 6,        ##Text:  Really ? I hope I can have a chance to taste it . Don't forget to tell me . 
Predicted: neutral, True: 0,        ##Text:  Certainly . 
Predicted: neutral, True: 0,        ##Text: Anyone home ? Jen ! 
Predicted: neutral, True: 0,        ##Text:  I'm in the kitchen ... let yourself in ! 
Predicted: surprise, True: 0,        ##Text:  Wow ! You're really working up a storm ! 
Predicted: neutral, True: 0,        ##Text:  I know . I've even worked up a