In [42]:
# Install the transformers library
!pip install transformers



In [43]:
# Import required packages
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

# Create class for data preparation
class SimpleDataset:
    """Index-addressable view over a mapping of tokenizer outputs.

    ``tokenized_texts`` maps field names to per-text sequences and must
    contain an ``"input_ids"`` entry, which determines the dataset length.
    """

    def __init__(self, tokenized_texts):
        # Keep a reference to the mapping; nothing is copied.
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # One entry in "input_ids" per text.
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Collect the idx-th entry of every field into a single sample dict.
        sample = {}
        for field, values in self.tokenized_texts.items():
            sample[field] = values[idx]
        return sample

In [44]:
# Load tokenizer and model, create trainer
# The same checkpoint name is used for both the tokenizer and the model.
model_name = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# No training arguments are passed; in this notebook the trainer is only
# used to call trainer.predict() on the prediction data set.
trainer = Trainer(model=model)

In [45]:
# Texts to classify (could equally be imported from .csv, .xls etc.)
pred_texts = [
    'I like that',
    'That is annoying',
    'This is great!',
    'Wouldn´t recommend it.',
]

In [46]:
# Example: import data from a CSV file stored on Google Drive (uncomment the lines below to use it)

#from google.colab import drive
#drive.mount('/content/drive')


#file_name = "/content/drive/MyDrive/Colab Notebooks/your-filename.csv"
#text_column = "text"

#df_pred = pd.read_csv(file_name)
#pred_texts = df_pred[text_column].dropna().astype('str').tolist()

In [47]:
# Tokenize texts and create prediction data set
# All texts are encoded in a single call. NOTE(review): per the tokenizer
# docs, padding=True pads to the longest text in the batch and truncation=True
# cuts texts exceeding the model's maximum length -- confirm for the installed version.
tokenized_texts = tokenizer(pred_texts,truncation=True,padding=True)
# Wrap the encodings so they can be indexed sample by sample (see SimpleDataset).
pred_dataset = SimpleDataset(tokenized_texts)

In [48]:
# Run predictions
# The returned object is printed in the next cell; later cells read the raw
# per-class scores from it via predictions.predictions / predictions[0].
predictions = trainer.predict(pred_dataset)

In [49]:
# Show the raw prediction output: the per-class score array (one row per text),
# label_ids, and the runtime/memory metrics.
print(predictions)

PredictionOutput(predictions=array([[-0.86106074, -0.70981276, -2.0573738 ,  5.4963565 ,  0.04488751,
        -0.13068563, -0.39005983],
       [ 2.3808117 ,  2.2069035 , -1.7530867 , -2.553061  ,  0.6291722 ,
         0.45815602, -1.4305464 ],
       [-0.78908926, -1.1183804 , -2.2689617 ,  4.869933  ,  1.130197  ,
        -1.5899479 ,  1.0672089 ],
       [-0.48905545,  1.01938   ,  0.3934468 , -2.7029574 ,  1.9741044 ,
         1.5008929 , -2.0667758 ]], dtype=float32), label_ids=None, metrics={'test_runtime': 0.0194, 'test_samples_per_second': 206.448, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 329793536, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'test_mem_cpu_alloc_delta': 0, 'test_mem_gpu_alloc_delta': 0, 'test_mem_cpu_peaked_delta': 0, 'test_mem_gpu_peaked_delta': 1218560})


In [50]:
# Transform predictions to labels
# Raw per-class scores, one row per text (first field of the prediction output).
logits = predictions.predictions
# Index of the highest-scoring class per text, and its human-readable label.
preds = logits.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)
# Softmax over the classes. Subtracting each row's maximum before np.exp leaves
# the result mathematically unchanged but prevents overflow for large scores;
# the exponentials are also computed only once.
exp_shifted = np.exp(logits - logits.max(-1, keepdims=True))
# Score of the predicted class = highest softmax probability in each row.
scores = (exp_shifted / exp_shifted.sum(-1, keepdims=True)).max(1)

In [51]:
# scores raw
# Full softmax probability matrix: one row per text, one column per class id.
# Each row's maximum is subtracted before np.exp so large scores cannot overflow
# (the softmax result is unchanged by this shift), and np.exp is evaluated once.
raw_scores = predictions[0]
exp_scores = np.exp(raw_scores - raw_scores.max(-1, keepdims=True))
temp = exp_scores / exp_scores.sum(-1, keepdims=True)

In [59]:
# work in progress
# One container per emotion; each will hold that emotion's score for every text.
anger, disgust, fear, joy = [], [], [], []
neutral, sadness, surprise = [], [], []

# Position in this tuple is the column of `temp` that feeds the list:
# 0 -> anger, 1 -> disgust, 2 -> fear, 3 -> joy, 4 -> neutral, 5 -> sadness, 6 -> surprise
emotion_lists = (anger, disgust, fear, joy, neutral, sadness, surprise)

# extract scores (as many entries as exist in pred_texts)
for i in range(len(pred_texts)):
    for class_id, emotion_scores in enumerate(emotion_lists):
        emotion_scores.append(temp[i][class_id])

[0.001708315, 0.45342872, 0.0033060017, 0.036617532]
[0.036617532]


In [54]:
# Build the result table: one row per text holding the text, the predicted class id,
# its label, the top score, and the individual score of every emotion.
result_columns = ['text','pred','label','score', 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
result_rows = list(
    zip(pred_texts, preds, labels, scores, anger, disgust, fear, joy, neutral, sadness, surprise)
)
df = pd.DataFrame(result_rows, columns=result_columns)
df.head()

Unnamed: 0,text,pred,label,score,anger,disgust,fear,joy,neutral,sadness,surprise
0,I like that,3,joy,0.985279,0.001708,0.001987,0.000516,0.985279,0.004227,0.003546,0.002736
1,That is annoying,0,anger,0.453429,0.453429,0.38105,0.007264,0.003264,0.078665,0.0663,0.010029
2,This is great!,3,joy,0.948386,0.003306,0.002378,0.000753,0.948386,0.022534,0.001484,0.021158
3,Wouldn´t recommend it.,4,neutral,0.429958,0.036618,0.165499,0.088502,0.004001,0.429958,0.267863,0.00756
