In [1]:
# --- Inference settings -----------------------------------------------------
# Number of tweets sent to the zero-shot pipeline per call.
BATCH_SIZE = 32
# Forwarded to the pipeline as its `multi_label` argument.
MULTILABEL = True

# --- Input/prompt settings --------------------------------------------------
# When True, tweets are regex-cleaned and lower-cased before classification.
USE_PREPROCESSING = True
# When True, the hypothesis is "Kalimat ini mengekspresikan {}." instead of
# the bare "{}." pattern.
USE_TEMPLATE = True

In [2]:
import re
import numpy as np
import pandas as pd
from transformers import pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report

In [3]:
# Load the evaluation set: raw tweet texts and their gold emotion labels.
df_test = pd.read_csv('./Indonesian-Twitter-Emotion-Dataset/Twitter_Emotion_Dataset.csv')
y_test = df_test['label']
X_test = df_test['tweet'].tolist()
doc_count = len(X_test)

# Candidate labels handed to the zero-shot classifier. Their list positions are
# later used as predicted class ids (via `label.index(...)`), so this order has
# to stay aligned with the integer codes the LabelEncoder assigns to `y_test`.
label = ['kemarahan', 'ketakutan', 'kebahagiaan', 'cinta', 'kesedihan']

# Hypothesis pattern; `{}` is the slot for a candidate label.
hypotheses_template = "Kalimat ini mengekspresikan {}." if USE_TEMPLATE else "{}."

if USE_PREPROCESSING:
    # Ordered (pattern, replacement) rewrites; every rule runs on the output
    # of the previous one, so the order matters.
    cleanup_rules = [
        (r'\[([A-Z]+)\]', r''),     # drop bracketed upper-case tokens, e.g. "[USERNAME]"
        (r'#(\S+)', r'\g<1>'),      # strip the '#' but keep the hashtag text
        (r'(\.)\1{2,}', ''),        # delete runs of three or more dots
        (r'(\W)\1{2,}', r'\g<1>'),  # squeeze 3+ repeats of one non-word char to one
    ]
    for pattern, replacement in cleanup_rules:
        X_test = [re.sub(pattern, replacement, tweet) for tweet in X_test]

    # Lower-casing happens last, after the case-sensitive [A-Z]+ rule has run.
    X_test = [tweet.lower() for tweet in X_test]

In [4]:
# Encode the gold string labels as integer class ids.
#
# The encoder is fitted on the untouched `df_test['label']` column instead of
# on `y_test` itself so that this cell is idempotent: feeding `y_test` back
# into `fit_transform` would, on a second run, refit the encoder on the
# already-encoded integers and `le.classes_` would no longer contain the class
# names that the evaluation cell passes as `target_names`.
le = LabelEncoder()
y_test = le.fit_transform(df_test['label'])

In [5]:
# Zero-shot classifier backed by a multilingual XNLI model.
# NOTE(review): device=0 is a fixed device ordinal (GPU 0 per the transformers
# pipeline docs), so this cell presumably fails on a CPU-only machine — confirm.
zsl = pipeline('zero-shot-classification', model='joeddav/xlm-roberta-large-xnli', device=0)

# Raw pipeline results, one dict per tweet, in the same order as X_test.
zsl_outputs = []
for idx_start in range(0, doc_count, BATCH_SIZE):
    idx_end = min(idx_start + BATCH_SIZE, doc_count)
    # Progress as a 1-based inclusive range: the slice below covers the
    # 0-based positions idx_start .. idx_end-1, i.e. documents
    # idx_start+1 .. idx_end.
    print(f'{idx_start+1}-{idx_end}/{doc_count}')

    # Keyword arguments keep the call valid across transformers releases;
    # newer ones accept at most one extra positional argument (the labels).
    batch_output = zsl(
        X_test[idx_start:idx_end],
        candidate_labels=label,
        hypothesis_template=hypotheses_template,
        multi_label=MULTILABEL,
    )
    # Some transformers releases unwrap a one-element batch into a bare dict;
    # extending with a dict would add its keys instead of the result itself.
    if isinstance(batch_output, dict):
        batch_output = [batch_output]
    zsl_outputs.extend(batch_output)

# Every tweet must have produced exactly one result before indexing by position.
assert len(zsl_outputs) == doc_count, (
    f'expected {doc_count} predictions, got {len(zsl_outputs)}'
)

# Top-1 prediction per tweet, expressed as the position of the winning label in
# `label`. This relies on the pipeline returning `labels` ordered best-first
# (as documented for the zero-shot pipeline) and on `label` being positionally
# aligned with the LabelEncoder's integer codes for `y_test`.
y_preds = [label.index(output['labels'][0]) for output in zsl_outputs]

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1-33/4401
33-65/4401
65-97/4401
97-129/4401
129-161/4401
161-193/4401
193-225/4401
225-257/4401
257-289/4401
289-321/4401
321-353/4401
353-385/4401
385-417/4401
417-449/4401
449-481/4401
481-513/4401
513-545/4401
545-577/4401
577-609/4401
609-641/4401
641-673/4401
673-705/4401
705-737/4401
737-769/4401
769-801/4401
801-833/4401
833-865/4401
865-897/4401
897-929/4401
929-961/4401
961-993/4401
993-1025/4401
1025-1057/4401
1057-1089/4401
1089-1121/4401
1121-1153/4401
1153-1185/4401
1185-1217/4401
1217-1249/4401
1249-1281/4401
1281-1313/4401
1313-1345/4401
1345-1377/4401
1377-1409/4401
1409-1441/4401
1441-1473/4401
1473-1505/4401
1505-1537/4401
1537-1569/4401
1569-1601/4401
1601-1633/4401
1633-1665/4401
1665-1697/4401
1697-1729/4401
1729-1761/4401
1761-1793/4401
1793-1825/4401
1825-1857/4401
1857-1889/4401
1889-1921/4401
1921-1953/4401
1953-1985/4401
1985-2017/4401
2017-2049/4401
2049-2081/4401
2081-2113/4401
2113-2145/4401
2145-2177/4401
2177-2209/4401
2209-2241/4401
2241-2273/4401
2273-2

In [7]:
# Overall score: micro-averaged F1 of the top-1 predictions against the
# encoded gold labels.
micro_f1 = f1_score(y_test, y_preds, average='micro')
print('F1 Score:', micro_f1)

# Per-class precision / recall / F1, with rows named after the encoder's
# fitted classes.
report = classification_report(y_test, y_preds, target_names=le.classes_)
print(report)

F1 Score: 0.6203135650988412
              precision    recall  f1-score   support

       anger       0.62      0.55      0.58      1101
        fear       0.52      0.81      0.63       649
       happy       0.77      0.60      0.68      1017
        love       0.62      0.81      0.70       637
     sadness       0.61      0.47      0.53       997

    accuracy                           0.62      4401
   macro avg       0.63      0.65      0.62      4401
weighted avg       0.64      0.62      0.62      4401

