In [None]:
import pandas as pd
import numpy as np
from cleantext import clean
import re
from transformers import XLNetTokenizer, XLNetForSequenceClassification, TrainingArguments, Trainer, pipeline
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import datasets
import evaluate
import random

## Preprocess our data

In [None]:
# Read the three pre-split emotion-label CSV files (train / test / val order).
data_train, data_test, data_val = map(
    pd.read_csv,
    (
        './emotions_data/emotion-labels-train.csv',
        './emotions_data/emotion-labels-test.csv',
        './emotions_data/emotion-labels-val.csv',
    ),
)

In [None]:
# Peek at the first five rows of the raw training file.
data_train.head(n=5)

In [None]:
# Pool all three files into one frame with a fresh 0..n-1 index; the pooled
# rows are re-split further down.
source_frames = [data_train, data_test, data_val]
data = pd.concat(source_frames, ignore_index=True)

In [None]:
# Run every raw text through cleantext's `clean`, with emoji removal enabled.
data['text_clean'] = data['text'].map(lambda raw_text: clean(raw_text, no_emoji=True))

In [None]:
# Remove @mentions: an '@' followed by one or more non-whitespace characters.
# The previous pattern contained a mis-encoded caret, so the character class
# matched the literal characters instead of "anything that is not whitespace"
# and user handles were left in the text. A raw string avoids the invalid
# escape-sequence warning for `\S`.
data['text_clean'] = data['text_clean'].str.replace(r'@\S+', '', regex=True)

In [None]:
# Show the first 20 rows to compare the raw and cleaned text columns.
data.head(n=20)

In [None]:
# Bar chart of how many rows carry each label before balancing.
label_counts = data['label'].value_counts()
label_counts.plot(kind="bar")

In [None]:
# Balance the classes by downsampling every label to the size of the rarest one.
BALANCE_SEED = 42  # fixed seed so a fresh "Restart & Run All" selects the same rows
g = data.groupby('label')
# GroupBy.sample draws the same number of rows from each group and returns them
# with all original columns (including 'label'). This avoids groupby.apply,
# whose treatment of the grouping column is deprecated in recent pandas, and
# computes the minimum group size once instead of once per group.
data = g.sample(n=int(g.size().min()), random_state=BALANCE_SEED).reset_index(drop=True)

In [None]:
# Same bar chart again, now on the downsampled frame, to confirm the balance.
balanced_label_counts = data['label'].value_counts()
balanced_label_counts.plot(kind="bar")

In [None]:
# Map each label string to an integer id. The fitted encoder is kept in a
# variable so the ids can be translated back to label names later.
label_encoder = LabelEncoder()
data['label_int'] = label_encoder.fit_transform(data['label'])

In [None]:
# Number of distinct classes, derived from the encoded label column so it
# cannot drift out of sync with the data (previously hard-coded to 4).
NUM_LABELS = int(data['label_int'].nunique())

In [None]:
# Hold out 20% of the rows for testing, then take 10% of the remaining rows
# for validation (roughly 72% / 8% / 20% overall).
SPLIT_SEED = 42  # fixed seed so the splits are identical on every re-run
# Stratifying on the encoded label keeps the class proportions equal across
# the train, validation and test splits.
train_split, test_split = train_test_split(
    data,
    train_size=0.8,
    random_state=SPLIT_SEED,
    stratify=data['label_int'],
)
train_split, val_split = train_test_split(
    train_split,
    train_size=0.9,
    random_state=SPLIT_SEED,
    stratify=train_split['label_int'],
)

In [None]:
# Row counts of the train, test and validation splits, one per line.
print(len(train_split), len(test_split), len(val_split), sep="\n")

In [None]:
# Reduce each split to two columns: the integer label (as "label") and the
# cleaned text (as "text"). Going through `.values` discards the shuffled
# index, so the new frames get a default 0..n-1 index.
train_df = pd.DataFrame(
    {"label": train_split["label_int"].values, "text": train_split["text_clean"].values}
)

test_df = pd.DataFrame(
    {"label": test_split["label_int"].values, "text": test_split["text_clean"].values}
)

In [None]:
# Turn each frame into a {column name: list of values} mapping.
train_dict, test_dict = (
    frame.to_dict(orient='list') for frame in (train_df, test_df)
)

In [None]:
# Build the Hugging Face datasets from the column dictionaries prepared above.
# `Dataset.from_dict` is documented to take a mapping of column name -> values;
# the DataFrames were previously passed here, leaving `train_dict`/`test_dict`
# unused. Reading from the dicts also lets this cell be re-run safely even
# though it rebinds `train_df`/`test_df` to Dataset objects.
train_df = datasets.Dataset.from_dict(train_dict)
test_df = datasets.Dataset.from_dict(test_dict)

In [None]:
# Bundle the two datasets under the "train" and "test" split names.
split_datasets = {"train": train_df, "test": test_df}
dataset_dict = datasets.DatasetDict(split_datasets)

In [None]:
# Display the DatasetDict as the cell output to inspect the assembled splits.
dataset_dict