In [17]:
import plotly.express as px
import pandas as pd

## Dataset

In [25]:
# Load the Parquet file from the current user's Downloads folder.
# Path.home() replaces the hard-coded "/Users/<name>" prefix so the cell is
# not tied to a single machine account; the file name itself is unchanged.
from pathlib import Path

file_path = str(Path.home() / "Downloads" / "train-00000-of-00001.parquet")
df = pd.read_parquet(file_path)

# Display the first few rows
print(df.head())


                                                text  label
0  i feel awful about it too because it s my job ...      0
1                              im alone i feel awful      0
2  ive probably mentioned this before but i reall...      1
3           i was feeling a little low few days back      0
4  i beleive that i am much more sensitive to oth...      2


In [26]:
# Number of entries in the label column (one per row of the frame).
df['label'].size

416809

0: sadness

1: joy

2: love

3: anger

4: fear

5: surprise

In [27]:
# Integer class id -> emotion name; the position in the list is the class id.
emotions = dict(enumerate(["sadness", "joy", "love", "anger", "fear", "surprise"]))

# Add one indicator column per emotion, initialised to 0 for every row.
for emotion in emotions.values():
    df[emotion] = 0

print(df.head())


                                                text  label  sadness  joy  \
0  i feel awful about it too because it s my job ...      0        0    0   
1                              im alone i feel awful      0        0    0   
2  ive probably mentioned this before but i reall...      1        0    0   
3           i was feeling a little low few days back      0        0    0   
4  i beleive that i am much more sensitive to oth...      2        0    0   

   love  anger  fear  surprise  
0     0      0     0         0  
1     0      0     0         0  
2     0      0     0         0  
3     0      0     0         0  
4     0      0     0         0  


In [28]:
# One-hot encode the integer `label` column into the per-emotion indicator
# columns. A vectorised comparison per class replaces the row-by-row
# iterrows()/df.at loop: it gives the same 0/1 values, avoids one Python-level
# write per row, and is idempotent (each column is fully recomputed, so the
# cell no longer depends on the columns having been zeroed beforehand).

# Keep the original failure mode: a label with no entry in `emotions` is an error.
unknown_labels = set(df["label"].unique()) - set(emotions)
if unknown_labels:
    raise KeyError(f"labels missing from `emotions`: {unknown_labels}")

for label_id, emotion_name in emotions.items():
    df[emotion_name] = (df["label"] == label_id).astype("int64")

# Human-readable class name alongside the integer label.
df["label_name"] = df["label"].map(emotions)

print(df.head())


                                                text  label  sadness  joy  \
0  i feel awful about it too because it s my job ...      0        1    0   
1                              im alone i feel awful      0        1    0   
2  ive probably mentioned this before but i reall...      1        0    1   
3           i was feeling a little low few days back      0        1    0   
4  i beleive that i am much more sensitive to oth...      2        0    0   

   love  anger  fear  surprise label_name  
0     0      0     0         0    sadness  
1     0      0     0         0    sadness  
2     0      0     0         0        joy  
3     0      0     0         0    sadness  
4     1      0     0         0       love  


In [14]:
# Preview of the frame without the integer label column. The result is only
# displayed, not assigned, so `df` itself still contains 'label'.
df.drop('label', axis='columns')

Unnamed: 0,text,sadness,joy,love,anger,fear,surprise
0,i feel awful about it too because it s my job ...,1,0,0,0,0,0
1,im alone i feel awful,1,0,0,0,0,0
2,ive probably mentioned this before but i reall...,0,1,0,0,0,0
3,i was feeling a little low few days back,1,0,0,0,0,0
4,i beleive that i am much more sensitive to oth...,0,0,1,0,0,0
...,...,...,...,...,...,...,...
416804,that was what i felt when i was finally accept...,0,1,0,0,0,0
416805,i take every day as it comes i m just focussin...,0,0,0,0,1,0
416806,i just suddenly feel that everything was fake,1,0,0,0,0,0
416807,im feeling more eager than ever to claw back w...,0,1,0,0,0,0


## Class distribution

In [29]:
# Column-wise sum of the 0/1 indicator columns gives the row count per class.
labels = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
label_counts = df.loc[:, labels].sum(axis=0)
print(label_counts)

# Sum of the per-class counts.
total = label_counts.sum()
print("total labels:", total)

sadness     121187
joy         141067
love         34554
anger        57317
fear         47712
surprise     14972
dtype: int64
total labels: 416809


In [30]:
# Bar chart of how many rows carry each label name, smallest class first.
class_counts = df['label_name'].value_counts(ascending=True)
px.bar(class_counts, template='plotly_white')

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

## Tweet length

In [31]:
# Whitespace-token count per text. `.str.len()` is the built-in vectorised
# accessor for the length of each split list; unlike `.apply(len)` it yields
# a missing value for a missing text instead of raising TypeError.
df["Words Per Tweet"] = df["text"].str.split().str.len()

# Distribution of text length (in words) for each emotion.
px.box(
    df,
    y='Words Per Tweet',
    color='label_name',
    template='plotly_white',
)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

## Train/validation/test split and tokenisation

In [34]:
from sklearn.model_selection import train_test_split

# First hold out 30% of the rows, stratified on the integer label ...
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df["label"],
    random_state=42,
)
# ... then halve the held-out part into validation and test, again stratified.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label"],
    random_state=42,
)

# Print the sizes
print(f"Training Set: {len(train_df)} samples")
print(f"Validation Set: {len(val_df)} samples")
print(f"Test Set: {len(test_df)} samples")


Training Set: 291766 samples
Validation Set: 62521 samples
Test Set: 62522 samples


In [36]:
from datasets import Dataset
from transformers import DistilBertTokenizerFast

# Load the fast DistilBERT tokenizer for the uncased base checkpoint.
tokenizer_ckpt = "distilbert-base-uncased"
distilbert_tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_ckpt)

def tokenise(batch):
    """Tokenise the "text" field of a batch with the DistilBERT tokenizer.

    Args:
        batch: mapping with a "text" entry holding the texts to encode.

    Returns:
        The tokenizer output for those texts, with padding and truncation enabled.
    """
    # NOTE(review): with `padding=True` the padded length presumably follows the
    # longest text in each batch passed in, so rows tokenised in different
    # batches may differ in length — confirm the training collator re-pads.
    texts = batch["text"]
    return distilbert_tokenizer(texts, padding=True, truncation=True)

# Wrap each pandas split in a Hugging Face Dataset (needed for batched `.map`).
train_dataset, val_dataset, test_dataset = [
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df, test_df)
]

# Tokenise every split in batches; the tokenizer outputs are added as new columns.
train_tokenised, val_tokenised, test_tokenised = [
    split_dataset.map(tokenise, batched=True)
    for split_dataset in (train_dataset, val_dataset, test_dataset)
]

# Print an example
print(train_tokenised[0])

Map: 100%|██████████| 291766/291766 [00:37<00:00, 7762.04 examples/s]
Map: 100%|██████████| 62521/62521 [00:06<00:00, 9539.26 examples/s] 
Map: 100%|██████████| 62522/62522 [00:06<00:00, 10077.16 examples/s]

{'text': 'i was looking forward to feeling delighted', 'label': 1, 'sadness': 0, 'joy': 1, 'love': 0, 'anger': 0, 'fear': 0, 'surprise': 0, 'label_name': 'joy', 'Words Per Tweet': 7, '__index_level_0__': 200374, 'input_ids': [101, 1045, 2001, 2559, 2830, 2000, 3110, 15936, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}





In [40]:
# Tokenised splits gathered under their conventional split names.
emotions_encoded = dict(
    train=train_tokenised,
    validation=val_tokenised,
    test=test_tokenised,
)

In [41]:
# Show the features and row count of each tokenised split.
print(emotions_encoded)

{'train': Dataset({
    features: ['text', 'label', 'sadness', 'joy', 'love', 'anger', 'fear', 'surprise', 'label_name', 'Words Per Tweet', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 291766
}), 'validation': Dataset({
    features: ['text', 'label', 'sadness', 'joy', 'love', 'anger', 'fear', 'surprise', 'label_name', 'Words Per Tweet', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 62521
}), 'test': Dataset({
    features: ['text', 'label', 'sadness', 'joy', 'love', 'anger', 'fear', 'surprise', 'label_name', 'Words Per Tweet', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 62522
})}


## Training

In [47]:
from transformers import AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# One output class per emotion.
num_labels = 6

# Pretrained DistilBERT with a sequence-classification head, moved to `device`.
model_ckpt = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
model = model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    Args:
        pred: object exposing `label_ids` (reference labels) and `predictions`
            (scores; the argmax over the last axis is taken as the predicted class).

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [53]:
from transformers import Trainer, TrainingArguments

bs = 64  # Batch size used for both training and evaluation
# Number of full batches in the training split, used as the logging interval.
logging_steps = len(emotions_encoded["train"]) // bs
model_name = 'Destilbert-finetuned-emotion'  # also used as the output directory

# Collect the hyperparameters first so they can be read (and tweaked) in one place.
training_kwargs = dict(
    output_dir=model_name,
    num_train_epochs=3,              # number of training epochs
    learning_rate=2e-5,              # model learning rate
    per_device_train_batch_size=bs,  # batch size
    per_device_eval_batch_size=bs,   # batch size
    weight_decay=0.01,
    eval_strategy="epoch",
    disable_tqdm=False,
    report_to="none",
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error",
)
training_args = TrainingArguments(**training_kwargs)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`