# <font color = 'indianred'>**Emotion Detection - MultiLabel** </font>

**Objective:**

In this notebook, we will understand the data for Emotion Detection. We will also understand how to push the dataset to Huggingface hub for use in future notebooks.






















# <font color = 'indianred'> **1. Setting up the Environment** </font>

In [None]:
# If in Colab, then import the drive module from google.colab
if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  # Mount the Google Drive to access files stored there
  drive.mount('/content/drive')

  # Install the latest version of torchtext library quietly without showing output
  # !pip install torchtext -qq
  !pip install datasets evaluate -U -qq ## NEW LINES ##
  basepath = '/content/drive/MyDrive/data/'
else:
  basepath = '/home/harpreet/Insync/google_drive_shaannoor/data'

Mounted at /content/drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h

<font color = 'indianred'> *Load Libraries* </font>

In [None]:
from datasets import load_dataset, DatasetDict
from pathlib import Path
from collections import Counter
import evaluate
import numpy as np
from sklearn.metrics import f1_score

In [None]:
data_path = Path(basepath) / 'datasets/Kaggle_spring_2024'
data_path

PosixPath('/content/drive/MyDrive/data/datasets/Kaggle_spring_2024')

# <font color = 'indianred'> **2. Load and Process Dataset** </font>

In [None]:
emotion_data = load_dataset('csv', data_files= str(data_path/'train.csv'))

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
emotion_data

DatasetDict({
    train: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 7724
    })
})

In [None]:
emotion_data['train'][0:2]

{'ID': ['2017-21441', '2017-31535'],
 'Tweet': ["“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry",
  'Whatever you decide to do make sure it makes you #happy.'],
 'anger': [0, 0],
 'anticipation': [1, 0],
 'disgust': [0, 0],
 'fear': [0, 0],
 'joy': [0, 1],
 'love': [0, 1],
 'optimism': [1, 1],
 'pessimism': [0, 0],
 'sadness': [0, 0],
 'surprise': [0, 0],
 'trust': [1, 0]}

In [None]:
labels = [label for label in emotion_data['train'].features.keys() if label not in ['ID', 'Tweet']]
labels

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'love',
 'optimism',
 'pessimism',
 'sadness',
 'surprise',
 'trust']

In [None]:
# Let's define the combined and type-casting function
def combine_emotion_labels(example):
    # Combine and cast to float
    example['label'] = [float(example[emotion]) for emotion in labels]
    return example

# Apply the function to each example in the dataset
processed_dataset = emotion_data.map(combine_emotion_labels)

# Remove the individual emotion columns to only keep 'Tweet' and 'label'
processed_dataset = processed_dataset.remove_columns(['ID', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'])

# Rename the 'Tweet' column to 'text'
processed_dataset = processed_dataset.rename_column('Tweet', 'text')


Map:   0%|          | 0/7724 [00:00<?, ? examples/s]

In [None]:
processed_dataset['train'].features

{'text': Value(dtype='string', id=None),
 'label': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None)}

In [None]:
processed_dataset['train'][0:3]

{'text': ["“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry",
  'Whatever you decide to do make sure it makes you #happy.',
  "@Max_Kellerman  it also helps that the majority of NFL coaching is inept. Some of Bill O'Brien's play calling was wow, ! #GOPATS"],
 'label': [[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0],
  [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
  [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]]}

# <font color = 'indianred'> **3. Accessing and Manuplating Splits**</font>



In [None]:
# Split the test set into test and validation sets
train_val_splits = processed_dataset["train"].train_test_split(
    test_size=0.2, seed=42)  # 80% for train, 20% for validation

# Extract the test and validation splits
train_split = train_val_splits["train"]
valid_split = train_val_splits["test"]


<font color = 'indianred'>*small subset for initial experimenttaion*</font>

In [None]:
train_split

Dataset({
    features: ['text', 'label'],
    num_rows: 6179
})

In [None]:
valid_split

Dataset({
    features: ['text', 'label'],
    num_rows: 1545
})

In [None]:
train_emotion_spring_2024 = DatasetDict({
    'train': train_split,
    'valid': valid_split,
})

In [None]:
train_emotion_spring_2024['train'][0:2]

{'text': ['Does anyone know, are both Sims in a dual sim phone both locked to the same network! #worry',
  'A not very young man compiled info on a smiling baby then a mattress created new evil.'],
 'label': [[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0],
  [1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]}

In [1]:
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
repository_name = "harpreetmann/train_emotion_spring_2024"
train_emotion_spring_2024.push_to_hub(repository_name)

# <font color = 'indianred'> **4. Label Distribution** </font>

In [None]:
# Initialize counters for each label in train and valid sets
train_label_counts = Counter()
valid_label_counts = Counter()

# Function to update counts
def update_label_counts(dataset, label_counts):
    for label_array in dataset['label']:
        # print(label_array)
        # label_array is expected to be a list of binary values
        for index, label in enumerate(label_array):
            if label == 1:
                label_counts[index] += 1

# Update counts for both datasets
update_label_counts(train_emotion_spring_2024['train'], train_label_counts)
update_label_counts(train_emotion_spring_2024['valid'], valid_label_counts)

print(train_label_counts)

# Display the label distributions
print("Training set label distribution:")
for label, count in train_label_counts.items():
    print(f"Label {labels[label]}: {count}")

print("\nValidation set label distribution:")
for label, count in valid_label_counts.items():
    print(f"Label {labels[label]}: {count}")


Counter({2: 2330, 0: 2306, 4: 2293, 8: 1850, 6: 1818, 3: 1084, 1: 891, 7: 714, 5: 656, 9: 314, 10: 306})
Training set label distribution:
Label fear: 1084
Label pessimism: 714
Label sadness: 1850
Label anger: 2306
Label anticipation: 891
Label joy: 2293
Label optimism: 1818
Label disgust: 2330
Label trust: 306
Label love: 656
Label surprise: 314

Validation set label distribution:
Label joy: 584
Label anger: 553
Label anticipation: 211
Label disgust: 591
Label optimism: 473
Label pessimism: 181
Label sadness: 423
Label fear: 279
Label love: 176
Label surprise: 82
Label trust: 94


In [None]:
y_true = np.array([[0, 0, 0], [1, 1, 1], [0, 1, 1]])
logits = np.array([[-2, -0.2, -0.2], [8, 8, 8], [0.7, 0.7, -3]])
eval_pred = ( logits, y_true,)
y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]]
print(f1_score(y_true, y_pred, average='macro'))
print(f1_score(y_true, y_pred, average='micro'))

0.7777777777777777
0.8000000000000002


In [None]:
y_True = [ 0, 1, 2]

In [None]:
y_true = ([0, 0, 0], [1, 1, 1], [0, 1, 1]])
y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]]

In [None]:
accuracy_metric = evaluate.load('accuracy', 'multilabel')
f1 = evaluate.load('f1','multilabel')


def compute_metrics(eval_pred):
    # accuracy_metric = evaluate.load('accuracy', 'multilabel')

    logits, labels = eval_pred
    # print(logits.shape)
    preds = (logits > 0).astype(int)
    accuracy = accuracy_metric.compute(predictions=preds, references=labels)
    f1_micro = f1.compute(predictions=preds, references=labels, average='micro')
    f1_macro = f1.compute(predictions=preds, references=labels, average='macro')
    return {'f1_micro':f1_micro['f1'],
            'f1_macro':f1_macro['f1'],
            'accuracy':accuracy['accuracy'],
            }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [None]:
compute_metrics(eval_pred)

{'f1_micro': 0.8000000000000002,
 'f1_macro': 0.7777777777777777,
 'accuracy': 0.6666666666666666}