# Stage 1: Multiclass emotion classification

In [None]:
!pip install datasets
!pip install pandas
!pip install transformers
!pip install shap

In [None]:
import datasets
import pandas as pd
import transformers

import shap

# Pull only the training split of the "emotion" dataset.
dataset = datasets.load_dataset("emotion", split="train")

# Keep two columns in a DataFrame: the raw text and its label (renamed to "emotion").
data = pd.DataFrame(dict(text=dataset["text"], emotion=dataset["label"]))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Show the loaded text/emotion frame as this cell's output.
data

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,1
15998,i feel like this was such a rude comment and i...,3


## Build a transformers pipeline

> Note that we set `return_all_scores=True` on the pipeline so that we can observe the model's scores for all classes, not just the top prediction.

In [None]:
# One checkpoint name feeds both the tokenizer and the classifier so they cannot drift apart.
MODEL_NAME = "nateraw/bert-base-uncased-emotion"

tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = transformers.AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Prediction pipeline. return_all_scores=True makes it report a score for every
# label instead of only the best one.
# NOTE(review): newer transformers releases are reported to deprecate
# return_all_scores in favour of top_k=None, which may also change how the
# output is nested - confirm before switching.
pred = transformers.pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
)

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]



In [None]:
# Sample output: one score per emotion label for each of the first three texts
pred(data["text"].head(3).tolist())

[[{'label': 'sadness', 'score': 0.9964079260826111},
  {'label': 'joy', 'score': 0.0006807023892179132},
  {'label': 'love', 'score': 0.0006490044179372489},
  {'label': 'anger', 'score': 0.0012332070618867874},
  {'label': 'fear', 'score': 0.0006467584171332419},
  {'label': 'surprise', 'score': 0.0003823300648946315}],
 [{'label': 'sadness', 'score': 0.9952924251556396},
  {'label': 'joy', 'score': 0.0011914707720279694},
  {'label': 'love', 'score': 0.0009963026968762279},
  {'label': 'anger', 'score': 0.0004628194437827915},
  {'label': 'fear', 'score': 0.0016494360752403736},
  {'label': 'surprise', 'score': 0.000407604209613055}],
 [{'label': 'sadness', 'score': 0.0022772427182644606},
  {'label': 'joy', 'score': 0.0006832053186371922},
  {'label': 'love', 'score': 0.0020760460756719112},
  {'label': 'anger', 'score': 0.9914624094963074},
  {'label': 'fear', 'score': 0.0019362237071618438},
  {'label': 'surprise', 'score': 0.001564879552461207}]]

In [None]:
# Label name -> integer class id (the ids are compared with the `emotion`
# column when accuracy is computed further down).
emotion_to_number = {
    'sadness': 0,
    'joy': 1,
    'love': 2,
    'anger': 3,
    'fear': 4,
    'surprise': 5
}

def predict_emotion(text):
    """Return the integer class id of the highest-scoring emotion for ``text``.

    Parameters
    ----------
    text : str
        A single text. Only the first result returned by ``pred`` is inspected,
        so passing several texts at once is not supported.

    Returns
    -------
    int or None
        The ``emotion_to_number`` id of the top-scoring label, or ``None`` when
        the pipeline returns no scores or the top label is not in the mapping.
    """
    predictions = pred(text)
    # Each input text yields one list of {'label': ..., 'score': ...} dicts.
    label_scores = predictions[0] if predictions else []
    if not label_scores:
        return None
    # max() keeps the first entry on ties, like a strict ">" scan would.
    best = max(label_scores, key=lambda entry: entry['score'])
    class_id = emotion_to_number.get(best['label'])
    return None if class_id is None else int(class_id)

In [None]:
# Work on the first 101 rows only (positions 0-100) so that classifying one
# text at a time stays quick; assign() returns a new frame, leaving `data` untouched.
df = data.head(101).assign(
    predicted_class=lambda frame: frame["text"].apply(predict_emotion)
)

In [None]:
# Cast the predictions to int so they are compared as integers with the `emotion` labels
df = df.astype({'predicted_class': int})
df

Unnamed: 0,text,emotion,predicted_class
0,i didnt feel humiliated,0,0
1,i can go from feeling so hopeless to so damned...,0,0
2,im grabbing a minute to post i feel greedy wrong,3,3
3,i am ever feeling nostalgic about the fireplac...,2,2
4,i am feeling grouchy,3,3
...,...,...,...
96,im starting to feel wryly amused at the banal ...,1,1
97,i find every body beautiful and only want peop...,1,1
98,i hear are owners who feel victimized by their...,0,0
99,i say goodbye to the fam theyre all sad a cryi...,3,3


In [None]:
# Row-wise flag: did the predicted class match the labelled emotion?
is_correct = df['predicted_class'].eq(df['emotion'])

# Hits, and the number of rows that were scored
correct_predictions = is_correct.sum()
total_predictions = is_correct.size

# Accuracy = share of rows predicted correctly
accuracy = correct_predictions / total_predictions

print(f"Accuracy: {accuracy}")

Accuracy: 0.9603960396039604


Extract the texts for four of the predicted emotion classes: sadness (0), joy (1, saved as "happy"), anger (3), and surprise (5). The love (2) and fear (4) classes are not exported.

In [None]:
# Select the `text` column for each predicted class id in a single .loc lookup
sad_texts = df.loc[df['predicted_class'].eq(0), 'text']
happy_texts = df.loc[df['predicted_class'].eq(1), 'text']
anger_texts = df.loc[df['predicted_class'].eq(3), 'text']
surprise_texts = df.loc[df['predicted_class'].eq(5), 'text']

Saving the extracted texts to text files, without indexes or headers, categorized by emotion

In [None]:
# Write one text per line as plain UTF-8. Series.to_csv would apply CSV quoting
# (wrapping any text that contains a comma or a quote in "..."), and the files
# are read back line by line later, so those quotes would end up in the text
# handed to the speech model.
for emotion_texts, out_path in (
    (happy_texts, "happy_texts.txt"),
    (sad_texts, "sad_texts.txt"),
    (anger_texts, "anger_texts.txt"),
    (surprise_texts, "surprise_texts.txt"),
):
    with open(out_path, "w", encoding="utf-8") as handle:
        # An empty selection simply produces an empty file.
        handle.writelines(f"{text}\n" for text in emotion_texts)

# Stage 2: Text-to-Speech

In [None]:
!pip install TTS

In [None]:
from TTS.api import TTS

# Name of the speech model used for every clip generated below
TTS_MODEL_NAME = "tts_models/en/ljspeech/glow-tts"
tts = TTS(model_name=TTS_MODEL_NAME)

 > Downloading model to /root/.local/share/tts/tts_models--en--ljspeech--glow-tts


 99%|█████████▉| 341M/344M [00:08<00:00, 47.6MiB/s]

 > Model's license - MPL
 > Check https://www.mozilla.org/en-US/MPL/2.0/ for more info.
 > Downloading model to /root/.local/share/tts/vocoder_models--en--ljspeech--multiband-melgan



100%|██████████| 344M/344M [00:12<00:00, 27.7MiB/s]

  1%|          | 770k/82.8M [00:00<00:10, 7.69MiB/s][A
  3%|▎         | 2.18M/82.8M [00:00<00:07, 11.5MiB/s][A
  4%|▍         | 3.57M/82.8M [00:00<00:06, 11.4MiB/s][A
  8%|▊         | 6.81M/82.8M [00:00<00:03, 19.2MiB/s][A
 17%|█▋        | 13.9M/82.8M [00:00<00:01, 37.1MiB/s][A
 25%|██▍       | 20.5M/82.8M [00:00<00:01, 46.7MiB/s][A
 32%|███▏      | 26.9M/82.8M [00:00<00:01, 52.3MiB/s][A
 41%|████      | 33.6M/82.8M [00:00<00:00, 56.9MiB/s][A
 49%|████▉     | 40.7M/82.8M [00:00<00:00, 61.2MiB/s][A
 58%|█████▊    | 48.2M/82.8M [00:01<00:00, 65.3MiB/s][A
 66%|██████▌   | 54.7M/82.8M [00:01<00:00, 63.1MiB/s][A
 75%|███████▍  | 61.9M/82.8M [00:01<00:00, 65.6MiB/s][A
 83%|████████▎ | 69.2M/82.8M [00:01<00:00, 67.8MiB/s][A
 92%|█████████▏| 76.0M/82.8M [00:01<00:00, 61.9MiB/s][A
 99%|█████████▉| 82.3M/82.8M [00:01<00:00, 37.2MiB/s][A

 > Model's license - MPL
 > Check https://www.mozilla.org/en-US/MPL/2.0/ for more info.
 > Using model: glow_tts
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:1.1
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Vocoder Model: multiband_melgan
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func

In [None]:
import os

Read the lines of each text file and convert every line to an audio file using the text-to-speech model loaded above.


  
*   Sad Texts




In [None]:
folder_name = "sad"
os.makedirs(folder_name, exist_ok=True)  # no-op when the folder already exists

# `with` closes the file even if something below raises; utf-8 is pandas'
# default encoding for to_csv, which produced this file.
with open('sad_texts.txt', 'r', encoding='utf-8') as sad_file:
    # Drop the trailing newline and skip blank lines so the model never gets empty text
    sad_lines = [line.strip() for line in sad_file if line.strip()]

# Clips are numbered from 1: sad/sad1.wav, sad/sad2.wav, ...
for count, line in enumerate(sad_lines, start=1):
    audio_filename = "sad" + str(count) + ".wav"
    output_file_path = os.path.join(folder_name, audio_filename)
    tts.tts_to_file(text=line, file_path=output_file_path)  # Convert text to speech and save as audio file

*   Happy Texts

In [None]:
folder_name = "happy"
os.makedirs(folder_name, exist_ok=True)  # no-op when the folder already exists

# `with` closes the file even if something below raises; utf-8 is pandas'
# default encoding for to_csv, which produced this file.
with open('happy_texts.txt', 'r', encoding='utf-8') as joy_file:
    # Drop the trailing newline and skip blank lines so the model never gets empty text
    joy_lines = [line.strip() for line in joy_file if line.strip()]

# Clips are numbered from 1: happy/happy1.wav, happy/happy2.wav, ...
for count, line in enumerate(joy_lines, start=1):
    audio_filename = "happy" + str(count) + ".wav"
    output_file_path = os.path.join(folder_name, audio_filename)
    tts.tts_to_file(text=line, file_path=output_file_path)  # Convert text to speech and save as audio file

*   Anger Texts

In [None]:
folder_name = "anger"
os.makedirs(folder_name, exist_ok=True)  # no-op when the folder already exists

# `with` closes the file even if something below raises; utf-8 is pandas'
# default encoding for to_csv, which produced this file.
with open('anger_texts.txt', 'r', encoding='utf-8') as anger_file:
    # Drop the trailing newline and skip blank lines so the model never gets empty text
    anger_lines = [line.strip() for line in anger_file if line.strip()]

# Clips are numbered from 1: anger/anger1.wav, anger/anger2.wav, ...
for count, line in enumerate(anger_lines, start=1):
    audio_filename = "anger" + str(count) + ".wav"
    output_file_path = os.path.join(folder_name, audio_filename)
    tts.tts_to_file(text=line, file_path=output_file_path)  # Convert text to speech and save as audio file

*   Surprise Texts

In [None]:
folder_name = "surprise"
os.makedirs(folder_name, exist_ok=True)  # no-op when the folder already exists

# `with` closes the file even if something below raises; utf-8 is pandas'
# default encoding for to_csv, which produced this file.
with open('surprise_texts.txt', 'r', encoding='utf-8') as surprise_file:
    # Drop the trailing newline and skip blank lines so the model never gets empty text
    surprise_lines = [line.strip() for line in surprise_file if line.strip()]

# Clips are numbered from 1: surprise/surprise1.wav, surprise/surprise2.wav, ...
for count, line in enumerate(surprise_lines, start=1):
    audio_filename = "surprise" + str(count) + ".wav"
    output_file_path = os.path.join(folder_name, audio_filename)
    tts.tts_to_file(text=line, file_path=output_file_path)  # Convert text to speech and save as audio file

*   Sample Output

In [None]:
from IPython.display import Audio

# Embed a player for one generated clip (the eighth "sad" file) as the cell output
sample_clip = 'sad/sad8.wav'
Audio(sample_clip)

*   Zip all folders for easy downloading

In [None]:
!zip -r all_anger.zip /content/anger/
!zip -r all_happy.zip /content/happy/
!zip -r all_sad.zip /content/sad/
!zip -r all_surprise.zip /content/surprise/