# Emotion extraction using SpeechBrain
This Python notebook extracts emotion scores [Neutral, Happy, Sad, Angry] from WAV files using a pretrained SpeechBrain model. Extraction is driven by the time boundaries of each transcribed phrase. In this phrase-level approach, the emotion scores that fall within the same phrase (chunk) are averaged for each emotion. This differs from the frame-level approach, where the scores are used as they are, without averaging.

In [None]:
# Mount Google Drive at /content/drive so files stored on the Drive can be
# reached through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Project root on the mounted Drive. Keep the trailing slash: the extraction
# cell builds file paths by plain string concatenation (path + "data/...").
path = '/content/drive/MyDrive/CS5647 Sound/CS5647_Project/'

In [None]:
!pip install speechbrain torch torchaudio

Collecting speechbrain
  Downloading speechbrain-1.0.2-py3-none-any.whl.metadata (23 kB)
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting ruamel.yaml>=0.17.28 (from hyperpyyaml->speechbrain)
  Downloading ruamel.yaml-0.18.6-py3-none-any.whl.metadata (23 kB)
Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml>=0.17.28->hyperpyyaml->speechbrain)
  Downloading ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Downloading speechbrain-1.0.2-py3-none-any.whl (824 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m824.8/824.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading HyperPyYAML-1.2.2-py3-none-any.whl (16 kB)
Downloading ruamel.yaml-0.18.6-py3-none-any.whl (117 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.8/117.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ruamel.yaml.clib-0.2.12-cp310-cp31

In [None]:
import pandas as pd
# Need to initialize the pretrained model first
from speechbrain.pretrained import EncoderASR, EncoderClassifier
import torchaudio
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Speech-recognition side: a Wav2Vec2 CTC checkpoint, used here as the
# alternative to asr-crdnn-rnnlm-librispeech. The processor and the acoustic
# model are loaded from the same checkpoint id.
ASR_CHECKPOINT = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(ASR_CHECKPOINT)
model_asr = Wav2Vec2ForCTC.from_pretrained(ASR_CHECKPOINT)

# Emotion side: SpeechBrain classifier, with its files saved under
# ./pretrained_emotion.
emotion_model = EncoderClassifier.from_hparams(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    savedir="./pretrained_emotion",
)

import os
import csv
import glob


# Sliding-window emotion extraction for every recording in the DAIC-WOZ
# folder. For each WAV file: read the matching transcript, score overlapping
# audio windows with the emotion model, average the window scores that fall
# inside each phrase's time range, and append the result to a per-recording CSV.

# Recordings that are already finished. The loop prints every id it starts on
# as `'NNN',` so the output can be pasted straight into this set after a run.
PROCESSED_IDS = {
    '300', '301', '302', '303', '304', '305', '306', '308', '309', '310',
    '311', '312', '313', '314', '315', '316', '317', '318', '319', '320',
    '321', '322', '323', '324', '325', '326', '327', '328', '329', '330',
    '331', '332', '333', '334', '335', '336', '337', '338', '339', '340',
    '341', '343', '344', '345', '346', '347', '348', '349', '350', '351',
    '352', '353', '354', '355', '356', '357', '358', '359', '360', '361',
    '362', '363', '364', '365', '366', '367', '368', '369', '370', '371',
    '372', '373', '374', '375', '376', '377', '378', '379', '380', '381',
    '382', '383', '384', '385', '386', '387', '388', '389', '390', '391',
    '392', '393', '395', '396', '397', '399',
    '404', '405', '406', '407', '408', '409', '410', '411', '412', '413',
    '415', '416', '417', '418', '419', '420', '421', '422', '423', '424',
    '425', '426', '427', '428', '429', '430', '431', '432', '433', '434',
    '435', '436', '437', '439', '440', '441', '442', '443', '444', '445',
    '446', '447', '448', '449', '450', '451', '452', '453', '454', '455',
    '456', '457', '458', '459', '461', '462', '463', '464', '465', '466',
    '467', '468', '469', '470', '471', '472', '473', '474', '475', '476',
    '477', '478', '479', '480', '481', '482', '483', '484', '485', '486',
    '487', '488', '489', '490', '491', '492',
    '600', '601', '602', '604', '605', '606', '607',
}
# Causing error in emotion extraction
FAILING_IDS = {'307', '603', '608', '609'}
# On-going results -> not included in project report
IN_PROGRESS_IDS = {'400', '401', '402', '403', '414', '438',
                   '612', '615', '617', '618', '619'}
SKIP_IDS = PROCESSED_IDS | FAILING_IDS | IN_PROGRESS_IDS

# The IEMOCAP wav2vec2 emotion checkpoint is documented as trained on 16 kHz
# audio. Calling its sub-modules directly bypasses any audio normalisation, so
# the waveform is resampled here before scoring.
MODEL_SAMPLE_RATE = 16000

window_size = 1.0  # Sliding window size in seconds
step_size = 0.5    # Hop between consecutive window starts, in seconds

for audio_file in glob.glob(path + "data/DAIC_WOZ/wavNscript/*.wav"):
    # File names start with the recording id, followed by an underscore.
    file_number = os.path.basename(audio_file).split('_')[0]
    if file_number in SKIP_IDS:
        continue

    print(f"'{file_number}',")

    # Step 1: load the phrase-level transcript that belongs to this recording.
    transcript_file = path + f"data/DAIC_WOZ/wavNscript/{file_number}_Transcript.csv"
    try:
        df_transcription = pd.read_csv(transcript_file)
    except FileNotFoundError:
        print(f"Transcript file not found for {audio_file}. Skipping...")
        continue

    # Step 2: load the audio and bring it to the model's sample rate.
    waveform, sample_rate = torchaudio.load(audio_file)
    if sample_rate != MODEL_SAMPLE_RATE:
        waveform = torchaudio.functional.resample(waveform, sample_rate, MODEL_SAMPLE_RATE)
        sample_rate = MODEL_SAMPLE_RATE

    # Step 3: sliding-window emotion detection. Only full-length windows are
    # scored; window j starts at j * step_size seconds. Scores are computed
    # per window only, and gradients are not needed for inference.
    window_len = int(window_size * sample_rate)
    hop_len = int(step_size * sample_rate)
    emotion_results = []
    with torch.no_grad():
        for i in range(0, waveform.shape[-1] - window_len + 1, hop_len):
            window = waveform[:, i:i + window_len]
            score = emotion_model.mods.output_mlp(emotion_model.mods.wav2vec2(window))
            # Keep the scores of the first row of the batch (first audio channel).
            emotion_results.append(score[0])

    # Step 4: align the window scores with the phrase timestamps.
    phrase_emotion_scores = []
    for _, row in df_transcription.iterrows():
        start_time = row['Start_Time']
        end_time = row['End_Time']
        transcription = row['Text']

        # Windows whose start time lies inside the phrase's time range.
        emotions_for_phrase = [e for (j, e) in enumerate(emotion_results)
                               if start_time <= j * step_size <= end_time]

        # Average element-wise across the matching windows; fall back to a
        # single all-zero row when no window starts inside the phrase.
        if emotions_for_phrase:
            avg_emotion_score = torch.mean(torch.stack(emotions_for_phrase), dim=0).tolist()
        else:
            avg_emotion_score = [[0.0, 0.0, 0.0, 0.0]]
        phrase_emotion_scores.append({
            "phrase": transcription,
            # Raw output_mlp scores (no softmax is applied, so these are not
            # normalised probabilities).
            "emotion_score": avg_emotion_score
        })

    # Step 5: append the results to this recording's CSV file.
    emo_file = path + f"data/DAIC_WOZ/wavNscript/{file_number}_phrase_emotions.csv"
    with open(emo_file, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['phrase', 'emotion_score'])

        # Opening in append mode has just created the file if it was missing,
        # so a size of zero means the header has not been written yet.
        if os.stat(emo_file).st_size == 0:
            writer.writeheader()
        # Marker row naming the source audio file, followed by one row per phrase.
        writer.writerow({'phrase': audio_file, 'emotion_score': []})
        writer.writerows(phrase_emotion_scores)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  state_dict = torch.load(path, map_location=device)


'402',


model.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

'401',
'400',
'403',
'414',
'438',


  return F.conv1d(


'617',
'618',
'612',
'615',
'619',
'620',


In [None]:
# Quick look at the records left by the extraction loop (they belong to
# whichever recording it handled last): how many phrases there are, then the
# first phrase record in full.
phrase_count = len(phrase_emotion_scores)
print(phrase_count)
first_record = phrase_emotion_scores[0]
print(first_record)

5
{'phrase': 'good', 'emotion_score': [[6.7289509773254395, -14.300140380859375, -2.990981101989746, 5.504098892211914], [6.754121780395508, -14.435650825500488, -2.8771352767944336, 5.355656623840332], [6.734217166900635, -14.400794982910156, -2.673593759536743, 5.264455795288086], [6.832696437835693, -14.580904006958008, -2.3428032398223877, 4.996855735778809], [6.903836727142334, -14.529321670532227, -2.4742608070373535, 5.091804027557373], [6.8349738121032715, -14.507585525512695, -2.5069408416748047, 5.120858192443848], [6.741348743438721, -14.524739265441895, -2.3476338386535645, 5.091528415679932], [6.822110176086426, -14.655706405639648, -2.362755298614502, 5.028277397155762], [6.821504592895508, -14.633331298828125, -2.3437819480895996, 5.059662818908691], [6.7699127197265625, -14.569358825683594, -2.397566795349121, 5.142264366149902], [6.753446578979492, -14.49173641204834, -2.4931397438049316, 5.203610897064209], [6.694135665893555, -14.517919540405273, -2.59428334236145, 5

In [None]:
# Score rows stored for the first phrase: print how many rows there are, then
# show the first five as the cell's result.
first_phrase_scores = phrase_emotion_scores[0]['emotion_score']
print(len(first_phrase_scores))
first_phrase_scores[:5]

137


[[6.7289509773254395,
  -14.300140380859375,
  -2.990981101989746,
  5.504098892211914],
 [6.754121780395508,
  -14.435650825500488,
  -2.8771352767944336,
  5.355656623840332],
 [6.734217166900635,
  -14.400794982910156,
  -2.673593759536743,
  5.264455795288086],
 [6.832696437835693,
  -14.580904006958008,
  -2.3428032398223877,
  4.996855735778809],
 [6.903836727142334,
  -14.529321670532227,
  -2.4742608070373535,
  5.091804027557373]]

In [None]:
import numpy as np

# Header row for the per-phrase table printed below.
# NOTE(review): the column order (Neutral, Happy, Sad, Angry) follows this
# notebook's own labelling; confirm it against the emotion model's label
# encoder before interpreting the columns.
print(f"{'':>45} {'   Neutral':<12} {'   Happy':<15} {       'Sad':<13} {'Angry':<12} ")
for record in phrase_emotion_scores:
    # Collapse the score rows of a phrase into one mean per column and store
    # it on the record under 'avr_emo_score'.
    record['avr_emo_score'] = np.array(record['emotion_score']).mean(axis=0)

    print(f"{record['phrase']:>45} {record['avr_emo_score']}")



                                                 Neutral      Happy        Sad           Angry        
                              so I'm going to [ 12.32447507 -12.07226545  -5.52684767   0.0738344 ]
                         interview in Spanish [12.94128605 -9.95798156 -2.58876414 -5.71331066]
                                         okay [16.55466889 -7.38492625 -7.81034679 -6.17315637]
                                         good [16.56782271 -6.27761511 -8.68089255 -5.91080189]
                              Atlanta Georgia [16.77601869 -8.16337349 -7.42987523 -5.99051741]
                     my parents are from here [12.20556491 -9.395598   -2.12558469 -4.98088449]
                                    I love it [15.64358655 -9.28779931 -8.0404845  -3.12641051]
  I like the weather I like the opportunities [  5.44363773 -12.85527155   0.59646954   2.26757566]
                                at the minute [15.96490138 -8.90930802 -7.46222741 -4.60714845]
                         