# NLP Speech Sentiment Analysis: Detection

#### Import libraries

In [1]:
import numpy as np
import re
import pickle
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
import whisper

## Helper functions

### Text processing function

In [2]:
def preprocess_text(text, stop_words):
    """Clean a raw text string and drop stopwords.

    The steps run in this order: strip URLs, delete every character that is
    not an ASCII letter or whitespace, lowercase, collapse whitespace runs,
    and finally discard any word contained in ``stop_words``.

    Args:
        text: Raw input string.
        stop_words: Collection of words to exclude; membership is tested
            with ``in`` against the lowercased words.

    Returns:
        The surviving words joined by single spaces (possibly an empty string).
    """
    # URLs go first so their letters do not survive as pseudo-words
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Digits and punctuation (including apostrophes) are dropped, not replaced
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_urls)

    # Lowercase, then squeeze whitespace runs down to single spaces
    normalized = re.sub(r'\s+', ' ', letters_only.lower()).strip()

    return ' '.join(token for token in normalized.split() if token not in stop_words)

## Emotion detection using the pre-trained model

In [3]:
# Load the tokenizer pickle file (created with the 'nlp_model.ipynb' notebook).
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# load a 'tokenizer.pkl' that comes from a trusted source.
with open('tokenizer.pkl', 'rb') as file:
    tokenizer = pickle.load(file)

In [4]:
# Stopwords: NLTK's English list plus a custom set of extra words
nltk_stop_words = set(stopwords.words('english'))

# Custom stopwords. Spellings such as "im", "dont" and "cant" have no
# apostrophe because preprocess_text strips non-alphabetic characters
# before the stopword filter runs.
custom_stop_words = {
    'im', 'days', 'well', 'quite', 'look', 'find', 'come', 'year', 'lot', 'part', 'take',
    'href', 'every', 'able', 'though', 'left', 'need', 'new', 'http', 'sure', 'around', 'say',
    'also', 'work', 'today', 'pretty', 'feels', 'going', 'feelings', 'back', 'way', 'always',
    'things', 'feel', 'thats', 'one', 'actually', 'right', 'many', 'thing', 'seen', 'thought',
    'believe', 'didnt', 'want', 'time', 'makes', 'even', 'day', 'go', 'made', 'yeah', 'man',
    'youre', 'ive', 'much', 'good', 'know', 'see', 'cant', 'never', 'got', 'think', 'would',
    'still', 'dont', 'people', 'like', 'really', 'get', 'name', 'i', 'you',
    'make', 'could', 'oh', 'something', 'little', 'bit', 'life', 'feeling', 'home',
    'enough', 'sometimes', 'important',
}

# Combined set that the prediction cells pass as `stop_words`
stop_words_nltk = nltk_stop_words | custom_stop_words

In [5]:
# Load the pre-trained emotion model from "NLP_model.h5"
# (per the original note, it comes from the 'nlp_model.ipynb' notebook)
emotion_model = load_model("NLP_model.h5")

In [6]:
# Load the Whisper "base" model, used later for speech-to-text conversion
whisper_model = whisper.load_model("base")

  checkpoint = torch.load(fp, map_location=device)


In [7]:
def classify_audio_emotion(audio_file_path, whisper_model, emotion_model, tokenizer, stop_words, max_len=178, threshold=0.5):
    """Transcribe an audio file and print the emotion predicted for its text.

    The function prints the transcription, the probability of each emotion
    and the final label. It returns nothing.

    Args:
        audio_file_path: Path of the audio file handed to ``whisper_model.transcribe``.
        whisper_model: Object whose ``transcribe(path)`` returns a mapping with
            a ``"text"`` entry.
        emotion_model: Object whose ``predict(sequences, verbose=0)`` returns one
            row of probabilities per input sequence.
        tokenizer: Object exposing ``texts_to_sequences`` (presumably the
            tokenizer fitted during training -- confirm against the training notebook).
        stop_words: Words removed from the transcription by ``preprocess_text``.
        max_len: Length every token sequence is padded or truncated to; it
            should match the length used during training.
        threshold: Minimum probability the top emotion needs; below it the
            label printed is "unknown".

    Returns:
        None.
    """
    # Speech-to-text using the Whisper library
    transcription = whisper_model.transcribe(audio_file_path)
    text_transcribed = transcription.get("text", "").strip()
    print("======  Transcribed Text (speech-to-text ======")
    print(text_transcribed)

    # Preprocess text (clean and remove stopwords), then tokenize it
    text_processed = preprocess_text(text_transcribed, stop_words)
    text_tokenized = tokenizer.texts_to_sequences([text_processed])

    # Reject missing, empty or invalid token sequences. An empty sequence
    # (silent audio, or only stopwords / unknown words) would otherwise be
    # padded to all zeros and the model would score pure padding.
    if (
        not text_tokenized
        or len(text_tokenized[0]) == 0
        or any(token is None for token in text_tokenized[0])
    ):
        print("Error: Text could not be tokenized")
        return

    # Pad the sequence to match the length used during training
    text_tokenized_seq = pad_sequences(text_tokenized, maxlen=max_len, padding='post', truncating='post')

    # Predict with the pre-trained emotion model; only one text was submitted,
    # so the first row holds its probabilities
    predictions = emotion_model.predict(text_tokenized_seq, verbose=0)
    probabilities = predictions[0]

    # Output index -> emotion label
    emotion_map = {0: 'disgust', 1: 'fear', 2: 'anger', 3: 'joy', 4: 'sadness', 5: 'surprise'}

    # argmax over the row itself (not the flattened 2-D array) so the index
    # is always a valid key of emotion_map
    prediction_i = int(np.argmax(probabilities))
    prediction_max = probabilities[prediction_i]

    # Display predicted probabilities
    print("====== Predicted Probabilities ======")
    for emotion, prob in zip(emotion_map.values(), probabilities):
        print(f"{emotion}: {prob:.6f}")

    # Fall back to "unknown" when even the best emotion is below the threshold
    predicted_emotion = emotion_map[prediction_i] if prediction_max >= threshold else "unknown"

    # Display emotion prediction
    print("======  Prediction ======")
    print("Emotion:", predicted_emotion)

## NLP emotion detection using audio

### Disgust [0]

In [8]:
# Prediction for disgust audio file 1 (expected label: 'disgust', index 0)
classify_audio_emotion(
    audio_file_path="data/audio/disgust.wav",
    whisper_model=whisper_model,
    emotion_model=emotion_model,
    tokenizer=tokenizer,
    stop_words=stop_words_nltk,
    max_len=178,
    threshold=0.5
)



I am disgusted by that awful smell.
disgust: 1.000000
fear: 0.000000
anger: 0.000000
joy: 0.000000
sadness: 0.000000
surprise: 0.000000
Emotion: disgust


In [9]:
# Prediction for disgust audio file 2 (expected label: 'disgust', index 0)
classify_audio_emotion(
    audio_file_path="data/audio/disgust2.wav",
    whisper_model=whisper_model,
    emotion_model=emotion_model,
    tokenizer=tokenizer,
    stop_words=stop_words_nltk,
    max_len=178,
    threshold=0.5
)

I am grossed out by this awful place.
disgust: 1.000000
fear: 0.000000
anger: 0.000000
joy: 0.000000
sadness: 0.000000
surprise: 0.000000
Emotion: disgust


### Fear [1]

In [10]:
# Prediction for fear audio file 1 (expected label: 'fear', index 1)
classify_audio_emotion(
    audio_file_path="data/audio/fear1.wav",
    whisper_model=whisper_model,
    emotion_model=emotion_model,
    tokenizer=tokenizer,
    stop_words=stop_words_nltk,
    max_len=178,
    threshold=0.5
)

I am scared to death by this haunted house.
disgust: 0.000000
fear: 1.000000
anger: 0.000000
joy: 0.000000
sadness: 0.000000
surprise: 0.000000
Emotion: fear


In [11]:
# Prediction for fear audio file 2 (expected label: 'fear', index 1)
classify_audio_emotion(
    audio_file_path="data/audio/fear2.wav",
    whisper_model=whisper_model,
    emotion_model=emotion_model,
    tokenizer=tokenizer,
    stop_words=stop_words_nltk,
    max_len=178,
    threshold=0.5
)

There is a tense feeling when name is around.
disgust: 0.000004
fear: 0.999744
anger: 0.000021
joy: 0.000212
sadness: 0.000019
surprise: 0.000001
Emotion: fear


### Anger [2]

In [12]:
# Prediction for anger audio file 1 (expected label: 'anger', index 2)
classify_audio_emotion(
    audio_file_path="data/audio/angry1.wav",
    whisper_model=whisper_model,
    emotion_model=emotion_model,
    tokenizer=tokenizer,
    stop_words=stop_words_nltk,
    max_len=178,
    threshold=0.5
)

I am outraged about the mad news today.
disgust: 0.000000
fear: 0.000000
anger: 1.000000
joy: 0.000000
sadness: 0.000000
surprise: 0.000000
Emotion: anger


In [13]:
# Prediction for anger audio file 2 (expected label: 'anger', index 2)
classify_audio_emotion(
    audio_file_path="data/audio/angry2.wav",
    whisper_model=whisper_model,
    emotion_model=emotion_model,
    tokenizer=tokenizer,
    stop_words=stop_words_nltk,
    max_len=178,
    threshold=0.5
)

I am upset with all the hostile people on my street.
disgust: 0.000000
fear: 0.000000
anger: 1.000000
joy: 0.000000
sadness: 0.000000
surprise: 0.000000
Emotion: anger


### Joy [3]

In [14]:
# Prediction for joy audio file 1 (expected label: 'joy', index 3)
classify_audio_emotion(
    audio_file_path="data/audio/joy1.wav",
    whisper_model=whisper_model,
    emotion_model=emotion_model,
    tokenizer=tokenizer,
    stop_words=stop_words_nltk,
    max_len=178,
    threshold=0.5
)

I am quite comfortable in my current situation.
disgust: 0.000000
fear: 0.000000
anger: 0.000000
joy: 1.000000
sadness: 0.000000
surprise: 0.000000
Emotion: joy


In [22]:
# Prediction for joy audio file 2 (expected label: 'joy', index 3)
# NOTE(review): this cell's execution count (22) is out of sequence with its
# neighbours; re-run with Restart & Run All to confirm top-to-bottom execution.
classify_audio_emotion(
    audio_file_path="data/audio/joy2.wav",
    whisper_model=whisper_model,
    emotion_model=emotion_model,
    tokenizer=tokenizer,
    stop_words=stop_words_nltk,
    max_len=178,
    threshold=0.5
)

I played a bad game today, but overall I'm happy about the results.
disgust: 0.000000
fear: 0.000000
anger: 0.000000
joy: 1.000000
sadness: 0.000000
surprise: 0.000000
Emotion: joy


### Sadness [4]

In [16]:
# Prediction for sadness audio file 1 (expected label: 'sadness', index 4)
classify_audio_emotion(
    audio_file_path="data/audio/sad1.wav",
    whisper_model=whisper_model,
    emotion_model=emotion_model,
    tokenizer=tokenizer,
    stop_words=stop_words_nltk,
    max_len=178,
    threshold=0.5
)

I feel alone in this town.
disgust: 0.000005
fear: 0.053434
anger: 0.010556
joy: 0.000504
sadness: 0.935500
surprise: 0.000000
Emotion: sadness


In [17]:
# Prediction for sadness audio file 2 (expected label: 'sadness', index 4)
classify_audio_emotion(
    audio_file_path="data/audio/sad2.wav",
    whisper_model=whisper_model,
    emotion_model=emotion_model,
    tokenizer=tokenizer,
    stop_words=stop_words_nltk,
    max_len=178,
    threshold=0.5
)

I am absolutely devastated about today's stock market.
disgust: 0.004804
fear: 0.075597
anger: 0.002202
joy: 0.014012
sadness: 0.900289
surprise: 0.003096
Emotion: sadness


### Surprise [5]

In [18]:
# Prediction for surprise audio file 1 (expected label: 'surprise', index 5)
classify_audio_emotion(
    audio_file_path="data/audio/surprise1.wav",
    whisper_model=whisper_model,
    emotion_model=emotion_model,
    tokenizer=tokenizer,
    stop_words=stop_words_nltk,
    max_len=178,
    threshold=0.5
)

I'm a bit startled by the results.
disgust: 0.000001
fear: 0.000253
anger: 0.000000
joy: 0.000004
sadness: 0.000018
surprise: 0.999724
Emotion: surprise


In [19]:
# Prediction for surprise audio file 2 (expected label: 'surprise', index 5)
classify_audio_emotion(
    audio_file_path="data/audio/surprise2.wav",
    whisper_model=whisper_model,
    emotion_model=emotion_model,
    tokenizer=tokenizer,
    stop_words=stop_words_nltk,
    max_len=178,
    threshold=0.5
)

The moon landing was unbelievable.
disgust: 0.000000
fear: 0.000000
anger: 0.000000
joy: 0.000000
sadness: 0.000001
surprise: 0.999998
Emotion: surprise


### Neutral: Categorized as "unknown"

When no predominant emotion is detected, i.e., every predicted probability is below the threshold, the prediction function should classify the emotion as "unknown".

In [20]:
# Prediction for neutral audio file 1 (expected outcome: "unknown", i.e. no
# emotion probability reaches the 0.5 threshold)
classify_audio_emotion(
    audio_file_path="data/audio/neutral1.wav",
    whisper_model=whisper_model,
    emotion_model=emotion_model,
    tokenizer=tokenizer,
    stop_words=stop_words_nltk,
    max_len=178,
    threshold=0.5
)

it is raining outside.
disgust: 0.013432
fear: 0.225701
anger: 0.250112
joy: 0.217455
sadness: 0.218968
surprise: 0.074332
Emotion: unknown


In [21]:
# Prediction for neutral audio file 2 (expected outcome: "unknown", i.e. no
# emotion probability reaches the 0.5 threshold)
classify_audio_emotion(
    audio_file_path="data/audio/neutral2.wav",
    whisper_model=whisper_model,
    emotion_model=emotion_model,
    tokenizer=tokenizer,
    stop_words=stop_words_nltk,
    max_len=178,
    threshold=0.5
)

The coffee is ready.
disgust: 0.013081
fear: 0.159300
anger: 0.165829
joy: 0.351188
sadness: 0.179508
surprise: 0.131094
Emotion: unknown


## Conclusion

The model correctly detects the emotional state from the short sentences in the audio files. However, in future iterations it may be advisable to include a neutral or unknown class when training the model. Currently the "unknown" outcome relies solely on the threshold, which is applied only to the probabilities of the six trained labels; this can introduce bias when all or most emotions receive a probability of 0 and one or a few receive only a very small probability.