### Option 1: STT process Using LibriSpeech (Torch Audio)

In [1]:
import torchaudio

# Load a sample audio file from LibriSpeech
datasets = torchaudio.datasets.LIBRISPEECH(".", url="test-clean", download=True)

key = 0
for dataset in datasets:  
    if ( key < 10):
        # Access the first audio sample in the dataset
        waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id = dataset
        
        # Save the audio to a WAV file
        torchaudio.save("librispeech_sample.wav", waveform, sample_rate)
        key += 1
        
        print(f"Utterance: {utterance}")
        print(f"Sample Rate: {sample_rate} Hz")


Utterance: HE HOPED THERE WOULD BE STEW FOR DINNER TURNIPS AND CARROTS AND BRUISED POTATOES AND FAT MUTTON PIECES TO BE LADLED OUT IN THICK PEPPERED FLOUR FATTENED SAUCE
Sample Rate: 16000 Hz
Utterance: STUFF IT INTO YOU HIS BELLY COUNSELLED HIM
Sample Rate: 16000 Hz
Utterance: AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
Sample Rate: 16000 Hz
Utterance: HELLO BERTIE ANY GOOD IN YOUR MIND
Sample Rate: 16000 Hz
Utterance: NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND
Sample Rate: 16000 Hz
Utterance: THE MUSIC CAME NEARER AND HE RECALLED THE WORDS THE WORDS OF SHELLEY'S FRAGMENT UPON THE MOON WANDERING COMPANIONLESS PALE FOR WEARINESS
Sample Rate: 16000 Hz
Utterance: THE DULL LIGHT FELL MORE FAINTLY UPON THE PAGE WHEREON ANOTHER EQUATION BEGAN TO UNFOLD ITSELF SLOWLY AND TO SPREAD ABROAD ITS WIDENING TAIL
Sample Rate: 16000 Hz
Utterance: A COLD LUCID INDIFFERENCE REIGNED IN HIS SOUL
Sample Rate: 16000 Hz
Utterance:

### Option 2: STT Process Using Cloud APIs (Google Speech-to-Text)

In [2]:
import speech_recognition as sr

# Initialize recognizer
recognizer = sr.Recognizer()

# Load the audio file
with sr.AudioFile("librispeech_sample.wav") as source:
    audio_data = recognizer.record(source)

# Convert speech to text
try:
    text = recognizer.recognize_google(audio_data)
    print("Transcribed Text:", text)
except Exception as e:
    print("Error:", str(e))



ValueError: Audio file could not be read as PCM WAV, AIFF/AIFF-C, or Native FLAC; check if file is corrupted or in another format

In [None]:
from pydub import AudioSegment

# Convert audio to PCM WAV format
audio = AudioSegment.from_file("librispeech_sample.wav")
audio = audio.set_frame_rate(16000).set_channels(1)
audio.export("output.wav", format="wav")


### Option 3: STT Process Using OpenAI Whisper (Local Model)

In [3]:
from transformers import pipeline

# Load Whisper model
speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Perform STT
result = speech_to_text("librispeech_sample.wav")
print("Transcribed Text:", result['text'])





Device set to use cpu


ValueError: ffmpeg was not found but is required to load audio files from filename

### Option 3: STT Process Using Wav2Vec 2.0

In [1]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import librosa

# Load model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Load audio and preprocess
audio_input, sample_rate = librosa.load("librispeech_sample.wav", sr=16000)
input_values = processor(audio_input, sampling_rate=16000, return_tensors="pt").input_values

# Perform STT
logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])
print("Transcribed Text:", transcription)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Transcribed Text: I LOVE THEE WITH TE LOVE I SEEMED TO LOSE WITH MY LOST SAINTS I LOVE THEE WITH THE BREATH SMILES TEARS OF ALL MY LIFE AND IF GOD CHOOSE I SHALL BUT LOVE THEE BETTER AFTER DEATH


In [3]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch

# Load model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Process input audio
inputs = processor(audio_input, sampling_rate=16000, return_tensors="pt", padding=True)
logits = model(inputs.input_values).logits

# Decode predictions
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
print(transcription)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['I LOVE THEE WITH TE LOVE I SEEMED TO LOSE WITH MY LOST SAINTS I LOVE THEE WITH THE BREATH SMILES TEARS OF ALL MY LIFE AND IF GOD CHOOSE I SHALL BUT LOVE THEE BETTER AFTER DEATH']


### Feature Extraction Using Librosa Dataset

In [45]:
import librosa
import numpy as np
import pandas as pd
import torchaudio

# Function to extract features
def extract_features(file_path):
    audio, sample_rate = librosa.load(file_path, sr=16000)
    mfccs = np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(y=audio, sr=sample_rate).T, axis=0)
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=audio).T, axis=0)
    spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=audio, sr=sample_rate).T, axis=0)
    return np.hstack([mfccs, chroma, zcr, spectral_contrast])

# Load a sample audio file from LibriSpeech
datasets = torchaudio.datasets.LIBRISPEECH(".", url="test-clean", download=True)

key = 0
features = []
file_path = "librispeech_sample.wav"
for dataset in datasets:  
    # Access the first audio sample in the dataset
    waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id = dataset
    
    # Save the audio to a WAV file
    torchaudio.save(file_path, waveform, sample_rate)
    key += 1

    feature = extract_features(file_path)
    features.append(feature) 

# Convert the feature list to a numpy array for further processing
features_array = np.array(features)
print(f"Extracted features shape: {features_array.shape}")

# Convert np array into data freame 
features_df = pd.DataFrame(features_array)
features_df.to_csv("features.csv", index=False)

Extracted features shape: (2620, 33)


In [38]:
import librosa
import numpy as np
import torchaudio
import pandas as pd

# Function to extract features from audio waveform
def extract_features_from_waveform(waveform, sample_rate):
    audio = waveform.numpy().flatten()  # Convert PyTorch tensor to numpy array
    mfccs = np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(y=audio, sr=sample_rate).T, axis=0)
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=audio).T, axis=0)
    spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=audio, sr=sample_rate).T, axis=0)
    return np.hstack([mfccs, chroma, zcr, spectral_contrast])

# Load a sample audio dataset from LibriSpeech
datasets = torchaudio.datasets.LIBRISPEECH(".", url="test-clean", download=True)

# Extract features from first 10 utterances
features = []
key = 0
for dataset in datasets:
    # Access audio sample and its metadata
    waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id = dataset
    
    # Extract features directly from waveform
    feature = extract_features_from_waveform(waveform, sample_rate)
    features.append(feature)  # Append feature array for each utterance  

# Convert the feature list to a numpy array for further processing
features_array = np.array(features)
print(f"Extracted features shape: {features_array.shape}")

# Convert np array into data freame 
features_df = pd.DataFrame(features_array)
features_df.to_csv("features.csv", index=False)


Extracted features shape: (2620, 33)


In [46]:
features_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,-300.538055,102.676567,-11.533751,34.515648,3.69797,3.740451,-7.779979,-5.030833,-1.161886,-0.948949,...,0.428673,0.454517,0.127497,19.646138,14.211446,16.890984,15.460212,17.686253,19.864507,14.851772
1,-364.247498,91.37735,4.10496,40.97274,3.855652,11.06149,-3.984364,-0.601738,4.03419,-2.416186,...,0.511107,0.555132,0.10782,18.03772,13.369277,16.100759,14.565921,16.484215,19.391356,14.97076
2,-303.546387,130.277008,-17.590935,26.203365,-7.220541,2.721291,-7.697184,-8.856187,1.239684,-1.230602,...,0.466134,0.469798,0.087339,21.492927,15.680788,17.94167,17.006381,17.779363,20.308019,14.854105
3,-313.24292,110.683128,-20.646511,21.18219,-5.369611,-9.688619,-9.589869,-7.087545,-10.202554,-2.014711,...,0.426402,0.498014,0.065348,21.618438,17.300488,20.150371,17.723875,18.778041,19.73462,16.104894
4,-298.868347,104.327721,-21.300791,29.336489,2.966362,-3.672034,-11.293174,-2.402841,-1.28424,0.897291,...,0.480502,0.447972,0.096332,20.960629,16.082848,17.179983,16.033419,19.139572,19.313629,15.433236


### Build the Emotion Recognition Model

##### ML Model (Random Forest)

In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X = features_array  # Extracted features

# Generate dummy target labels
'''
neutral = 1 
sad = 2
happy = 3
'''
labels = [1,2,3]

# Ensure the number of labels matches the number of rows in the features array
num_samples = features_array.shape[0]
y = np.random.choice(labels, num_samples)  # Randomly assign one label to each sample

# Assume `X` contains features and `y` contains emotion labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       0.28      0.41      0.33       143
           2       0.28      0.29      0.29       163
           3       0.41      0.27      0.33       218

    accuracy                           0.31       524
   macro avg       0.32      0.32      0.31       524
weighted avg       0.33      0.31      0.31       524



##### Deep Learning Model (CNN)

In [55]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.preprocessing import LabelEncoder

# Encode labels
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

# Convert X_train and X_test to float32
X_train = X_train.astype(np.float32).reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.astype(np.float32).reshape(X_test.shape[0], X_test.shape[1], 1)

# Re-train the model
model = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])),
    MaxPooling1D(pool_size=2),
    Dropout(0.2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(len(np.unique(y_train)), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.3501 - loss: 3.6320 - val_accuracy: 0.3111 - val_loss: 1.1346
Epoch 2/10
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3279 - loss: 1.2040 - val_accuracy: 0.3263 - val_loss: 1.0996
Epoch 3/10
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.3401 - loss: 1.1101 - val_accuracy: 0.2901 - val_loss: 1.1003
Epoch 4/10
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3592 - loss: 1.1015 - val_accuracy: 0.3321 - val_loss: 1.0991
Epoch 5/10
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3445 - loss: 1.1025 - val_accuracy: 0.4065 - val_loss: 1.0975
Epoch 6/10
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3499 - loss: 1.1037 - val_accuracy: 0.3950 - val_loss: 1.0982
Epoch 7/10
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x21f2340bbe0>

In [56]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


Accuracy: 0.24236641221374045
Confusion Matrix:
 [[ 0 58 54 31]
 [ 0 60 48 55]
 [ 0 92 67 59]
 [ 0  0  0  0]]


In [59]:
import torchaudio

# Load a sample audio file from LibriSpeech
datasets = torchaudio.datasets.LIBRISPEECH(".", url="test-clean", download=True)

key = 0
for dataset in datasets:  
    if ( key < 10):
        # Access the first audio sample in the dataset
        waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id = dataset

         # Predict emotion
        predicted_emotion = rf_model.predict(new_features)
        print("Predicted Emotion:", predicted_emotion)

         # Extract features directly from waveform
        new_features = extract_features_from_waveform(waveform, sample_rate)
        new_features = new_features.reshape(1, -1)  # Reshape for prediction
        key += 1
        
        # Predict emotion
        predicted_emotion = rf_model.predict(new_features)
        print("WaveForm Predicted Emotion:", predicted_emotion)

Predicted Emotion: [2]
WaveForm Predicted Emotion: [2]
Predicted Emotion: [2]
WaveForm Predicted Emotion: [2]
Predicted Emotion: [2]
WaveForm Predicted Emotion: [2]
Predicted Emotion: [2]
WaveForm Predicted Emotion: [2]
Predicted Emotion: [2]
WaveForm Predicted Emotion: [1]
Predicted Emotion: [1]
WaveForm Predicted Emotion: [2]
Predicted Emotion: [2]
WaveForm Predicted Emotion: [3]
Predicted Emotion: [3]
WaveForm Predicted Emotion: [3]
Predicted Emotion: [3]
WaveForm Predicted Emotion: [1]
Predicted Emotion: [1]
WaveForm Predicted Emotion: [1]
