# Set Up & Installation

In [None]:
!pip install librosa torchaudio soundfile omegaconf



In [None]:
import torch
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T

print(torch.__version__)
print(torchaudio.__version__)

import librosa
import matplotlib.pyplot as plt
torch.manual_seed(0)

2.5.1+cu121
2.5.1+cu121


<torch._C.Generator at 0x79aae83d9a50>

# Extracting Attributes

In [None]:
# Load in dataset from Kaggle
import os
import shutil
import kagglehub

content_folder = "/content/audio_emotions_dataset"
os.makedirs(content_folder, exist_ok=True)
path = kagglehub.dataset_download("uldisvalainis/audio-emotions")
shutil.move(path, content_folder)

Downloading from https://www.kaggle.com/api/v1/datasets/download/uldisvalainis/audio-emotions?dataset_version_number=1...


100%|██████████| 1.12G/1.12G [00:15<00:00, 80.1MB/s]

Extracting files...





'/content/audio_emotions_dataset/1'

In [None]:
dataset_path = "/content/audio_emotions_dataset/1"
output_json = "audio_features.json"

# Extract features and transcriptions
def extract_features_and_transcriptions(audio_data):
    features_and_text = []
    skipped_files = []

    # Initialize the ASR model
    model, decoder, utils = torch.hub.load(
        repo_or_dir='snakers4/silero-models',
        model='silero_stt',
        language='en',
        device=torch.device('cpu')
    )
    (read_batch, split_into_batches, read_audio, prepare_model_input) = utils

    for file_path, label in tqdm(audio_data, desc="Processing audio"):
        try:
            # Extract features
            waveform, sr = librosa.load(file_path, sr=16000)
            mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13).mean(axis=1).tolist()

            # Transcribe audio
            wav = read_audio(file_path)
            input_data = prepare_model_input([wav], device=torch.device('cpu'))
            output = model(input_data)
            transcription = decoder(output[0].cpu())

            # Save results
            features_and_text.append({
                "file_name": file_path,
                "label": label,
                "features": mfccs,
                "transcription": transcription
            })
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            skipped_files.append(file_path)

    return features_and_text, skipped_files

def get_audio_data_limited(dataset_path, max_files_per_label):
    """
    Scans dataset directories for audio files and assigns labels based on directory structure.
    Limits the number of files processed per label.
    """
    data = []
    label_file_count = {}

    for root, dirs, files in os.walk(dataset_path):
        label = os.path.basename(root)
        if label not in label_file_count:
            label_file_count[label] = 0

        for file in files:
            if file.endswith(".wav"):
                if label_file_count[label] < max_files_per_label:
                    file_path = os.path.join(root, file)
                    data.append((file_path, label))
                    label_file_count[label] += 1
                else:
                    break
    return data

In [None]:
# Load and Extract
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import librosa
import json

MAX_FILES_PER_LABEL = 100

dataset_path = "/content/audio_emotions_dataset/1"
output_json = "limited_audio_data.json"

audio_data = get_audio_data_limited(dataset_path, max_files_per_label=MAX_FILES_PER_LABEL)

features_and_text, skipped_files = extract_features_and_transcriptions(audio_data)

output_data = {
    "features_and_text": features_and_text,
    "skipped_files": skipped_files
}

with open(output_json, "w") as f:
    json.dump(output_data, f, indent=4)

print(f"Data saved to {output_json}.")
print(f"Processed {len(features_and_text)} files, skipped {len(skipped_files)}.")

Using cache found in /root/.cache/torch/hub/snakers4_silero-models_master
Processing audio: 100%|██████████| 700/700 [35:09<00:00,  3.01s/it]

Data saved to limited_audio_data.json.
Processed 700 files, skipped 0.





# Creating the Model

In [None]:
import json
import numpy as np

json_file_path = 'limited_audio_data.json'

texts = []
numerical_data = []
labels = []

with open(json_file_path, 'r') as f:
    data = json.load(f)

for entry in data['features_and_text']:
    texts.append(entry['transcription'])
    numerical_data.append(entry['features'])
    labels.append(entry["label"])

numerical_data = np.array(numerical_data)

print("Texts:", texts[:5])
print("Numerical data:", numerical_data[:5])
print("Label: ", labels[:5])

Texts: ['say the word gin', 'dogs are sitting by the door', 'say the word lot', 'say the word make', "say the word' choice"]
Numerical data: [[-3.85439880e+02  5.75812721e+01  4.66590071e+00  8.06424999e+00
  -6.90113544e-01 -1.65634060e+01 -7.00231647e+00 -9.07964897e+00
  -1.29140091e+01 -1.13545475e+01 -1.73926449e+01  3.85375679e-01
  -5.52141809e+00]
 [-5.13680298e+02  4.90157127e+01 -1.66620083e+01  9.16706181e+00
  -7.41104031e+00 -8.39771271e+00 -1.50733442e+01 -2.14511418e+00
  -1.14692354e+01  2.80825973e+00 -3.28063893e+00 -5.07822418e+00
  -2.15373492e+00]
 [-3.21289337e+02  4.57948074e+01 -6.40001822e+00 -2.27347183e+01
  -1.69492188e+01 -1.73506284e+00 -1.42188101e+01 -1.00522261e+01
  -4.60576916e+00  1.38825047e+00 -6.79480457e+00  3.85209370e+00
  -4.00710440e+00]
 [-3.91458252e+02  8.19596710e+01  5.75373697e+00  3.49292922e+00
   2.79894185e+00 -1.56160812e+01 -4.93057728e+00 -1.49114742e+01
  -9.84164715e+00 -1.44694710e+01 -1.09908476e+01 -5.05252063e-01
  -5.12331

In [87]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer

# Load the data
def load_data(json_file_path):
    with open(json_file_path, 'r') as f:
        data = json.load(f)

    texts = []
    numerical_data = []
    labels = []

    for entry in data['features_and_text']:
        texts.append(entry['transcription'])
        numerical_data.append(entry['features'])
        labels.append(entry["label"])

    return texts, np.array(numerical_data), labels

# Prepare the data
def prepare_data(texts, numerical_data, labels):
    le = LabelEncoder()
    y = le.fit_transform(labels)

    X_text_train, X_text_test = [], []
    X_num_train, X_num_test = [], []
    y_train, y_test = [], []

    unique_labels = np.unique(y)

    for label in unique_labels:
        label_indices = np.where(y == label)[0]

        train_indices = label_indices[:80]
        test_indices = label_indices[80:]

        X_text_train.extend(np.array(texts)[train_indices])
        X_num_train.extend(numerical_data[train_indices])
        y_train.extend(y[train_indices])

        X_text_test.extend(np.array(texts)[test_indices])
        X_num_test.extend(numerical_data[test_indices])
        y_test.extend(y[test_indices])

    return (np.array(X_text_train), np.array(X_text_test),
            np.array(X_num_train), np.array(X_num_test),
            np.array(y_train), np.array(y_test), le)

# Create a preprocessing pipeline
def create_preprocessing_pipeline():
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    text_transformer = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(stop_words='english', max_features=100))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, [0]),
            ('text', text_transformer, [1])
        ])

    return preprocessor

# Combine text and numerical features
def combine_features(X_num, X_text):
    print("Initializing TF-IDF vectorizer...")
    tfidf = TfidfVectorizer(stop_words='english', max_features=100)
    text_features = tfidf.fit_transform(X_text).toarray()

    print("Combining numerical and text features...")
    combined_features = np.column_stack((X_num, text_features))

    return combined_features

# Train and evaluate models
def train_and_evaluate_models(X_text_train, X_text_test, X_num_train, X_num_test, y_train, y_test, le):
    X_train_combined = combine_features(X_num_train, X_text_train)
    X_test_combined = combine_features(X_num_test, X_text_test)

    models = {
        'Random Forest': RandomForestClassifier(n_estimators=1000, max_depth=30, random_state=42),
        'Neural Network': MLPClassifier(hidden_layer_sizes=(100, 50, 25, 20), max_iter=250, learning_rate_init=0.0005, random_state=42)
    }

    results = {}

    for name, model in models.items():
        print(f"\nTraining {name}...")

        model.fit(X_train_combined, y_train)
        y_pred = model.predict(X_test_combined)

        results[name] = {
            'classification_report': classification_report(y_test, y_pred, target_names=le.classes_),
            'confusion_matrix': confusion_matrix(y_test, y_pred)
        }

        print(f"{name} Results:")
        print("Classification Report:")
        print(results[name]['classification_report'])
        print("\nConfusion Matrix:")
        print(results[name]['confusion_matrix'])

    return results

def main():
    json_file_path = 'limited_audio_data.json'
    texts, numerical_data, labels = load_data(json_file_path)
    X_text_train, X_text_test, X_num_train, X_num_test, y_train, y_test, le = prepare_data(texts, numerical_data, labels)
    results = train_and_evaluate_models( X_text_train, X_text_test, X_num_train, X_num_test, y_train, y_test, le)

if __name__ == '__main__':
    main()

Initializing TF-IDF vectorizer...
Combining numerical and text features...
Initializing TF-IDF vectorizer...
Combining numerical and text features...

Training Random Forest...
Random Forest Results:
Classification Report:
              precision    recall  f1-score   support

       Angry       0.47      0.95      0.63        20
   Disgusted       0.25      0.15      0.19        20
     Fearful       0.57      0.20      0.30        20
       Happy       0.15      0.15      0.15        20
     Neutral       0.35      0.40      0.37        20
         Sad       0.58      0.55      0.56        20
    Suprised       0.53      0.50      0.51        20

    accuracy                           0.41       140
   macro avg       0.41      0.41      0.39       140
weighted avg       0.41      0.41      0.39       140


Confusion Matrix:
[[19  0  0  1  0  0  0]
 [ 2  3  0  5  7  2  1]
 [ 4  1  4  6  2  3  0]
 [ 6  4  1  3  1  0  5]
 [ 1  1  0  5  8  3  2]
 [ 1  2  0  0  5 11  1]
 [ 7  1  2  0  0 

