In [1]:
!pip install librosa soundfile numpy pandas scikit-learn matplotlib seaborn tqdm



In [2]:
import os
import numpy as np
import pandas as pd
import soundfile as sf
import librosa
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

In [10]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("kam001/audio-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/audio-dataset


In [11]:
# Map RAVDESS filename codes to emotion labels
emotion_map = {
    '01':'neutral','02':'calm','03':'happy','04':'sad',
    '05':'angry','06':'fearful','07':'disgust','08':'surprise'
}

def load_metadata(base_dir):
    paths, labels = [], []
    for root, _, files in os.walk(base_dir):
        for f in files:
            if not f.endswith('.wav'):
                continue
            parts = f.split('-')
            # modality=03 (audio-only), channel=01 (speech)
            if len(parts) >= 3 and parts[0]=='03' and parts[1]=='01':
                paths.append(os.path.join(root, f))
                labels.append(emotion_map.get(parts[2], 'unknown'))
    return pd.DataFrame({'path': paths, 'emotion': labels})

data_dir = '/kaggle/input/audio-dataset'
df = load_metadata(data_dir)

print(f"Loaded {len(df)} files")
print(df.emotion.value_counts())


Loaded 1440 files
emotion
surprise    192
disgust     192
fearful     192
sad         192
happy       192
calm        192
angry       192
neutral      96
Name: count, dtype: int64


In [12]:
# extract_feature.py

import librosa
import numpy as np
import soundfile as sf

def extract_feature(file_path, n_mels=128, sr=16000, duration=3):
    y, orig_sr = sf.read(file_path, dtype='float32')

    if y.ndim > 1:
        y = np.mean(y, axis=1)

    if orig_sr != sr:
        y = librosa.resample(y, orig_sr=orig_sr, target_sr=sr)

    # Pad or trim to fixed duration
    target_len = sr * duration
    if len(y) < target_len:
        y = np.pad(y, (0, target_len - len(y)))
    else:
        y = y[:target_len]

    # Log-mel spectrogram
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    log_mel = librosa.power_to_db(mel_spec, ref=np.max)
    return log_mel  # shape: (128, T)



In [13]:
from tqdm import tqdm
import numpy as np
import pandas as pd



feature_list = []
for fp in tqdm(df.path, desc="Extracting features"):
    feature_list.append(extract_feature(fp))  # Each is (128, T)

# Resize all to fixed shape (e.g. 128x128)
X = np.array([librosa.util.fix_length(f, size=128, axis=1) for f in feature_list])  # Shape: (N, 128, 128)
y = df.emotion.values

print("Feature matrix shape:", X.shape)  # e.g. (1440, 128, 128)


Extracting features: 100%|██████████| 1440/1440 [00:47<00:00, 30.29it/s]


Feature matrix shape: (1440, 128, 128)


In [14]:
# After this...
feature_list = []
for fp in tqdm(df.path, desc="Extracting features"):
    feature_list.append(extract_feature(fp))  # Returns (128, T)

# Resize all to (128, 128)
X = np.array([librosa.util.fix_length(f, size=128, axis=1) for f in feature_list])  # (N, 128, 128)
y = df.emotion.values  # e.g., encoded labels



Extracting features: 100%|██████████| 1440/1440 [00:16<00:00, 85.61it/s]


In [15]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Save label encoder for inference
import joblib
joblib.dump(le, "label_encoder.pkl")


['label_encoder.pkl']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)


In [17]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import torch

class SpectrogramDataset(Dataset):
    def __init__(self, X, y, transform=None):
        self.X = X
        self.y = y
        self.transform = transform

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        img = self.X[idx]
        img = Image.fromarray((img - img.min()) / (img.max() - img.min()) * 255).convert("L")
        if self.transform:
            img = self.transform(img)
        return img, self.y[idx]

transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

train_ds = SpectrogramDataset(X_train, y_train, transform)
test_ds  = SpectrogramDataset(X_test, y_test, transform)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=32)


In [18]:
import torch.nn as nn
from torchvision import models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = models.resnet18(pretrained=False)
model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)  # 1 channel
model.fc = nn.Linear(model.fc.in_features, len(le.classes_))
model = model.to(device)




In [19]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(100):
    model.train()
    running_loss = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1}: Loss = {running_loss:.10f}")


Epoch 1: Loss = 65.4746266603
Epoch 2: Loss = 49.8251084089
Epoch 3: Loss = 42.2513065338
Epoch 4: Loss = 37.3216333389
Epoch 5: Loss = 31.5409514904
Epoch 6: Loss = 25.7720549703
Epoch 7: Loss = 19.0384627283
Epoch 8: Loss = 16.7856324613
Epoch 9: Loss = 10.9953543842
Epoch 10: Loss = 13.4594138116
Epoch 11: Loss = 10.0810331777
Epoch 12: Loss = 8.4653440863
Epoch 13: Loss = 5.2155143358
Epoch 14: Loss = 2.6345114764
Epoch 15: Loss = 3.3167638732
Epoch 16: Loss = 4.9766954705
Epoch 17: Loss = 4.0502650440
Epoch 18: Loss = 4.8093749657
Epoch 19: Loss = 5.0013228580
Epoch 20: Loss = 3.6546622915
Epoch 21: Loss = 4.5311867953
Epoch 22: Loss = 3.6865304522
Epoch 23: Loss = 2.8422668120
Epoch 24: Loss = 1.3086641373
Epoch 25: Loss = 1.2073666952
Epoch 26: Loss = 0.8179105308
Epoch 27: Loss = 0.3424441983
Epoch 28: Loss = 0.1878867992
Epoch 29: Loss = 0.0986985708
Epoch 30: Loss = 0.0504640642
Epoch 31: Loss = 0.0289399986
Epoch 32: Loss = 0.0317973650
Epoch 33: Loss = 0.0230790770
Epoch 34

In [20]:
import torch
import os

# Convert to float16
model_fp16 = model.half()  # Convert model to float16

# Save
torch.save(model_fp16.state_dict(), "resnet18_emotion_fp16.pt")

# Check file size
size_mb = os.path.getsize("resnet18_emotion_fp16.pt") / (1024 ** 2)
print(f"Float16 model saved! Size: {size_mb:.2f} MB")



Float16 model saved! Size: 21.37 MB


In [21]:
from sklearn.metrics import classification_report

# Convert model to float16
model = model.half()
model.eval()

all_preds, all_labels = [], []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device).half()  # convert input to float16
        outputs = model(inputs)
        preds = outputs.argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

print(classification_report(all_labels, all_preds, target_names=le.classes_))



              precision    recall  f1-score   support

       angry       0.85      0.89      0.87        38
        calm       0.88      0.95      0.91        38
     disgust       0.92      0.92      0.92        38
     fearful       0.71      0.90      0.80        39
       happy       0.61      0.56      0.59        39
     neutral       0.73      0.58      0.65        19
         sad       0.76      0.74      0.75        38
    surprise       0.94      0.77      0.85        39

    accuracy                           0.80       288
   macro avg       0.80      0.79      0.79       288
weighted avg       0.80      0.80      0.80       288

