In [None]:
# Kaggle default setup code

# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os, random, math, json, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import librosa
import torchaudio
import torch
from torch import nn
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from transformers import (
    Wav2Vec2Processor,
    Wav2Vec2ForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# --- Reproducibility: seed every RNG this notebook draws from ---
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when one is visible, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# --- Data location and audio geometry ---
DATA_ROOT = "/kaggle/input/nor-smart-speech"
SR = 16000               # sample rate (Hz) used when loading audio
MAX_SEC = 4              # clip length in seconds
MAX_LEN = MAX_SEC * SR   # clip length in samples

# --- Label vocabulary: emotion name <-> integer class id ---
emotion_map = {
    name: class_id
    for class_id, name in enumerate(
        ('angry', 'disgust', 'fear', 'happy', 'neutral', 'sad')
    )
}
inv_map = {class_id: name for name, class_id in emotion_map.items()}


2025-08-11 16:54:04.759392: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754931245.133119      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754931245.236460      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Index every labelled .wav file under DATA_ROOT.
#
# The label comes from the name of the folder that directly contains the file
# (case-insensitive); folders that are not keys of `emotion_map` are skipped.
# Files are de-duplicated by lower-cased file name: the first occurrence wins.
#
# os.walk yields directories and files in arbitrary filesystem order, so both
# are sorted here. Without that, the pre-shuffle row order (and which duplicate
# is kept) could differ between runs, and the seeded shuffle/split below would
# not be reproducible.
audio_paths, labels = [], []
seen = set()

for root, dirs, files in os.walk(DATA_ROOT):
    dirs.sort()  # in-place sort steers os.walk's descent order

    label_folder = os.path.basename(root).lower()
    if label_folder not in emotion_map:
        continue

    for f in sorted(files):
        fname = f.lower()
        if not fname.endswith(".wav") or fname in seen:
            continue
        seen.add(fname)
        audio_paths.append(os.path.join(root, f))
        labels.append(emotion_map[label_folder])

# Fail here with a clear message instead of deep inside train_test_split.
if not audio_paths:
    raise FileNotFoundError(f"No labelled .wav files found under {DATA_ROOT!r}")

df = pd.DataFrame({"path": audio_paths, "label": labels})
# Seeded shuffle so rows are not grouped by folder.
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

print("Total files found:", len(df))
print(df["label"].map(inv_map).value_counts().sort_index())
df.head()

Total files found: 19487
label
angry      3376
disgust    2822
fear       2901
happy      3681
neutral    3270
sad        3437
Name: count, dtype: int64


Unnamed: 0,path,label
0,/kaggle/input/nor-smart-speech/disgust/03-02-0...,1
1,/kaggle/input/nor-smart-speech/angry/03-01-05-...,0
2,/kaggle/input/nor-smart-speech/happy/03-01-03-...,3
3,/kaggle/input/nor-smart-speech/sad/1040_IWL_SA...,5
4,/kaggle/input/nor-smart-speech/happy/1049_ITH_...,3


In [None]:
# Stratified 80/10/10 split: hold out 20% first, then halve the hold-out
# into validation and test. Both steps stratify on the label and reuse SEED.
train_df, temp_df = train_test_split(
    df,
    test_size=0.20,
    stratify=df["label"],
    random_state=SEED,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df["label"],
    random_state=SEED,
)

split_sizes = (len(train_df), len(val_df), len(test_df))
print("Train/Val/Test sizes:", *split_sizes)

train_dist = train_df["label"].map(inv_map).value_counts().sort_index()
print("Train dist:\n", train_dist)

Train/Val/Test sizes: 15589 1949 1949
Train dist:
 label
angry      2701
disgust    2257
fear       2321
happy      2945
neutral    2616
sad        2749
Name: count, dtype: int64


In [None]:
# ---------------------------
# Robust audio augmentation
# ---------------------------
import numpy as np
import random
import librosa

# Audio geometry used as defaults by the augmentation helpers below.
SR = 16000               # sample rate in Hz
MAX_SEC = 4              # clip duration in seconds
MAX_LEN = MAX_SEC * SR   # clip duration in samples

def _pad_or_trim(y, target_len=MAX_LEN):
    """Return `y` as a float32 array of exactly `target_len` samples.

    `None` or an empty input yields all zeros. Longer inputs keep their
    first `target_len` samples; shorter inputs are zero-padded on the right.
    """
    if y is None or len(y) == 0:
        return np.zeros(target_len, dtype=np.float32)

    missing = target_len - len(y)
    if missing <= 0:
        return y[:target_len].astype(np.float32)
    return np.pad(y.astype(np.float32), (0, missing))

def _sanitize(y):
    if not np.isfinite(y).all():
        y = np.nan_to_num(y, nan=0.0, posinf=0.0, neginf=0.0)
    # soft clip to [-1, 1]
    y = np.clip(y, -1.0, 1.0)
    return y.astype(np.float32)

def add_gaussian_noise(y, snr_db=20.0):
    """Add zero-mean Gaussian noise at roughly `snr_db` dB below the signal RMS.

    The noise standard deviation is the signal RMS divided by the amplitude
    ratio 10**(snr_db / 20). Best-effort: on any error the input is returned
    unchanged.
    """
    try:
        signal_rms = np.sqrt(np.mean(y**2) + 1e-9)  # epsilon keeps silent clips > 0
        amp_ratio = 10**(snr_db/20.0)
        sigma = signal_rms / max(amp_ratio, 1e-6)
        noisy = y + np.random.normal(0.0, sigma, size=y.shape).astype(np.float32)
        return _sanitize(noisy)
    except Exception:
        return y

def pitch_shift_aug(y, sr=SR, n_steps=0.0):
    """Pitch-shift `y` by `n_steps` via librosa, then pad/trim and sanitize.

    Best-effort: on any error the input is returned unchanged.
    """
    try:
        shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)
        fitted = _pad_or_trim(shifted)
        return _sanitize(fitted)
    except Exception:
        return y

def time_stretch_aug(y, rate=1.0):
    """Time-stretch `y` by `rate`, then pad/trim and sanitize the result.

    Args:
        y: waveform to stretch.
        rate: stretch factor; coerced to float and clamped to [0.8, 1.25].

    Returns:
        The stretched waveform after `_pad_or_trim` and `_sanitize`, or the
        unmodified input if anything fails (best-effort augmentation).
    """
    try:
        rate = float(np.clip(float(rate), 0.8, 1.25))  # safety bounds
        # `rate` must be passed by keyword: recent librosa releases make it
        # keyword-only, and a positional call raises TypeError that the
        # handler below would swallow, silently disabling this augmentation.
        y_out = librosa.effects.time_stretch(y, rate=rate)
        return _sanitize(_pad_or_trim(y_out))
    except Exception:
        return y

def time_shift_aug(y, max_frac=0.1):
    """Circularly shift `y` by a random offset of up to `max_frac` of its length.

    `max_frac` is clamped to [0, 0.49]. The shift uses `np.roll`, so samples
    pushed off one end wrap around to the other. Best-effort: on any error
    the input is returned unchanged.
    """
    try:
        bound = float(np.clip(max_frac, 0.0, 0.49))
        fraction = random.uniform(-bound, bound)
        offset = int(fraction * len(y))
        rolled = np.roll(y, offset)
        return _sanitize(rolled)
    except Exception:
        return y

def random_gain_aug(y, low=0.9, high=1.1):
    """Scale `y` by a gain drawn uniformly from [low, high], then sanitize.

    Best-effort: on any error the input is returned unchanged.
    """
    try:
        gain = float(random.uniform(low, high))
        scaled = y * gain
        return _sanitize(scaled)
    except Exception:
        return y

def bandpass_filt_aug(y, sr=SR, low=100.0, high=7000.0):
    """Approximate a band-pass filter by zeroing STFT bins outside [low, high] Hz.

    The signal is transformed with a 512-point STFT (hop 160, window 400),
    bins whose centre frequency falls outside the band are set to zero, and
    the result is inverted back to the original length. Best-effort: on any
    error the input is returned unchanged.
    """
    n_fft, hop, win = 512, 160, 400
    try:
        spec = librosa.stft(y, n_fft=n_fft, hop_length=hop, win_length=win)
        bin_hz = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
        keep = (bin_hz >= low) & (bin_hz <= high)
        spec[~keep, :] = 0
        restored = librosa.istft(spec, hop_length=hop, win_length=win, length=len(y))
        return _sanitize(restored)
    except Exception:
        return y

def augment_waveform(
    y,
    sr=SR,
    probs=dict(noise=0.5, pitch=0.3, stretch=0.3, shift=0.4, gain=0.5, bandpass=0.2),
    ranges=dict(snr=(15,25), pitch=(-1.5,1.5), stretch=(0.9,1.1), shift=0.08, gain=(0.9,1.1), bp=(100,7000))
):
    """
    Apply a random chain of augmentations to one waveform.

    The input is first padded/trimmed and sanitized. Six stages then run in a
    fixed order (noise, pitch, stretch, shift, gain, bandpass); each stage
    fires independently with probability `probs[<stage>]` (a missing key
    means "never") and draws its parameter from `ranges`. The result is
    sanitized and padded/trimmed once more before being returned.

    `probs` and `ranges` are only read, never modified.
    """
    def _fires(stage):
        # One uniform draw per stage, compared against that stage's probability.
        return random.random() < probs.get(stage, 0)

    wave = _sanitize(_pad_or_trim(y))

    # additive Gaussian noise at a random SNR
    if _fires("noise"):
        lo_db, hi_db = ranges["snr"]
        wave = add_gaussian_noise(wave, snr_db=random.uniform(lo_db, hi_db))

    # pitch shift by a random number of steps
    if _fires("pitch"):
        lo_steps, hi_steps = ranges["pitch"]
        wave = pitch_shift_aug(wave, sr=sr, n_steps=random.uniform(lo_steps, hi_steps))

    # time stretch by a random rate
    if _fires("stretch"):
        lo_rate, hi_rate = ranges["stretch"]
        wave = time_stretch_aug(wave, rate=random.uniform(lo_rate, hi_rate))

    # circular time shift
    if _fires("shift"):
        wave = time_shift_aug(wave, max_frac=ranges["shift"])

    # random gain
    if _fires("gain"):
        lo_gain, hi_gain = ranges["gain"]
        wave = random_gain_aug(wave, low=lo_gain, high=hi_gain)

    # STFT-mask band-pass
    if _fires("bandpass"):
        lo_hz, hi_hz = ranges["bp"]
        wave = bandpass_filt_aug(wave, sr=sr, low=lo_hz, high=hi_hz)

    # final safety pass: valid values, exact length
    cleaned = _sanitize(wave)
    return _pad_or_trim(cleaned)


In [None]:
class SERDataset(Dataset):
    """Map-style dataset yielding fixed-length audio examples for classification.

    Each item is a dict with:
      * "input_values": the processor's `input_values` for the clip, with the
        leading batch dimension squeezed out;
      * "labels": the integer class id as a `torch.long` scalar tensor.

    Args:
        df: DataFrame with a "path" column (audio file) and a "label" column
            (integer class id). The index is reset internally.
        processor: callable feature extractor / processor invoked as
            `processor(y, sampling_rate=..., return_tensors="pt", ...)`.
        is_train: when True, `augment_waveform` is applied to every clip.
        max_len: clip length in samples that every item is trimmed/padded to.
        sr: sample rate the audio is loaded at.
    """

    def __init__(self, df, processor, is_train=False, max_len=MAX_LEN, sr=SR):
        self.df = df.reset_index(drop=True)
        self.processor = processor
        self.is_train = is_train
        self.max_len = max_len
        self.sr = sr

    def __len__(self):
        return len(self.df)

    def _fit_length(self, y):
        """Trim `y` to `self.max_len` samples, or right-pad it with zeros."""
        if len(y) >= self.max_len:
            return y[:self.max_len]
        return np.pad(y, (0, self.max_len - len(y)))

    def __getitem__(self, idx):
        path = self.df.loc[idx, "path"]
        y, _ = librosa.load(path, sr=self.sr, mono=True)

        # pad/trim up front for stability
        y = self._fit_length(y)

        if self.is_train:
            # augment_waveform sizes its output with its own default length,
            # not self.max_len, so re-fit afterwards: train and eval items
            # must have the same length even when max_len is customised.
            y = self._fit_length(augment_waveform(y, sr=self.sr))

        inputs = self.processor(
            y, sampling_rate=self.sr, return_tensors="pt", padding=True, truncation=False
        )
        label = int(self.df.loc[idx, "label"])
        return {
            "input_values": inputs.input_values.squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }



In [None]:
# Checkpoint to continue fine-tuning from.
MODEL_PATH = "/kaggle/input/ravdess-wav2vec2"

processor = Wav2Vec2Processor.from_pretrained(MODEL_PATH)

# The class count and label names are taken from `emotion_map` rather than a
# literal, so the model config (and everything saved from it later) carries
# the real emotion names instead of generic LABEL_<i> placeholders.
# `ignore_mismatched_sizes=True` lets the load proceed when the checkpoint's
# classifier head has a different number of outputs; the mismatched head is
# then freshly initialised and has to be trained.
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=len(emotion_map),
    id2label=inv_map,
    label2id=emotion_map,
    problem_type="single_label_classification",
    ignore_mismatched_sizes=True
).to(DEVICE)


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/ravdess-wav2vec2 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 256]) in the checkpoint and torch.Size([6, 256]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
train_ds = SERDataset(train_df, processor, is_train=True)
val_ds   = SERDataset(val_df,   processor, is_train=False)
test_ds  = SERDataset(test_df,  processor, is_train=False)

In [None]:
# Inverse-frequency class weights, rescaled so they sum to the number of
# classes (i.e. average 1.0). Counts are ordered by class id via sort_index().
counts = train_df["label"].value_counts().sort_index().values.astype(np.float32)
inv_freq = 1.0 / (counts + 1e-6)
class_weights = inv_freq / inv_freq.sum() * len(counts)
class_weights_t = torch.tensor(class_weights, dtype=torch.float, device=DEVICE)

weights_by_name = {inv_map[i]: float(w) for i, w in enumerate(class_weights)}
print("Class weights:", weights_by_name)

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by `class_weights_t`."""

    # **kwargs absorbs any extra arguments newer Trainer versions may pass
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and compute the class-weighted loss.

        Returns `loss`, or `(loss, outputs)` when `return_outputs` is true.
        """
        targets = inputs["labels"]
        if targets.dtype != torch.long:
            targets = targets.long()

        # Forward everything except the labels; the loss is computed below.
        features = {name: tensor for name, tensor in inputs.items() if name != "labels"}
        outputs = model(**features)
        logits = outputs.get("logits")

        device = logits.device
        criterion = nn.CrossEntropyLoss(weight=class_weights_t.to(device))
        loss = criterion(logits, targets.to(device))

        if return_outputs:
            return loss, outputs
        return loss

Class weights: {'angry': 0.9534974098205566, 'disgust': 1.1410706043243408, 'fear': 1.1096065044403076, 'happy': 0.8744979500770569, 'neutral': 0.9844788312911987, 'sad': 0.936848521232605}


In [None]:
def compute_metrics(eval_pred):
    """Turn `(logits, labels)` into accuracy and weighted precision/recall/F1.

    Predictions are the argmax over axis 1 of the logits. Precision, recall
    and F1 use `average="weighted"` with `zero_division=0`.
    """
    y_score, y_true = eval_pred
    y_pred = np.argmax(y_score, axis=1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best validation F1 when training finishes.
args = TrainingArguments(
    output_dir="./nor-wav2vec2-results",
    eval_strategy="epoch",
    save_strategy="epoch",
    # Without a limit every epoch's checkpoint stays on disk. With
    # load_best_model_at_end the best checkpoint is retained in addition to
    # the most recent one, so nothing needed at the end is deleted.
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_strategy="steps",
    logging_steps=100,
    report_to=[],
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    seed=SEED,  # tie the Trainer's seeding to the notebook-wide constant
)

# NOTE(review): newer transformers releases prefer `processing_class=` over
# `tokenizer=`; confirm against the installed version before switching.
trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=processor,
    compute_metrics=compute_metrics
)

trainer.train()
val_metrics = trainer.evaluate()
print("Validation:", val_metrics)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.003,0.860664,0.720883,0.730815,0.720883,0.722325
2,0.8403,0.730775,0.751154,0.764732,0.751154,0.750914
3,0.6934,0.641532,0.786557,0.790927,0.786557,0.78605
4,0.6206,0.601095,0.79528,0.799014,0.79528,0.79519
5,0.5324,0.55626,0.813238,0.814853,0.813238,0.812697
6,0.4917,0.637334,0.79374,0.799951,0.79374,0.793972
7,0.4192,0.594414,0.816316,0.818065,0.816316,0.816366
8,0.385,0.598723,0.806567,0.810481,0.806567,0.806573
9,0.3129,0.591484,0.816829,0.817057,0.816829,0.816687
10,0.2713,0.599801,0.82196,0.822787,0.82196,0.822106


Validation: {'eval_loss': 0.5998007655143738, 'eval_accuracy': 0.8219599794766547, 'eval_precision': 0.822787228814219, 'eval_recall': 0.8219599794766547, 'eval_f1': 0.8221063763739254, 'eval_runtime': 28.6098, 'eval_samples_per_second': 68.124, 'eval_steps_per_second': 4.264, 'epoch': 10.0}


In [None]:
# Evaluate on the held-out test split.
test_metrics = trainer.evaluate(test_ds)
print("Test:", test_metrics)

# Persist model and processor side by side so they can be reloaded together.
save_dir = "./nor-pretrained-wav2vec2"
for artifact in (model, processor):
    artifact.save_pretrained(save_dir)
print("Saved to", save_dir)


Test: {'eval_loss': 0.6594431400299072, 'eval_accuracy': 0.8086198050282196, 'eval_precision': 0.8110265307552449, 'eval_recall': 0.8086198050282196, 'eval_f1': 0.8090447258311045, 'eval_runtime': 59.3287, 'eval_samples_per_second': 32.851, 'eval_steps_per_second': 2.056, 'epoch': 10.0}
Saved to ./nor-pretrained-wav2vec2


In [None]:
import random
# Spot-check: classify one randomly chosen test clip and compare with its label.
# Put the model in inference mode explicitly instead of relying on an earlier
# cell (trainer.evaluate) having left it that way.
model.eval()

idx = random.randrange(len(test_ds))
sample = test_ds[idx]
with torch.no_grad():
    # add a batch dimension and move the clip to the model's device
    logits = model(sample["input_values"].unsqueeze(0).to(DEVICE)).logits
pred = logits.argmax(dim=-1).item()
print("Pred:", inv_map[pred], "| True:", inv_map[int(sample["labels"])])


Pred: fear | True: fear


In [None]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import shutil

# Write model and processor into one folder for export.
export_dir = "wav2vec2_nor_model"
model.save_pretrained(export_dir)
processor.save_pretrained(export_dir)


[]

In [None]:
# Bundle the exported model folder into wav2vec2_nor_model.zip for download.
shutil.make_archive(
    base_name="wav2vec2_nor_model",
    format="zip",
    root_dir="wav2vec2_nor_model",
)


'/kaggle/working/wav2vec2_nor_model.zip'