In [None]:
# Kaggle default setup code

# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import pandas as pd

# Mapping from emotion ID to label name and encoded value
ravdess_emotion_map = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fear',
    '07': 'disgust',
    '08': 'surprise'
}

# Selecting the required emotions
selected_emotions = ['angry', 'fear', 'happy', 'neutral', 'sad']
final_emotion_map = {emotion: idx for idx, emotion in enumerate(selected_emotions)}

In [None]:
audio_paths = []
labels = []

root_dir = "/kaggle/input"

for root, _, files in os.walk(root_dir):
    for file in files:
        if file.endswith(".wav"):
            emotion_id = file.split("-")[2]
            emotion = ravdess_emotion_map.get(emotion_id)
            if emotion in final_emotion_map:
                full_path = os.path.join(root, file)
                audio_paths.append(full_path)
                labels.append(final_emotion_map[emotion])


In [None]:
from sklearn.model_selection import train_test_split

df = pd.DataFrame({"audio_path": audio_paths, "label": labels})
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

print("Train samples:", len(train_df))
print("Test samples:", len(test_df))

Train samples: 1382
Test samples: 346


In [None]:
df.head()

Unnamed: 0,audio_path,label
0,/kaggle/input/ravdess-emotional-speech-audio/A...,0
1,/kaggle/input/ravdess-emotional-speech-audio/a...,0
2,/kaggle/input/ravdess-emotional-speech-audio/A...,2
3,/kaggle/input/ravdess-emotional-speech-audio/a...,4
4,/kaggle/input/ravdess-emotional-speech-audio/a...,0


In [None]:
import random
import IPython.display as ipd

# Picking a random sample from the train set
random_idx = random.randint(0, len(train_df) - 1)
sample_path = train_df.iloc[random_idx]['audio_path']
label_id = train_df.iloc[random_idx]['label']

# Reverse lookup of label name
label_name = [k for k, v in final_emotion_map.items() if v == label_id][0]

print(f"Playing audio sample {random_idx}:")
print(f"Path: {sample_path}")
print(f"Label: {label_name}")

# Playing the audio
ipd.Audio(sample_path)

Playing audio sample 550:
Path: /kaggle/input/ravdess-emotional-speech-audio/Actor_12/03-01-06-02-02-01-12.wav
Label: fear


In [None]:
import warnings
warnings.filterwarnings("ignore")

from transformers import Wav2Vec2Processor

model_path = "/kaggle/input/tess-pretrained/kaggle/working/tess-pretrained-model"

processor = Wav2Vec2Processor.from_pretrained(model_path)

2025-08-05 16:22:07.970249: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754410928.341045      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754410928.441269      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
import torch
import librosa
import numpy as np

class RAVDESSDataset(torch.utils.data.Dataset):
    def __init__(self, df, processor, max_length=16000*4):
        self.df = df
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        path = self.df.iloc[idx]['audio_path']
        label = self.df.iloc[idx]['label']
        audio, sr = librosa.load(path, sr=16000)

        if len(audio) > self.max_length:
            audio = audio[:self.max_length]
        else:
            audio = np.pad(audio, (0, self.max_length - len(audio)))

        inputs = self.processor(audio, sampling_rate=16000, return_tensors="pt", padding=True, truncation=False)

        return {
            'input_values': inputs.input_values.squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_dataset = RAVDESSDataset(train_df, processor)
test_dataset = RAVDESSDataset(test_df, processor)

In [None]:
from transformers import Wav2Vec2ForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model = Wav2Vec2ForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(final_emotion_map),
    ignore_mismatched_sizes=True,
    problem_type="single_label_classification"
)

training_args = TrainingArguments(
    output_dir="./wav2vec2-ravdess-results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=2,
    report_to=[]
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=processor,
    compute_metrics=compute_metrics
)

trainer.train()


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/tess-pretrained/kaggle/working/tess-pretrained-model and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([7]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([7, 256]) in the checkpoint and torch.Size([5, 256]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.003979,0.713873,0.715266,0.713873,0.699747
2,No log,0.678757,0.846821,0.858148,0.846821,0.84432
3,No log,0.46811,0.901734,0.904132,0.901734,0.900359
4,No log,0.2711,0.965318,0.965823,0.965318,0.965276
5,No log,0.198957,0.979769,0.980231,0.979769,0.979762
6,0.598500,0.168946,0.976879,0.977232,0.976879,0.976893




TrainOutput(global_step=522, training_loss=0.5796737109107533, metrics={'train_runtime': 448.5515, 'train_samples_per_second': 18.486, 'train_steps_per_second': 1.164, 'total_flos': 3.01122542366208e+17, 'train_loss': 0.5796737109107533, 'epoch': 6.0})

In [None]:
model.save_pretrained("/kaggle/working/ravdess-pretrained-model")
processor.save_pretrained("/kaggle/working/ravdess-pretrained-model")

print("Model and processor saved successfully!")

Model and processor saved successfully!


In [None]:
import shutil

# Zipping the saved model directory
shutil.make_archive("/kaggle/working/ravdess-pretrained-model", 'zip', "/kaggle/working/ravdess-pretrained-model")

print("Model and processor zipped successfully!")


Model and processor zipped successfully!
