In [1]:
!pip install transformers --q
!pip install datasets --q
!pip install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio===0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html --q
!pip install --upgrade --force-reinstall gdown --q
!pip install wandb --q
# !pip install jiwer --q
# !pip install librosa --q

In [79]:
# Enable the ipywidgets notebook extension (jupyter-js-widgets).
# NOTE(review): presumably needed so widget-based progress bars render in this
# environment — confirm it is still required.
!jupyter nbextension enable --py widgetsnbextension

Config option `kernel_spec_manager_class` not recognized by `EnableNBExtensionApp`.
404 Client Error: Not Found for url: http://metadata/computeMetadata/v1/instance/attributes/use-collaborative
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


# Data Exploration

The `download_dataset` script has already downloaded the audio files and split them into train and test sets.

In [62]:
import pandas as pd
import numpy as np
import torchaudio
import IPython.display as ipd

In [6]:
# Metadata for the training split, stored as a tab-separated file.
train = pd.read_csv('data/train.csv', sep='\t')
train.head(5)

Unnamed: 0,name,path,emotion
0,h13 (3),data/aesdd/happiness/h13 (3).wav,happiness
1,a05 (1),data/aesdd/anger/a05 (1).wav,anger
2,d03 (2),data/aesdd/disgust/d03 (2).wav,disgust
3,s11 (6),data/aesdd/sadness/s11 (6).wav,sadness
4,s01 (6),data/aesdd/sadness/s01 (6).wav,sadness


In [56]:
# Number of training clips per emotion.
train.groupby("emotion")[["path"]].count()

Unnamed: 0_level_0,path
emotion,Unnamed: 1_level_1
anger,97
disgust,98
fear,96
happiness,95
sadness,97


In [60]:
# Metadata for the test split, stored as a tab-separated file.
test = pd.read_csv('data/test.csv', sep='\t')
test.head(5)

Unnamed: 0,name,path,emotion
0,d09 (4),data/aesdd/disgust/d09 (4).wav,disgust
1,s16 (3),data/aesdd/sadness/s16 (3).wav,sadness
2,a16 (5),data/aesdd/anger/a16 (5).wav,anger
3,s15 (1),data/aesdd/sadness/s15 (1).wav,sadness
4,h17 (6),data/aesdd/happiness/h17 (6).wav,happiness


In [61]:
# Number of test clips per emotion.
test.groupby("emotion")[["path"]].count()

Unnamed: 0_level_0,path
emotion,Unnamed: 1_level_1
anger,24
disgust,24
fear,24
happiness,24
sadness,25


Let us listen to a random sample from the training dataset

In [67]:
# Pick one training row at random, resample its audio and play it back.
PREVIEW_SAMPLE_RATE = 16_000  # playback rate in Hz

idx = np.random.randint(0, len(train))
sample = train.iloc[idx]
path, label = sample['path'], sample['emotion']
print(f'Id: {idx} Emotion: {label}')

signal, sr = torchaudio.load(path)
resampler = torchaudio.transforms.Resample(sr, PREVIEW_SAMPLE_RATE)
# `.numpy()` already yields an ndarray, so it can be handed to the player as is.
signal = resampler(signal).squeeze().numpy()
ipd.Audio(data=signal, rate=PREVIEW_SAMPLE_RATE)

Id: 18 Emotion: happiness


# Prepare the data for training

In [69]:
# Loading the created dataset using datasets
from datasets import load_dataset, load_metric


# Map split names to the tab-separated metadata files; the test file is
# exposed under the "validation" key.
data_files = dict(train="data/train.csv", validation="data/test.csv")

dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
train_dataset, eval_dataset = dataset["train"], dataset["validation"]

# Show the schema and row count of both splits.
print(train_dataset, eval_dataset, sep="\n")

Using custom data configuration default-ece915fc643c1df9


Downloading and preparing dataset csv/default to /home/jupyter/.cache/huggingface/datasets/csv/default-ece915fc643c1df9/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/jupyter/.cache/huggingface/datasets/csv/default-ece915fc643c1df9/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['name', 'path', 'emotion'],
    num_rows: 483
})
Dataset({
    features: ['name', 'path', 'emotion'],
    num_rows: 121
})


In [70]:
# Dataset columns used as model input (audio file path) and target (emotion).
input_column, output_column = "path", "emotion"

# Distinct emotion labels of the training split, sorted so the
# label -> index assignment is the same on every run.
label_list = sorted(train_dataset.unique(output_column))
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

A classification problem with 5 classes: ['anger', 'disgust', 'fear', 'happiness', 'sadness']


## Converting audio to vector features (tokens)

In [72]:
from transformers import AutoConfig, Wav2Vec2Processor

model_name_or_path = "lighteternal/wav2vec2-large-xlsr-53-greek"
pooling_mode = "mean"

# Lookup tables between class index and class name, both built from label_list.
id2label = dict(enumerate(label_list))
label2id = {name: i for i, name in id2label.items()}

# Start from the checkpoint's configuration and add the classification settings.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    finetuning_task="wav2vec2_clf",
)
# Extra attribute that the classification model reads to choose its pooling.
config.pooling_mode = pooling_mode

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


In [73]:
# Load the processor published with the checkpoint and read the sampling rate
# its feature extractor expects.
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


The target sampling rate: 16000


The processing will mainly be resampling the audio file to the sampling frequency used by the model. Then we "tokenize" the audio file using the processor.

In [75]:
def speech_file_to_array_fn(path):
    """Load an audio file as a mono waveform at the model's sampling rate.

    Args:
        path: Location of an audio file readable by ``torchaudio.load``.

    Returns:
        A 1-D NumPy array of samples resampled to ``target_sampling_rate``.
    """
    speech_array, sampling_rate = torchaudio.load(path)
    # torchaudio returns the waveform as (channels, frames). Averaging over the
    # channel axis leaves single-channel audio unchanged and downmixes
    # multi-channel audio; a bare ``squeeze()`` would instead hand a 2-D array
    # to the processor for stereo files.
    speech_array = speech_array.mean(dim=0, keepdim=True)
    resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
    speech = resampler(speech_array).squeeze(0).numpy()
    return speech

def label_to_id(label, label_list):
    """Translate a label into its position within ``label_list``.

    Returns the index of ``label`` in ``label_list``; ``-1`` when the list is
    non-empty but does not contain the label; and the label itself, unchanged,
    when ``label_list`` is empty.
    """
    # No vocabulary to look the label up in: pass it through untouched.
    if len(label_list) == 0:
        return label

    try:
        return label_list.index(label)
    except ValueError:
        # Unknown label.
        return -1

def preprocess_function(examples):
    """Turn a batch of dataset rows into processor features plus integer labels.

    Args:
        examples: Batch mapping that holds the ``input_column`` (audio paths)
            and ``output_column`` (label names) entries.

    Returns:
        The processor's output for the loaded waveforms with an added
        ``"labels"`` entry containing one label id per example.
    """
    waveforms = list(map(speech_file_to_array_fn, examples[input_column]))
    label_ids = [label_to_id(name, label_list) for name in examples[output_column]]

    features = processor(waveforms, sampling_rate=target_sampling_rate)
    features["labels"] = label_ids
    return features

In [None]:
# Preprocess the training split in batches of 100 rows using 4 worker processes.
train_dataset = train_dataset.map(preprocess_function, batched=True, batch_size=100, num_proc=4)

In [None]:
# Preprocess the evaluation split in batches of 100 rows using 4 worker processes.
eval_dataset = eval_dataset.map(preprocess_function, batched=True, batch_size=100, num_proc=4)

In [None]:
# Inspect one preprocessed training example; the row is fetched once and reused.
idx = 0
example = train_dataset[idx]
print(f"Training input_values: {example['input_values']}")
print(f"Training attention_mask: {example['attention_mask']}")
print(f"Training labels: {example['labels']} - {example['emotion']}")

# Defining the Model
In this section we define the model we will be using. The Wav2Vec2 family of models was originally designed for speech-to-text. We repurpose it for classification by attaching a classification head on top of the original model.

In [None]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput


@dataclass
class SpeechClassifierOutput(ModelOutput):
    """Result container returned by the speech classification model.

    Every field defaults to ``None``.
    """

    # Training loss; stays ``None`` when the model is called without labels.
    loss: Optional[torch.FloatTensor] = None
    # Classification scores produced by the classification head.
    logits: torch.FloatTensor = None
    # Hidden states forwarded from the wav2vec2 backbone's output.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Attentions forwarded from the wav2vec2 backbone's output.
    attentions: Optional[Tuple[torch.FloatTensor]] = None


The classification head is a small neural network that takes the pooled output of the original Wav2Vec2 model and converts it into one score (logit) for each of the 5 classes. The class with the highest score, and therefore the highest probability after a softmax, is the output of the classifier.

In [None]:
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)


class Wav2Vec2ClassificationHead(nn.Module):
    """Classification head placed on top of a wav2vec2 representation.

    Pipeline: dropout -> linear (hidden_size -> hidden_size) -> tanh ->
    dropout -> linear (hidden_size -> num_labels).
    """

    def __init__(self, config):
        """Build the layers from ``hidden_size``, ``final_dropout`` and ``num_labels``."""
        super().__init__()
        width, n_classes = config.hidden_size, config.num_labels
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(width, n_classes)

    def forward(self, features, **kwargs):
        """Return one score per label for ``features``; extra kwargs are ignored."""
        projected = self.dense(self.dropout(features))
        activated = self.dropout(torch.tanh(projected))
        return self.out_proj(activated)


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """Wav2Vec2 backbone followed by pooling and a classification head.

    The backbone's first output is collapsed over dimension 1 according to
    ``config.pooling_mode`` (``"mean"``, ``"sum"`` or ``"max"``) and passed to
    ``Wav2Vec2ClassificationHead`` to obtain ``config.num_labels`` logits.
    """

    def __init__(self, config):
        """Build the backbone and the head from ``config``.

        ``config`` must carry a ``pooling_mode`` attribute in addition to the
        attributes read by the backbone and the head.
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        """Freeze the backbone's feature extractor via its ``_freeze_parameters`` hook."""
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        """Pool ``hidden_states`` over dimension 1.

        Args:
            hidden_states: Tensor to reduce.
            mode: ``"mean"``, ``"sum"`` or ``"max"``.

        Returns:
            ``hidden_states`` with dimension 1 reduced away.

        Raises:
            ValueError: If ``mode`` is not one of the supported pooling modes.
        """
        # NOTE(review): every position along dim 1 contributes to the result;
        # ``attention_mask`` is not consulted here, so positions that stem from
        # padding (if the batch is padded) are pooled too — consider masking.
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            # torch.max over a dim returns (values, indices); keep the values.
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        """Run the backbone, pool its output, classify and optionally compute a loss.

        Args:
            input_values: Model input forwarded to the wav2vec2 backbone.
            attention_mask: Optional mask forwarded to the backbone.
            output_attentions: Forwarded to the backbone.
            output_hidden_states: Forwarded to the backbone.
            return_dict: Return a ``SpeechClassifierOutput`` when true, a tuple
                otherwise; defaults to ``config.use_return_dict``.
            labels: Optional targets. When given, a loss is computed according
                to ``config.problem_type`` (inferred and stored on the config
                on first use if unset).

        Returns:
            ``SpeechClassifierOutput`` or, when ``return_dict`` is false, a
            tuple ``(loss?, logits, *outputs[2:])``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # First backbone output is the sequence to pool.
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            # Infer the problem type once from the label count and label dtype.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    # Flatten both sides so (N, 1) logits are compared
                    # element-wise with (N,) targets; otherwise MSELoss
                    # broadcasts them against each other into an (N, N) grid
                    # and returns a wrong loss.
                    loss = loss_fct(logits.reshape(-1), labels.reshape(-1))
                else:
                    loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
