# INSTALL 

In [None]:
%%capture

!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install jiwer
!pip install torchaudio
!pip install librosa
# Monitor the training process
# !pip install wandb

In [None]:
!pip install wandb -qqq
import wandb

[K     |████████████████████████████████| 1.8 MB 4.3 MB/s 
[K     |████████████████████████████████| 145 kB 68.0 MB/s 
[K     |████████████████████████████████| 181 kB 74.6 MB/s 
[K     |████████████████████████████████| 63 kB 2.1 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [None]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
%env WANDB_LOG_MODEL=true

env: WANDB_LOG_MODEL=true


In [None]:
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
%env TRANSFORMERS_CACHE=/content/cache
%env HF_DATASETS_CACHE=/content/cache
%env CUDA_LAUNCH_BLOCKING=1

env: LC_ALL=C.UTF-8
env: LANG=C.UTF-8
env: TRANSFORMERS_CACHE=/content/cache
env: HF_DATASETS_CACHE=/content/cache
env: CUDA_LAUNCH_BLOCKING=1


# IMPORT

In [None]:
import numpy as np
import pandas as pd
import math 

from pathlib import Path
from tqdm import tqdm

import torchaudio
from sklearn.model_selection import train_test_split

import os
import sys

from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput

import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)

from typing import Dict, List, Optional, Union

import transformers
from transformers import Wav2Vec2Processor
from transformers import AutoConfig
from datasets import DatasetDict, load_from_disk

from typing import Any, Dict, Union

from packaging import version

from transformers import (
    Trainer,
    is_apex_available,
)

if is_apex_available():
    from apex import amp

if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast

from transformers import TrainingArguments



# DATASET

In [None]:
pooling_mode = "mean"
model_name_or_path = "facebook/wav2vec2-base-960h"

In [None]:
!unzip -d /content/ /content/drive/MyDrive/CN7/DLP/voice_emotion_recognition/dataset/RAV_TESS_preprocessed_data.zip
!mv /content/content/RAV_TESS_preprocessed_data /content/

Archive:  /content/drive/MyDrive/CN7/DLP/voice_emotion_recognition/dataset/RAV_TESS_preprocessed_data.zip
   creating: /content/content/RAV_TESS_preprocessed_data/
 extracting: /content/content/RAV_TESS_preprocessed_data/dataset_dict.json  
   creating: /content/content/RAV_TESS_preprocessed_data/train/
  inflating: /content/content/RAV_TESS_preprocessed_data/train/dataset_info.json  
  inflating: /content/content/RAV_TESS_preprocessed_data/train/dataset.arrow  
  inflating: /content/content/RAV_TESS_preprocessed_data/train/state.json  
   creating: /content/content/RAV_TESS_preprocessed_data/validation/
  inflating: /content/content/RAV_TESS_preprocessed_data/validation/dataset_info.json  
  inflating: /content/content/RAV_TESS_preprocessed_data/validation/dataset.arrow  
  inflating: /content/content/RAV_TESS_preprocessed_data/validation/state.json  


In [None]:
dataset = load_from_disk("/content/RAV_TESS_preprocessed_data")
train_dataset, eval_dataset = dataset["train"], dataset["validation"]

In [None]:
print(train_dataset)
print(eval_dataset)

Dataset({
    features: ['Path', 'Emotion', 'input_values', 'labels'],
    num_rows: 4544
})
Dataset({
    features: ['Path', 'Emotion', 'input_values', 'labels'],
    num_rows: 1136
})


In [None]:
# We need to specify the input and output column
input_column = "Path"
output_column = "Emotion"

In [None]:
# we need to distinguish the unique labels in our SER dataset
# Distinct values of the output column, sorted so that the label -> id mapping
# built from this list is the same on every run.
label_list = sorted(train_dataset.unique(output_column))
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

A classification problem with 8 classes: ['angry', 'calm', 'disgusted', 'fearful', 'happy', 'neutral', 'sad', 'surprised']


In [None]:
# config
# Model configuration: start from the pretrained checkpoint's config and attach
# the label vocabulary (both directions) plus the pooling mode read by the model.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id=dict(zip(label_list, range(len(label_list)))),
    id2label=dict(enumerate(label_list)),
    finetuning_task="wav2vec2_clf",
)
config.pooling_mode = pooling_mode

Downloading:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

In [None]:
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path,)
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

Downloading:   0%|          | 0.00/159 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/163 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

The target sampling rate: 16000


# MODEL

In [None]:
@dataclass
class SpeechClassifierOutput(ModelOutput):
    """Structured output of ``Wav2Vec2ForSpeechClassification.forward``.

    Returned when ``return_dict`` is truthy; otherwise ``forward`` returns a
    plain tuple instead.
    """

    # Loss value; stays ``None`` when ``forward`` is called without ``labels``.
    loss: Optional[torch.FloatTensor] = None
    # Raw (pre-softmax) scores produced by the classification head.
    logits: torch.FloatTensor = None
    # Forwarded unchanged from the wrapped Wav2Vec2 model's output.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Forwarded unchanged from the wrapped Wav2Vec2 model's output.
    attentions: Optional[Tuple[torch.FloatTensor]] = None

In [None]:
class Wav2Vec2ClassificationHead(nn.Module):
    """Two-layer classification head applied to pooled wav2vec2 features.

    The pipeline is: dropout -> linear (hidden -> hidden) -> tanh -> dropout ->
    linear (hidden -> num_labels). Sizes and the dropout probability are read
    from ``config.hidden_size``, ``config.num_labels`` and
    ``config.final_dropout``.
    """

    def __init__(self, config):
        super().__init__()
        hidden_dim = config.hidden_size
        self.dense = nn.Linear(hidden_dim, hidden_dim)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(hidden_dim, config.num_labels)

    def forward(self, features, **kwargs):
        """Map ``features`` to logits; extra keyword arguments are accepted and ignored."""
        projected = torch.tanh(self.dense(self.dropout(features)))
        return self.out_proj(self.dropout(projected))


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """Wav2Vec2 encoder followed by pooling and a classification/regression head.

    The encoder output is reduced along dimension 1 with the strategy named by
    ``config.pooling_mode`` ("mean", "sum" or "max") and the pooled vector is
    fed to ``Wav2Vec2ClassificationHead``.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        # Attribute names are part of the checkpoint key layout; keep them stable.
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        """Freeze the parameters of the wrapped model's feature extractor."""
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        """Pool ``hidden_states`` over dimension 1.

        Args:
            hidden_states: tensor to reduce (assumed ``(batch, time, hidden)`` —
                TODO confirm against the encoder output).
            mode: one of ``"mean"``, ``"sum"`` or ``"max"``.

        Returns:
            The tensor with dimension 1 reduced away.

        Raises:
            ValueError: if ``mode`` is not a supported pooling strategy.
        """
        if mode == "mean":
            return torch.mean(hidden_states, dim=1)
        if mode == "sum":
            return torch.sum(hidden_states, dim=1)
        if mode == "max":
            # torch.max along a dim returns (values, indices); keep the values.
            return torch.max(hidden_states, dim=1)[0]
        raise ValueError(
            "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        """Run the encoder, pool its output, classify, and optionally compute a loss.

        Args:
            input_values: model input passed straight to the Wav2Vec2 encoder.
            attention_mask: optional mask passed straight to the encoder.
            output_attentions: forwarded to the encoder.
            output_hidden_states: forwarded to the encoder.
            return_dict: when ``None``, falls back to ``config.use_return_dict``.
            labels: optional targets; when given, a loss is computed according
                to ``config.problem_type`` (inferred on first use if unset).

        Returns:
            ``SpeechClassifierOutput`` when ``return_dict`` is truthy, otherwise
            a tuple ``(loss?, logits, *encoder_outputs[2:])``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                # Infer the task once from the label count and dtype, then cache it on the config.
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    # Flatten both sides: comparing (batch, 1) logits with (batch,)
                    # labels would silently broadcast to (batch, batch).
                    loss = loss_fct(logits.reshape(-1), labels.reshape(-1))
                else:
                    loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # Drop the first two encoder outputs and keep whatever follows them.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

# TRAINING

In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate examples into a batch, padding ``input_values`` dynamically.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor whose ``pad`` method is used to pad the inputs.
        padding (:obj:`bool` or :obj:`str`, `optional`, defaults to :obj:`True`):
            Padding strategy forwarded to ``processor.pad``.
        max_length (:obj:`int`, `optional`):
            Forwarded to ``processor.pad`` as ``max_length``.
        max_length_labels (:obj:`int`, `optional`):
            Declared for interface compatibility; not used by ``__call__``.
        pad_to_multiple_of (:obj:`int`, `optional`):
            Forwarded to ``processor.pad`` as ``pad_to_multiple_of``.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            Declared for interface compatibility; not used by ``__call__``.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad the ``input_values`` of ``features`` and attach a ``labels`` tensor."""
        # Inputs and labels are split because only the inputs go through padding.
        padded_inputs = [{"input_values": example["input_values"]} for example in features]
        targets = [example["labels"] for example in features]

        # Python int labels become a long tensor; anything else becomes float.
        if isinstance(targets[0], int):
            label_dtype = torch.long
        else:
            label_dtype = torch.float

        batch = self.processor.pad(
            padded_inputs,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        batch["labels"] = torch.tensor(targets, dtype=label_dtype)
        return batch

In [None]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
is_regression = False

In [None]:
import numpy as np
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    """Compute the evaluation metric for one prediction set.

    Returns ``{"mse": ...}`` when the module-level ``is_regression`` flag is
    truthy, otherwise ``{"accuracy": ...}`` based on the arg-max over axis 1.
    """
    raw_predictions = p.predictions
    if isinstance(raw_predictions, tuple):
        # Only the first element of a tuple of predictions is scored.
        raw_predictions = raw_predictions[0]

    if is_regression:
        squeezed = np.squeeze(raw_predictions)
        return {"mse": ((squeezed - p.label_ids) ** 2).mean().item()}

    predicted_ids = np.argmax(raw_predictions, axis=1)
    return {"accuracy": (predicted_ids == p.label_ids).astype(np.float32).mean().item()}

In [None]:
model = Wav2Vec2ForSpeechClassification.from_pretrained(
    model_name_or_path,
    config=config,
)

Downloading:   0%|          | 0.00/360M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForSpeechClassification: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a

In [None]:
model.freeze_feature_extractor()

In [None]:
!mkdir -p wav2vec2-english-speech-emotion-recognition

In [None]:

# Hyper-parameters for the fine-tuning run; consumed by both Trainer instances below.
training_args = TrainingArguments(
    # Send training/evaluation logs to Weights & Biases.
    report_to = 'wandb',
    output_dir="/content/wav2vec2-english-speech-emotion-recognition",
    group_by_length=True,
    # 64 samples per device x 2 accumulation steps = 128 samples per optimizer step.
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    num_train_epochs=30.0,
    fp16=True,
    # Save, evaluate and log on the same 10-step cadence.
    save_steps=10,
    eval_steps=10,
    logging_steps=10,
    learning_rate=1e-4,
    weight_decay=0.005, 
    # NOTE(review): the logged run has 1050 optimization steps in total, so
    # warm-up covers roughly half of training — confirm this is intended.
    warmup_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    run_name = 'EVMR',
)

In [None]:
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor,
)

Using cuda_amp half precision backend


In [None]:
from warnings import filterwarnings
filterwarnings(action='ignore', category=DeprecationWarning, message='`np.bool` is a deprecated alias')
trainer.train()

The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: Emotion, Path. If Emotion, Path are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4544
  Num Epochs = 30
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 2
  Total optimization steps = 1050
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


[34m[1mwandb[0m: Currently logged in as: [33mkhangnguyen2907[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy
10,2.0886,2.080406,0.147007
20,2.0795,2.078787,0.139965
30,2.0916,2.07577,0.139965
40,2.1872,2.072041,0.146127
50,2.0742,2.068383,0.162852
60,2.0708,2.065174,0.177817
70,2.0677,2.06105,0.160211
80,2.1633,2.055563,0.139965
90,2.0623,2.050861,0.138204
100,2.0503,2.045584,0.139085


The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: Emotion, Path. If Emotion, Path are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1136
  Batch size = 64
Saving model checkpoint to /content/wav2vec2-english-speech-emotion-recognition/checkpoint-10
Configuration saved in /content/wav2vec2-english-speech-emotion-recognition/checkpoint-10/config.json
Model weights saved in /content/wav2vec2-english-speech-emotion-recognition/checkpoint-10/pytorch_model.bin
Feature extractor saved in /content/wav2vec2-english-speech-emotion-recognition/checkpoint-10/preprocessor_config.json
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: Emotion, Path. If Emotion, Path are not expected by `Wav2Vec2ForSpeechClassifi

TrainOutput(global_step=1050, training_loss=0.7602961384682428, metrics={'train_runtime': 8802.0384, 'train_samples_per_second': 15.487, 'train_steps_per_second': 0.119, 'total_flos': 3.7838772599500984e+18, 'train_loss': 0.7602961384682428, 'epoch': 29.99})

In [None]:
wandb.finish()

VBox(children=(Label(value='362.351 MB of 362.351 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0,…

0,1
eval/accuracy,▁▁▁▁▁▂▃▃▄▃▅▅▆▆▅▇▆▆▆▇▇▇▇▇████████████████
eval/loss,█████▇▆▆▆▅▅▄▄▄▄▃▄▃▃▂▂▂▂▃▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,█▃▄▂▂▁▁▁▁▂▃▃▂▃▂▂▆▃▂▃▅▅▇▅▅▄▄▆▇▄▅▄▆▆█▅▆▅▅▆
eval/samples_per_second,▁▆▅▇▇████▇▆▆▇▆▇▇▃▆▇▆▄▄▂▄▄▅▅▃▂▅▄▅▃▃▁▄▃▄▄▃
eval/steps_per_second,▁▆▅▆▇█▇▇▇▆▆▆▆▆▆▆▃▆▆▆▄▄▁▄▄▅▅▂▂▄▄▅▃▃▁▄▃▄▃▂
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇████▇▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▂▁▁
train/loss,█████▇▆▆▆▆▅▅▅▄▃▃▄▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁

0,1
eval/accuracy,0.99032
eval/loss,0.0525
eval/runtime,34.5897
eval/samples_per_second,32.842
eval/steps_per_second,0.52
train/epoch,29.99
train/global_step,1050.0
train/learning_rate,0.0
train/loss,0.019
train/total_flos,3.7838772599500974e+18


In [None]:
!mkdir -p saved_model

In [None]:
trainer.save_model("/content/saved_model")

Saving model checkpoint to /content/saved_model
Configuration saved in /content/saved_model/config.json
Model weights saved in /content/saved_model/pytorch_model.bin
Feature extractor saved in /content/saved_model/preprocessor_config.json


In [None]:
!zip -r saved_model_evmr.zip /content/saved_model
!zip -r checkpoint_evmr.zip /content/wav2vec2-english-speech-emotion-recognition

  adding: content/saved_model/ (stored 0%)
  adding: content/saved_model/training_args.bin (deflated 48%)
  adding: content/saved_model/pytorch_model.bin (deflated 9%)
  adding: content/saved_model/config.json (deflated 64%)
  adding: content/saved_model/preprocessor_config.json (deflated 33%)
  adding: content/wav2vec2-english-speech-emotion-recognition/ (stored 0%)
  adding: content/wav2vec2-english-speech-emotion-recognition/checkpoint-1030/ (stored 0%)
  adding: content/wav2vec2-english-speech-emotion-recognition/checkpoint-1030/optimizer.pt (deflated 8%)
  adding: content/wav2vec2-english-speech-emotion-recognition/checkpoint-1030/training_args.bin (deflated 48%)
  adding: content/wav2vec2-english-speech-emotion-recognition/checkpoint-1030/rng_state.pth (deflated 28%)
  adding: content/wav2vec2-english-speech-emotion-recognition/checkpoint-1030/pytorch_model.bin (deflated 9%)
  adding: content/wav2vec2-english-speech-emotion-recognition/checkpoint-1030/config.json (deflated 64%)
 

In [None]:
!zip -r wandb_evmr.zip /content/wandb

In [None]:
!cp -av /content/wandb_evmr.zip /content/drive/MyDrive/CN7/DLP/voice_emotion_recognition/zip

'/content/wandb_evmr.zip' -> '/content/drive/MyDrive/CN7/DLP/voice_emotion_recognition/zip/wandb_evmr.zip'


In [None]:
!cp -av /content/saved_model_evmr.zip  /content/drive/MyDrive/CN7/DLP/voice_emotion_recognition/zip
!cp -av /content/checkpoint_evmr.zip  /content/drive/MyDrive/CN7/DLP/voice_emotion_recognition/zip

'/content/saved_model_evmr.zip' -> '/content/drive/MyDrive/CN7/DLP/voice_emotion_recognition/zip/saved_model_evmr.zip'
'/content/checkpoint_evmr.zip' -> '/content/drive/MyDrive/CN7/DLP/voice_emotion_recognition/zip/checkpoint_evmr.zip'


In [None]:
from transformers import AutoModel

In [None]:
# Reload the most recent checkpoint written by the training run above. The
# previous hard-coded path pointed at a directory from a different project
# that this notebook never creates.
last_checkpoint = transformers.trainer_utils.get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None:
    raise FileNotFoundError(f"No checkpoint found in {training_args.output_dir}")
model = Wav2Vec2ForSpeechClassification.from_pretrained(last_checkpoint)

In [None]:
model.cuda()

In [None]:
new_trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor,
)

Using cuda_amp half precision backend


In [None]:
new_trainer.train(resume_from_checkpoint=True)

Loading model from /content/wav2vec2-xlsr-greek-speech-emotion-recognition/checkpoint-560.
The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: Path, Emotion. If Path, Emotion are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4544
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 568
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 0
  Continuing training from global step 560
  Will skip the first 0 epochs then the first 1120 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your

  0%|          | 0/1120 [00:00<?, ?it/s]

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Using cuda_amp half precision backend
Saving model checkpoint to /tmp/tmpq501g8li
Configuration saved in /tmp/tmpq501g8li/config.json
Model weights saved in /tmp/tmpq501g8li/pytorch_model.bin
Feature extractor saved in /tmp/tmpq501g8li/preprocessor_config.json


TrainOutput(global_step=568, training_loss=0.020683181117957746, metrics={'train_runtime': 109.6715, 'train_samples_per_second': 41.433, 'train_steps_per_second': 5.179, 'total_flos': 1.570500384537744e+17, 'train_loss': 0.020683181117957746, 'epoch': 1.0})

# Evaluation

In [None]:
import librosa
from sklearn.metrics import classification_report

In [None]:
dataset = load_from_disk("/content/RAV_TESS_preprocessed_data")
test_dataset = dataset["validation"]

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

In [None]:
# Path or Hub id of the fine-tuned model to evaluate. It is intentionally left
# blank here and must be filled in before running this cell.
model_name_or_path = ""
# Auto classes cannot be instantiated directly; they must go through from_pretrained.
config = AutoConfig.from_pretrained(model_name_or_path)
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)

In [None]:
def speech_file_to_array_fn(batch):
    """Load the audio file at ``batch["Path"]`` and store it, resampled, in ``batch["speech"]``.

    The waveform is resampled from the file's own rate to the rate expected by
    the module-level ``processor``'s feature extractor.

    Args:
        batch: a single dataset example (mapping) with a ``"Path"`` entry.

    Returns:
        The same mapping with a ``"speech"`` numpy array added.
    """
    speech_array, sampling_rate = torchaudio.load(batch["Path"])
    speech_array = speech_array.squeeze().numpy()
    # Sample rates are passed by keyword: recent librosa releases made them
    # keyword-only, and the keyword form also works on older releases.
    speech_array = librosa.resample(
        np.asarray(speech_array),
        orig_sr=sampling_rate,
        target_sr=processor.feature_extractor.sampling_rate,
    )

    batch["speech"] = speech_array
    return batch


def predict(batch):
    """Predict a class id for every waveform in ``batch["speech"]``.

    Uses the module-level ``processor``, ``model`` and ``device``.

    Args:
        batch: a batched dataset slice with a ``"speech"`` entry.

    Returns:
        The same mapping with a ``"predicted"`` array of arg-max class ids added.
    """
    features = processor(batch["speech"], sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt", padding=True)

    input_values = features.input_values.to(device)
    # Some feature extractors are configured not to return an attention mask;
    # attribute access would raise in that case, so look it up defensively.
    attention_mask = features.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    pred_ids = torch.argmax(logits, dim=-1).detach().cpu().numpy()
    batch["predicted"] = pred_ids
    return batch

In [None]:
test_dataset = test_dataset.map(speech_file_to_array_fn)

In [None]:
result = test_dataset.map(predict, batched=True, batch_size=8)

In [None]:
label_names = [config.id2label[i] for i in range(config.num_labels)]
label_names

In [None]:
label_names = [config.id2label[i] for i in range(config.num_labels)]
label_names

In [None]:
# The label column of the dataset is named "Emotion" (capitalised); map each
# label string to its integer id so it is comparable with the predictions.
y_true = [config.label2id[name] for name in result["Emotion"]]
y_pred = result["predicted"]

print(y_true[:5])
print(y_pred[:5])

In [None]:
print(classification_report(y_true, y_pred, target_names=label_names))

# Prediction 

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name_or_path = ""
config = AutoConfig.from_pretrained(model_name_or_path)
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
sampling_rate = processor.feature_extractor.sampling_rate
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)

In [None]:
def speech_file_to_array_fn(path, sampling_rate):
    """Load the first channel of the audio file at ``path``, resampled to ``sampling_rate``.

    Args:
        path: location of the audio file, passed to ``torchaudio.load``.
        sampling_rate: target sampling rate of the returned waveform.

    Returns:
        A numpy array holding the resampled first channel.
    """
    speech_array, _sampling_rate = torchaudio.load(path)
    # Pass the target rate explicitly; previously the argument was ignored and
    # the transform's default target rate was used regardless of the request.
    resampler = torchaudio.transforms.Resample(orig_freq=_sampling_rate, new_freq=sampling_rate)
    speech = resampler(speech_array[0]).squeeze().numpy()
    return speech


def predict(path, sampling_rate):
    """Score every emotion class for the audio file at ``path``.

    Uses the module-level ``processor``, ``model``, ``config`` and ``device``.

    Args:
        path: location of the audio file.
        sampling_rate: sampling rate the audio is resampled to and declared
            to the processor.

    Returns:
        A list with one ``{"Emotion": label, "Score": "NN.N%"}`` dict per class,
        in class-id order.
    """
    speech = speech_file_to_array_fn(path, sampling_rate)
    features = processor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

    input_values = features.input_values.to(device)
    # Some feature extractors are configured not to return an attention mask;
    # attribute access would raise in that case, so look it up defensively.
    attention_mask = features.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    # torch.softmax is used because no functional alias `F` is imported in this notebook.
    scores = torch.softmax(logits, dim=1).detach().cpu().numpy()[0]
    outputs = [{"Emotion": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"} for i, score in enumerate(scores)]
    return outputs

STYLES = """
<style>
div.display_data {
    margin: 0 auto;
    max-width: 500px;
}
table.xxx {
    margin: 50px !important;
    float: right !important;
    clear: both !important;
}
table.xxx td {
    min-width: 300px !important;
    text-align: center !important;
}
</style>
""".strip()

def prediction(df_row):
    """Display the reference label, an audio player and the predicted scores for one row.

    Args:
        df_row: a mapping/row with ``"Path"`` (audio file) and ``"Emotion"``
            (reference label) entries.

    Uses the module-level ``STYLES``, ``sampling_rate`` and ``predict``.
    Nothing is returned; all results are rendered through IPython's display.
    """
    # Imported locally because the notebook never imports the display helpers
    # at module level.
    import IPython.display as ipd

    path, emotion = df_row["Path"], df_row["Emotion"]
    df = pd.DataFrame([{"Emotion": emotion, "Sentence": "    "}])
    setup = {
        'border': 2,
        'show_dimensions': True,
        'justify': 'center',
        'classes': 'xxx',
        'escape': False,
    }
    ipd.display(ipd.HTML(STYLES + df.to_html(**setup) + "<br />"))
    speech, sr = torchaudio.load(path)
    speech = speech[0].numpy().squeeze()
    # Sample rates are passed by keyword: recent librosa releases made them
    # keyword-only, and the keyword form also works on older releases.
    speech = librosa.resample(np.asarray(speech), orig_sr=sr, target_sr=sampling_rate)
    ipd.display(ipd.Audio(data=np.asarray(speech), autoplay=True, rate=sampling_rate))

    outputs = predict(path, sampling_rate)
    r = pd.DataFrame(outputs)
    ipd.display(ipd.HTML(STYLES + r.to_html(**setup) + "<br />"))

In [None]:
test = pd.read_csv("/content/data/test.csv", sep="\t")
test.head()