In [1]:
from datasets import load_dataset, Audio, DatasetDict, ClassLabel

In [2]:
def adjust_labels(batch):
    """Rebuild the ``emotion`` column of a batch as a plain list.

    The individual values are passed through unchanged. The same ``batch``
    mapping that was passed in is updated and returned.
    """
    emotions = batch["emotion"]
    batch["emotion"] = list(emotions)
    return batch

In [3]:
# Seed shared by both stratified splits so that train/val/test membership
# (and therefore every metric reported below) is reproducible across runs.
SPLIT_SEED = 42

raw_dataset = load_dataset("./dataset/ravd", data_dir="./", split="train")

# Re-declare "emotion" as a ClassLabel with a fixed name order; the column is
# used for stratification below and its names define the label ids.
features = raw_dataset.features.copy()
features["emotion"] = ClassLabel(names=['happy','neutral','angry','sad','fearful','disgust','calm','surprised'])
labelled_dataset = raw_dataset.map(adjust_labels, batched=True, features=features)

# First split: 80 % train / 20 % held out.
train_holdout = labelled_dataset.train_test_split(
    test_size=0.2, stratify_by_column="emotion", seed=SPLIT_SEED
)
# Second split: the held-out part is halved into validation and test.
test_data_split = train_holdout["test"].train_test_split(
    test_size=0.5, stratify_by_column="emotion", seed=SPLIT_SEED
)
english_dataset = DatasetDict({
    "train": train_holdout["train"],
    "test": test_data_split["test"],
    # The "train" half of the second split serves as the validation set.
    "val": test_data_split["train"],
})

Resolving data files:   0%|          | 0/1441 [00:00<?, ?it/s]

In [4]:
from transformers import AutoFeatureExtractor

# Feature extractor taken from the same checkpoint id that the model is loaded from later.
pretrained_checkpoint = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_checkpoint)

In [5]:
def preprocess_function(examples):
    """Run the notebook-level ``feature_extractor`` over a batch of audio items.

    Collects the ``"array"`` entry of every item in ``examples["audio"]`` and
    passes the resulting list to ``feature_extractor``, declaring the
    extractor's own ``sampling_rate`` as the rate of the inputs.

    Returns whatever the extractor call returns.
    """
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])
    return feature_extractor(waveforms, sampling_rate=feature_extractor.sampling_rate)


In [6]:
# Cast the "audio" column to an Audio feature with a 16 000 Hz sampling rate.
audio_16k = Audio(sampling_rate=16000)
english_dataset = english_dataset.cast_column("audio", audio_16k)

In [7]:
# Build both lookup tables from the ClassLabel names of the "emotion" column.
# Ids are stored as strings in both directions.
labels = english_dataset["train"].features["emotion"].names
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [8]:
# Swap the "audio" column for the feature extractor's outputs, then rename the
# target column from "emotion" to "label".
encoded_english_dataset = english_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns="audio",
).rename_column("emotion", "label")

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

In [9]:
import evaluate

# Metric object; `compute_metrics` calls its `.compute()` method.
metric_name = "accuracy"
accuracy = evaluate.load(metric_name)

In [10]:
import numpy as np


def compute_metrics(eval_pred):
    """Score a set of predictions with the notebook-level ``accuracy`` metric.

    ``eval_pred.predictions`` is reduced with an argmax over axis 1 to get one
    predicted index per row; those indices are compared against
    ``eval_pred.label_ids``.

    Returns the result of ``accuracy.compute``.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [11]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# One output class per entry in the label mapping.
num_labels = len(label2id)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    num_labels=num_labels,
    label2id=label2id,
    # Fix: this argument previously received `label2id` (name -> id), so the
    # model config did not carry the id -> name mapping.
    id2label=id2label,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Training configuration. Evaluation and checkpointing both happen once per
# epoch, and the checkpoint with the best "eval_loss" is reloaded when
# training finishes (load_best_model_at_end=True).
training_args = TrainingArguments(
    output_dir="english_new_emotion_model",
    # Schedule / optimisation
    num_train_epochs=25,
    learning_rate=1e-4,
    warmup_ratio=0.1,
    # Batching: 4 samples per device per step, gradients accumulated over 2 steps
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    # Evaluation, checkpointing and logging
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    push_to_hub=False,
)

# The validation split drives per-epoch evaluation; the test split is not used here.
train_split = encoded_english_dataset["train"].with_format("torch")
val_split = encoded_english_dataset["val"].with_format("torch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=val_split,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Epoch,Training Loss,Validation Loss,Accuracy
1,2.0839,2.070834,0.131944
2,2.0494,1.924896,0.277778
3,1.7696,1.731799,0.270833
4,1.6705,1.446478,0.465278
5,1.2,1.0931,0.590278
6,0.826,0.863294,0.680556
7,0.5604,0.664508,0.798611
8,0.5104,0.691342,0.805556
9,0.4057,0.53556,0.8125
10,0.3589,0.476575,0.861111


TrainOutput(global_step=3600, training_loss=0.5487772062675665, metrics={'train_runtime': 1266.9957, 'train_samples_per_second': 22.731, 'train_steps_per_second': 2.841, 'total_flos': 3.5550352964020173e+18, 'train_loss': 0.5487772062675665, 'epoch': 25.0})

In [13]:
# Evaluate on the test split, which was not passed to the Trainer for training or validation.
test_split = encoded_english_dataset["test"].with_format("torch")
trainer.evaluate(test_split)

{'eval_loss': 0.572293221950531,
 'eval_accuracy': 0.9097222222222222,
 'eval_runtime': 2.4581,
 'eval_samples_per_second': 58.582,
 'eval_steps_per_second': 14.646,
 'epoch': 25.0}

In [14]:
# Path of the checkpoint the Trainer recorded as best during training.
trainer_state = trainer.state
best_ckpt_path = trainer_state.best_model_checkpoint
print(best_ckpt_path)

english_new_emotion_model\checkpoint-2448


### Evaluate on the emo dataset (only samples whose labels are shared with the training set)

In [15]:
# Same emotion names, in the same order, as the ClassLabel used for the training data.
emotion_names = ['happy','neutral','angry','sad','fearful','disgust','calm','surprised']

emo_dataset = load_dataset("./dataset/ravd_model_test_using_emo", data_dir="./", split="train")
features = emo_dataset.features.copy()
features["emotion"] = ClassLabel(names=emotion_names)

# Wrap the single split in a DatasetDict under the key "test".
emo_dataset = DatasetDict(
    {"test": emo_dataset.map(adjust_labels, batched=True, features=features)}
)

Resolving data files:   0%|          | 0/455 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/454 [00:00<?, ? examples/s]

In [16]:
from transformers import AutoFeatureExtractor
# `feature_extractor` and `preprocess_function` are already defined earlier in
# this notebook. They are reused here rather than reloaded/redefined, so the
# emo data goes through exactly the same preprocessing as the training data
# and no duplicate definition shadows the original one.
emo_dataset = emo_dataset.cast_column("audio", Audio(sampling_rate=16000))

# Swap the "audio" column for the extractor outputs, then rename the target
# column from "emotion" to "label".
encoded_emo_dataset = emo_dataset.map(preprocess_function, remove_columns="audio", batched=True)
encoded_emo_dataset = encoded_emo_dataset.rename_column("emotion", "label")

Map:   0%|          | 0/454 [00:00<?, ? examples/s]

In [17]:
# Evaluate the trained model on the encoded emo dataset.
emo_test_split = encoded_emo_dataset["test"].with_format("torch")
trainer.evaluate(emo_test_split)

{'eval_loss': 3.400320529937744,
 'eval_accuracy': 0.5154185022026432,
 'eval_runtime': 9.0468,
 'eval_samples_per_second': 50.184,
 'eval_steps_per_second': 12.601,
 'epoch': 25.0}