In [1]:
from datasets import load_dataset, Audio, DatasetDict, ClassLabel

In [2]:
def adjust_labels(batch):
    """Pass a batch through unchanged, materialising its "emotion" column as a list.

    Args:
        batch: Mapping of column name to a sequence of values; must contain
            the key "emotion".

    Returns:
        The same ``batch`` object, with ``batch["emotion"]`` replaced by a
        fresh list holding the same values in the same order.
    """
    emotions = list(batch["emotion"])
    batch["emotion"] = emotions
    return batch


In [3]:
# Load the combined corpus and declare "emotion" as a ClassLabel so the
# splits below can be stratified on it.
combined_dataset = load_dataset("./dataset/combined", data_dir="./", split="train")
features = combined_dataset.features.copy()
features["emotion"] = ClassLabel(names=['happy','neutral','angry','sad','fearful','disgust','calm','surprised','boredom'])
# adjust_labels is a pass-through; the map call is what applies the new schema.
combined_dataset = combined_dataset.map(adjust_labels, batched=True, features=features)

# 80/10/10 train/val/test split, stratified by emotion.
# A fixed seed keeps split membership identical across kernel restarts, so the
# reported metrics are reproducible and re-running this cell cannot move
# training clips into the validation or test splits.
combined_dataset = combined_dataset.train_test_split(test_size=0.2, stratify_by_column="emotion", seed=42)
test_data_split = combined_dataset["test"].train_test_split(test_size=0.5, stratify_by_column="emotion", seed=42)
combined_dataset = DatasetDict({
    "train": combined_dataset["train"],
    "test": test_data_split["test"],
    "val": test_data_split["train"]
})

Resolving data files:   0%|          | 0/1976 [00:00<?, ?it/s]

In [4]:
from transformers import AutoFeatureExtractor
# Feature extractor that ships with the pretrained checkpoint named below.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    pretrained_model_name_or_path="facebook/wav2vec2-large-xlsr-53",
)

In [5]:
def preprocess_function(examples):
    """Run the module-level ``feature_extractor`` over a batch of audio clips.

    Args:
        examples: Batch mapping whose "audio" entry is a sequence of items,
            each exposing its samples under the key "array".

    Returns:
        Whatever ``feature_extractor`` returns for the collected arrays when
        called with its own ``sampling_rate``.
    """
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])
    return feature_extractor(waveforms, sampling_rate=feature_extractor.sampling_rate)

In [6]:
# Declare the "audio" column as 16 kHz audio on every split.
combined_dataset = combined_dataset.cast_column(
    column="audio",
    feature=Audio(sampling_rate=16000),
)

In [7]:
# Build the two lookup tables between emotion names and their (stringified)
# class indices, in ClassLabel order.
labels = combined_dataset["train"].features["emotion"].names
label2id = {name: str(index) for index, name in enumerate(labels)}
id2label = {str(index): name for index, name in enumerate(labels)}

In [8]:
# Extract model inputs for every split, drop the raw "audio" column, and
# rename "emotion" to "label".
encoded_combined_dataset = (
    combined_dataset
    .map(preprocess_function, remove_columns="audio", batched=True)
    .rename_column("emotion", "label")
)
encoded_combined_dataset

Map:   0%|          | 0/1580 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Map:   0%|          | 0/197 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 1580
    })
    test: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 198
    })
    val: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 197
    })
})

In [9]:
import evaluate

# Accuracy metric used by compute_metrics during evaluation.
accuracy = evaluate.load(path="accuracy")

In [10]:
import numpy as np


def compute_metrics(eval_pred):
    """Score a set of predictions with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, reduced
            with argmax along axis 1) and ``label_ids`` (reference labels).

    Returns:
        The result of ``accuracy.compute`` for the argmax predictions against
        the reference labels.
    """
    scores = eval_pred.predictions
    references = eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [11]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Instantiate the pretrained encoder with a classification head sized to the
# emotion label set.
num_labels = len(label2id)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    num_labels=num_labels,
    label2id=label2id,
    # Fix: pass the index -> name map here. label2id was previously passed for
    # both arguments, so the saved config could not decode predicted class ids
    # back to emotion names.
    id2label=id2label,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Fine-tuning configuration; checkpoints are written under output_dir.
training_args = TrainingArguments(
    output_dir="combined_new_emotion_model",
    push_to_hub=False,
    # Optimisation schedule.
    num_train_epochs=25,
    learning_rate=1e-4,
    warmup_ratio=0.1,
    # Batching: 4 clips per device, gradients accumulated over 2 steps.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    # Evaluate and checkpoint once per epoch; best model is chosen by eval_loss.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_steps=10,
)

# Train on the "train" split and validate on "val"; "test" is held out.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
    train_dataset=encoded_combined_dataset["train"].with_format("torch"),
    eval_dataset=encoded_combined_dataset["val"].with_format("torch"),
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Epoch,Training Loss,Validation Loss,Accuracy
0,2.1665,2.140993,0.162437
2,1.6779,1.672525,0.340102
4,1.0152,0.858021,0.685279
6,0.8356,0.413865,0.893401
8,0.3902,0.301223,0.93401
10,0.2318,0.666462,0.873096
12,0.5071,0.353614,0.923858
14,0.1722,0.179738,0.959391
16,0.1664,0.334996,0.939086
18,0.0799,0.285123,0.949239


TrainOutput(global_step=4925, training_loss=0.5267856774605376, metrics={'train_runtime': 1696.8494, 'train_samples_per_second': 23.278, 'train_steps_per_second': 2.902, 'total_flos': 4.876366232802485e+18, 'train_loss': 0.5267856774605376, 'epoch': 24.94})

In [13]:
# Final evaluation on the held-out "test" split.
trainer.evaluate(
    eval_dataset=encoded_combined_dataset["test"].with_format("torch"),
)

{'eval_loss': 0.6186405420303345,
 'eval_accuracy': 0.8838383838383839,
 'eval_runtime': 3.7204,
 'eval_samples_per_second': 53.22,
 'eval_steps_per_second': 13.439,
 'epoch': 24.94}

In [14]:
# Checkpoint path the Trainer state recorded as the best model; printed so it
# can be copied for later loading.
best_ckpt_path = trainer.state.best_model_checkpoint
print(best_ckpt_path)

combined_new_emotion_model\checkpoint-2962
