In [1]:
from datasets import load_dataset, Audio, DatasetDict, ClassLabel

In [2]:
def adjust_labels(batch):
    """Return ``batch`` with its "emotion" column rebuilt as a plain list.

    The values themselves are not altered. In this notebook the function is
    handed to ``Dataset.map(..., batched=True, features=...)``, so any label
    re-encoding comes from the ``features`` argument of that call rather than
    from this function.

    Args:
        batch: mapping with an iterable under the "emotion" key.

    Returns:
        The same ``batch`` object, mutated in place.
    """
    batch["emotion"] = list(batch["emotion"])
    return batch

In [3]:
# Fixed seed for both splits below. Without it every re-run of this cell
# reshuffles the data, so the train/val/test membership (and every metric
# reported further down) changes from run to run.
SPLIT_SEED = 42

german_dataset = load_dataset("./dataset/emo", data_dir="./", split="train")

# Re-declare "emotion" as a ClassLabel. The order of `names` fixes the integer
# ids (happy=0 ... disgust=6); adjust_labels passes the values through, and the
# `features` argument of map() carries the new column type.
features = german_dataset.features.copy()
features["emotion"] = ClassLabel(
    names=['happy', 'neutral', 'angry', 'sad', 'fearful', 'boredom', 'disgust']
)
german_dataset = german_dataset.map(adjust_labels, batched=True, features=features)

# 80 % train / 20 % held out, stratified on the emotion label ...
german_dataset = german_dataset.train_test_split(
    test_size=0.2, stratify_by_column="emotion", seed=SPLIT_SEED
)
# ... then the held-out part is halved (again stratified) into val and test.
test_data_split = german_dataset["test"].train_test_split(
    test_size=0.5, stratify_by_column="emotion", seed=SPLIT_SEED
)
german_dataset = DatasetDict({
    "train": german_dataset["train"],
    "test": test_data_split["test"],
    "val": test_data_split["train"],
})

Resolving data files:   0%|          | 0/536 [00:00<?, ?it/s]

In [4]:
# Show the resulting splits with their columns and row counts.
german_dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 428
    })
    test: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 54
    })
    val: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 53
    })
})

In [5]:
# Peek at the first training example: audio dict (path, array, sampling_rate)
# plus the integer emotion id.
german_dataset["train"][0]


{'audio': {'path': 'C:\\Users\\wkwon\\Documents\\Neural Networks\\WenYue\\deep-learning-project\\dataset\\emo\\data\\15a07Fb.wav',
  'array': array([ 9.15527344e-05,  1.52587891e-04, -7.62939453e-04, ...,
          4.57763672e-04, -1.46484375e-03, -5.18798828e-04]),
  'sampling_rate': 16000},
 'emotion': 0}

In [6]:
# Second training example, same layout as above.
german_dataset["train"][1]


{'audio': {'path': 'C:\\Users\\wkwon\\Documents\\Neural Networks\\WenYue\\deep-learning-project\\dataset\\emo\\data\\10a02Wa.wav',
  'array': array([-0.00057983, -0.0005188 , -0.00073242, ..., -0.00131226,
         -0.00180054, -0.00143433]),
  'sampling_rate': 16000},
 'emotion': 2}

In [7]:
# Third training example, same layout as above.
german_dataset["train"][2]


{'audio': {'path': 'C:\\Users\\wkwon\\Documents\\Neural Networks\\WenYue\\deep-learning-project\\dataset\\emo\\data\\16b01Wb.wav',
  'array': array([ 2.13623047e-04,  7.32421875e-04,  4.27246094e-04, ...,
         -1.83105469e-04, -9.15527344e-05,  0.00000000e+00]),
  'sampling_rate': 16000},
 'emotion': 2}

In [8]:
from transformers import AutoFeatureExtractor

# Feature extractor belonging to the same checkpoint that is fine-tuned below;
# preprocess_function reads its `sampling_rate` attribute.
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")

In [9]:
def preprocess_function(examples):
    """Run the notebook-level ``feature_extractor`` over a batch of audio clips.

    Args:
        examples: batch mapping whose "audio" entry is a sequence of dicts,
            each holding the waveform under the "array" key.

    Returns:
        Whatever ``feature_extractor`` returns for the collected waveforms.

    Note:
        The extractor's own ``sampling_rate`` is passed along, so the clips
        are assumed to already be at that rate -- TODO confirm it matches the
        rate the audio column is cast to.
    """
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])
    target_rate = feature_extractor.sampling_rate
    return feature_extractor(waveforms, sampling_rate=target_rate)

In [10]:
# Declare the audio column as 16 kHz for every split.
# NOTE(review): 16000 is hard-coded here while preprocess_function passes
# feature_extractor.sampling_rate -- presumably the same value; confirm.
german_dataset = german_dataset.cast_column("audio", Audio(sampling_rate=16000))

In [11]:
# Encode every split with preprocess_function, drop the raw "audio" column,
# and expose the target under the column name "label".
encoded_german_dataset = german_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns="audio",
).rename_column("emotion", "label")

Map:   0%|          | 0/428 [00:00<?, ? examples/s]

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

Map:   0%|          | 0/53 [00:00<?, ? examples/s]

In [12]:
# Check the columns and row counts of the encoded splits.
encoded_german_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 428
    })
    test: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 54
    })
    val: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 53
    })
})

In [13]:

# Emotion name -> class id. The position in the tuple is the id, and the order
# matches the ClassLabel names used when the datasets are loaded.
label = {
    name: idx
    for idx, name in enumerate(
        ("happy", "neutral", "angry", "sad", "fearful", "boredom", "disgust")
    )
}

# Class id -> emotion name: the exact inverse of `label`.
label_id = {idx: name for name, idx in label.items()}


In [14]:
import evaluate

# Accuracy metric object; compute_metrics calls its .compute() method.
accuracy = evaluate.load("accuracy")

In [15]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: object exposing ``predictions`` (per-class scores; the
            arg-max is taken along axis 1, so at least 2-D is assumed) and
            ``label_ids`` (the reference class ids).

    Returns:
        The result of ``accuracy.compute`` for the arg-max predictions
        against the reference ids.
    """
    scores = eval_pred.predictions
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(
        predictions=predicted_ids,
        references=eval_pred.label_ids,
    )

In [16]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# One output class per entry of the `label` mapping. Both name<->id mappings
# are handed to from_pretrained together with the class count.
num_labels = len(label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    num_labels=num_labels,
    id2label=label_id,
    label2id=label,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Fine-tuning configuration. Evaluation and checkpointing both run once per
# epoch, and the best checkpoint (judged by eval_loss) is loaded when training
# finishes.
training_args = TrainingArguments(
    output_dir="german_emotion_model",
    push_to_hub=False,
    # optimisation schedule
    num_train_epochs=20,
    learning_rate=1e-4,
    warmup_ratio=0.1,
    # batching: 4 examples per device, gradients accumulated over 2 steps
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    # evaluation, checkpointing and logging
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_steps=10,
)

# Train on the "train" split and validate on "val"; the "test" split is not
# passed to the Trainer here.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    train_dataset=encoded_german_dataset["train"].with_format("torch"),
    eval_dataset=encoded_german_dataset["val"].with_format("torch"),
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Epoch,Training Loss,Validation Loss,Accuracy
0,1.9376,1.926298,0.245283
2,1.4675,1.290713,0.566038
4,0.8708,0.704629,0.698113
6,0.6482,0.424657,0.886792
8,0.3723,0.158696,0.981132
10,0.0837,0.187771,0.943396
12,0.0359,0.194751,0.943396
14,0.0333,0.006215,1.0
16,0.0166,0.008997,1.0
18,0.1417,0.003631,1.0


TrainOutput(global_step=1060, training_loss=0.5308463309803662, metrics={'train_runtime': 433.3331, 'train_samples_per_second': 19.754, 'train_steps_per_second': 2.446, 'total_flos': 9.818516611860818e+17, 'train_loss': 0.5308463309803662, 'epoch': 19.81})

In [18]:
# Score the model on the German "test" split, which was passed to the Trainer
# neither as train_dataset nor as eval_dataset.
trainer.evaluate(encoded_german_dataset["test"].with_format("torch"))

{'eval_loss': 0.09691314399242401,
 'eval_accuracy': 0.9814814814814815,
 'eval_runtime': 1.3197,
 'eval_samples_per_second': 40.918,
 'eval_steps_per_second': 10.608,
 'epoch': 19.81}

In [19]:
# Checkpoint directory the Trainer recorded as best during training.
best_ckpt_path = trainer.state.best_model_checkpoint
print(best_ckpt_path)

german_emotion_model\checkpoint-1060


### Testing against the RAVD dataset (only samples whose labels are shared with the training data)

In [20]:
# The RAVD files are loaded through the "train" split of the loader but are
# used purely for evaluation, so the result is wrapped under a "test" key.
ravd_dataset = load_dataset(
    "./dataset/emo_model_test_using_ravd",
    data_dir="./",
    split="train",
)

# Same class names, in the same order, as for the German data, so the integer
# ids line up with the ones the model was trained on.
features = ravd_dataset.features.copy()
features["emotion"] = ClassLabel(
    names=["happy", "neutral", "angry", "sad", "fearful", "boredom", "disgust"]
)
ravd_dataset = DatasetDict(
    {"test": ravd_dataset.map(adjust_labels, batched=True, features=features)}
)

Resolving data files:   0%|          | 0/1057 [00:00<?, ?it/s]

In [21]:
from transformers import AutoFeatureExtractor
# Re-load the feature extractor for this section. It is the same checkpoint
# name as the one loaded earlier in the notebook, so this only rebinds the name.
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")

def preprocess_function(examples):
    """Convert a batch of audio clips into feature-extractor outputs.

    This repeats the definition given earlier in the notebook.

    Args:
        examples: batch mapping; ``examples["audio"]`` is a sequence of dicts
            that each carry the waveform under "array".

    Returns:
        The output of the notebook-level ``feature_extractor`` for those
        waveforms, called with the extractor's own ``sampling_rate``.
    """
    rate = feature_extractor.sampling_rate
    signals = [entry["array"] for entry in examples["audio"]]
    return feature_extractor(signals, sampling_rate=rate)
    
# Same pipeline as for the German data: declare the audio column as 16 kHz,
# encode with preprocess_function, drop the raw audio, and rename the target
# column to "label".
ravd_dataset = ravd_dataset.cast_column("audio", Audio(sampling_rate=16000))
encoded_ravd_dataset = ravd_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns="audio",
).rename_column("emotion", "label")

Map:   0%|          | 0/1056 [00:00<?, ? examples/s]

In [22]:
# Cross-corpus check: evaluate the model trained on the German data against
# the RAVD "test" split.
trainer.evaluate(encoded_ravd_dataset["test"].with_format("torch"))

{'eval_loss': 2.9718003273010254,
 'eval_accuracy': 0.5350378787878788,
 'eval_runtime': 15.6417,
 'eval_samples_per_second': 67.512,
 'eval_steps_per_second': 16.878,
 'epoch': 19.81}