# Training the Model
It's time to train our model!

In [2]:
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import librosa.display
import numpy as np
import IPython
import os
!pip install datasets
from datasets import load_dataset, Audio, DatasetDict, ClassLabel, concatenate_datasets
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
from transformers import TrainingArguments, Trainer
from datasets import load_from_disk






## Dataset Loading and combining

Load the dataset

In [3]:
dataset = load_from_disk("output/dataset")

Have a look 

In [4]:
dataset

Dataset({
    features: ['audio', 'condition'],
    num_rows: 869
})

Let's print the first element.

In [5]:
dataset[0]

{'audio': './output/audio/SIDEA/504.mp3', 'condition': 'VG+'}

Loaded! Now is time to split into training and testing. Frist we remove the elements without condition and encode the labels.

In [6]:
dataset = dataset.filter(lambda example: example["condition"] is not None)

In [8]:
labels = set(dataset["condition"])

print(labels)

label2id, id2label = dict(), dict()

for i, label in enumerate(labels):
    label2id[label] = i
    id2label[i] = label

{'VG', 'VG+', 'G', 'G+', 'F', 'NM'}


In [9]:
label2id,id2label

({'VG': 0, 'VG+': 1, 'G': 2, 'G+': 3, 'F': 4, 'NM': 5},
 {0: 'VG', 1: 'VG+', 2: 'G', 3: 'G+', 4: 'F', 5: 'NM'})

In [None]:
dataset = dataset.map(lambda x: {"condition":label2id[x["condition"]]})

Let's have a look to the first element with the encoded labels.

In [12]:
dataset[0]

{'audio': './output/audio/SIDEA/504.mp3', 'condition': 1}

Now we can load our model and our feature extractor.

In [None]:
from transformers import AutoFeatureExtractor
model_id = "facebook/wav2vec2-base"
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=True
)
sampling_rate = feature_extractor.sampling_rate
sampling_rate

Let's resample

In [14]:
dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

Let's shuffle and split

In [15]:
dataset = dataset.shuffle(seed=95)

dataset_split = dataset.train_test_split(test_size=0.20)

ds = DatasetDict({
    'train' : dataset_split['train'],
    'eval' : dataset_split['test']
})

Let's see how it looks

In [16]:
import random
for _ in range(5):
    rand_idx = random.randint(0, len(ds["train"])-1)
    example = ds["train"][rand_idx]
    audio = example["audio"]

    print(f'Label: {id2label[example["condition"]]}')
    print(f'Shape: {audio["array"].shape}, sampling rate: {audio["sampling_rate"]}')
    print()


Label: G
Shape: (331396,), sampling rate: 16000

Label: G+
Shape: (3039771,), sampling rate: 16000

Label: VG+
Shape: (1371650,), sampling rate: 16000

Label: G+
Shape: (1061060,), sampling rate: 16000

Label: VG
Shape: (646816,), sampling rate: 16000



In [17]:
sample = ds["train"][0]["audio"]

print(f"Mean: {np.mean(sample['array']):.3}, Variance: {np.var(sample['array']):.3}")

Mean: -0.000345, Variance: 0.0265


In [18]:
inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])

print(f"inputs keys: {list(inputs.keys())}")

print(
    f"Mean: {np.mean(inputs['input_values']):.3}, Variance: {np.var(inputs['input_values']):.3}"
)

inputs keys: ['input_values', 'attention_mask']
Mean: -1.5e-08, Variance: 1.0


Let's truncate the clips to 10 secs and preprocess the dataset

In [19]:
max_duration = 10.0
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
        return_attention_mask=True,
    )
    return inputs

In [None]:
encoded_audio = ds.map(preprocess_function, remove_columns="audio", batched=True)

Rename for the model

In [None]:
encoded_audio = encoded_audio.rename_column("condition", "label")

## Finetuning the model

In [None]:
from transformers import AutoModelForAudioClassification

num_labels = len(id2label)

model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
!pip install accelerate>=0.21.0
!pip install transformers[torch]

In [None]:
from transformers import TrainingArguments

model_name = model_id.split("/")[-1]+ "-vinyl_condition"
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10

training_args = TrainingArguments(
    model_name,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    push_to_hub=True,
)

In [None]:
!pip install evaluate

In [None]:
import evaluate
import numpy as np

metric = evaluate.load("accuracy")


def compute_metrics(p):
    '''
    This function calculates & returns the following metrics:
    - accuracy
    - f1 score
    - recall
    - precision
    '''
    import evaluate

    accuracy_metric = evaluate.load("accuracy")
    accuracy = accuracy_metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)['accuracy']

    ### ------------------- F1 scores -------------------

    f1_score_metric = evaluate.load("f1")
    f1_score = f1_score_metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids, average = "macro")["f1"]

    ### ------------------- recall -------------------

    recall_metric = evaluate.load("recall")
    recall = recall_metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids, average = "macro")["recall"]

    ### ------------------- precision -------------------

    precision_metric = evaluate.load("precision")
    precision = precision_metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids, average = "macro")["precision"]

    return {"accuracy" : accuracy,
            "F1" : f1_score,
            "Recall" : recall,
            "Precision" : precision,
            }


All prepared. Let's train the model! :)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = encoded_audio["train"],
    eval_dataset = encoded_audio["eval"],
    tokenizer = feature_extractor,
    compute_metrics = compute_metrics,
)

trainer.train()