In [1]:
%%capture --no-display
! wget https://github.com/karoldvl/ESC-50/archive/master.zip
! unzip -qn master.zip

In [2]:
%%capture --no-display
# Install the notebook's dependencies; pip output is captured rather than shown.
# The three installs run in the order written. librosa, pandas and evaluate are
# left unpinned, while pyarrow, datasets, transformers and accelerate are pinned
# to exact versions.
%pip install librosa pandas evaluate
%pip install pyarrow==12.0.1 datasets==2.17.0
%pip install transformers==4.35.2 accelerate==0.27.0

In [3]:
import pandas as pd

# Read the ESC-50 metadata table, then split it by fold:
# rows with fold < 5 are used for training, rows with fold == 5 for validation.
df = pd.read_csv("ESC-50-master/meta/esc50.csv")
df_train = df.loc[df["fold"] < 5]
df_dev = df.loc[df["fold"] == 5]

In [4]:
from datasets import Dataset, DatasetDict

# Wrap each pandas split in a Hugging Face Dataset, keyed (and tagged) by split name.
split_frames = {"train": df_train, "validation": df_dev}
ds = DatasetDict(
    {
        split_name: Dataset.from_pandas(frame, split=split_name)
        for split_name, frame in split_frames.items()
    }
)

In [5]:
# Checkpoint identifier shared by the feature extractor, the model, and the
# training output directory name further down.
model_name = "mispeech/ced-base"

In [6]:
from transformers import AutoFeatureExtractor

# Load the feature extractor that belongs to `model_name`; it is used both to
# preprocess the audio and (via its `sampling_rate`) to size the inputs.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/290 [00:00<?, ?B/s]

In [7]:
import librosa

def preprocess_function(examples):
    """Load one ESC-50 clip and convert it into model inputs plus its label.

    Written for non-batched ``Dataset.map``: ``examples`` is a single row whose
    ``"filename"`` is a file name inside the dataset's audio folder and whose
    ``"target"`` is the class label.

    The clip is decoded directly at ``feature_extractor.sampling_rate`` so the
    rate of the waveform always matches the rate declared to the extractor,
    and the audio folder is addressed relative to the working directory, the
    same way the archive is unpacked and the metadata CSV is read.

    Args:
        examples: Mapping with at least the keys ``"filename"`` and ``"target"``.

    Returns:
        dict: The extractor's outputs, with the leading axis of
        ``"input_values"`` squeezed away, plus ``"labels"`` set to
        ``examples["target"]``.
    """
    import os

    max_duration = 1.0  # seconds; used to derive the max_length passed to the extractor
    audio_root = os.path.join("ESC-50-master", "audio")
    target_sr = feature_extractor.sampling_rate

    # librosa resamples to `sr` while loading, so no separate resample step is needed.
    audio, _ = librosa.load(os.path.join(audio_root, examples["filename"]), sr=target_sr)

    inputs = feature_extractor(
        audio,
        sampling_rate=target_sr,
        max_length=int(target_sr * max_duration),
        truncation=True,
    )
    # The extractor output carries a leading axis of size one for a single clip
    # (presumably the batch axis - confirm); drop it so each row stores one example.
    inputs["input_values"] = inputs["input_values"].squeeze(0)
    return {**inputs, "labels": examples["target"]}

In [8]:
# Columns inherited from the metadata table (plus the pandas index column) that
# are dropped once each row has been converted into model inputs.
columns_to_drop = [
    "filename",
    "fold",
    "target",
    "category",
    "esc10",
    "src_file",
    "take",
    "__index_level_0__",
]

# Apply preprocess_function to every split, one row at a time.
encoded_dataset = ds.map(
    function=preprocess_function,
    batched=False,
    with_indices=False,
    with_rank=False,
    remove_columns=columns_to_drop,
)

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [9]:
from transformers import AutoModelForAudioClassification, set_seed

# Seed the RNGs before the model is built. The output layer is re-created with
# a new shape (the checkpoint's head does not match `outputdim`), so it starts
# from random weights; without a seed those weights, and therefore the training
# curve, differ on every run. 42 matches the default seed of TrainingArguments.
set_seed(42)

outputdim = 50  # size of the replacement output layer (one unit per target class)
model = AutoModelForAudioClassification.from_pretrained(
    model_name,
    outputdim=outputdim,
    ignore_mismatched_sizes=True,  # allow the checkpoint head to be replaced
)

# NOTE(review): freeze_encoder() and config.loss are provided by the checkpoint's
# own model class; presumably this trains only the output layer with a
# cross-entropy objective - confirm against the model's documentation.
model.freeze_encoder()
model.config.loss = "CrossEntropyLoss"

config.json:   0%|          | 0.00/24.9k [00:00<?, ?B/s]

class_labels_indices.csv:   0%|          | 0.00/14.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/343M [00:00<?, ?B/s]

Some weights of CedForAudioClassification were not initialized from the model checkpoint at mispeech/ced-base and are newly initialized because the shapes did not match:
- outputlayer.1.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([50]) in the model instantiated
- outputlayer.1.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([50, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import TrainingArguments, Trainer

In [11]:
# Training configuration: evaluate and checkpoint once per epoch, then restore
# the checkpoint with the best validation accuracy when training finishes.
batch_size = 128
args = TrainingArguments(
    output_dir=f"{model_name}-linearhead-esc50",
    # optimisation
    num_train_epochs=5,
    learning_rate=3e-3,
    warmup_ratio=0,
    gradient_accumulation_steps=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluation / checkpointing
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    # runtime
    logging_steps=10,
    dataloader_num_workers=2,  # Google Colab recommends two data-loader workers
    push_to_hub=False,
)

In [12]:
import numpy as np
import evaluate

# Accuracy metric object; compute_metrics below calls its .compute() method.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Object exposing ``predictions`` and ``label_ids``.
            ``predictions`` may be either the logits array itself or a
            tuple/list whose first element is the logits array; both layouts
            are accepted so the function keeps working if the model's output
            structure changes.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class ids versus
        ``eval_pred.label_ids``.
    """
    predictions = eval_pred.predictions
    # Indexing a bare (num_examples, num_classes) array with [0] would silently
    # select the first example instead of the logits, so only unwrap containers.
    logits = predictions[0] if isinstance(predictions, (tuple, list)) else predictions
    predicted_ids = np.argmax(logits, axis=1)
    return metric.compute(predictions=predicted_ids, references=eval_pred.label_ids)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
# Bundle the model, its training configuration, both encoded splits and the
# accuracy callback into a Trainer. The feature extractor is passed through the
# `tokenizer` argument.
trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
    eval_dataset=encoded_dataset["validation"],
    train_dataset=encoded_dataset["train"],
)

In [14]:
# Run training; the result is left as the cell's last expression so the
# notebook displays it.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy
1,3.731,3.493418,0.9425
2,3.4448,3.289105,0.9525
3,3.2837,3.189903,0.9575
4,3.1331,3.153491,0.9575
5,3.1127,3.141858,0.96


TrainOutput(global_step=65, training_loss=3.2985691950871394, metrics={'train_runtime': 207.0171, 'train_samples_per_second': 38.644, 'train_steps_per_second': 0.314, 'total_flos': 1.31352016730112e+17, 'train_loss': 3.2985691950871394, 'epoch': 5.0})