In [1]:
!pip install audiomentations

Collecting audiomentations
  Downloading audiomentations-0.37.0-py3-none-any.whl.metadata (11 kB)
Collecting numpy-minmax<1,>=0.3.0 (from audiomentations)
  Downloading numpy_minmax-0.3.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Collecting numpy-rms<1,>=0.4.2 (from audiomentations)
  Downloading numpy_rms-0.4.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Collecting scipy<1.13,>=1.4 (from audiomentations)
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Downloading audiomentations-0.37.0-py3-none-any.whl (80 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.5/80.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy_minmax-0.

In [2]:
from datasets import Dataset, Audio, ClassLabel, Features
import os

# Label vocabulary: index 0 -> 'Bonafide', index 1 -> 'Spoof'.
class_names = ['Bonafide', 'Spoof']
class_labels = ClassLabel(names=class_names)

# Column schema: an audio column plus a class-label column.
# Note: `class_names` and `features` are assigned again in the data-loading cell below.
features = Features(dict(audio=Audio(), labels=class_labels))

In [None]:
from datasets import Dataset, Features, Audio, ClassLabel

def parse_protocol(protocol_path, base_dir):
    """Read a protocol file into parallel lists of audio paths and integer labels.

    Every non-blank line must contain exactly five whitespace-separated fields.
    The second field is the utterance id and the fifth is the key; the other
    fields are ignored. Blank (or whitespace-only) lines are skipped.

    Args:
        protocol_path: Path of the protocol text file.
        base_dir: Directory that contains the ``flac`` sub-folder.

    Returns:
        A ``(file_paths, labels)`` tuple. ``file_paths[i]`` is
        ``"{base_dir}/flac/{utt_id}.flac"``; ``labels[i]`` is 0 when the key is
        ``"bonafide"`` and 1 for any other key.

    Raises:
        ValueError: If a non-blank line does not have exactly five fields.
    """
    file_paths = []
    labels = []
    with open(protocol_path, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Tolerate empty / trailing blank lines instead of failing on unpacking.
                continue
            if len(fields) != 5:
                raise ValueError(
                    f"{protocol_path}:{line_no}: expected 5 fields, got {len(fields)}: {line.rstrip()!r}"
                )
            _, utt_id, _, _, label = fields
            file_paths.append(f"{base_dir}/flac/{utt_id}.flac")
            labels.append(0 if label == "bonafide" else 1)
    return file_paths, labels

# Protocol files: one text file per split listing utterance ids and their keys.
train_protocol = "/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt"
dev_protocol = "/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.dev.trl.txt"
eval_protocol = "/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.eval.trl.txt"

# Audio roots: each one holds a `flac/` folder with the utterances of that split.
train_dir = "/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_train"
dev_dir = "/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_dev"
eval_dir = "/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_eval"

# Parse the three protocols in train -> dev -> eval order.
(
    (train_audio_files, train_labels),
    (dev_audio_files, dev_labels),
    (eval_audio_files, eval_labels),
) = (
    parse_protocol(protocol, audio_dir)
    for protocol, audio_dir in (
        (train_protocol, train_dir),
        (dev_protocol, dev_dir),
        (eval_protocol, eval_dir),
    )
)

# Schema shared by all splits: audio decoded at 16 kHz plus a two-class label.
class_names = ['Bonafide', 'Spoof']
features = Features(dict(
    audio=Audio(sampling_rate=16000),
    labels=ClassLabel(names=class_names),
))

# Build one Dataset per split from the (paths, labels) pairs.
train_dataset, dev_dataset, eval_dataset = (
    Dataset.from_dict({'audio': files, 'labels': targets}, features=features)
    for files, targets in (
        (train_audio_files, train_labels),
        (dev_audio_files, dev_labels),
        (eval_audio_files, eval_labels),
    )
)

In [4]:
from datasets import concatenate_datasets

# Cast the audio column of every split to 16 kHz.
train_dataset, dev_dataset, eval_dataset = (
    split.cast_column('audio', Audio(sampling_rate=16000))
    for split in (train_dataset, dev_dataset, eval_dataset)
)

# Pool all three splits into a single dataset.
# NOTE: `train_dataset` is overwritten with the pooled data, so re-running this
# cell without re-running the loading cell would append dev/eval a second time.
train_dataset = concatenate_datasets([train_dataset, dev_dataset, eval_dataset])

# Number of target classes (Bonafide / Spoof).
num_labels = 2

In [5]:
# Hold out 10% of the pooled examples; the fixed seed keeps the split reproducible.
train_val_split = train_dataset.train_test_split(
    test_size=0.1,
    seed=52,
)

In [6]:
from datasets import DatasetDict

# Re-key the split so the held-out part is exposed as 'val' instead of 'test'.
dataset = DatasetDict({
    new_key: train_val_split[old_key]
    for new_key, old_key in (('train', 'train'), ('val', 'test'))
})

In [7]:
# Sanity check: show the first training example (audio path, decoded array, sampling rate, label).
print(dataset['train'][0])

{'audio': {'path': '/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_eval/flac/LA_E_8880294.flac', 'array': array([-2.44140625e-04, -3.05175781e-05, -9.15527344e-05, ...,
       -4.21142578e-03, -3.93676758e-03, -4.24194336e-03]), 'sampling_rate': 16000}, 'labels': 1}


In [8]:
from transformers import ASTFeatureExtractor

# Checkpoint id used for the feature extractor here and for the model weights later on.
pretrained_model = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = ASTFeatureExtractor.from_pretrained(pretrained_model)

# First input name reported by the extractor; used as the batch key in the preprocessing functions.
model_input_name = feature_extractor.model_input_names[0]  # key -> 'input_values'
# Sampling rate the extractor is configured with; passed to both augmentation and feature extraction.
SAMPLING_RATE = feature_extractor.sampling_rate

preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [9]:
from audiomentations import Compose, AddGaussianSNR, GainTransition, Gain, ClippingDistortion, TimeStretch, PitchShift

# Waveform-level augmentation pipeline used for the training split.
# Only the two gain transforms are active; the noise and clipping transforms are commented out.
audio_augmentations = Compose([
    #AddGaussianSNR(min_snr_db=10, max_snr_db=20, p=0.5),
    Gain(min_gain_db=-6, max_gain_db=6, p=0.2),
    GainTransition(min_gain_db=-6, max_gain_db=6, min_duration=0.01, max_duration=0.3, duration_unit="fraction", p=0.2),
    #ClippingDistortion(min_percentile_threshold=0, max_percentile_threshold=30, p=0.1),
], p=0.8, shuffle=True)

In [10]:
def _waveforms_to_batch(wavs, labels):
    """Run the feature extractor on `wavs` and pair the result with `labels`.

    Shared by both preprocessing transforms so the extraction call and the
    output layout are defined in exactly one place.

    Returns:
        dict with the extractor output under `model_input_name` and the labels
        (as a list) under "labels".
    """
    # inputs are spectrograms as torch.tensors now
    inputs = feature_extractor(wavs, sampling_rate=SAMPLING_RATE, return_tensors="pt")
    return {model_input_name: inputs.get(model_input_name), "labels": list(labels)}


def preprocess_audio(batch):
    """Batch transform without augmentation.

    Args:
        batch: mapping with an "input_values" column of decoded audio dicts
            (each providing an "array") and a "labels" column.

    Returns:
        dict holding the extracted model inputs and the labels.
    """
    wavs = [audio["array"] for audio in batch["input_values"]]
    return _waveforms_to_batch(wavs, batch["labels"])


def preprocess_audio_with_transforms(batch):
    """Batch transform that augments every waveform before feature extraction.

    Args:
        batch: mapping with an "input_values" column of decoded audio dicts
            (each providing an "array") and a "labels" column.

    Returns:
        dict holding the extracted model inputs and the labels.
    """
    # we apply augmentations on each waveform
    wavs = [audio_augmentations(audio["array"], sample_rate=SAMPLING_RATE) for audio in batch["input_values"]]
    return _waveforms_to_batch(wavs, batch["labels"])

In [11]:
# The preprocessing functions read waveforms from the "input_values" column, so rename "audio" to match.
dataset = dataset.rename_column("audio", "input_values")  # rename audio column
# Default on-the-fly transform for every split; the next cell assigns per-split transforms on top of this.
dataset.set_transform(preprocess_audio, output_all_columns=False)

In [12]:
# Training split: waveforms are augmented before feature extraction.
dataset["train"].set_transform(preprocess_audio_with_transforms, output_all_columns=False)
# Validation split: feature extraction only, no augmentation.
dataset["val"].set_transform(preprocess_audio, output_all_columns=False)

In [13]:
from transformers import ASTConfig, ASTForAudioClassification

# Load configuration from the pretrained model
config = ASTConfig.from_pretrained(pretrained_model)

# Update configuration with the number of labels in our dataset
config.num_labels = num_labels
# Label <-> id mappings stored on the config (id2label is the inverse of label2id).
config.label2id = {'Bonafide': 0, 'Spoof': 1}
config.id2label = {v: k for k, v in config.label2id.items()}

# Initialize the model with the updated configuration.
# ignore_mismatched_sizes=True lets the 2-class classifier head load despite the
# checkpoint's 527-class head (see the shape-mismatch message in the cell output).
model = ASTForAudioClassification.from_pretrained(pretrained_model, config=config, ignore_mismatched_sizes=True)
# NOTE(review): confirm that init_weights() on the installed transformers version
# only touches the newly created head and does not reset the loaded checkpoint weights.
model.init_weights()

config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

# W&B credentials are deliberately NOT assigned in the notebook: a hard-coded
# (even blanked-out) WANDB_API_KEY overwrites a key injected through the
# environment and risks leaking a secret when the notebook is shared.
# Provide the key through the environment or an existing `wandb login`.
os.environ["WANDB_PROJECT"] = "Fine-tuning AST AIRI 228"
os.environ["WANDB_NOTES"] = "Fine-tuning ASTSPOOF AIRI 228"
os.environ["WANDB_NAME"] = "ft-astspoof-airi-228"

# Configure training run with TrainingArguments class
training_args = TrainingArguments(
    output_dir="./runs/ast_classifier",
    logging_dir="./logs/ast_classifier",
    report_to="wandb",
    learning_rate=5e-5,  # Learning rate
    push_to_hub=False,
    num_train_epochs=2,  # Number of epochs
    per_device_train_batch_size=16,  # Batch size per device
    # Evaluate once per epoch. `eval_steps` is intentionally not passed: it is
    # only consulted when the evaluation strategy is "steps".
    eval_strategy="epoch",
    save_strategy="steps",
    save_steps=500,  # Checkpoint every 500 optimizer steps
    metric_for_best_model="accuracy",
    logging_strategy="steps",
    logging_steps=1,  # Log the training loss at every step
)

In [15]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [16]:
import evaluate
import numpy as np

# Load the four metric objects in one pass (same load order as the assignment order).
accuracy, recall, precision, f1 = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "recall", "precision", "f1")
)

# Averaging mode handed to precision/recall/F1: macro for more than two classes, binary otherwise.
if config.num_labels > 2:
    AVERAGE = "macro"
else:
    AVERAGE = "binary"

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 from an evaluation prediction.

    Args:
        eval_pred: object exposing `predictions` (logits) and `label_ids`.

    Returns:
        dict merging the outputs of the accuracy, precision, recall and f1
        metrics, in that order.
    """
    # Predicted class = index of the largest logit along axis 1.
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    references = eval_pred.label_ids

    results = accuracy.compute(predictions=predicted_ids, references=references)
    for averaged_metric in (precision, recall, f1):
        results.update(
            averaged_metric.compute(predictions=predicted_ids, references=references, average=AVERAGE)
        )
    return results

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [17]:
from transformers import Trainer, TrainerCallback
import time

class TimeLimitCallback(TrainerCallback):
    """Trainer callback that stops training once a wall-clock budget is exhausted.

    The clock starts in `on_train_begin`; after every optimizer step the elapsed
    time is compared with the budget, and when it is exceeded the Trainer is
    asked to save a checkpoint and stop.

    Args:
        max_time_in_seconds: Time budget in seconds.
    """

    def __init__(self, max_time_in_seconds):
        self.max_time_in_seconds = max_time_in_seconds
        self.start_time = None

    def on_train_begin(self, args, state, control, **kwargs):
        self.start_time = time.time()  # Start the timer when training begins

    def on_step_end(self, args, state, control, **kwargs):
        if self.start_time is None:
            # Timer was never started (on_train_begin not called); nothing to measure.
            return
        elapsed_time = time.time() - self.start_time
        if elapsed_time > self.max_time_in_seconds:
            print(f"Stopping training after {self.max_time_in_seconds / 3600} hours.")
            # `should_training_stop` is the flag the Trainer loop checks; the
            # previously used `should_early_stop` is not a TrainerControl field,
            # so setting it never interrupted training.
            control.should_training_stop = True
            control.should_save = True  # Save a checkpoint before stopping

In [18]:
# Assemble the Trainer: augmented train split, un-augmented validation split,
# the metric function defined above, and a wall-clock limit callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["val"],
    compute_metrics=compute_metrics,  # Use the metrics function from above
    callbacks=[TimeLimitCallback(max_time_in_seconds=11*3600)]  # 11-hour wall-clock budget
)

In [19]:
# Launch fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mlightsource-[0m ([33mlightsource-unk[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241126_194017-8vxlq0ic[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m./runs/ast_classifier[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/lightsource-unk/Fine-tuning%20AST%20AIRI%20228[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/lightsource-unk/Fine-tuning%20AST%20AIRI%20228/runs/8vxlq0ic[0m
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0,0.002137,0.999588,0.999726,0.999817,0.999772
2,0.0,0.000811,0.999671,0.999726,0.999909,0.999817


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(devic

TrainOutput(global_step=6834, training_loss=0.011461016461029724, metrics={'train_runtime': 36516.0024, 'train_samples_per_second': 5.987, 'train_steps_per_second': 0.187, 'total_flos': 1.481922672034893e+19, 'train_loss': 0.011461016461029724, 'epoch': 2.0})