In [47]:
from datasets import load_dataset
from transformers import ASTFeatureExtractor
import torch
from torch.utils.data import DataLoader
import torch.profiler
import numpy as np
from datasets import Audio


In [48]:
import platform
print(f"Python version: {platform.python_version()}")
print(f"System: {platform.system()} {platform.release()}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA device count: {torch.cuda.device_count()}")

Python version: 3.12.7
System: Windows 11
PyTorch version: 2.7.1+cu118
CUDA available: True
CUDA version: 11.8
GPU: NVIDIA GeForce RTX 4050 Laptop GPU
CUDA device count: 1


In [49]:
dataset = load_dataset("audiofolder", data_dir="./for-2seconds")

pretrained_model = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = ASTFeatureExtractor.from_pretrained(pretrained_model, num_mel_bins=64, max_length=507)


model_input_name = feature_extractor.model_input_names[0]
SAMPLING_RATE = feature_extractor.sampling_rate
num_labels = len(np.unique(dataset["train"]["label"]))


Resolving data files:   0%|          | 0/13956 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/2826 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1088 [00:00<?, ?it/s]

In [50]:
print('dataset:', dataset)
print('dataset train features:', dataset['train'].features)
print('dataset test features:', dataset['test'].features)
print('dataset validation features:', dataset['validation'].features)
print('model_input_name:', model_input_name)
print('SAMPLING_RATE:', SAMPLING_RATE)
print('dataset["train"][0]:', dataset['train'][0])
print('num_labels:', num_labels)
print('dataset columns:', dataset['train'].column_names)


dataset: DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 13956
    })
    validation: Dataset({
        features: ['audio', 'label'],
        num_rows: 2826
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 1088
    })
})
dataset train features: {'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None), 'label': ClassLabel(names=['fake', 'real'], id=None)}
dataset test features: {'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None), 'label': ClassLabel(names=['fake', 'real'], id=None)}
dataset validation features: {'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None), 'label': ClassLabel(names=['fake', 'real'], id=None)}
model_input_name: input_values
SAMPLING_RATE: 16000
dataset["train"][0]: {'audio': {'path': 'D:\\Documents\\Github\\thesis-testing\\for-2seconds\\training\\fake\\file10005.mp3.wav_16k.wav_norm.wav_mono.wav_silence.wav_2sec.wav', 'array': array([ 0.105529

In [51]:
# calculate values for normalization
feature_extractor.do_normalize = False

# Initialize running statistics
n = 0
mean = 0.0
M2 = 0.0  # For running variance calculation

def preprocess_audio(batch):
    wavs = [audio["array"] for audio in batch["input_values"]]
    inputs = feature_extractor(wavs, sampling_rate=SAMPLING_RATE, return_tensors="pt", return_attention_mask=True, max_length=507)
    return {
        model_input_name: inputs.get(model_input_name),
        "labels": torch.tensor(batch["label"])
    }

dataset = dataset.rename_column("audio", "input_values")
dataset["train"].set_transform(preprocess_audio, output_all_columns=False)

# Process in batches to save memory
for batch in dataset["train"]:
    
    audio_input = batch[model_input_name]
    batch_size = audio_input.shape[0]

    # Calculate batch statistics
    batch_mean = torch.mean(audio_input)
    batch_variance = torch.var(audio_input, unbiased=False)  # Use N instead of N-1 for population variance
    
    # Update running statistics using Welford's online algorithm
    delta = batch_mean - mean
    mean += delta * batch_size / (n + batch_size)
    M2 += batch_variance * batch_size + delta ** 2 * n * batch_size / (n + batch_size)
    n += batch_size

# Calculate final statistics
feature_extractor.mean = mean.item()
feature_extractor.std = torch.sqrt(M2 / n).item()  # Population standard deviation
feature_extractor.do_normalize = True

print('mean: ', feature_extractor.mean)
print('std: ', feature_extractor.std)

mean:  -1.6385200023651123
std:  3.336308717727661


In [52]:
from audiomentations import Compose, AddGaussianSNR, GainTransition, Gain, ClippingDistortion, TimeStretch, PitchShift

audio_augmentations = Compose([
    AddGaussianSNR(min_snr_db=10, max_snr_db=20),
    Gain(min_gain_db=-6, max_gain_db=6),
    GainTransition(min_gain_db=-6, max_gain_db=6, min_duration=0.01, max_duration=0.3, duration_unit="fraction"),
    ClippingDistortion(min_percentile_threshold=0, max_percentile_threshold=30, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.2),
    PitchShift(min_semitones=-4, max_semitones=4),
], p=0.8, shuffle=True)

In [53]:
def preprocess_audio_with_transforms(batch):
    # we apply augmentations on each waveform
    wavs = [audio_augmentations(audio["array"], sample_rate=SAMPLING_RATE) for audio in batch["input_values"]]
    inputs = feature_extractor(wavs, sampling_rate=SAMPLING_RATE, return_tensors="pt", return_attention_mask=True, max_length=507)
    return {
        model_input_name: inputs.get(model_input_name),
        "labels": torch.tensor(batch["label"])
    }

print(dataset)
# Cast the audio column to the appropriate feature type and rename it
dataset = dataset.cast_column("input_values", Audio(sampling_rate=feature_extractor.sampling_rate))



DatasetDict({
    train: Dataset({
        features: ['input_values', 'label'],
        num_rows: 13956
    })
    validation: Dataset({
        features: ['input_values', 'label'],
        num_rows: 2826
    })
    test: Dataset({
        features: ['input_values', 'label'],
        num_rows: 1088
    })
})


In [54]:
# with augmentations on the training set
dataset["train"].set_transform(preprocess_audio_with_transforms, output_all_columns=False)
# w/o augmentations on the test set
dataset["test"].set_transform(preprocess_audio, output_all_columns=False)
dataset["validation"].set_transform(preprocess_audio, output_all_columns=False)


In [55]:
temp_ds = dataset['train'].with_format(None)  # This removes any applied transforms
temp_ds_test = dataset['test'].with_format(None)  # This removes any applied transforms

print(temp_ds[0])
unique_labels = sorted(set(temp_ds["label"] + temp_ds_test["label"]))
print(unique_labels)

{'input_values': {'path': 'D:\\Documents\\Github\\thesis-testing\\for-2seconds\\training\\fake\\file10005.mp3.wav_16k.wav_norm.wav_mono.wav_silence.wav_2sec.wav', 'array': array([ 0.10552979,  0.11013794,  0.00952148, ..., -0.20837402,
       -0.25552368, -0.24255371]), 'sampling_rate': 16000}, 'label': 0}
[0, 1]


In [56]:
from transformers import ASTConfig, ASTForAudioClassification

# Load configuration from the pretrained model
config = ASTConfig.from_pretrained(pretrained_model)

# Update configuration with the number of labels in our dataset
config.num_mel_bins = 64  # Make sure this matches your feature extractor
config.max_length = 507   # Or whatever your sequence length is
config.num_labels = num_labels
config.label2id = {"fake": 0, "real": 1}
config.id2label = {0: "fake", 1: "real"}

# Initialize the model with the updated configuration
model = ASTForAudioClassification.from_pretrained(
    pretrained_model,
    config=config,
    ignore_mismatched_sizes=True
)
model.init_weights()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- audio_spectrogram_transformer.embeddings.position_embeddings: found shape torch.Size([1, 1214, 768]) in the checkpoint and torch.Size([1, 252, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
from transformers import TrainingArguments

# Configure training run with TrainingArguments class
training_args = TrainingArguments(
    output_dir="./runs/ast_classifier",
    logging_dir="./logs/ast_classifier",
    report_to="tensorboard",
    learning_rate=5e-5,
    num_train_epochs=10,
    per_device_train_batch_size=8,  # Start with 8, can increase if memory allows
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    dataloader_num_workers=0,  # Changed from 4 to 0 for Windows
    dataloader_pin_memory=False,  # Disable pin_memory on Windows
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=20,
    remove_unused_columns=False,
    fp16=True,
    fp16_full_eval=True,
    no_cuda=not torch.cuda.is_available(),
)

In [58]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")
recall = evaluate.load("recall")
precision = evaluate.load("precision")
f1 = evaluate.load("f1")

AVERAGE = "macro" if config.num_labels > 2 else "binary"

def compute_metrics(eval_pred):
    logits = eval_pred.predictions
    predictions = np.argmax(logits, axis=1)
    metrics = accuracy.compute(predictions=predictions, references=eval_pred.label_ids)
    metrics.update(precision.compute(predictions=predictions, references=eval_pred.label_ids, average=AVERAGE))
    metrics.update(recall.compute(predictions=predictions, references=eval_pred.label_ids, average=AVERAGE))
    metrics.update(f1.compute(predictions=predictions, references=eval_pred.label_ids, average=AVERAGE))
    return metrics

In [59]:
from transformers import Trainer, DataCollatorWithPadding
from tqdm.auto import tqdm

# Initialize the data collator
data_collator = DataCollatorWithPadding(
    tokenizer=feature_extractor,  # Your feature extractor acts as the tokenizer
    padding=True,
    max_length=config.max_length,
    return_tensors="pt"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
    data_collator=data_collator,
)

  trainer = Trainer(


In [60]:
# Add this before training to verify shapes
sample = next(iter(dataset["train"]))
print(f"Sample input shape: {sample['input_values'].shape}")

# Forward pass to check for errors
with torch.no_grad():
    output = model(sample["input_values"].unsqueeze(0))
print("Forward pass successful!")

Sample input shape: torch.Size([507, 64])
Forward pass successful!


In [61]:
print(f"Model config: {model.config}")
print(f"Sample input shape: {sample['input_values'].shape}")
print(f"Model's expected input shape: {model.config.max_length}")
print(f"Feature extractor config: {feature_extractor}")

Model config: ASTConfig {
  "architectures": [
    "ASTForAudioClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "frequency_stride": 10,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "fake",
    "1": "real"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "fake": 0,
    "real": 1
  },
  "layer_norm_eps": 1e-12,
  "max_length": 507,
  "model_type": "audio-spectrogram-transformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_mel_bins": 64,
  "patch_size": 16,
  "qkv_bias": true,
  "time_stride": 10,
  "torch_dtype": "float32",
  "transformers_version": "4.53.1"
}

Sample input shape: torch.Size([507, 64])
Model's expected input shape: 507
Feature extractor config: ASTFeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
  "max_length": 507,
  "mean": -1.6385200023651123,
  "num_mel_bins": 64,
  "padding_side":

In [62]:
# Check GPU availability
print(f"Using device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
if torch.cuda.is_available():
     print(f"GPU Memory Total: {torch.cuda.get_device_properties(0).total_memory/1e9:.2f} GB")
     print(f"GPU Memory Allocated: {torch.cuda.memory_allocated(0)/1e9:.2f} GB")
     print(f"GPU Memory Cached: {torch.cuda.memory_reserved(0)/1e9:.2f} GB")


Using device: NVIDIA GeForce RTX 4050 Laptop GPU
GPU Memory Total: 6.44 GB
GPU Memory Allocated: 0.36 GB
GPU Memory Cached: 2.53 GB


In [63]:
trainer.train(resume_from_checkpoint=False) #Set to False when no checkpoint is available.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2053,0.570544,0.794118,0.973373,0.604779,0.746032
2,0.17,0.407251,0.876838,0.91,0.836397,0.871648
3,0.0471,0.491739,0.842831,0.763047,0.994485,0.863528
4,0.0641,0.299739,0.911765,0.875839,0.959559,0.915789
5,0.0978,0.961041,0.766544,0.683081,0.994485,0.80988
6,0.0843,0.292511,0.921875,0.987261,0.854779,0.916256
7,0.0656,0.262866,0.938419,0.909091,0.974265,0.94055
8,0.0215,0.794465,0.849265,0.769886,0.996324,0.86859
9,0.0548,0.495819,0.895221,0.830769,0.992647,0.904523
10,0.0011,0.308466,0.933824,0.889439,0.990809,0.937391




TrainOutput(global_step=8730, training_loss=0.07627669029612039, metrics={'train_runtime': 2932.4359, 'train_samples_per_second': 47.592, 'train_steps_per_second': 2.977, 'total_flos': 2.32177401831638e+18, 'train_loss': 0.07627669029612039, 'epoch': 10.0})

In [64]:
if torch.cuda.is_available():
     print(f"GPU Memory Total: {torch.cuda.get_device_properties(0).total_memory/1e9:.2f} GB")
     print(f"GPU Memory Allocated: {torch.cuda.memory_allocated(0)/1e9:.2f} GB")
     print(f"GPU Memory Cached: {torch.cuda.memory_reserved(0)/1e9:.2f} GB")
else:
    print("No GPU available")

GPU Memory Total: 6.44 GB
GPU Memory Allocated: 1.42 GB
GPU Memory Cached: 2.89 GB
