In [None]:
from datasets import load_dataset
from transformers import ASTFeatureExtractor
import torch
from torch.utils.data import DataLoader
import torch.profiler
import numpy as np
from datasets import Audio


In [None]:
import platform
# Collect the environment report first, then emit it in one go (one item per line).
_env_report = [
    f"Python version: {platform.python_version()}",
    f"System: {platform.system()} {platform.release()}",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
]
# CUDA details are only queried when a CUDA device is actually usable.
if torch.cuda.is_available():
    _env_report += [
        f"CUDA version: {torch.version.cuda}",
        f"GPU: {torch.cuda.get_device_name(0)}",
        f"CUDA device count: {torch.cuda.device_count()}",
    ]
print("\n".join(_env_report))

In [None]:
# Load the audio dataset with the "audiofolder" builder from the local data directory.
dataset = load_dataset("audiofolder", data_dir="./for-2seconds")

# Checkpoint used for the feature extractor here (and for the model further down).
pretrained_model = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = ASTFeatureExtractor.from_pretrained(
    pretrained_model,
    num_mel_bins=64,
    max_length=507,
)

# Values reused by the preprocessing functions and the model configuration.
SAMPLING_RATE = feature_extractor.sampling_rate
model_input_name = feature_extractor.model_input_names[0]
# Number of distinct label values present in the training split.
num_labels = len(np.unique(dataset["train"]["label"]))


In [None]:
# Quick sanity dump of the dataset layout and the derived constants.
# Each (caption, value) pair is printed on its own line, in this order.
_overview = [
    ('dataset:', dataset),
    ('dataset train features:', dataset['train'].features),
    ('dataset test features:', dataset['test'].features),
    ('dataset validation features:', dataset['validation'].features),
    ('model_input_name:', model_input_name),
    ('SAMPLING_RATE:', SAMPLING_RATE),
    ('dataset["train"][0]:', dataset['train'][0]),
    ('num_labels:', num_labels),
    ('dataset columns:', dataset['train'].column_names),
]
for _caption, _value in _overview:
    print(_caption, _value)


In [None]:
# Switch off the extractor's own normalization while the dataset statistics are measured
feature_extractor.do_normalize = False

# Running statistics: element count, running mean, running sum of squared deviations
n, mean, M2 = 0, 0.0, 0.0

def preprocess_audio(batch):
    """Convert a batch of decoded audio into model inputs (no augmentation).

    Takes the waveform stored under ``"array"`` of every entry in
    ``batch["input_values"]``, passes them to ``feature_extractor`` and returns
    a dict holding the extracted features under ``model_input_name`` and
    ``batch["label"]`` as a tensor under ``"labels"``.
    """
    waveforms = []
    for entry in batch["input_values"]:
        waveforms.append(entry["array"])

    features = feature_extractor(
        waveforms,
        sampling_rate=SAMPLING_RATE,
        return_tensors="pt",
        return_attention_mask=True,
        max_length=507,
    )

    model_inputs = {}
    model_inputs[model_input_name] = features.get(model_input_name)
    model_inputs["labels"] = torch.tensor(batch["label"])
    return model_inputs

# Resample to the extractor's sampling rate BEFORE measuring statistics, then expose the
# audio under the model's input name. Guarded so the cell can be re-run after the column
# has already been renamed (an unconditional rename_column would fail the second time).
if "audio" in dataset["train"].column_names:
    dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))
    dataset = dataset.rename_column("audio", "input_values")
dataset["train"].set_transform(preprocess_audio, output_all_columns=False)

# Iterating a dataset that has a transform yields ONE example per step (not a batch), so
# every step is weighted by its number of spectrogram values rather than by shape[0].
for example in dataset["train"]:
    # float64 keeps the accumulated sums accurate over many examples
    spectrogram = example[model_input_name].to(torch.float64)
    count = spectrogram.numel()
    if count == 0:
        continue  # nothing to contribute; also avoids NaN statistics

    example_mean = spectrogram.mean().item()
    example_variance = spectrogram.var(unbiased=False).item()  # population variance (divide by N)

    # Merge this example into the running statistics (parallel mean/variance update):
    # M2 accumulates the sum of squared deviations from the running mean.
    delta = example_mean - mean
    total = n + count
    mean += delta * count / total
    M2 += example_variance * count + delta ** 2 * n * count / total
    n = total

if n == 0:
    raise ValueError("Cannot compute normalization statistics: the training split produced no feature values.")

# Calculate final statistics
feature_extractor.mean = mean
feature_extractor.std = (M2 / n) ** 0.5  # Population standard deviation
feature_extractor.do_normalize = True

print('mean: ', feature_extractor.mean)
print('std: ', feature_extractor.std)

In [None]:
from audiomentations import Compose, AddGaussianSNR, GainTransition, Gain, ClippingDistortion, TimeStretch, PitchShift

# Waveform-level augmentation steps, listed individually before being composed.
_augmentation_steps = [
    AddGaussianSNR(min_snr_db=10, max_snr_db=20),
    Gain(min_gain_db=-6, max_gain_db=6),
    GainTransition(
        min_gain_db=-6,
        max_gain_db=6,
        min_duration=0.01,
        max_duration=0.3,
        duration_unit="fraction",
    ),
    ClippingDistortion(min_percentile_threshold=0, max_percentile_threshold=30, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.2),
    PitchShift(min_semitones=-4, max_semitones=4),
]

# The composed pipeline is configured with p=0.8 and shuffle=True.
audio_augmentations = Compose(_augmentation_steps, p=0.8, shuffle=True)

In [None]:
def preprocess_audio_with_transforms(batch):
    """Convert a batch of decoded audio into model inputs, augmenting each waveform first.

    Every waveform under ``batch["input_values"][i]["array"]`` is passed through
    ``audio_augmentations`` (called with ``sample_rate=SAMPLING_RATE``) before the
    batch is handed to ``feature_extractor``. Returns a dict with the extracted
    features under ``model_input_name`` and ``batch["label"]`` as a tensor under
    ``"labels"``.
    """
    augmented = []
    for entry in batch["input_values"]:
        augmented.append(audio_augmentations(entry["array"], sample_rate=SAMPLING_RATE))

    features = feature_extractor(
        augmented,
        sampling_rate=SAMPLING_RATE,
        return_tensors="pt",
        return_attention_mask=True,
        max_length=507,
    )

    model_inputs = {}
    model_inputs[model_input_name] = features.get(model_input_name)
    model_inputs["labels"] = torch.tensor(batch["label"])
    return model_inputs

print(dataset)
# Cast the "input_values" audio column to an Audio feature at the feature extractor's sampling rate
_audio_feature = Audio(sampling_rate=feature_extractor.sampling_rate)
dataset = dataset.cast_column("input_values", _audio_feature)



In [None]:
# Training split: preprocessing with waveform augmentations
dataset["train"].set_transform(preprocess_audio_with_transforms, output_all_columns=False)
# Test and validation splits: the same preprocessing, without augmentations
for _split_name in ("test", "validation"):
    dataset[_split_name].set_transform(preprocess_audio, output_all_columns=False)


In [None]:
# Views of the train/test splits with the output format reset, so rows are returned
# without the preprocessing transforms applied
temp_ds = dataset['train'].with_format(None)
temp_ds_test = dataset['test'].with_format(None)

print(temp_ds[0])

# Distinct label values across train and test, in ascending order
_train_labels = temp_ds["label"]
_test_labels = temp_ds_test["label"]
unique_labels = sorted(set(_train_labels + _test_labels))
print(unique_labels)

In [None]:
from transformers import ASTConfig, ASTForAudioClassification

# Build the classifier: start from the pretrained checkpoint's configuration, override
# the input geometry and label set, then load the model with that configuration.

# Load configuration from the pretrained model
config = ASTConfig.from_pretrained(pretrained_model)

# Update configuration with the number of labels in our dataset
# (64 / 507 are the same values passed to the feature extractor when it was created)
config.num_mel_bins = 64  # Make sure this matches your feature extractor
config.max_length = 507   # Or whatever your sequence length is
config.num_labels = num_labels
# NOTE(review): the label maps below are hard-coded for two classes while num_labels is
# derived from the data — confirm they agree with the dataset's label ids.
# The maps are assigned after num_labels; keep this order — TODO confirm whether setting
# num_labels regenerates default label maps in the installed transformers version.
config.label2id = {"fake": 0, "real": 1}
config.id2label = {0: "fake", 1: "real"}

# Initialize the model with the updated configuration
# ignore_mismatched_sizes=True lets loading proceed when checkpoint tensors do not match
# the shapes implied by the modified config.
model = ASTForAudioClassification.from_pretrained(
    pretrained_model,
    config=config,
    ignore_mismatched_sizes=True
)
# NOTE(review): init_weights() is called right after loading pretrained weights — confirm
# it does not overwrite the loaded weights in the installed transformers version.
model.init_weights()

In [None]:
from transformers import TrainingArguments

# Configure training run with TrainingArguments class.
# Half precision needs an accelerator, so both fp16 switches follow CUDA availability;
# this keeps the cell runnable on CPU-only machines (the original forced fp16=True while
# at the same time disabling CUDA when it was unavailable).
_cuda_available = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./runs/ast_classifier",
    logging_dir="./logs/ast_classifier",
    report_to="tensorboard",
    learning_rate=5e-5,
    num_train_epochs=10,
    per_device_train_batch_size=8,  # Start with 8, can increase if memory allows
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    dataloader_num_workers=0,  # Changed from 4 to 0 for Windows
    dataloader_pin_memory=False,  # Disable pin_memory on Windows
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=20,
    remove_unused_columns=False,  # keep the columns produced by the set_transform functions
    fp16=_cuda_available,
    fp16_full_eval=_cuda_available,
    use_cpu=not _cuda_available,  # replaces the deprecated `no_cuda` flag
)

In [None]:
import evaluate
import numpy as np

# Metric objects from the `evaluate` library, loaded in the order listed.
accuracy, recall, precision, f1 = (
    evaluate.load(_metric_name)
    for _metric_name in ("accuracy", "recall", "precision", "f1")
)

# "binary" averaging for at most two classes, "macro" otherwise.
AVERAGE = "binary" if config.num_labels <= 2 else "macro"

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    ``eval_pred.predictions`` holds the logits and ``eval_pred.label_ids`` the
    reference labels. The predicted class is the argmax over axis 1. Precision,
    recall and F1 are computed with ``average=AVERAGE``. Returns the merged
    dict of all metric results.
    """
    references = eval_pred.label_ids
    predictions = np.argmax(eval_pred.predictions, axis=1)

    metrics = accuracy.compute(predictions=predictions, references=references)
    # The remaining metrics share the same call signature plus the averaging mode.
    for averaged_metric in (precision, recall, f1):
        metrics.update(
            averaged_metric.compute(
                predictions=predictions,
                references=references,
                average=AVERAGE,
            )
        )
    return metrics

In [None]:
from transformers import Trainer, DataCollatorWithPadding
from tqdm.auto import tqdm

# Initialize the data collator
data_collator = DataCollatorWithPadding(
    tokenizer=feature_extractor,  # Your feature extractor acts as the tokenizer
    padding=True,
    max_length=config.max_length,
    return_tensors="pt"
)


# The trainer evaluates every epoch and reloads the best checkpoint at the end
# (see training_args), so the evaluation split drives model selection. Use the
# validation split for that and keep the test split held out for the final,
# unbiased measurement (e.g. trainer.evaluate(dataset["test"]) after training).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
    data_collator=data_collator,
)

In [None]:
# Verify shapes before training: take one (already preprocessed) training example
sample = next(iter(dataset["train"]))
print(f"Sample input shape: {sample['input_values'].shape}")

# Forward pass to check for errors.
# The example has no batch dimension, so add one, and move it to wherever the model's
# weights currently live (the model may be on the GPU, e.g. when this cell is re-run
# after training) to avoid a device-mismatch error.
_sanity_input = sample["input_values"].unsqueeze(0).to(next(model.parameters()).device)
with torch.no_grad():
    output = model(_sanity_input)
print("Forward pass successful!")

In [None]:
# Side-by-side view of what the model expects and what the pipeline produces.
_summary_lines = (
    f"Model config: {model.config}",
    f"Sample input shape: {sample['input_values'].shape}",
    f"Model's expected input shape: {model.config.max_length}",
    f"Feature extractor config: {feature_extractor}",
)
print(*_summary_lines, sep="\n")

In [None]:
# Report which device is in use and, on CUDA, its memory figures in GB
print(f"Using device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
if torch.cuda.is_available():
    for _memory_line in (
        f"GPU Memory Total: {torch.cuda.get_device_properties(0).total_memory/1e9:.2f} GB",
        f"GPU Memory Allocated: {torch.cuda.memory_allocated(0)/1e9:.2f} GB",
        f"GPU Memory Cached: {torch.cuda.memory_reserved(0)/1e9:.2f} GB",
    ):
        print(_memory_line)


In [None]:
# trainer.train(resume_from_checkpoint=False) #Set to False when no checkpoint is available.


In [None]:
# GPU memory snapshot (total / allocated / reserved, in GB), or a notice when no GPU exists
if not torch.cuda.is_available():
    print("No GPU available")
else:
    print(f"GPU Memory Total: {torch.cuda.get_device_properties(0).total_memory/1e9:.2f} GB")
    print(f"GPU Memory Allocated: {torch.cuda.memory_allocated(0)/1e9:.2f} GB")
    print(f"GPU Memory Cached: {torch.cuda.memory_reserved(0)/1e9:.2f} GB")

In [None]:
import os
import json

# Checkpoint directory to restore from
checkpoint_path = "runs/ast_classifier/checkpoint-8730"

# Restore the classifier and its feature extractor from that directory
model = ASTForAudioClassification.from_pretrained(checkpoint_path)
feature_extractor = ASTFeatureExtractor.from_pretrained(checkpoint_path)

# Print the saved trainer state when the checkpoint contains one
trainer_state_path = os.path.join(checkpoint_path, "trainer_state.json")
if os.path.exists(trainer_state_path):
    with open(trainer_state_path, "r") as state_file:
        trainer_state = json.load(state_file)
    print("\nTraining metrics from trainer_state.json:")
    print(json.dumps(trainer_state, indent=2))

# Pick CUDA when available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Switch to evaluation mode and place the model on the chosen device
model = model.eval().to(device)
print(f"Model loaded on {device}")

In [None]:
import librosa
def predict_audio(file_path, model, feature_extractor, device=None):
    """Classify a single audio file as "fake" or "real".

    Args:
        file_path: Path of the audio file; loaded with ``librosa.load`` at the
            feature extractor's sampling rate.
        model: Classifier whose output exposes ``.logits``. Class index 0 is
            reported as "fake" and index 1 as "real".
        feature_extractor: Callable that turns the waveform into model inputs;
            must expose ``sampling_rate``.
        device: Device the inputs are moved to. ``None`` (default) follows the
            device of the model's parameters, so the function also works on
            CPU-only machines; an explicit value is used as given.

    Returns:
        dict with ``"label"`` ("fake" or "real"), ``"confidence"`` (probability
        of the predicted class) and ``"probabilities"`` (per-class floats under
        the keys ``"fake"`` and ``"real"``).
    """
    # Resolve the target device from the model when the caller did not choose one
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # Load audio file
    audio, sr = librosa.load(file_path, sr=feature_extractor.sampling_rate)

    # Preprocess the audio
    inputs = feature_extractor(
        audio,
        sampling_rate=feature_extractor.sampling_rate,
        return_tensors="pt",
        padding=True,
        return_attention_mask=True
    )

    # Move inputs to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Make prediction (no gradients needed for inference)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1)

    print(f"Raw logits: {logits}")
    print(f"Raw probabilities: {probabilities}")

    # Get predicted class (0 for fake, 1 for real)
    predicted_class = torch.argmax(probabilities, dim=1).item()
    confidence = probabilities[0][predicted_class].item()

    print(f"P(fake): {probabilities[0][0].item():.4f}")
    print(f"P(real): {probabilities[0][1].item():.4f}")

    # Map class index to label
    label = "fake" if predicted_class == 0 else "real"

    return {
        "label": label,
        "confidence": confidence,
        "probabilities": {
            "fake": probabilities[0][0].item(),
            "real": probabilities[0][1].item()
        }
    }

In [None]:

# Classify one example file with the reloaded model and show the outcome
audio_file_path = "file1186.wav_16k.wav_norm.wav_mono.wav_silence.wav_2sec.wav"
result = predict_audio(audio_file_path, model, feature_extractor, device)

_result_lines = [
    f"Prediction: {result['label']}",
    f"Confidence: {result['confidence']:.4f}",
    f"Probabilities - Fake: {result['probabilities']['fake']:.4f}, Real: {result['probabilities']['real']:.4f}",
]
print("\n".join(_result_lines))