In [1]:
import os
import sys
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, Subset
import torchaudio
import torchaudio.transforms as transforms
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, precision_score, recall_score
import torch.nn.functional as F


# -------------------------
# 1. Add the PANNs model directory to the system path
# -------------------------
pytorch_path = r"C:\\Users\\Harsh\\Desktop\\Audio Recognition Project\\audioset_tagging_cnn-master\\pytorch"
sys.path.insert(0, pytorch_path)

# Import model (we use Cnn14 in this example)
from models import Cnn14

# -------------------------
# 2. Configuration and Paths
# -------------------------
DATASET_PATH = r"C:\\Users\\Harsh\\Desktop\\Audio Recognition Project\\dataset"

TRAIN_METADATA_CSV = os.path.join(DATASET_PATH, "metadata of train set.csv")
TEST_METADATA_CSV  = os.path.join(DATASET_PATH, "metadata of test set.csv")

TRAIN_AUDIO_DIR = os.path.join(DATASET_PATH, "train")
TEST_AUDIO_DIR  = os.path.join(DATASET_PATH, "test")

BATCH_SIZE = 8
EPOCHS = 20
LEARNING_RATE = 1e-4
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------------------------
# 3. Audio Processing Parameters
# -------------------------
SAMPLE_RATE = 32000
N_FFT = 1024
HOP_LENGTH = 320
N_MELS = 64
FMIN = 50
FMAX = 14000

# (No need for a mel_transform here because the model internally computes spectrograms.)

# -------------------------
# 4. Prepare Metadata and Create Class Mapping
# -------------------------
train_meta = pd.read_csv(TRAIN_METADATA_CSV)
train_meta.columns = train_meta.columns.str.strip()

# We use the "Classname" column as the label.
classes = sorted(train_meta["Classname"].unique())
class_to_idx = {cls: i for i, cls in enumerate(classes)}
num_classes = len(classes)
print("Class mapping (Classname -> Index):")
print(class_to_idx)

# -------------------------
# 5. Define the Custom Dataset (Return Raw Waveform)
# -------------------------
class AudioDataset(Dataset):
    def __init__(self, metadata_csv, audio_dir, class_to_idx, transform=None):
        self.metadata = pd.read_csv(metadata_csv)
        self.metadata.columns = self.metadata.columns.str.strip()
        self.audio_dir = audio_dir
        self.class_to_idx = class_to_idx
        self.transform = transform

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):
        row = self.metadata.iloc[idx]
        filename = row["Filename"]
        label = self.class_to_idx[row["Classname"]]
        file_path = os.path.join(self.audio_dir, filename)
        
        waveform, sr = torchaudio.load(file_path)
        # If stereo, take the first channel.
        if waveform.shape[0] > 1:
            waveform = waveform[0:1, :]
        # Resample if needed.
        if sr != SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
            waveform = resampler(waveform)
        # Squeeze to make the waveform 1D.
        waveform = waveform.squeeze(0)  # shape: [data_length]
        if self.transform:
            waveform = self.transform(waveform)
        return waveform, label

# Create dataset instances.
train_dataset = AudioDataset(TRAIN_METADATA_CSV, TRAIN_AUDIO_DIR, class_to_idx, transform=None)
test_dataset = AudioDataset(TEST_METADATA_CSV, TEST_AUDIO_DIR, class_to_idx, transform=None)

# -------------------------
# 6. Custom Collate Function for Raw Waveforms
# -------------------------
def collate_fn(batch):
    """
    Pads raw 1D waveforms in the batch along the time dimension
    so that all waveforms have the same length.
    """
    waveforms, labels = zip(*batch)
    max_length = max(waveform.shape[0] for waveform in waveforms)
    padded_waveforms = []
    for waveform in waveforms:
        pad_length = max_length - waveform.shape[0]
        padded_waveform = F.pad(waveform, (0, pad_length))
        padded_waveforms.append(padded_waveform)
    stacked_waveforms = torch.stack(padded_waveforms, dim=0)  # shape: [batch_size, max_length]
    labels = torch.tensor(labels, dtype=torch.long)
    return stacked_waveforms, labels

# Create DataLoaders with the custom collate function.
train_loader_full = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# -------------------------
# 7. Define Model Parameters and Initialize Model
# -------------------------
model_params = {
    "sample_rate": SAMPLE_RATE,
    "window_size": N_FFT,
    "hop_size": HOP_LENGTH,
    "mel_bins": N_MELS,
    "fmin": FMIN,
    "fmax": FMAX,
    "classes_num": num_classes  # For example, 7 if you have 7 classes.
}

# -------------------------
# 8. 3-Fold Cross-Validation on Training Set
# -------------------------
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, precision_score, recall_score

kf = KFold(n_splits=3, shuffle=True, random_state=42)
fold_f1 = []
fold_precision = []
fold_recall = []

print("\nStarting 3-Fold Cross-Validation...")
for fold, (train_idx, val_idx) in enumerate(kf.split(train_dataset)):
    print(f"\n--- Fold {fold+1} ---")
    
    # Create subsets for training and validation for this fold.
    train_subset = Subset(train_dataset, train_idx)
    val_subset = Subset(train_dataset, val_idx)
    
    train_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_subset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
    
    # Initialize a new model for this fold.
    model_fold = Cnn14(**model_params)
    model_fold.to(DEVICE)
    optimizer_fold = optim.Adam(model_fold.parameters(), lr=LEARNING_RATE)
    criterion_fold = nn.CrossEntropyLoss()
    
    # Train for the specified number of epochs.
    for epoch in range(EPOCHS):
        model_fold.train()
        running_loss = 0.0
        for waveforms, labels in tqdm(train_loader, desc=f"Fold {fold+1} Epoch {epoch+1}"):
            waveforms, labels = waveforms.to(DEVICE), labels.to(DEVICE)
            optimizer_fold.zero_grad()
            outputs = model_fold(waveforms)
            logits = outputs["clipwise_output"]  # Extract classification logits.
            loss = criterion_fold(logits, labels)
            loss.backward()
            optimizer_fold.step()
            running_loss += loss.item()
        print(f"Fold {fold+1} Epoch {epoch+1} Loss: {running_loss/len(train_loader):.4f}")
    
    # Evaluate on the validation set for this fold.
    model_fold.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for waveforms, labels in val_loader:
            waveforms, labels = waveforms.to(DEVICE), labels.to(DEVICE)
            outputs = model_fold(waveforms)
            logits = outputs["clipwise_output"]
            _, preds = torch.max(logits, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    fold_f1.append(f1_score(all_labels, all_preds, average="weighted", zero_division=0))
    fold_precision.append(precision_score(all_labels, all_preds, average="weighted", zero_division=0))
    fold_recall.append(recall_score(all_labels, all_preds, average="weighted", zero_division=0))
    print(f"Fold {fold+1} - F1: {fold_f1[-1]:.4f}, Precision: {fold_precision[-1]:.4f}, Recall: {fold_recall[-1]:.4f}")

# Print average metrics across folds.
avg_f1 = np.mean(fold_f1)
avg_precision = np.mean(fold_precision)
avg_recall = np.mean(fold_recall)
print("\n--- Average 3-Fold Cross-Validation Results ---")
print(f"F1 Score: {avg_f1:.4f}")
print(f"Precision: {avg_precision:.4f}")
print(f"Recall: {avg_recall:.4f}")

# -------------------------
# 9. Optionally, Train on Full Training Set and Evaluate on Test Set
# -------------------------
# (This part is separate from cross-validation; you can choose to train a final model on all training data.)
model_final = Cnn14(**model_params)
model_final.to(DEVICE)
optimizer_final = optim.Adam(model_final.parameters(), lr=LEARNING_RATE)
criterion_final = nn.CrossEntropyLoss()

print("\nTraining final model on full training set...")
for epoch in range(EPOCHS):
    model_final.train()
    total_loss = 0.0
    for waveforms, labels in tqdm(train_loader_full, desc=f"Final Model Epoch {epoch+1}/{EPOCHS}"):
        waveforms, labels = waveforms.to(DEVICE), labels.to(DEVICE)
        optimizer_final.zero_grad()
        outputs = model_final(waveforms)
        logits = outputs["clipwise_output"]
        loss = criterion_final(logits, labels)
        loss.backward()
        optimizer_final.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Training Loss: {total_loss/len(train_loader_full):.4f}")

torch.save(model_final.state_dict(), "trained_model.pth")
print("Final training complete. Model saved as 'trained_model.pth'.")

# Evaluate on the test set.
model_final.eval()
correct = 0
total = 0
with torch.no_grad():
    for waveforms, labels in test_loader:
        waveforms, labels = waveforms.to(DEVICE), labels.to(DEVICE)
        outputs = model_final(waveforms)
        logits = outputs["clipwise_output"]
        _, predicted = torch.max(logits, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
        
test_accuracy = correct / total
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Class mapping (Classname -> Index):
{'breath': 0, 'cough': 1, 'crying': 2, 'laugh': 3, 'screaming': 4, 'sneeze': 5, 'yawn': 6}

Starting 3-Fold Cross-Validation...

--- Fold 1 ---


Fold 1 Epoch 1: 100%|██████████| 524/524 [01:57<00:00,  4.47it/s]


Fold 1 Epoch 1 Loss: 1.6847


Fold 1 Epoch 2: 100%|██████████| 524/524 [01:52<00:00,  4.64it/s]


Fold 1 Epoch 2 Loss: 1.6684


Fold 1 Epoch 3: 100%|██████████| 524/524 [01:54<00:00,  4.56it/s]


Fold 1 Epoch 3 Loss: 1.6946


Fold 1 Epoch 4: 100%|██████████| 524/524 [01:56<00:00,  4.50it/s]


Fold 1 Epoch 4 Loss: 1.6791


Fold 1 Epoch 5: 100%|██████████| 524/524 [01:58<00:00,  4.42it/s]


Fold 1 Epoch 5 Loss: 1.6679


Fold 1 Epoch 6: 100%|██████████| 524/524 [02:00<00:00,  4.34it/s]


Fold 1 Epoch 6 Loss: 1.6687


Fold 1 Epoch 7: 100%|██████████| 524/524 [02:02<00:00,  4.29it/s]


Fold 1 Epoch 7 Loss: 1.6585


Fold 1 Epoch 8: 100%|██████████| 524/524 [02:04<00:00,  4.22it/s]


Fold 1 Epoch 8 Loss: 1.6618


Fold 1 Epoch 9: 100%|██████████| 524/524 [02:04<00:00,  4.21it/s]


Fold 1 Epoch 9 Loss: 1.6588


Fold 1 Epoch 10: 100%|██████████| 524/524 [02:05<00:00,  4.17it/s]


Fold 1 Epoch 10 Loss: 1.6675


Fold 1 Epoch 11: 100%|██████████| 524/524 [02:07<00:00,  4.12it/s]


Fold 1 Epoch 11 Loss: 1.6683


Fold 1 Epoch 12: 100%|██████████| 524/524 [02:06<00:00,  4.13it/s]


Fold 1 Epoch 12 Loss: 1.6541


Fold 1 Epoch 13: 100%|██████████| 524/524 [02:06<00:00,  4.14it/s]


Fold 1 Epoch 13 Loss: 1.6523


Fold 1 Epoch 14: 100%|██████████| 524/524 [02:06<00:00,  4.15it/s]


Fold 1 Epoch 14 Loss: 1.6626


Fold 1 Epoch 15: 100%|██████████| 524/524 [02:05<00:00,  4.17it/s]


Fold 1 Epoch 15 Loss: 1.6999


Fold 1 Epoch 16: 100%|██████████| 524/524 [02:06<00:00,  4.14it/s]


Fold 1 Epoch 16 Loss: 1.7798


Fold 1 Epoch 17: 100%|██████████| 524/524 [02:07<00:00,  4.11it/s]


Fold 1 Epoch 17 Loss: 1.7811


Fold 1 Epoch 18: 100%|██████████| 524/524 [02:07<00:00,  4.12it/s]


Fold 1 Epoch 18 Loss: 1.7804


Fold 1 Epoch 19: 100%|██████████| 524/524 [02:07<00:00,  4.11it/s]


Fold 1 Epoch 19 Loss: 1.7782


Fold 1 Epoch 20: 100%|██████████| 524/524 [02:07<00:00,  4.10it/s]


Fold 1 Epoch 20 Loss: 1.7816
Fold 1 - F1: 0.3145, Precision: 0.2669, Recall: 0.4340

--- Fold 2 ---


Fold 2 Epoch 1: 100%|██████████| 525/525 [02:08<00:00,  4.10it/s]


Fold 2 Epoch 1 Loss: 1.6828


Fold 2 Epoch 2: 100%|██████████| 525/525 [02:07<00:00,  4.10it/s]


Fold 2 Epoch 2 Loss: 1.6830


Fold 2 Epoch 3: 100%|██████████| 525/525 [02:08<00:00,  4.10it/s]


Fold 2 Epoch 3 Loss: 1.6665


Fold 2 Epoch 4: 100%|██████████| 525/525 [02:07<00:00,  4.12it/s]


Fold 2 Epoch 4 Loss: 1.6733


Fold 2 Epoch 5: 100%|██████████| 525/525 [02:07<00:00,  4.12it/s]


Fold 2 Epoch 5 Loss: 1.6670


Fold 2 Epoch 6: 100%|██████████| 525/525 [02:08<00:00,  4.10it/s]


Fold 2 Epoch 6 Loss: 1.6728


Fold 2 Epoch 7: 100%|██████████| 525/525 [02:08<00:00,  4.10it/s]


Fold 2 Epoch 7 Loss: 1.6540


Fold 2 Epoch 8: 100%|██████████| 525/525 [02:07<00:00,  4.12it/s]


Fold 2 Epoch 8 Loss: 1.6538


Fold 2 Epoch 9: 100%|██████████| 525/525 [02:07<00:00,  4.11it/s]


Fold 2 Epoch 9 Loss: 1.6496


Fold 2 Epoch 10: 100%|██████████| 525/525 [02:08<00:00,  4.09it/s]


Fold 2 Epoch 10 Loss: 1.6434


Fold 2 Epoch 11: 100%|██████████| 525/525 [02:07<00:00,  4.12it/s]


Fold 2 Epoch 11 Loss: 1.6554


Fold 2 Epoch 12: 100%|██████████| 525/525 [02:08<00:00,  4.10it/s]


Fold 2 Epoch 12 Loss: 1.6524


Fold 2 Epoch 13: 100%|██████████| 525/525 [02:07<00:00,  4.11it/s]


Fold 2 Epoch 13 Loss: 1.6494


Fold 2 Epoch 14: 100%|██████████| 525/525 [02:08<00:00,  4.10it/s]


Fold 2 Epoch 14 Loss: 1.6388


Fold 2 Epoch 15: 100%|██████████| 525/525 [02:08<00:00,  4.10it/s]


Fold 2 Epoch 15 Loss: 1.6414


Fold 2 Epoch 16: 100%|██████████| 525/525 [02:08<00:00,  4.10it/s]


Fold 2 Epoch 16 Loss: 1.6360


Fold 2 Epoch 17: 100%|██████████| 525/525 [02:07<00:00,  4.11it/s]


Fold 2 Epoch 17 Loss: 1.6855


Fold 2 Epoch 18: 100%|██████████| 525/525 [02:07<00:00,  4.10it/s]


Fold 2 Epoch 18 Loss: 1.7723


Fold 2 Epoch 19: 100%|██████████| 525/525 [02:08<00:00,  4.10it/s]


Fold 2 Epoch 19 Loss: 1.7608


Fold 2 Epoch 20: 100%|██████████| 525/525 [02:08<00:00,  4.09it/s]


Fold 2 Epoch 20 Loss: 1.7780
Fold 2 - F1: 0.2095, Precision: 0.1782, Recall: 0.3402

--- Fold 3 ---


Fold 3 Epoch 1: 100%|██████████| 525/525 [02:07<00:00,  4.12it/s]


Fold 3 Epoch 1 Loss: 1.6726


Fold 3 Epoch 2: 100%|██████████| 525/525 [02:08<00:00,  4.09it/s]


Fold 3 Epoch 2 Loss: 1.6726


Fold 3 Epoch 3: 100%|██████████| 525/525 [02:07<00:00,  4.12it/s]


Fold 3 Epoch 3 Loss: 1.6497


Fold 3 Epoch 4: 100%|██████████| 525/525 [02:08<00:00,  4.09it/s]


Fold 3 Epoch 4 Loss: 1.7231


Fold 3 Epoch 5: 100%|██████████| 525/525 [02:07<00:00,  4.11it/s]


Fold 3 Epoch 5 Loss: 1.6623


Fold 3 Epoch 6: 100%|██████████| 525/525 [02:08<00:00,  4.09it/s]


Fold 3 Epoch 6 Loss: 1.6508


Fold 3 Epoch 7: 100%|██████████| 525/525 [02:07<00:00,  4.12it/s]


Fold 3 Epoch 7 Loss: 1.6906


Fold 3 Epoch 8: 100%|██████████| 525/525 [02:07<00:00,  4.12it/s]


Fold 3 Epoch 8 Loss: 1.6668


Fold 3 Epoch 9: 100%|██████████| 525/525 [02:07<00:00,  4.12it/s]


Fold 3 Epoch 9 Loss: 1.6512


Fold 3 Epoch 10: 100%|██████████| 525/525 [02:08<00:00,  4.10it/s]


Fold 3 Epoch 10 Loss: 1.6442


Fold 3 Epoch 11: 100%|██████████| 525/525 [02:07<00:00,  4.10it/s]


Fold 3 Epoch 11 Loss: 1.6275


Fold 3 Epoch 12: 100%|██████████| 525/525 [02:07<00:00,  4.11it/s]


Fold 3 Epoch 12 Loss: 1.6142


Fold 3 Epoch 13: 100%|██████████| 525/525 [02:07<00:00,  4.11it/s]


Fold 3 Epoch 13 Loss: 1.6260


Fold 3 Epoch 14: 100%|██████████| 525/525 [02:07<00:00,  4.12it/s]


Fold 3 Epoch 14 Loss: 1.6188


Fold 3 Epoch 15: 100%|██████████| 525/525 [02:07<00:00,  4.11it/s]


Fold 3 Epoch 15 Loss: 1.6210


Fold 3 Epoch 16: 100%|██████████| 525/525 [02:07<00:00,  4.11it/s]


Fold 3 Epoch 16 Loss: 1.7154


Fold 3 Epoch 17: 100%|██████████| 525/525 [02:07<00:00,  4.12it/s]


Fold 3 Epoch 17 Loss: 1.6925


Fold 3 Epoch 18: 100%|██████████| 525/525 [02:07<00:00,  4.11it/s]


Fold 3 Epoch 18 Loss: 1.6903


Fold 3 Epoch 19: 100%|██████████| 525/525 [02:08<00:00,  4.10it/s]


Fold 3 Epoch 19 Loss: 1.6929


Fold 3 Epoch 20: 100%|██████████| 525/525 [02:07<00:00,  4.10it/s]


Fold 3 Epoch 20 Loss: 1.6978
Fold 3 - F1: 0.3761, Precision: 0.3012, Recall: 0.5129

--- Average 3-Fold Cross-Validation Results ---
F1 Score: 0.3000
Precision: 0.2487
Recall: 0.4290

Training final model on full training set...


Final Model Epoch 1/20: 100%|██████████| 787/787 [03:27<00:00,  3.80it/s]


Epoch 1 - Training Loss: 1.6998


Final Model Epoch 2/20: 100%|██████████| 787/787 [03:24<00:00,  3.84it/s]


Epoch 2 - Training Loss: 1.7111


Final Model Epoch 3/20: 100%|██████████| 787/787 [03:12<00:00,  4.08it/s]


Epoch 3 - Training Loss: 1.6999


Final Model Epoch 4/20: 100%|██████████| 787/787 [03:14<00:00,  4.05it/s]


Epoch 4 - Training Loss: 1.6974


Final Model Epoch 5/20: 100%|██████████| 787/787 [03:15<00:00,  4.02it/s]


Epoch 5 - Training Loss: 1.7040


Final Model Epoch 6/20: 100%|██████████| 787/787 [03:18<00:00,  3.97it/s]


Epoch 6 - Training Loss: 1.6964


Final Model Epoch 7/20: 100%|██████████| 787/787 [03:20<00:00,  3.92it/s]


Epoch 7 - Training Loss: 1.6879


Final Model Epoch 8/20: 100%|██████████| 787/787 [03:22<00:00,  3.90it/s]


Epoch 8 - Training Loss: 1.7061


Final Model Epoch 9/20: 100%|██████████| 787/787 [03:42<00:00,  3.54it/s]


Epoch 9 - Training Loss: 1.6614


Final Model Epoch 10/20: 100%|██████████| 787/787 [03:30<00:00,  3.73it/s]


Epoch 10 - Training Loss: 1.6695


Final Model Epoch 11/20: 100%|██████████| 787/787 [03:43<00:00,  3.53it/s]


Epoch 11 - Training Loss: 1.6609


Final Model Epoch 12/20: 100%|██████████| 787/787 [03:44<00:00,  3.50it/s]


Epoch 12 - Training Loss: 1.6602


Final Model Epoch 13/20: 100%|██████████| 787/787 [03:45<00:00,  3.50it/s]


Epoch 13 - Training Loss: 1.6873


Final Model Epoch 14/20: 100%|██████████| 787/787 [03:49<00:00,  3.43it/s]


Epoch 14 - Training Loss: 1.6830


Final Model Epoch 15/20: 100%|██████████| 787/787 [03:52<00:00,  3.39it/s]


Epoch 15 - Training Loss: 1.6676


Final Model Epoch 16/20: 100%|██████████| 787/787 [03:54<00:00,  3.35it/s]


Epoch 16 - Training Loss: 1.6744


Final Model Epoch 17/20: 100%|██████████| 787/787 [03:54<00:00,  3.35it/s]


Epoch 17 - Training Loss: 1.6819


Final Model Epoch 18/20: 100%|██████████| 787/787 [03:52<00:00,  3.38it/s]


Epoch 18 - Training Loss: 1.6879


Final Model Epoch 19/20: 100%|██████████| 787/787 [03:53<00:00,  3.37it/s]


Epoch 19 - Training Loss: 1.6558


Final Model Epoch 20/20: 100%|██████████| 787/787 [03:53<00:00,  3.37it/s]


Epoch 20 - Training Loss: 1.6628
Final training complete. Model saved as 'trained_model.pth'.
Test Accuracy: 39.45%
