In [None]:
import sys
sys.path.append(r"C:\\Users\\Harsh\\Desktop\\Audio Recognition Project\\audioset_tagging_cnn-master\\pytorch")


In [20]:
import sys
import os

# Manually set the correct directory
pytorch_path = r"C:\Users\Harsh\Desktop\Audio Recognition Project\audioset_tagging_cnn-master\pytorch"
sys.path.insert(0, pytorch_path)  # Use insert(0, ...) to give it higher priority

# Force reload the module to avoid conflicts
import importlib

# Import the correct pytorch_utils
import pytorch_utils
importlib.reload(pytorch_utils)  # Reload to ensure it picks up the correct file

# Now try importing the functions
from pytorch_utils import do_mixup, interpolate, pad_framewise_output


In [51]:
import os
import sys
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, Subset
import torchaudio
import torchaudio.transforms as transforms
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, precision_score, recall_score
import torch.nn.functional as F

# -------------------------
# 1. Add the PANNs model directory to the system path
# -------------------------
pytorch_path = r"C:\Users\Harsh\Desktop\Audio Recognition Project\audioset_tagging_cnn-master\pytorch"
sys.path.insert(0, pytorch_path)

# Import models (both are available if needed)
from models import Wavegram_Logmel_Cnn14  # (if you want to use this variant)
from models import Cnn14

# -------------------------
# 2. Configuration and Paths
# -------------------------
DATASET_PATH = r"C:\Users\Harsh\Desktop\Audio Recognition Project\dataset"

TRAIN_METADATA_CSV = os.path.join(DATASET_PATH, "metadata of train set.csv")
TEST_METADATA_CSV  = os.path.join(DATASET_PATH, "metadata of test set.csv")

TRAIN_AUDIO_DIR = os.path.join(DATASET_PATH, "train")
TEST_AUDIO_DIR  = os.path.join(DATASET_PATH, "test")

# Since we are training from scratch, we do not use a pretrained checkpoint.
# If you did want to load one, you’d specify its path here.
# PRETRAINED_MODEL_PATH = r"..." 

BATCH_SIZE = 8
EPOCHS = 10
LEARNING_RATE = 1e-4
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------------------------
# 3. Audio Processing Parameters
# -------------------------
SAMPLE_RATE = 32000
N_FFT = 1024
HOP_LENGTH = 320
N_MELS = 64
FMIN = 50
FMAX = 14000

# (No need to create mel_transform here because the model does that internally.)

# -------------------------
# 4. Prepare Metadata and Create Class Mapping
# -------------------------
train_meta = pd.read_csv(TRAIN_METADATA_CSV)
train_meta.columns = train_meta.columns.str.strip()

# Use "Classname" as our label.
classes = sorted(train_meta["Classname"].unique())
class_to_idx = {cls: i for i, cls in enumerate(classes)}
num_classes = len(classes)
print("Class mapping (Classname -> Index):")
print(class_to_idx)

# -------------------------
# 5. Define the Custom Dataset (Return Raw Waveform)
# -------------------------
class AudioDataset(Dataset):
    def __init__(self, metadata_csv, audio_dir, class_to_idx, transform=None):
        """
        metadata_csv: CSV file containing at least columns "Filename" and "Classname"
        audio_dir: Directory where audio files are stored.
        class_to_idx: A dictionary mapping class names to integer labels.
        transform: (Optional) extra transform for the waveform.
        """
        self.metadata = pd.read_csv(metadata_csv)
        self.metadata.columns = self.metadata.columns.str.strip()
        self.audio_dir = audio_dir
        self.class_to_idx = class_to_idx
        self.transform = transform

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):
        row = self.metadata.iloc[idx]
        filename = row["Filename"]
        label = self.class_to_idx[row["Classname"]]
        file_path = os.path.join(self.audio_dir, filename)
        
        waveform, sr = torchaudio.load(file_path)
        # If stereo, take the first channel.
        if waveform.shape[0] > 1:
            waveform = waveform[0:1, :]
        # Resample if needed.
        if sr != SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
            waveform = resampler(waveform)
        # Remove the channel dimension so that waveform is 1D.
        waveform = waveform.squeeze(0)  # Now shape is (data_length,)
        
        if self.transform:
            waveform = self.transform(waveform)
        
        return waveform, label

# Create dataset instances.
train_dataset = AudioDataset(TRAIN_METADATA_CSV, TRAIN_AUDIO_DIR, class_to_idx, transform=None)
test_dataset = AudioDataset(TEST_METADATA_CSV, TEST_AUDIO_DIR, class_to_idx, transform=None)

# -------------------------
# 6. Custom Collate Function for Raw Waveforms
# -------------------------
def collate_fn(batch):
    """
    Pads raw 1D waveforms (each of shape [data_length]) in the batch along the time dimension
    so that all tensors have the same length.
    """
    waveforms, labels = zip(*batch)
    max_length = max(waveform.shape[0] for waveform in waveforms)
    padded_waveforms = []
    for waveform in waveforms:
        pad_length = max_length - waveform.shape[0]
        # Pad the 1D tensor on the right (last dimension) with zeros.
        padded_waveform = F.pad(waveform, (0, pad_length))
        padded_waveforms.append(padded_waveform)
    stacked_waveforms = torch.stack(padded_waveforms, dim=0)  # Shape: [batch_size, max_length]
    labels = torch.tensor(labels, dtype=torch.long)
    return stacked_waveforms, labels

# Create DataLoaders with the custom collate function.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# -------------------------
# 7. Add PANNs Model Directory and Import the Model
# -------------------------
# (We already added the PANNs model directory above.)
from models import Cnn14  # We use the standard Cnn14 model.

# -------------------------
# 8. Model Initialization (Train from Scratch)
# -------------------------
# Initialize the model with your dataset's number of classes (e.g., 7).
model_params = {
    "sample_rate": SAMPLE_RATE,
    "window_size": N_FFT,
    "hop_size": HOP_LENGTH,
    "mel_bins": N_MELS,
    "fmin": FMIN,
    "fmax": FMAX,
    "classes_num": num_classes  # This should be the number of unique classes in your dataset.
}

model = Cnn14(**model_params)
model.to(DEVICE)

# -------------------------
# 9. Loss & Optimizer
# -------------------------
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# -------------------------
# 10. Training Loop
# -------------------------
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    for waveforms, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        waveforms, labels = waveforms.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(waveforms)  # This returns a dictionary
        logits = outputs["clipwise_output"]  # Use the clipwise output for classification
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Training Loss: {total_loss / len(train_loader):.4f}")

# -------------------------
# 11. Evaluate on the Test Set
# -------------------------
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for waveforms, labels in test_loader:
        waveforms, labels = waveforms.to(DEVICE), labels.to(DEVICE)
        outputs = model(waveforms)
        logits = outputs["clipwise_output"]
        _, predicted = torch.max(logits, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
        
accuracy = correct / total
print(f"Test Accuracy: {accuracy * 100:.2f}%")




Class mapping (Classname -> Index):
{'breath': 0, 'cough': 1, 'crying': 2, 'laugh': 3, 'screaming': 4, 'sneeze': 5, 'yawn': 6}


Epoch 1/10:  11%|█         | 85/787 [06:18<52:09,  4.46s/it]


KeyboardInterrupt: 

In [23]:
import os
DATASET_PATH = r"C:\Users\Harsh\Desktop\Audio Recognition Project\dataset"
print(os.listdir(DATASET_PATH))


['metadata of test set.csv', 'metadata of train set .csv', 'test', 'train', 'youtube ID vs link .TXT']


In [9]:
model = Cnn14(
    sample_rate=32000,
    window_size=1024,
    hop_size=320,
    mel_bins=64,
    fmin=50,
    fmax=14000,
    classes_num=7
)
checkpoint = torch.load(r"path_to\Wavegram_Logmel_Cnn14_mAP=0.439.pth", map_location=DEVICE)
model.load_state_dict(checkpoint["model"] if "model" in checkpoint else checkpoint)
model.to(DEVICE)


NameError: name 'Cnn14' is not defined

In [None]:
MODEL_TYPE="Transfer_Cnn14"
CHECKPOINT_PATH="Cnn14_mAP=0.431.pth"
CUDA_VISIBLE_DEVICES=0 python3 pytorch/finetune_template.py train \
    --sample_rate=32000 \
    --window_size=1024 \
    --hop_size=320 \
    --mel_bins=64 \
    --fmin=50 \
    --fmax=14000 \
    --model_type=$MODEL_TYPE \
    --pretrained_checkpoint_path=$CHECKPOINT_PATH \
    --cuda


SyntaxError: invalid syntax (725903655.py, line 3)