Lets Do it

In [1]:
pip install librosa torch torchvision torchaudio numpy pandas matplotlib


Note: you may need to restart the kernel to use updated packages.


DataSet preprocessing

In [1]:
import os
import librosa
import torch    
import numpy as np
import warnings
from torch.utils.data import Dataset, DataLoader

# GTZAN Path (Update this to your dataset location)
DATASET_PATH = "/media/kuzhalogi/Storage/Workspace/Pakari/gtzan_dataset/Data/genres_original"

# Genre Labels
GENRES = ['blues', 'classical', 'country', 'disco', 'hiphop', 
          'jazz', 'metal', 'pop', 'reggae', 'rock']


In [2]:

# Function to Load and Convert Audio to Mel Spectrogram
def extract_mel_spectrogram(file_path, sr=22050, n_mels=128, hop_length=512, target_len=1293):
    try: 
        y, _ = librosa.load(file_path, sr=sr)  # Load audio
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=hop_length)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)  # Convert to dB scale

        # Pad or truncate spectrogram to the target length
        mel_spec_db = mel_spec_db[:, :target_len]  # Truncate
        mel_spec_db = np.pad(mel_spec_db, ((0, 0), (0, target_len - mel_spec_db.shape[1])), mode='constant')  # Pad

        return mel_spec_db  # Return as NumPy array

    except Exception as e:
        warnings.warn(f"Skipping file {file_path} due to error: {e}")
        return np.zeros((n_mels, target_len))  # Return zero spectrogram



In [3]:

# Custom PyTorch Dataset
class GTZANDataset(Dataset):
    def __init__(self, dataset_path, genres, transform=None,target_len=1293):
        self.dataset_path = dataset_path
        self.genres = genres
        self.transform = transform
        self.target_len = target_len
        self.data = []

        # Load all files
        for genre_idx, genre in enumerate(genres):
            genre_path = os.path.join(dataset_path, genre)
            for file in os.listdir(genre_path):
                if file.endswith(".wav"):
                    file_path = os.path.join(genre_path, file)
                    self.data.append((file_path, genre_idx))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        file_path, label = self.data[idx]
        mel_spec = extract_mel_spectrogram(file_path, target_len=self.target_len)  # Convert audio to Mel Spectrogram

        # Convert to Tensor
        mel_spec = torch.tensor(mel_spec, dtype=torch.float32).unsqueeze(0)  # Add channel dim

        return mel_spec, label


In [4]:

# Create Dataset and DataLoader
dataset = GTZANDataset(DATASET_PATH, GENRES)
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)


In [5]:

# Test the dataset
sample_data, sample_label = next(iter(train_loader))
print(f"Sample Shape: {sample_data.shape}, Label: {sample_label.shape}")
print( )

Sample Shape: torch.Size([16, 1, 128, 1293]), Label: torch.Size([16])



Training CRNN with GTZAN dataset

In [6]:
import torch
import torch.nn as nn

class CRNN(nn.Module):
    def __init__(self, num_classes=10, input_channels=1, hidden_size=128, num_lstm_layers=2):
        super(CRNN, self).__init__()

        # Convolutional layers
        self.conv1 = nn.Conv2d(input_channels, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.conv3 = nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        
        # Batch Normalization after each Conv Layer
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
        
        # Maxpooling after Conv Layers
        self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))

        # LSTM Layer
        self.lstm = nn.LSTM(input_size=20608, hidden_size=hidden_size, num_layers=num_lstm_layers, batch_first=True)

        # Fully connected layer to output classification
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Pass through Conv Layers + MaxPool + BatchNorm
        x = self.pool(torch.relu(self.bn1(self.conv1(x))))
        x = self.pool(torch.relu(self.bn2(self.conv2(x))))
        x = self.pool(torch.relu(self.bn3(self.conv3(x))))

        print(f"before --> {x.shape}")
        # [16,128,16,161]
        # Reshape for LSTM (batch_size, seq_len, feature_size)
        x = x.reshape(x.shape[0], x.shape[2], -1)
        # [16, 16, 128 * 161] -> [16, 16, 20608]

        # print(f"after reshaping{x.shape}")
        
        # Pass through LSTM
        x, (hn, cn) = self.lstm(x)

        # Take the last hidden state of the LSTM for classification
        x = hn[-1]

        # Final classification
        x = self.fc(x)

        return x


In [7]:
import torch.optim as optim

# Initialize Model, Loss, Optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CRNN(num_classes=10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [8]:

# Training Loop
def train_model(model, train_loader, epochs=10):
    model.train()
    
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            print(outputs.shape)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")


In [9]:

# Train the Model
train_model(model, train_loader, epochs=10)


before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([1

  y, _ = librosa.load(file_path, sr=sr)  # Load audio
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([16, 10])
before --> torch.Size([16, 128, 16, 161])
torch.Size([1

KeyboardInterrupt: 

Evaluate on test data

In [None]:
# from sklearn.metrics import accuracy_score

# def evaluate_model(model, test_loader):
#     model.eval()
#     all_preds, all_labels = [], []

#     with torch.no_grad():
#         for batch in test_loader:
#             inputs, labels = batch
#             inputs, labels = inputs.to(device), labels.to(device)

#             outputs = model(inputs)
#             _, preds = torch.max(outputs, 1)

#             all_preds.extend(preds.cpu().numpy())
#             all_labels.extend(labels.cpu().numpy())

#     accuracy = accuracy_score(all_labels, all_preds)
#     print(f"Test Accuracy: {accuracy * 100:.2f}%")

# # Call Evaluation Function
# evaluate_model(model, train_loader)
