In [1]:
!pip install kaggle
!pip install librosa
!pip install torch
!pip install scikit-learn
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [2]:
import kagglehub

# Download the latest version of the GTZAN genre dataset and remember the
# folder it was placed in; later cells build DATASET_PATH from `path`.
GTZAN_DATASET_HANDLE = "andradaolteanu/gtzan-dataset-music-genre-classification"
path = kagglehub.dataset_download(GTZAN_DATASET_HANDLE)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/andradaolteanu/gtzan-dataset-music-genre-classification?dataset_version_number=1...


100%|██████████| 1.21G/1.21G [00:07<00:00, 164MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/andradaolteanu/gtzan-dataset-music-genre-classification/versions/1


In [4]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [5]:
import os
import librosa
import numpy as np
import torch
from datetime import datetime
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
from sklearn.model_selection import ParameterGrid
# from torchsummary import summary
from torchinfo import summary
# Silence library warnings globally so the long data-loading and training logs
# stay readable. NOTE: this also hides deprecation warnings.
warnings.filterwarnings("ignore")

# Seed the NumPy and PyTorch generators so weight initialisation, dropout and
# DataLoader shuffling are repeatable across "Restart & Run All" (as far as the
# compute backend is deterministic).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# CUDA_VISIBLE_DEVICES is only read and reported here, never modified.
gpu_index = os.environ.get('CUDA_VISIBLE_DEVICES')
if gpu_index:
    print(f"Using SLURM-assigned GPU(s): {gpu_index}")
# All models and batches are moved to this device by the training code.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Folder with one sub-directory of audio files per genre, located under the
# dataset root (`path`) produced by the download cell. The trailing '' keeps
# the trailing path separator the previous string-concatenated value had.
DATASET_PATH = os.path.join(path, 'Data', 'genres_original', '')

Using device: cuda


In [7]:
def _fit_to_shape(spectrogram, target_shape):
    """Zero-pad (bottom/right) and then crop a 2-D array to exactly `target_shape`."""
    pad_rows = max(target_shape[0] - spectrogram.shape[0], 0)
    pad_cols = max(target_shape[1] - spectrogram.shape[1], 0)
    if pad_rows or pad_cols:
        spectrogram = np.pad(spectrogram, ((0, pad_rows), (0, pad_cols)), mode='constant')
    return spectrogram[:target_shape[0], :target_shape[1]]


def _file_to_spectrograms(file_path, audio_data, sample_rate, target_shape,
                          chunk_duration, overlap_duration):
    """Cut one waveform into overlapping chunks and return one mel spectrogram per full chunk."""
    chunk_samples = int(chunk_duration * sample_rate)
    # Distance between the starts of two consecutive chunks.
    hop_samples = chunk_samples - int(overlap_duration * sample_rate)
    num_chunks = int(np.ceil((len(audio_data) - chunk_samples) / hop_samples)) + 1
    spectrograms = []
    for i in range(num_chunks):
        start = i * hop_samples
        chunk = audio_data[start:start + chunk_samples]
        if len(chunk) < chunk_samples:
            # The tail of a recording rarely fills a whole chunk; drop it.
            print(f"Skipping incomplete chunk from file: {file_path}")
            continue
        mel_spectrogram = librosa.feature.melspectrogram(y=chunk, sr=sample_rate)
        mel_spectrogram = _fit_to_shape(mel_spectrogram, target_shape)
        # Leading singleton axis is the channel dimension expected by Conv2d.
        spectrograms.append(np.expand_dims(mel_spectrogram, axis=0))
    return spectrograms


def load_mel_spectrogram(data_dir, classes, target_shape=(150,150),
                         chunk_duration=4, overlap_duration=2):
    """Load every ``.wav`` file of each class and turn it into mel-spectrogram chunks.

    Each recording is loaded at its native sample rate, cut into windows of
    ``chunk_duration`` seconds that overlap by ``overlap_duration`` seconds, and
    every complete window is converted with ``librosa.feature.melspectrogram``
    and zero-padded / cropped to ``target_shape``.

    Files are visited in sorted order so the returned arrays (and therefore any
    seeded split of them) do not depend on the filesystem's listing order.
    Entries that are not ``.wav`` files are ignored. A file that cannot be
    loaded, is empty, or fails during feature extraction is reported and
    skipped; it never aborts the whole load.

    Args:
        data_dir: Directory containing one sub-directory per class.
        classes: Sequence of sub-directory names; the index of a name is the
            integer label assigned to its chunks.
        target_shape: ``(rows, cols)`` of every returned spectrogram.
        chunk_duration: Window length in seconds (must be positive).
        overlap_duration: Overlap between consecutive windows in seconds
            (must satisfy ``0 <= overlap_duration < chunk_duration``).

    Returns:
        ``(data, labels)`` as NumPy arrays. ``data`` has shape
        ``(n_chunks, 1, rows, cols)`` and ``labels`` has shape ``(n_chunks,)``;
        both are empty arrays when no chunk was produced.

    Raises:
        ValueError: If the chunk/overlap durations are inconsistent.
        OSError: If a class directory cannot be listed.
    """
    if chunk_duration <= 0 or not 0 <= overlap_duration < chunk_duration:
        raise ValueError(
            "chunk_duration must be positive and overlap_duration must satisfy "
            "0 <= overlap_duration < chunk_duration"
        )

    data = []
    labels = []
    for i_class, class_name in enumerate(classes):
        class_dir = os.path.join(data_dir, class_name)
        for filename in sorted(os.listdir(class_dir)):
            if not filename.endswith('.wav'):
                continue
            file_path = os.path.join(class_dir, filename)
            try:
                audio_data, sample_rate = librosa.load(file_path, sr=None)
            except Exception as e:
                print(f"Error loading file {file_path}: {e}")
                continue
            if len(audio_data) == 0:
                print(f"Skipping empty or corrupted file: {file_path}")
                continue
            try:
                spectrograms = _file_to_spectrograms(
                    file_path, audio_data, sample_rate, target_shape,
                    chunk_duration, overlap_duration)
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
                continue
            data.extend(spectrograms)
            labels.extend([i_class] * len(spectrograms))
    return np.array(data), np.array(labels)


def load_data():
    """Load mel-spectrogram chunks and integer labels for the ten GTZAN genres.

    The position of a genre in the list below is the class id given to its
    chunks by ``load_mel_spectrogram``.

    Returns:
        The ``(data, labels)`` pair obtained from ``load_mel_spectrogram`` for
        the folders under ``DATASET_PATH``.
    """
    genre_names = [
        'blues', 'classical', 'country', 'disco', 'hiphop',
        'jazz', 'metal', 'pop', 'reggae', 'rock',
    ]
    features, targets = load_mel_spectrogram(DATASET_PATH, genre_names)
    return features, targets

class GenreDataset(Dataset):
    """In-memory dataset that pairs feature arrays with integer class labels."""

    def __init__(self, X, y):
        """Convert both inputs to tensors once, at construction time.

        Args:
            X: Array-like of features; stored as a float32 tensor in ``self.X``.
            y: Array-like of class ids; stored as an int64 tensor in ``self.y``.
        """
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.int64)

    def __len__(self):
        """Number of samples, i.e. the size of the first axis of ``self.X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target


In [8]:
X, y = load_data()
num_classes = 10
print(f"X shape : {X.shape}, \n Y shape : {y.shape}")
# Split data: hold out 20% of the chunks, stratified by genre so the train and
# test sets keep the same class proportions.
# NOTE: the split is made over chunks, not over recordings. Chunks cut from one
# recording overlap each other and can land on both sides of the split, so the
# held-out accuracy is optimistic. TODO: split by source file instead.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
train_dataset = GenreDataset(X_train, y_train)
test_dataset = GenreDataset(X_test, y_test)


Skipping incomplete chunk from file: /root/.cache/kagglehub/datasets/andradaolteanu/gtzan-dataset-music-genre-classification/versions/1/Data/genres_original/blues/blues.00081.wav
Skipping incomplete chunk from file: /root/.cache/kagglehub/datasets/andradaolteanu/gtzan-dataset-music-genre-classification/versions/1/Data/genres_original/blues/blues.00048.wav
Skipping incomplete chunk from file: /root/.cache/kagglehub/datasets/andradaolteanu/gtzan-dataset-music-genre-classification/versions/1/Data/genres_original/blues/blues.00041.wav
Skipping incomplete chunk from file: /root/.cache/kagglehub/datasets/andradaolteanu/gtzan-dataset-music-genre-classification/versions/1/Data/genres_original/blues/blues.00008.wav
Skipping incomplete chunk from file: /root/.cache/kagglehub/datasets/andradaolteanu/gtzan-dataset-music-genre-classification/versions/1/Data/genres_original/blues/blues.00011.wav
Skipping incomplete chunk from file: /root/.cache/kagglehub/datasets/andradaolteanu/gtzan-dataset-music-g

In [9]:
# @title Define the Model architecture


In [10]:
class GenreClassifier(nn.Module):
    """Three conv/batch-norm/max-pool stages, a bidirectional LSTM and an MLP head.

    The convolutional feature map of each sample is flattened into ONE vector
    and fed to the LSTM as a sequence of length 1, so the recurrent layer does
    not iterate over time frames; it acts as a gated projection of the
    flattened features.

    Args:
        num_classes: Number of output logits.
        lstm_units: Hidden size of each LSTM direction.
        dropout_rate: Dropout probability used after ``fc1`` and ``fc2``.
        input_shape: ``(height, width)`` of the single-channel input. It only
            determines the LSTM input size; the default ``(150, 150)`` gives
            41472 features, matching the value that used to be hard-coded.
    """

    def __init__(self, num_classes, lstm_units=128, dropout_rate=0.5, input_shape=(150, 150)):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3))
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(kernel_size=(2, 2), padding=(1, 1))
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3))
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(kernel_size=(2, 2), padding=(1, 1))
        self.conv3 = nn.Conv2d(64, 128, kernel_size=(3, 3))
        self.bn3 = nn.BatchNorm2d(128)
        self.pool3 = nn.MaxPool2d(kernel_size=(2, 2), padding=(1, 1))
        self.flatten = nn.Flatten()
        # LSTM input size = channels * height * width of the last pooled map.
        self.lstm = nn.LSTM(self._flattened_size(input_shape), lstm_units,
                            batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(lstm_units*2, 128)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_classes)

    @staticmethod
    def _flattened_size(input_shape):
        """Return the number of features left after the three conv/pool stages.

        Computed arithmetically (no dummy forward pass) so that constructing
        the model neither touches batch-norm statistics nor consumes random
        numbers.

        Raises:
            ValueError: If ``input_shape`` is too small for the 3x3 convolutions.
        """
        height, width = input_shape
        for _ in range(3):
            # 3x3 convolution without padding removes one pixel on every side.
            height, width = height - 2, width - 2
            if height < 1 or width < 1:
                raise ValueError(f"input_shape {tuple(input_shape)} is too small for the CNN stack")
            # 2x2 max-pool, stride 2, padding 1: floor((n + 2 - 2) / 2) + 1.
            height, width = height // 2 + 1, width // 2 + 1
        return 128 * height * width

    def forward(self, x):
        """Map a ``(batch, 1, height, width)`` input to ``(batch, num_classes)`` logits."""
        x = self.pool1(torch.relu(self.bn1(self.conv1(x))))
        x = self.pool2(torch.relu(self.bn2(self.conv2(x))))
        x = self.pool3(torch.relu(self.bn3(self.conv3(x))))
        # (batch, features) -> (batch, 1, features): a one-step sequence.
        x = self.flatten(x).unsqueeze(1)
        x, _ = self.lstm(x)
        # Output of the last (and only) time step, both directions concatenated.
        x = self.dropout(torch.relu(self.fc1(x[:, -1, :])))
        x = self.dropout(torch.relu(self.fc2(x)))
        x = self.fc3(x)
        return x


In [19]:
def save_model(model, params, epoch, val_acc, save_dir="saved_models"):
    """Write a checkpoint of ``model`` whose file name encodes the run settings.

    Args:
        model: Module whose ``state_dict()`` is stored.
        params: Hyper-parameter dict; must contain ``'lstm_units'``,
            ``'dropout_rate'``, ``'learning_rate'`` and ``'batch_size'``
            (used in the file name) and is stored in the checkpoint as-is.
        epoch: Epoch number recorded in the checkpoint and the file name.
        val_acc: Validation accuracy recorded in the checkpoint and, rounded
            to four decimals, in the file name.
        save_dir: Target directory; created (including parents) when missing.

    Returns:
        The path of the checkpoint file that was written.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_dir, exist_ok=True)
    filename = f"model_lstm{params['lstm_units']}_dropout{params['dropout_rate']}_lr{params['learning_rate']}_batch{params['batch_size']}_epochs{epoch}_valacc{val_acc:.4f}.pth"
    filepath = os.path.join(save_dir, filename)
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'params': params,
        'val_acc': val_acc,
    }, filepath)
    print(f"Model saved to {filepath}")
    return filepath


# Training and Evaluating the model

In [12]:
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs, params, save_best=True, restore_best=True):
    """Train ``model`` and track the epoch with the highest validation accuracy.

    After every epoch the model is evaluated on ``val_loader``, the summed
    validation loss is passed to ``scheduler.step`` and a one-line progress
    report is printed.

    Args:
        model: Network to train; moved to the module-level ``device``.
        train_loader: Loader yielding ``(inputs, targets)`` training batches.
        val_loader: Loader yielding ``(inputs, targets)`` validation batches.
        criterion: Loss function called as ``criterion(outputs, targets)``.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Scheduler whose ``step`` accepts the validation loss.
        epochs: Number of passes over ``train_loader``.
        params: Hyper-parameter dict forwarded to ``save_model``.
        save_best: When true, write a checkpoint each time the validation
            accuracy improves.
        restore_best: When true, keep an in-memory copy of the best weights
            (one extra copy of the parameters) and load it back before
            returning, so the returned model matches the returned accuracy.
            Pass ``False`` to get the last-epoch weights instead.

    Returns:
        ``(best_val_acc, model)``. The best accuracy is tracked regardless of
        ``save_best``.
    """
    model.to(device)
    best_val_acc = 0  # Track the best validation accuracy
    best_state = None  # Weights of the best epoch (only when restore_best)
    for epoch in range(epochs):
        model.train()
        train_loss, train_correct = 0, 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_correct += (outputs.argmax(1) == y_batch).sum().item()

        val_loss, val_correct = 0, 0
        model.eval()
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()
                val_correct += (outputs.argmax(1) == y_batch).sum().item()

        val_acc = val_correct / len(val_loader.dataset)
        scheduler.step(val_loss)
        print(f"Epoch {epoch + 1}/{epochs}: "
              f"Train Loss = {train_loss / len(train_loader):.4f}, "
              f"Train Acc = {train_correct / len(train_loader.dataset):.4f}, "
              f"Val Loss = {val_loss / len(val_loader):.4f}, "
              f"Val Acc = {val_acc:.4f},")

        # Track the best epoch independently of whether it is written to disk.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            if restore_best:
                # clone() so later optimizer steps cannot alter the snapshot.
                best_state = {name: tensor.detach().clone()
                              for name, tensor in model.state_dict().items()}
            if save_best:
                save_model(model, params, epoch + 1, val_acc)

    if best_state is not None:
        model.load_state_dict(best_state)

    return best_val_acc, model  # Best validation accuracy and the trained model


**Model Training**

In [16]:
from sklearn.metrics import classification_report
# Genre names in class-id order (same order as used when loading the data).
classes = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']

best_acc = 0
best_params = None

# Hyper-parameters of this run.
params = {
    'epochs': 50,
    'lstm_units': 128,
    'dropout_rate': 0.5,
    'learning_rate': 0.0005,
    'batch_size': 64
}

# NOTE: the held-out split is used both to pick the best epoch during training
# and for the final report below, so the reported numbers are optimistic.
train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)

model = GenreClassifier(num_classes=num_classes, lstm_units=params['lstm_units'], dropout_rate=params['dropout_rate'])
print("Model summary : ")
print(model)
print("Proper model summary : ")
# Inside a notebook torchinfo stays silent unless summary() is the cell's last
# expression; verbose=1 forces the table to be printed at this point.
summary(model, input_size=(16, 1, 150, 150), verbose=1)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'], weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10)

val_acc, model = train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=params['epochs'], params=params)

print(f"\n***********\nParams: {params}, Val Accuracy: {val_acc:.4f}\n************")

# Re-evaluate the returned model on the held-out loader to collect the
# per-sample predictions needed for the per-genre report.
val_loss, val_correct = 0, 0
all_preds = []
all_labels = []
model.eval()
with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        val_loss += loss.item()
        val_correct += (outputs.argmax(1) == y_batch).sum().item()
        all_preds.extend(outputs.argmax(1).cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())

val_acc = val_correct / len(val_loader.dataset)
print(f"\n\n\n\nValidation Accuracy: {val_acc:.4f}")
print("Classification Report:")
# Explicit labels keep the report aligned with `classes` even when a genre is
# absent from both the true and the predicted labels.
print(classification_report(all_labels, all_preds, labels=list(range(len(classes))), target_names=classes))

Model summary : 
GenreClassifier(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(1, 1), dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(1, 1), dilation=1, ceil_mode=False)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(1, 1), dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (lstm): LSTM(41472, 128, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (dropout): Dropout(p=0.5, inplace=Fal

In [20]:
from sklearn.metrics import classification_report
# Genre names in class-id order (same order as used when loading the data).
classes = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']

best_acc = 0
best_params = None

# Hyper-parameters of this run.
params = {
    'epochs': 50,
    'lstm_units': 256,
    'dropout_rate': 0.5,
    'learning_rate': 0.0005,
    'batch_size': 32
}

# NOTE: the held-out split is used both to pick the best epoch during training
# and for the final report below, so the reported numbers are optimistic.
train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)

model = GenreClassifier(num_classes=num_classes, lstm_units=params['lstm_units'], dropout_rate=params['dropout_rate'])
print("Model summary : ")
print(model)
print("Proper model summary : ")
# Inside a notebook torchinfo stays silent unless summary() is the cell's last
# expression; verbose=1 forces the table to be printed at this point.
summary(model, input_size=(16, 1, 150, 150), verbose=1)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'], weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10)

val_acc, model = train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=params['epochs'], params=params)

print(f"\n***********\nParams: {params}, Val Accuracy: {val_acc:.4f}\n************")

# Re-evaluate the returned model on the held-out loader to collect the
# per-sample predictions needed for the per-genre report.
val_loss, val_correct = 0, 0
all_preds = []
all_labels = []
model.eval()
with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        val_loss += loss.item()
        val_correct += (outputs.argmax(1) == y_batch).sum().item()
        all_preds.extend(outputs.argmax(1).cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())

val_acc = val_correct / len(val_loader.dataset)
print(f"\n\n\n\nValidation Accuracy: {val_acc:.4f}")
print("Classification Report:")
# Explicit labels keep the report aligned with `classes` even when a genre is
# absent from both the true and the predicted labels.
print(classification_report(all_labels, all_preds, labels=list(range(len(classes))), target_names=classes))

Model summary : 
GenreClassifier(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(1, 1), dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(1, 1), dilation=1, ceil_mode=False)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(1, 1), dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (lstm): LSTM(41472, 256, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=512, out_features=128, bias=True)
  (dropout): Dropout(p=0.5, inplace=Fal

In [21]:
from sklearn.metrics import classification_report
# Genre names in class-id order (same order as used when loading the data).
classes = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']

best_acc = 0
best_params = None

# Hyper-parameters of this run.
params = {
    'epochs': 50,
    'lstm_units': 128,
    'dropout_rate': 0.5,
    'learning_rate': 0.0001,
    'batch_size': 32
}

# NOTE: the held-out split is used both to pick the best epoch during training
# and for the final report below, so the reported numbers are optimistic.
train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)

model = GenreClassifier(num_classes=num_classes, lstm_units=params['lstm_units'], dropout_rate=params['dropout_rate'])
print("Model summary : ")
print(model)
print("Proper model summary : ")
# Inside a notebook torchinfo stays silent unless summary() is the cell's last
# expression; verbose=1 forces the table to be printed at this point.
summary(model, input_size=(16, 1, 150, 150), verbose=1)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'], weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10)

val_acc, model = train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=params['epochs'], params=params)

print(f"\n***********\nParams: {params}, Val Accuracy: {val_acc:.4f}\n************")

# Re-evaluate the returned model on the held-out loader to collect the
# per-sample predictions needed for the per-genre report.
val_loss, val_correct = 0, 0
all_preds = []
all_labels = []
model.eval()
with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        val_loss += loss.item()
        val_correct += (outputs.argmax(1) == y_batch).sum().item()
        all_preds.extend(outputs.argmax(1).cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())

val_acc = val_correct / len(val_loader.dataset)
print(f"\n\n\n\nValidation Accuracy: {val_acc:.4f}")
print("Classification Report:")
# Explicit labels keep the report aligned with `classes` even when a genre is
# absent from both the true and the predicted labels.
print(classification_report(all_labels, all_preds, labels=list(range(len(classes))), target_names=classes))

Model summary : 
GenreClassifier(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(1, 1), dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(1, 1), dilation=1, ceil_mode=False)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(1, 1), dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (lstm): LSTM(41472, 128, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (dropout): Dropout(p=0.5, inplace=Fal