# Task 3: custom made CNN

Tito Scutari

<a href="https://colab.research.google.com/github/inspektral/asmc-genre-classification/blob/main/task3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


In [11]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import (Conv2D, MaxPooling2D, Flatten, Dense, 
                                     Dropout, BatchNormalization, Input)
from sklearn.preprocessing import LabelEncoder


In [3]:
SR = 22050           # sample rate
DURATION = 30        # duration in seconds (or adjust to fixed length)
SAMPLES_PER_TRACK = SR * DURATION
N_MELS = 128 

In [4]:
# retrieve data

data = []

for root, dirs, files in os.walk('data/genres_original'):
    for file in files:
        if file.endswith('.wav'):
            data.append({'path': os.path.join(root, file), 'genre': root.split('/')[-1]})


df = pd.DataFrame(data)

display(df)



Unnamed: 0,path,genre
0,data/genres_original/rock/rock.00036.wav,rock
1,data/genres_original/rock/rock.00001.wav,rock
2,data/genres_original/rock/rock.00012.wav,rock
3,data/genres_original/rock/rock.00065.wav,rock
4,data/genres_original/rock/rock.00049.wav,rock
...,...,...
995,data/genres_original/pop/pop.00094.wav,pop
996,data/genres_original/pop/pop.00025.wav,pop
997,data/genres_original/pop/pop.00027.wav,pop
998,data/genres_original/pop/pop.00053.wav,pop


In [5]:
def extract_mel_spec(file_path, sr=SR, n_mels=N_MELS, duration=DURATION):
    signal, _ = librosa.load(file_path, sr=sr, duration=duration)

    if len(signal) < SAMPLES_PER_TRACK:
        pad_width = SAMPLES_PER_TRACK - len(signal)
        signal = np.pad(signal, (0, pad_width), mode='constant')
    else:
        signal = signal[:SAMPLES_PER_TRACK]

    mel = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel, ref=np.max)

    mel_norm = (mel_db - mel_db.min()) / (mel_db.max() - mel_db.min())
    return mel_norm

features = []
labels = []
for idx, row in df.iterrows():
    try:
        mel_spec = extract_mel_spec(row['path'])
        features.append(np.expand_dims(mel_spec, axis=-1))
        labels.append(row['genre'])
    except Exception as e:
        print(f"Error processing {row['path']}: {e}")


X = np.array(features)
y = np.array(labels)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = to_categorical(y)


np.save('data/X.npy', X)
np.save('data/y.npy', y)
np.save('data/label_encoder.npy', label_encoder.classes_)
print('Data saved')


  signal, _ = librosa.load(file_path, sr=sr, duration=duration)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing data/genres_original/jazz/jazz.00054.wav: 
Data saved


In [6]:
# load data

X = np.load('data/X.npy')
y = np.load('data/y.npy')
label_encoder = LabelEncoder()
label_encoder.classes_ = np.load('data/label_encoder.npy')
print('Data loaded')


Data loaded


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Sequential([
    Input(shape=X.shape[1:]),
    Conv2D(32, (3,3), activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling2D((2,2)),
    Dropout(0.25),

    Conv2D(64, (3,3), activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling2D((2,2)),
    Dropout(0.25),

    Conv2D(128, (3,3), activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling2D((2,2)),
    Dropout(0.25),

    Flatten(),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(y.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

model.fit(X_train, y_train, epochs=50, batch_size=16, validation_data=(X_test, y_test))

I0000 00:00:1741096572.241589   14486 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1741096572.762753   14486 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1741096572.765204   14486 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1741096572.770571   14486 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

Epoch 1/50


I0000 00:00:1741096577.699750   14713 service.cc:146] XLA service 0x73dc780090b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1741096577.699780   14713 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2025-03-04 14:56:17.774060: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-03-04 14:56:18.148998: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 90300


[1m 2/50[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4s[0m 99ms/step - accuracy: 0.1562 - loss: 3.4255  

I0000 00:00:1741096594.518016   14713 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m49/50[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 100ms/step - accuracy: 0.2369 - loss: 2.5281

2025-03-04 14:56:47.958506: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng0{} for conv (f32[64,32,3,3]{3,2,1,0}, u8[0]{0}) custom-call(f32[15,32,64,646]{3,2,1,0}, f32[15,64,64,646]{3,2,1,0}), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false} is taking a while...
2025-03-04 14:56:48.103142: E external/local_xla/xla/service/slow_operation_alarm.cc:133] The operation took 1.144724806s
Trying algorithm eng0{} for conv (f32[64,32,3,3]{3,2,1,0}, u8[0]{0}) custom-call(f32[15,32,64,646]{3,2,1,0}, f32[15,64,64,646]{3,2,1,0}), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_ope

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 482ms/step - accuracy: 0.2389 - loss: 2.5157 - val_accuracy: 0.0750 - val_loss: 245.1147
Epoch 2/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 104ms/step - accuracy: 0.5495 - loss: 1.3459 - val_accuracy: 0.0600 - val_loss: 156.1750
Epoch 3/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 110ms/step - accuracy: 0.7946 - loss: 0.7080 - val_accuracy: 0.0600 - val_loss: 41.5774
Epoch 4/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 104ms/step - accuracy: 0.8342 - loss: 0.5685 - val_accuracy: 0.0900 - val_loss: 23.7437
Epoch 5/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 104ms/step - accuracy: 0.9573 - loss: 0.2078 - val_accuracy: 0.0600 - val_loss: 8.7470
Epoch 6/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 120ms/step - accuracy: 0.9845 - loss: 0.0975 - val_accuracy: 0.1300 - val_loss: 7.1401
Epoch 7/50
[1m50/50[0m [32m━━

<keras.src.callbacks.history.History at 0x73de2df7ea40>

In [9]:
# save model
model.save('model.h5')
print('Model saved')



Model saved


In [12]:
# load model
model = load_model('model.h5')
print('Model loaded')

# evaluate model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc}')




Model loaded


2025-03-04 15:04:55.022473: W external/local_tsl/tsl/framework/bfc_allocator.cc:291] Allocator (GPU_0_bfc) ran out of memory trying to allocate 682.19MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-03-04 15:04:55.022549: W external/local_tsl/tsl/framework/bfc_allocator.cc:291] Allocator (GPU_0_bfc) ran out of memory trying to allocate 682.19MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-03-04 15:04:55.022573: W external/local_tsl/tsl/framework/bfc_allocator.cc:291] Allocator (GPU_0_bfc) ran out of memory trying to allocate 682.19MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-03-04 15:04:55.022590: W external/local_tsl/tsl/framewor

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 164ms/step - accuracy: 0.5580 - loss: 4.5670
Test accuracy: 0.5350000262260437


# Considerations

The CNN gets better with training, but the performance is quite bad, around 0.5. I think we are also overfitting a little bit since the training accuracy is almost 1. I think that on a small dataset like this it is much better to use a pre-trained model and fine-tune it or to train on more meaningful features.