Project's objective: Music Genre Classification
1. Dataset: GTZAN https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification/data
2. Model: MLP (2-layer)

In [2]:
# Usual Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import IPython.display as ipd
import warnings
warnings.filterwarnings('ignore')
import os
import glob
import librosa
import librosa.display
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [3]:
import os

# Inspect the downloaded dataset and locate the "genres_original" folder.
path = './GZTAN'
print("Contents of dataset path:")
print(os.listdir(path))

# Bind the name up front so a missing folder leaves `genres_dir` as None
# instead of undefined (which would surface later as a NameError).
genres_dir = None

# Try the expected location first.
preferred = os.path.join(path, 'genres_original')
if os.path.exists(preferred):
    genres_dir = preferred
    print("\ngenres_original found:")
    print(os.listdir(genres_dir))
else:
    print(f"\ngenres_original NOT found at {preferred}")
    # Fall back to a recursive search below the dataset root.
    for root, dirs, files in os.walk(path):
        if 'genres_original' in dirs:
            genres_dir = os.path.join(root, 'genres_original')
            print(f"Found genres_original at: {genres_dir}")
            print(os.listdir(genres_dir)[:10])  # first 10 items
            break
    else:
        # The walk finished without a match: report it rather than stay silent.
        print(f"genres_original not found anywhere under {path}")

Contents of dataset path:
['features_30_sec.csv', 'features_3_sec.csv', 'genres_original', 'images_original']

genres_original found:
['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']


PREPROCESSING

In [11]:
# Resolve the 3-second feature table: expected location first, then the first
# match found by walking the dataset directory.
csv_path = os.path.join(path, 'features_3_sec.csv')

if not os.path.exists(csv_path):
    candidates = (
        os.path.join(root, 'features_3_sec.csv')
        for root, _dirs, files in os.walk(path)
        if 'features_3_sec.csv' in files
    )
    # Keep the original (missing) path when nothing is found so the check
    # below raises with a clear message.
    csv_path = next(candidates, csv_path)

if not os.path.exists(csv_path):
    raise FileNotFoundError(f"CSV not found under {path}. Check dataset download.")

# Drop the first column by position.
# NOTE(review): presumably an index/identifier column - confirm against the CSV header.
data = pd.read_csv(csv_path).iloc[:, 1:]
data.head()

Unnamed: 0,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,66149,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,3714.560359,...,39.687145,-3.24128,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,blues
1,66149,0.343065,0.086147,0.112699,0.00145,1816.693777,90525.690866,2010.051501,65671.875673,3869.682242,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.03083,5.784063,59.943081,blues
2,66149,0.346815,0.092243,0.132003,0.00462,1788.539719,111407.437613,2084.565132,75124.921716,3997.63916,...,67.336563,-1.76861,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,blues
3,66149,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,3568.300218,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,blues
4,66149,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,3469.992864,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,blues


In [54]:
from sklearn import preprocessing

# Target vector (genre label) and feature matrix (every other column).
y = data['label']
X = data.drop(columns='label')

# Min-max normalisation so that all features share the same scale.
# NOTE(review): in notebook order the scaler is fitted on the full dataset,
# i.e. before the train/test split, so test rows influence the scaling.
cols = X.columns
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(X)

# Wrap the scaled matrix in a DataFrame again to keep the column names.
X = pd.DataFrame(data=np_scaled, columns=cols)

# Per-feature minima / maxima learned by the scaler; print the feature count.
min_vals, max_vals = min_max_scaler.data_min_, min_max_scaler.data_max_
print(len(min_vals))

58


In [26]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

TRAINING

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix


# The first step must be a StandardScaler registered as "scaler": the export
# cell reads best_mlp.named_steps["scaler"].mean_ / .scale_, and the pipeline
# printed further down in this notebook shows ('scaler', StandardScaler()).
# The previous ("minmax", MinMaxScaler()) step made that lookup fail on a
# fresh run (KeyError, and MinMaxScaler has no mean_ attribute).
mlp_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("mlp", MLPClassifier(random_state=42))
])

# Hyper-parameter grid; keys use the "<step>__<param>" pipeline convention.
mlp_param_grid = {
    "mlp__hidden_layer_sizes": [(64,), (64, 32), (128, 64)],
    "mlp__activation": ["relu"],
    "mlp__alpha": [1e-4, 1e-3, 1e-2],
    "mlp__learning_rate_init": [1e-3, 5e-4],
    "mlp__max_iter": [200, 300, 350]
}

# 5-fold cross-validated grid search using all available cores.
mlp_grid = GridSearchCV(
    mlp_pipeline,
    mlp_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2
)

mlp_grid.fit(X_train, y_train)

print("Best MLP params:", mlp_grid.best_params_)

# Evaluate the best pipeline found by the search on the held-out test set.
best_mlp = mlp_grid.best_estimator_
y_pred_mlp = best_mlp.predict(X_test)

print("MLP (tuned) results")
print(classification_report(y_test, y_pred_mlp))
print(confusion_matrix(y_test, y_pred_mlp))

In [45]:
import joblib

# Persist the tuned pipeline (scaler + MLP) next to the notebook.
joblib.dump(value=best_mlp, filename="genre_mlp_model.pkl")

['genre_mlp_model.pkl']

In [58]:
# Persist the MinMaxScaler that was fitted on the feature table, then echo it.
joblib.dump(value=min_max_scaler, filename="gztan_minmax_scaler.pkl")
print(min_max_scaler)

MinMaxScaler()


In [46]:
def quantize_int8(arr):
    """
    Symmetric per-tensor int8 quantization.

    A single scale is chosen so that the largest absolute value maps to 127;
    ``q_arr * scale`` then approximates ``arr``.

    Parameters
    ----------
    arr : array-like
        Numeric values of any shape (NumPy arrays or plain sequences).

    Returns
    -------
    q_arr : numpy.ndarray
        int8 array with the same shape as ``arr``, values in [-127, 127].
    scale : float
        Dequantization factor. 1.0 when ``arr`` is empty or all zeros.

    Raises
    ------
    ValueError
        If ``arr`` contains NaN or infinite values (they have no meaningful
        int8 representation and would otherwise yield undefined integers).
    """
    values = np.asarray(arr)

    # np.max() raises on an empty array; an empty tensor quantizes to itself.
    if values.size == 0:
        return values.astype(np.int8), 1.0

    if not np.all(np.isfinite(values)):
        raise ValueError("quantize_int8: input contains NaN or infinite values")

    max_val = np.max(np.abs(values))
    scale = max_val / 127.0 if max_val != 0 else 1.0

    # The clip guards against rounding pushing a value just past +/-127, which
    # would wrap around when cast to int8.
    q = np.clip(np.round(values / scale), -127, 127).astype(np.int8)
    return q, scale


In [None]:
# Export the trained network and the scaler constants as a C header.
# NOTE(review): joblib.load unpickles arbitrary objects; only load artifacts
# that were produced by this notebook.
best_mlp = joblib.load("genre_mlp_model.pkl")
min_vals = min_max_scaler.data_min_.astype(np.float32)
max_vals = min_max_scaler.data_max_.astype(np.float32)

mlp_model = best_mlp.named_steps["mlp"]
mlp_scaler = best_mlp.named_steps["scaler"]

# One weight matrix and one bias vector per dense layer.
coefs = mlp_model.coefs_
intercepts = mlp_model.intercepts_
n_layers = len(coefs)  # number of weight matrices = hidden layers + output layer

# Quantize every weight matrix and bias vector with its own per-tensor scale.
q_coefs = []
s_coefs = []
q_intercepts = []
s_intercepts = []
for i in range(n_layers):
    qW, sW = quantize_int8(coefs[i])
    qb, sb = quantize_int8(intercepts[i])
    q_coefs.append(qW)
    s_coefs.append(sW)
    q_intercepts.append(qb)
    s_intercepts.append(sb)

# Constants of the pipeline's own scaler step.
mean = mlp_scaler.mean_.astype(np.float32)
scale = mlp_scaler.scale_.astype(np.float32)

with open("genre_mlp_int8.h", "w") as f:
    f.write("// INT8 MLP (multi-layer) for ESP32\n")
    f.write("#pragma once\n")
    f.write("#include <stdint.h>\n\n")

    # Defines
    f.write(f"#define N_FEATURES {coefs[0].shape[0]}\n")
    f.write(f"#define N_CLASSES {coefs[-1].shape[1]}\n")  # Correct: Use output layer size
    f.write(f"#define N_HIDDEN_LAYERS {n_layers - 1}\n")  # Number of hidden layers

    # Hidden layer sizes (array for dynamic access)
    hidden_sizes = [coefs[i].shape[1] for i in range(n_layers - 1)]  # Sizes of hidden layers
    f.write(f"static const int HIDDEN_SIZES[N_HIDDEN_LAYERS] = {{{', '.join(map(str, hidden_sizes))}}};\n\n")

    # Input scaler
    f.write("static const float INPUT_MEAN[N_FEATURES] = {" + ",".join(map(str, mean)) + "};\n")
    f.write("static const float INPUT_SCALE[N_FEATURES] = {" + ",".join(map(str, scale)) + "};\n\n")

    # MinMax scaler (from training)
    f.write("static const float GENRE_MIN[N_FEATURES] = {\n")
    for v in min_vals:
        f.write(f"  {v:.8f}f,\n")
    f.write("};\n\n")

    f.write("static const float GENRE_MAX[N_FEATURES] = {\n")
    for v in max_vals:
        f.write(f"  {v:.8f}f,\n")
    f.write("};\n\n")

    # Scales for weights and biases
    for i in range(n_layers):
        f.write(f"static const float W{i+1}_SCALE = {s_coefs[i]};\n")
        f.write(f"static const float B{i+1}_SCALE = {s_intercepts[i]};\n")

    f.write("\n")

    # Weights (2D arrays)
    for i in range(n_layers):
        in_size = coefs[i].shape[0]
        out_size = coefs[i].shape[1]
        layer_name = f"W{i+1}"
        f.write(f"static const int8_t {layer_name}[{in_size}][{out_size}] = {{\n")
        for row in q_coefs[i]:
            f.write("{" + ",".join(map(str, row.tolist())) + "},\n")
        f.write("};\n\n")

    # Biases (1D arrays)
    for i in range(n_layers):
        out_size = coefs[i].shape[1]
        layer_name = f"B{i+1}"
        # The closing literal is a plain string (not an f-string), so it must
        # contain a single "}"; "}}" there emitted "}};" and broke the header.
        f.write(f"static const int8_t {layer_name}[{out_size}] = {{" + ",".join(map(str, q_intercepts[i].tolist())) + "};\n\n")

print("Exported genre_mlp_int8.h with corrected N_CLASSES and multi-layer support")

# ...existing code...

Exported genre_mlp_int8.h with corrected N_CLASSES and multi-layer support


In [47]:
# Preprocess mp3 sample
import librosa

def extract_full_features(mp3_path, sr=22050, duration=3, feature_columns=None):
    """Extract one feature vector from an audio file, ordered like the training columns.

    Parameters
    ----------
    mp3_path : str or path-like
        Audio file handed to ``librosa.load``.
    sr : int
        Sampling rate passed to ``librosa.load`` and the feature extractors.
    duration : float
        Number of seconds of audio to load.
    feature_columns : iterable of str, optional
        Column names defining the output order. Defaults to the notebook-level
        ``cols`` (the training feature columns).

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per name in ``feature_columns``. Names
        with no extracted counterpart are filled with 0.0 and a warning is
        printed.
    """
    if feature_columns is None:
        feature_columns = cols  # training column order defined earlier in the notebook

    y, _ = librosa.load(mp3_path, sr=sr, duration=duration)
    features = {'length': len(y)}

    def add_stats(prefix, values):
        # Store mean and variance of `values` under "<prefix>_mean" / "<prefix>_var".
        features[f'{prefix}_mean'] = values.mean()
        features[f'{prefix}_var'] = values.var()

    # Frame-level spectral / energy descriptors, summarised by mean and variance.
    add_stats('chroma_stft', librosa.feature.chroma_stft(y=y, sr=sr))
    add_stats('rms', librosa.feature.rms(y=y))
    add_stats('spectral_centroid', librosa.feature.spectral_centroid(y=y, sr=sr))
    add_stats('spectral_bandwidth', librosa.feature.spectral_bandwidth(y=y, sr=sr))
    add_stats('rolloff', librosa.feature.spectral_rolloff(y=y, sr=sr))
    add_stats('zero_crossing_rate', librosa.feature.zero_crossing_rate(y))

    # harmony & perceptr: best effort. On failure fall back to zeros, but say
    # so instead of swallowing the error silently.
    try:
        harmony, perceptr = librosa.effects.hpss(y)
        add_stats('harmony', harmony)
        add_stats('perceptr', perceptr)
    except Exception as exc:
        print(f"Warning: HPSS failed for '{mp3_path}' ({exc}); using 0 for harmony/perceptr features")
        for key in ('harmony_mean', 'harmony_var', 'perceptr_mean', 'perceptr_var'):
            features[key] = 0

    # tempo (may be a scalar or a one-element array; normalised to float below)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    features['tempo'] = tempo

    # MFCCs (20 coefficients, mean + var = 40 total)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    for idx, coeff in enumerate(mfcc, start=1):
        add_stats(f'mfcc{idx}', coeff)

    # Build the vector in the requested column order.
    feat_vec = []
    for col in feature_columns:
        if col in features:
            # .item() accepts Python/NumPy scalars and size-1 arrays alike and
            # avoids calling float() on an array with ndim > 0.
            feat_vec.append(float(np.asarray(features[col]).item()))
        else:
            print(f"Warning: column '{col}' not found in extracted features, using 0")
            feat_vec.append(0.0)

    return np.array(feat_vec)

In [57]:
import joblib
from pathlib import Path

# NOTE(review): joblib.load unpickles arbitrary objects; only load artifacts
# that were produced by this notebook.
pipeline = joblib.load("genre_mlp_model.pkl")
minmax_scaler = joblib.load("gztan_minmax_scaler.pkl")

# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
test_path = r"C:\workspace\HMUD\deam\DEAM_audio\MEMD_audio"

for i in range(300, 331):
    mp3_file = os.path.join(test_path, f"{i}.mp3")
    if not Path(mp3_file).exists():
        print("File not found:", mp3_file)
        continue

    feat_vec = extract_full_features(mp3_file)

    # The scaler and the pipeline were both fitted on DataFrames, so keep the
    # training column names on the single-row inputs.
    feat_frame = pd.DataFrame([feat_vec], columns=cols)
    feat_scaled = pd.DataFrame(minmax_scaler.transform(feat_frame), columns=cols)

    # One forward pass: the predicted label is the class with the highest probability.
    prob = pipeline.predict_proba(feat_scaled)[0]
    genre = pipeline.classes_[prob.argmax()]

    print(f"\nMP3: {mp3_file}")
    print(f"{genre} (confidence: {prob.max():.4f})")


MP3: C:\workspace\HMUD\deam\DEAM_audio\MEMD_audio\300.mp3
country (confidence: 0.5833)

MP3: C:\workspace\HMUD\deam\DEAM_audio\MEMD_audio\301.mp3
blues (confidence: 0.8081)

MP3: C:\workspace\HMUD\deam\DEAM_audio\MEMD_audio\302.mp3
pop (confidence: 0.7670)

MP3: C:\workspace\HMUD\deam\DEAM_audio\MEMD_audio\303.mp3
classical (confidence: 0.9675)

MP3: C:\workspace\HMUD\deam\DEAM_audio\MEMD_audio\304.mp3
pop (confidence: 0.9704)

MP3: C:\workspace\HMUD\deam\DEAM_audio\MEMD_audio\305.mp3
country (confidence: 0.6323)

MP3: C:\workspace\HMUD\deam\DEAM_audio\MEMD_audio\306.mp3
jazz (confidence: 1.0000)

MP3: C:\workspace\HMUD\deam\DEAM_audio\MEMD_audio\307.mp3
classical (confidence: 1.0000)

MP3: C:\workspace\HMUD\deam\DEAM_audio\MEMD_audio\308.mp3
hiphop (confidence: 0.5222)

MP3: C:\workspace\HMUD\deam\DEAM_audio\MEMD_audio\309.mp3
disco (confidence: 0.9518)

MP3: C:\workspace\HMUD\deam\DEAM_audio\MEMD_audio\310.mp3
country (confidence: 1.0000)

MP3: C:\workspace\HMUD\deam\DEAM_audio\MEMD

In [51]:
# Show the steps and hyper-parameters of the pipeline loaded from disk.
print(pipeline)

Pipeline(steps=[('scaler', StandardScaler()),
                ('mlp',
                 MLPClassifier(alpha=0.001, hidden_layer_sizes=(128, 64),
                               learning_rate_init=0.0005, max_iter=300,
                               random_state=42))])


In [25]:
import numpy as np

# Sanity check: feed one random vector (values in [0, 1)) through the pipeline.
# A seeded generator makes the printed result reproducible. The width comes
# from the training columns rather than from `feat_vec`, which only exists if
# the prediction loop above processed at least one file.
rng = np.random.default_rng(42)
fake = rng.random((1, len(cols)))
print(pipeline.predict(fake), pipeline.predict_proba(fake))


['reggae'] [[2.92611212e-62 3.54992118e-48 7.67296766e-51 7.68428017e-39
  1.45726478e-23 5.29078192e-45 2.71604792e-47 3.61544294e-09
  9.99999996e-01 1.16886845e-61]]


In [None]:
# serialize the model and scaler
import joblib
from pathlib import Path

# Save artifacts
outdir = Path("models")
outdir.mkdir(exist_ok=True)


print("Saved models to models/")

Saved models to models/
