# Setup n Data Preparation

In [23]:
import os
import zipfile
import json
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from google.colab import drive

warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)

print(" Mounting Google Drive...")
drive.mount('/content/drive', force_remount=True)

# Path
DRIVE_PROJECT_PATH = "/content/drive/MyDrive/Colab Notebooks/FP_MachineLearning"
# 1. Path file ke dataset
DATASET_ZIP_PATH = os.path.join(DRIVE_PROJECT_PATH, "dataset.zip")

# 2. Path environment colab
EXTRACT_DIR = "/content/dataset/dataset/Data"

# 3. Path ke dir after extract
DATASET_PATH = os.path.join(EXTRACT_DIR, "genres_original")

# 4. Path json
JSON_PATH = os.path.join(DRIVE_PROJECT_PATH, "data_mfcc.json")
WEIGHTS_DIR = os.path.join(DRIVE_PROJECT_PATH, "model_weights")

# Create dir
os.makedirs(WEIGHTS_DIR, exist_ok=True)
print(f"Setup complete. Model weights will be saved in: {WEIGHTS_DIR}")

 Mounting Google Drive...
Mounted at /content/drive
Setup complete. Model weights will be saved in: /content/drive/MyDrive/Colab Notebooks/FP_MachineLearning/model_weights


# 1. Data Loading & Preprocessing

In [24]:
# --- 1. Unzip dataset
if not os.path.exists(EXTRACT_DIR):
    print(f"Extracting {DATASET_ZIP_PATH} to {EXTRACT_DIR}...")
    # Buka file zip
    with zipfile.ZipFile(DATASET_ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(EXTRACT_DIR)
    print("Extraction complete.")
else:
    print(f"Dataset already extracted at {EXTRACT_DIR}.")


# --- 2. Extract MFCC features
def extract_mfcc(dataset_path, json_path, n_mfcc=23, n_fft=2048, hop_length=512, duration=30):
    """
    Extracts MFCCs from music files and saves them to a JSON file.
    """
    if os.path.exists(json_path):
        print(f"JSON file found at {json_path}. Skipping MFCC extraction.")
        return

    print(f"Starting MFCC extraction from {dataset_path}...")
    SAMPLE_RATE = 22050
    SAMPLES_PER_TRACK = SAMPLE_RATE * duration
    data = {"mapping": [], "mfcc": [], "labels": []}

    for i, (dirpath, _, filenames) in enumerate(sorted(os.walk(dataset_path))):
        if dirpath != dataset_path:
            genre_label = os.path.basename(dirpath)
            data["mapping"].append(genre_label)
            print(f"\nProcessing genre: {genre_label}")
            for f in sorted(filenames):
                if f.endswith((".wav", ".au")):
                    file_path = os.path.join(dirpath, f)
                    try:
                        signal, sr = librosa.load(file_path, sr=SAMPLE_RATE, duration=duration)
                        if len(signal) < SAMPLES_PER_TRACK:
                            signal = np.pad(signal, (0, SAMPLES_PER_TRACK - len(signal)), 'constant')
                        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length).T
                        data["mfcc"].append(mfcc.tolist())
                        data["labels"].append(i - 1)
                        print(".", end="")
                    except Exception as e:
                        print(f"\nCould not process file {f}: {e}")

    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)
    print("\nMFCC extraction complete.")

# Extraction MFCC
extract_mfcc(DATASET_PATH, JSON_PATH)


# --- 3. Load and Split Data
print("\nLoading data from JSON file...")
with open(JSON_PATH, "r") as fp: data = json.load(fp)
X = np.array(data["mfcc"]); y = np.array(data["labels"]); genre_labels = data["mapping"]
X = X[..., np.newaxis]

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42, stratify=y_train_full)

input_shape = (X_train.shape[1], X_train.shape[2], X_train.shape[3])
num_genres = len(genre_labels)
print(f"Data prepared. Train shape: {X_train.shape}, Test shape: {X_test.shape}")

all_results = []

Dataset already extracted at /content/dataset/dataset/Data.
JSON file found at /content/drive/MyDrive/Colab Notebooks/FP_MachineLearning/data_mfcc.json. Skipping MFCC extraction.

[INFO] Loading data from JSON file...
[SUCCESS] Data prepared. Train shape: (599, 1292, 23, 1), Test shape: (200, 1292, 23, 1)


# 3. Model Architecture Definitions

In [25]:
early_stopping = EarlyStopping(monitor='val_accuracy', patience=30, restore_best_weights=True, mode='max', verbose=1)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-7, verbose=1)

def build_cnn_model(input_shape, num_genres):
    """
    Builds a baseline Convolutional Neural Network (CNN).
    This architecture uses the CNN part described in the paper with a
    GlobalAveragePooling2D head for robust classification.

    Args:
        input_shape (tuple): The shape of the input data.
        num_genres (int): The number of output classes.

    Returns:
        tf.keras.Model: The compiled Keras model.
    """
    inputs = layers.Input(shape=input_shape, name="input_layer")
    x = layers.Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    x = layers.MaxPooling2D((2, 2), padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = layers.MaxPooling2D((3, 3), padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = layers.MaxPooling2D((4, 4), padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.GlobalAveragePooling2D(name="gap_layer")(x)
    outputs = layers.Dense(num_genres, activation='softmax', name="output_layer")(x)
    return models.Model(inputs=inputs, outputs=outputs, name="CNN_Model")

def build_crnn_gru_model(input_shape, num_genres):
    """
    Builds a Convolutional Recurrent Neural Network (CRNN) with GRU layers.
    This architecture faithfully replicates the CNN-GRU model from the paper,
    using a Reshape layer to connect the CNN feature extractor to the RNN.

    Args:
        input_shape (tuple): The shape of the input data.
        num_genres (int): The number of output classes.

    Returns:
        tf.keras.Model: The compiled Keras model.
    """
    model = models.Sequential(name="CRNN_GRU_Model")
    model.add(layers.Input(shape=input_shape))
    model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((2, 2), padding='same')); model.add(layers.Dropout(0.1)); model.add(layers.BatchNormalization())
    model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((3, 3), padding='same')); model.add(layers.Dropout(0.1)); model.add(layers.BatchNormalization())
    model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((4, 4), padding='same')); model.add(layers.Dropout(0.1)); model.add(layers.BatchNormalization())
    last_layer_shape = model.output_shape
    model.add(layers.Reshape((last_layer_shape[1], last_layer_shape[2] * last_layer_shape[3])))
    model.add(layers.GRU(20, return_sequences=True)); model.add(layers.GRU(20))
    model.add(layers.Dense(num_genres, activation='softmax'))
    return model

def build_crnn_lstm_model(input_shape, num_genres):
    """
    Builds a Convolutional Recurrent Neural Network (CRNN) with LSTM layers.
    This architecture faithfully replicates the CNN-LSTM model from the paper,
    using a Reshape layer and the specified number of LSTM cells (30).

    Args:
        input_shape (tuple): The shape of the input data.
        num_genres (int): The number of output classes.

    Returns:
        tf.keras.Model: The compiled Keras model.
    """
    model = models.Sequential(name="CRNN_LSTM_Model")
    model.add(layers.Input(shape=input_shape))
    model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((2, 2), padding='same')); model.add(layers.Dropout(0.1)); model.add(layers.BatchNormalization())
    model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((3, 3), padding='same')); model.add(layers.Dropout(0.1)); model.add(layers.BatchNormalization())
    model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((4, 4), padding='same')); model.add(layers.Dropout(0.1)); model.add(layers.BatchNormalization())
    last_layer_shape = model.output_shape
    model.add(layers.Reshape((last_layer_shape[1], last_layer_shape[2] * last_layer_shape[3])))
    model.add(layers.LSTM(30, return_sequences=True)); model.add(layers.LSTM(30))
    model.add(layers.Dense(num_genres, activation='softmax'))
    return model

print("All model architectures are defined.")

All model architectures are defined.


# 4. Train and Evaluate CNN Model

In [26]:
print("\nStarting CNN Model training")
cnn_model = build_cnn_model(input_shape, num_genres)
cnn_model.compile(optimizer=tf.keras.optimizers.Adam(0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
cnn_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=250, batch_size=32, verbose=1, callbacks=[early_stopping, lr_scheduler])
cnn_model.save_weights(os.path.join(WEIGHTS_DIR, "cnn_model.weights.h5"))
print("[SUCCESS] CNN model training complete. Weights saved.")
report = classification_report(y_test, np.argmax(cnn_model.predict(X_test), axis=1), output_dict=True)
all_results.append({"Model": "CNN", "Accuracy": report['accuracy'], "F1-Score (Macro)": report['macro avg']['f1-score']})


Starting CNN Model training
Epoch 1/250
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 188ms/step - accuracy: 0.3535 - loss: 1.9342 - val_accuracy: 0.1750 - val_loss: 8.5901 - learning_rate: 0.0010
Epoch 2/250
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.5782 - loss: 1.1197 - val_accuracy: 0.1100 - val_loss: 8.1936 - learning_rate: 0.0010
Epoch 3/250
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - accuracy: 0.6788 - loss: 0.9605 - val_accuracy: 0.2650 - val_loss: 5.3447 - learning_rate: 0.0010
Epoch 4/250
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.7303 - loss: 0.8735 - val_accuracy: 0.2550 - val_loss: 3.9503 - learning_rate: 0.0010
Epoch 5/250
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.7536 - loss: 0.7193 - val_accuracy: 0.2150 - val_loss: 3.8850 - learning_rate: 0.0010
Epoch 6/250
[1m19/19[0m [32m━━━━━━━━━

# Train and Evaluate CRNN-GRU Model

In [27]:

print("\ntarting CRNN-GRU Model training...")
crnn_gru_model = build_crnn_gru_model(input_shape, num_genres)
crnn_gru_model.compile(optimizer=tf.keras.optimizers.Adam(0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
crnn_gru_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=250, batch_size=32, verbose=1, callbacks=[early_stopping, lr_scheduler])
crnn_gru_model.save_weights(os.path.join(WEIGHTS_DIR, "crnn_gru_model.weights.h5"))
print("[SUCCESS] CRNN-GRU model training complete. Weights saved.")
report = classification_report(y_test, np.argmax(crnn_gru_model.predict(X_test), axis=1), output_dict=True)
all_results.append({"Model": "CRNN-GRU", "Accuracy": report['accuracy'], "F1-Score (Macro)": report['macro avg']['f1-score']})


tarting CRNN-GRU Model training...
Epoch 1/250
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 132ms/step - accuracy: 0.2253 - loss: 2.1882 - val_accuracy: 0.1750 - val_loss: 2.2574 - learning_rate: 0.0010
Epoch 2/250
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 70ms/step - accuracy: 0.3691 - loss: 1.8772 - val_accuracy: 0.1600 - val_loss: 2.2208 - learning_rate: 0.0010
Epoch 3/250
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 68ms/step - accuracy: 0.4476 - loss: 1.7593 - val_accuracy: 0.2300 - val_loss: 2.0715 - learning_rate: 0.0010
Epoch 4/250
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 69ms/step - accuracy: 0.4653 - loss: 1.5830 - val_accuracy: 0.2700 - val_loss: 1.9043 - learning_rate: 0.0010
Epoch 5/250
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 66ms/step - accuracy: 0.5707 - loss: 1.4512 - val_accuracy: 0.3650 - val_loss: 1.7433 - learning_rate: 0.0010
Epoch 6/250
[1m19/19[0m [32m━━

# Train and Evaluate CRNN-LSTM Model

In [28]:
print("\nStarting CRNN-LSTM Model training...")
crnn_lstm_model = build_crnn_lstm_model(input_shape, num_genres)
crnn_lstm_model.compile(optimizer=tf.keras.optimizers.Adam(0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
crnn_lstm_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=250, batch_size=32, verbose=1, callbacks=[early_stopping, lr_scheduler])
crnn_lstm_model.save_weights(os.path.join(WEIGHTS_DIR, "crnn_lstm_model.weights.h5"))
print("CRNN-LSTM model training complete. Weights saved.")
report = classification_report(y_test, np.argmax(crnn_lstm_model.predict(X_test), axis=1), output_dict=True)
all_results.append({"Model": "CRNN-LSTM", "Accuracy": report['accuracy'], "F1-Score (Macro)": report['macro avg']['f1-score']})


Starting CRNN-LSTM Model training...
Epoch 1/250
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 94ms/step - accuracy: 0.2101 - loss: 2.1984 - val_accuracy: 0.1000 - val_loss: 2.3574 - learning_rate: 0.0010
Epoch 2/250
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 66ms/step - accuracy: 0.3984 - loss: 1.8981 - val_accuracy: 0.1050 - val_loss: 2.4343 - learning_rate: 0.0010
Epoch 3/250
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 69ms/step - accuracy: 0.4382 - loss: 1.6983 - val_accuracy: 0.1550 - val_loss: 2.3269 - learning_rate: 0.0010
Epoch 4/250
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 65ms/step - accuracy: 0.4938 - loss: 1.5680 - val_accuracy: 0.2650 - val_loss: 2.0373 - learning_rate: 0.0010
Epoch 5/250
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 66ms/step - accuracy: 0.5044 - loss: 1.4382 - val_accuracy: 0.2650 - val_loss: 1.8937 - learning_rate: 0.0010
Epoch 6/250
[1m19/19[0m [32m━



[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step


# SVM Training & Evaluation

In [29]:
print(f"\nPreparing Data & Training SVM Model")
# 1. Build the CNN architecture again
svm_feature_extractor_base = build_cnn_model(input_shape, num_genres)
# 2. Compile it to lock the architecture
svm_feature_extractor_base.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
# 3. Load the pre-trained weights
print("[INFO] Loading pre-trained CNN weights for feature extraction...")
svm_feature_extractor_base.load_weights(os.path.join(WEIGHTS_DIR, "cnn_model.weights.h5"))

# 4. Create the feature extractor model by taking layers up to GlobalAveragePooling
feature_extractor = models.Model(
    inputs=svm_feature_extractor_base.input,
    outputs=svm_feature_extractor_base.get_layer('gap_layer').output
)

# 5. Extract features
print("Extracting features using the CNN...")
X_train_svm = feature_extractor.predict(X_train)
X_test_svm = feature_extractor.predict(X_test)

# 6. Train SVM with hyperparameter tuning
param_grid = {'C': [1, 10, 100, 1000], 'gamma': [0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']}
grid = GridSearchCV(SVC(probability=True), param_grid, refit=True, verbose=1, cv=3)
print("\nSearching for best SVM parameters with GridSearchCV...")
grid.fit(X_train_svm, y_train)
print(f"\nBest SVM parameters found: {grid.best_params_}")

# 7. Evaluate the best SVM model
print("\nEvaluating the best SVM model on the test set")
report_svm = classification_report(y_test, grid.predict(X_test_svm), output_dict=True)
all_results.append({"Model": "SVM (CNN Features)", "Accuracy": report_svm['accuracy'], "F1-Score (Macro)": report_svm['macro avg']['f1-score']})
print("SVM evaluation complete.")


Preparing Data & Training SVM Model
[INFO] Loading pre-trained CNN weights for feature extraction...
Extracting features using the CNN...




[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 62ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step

Searching for best SVM parameters with GridSearchCV...
Fitting 3 folds for each of 16 candidates, totalling 48 fits

Best SVM parameters found: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}

Evaluating the best SVM model on the test set
SVM evaluation complete.


# Final Report

In [31]:
summary_df = pd.DataFrame(all_results).round(4).sort_values(by="Accuracy", ascending=False)
summary_df.set_index('Model', inplace=True)

print("\n\nMODEL COMPARISON REPORT")
print(summary_df)



MODEL COMPARISON REPORT
                    Accuracy  F1-Score (Macro)
Model                                         
SVM (CNN Features)     0.800            0.7992
CNN                    0.790            0.7891
CRNN-LSTM              0.685            0.6799
CRNN-GRU               0.660            0.6627
