In [13]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
from keras.models import Model
from keras.layers import Input, Dense, Flatten, Conv2D, MaxPooling2D, concatenate
from keras.optimizers import Adam
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from PIL import Image
import matplotlib.pyplot as plt
import cv2  # or use PIL for image handling

In [2]:
working_OS = 'Windows'

if working_OS == 'MacOS':
    os.chdir(r"/Users/jordanlee/Code/School/CSCI416/music-genre-classification")
    print("Current Working Directory:", os.getcwd())

elif working_OS == 'Windows':
    os.chdir(r"C:\Code\School\CSCI416\music_genre_classification\music-genre-classification")
    print("Current Working Directory:", os.getcwd())

Current Working Directory: C:\Code\School\CSCI416\music_genre_classification\music-genre-classification


In [3]:
df = pd.read_csv('data/features/aug_features_cleaned.csv')
X = df.drop(columns=['label'])
y = df['label']
label_encoder =LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train_tabular, X_test_tabular, y_train_tabular, y_test_tabular = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

In [14]:
def generate_spectrogram(audio_path, output_path, sr=22050):
    """
    Generate a spectrogram from an audio file and save it as a PNG.
    
    Parameters:
    - audio_path: Path to the audio file.
    - output_path: Path to save the generated spectrogram.
    - sr: Sampling rate for loading the audio.
    """
    # Load audio file
    y, sr = librosa.load(audio_path, sr=sr)
    
    # Create a mel-spectrogram
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    S_dB = librosa.power_to_db(S, ref=np.max)
    
    # Plot and save the spectrogram
    plt.figure(figsize=(4, 4))
    librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', fmax=8000, cmap='viridis')
    plt.axis('off')  # Remove axes for a cleaner PNG
    plt.tight_layout(pad=0)
    plt.savefig(output_path, dpi=100, bbox_inches='tight', pad_inches=0)
    plt.close()


In [15]:
augmented_audio_dir = "data/augmented"
spectrogram_dir = "data/aug_spectrograms"

# Create spectrogram directory if it doesn't exist
os.makedirs(spectrogram_dir, exist_ok=True)

# Loop through all audio files in the augmented directory
for root, dirs, files in os.walk(augmented_audio_dir):
    for file in files:
        if file.endswith(".wav"):  # Ensure only audio files are processed
            genre = os.path.basename(root)  # Use folder name as genre
            genre_dir = os.path.join(spectrogram_dir, genre)
            os.makedirs(genre_dir, exist_ok=True)  # Create genre folder if needed
            
            # File paths
            audio_path = os.path.join(root, file)
            spectrogram_path = os.path.join(genre_dir, file.replace(".wav", ".png"))
            
            # Generate spectrogram
            generate_spectrogram(audio_path, spectrogram_path)

In [16]:
spectrogram_dir = 'data/aug_spectrograms'  # Path to the spectrograms folder

In [17]:
def load_spectrograms_and_labels(spectrogram_dir):
    spectrograms = []  # List to store spectrogram data
    labels = []        # List to store genre labels
    
    # Loop through all subdirectories (each representing a genre)
    for genre in os.listdir(spectrogram_dir):
        genre_folder = os.path.join(spectrogram_dir, genre)
        
        # Skip files, process only directories
        if os.path.isdir(genre_folder):
            for img_file in os.listdir(genre_folder):
                img_path = os.path.join(genre_folder, img_file)
                
                # Read the image using PIL (you can resize or preprocess here if needed)
                img = Image.open(img_path).convert('L')  # Convert to grayscale ('L')
                img = img.resize((128, 128))  # Resize to 128x128 (adjust as needed)
                
                # Convert image to numpy array
                img_array = np.array(img)
                
                # Append the image and its genre label
                spectrograms.append(img_array)
                labels.append(genre)  # The genre is the label for classification
    
    # Convert lists to numpy arrays
    X_spectrograms = np.array(spectrograms)
    y_labels = np.array(labels)
    
    return X_spectrograms, y_labels

In [18]:
# Load spectrograms and labels
X_spectrograms, y_labels = load_spectrograms_and_labels(spectrogram_dir)

# Normalize pixel values (between 0 and 1)
X_spectrograms = X_spectrograms.astype('float32') / 255.0

# Reshape to add channel dimension (since images are grayscale, the channel is 1)
X_spectrograms = X_spectrograms.reshape(-1, 128, 128, 1)

# Encode labels (genres) as numeric values
label_encoder = LabelEncoder()
y_labels_encoded = label_encoder.fit_transform(y_labels)

# Optional: one-hot encode the labels if needed (for multi-class classification)
y_labels_one_hot = np.eye(len(label_encoder.classes_))[y_labels_encoded]

In [19]:
# Split the data into training and test sets (80% train, 20% test)
X_train_spectrogram, X_test_spectrogram, y_train, y_test = train_test_split(
    X_spectrograms, y_labels_one_hot, test_size=0.2, random_state=42
)

In [20]:
# Define the input for tabular data
tabular_input = layers.Input(shape=(X_train_tabular.shape[1],))  # Adjust shape based on your tabular data

# Define the input for spectrogram images
spectrogram_input = layers.Input(shape=(128, 128, 1))  # Adjust shape based on your spectrogram size

# Tabular data processing: Dense layers
tabular_x = layers.Dense(64, activation='relu')(tabular_input)
tabular_x = layers.Dense(32, activation='relu')(tabular_x)

# Spectrogram data processing: Convolutional layers
spectrogram_x = layers.Conv2D(32, (3, 3), activation='relu')(spectrogram_input)
spectrogram_x = layers.MaxPooling2D((2, 2))(spectrogram_x)
spectrogram_x = layers.Conv2D(64, (3, 3), activation='relu')(spectrogram_x)
spectrogram_x = layers.MaxPooling2D((2, 2))(spectrogram_x)
spectrogram_x = layers.Flatten()(spectrogram_x)

# Concatenate the two branches (tabular and spectrogram)
combined = layers.concatenate([tabular_x, spectrogram_x])

# Dense layers after concatenation
x = layers.Dense(128, activation='relu')(combined)
x = layers.Dense(64, activation='relu')(x)

# Output layer (for classification)
output = layers.Dense(y_train.shape[1], activation='softmax')(x)  # Softmax for multi-class classification

# Define the model
model = models.Model(inputs=[tabular_input, spectrogram_input], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Model summary to visualize the architecture
model.summary()


In [21]:
# Initialize the ImageDataGenerator for augmentation
datagen = ImageDataGenerator(
    rotation_range=30,       # Random rotations
    width_shift_range=0.2,   # Random width shifts
    height_shift_range=0.2,  # Random height shifts
    shear_range=0.2,         # Random shearing
    zoom_range=0.2,          # Random zoom
    horizontal_flip=True,    # Random horizontal flip
    fill_mode='nearest'      # Fill missing pixels after transformations
)

# Fit the generator on your training data
datagen.fit(X_train_spectrogram)

In [22]:
model.fit(
    [X_train_tabular, X_train_spectrogram], y_train,
    epochs=10, batch_size=32, validation_data=([X_test_tabular, X_test_spectrogram], y_test),
    steps_per_epoch=len(X_train_spectrogram)
)


Epoch 1/10
[1m  75/2397[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:30[0m 91ms/step - accuracy: 0.1785 - loss: 2.2540



[1m2397/2397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.2606 - loss: 2.0725 - val_accuracy: 0.4150 - val_loss: 1.6378
Epoch 2/10
[1m2397/2397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.5148 - loss: 1.3277 - val_accuracy: 0.5433 - val_loss: 1.2533
Epoch 3/10
[1m2397/2397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.6285 - loss: 1.0722 - val_accuracy: 0.6200 - val_loss: 1.0889
Epoch 4/10
[1m2397/2397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.6834 - loss: 0.9115 - val_accuracy: 0.6050 - val_loss: 1.0655
Epoch 5/10
[1m2397/2397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.7457 - loss: 0.7282 - val_accuracy: 0.6933 - val_loss: 0.9087
Epoch 6/10
[1m2397/2397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8254 - loss: 0.5243 - val_accuracy: 0.7267 - val_loss: 0.8400
Epoch 7/10
[1m2397/2397[

<keras.src.callbacks.history.History at 0x25a83246380>

In [23]:
test_loss, test_accuracy = model.evaluate([X_test_tabular, X_test_spectrogram], y_test)

# Print the results
print(f"Test loss: {test_loss}")
print(f"Test accuracy: {test_accuracy}")

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.7515 - loss: 0.9670
Test loss: 0.8567209839820862
Test accuracy: 0.7483333349227905


In [24]:
# Make predictions on the test data
y_pred = model.predict([X_test_tabular, X_test_spectrogram])

# Convert predictions to class labels (if using softmax, for classification)
y_pred_classes = y_pred.argmax(axis=1)

# Print predictions for the first few test samples
print(f"Predicted classes: {y_pred_classes[:5]}")
print(f"True classes: {y_test[:5]}")

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step
Predicted classes: [4 4 0 9 0]
True classes: [[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [25]:
# Convert y_test (one-hot encoded) to integer labels
y_test_classes = np.argmax(y_test, axis=1)

# Now compare the predicted classes with the true class labels
print(f"Predicted classes: {y_pred_classes[:5]}")
print(f"True classes: {y_test_classes[:5]}")

Predicted classes: [4 4 0 9 0]
True classes: [4 3 0 9 0]


In [26]:
# Confusion Matrix
cm = confusion_matrix(y_test_classes, y_pred_classes)
print("Confusion Matrix:")
print(cm)

# Classification Report (precision, recall, f1-score)
print("Classification Report:")
print(classification_report(y_test_classes, y_pred_classes))

Confusion Matrix:
[[46  0 15  2  0  4  0  1  1  4]
 [ 0 66  0  0  0  2  0  1  0  0]
 [ 4  0 50  0  0  1  0  0  2  1]
 [ 0  0  6 39  4  2  0  4  3  1]
 [ 0  0  1  2 44  0  1  0  2  3]
 [ 5  3  3  1  0 37  0  0  0  2]
 [ 2  0  0  0  3  0 46  0  0  5]
 [ 1  1  5  6  2  0  0 40  1  1]
 [ 0  0  4  3  3  0  0  4 44  0]
 [ 2  0 12  8  0  0  2  2  3 37]]
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.63      0.69        73
           1       0.94      0.96      0.95        69
           2       0.52      0.86      0.65        58
           3       0.64      0.66      0.65        59
           4       0.79      0.83      0.81        53
           5       0.80      0.73      0.76        51
           6       0.94      0.82      0.88        56
           7       0.77      0.70      0.73        57
           8       0.79      0.76      0.77        58
           9       0.69      0.56      0.62        66

    accuracy                    