# Music-Genre-Classifier RNN
Kaito Minami

In [54]:
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.optimizers import Adam
import matplotlib.pyplot as plt

import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
import audioread
from sklearn.preprocessing import StandardScaler
# from pydub import AudioSegment


## 1. Data Processing

In [55]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
# Load the datasets
file_30_sec = '/content/drive/MyDrive/Colab Notebooks/GTZAN/features_30_sec.csv'
file_3_sec = '/content/drive/MyDrive/Colab Notebooks/GTZAN/features_3_sec.csv'

df_30_sec = pd.read_csv(file_30_sec)
df_3_sec = pd.read_csv(file_3_sec)

In [57]:
# Function to extract Mel spectrogram features
def extract_mel_spectrogram(file_path, sr=22050, n_mels=128):
    try:
        # Load audio file
        y, sr = librosa.load(file_path, sr=sr)
        # Extract Mel spectrogram
        mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
        mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
        mel_spectrogram_flattened = mel_spectrogram_db.flatten()  # Flatten for use in ML models
    except (FileNotFoundError, librosa.util.exceptions.LibrosaError, audioread.NoBackendError):
        print(f"Audio file {file_path} not found or cannot be processed. Using CSV features instead.")
        mel_spectrogram_flattened = None
    return mel_spectrogram_flattened

# Splitting the dataset into training, validation, and test sets
def split_data(df, test_size=0.2, val_size=0.2):
    train_val, test = train_test_split(df, test_size=test_size, stratify=df['label'], random_state=42)
    train, val = train_test_split(train_val, test_size=val_size, stratify=train_val['label'], random_state=42)
    return train, val, test

# Function to process the entire dataset and extract features
def process_dataset(df, audio_directory, feature_type='mel'):
    features = []
    labels = []
    for idx, row in df.iterrows():
        file_path = os.path.join(audio_directory, row['label'], row['filename'])
        if feature_type == 'mel':
            feature_vector = extract_mel_spectrogram(file_path)
        elif feature_type == 'wavelet':
            feature_vector = extract_wavelet_features(file_path)
        elif feature_type == 'chroma':
            feature_vector = extract_chroma_features(file_path)
        else:
            raise ValueError("Invalid feature type. Choose from 'mel', 'wavelet', 'chroma'.")

        # If audio features cannot be extracted, use CSV features
        if feature_vector is None:
            feature_vector = row.drop(['label']).filter(regex='^(?!filename)').values.astype(np.float32)
        features.append(feature_vector)
        labels.append(row['label'])

    # Ensure all feature vectors have the same length by padding or truncating
    max_length = max(len(f) for f in features)
    features = np.array([np.pad(f, (0, max_length - len(f)), 'constant') if len(f) < max_length else f[:max_length] for f in features])
    return features, np.array(labels)

In [58]:
# Standardizing dataset before splitting
scaler = StandardScaler()
df_30_sec_numeric = df_30_sec.select_dtypes(include=['float64', 'int64']).copy()
df_30_sec[df_30_sec_numeric.columns] = scaler.fit_transform(df_30_sec_numeric)

df_3_sec_numeric = df_3_sec.select_dtypes(include=['float64', 'int64']).copy()
df_3_sec[df_3_sec_numeric.columns] = scaler.fit_transform(df_3_sec_numeric)

# Splitting the dataset for 30-sec
train_df_30_sec, val_df_30_sec, test_df_30_sec = split_data(df_30_sec)

# Extract and process features for 30-sec dataset
audio_directory = '/content/drive/MyDrive/Colab Notebooks/GTZAN/genres_original'

# Mel Spectrogram Features
train_mel_features, train_mel_labels = process_dataset(train_df_30_sec, audio_directory, feature_type='mel')
val_mel_features, val_mel_labels = process_dataset(val_df_30_sec, audio_directory, feature_type='mel')
test_mel_features, test_mel_labels = process_dataset(test_df_30_sec, audio_directory, feature_type='mel')

  y, sr = librosa.load(file_path, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Audio file /content/drive/MyDrive/Colab Notebooks/GTZAN/genres_original/jazz/jazz.00054.wav not found or cannot be processed. Using CSV features instead.


In [63]:
train_mel_features = train_mel_features.reshape(train_mel_features.shape[0], train_mel_features.shape[1])

## 2. LSTM model

In [60]:
def one_hot(genre_strings):
    genre_list = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]

    y_one_hot = np.zeros((genre_strings.shape[0], len(genre_list)))
    for i, genre_string in enumerate(genre_strings):
        index = genre_list.index(genre_string)
        y_one_hot[i, index] = 1
    return y_one_hot

In [65]:
# Model building
model = Sequential()
input_shape = (train_mel_features.shape[0], train_mel_features.shape[1])

model.add(LSTM(units=128, dropout=0.05, recurrent_dropout=0.35, return_sequences=True, input_shape=input_shape))
model.add(LSTM(units=32,  dropout=0.05, recurrent_dropout=0.35, return_sequences=False))
model.add(Dense(units=len(np.unique(train_mel_labels)), activation="softmax"))

# Model compiling
opt = Adam()
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
model.summary()

# Model training
batch_size = 35  # num of training examples per minibatch
num_epochs = 400

print(train_mel_features.shape, one_hot(train_mel_labels).shape)

history = model.fit(
    train_mel_features,
    one_hot(train_mel_labels),
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(val_mel_features, one_hot(val_mel_labels))
)

# Testing
score, accuracy = model.evaluate(
    test_mel_features, one_hot(test_mel_labels), batch_size=batch_size, verbose=1
)
print("Test loss:  ", score)
print("Test accuracy:  ", accuracy)

  super().__init__(**kwargs)


(640, 168960) (640, 10)
Epoch 1/400


ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("data:0", shape=(None, 168960), dtype=float32). Expected shape (None, 640, 168960), but input has incompatible shape (None, 168960)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(None, 168960), dtype=float32)
  • training=True
  • mask=None

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'])
plt.show()