In [1]:
!pip install Pydub
!pip install PyWavelets


import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pywt
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
import audioread
from pydub import AudioSegment
import tensorflow as tf
# from tensorflow.keras import layers, models
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D , Flatten, Reshape, Input

Collecting Pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: Pydub
Successfully installed Pydub-0.25.1
Collecting PyWavelets
  Downloading pywavelets-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Downloading pywavelets-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyWavelets
Successfully installed PyWavelets-1.7.0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the datasets
#file_30_sec = 'data/GTZAN/features_30_sec.csv'
#file_3_sec = 'data/GTZAN/features_3_sec.csv'

file_30_sec = '/content/drive/MyDrive/Colab Notebooks/GTZAN/features_30_sec.csv'
file_3_sec = '/content/drive/MyDrive/Colab Notebooks/GTZAN/features_3_sec.csv'
audio_directory = '/content/drive/MyDrive/Colab Notebooks/GTZAN/genres_original'

df_30_sec = pd.read_csv(file_30_sec)
df_3_sec = pd.read_csv(file_3_sec)

In [4]:
# Function to extract spectrograms and wavelet features
def extract_features(file_path, sr=22050, n_mels=128, wavelet='db1'):
    try:
        # Load audio file
        y, sr = librosa.load(file_path, sr=sr)
        # Extract Mel spectrogram
        mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
        mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
        mel_spectrogram_flattened = mel_spectrogram_db.flatten()  # Flatten for use in ML models

        # Extract wavelet features
        coeffs = pywt.wavedec(y, wavelet, level=5)
        wavelet_features = np.concatenate([np.array(c).flatten() for c in coeffs])

        # Extract chroma features
        chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
        chroma_features_flattened = chroma_stft.flatten()  # Flatten for use in ML models

        # Combine all features into a single feature vector
        combined_features = np.concatenate((mel_spectrogram_flattened, wavelet_features, chroma_features_flattened), axis=0)
    except (FileNotFoundError, librosa.util.exceptions.LibrosaError, audioread.NoBackendError):
        # If audio file is not found or cannot be loaded, use features from CSV instead
        print(f"Audio file {file_path} not found or cannot be processed. Using CSV features instead.")
        combined_features = None

    return combined_features

# Splitting the dataset into training, validation, and test sets
def split_data(df, test_size=0.2, val_size=0.2):
    train_val, test = train_test_split(df, test_size=test_size, stratify=df['label'], random_state=42)
    train, val = train_test_split(train_val, test_size=val_size, stratify=train_val['label'], random_state=42)
    return train, val, test

# Function to process the entire dataset and extract features
def process_dataset(df, audio_directory):
    features = []
    labels = []
    for idx, row in df.iterrows():
        file_path = os.path.join(audio_directory, row['label'], row['filename'])
        feature_vector = extract_features(file_path)
        # If audio features cannot be extracted, use CSV features
        if feature_vector is None:
            feature_vector = row.drop(['label']).filter(regex='^(?!filename)').values.astype(np.float32)
        features.append(feature_vector)
        labels.append(row['label'])
    # Ensure all feature vectors have the same length by padding or truncating
    max_length = max(len(f) for f in features)
    features = np.array([np.pad(f, (0, max_length - len(f)), 'constant') if len(f) < max_length else f[:max_length] for f in features])
    return features, np.array(labels)

In [5]:
# Standardizing dataset before splitting
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_30_sec_numeric = df_30_sec.select_dtypes(include=['float64', 'int64']).copy()
df_30_sec[df_30_sec_numeric.columns] = scaler.fit_transform(df_30_sec_numeric)

df_3_sec_numeric = df_3_sec.select_dtypes(include=['float64', 'int64']).copy()
df_3_sec[df_3_sec_numeric.columns] = scaler.fit_transform(df_3_sec_numeric)

In [6]:
# Splitting the dataset for 30-sec
train_df_30s, val_df_30s, test_df_30s = split_data(df_30_sec)

# Extract and process features for 30-sec dataset
#audio_directory = 'data/GTZAN/genres_original'

# Extract and process features for 30-sec dataset
#audio_directory = 'data/GTZAN/genres_original'

# Combined Features for 30-sec
train_features_30s, train_labels_30s = process_dataset(train_df_30s, audio_directory)
val_features_30s, val_labels_30s = process_dataset(val_df_30s, audio_directory)
test_features_30s, test_labels_30s = process_dataset(test_df_30s, audio_directory)

  y, sr = librosa.load(file_path, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Audio file /content/drive/MyDrive/Colab Notebooks/GTZAN/genres_original/jazz/jazz.00054.wav not found or cannot be processed. Using CSV features instead.


In [7]:
# Splitting the dataset for 3-sec
train_df_3s, val_df_3s, test_df_3s = split_data(df_3_sec)

# Combined Features for 3-sec
train_features_3s, train_labels_3s = process_dataset(train_df_3s, audio_directory)
val_features_3s, val_labels_3s = process_dataset(val_df_3s, audio_directory)
test_features_3s, test_labels_3s = process_dataset(test_df_3s, audio_directory)

Output hidden; open in https://colab.research.google.com to view.

In [9]:
# CNN Model Implementation
# Function to build a CNN model

"""
def build_cnn_model(input_shape, num_classes):
    model = models.Sequential()
    model.add(layers.Input(shape=(input_shape,)))
    model.add(layers.Dense(256, activation='relu'))  # Increased number of units
    model.add(layers.Reshape((32, 8, 1)))  # Adjusted reshape to match input dimensions
    model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))  # Increased filters
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))  # Added more layers
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(256, (3, 3), activation='relu', padding='same'))  # Added deeper layers
    model.add(layers.Flatten())
    model.add(layers.Dense(64, activation='relu'))  # Increased number of units
    model.add(layers.Dropout(0.5))  # Added Dropout layer to prevent overfitting
    model.add(layers.Dense(num_classes, activation='softmax'))

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model
"""

# VGG16 Model Implementation
# Function to build a VGG16 model

def build_vgg16_model(input_shape, num_classes):
    # Create an object for training and testing
    model = Sequential()
    model.add(Input(shape=input_shape))
    # model.add(Reshape((32, 8, 1)))  # Adjusted reshape to match input dimensions
    model.add(Conv2D(filters=32, kernel_size=(3,3), padding="same", activation="relu"))
    model.add(Conv2D(filters=32, kernel_size=(3,3), padding="same", activation="relu"))
    model.add(MaxPooling2D(pool_size=(2,2),strides=(2,2),padding="same"))
    model.add(Conv2D(filters=64,kernel_size=(3,3),padding="same", activation="relu")) # input_shape
    model.add(Conv2D(filters=64,kernel_size=(3,3),padding="same", activation="relu"))
    model.add(MaxPooling2D(pool_size=(2,2),strides=(2,2),padding="same"))
    model.add(Conv2D(filters=128, kernel_size=(3,3), padding="same", activation="relu"))
    model.add(Conv2D(filters=128, kernel_size=(3,3), padding="same", activation="relu"))
    model.add(Conv2D(filters=128, kernel_size=(3,3), padding="same", activation="relu"))
    model.add(MaxPooling2D(pool_size=(2,2),strides=(2,2),padding="same"))
    model.add(Conv2D(filters=256, kernel_size=(3,3), padding="same", activation="relu"))
    model.add(Conv2D(filters=256, kernel_size=(3,3), padding="same", activation="relu"))
    model.add(Conv2D(filters=256, kernel_size=(3,3), padding="same", activation="relu"))
    model.add(MaxPooling2D(pool_size=(2,2),strides=(2,2),padding="same"))
    model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
    model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
    model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
    model.add(MaxPooling2D(pool_size=(2,2),strides=(2,2),padding="same"))

    # Initialize the model
    model.add(Flatten())
    model.add(Dense(units=4096,activation="relu"))
    model.add(Dense(units=4096,activation="relu"))
    model.add(Dense(units=num_classes, activation="softmax"))

    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Convert labels to numeric format
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_labels_30s = le.fit_transform(train_labels_30s)
val_labels_30s = le.transform(val_labels_30s)
test_labels_30s = le.transform(test_labels_30s)

train_labels_3s = le.fit_transform(train_labels_3s)
val_labels_3s = le.transform(val_labels_3s)
test_labels_3s = le.transform(test_labels_3s)

# Callback for reducing learning rate
from tensorflow.keras.callbacks import ReduceLROnPlateau
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)

In [10]:
# Combined Features CNN Model for 30-sec Features
print(train_features_30s.shape)
print(len(np.unique(train_labels_30s)))
combined_vgg16_model_30s = build_vgg16_model((32,8,1), len(np.unique(train_labels_30s)))
history_30s = combined_vgg16_model_30s.fit(train_features_30s, train_labels_30s, epochs=100, validation_data=(val_features_30s, val_labels_30s), batch_size=32, callbacks=[reduce_lr])
combined_predictions_30s = combined_vgg16_model_30s.predict(test_features_30s)
print("Combined Features Predictions for 30-sec Features: ", np.argmax(combined_predictions_30s, axis=1))


(2151520, 32, 8, 1)
10


ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 2151520
'y' sizes: 640


In [None]:
# Combined Features CNN Model for 3-sec Features
combined_vgg16_model_3s = build_vgg16_model((32,8,1), len(np.unique(train_labels_3s)))
history_3s = combined_vgg16_model_3s.fit(train_features_3s, train_labels_3s, epochs=100, validation_data=(val_features_3s, val_labels_3s), batch_size=32, callbacks=[reduce_lr])
combined_predictions_3s = combined_vgg16_model_3s.predict(test_features_3s)
print("Combined Features Predictions for 3-sec Features: ", np.argmax(combined_predictions_3s, axis=1))

In [None]:
# Plotting model loss and accuracy for 30-sec and 3-sec
import matplotlib.pyplot as plt

def plot_losses(hist):

  fig, axs = plt.subplots(1,2, figsize=(12, 4))
  axs[0].plot(hist.history['loss'])
  axs[0].plot(hist.history['val_loss'])
  axs[0].set_title('Model Loss')
  axs[0].set_ylabel('Loss')
  axs[0].set_xlabel('Epoch')
  axs[0].legend(['Train', 'Test'], loc='upper right')

  axs[1].plot(hist.history['accuracy'])
  axs[1].plot(hist.history['val_accuracy'])
  axs[1].set_title('Model Accuracy')
  axs[1].set_ylabel('Accuracy')
  axs[1].set_xlabel('Epoch')
  axs[1].legend(['Train', 'Test'], loc='upper right')
  plt.show()

# Plot for 30-sec model
plot_losses(history_30s)

# Plot for 3-sec model
plot_losses(history_3s)