In [1]:
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pywt
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
import audioread
from pydub import AudioSegment



In [2]:
# Locations of the pre-computed GTZAN feature tables (one row per clip)
file_30_sec = 'data/GTZAN/features_30_sec.csv'
file_3_sec = 'data/GTZAN/features_3_sec.csv'

# Read both tables into DataFrames in one pass
df_30_sec, df_3_sec = map(pd.read_csv, (file_30_sec, file_3_sec))

In [3]:
# Build one flat feature vector (Mel spectrogram + wavelet + chroma) for an audio file
def extract_features(file_path, sr=22050, n_mels=128, wavelet='db1'):
    """Load an audio file and return its concatenated, flattened features.

    The returned 1-D array is laid out as: the dB-scaled Mel spectrogram
    (flattened), then every coefficient array of a level-5 wavelet
    decomposition of the waveform, then the chroma STFT (flattened).

    Parameters
    ----------
    file_path : str
        Path handed to ``librosa.load``.
    sr : int, default 22050
        Sampling rate requested from ``librosa.load``.
    n_mels : int, default 128
        Number of Mel bands for the spectrogram.
    wavelet : str, default 'db1'
        Wavelet name handed to ``pywt.wavedec``.

    Returns
    -------
    numpy.ndarray or None
        The combined feature vector, or ``None`` if any extraction step raises
        ``FileNotFoundError``, ``LibrosaError`` or ``audioread.NoBackendError``.
        A message is printed in that case.
    """
    try:
        signal, sr = librosa.load(file_path, sr=sr)

        # Mel power spectrogram converted to dB with ref=np.max
        mel_power = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=n_mels)
        mel_db = librosa.power_to_db(mel_power, ref=np.max)

        # Keep every coefficient array of the decomposition, each as a flat vector
        wavelet_bands = [np.asarray(band).ravel() for band in pywt.wavedec(signal, wavelet, level=5)]

        chroma = librosa.feature.chroma_stft(y=signal, sr=sr)

        # Order matters for downstream consumers: mel, wavelet, chroma
        return np.concatenate([mel_db.ravel(), *wavelet_bands, chroma.ravel()], axis=0)
    except (FileNotFoundError, librosa.util.exceptions.LibrosaError, audioread.NoBackendError):
        # The caller is expected to substitute another feature source when it sees None
        print(f"Audio file {file_path} not found or cannot be processed. Using CSV features instead.")
        return None

# Splitting the dataset into training, validation, and test sets
def split_data(df, test_size=0.2, val_size=0.2, random_state=42):
    """Split a labelled DataFrame into stratified train, validation and test sets.

    The split happens in two stages, both stratified on the ``label`` column:
    ``test_size`` of ``df`` is held out as the test set, then ``val_size`` of
    the *remaining* rows is held out as the validation set. ``val_size`` is
    therefore a fraction of the train+validation pool, not of ``df``; with the
    defaults the proportions are 64% / 16% / 20%.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``label`` column used for stratification.
    test_size : float, default 0.2
        ``test_size`` of the first ``train_test_split`` call.
    val_size : float, default 0.2
        ``test_size`` of the second ``train_test_split`` call (applied to the
        rows left after removing the test set).
    random_state : int or None, default 42
        Seed forwarded to both ``train_test_split`` calls. The default keeps
        the previously hard-coded value so existing callers get the same split.

    Returns
    -------
    tuple
        ``(train, val, test)``.
    """
    train_val, test = train_test_split(
        df, test_size=test_size, stratify=df['label'], random_state=random_state
    )
    train, val = train_test_split(
        train_val, test_size=val_size, stratify=train_val['label'], random_state=random_state
    )
    return train, val, test

# Function to process the entire dataset and extract features
def process_dataset(df, audio_directory, target_length=None):
    """Build a 2-D feature matrix and a label vector for every row of ``df``.

    For each row the audio file ``<audio_directory>/<label>/<filename>`` is
    passed to ``extract_features``. When that returns ``None`` the row's own
    columns are used instead: everything except ``label`` and columns whose
    name starts with ``filename``, cast to float32.

    All vectors are then zero-padded (or truncated) to a common length so they
    can be stacked into one matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``label`` and ``filename`` columns.
    audio_directory : str
        Root directory containing one sub-directory per label.
    target_length : int or None, default None
        Number of columns of the returned matrix. ``None`` keeps the original
        behaviour of using the longest vector found in ``df``. Because that
        length is derived per call, separate calls (e.g. train / validation /
        test) can otherwise yield different widths; pass the training width
        here to keep the splits compatible.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels)`` with shapes ``(n_rows, width)`` and ``(n_rows,)``.
        An empty ``df`` yields a ``(0, width)`` float32 matrix and an empty
        label array instead of raising.

    Raises
    ------
    ValueError
        If ``target_length`` is negative.
    """
    if target_length is not None and target_length < 0:
        raise ValueError(f"target_length must be non-negative, got {target_length}")

    features = []
    labels = []
    for _, row in df.iterrows():
        file_path = os.path.join(audio_directory, row['label'], row['filename'])
        feature_vector = extract_features(file_path)
        if feature_vector is None:
            # NOTE(review): the CSV columns are a different (and presumably much
            # shorter) feature space than the audio-derived vector; after padding
            # they share column positions with unrelated audio features. Confirm
            # this fallback is intended rather than dropping the row.
            feature_vector = row.drop(['label']).filter(regex='^(?!filename)').values.astype(np.float32)
        features.append(feature_vector)
        labels.append(row['label'])

    # max() over an empty sequence raises, so handle the no-rows case explicitly
    if not features:
        return np.empty((0, target_length or 0), dtype=np.float32), np.array(labels)

    width = max(len(f) for f in features) if target_length is None else target_length

    # Ensure all feature vectors have the same length by padding or truncating
    features = np.array([
        np.pad(f, (0, width - len(f)), 'constant') if len(f) < width else f[:width]
        for f in features
    ])
    return features, np.array(labels)

In [4]:
# Splitting the smaller dataset for testing feature extraction (30-sec dataset)
train_df_30_sec, val_df_30_sec, test_df_30_sec = split_data(df_30_sec)

# The scaler is fit on the training split only and then reused for validation and test
scaler_30_sec = StandardScaler()

# Process training, validation, and test sets for 30-sec dataset
train_features_30_sec, train_labels_30_sec = process_dataset(train_df_30_sec, 'data/GTZAN/genres_original')
val_features_30_sec, val_labels_30_sec = process_dataset(val_df_30_sec, 'data/GTZAN/genres_original')
test_features_30_sec, test_labels_30_sec = process_dataset(test_df_30_sec, 'data/GTZAN/genres_original')


def _match_feature_width(features, width):
    """Zero-pad or truncate the columns of a 2-D feature matrix to exactly `width`."""
    n_columns = features.shape[1]
    if n_columns < width:
        return np.pad(features, ((0, 0), (0, width - n_columns)), 'constant')
    return features[:, :width]


# Each process_dataset call pads to the longest vector found in the rows it was
# given, so the three matrices are not guaranteed to share a column count. The
# scaler is fit on the training matrix, so validation and test are aligned to the
# training width first; otherwise transform() fails on a feature-count mismatch.
n_features_30_sec = train_features_30_sec.shape[1]
val_features_30_sec = _match_feature_width(val_features_30_sec, n_features_30_sec)
test_features_30_sec = _match_feature_width(test_features_30_sec, n_features_30_sec)

# Standardize the features for 30-sec dataset
train_features_30_sec = scaler_30_sec.fit_transform(train_features_30_sec)
val_features_30_sec = scaler_30_sec.transform(val_features_30_sec)
test_features_30_sec = scaler_30_sec.transform(test_features_30_sec)

# Display resulting data splits for 30-sec dataset
print("30-sec Dataset - Training Features Shape:", train_features_30_sec.shape)
print("30-sec Dataset - Validation Features Shape:", val_features_30_sec.shape)
print("30-sec Dataset - Test Features Shape:", test_features_30_sec.shape)

  y, sr = librosa.load(file_path, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Audio file data/GTZAN/genres_original/jazz/jazz.00054.wav not found or cannot be processed. Using CSV features instead.
30-sec Dataset - Training Features Shape: (640, 860608)
30-sec Dataset - Validation Features Shape: (160, 860608)
30-sec Dataset - Test Features Shape: (200, 860608)


In [5]:
# Stratified train / validation / test split of the 3-sec feature table
train_df_3_sec, val_df_3_sec, test_df_3_sec = split_data(df_3_sec)

# NOTE(review): presumably several 3-second rows come from the same original
# track; if so, a row-level split can place segments of one track in different
# splits. Verify against the filename column and consider a group-aware split.


def _csv_feature_matrix(frame):
    """Return all columns except 'label' and those whose name starts with 'filename', as float32."""
    feature_columns = frame.drop(columns=['label']).filter(regex='^(?!filename)')
    return feature_columns.values.astype(np.float32)


# Features come straight from the CSV columns; no audio is read for this dataset
train_features_3_sec = _csv_feature_matrix(train_df_3_sec)
val_features_3_sec = _csv_feature_matrix(val_df_3_sec)
test_features_3_sec = _csv_feature_matrix(test_df_3_sec)

train_labels_3_sec = train_df_3_sec['label'].values
val_labels_3_sec = val_df_3_sec['label'].values
test_labels_3_sec = test_df_3_sec['label'].values

# Fit the scaler on the training split only, then apply it to the other two
scaler_3_sec = StandardScaler()
train_features_3_sec = scaler_3_sec.fit_transform(train_features_3_sec)
val_features_3_sec = scaler_3_sec.transform(val_features_3_sec)
test_features_3_sec = scaler_3_sec.transform(test_features_3_sec)

# Report the shape of each standardized split
print("3-sec Dataset - Training Features Shape:", train_features_3_sec.shape)
print("3-sec Dataset - Validation Features Shape:", val_features_3_sec.shape)
print("3-sec Dataset - Test Features Shape:", test_features_3_sec.shape)

3-sec Dataset - Training Features Shape: (6393, 58)
3-sec Dataset - Validation Features Shape: (1599, 58)
3-sec Dataset - Test Features Shape: (1998, 58)
