In [5]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import librosa.display
import IPython.display as ipd
import kagglehub
import librosa
from pathlib import Path
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Download the GTZAN dataset from Kaggle
path = kagglehub.dataset_download("andradaolteanu/gtzan-dataset-music-genre-classification")
print("Path to dataset files:", path)

Path to dataset files: /home/madojo/.cache/kagglehub/datasets/andradaolteanu/gtzan-dataset-music-genre-classification/versions/1


In [7]:
def split_audio(file_path):
    """
    split 30s fragment of WAV file into 3s frament.

    Parameters:
    - file_path (str): Path to the WAV audio file.
    - label (str): Genre label of the audio file.

    Returns:
    - 3 second fragment of WAV file.
    """

    try:
        # Load the audio file
        y, sr = librosa.load(file_path, duration=30)  # Ensures consistency in duration
        features = {}
        print(len(y)/sr)
        # Define the duration of each fragment (3 seconds)
        fragment_duration = 3  # seconds
        fragment_samples = fragment_duration * sr  # number of samples per fragment

        # Number of fragments
        num_fragments = len(y) // fragment_samples

        # Create a list to store the fragments
        fragments = []

        # Loop through the audio and create 3-second fragments
        for i in range(num_fragments):
            start_sample = i * fragment_samples
            end_sample = start_sample + fragment_samples
            frag = y[start_sample:end_sample]
            fragments.append(frag)

        return fragments, sr

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

In [11]:
def extract_audio_features(file_path, y, sr, label):
    """
    Extracts a comprehensive set of audio features from a WAV file.

    Parameters:
    - file_path (str): Path to the WAV audio file.
    - label (str): Genre label of the audio file.

    Returns:
    - features (dict): A dictionary containing the extracted audio features.
    """
    features = {}

    # Basic Metadata
    features['filename'] = os.path.basename(file_path)
    features['length'] = len(y)

    # Chroma STFT
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    features['chroma_stft_mean'] = np.mean(chroma_stft)
    features['chroma_stft_var'] = np.var(chroma_stft)

    # RMS Energy
    rms = librosa.feature.rms(y=y)
    features['rms_mean'] = np.mean(rms)
    features['rms_var'] = np.var(rms)

    # Spectral Centroid
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    features['spectral_centroid_mean'] = np.mean(spectral_centroid)
    features['spectral_centroid_var'] = np.var(spectral_centroid)

    # Spectral Bandwidth
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    features['spectral_bandwidth_mean'] = np.mean(spectral_bandwidth)
    features['spectral_bandwidth_var'] = np.var(spectral_bandwidth)

    # Spectral Rolloff
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    features['rolloff_mean'] = np.mean(rolloff)
    features['rolloff_var'] = np.var(rolloff)

    # Zero Crossing Rate
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
    features['zero_crossing_rate_mean'] = np.mean(zero_crossing_rate)
    features['zero_crossing_rate_var'] = np.var(zero_crossing_rate)

    # Harmony
    y_harmonic, _ = librosa.effects.hpss(y)
    harmony = librosa.feature.spectral_contrast(y=y_harmonic, sr=sr)
    features['harmony_mean'] = np.mean(harmony)
    features['harmony_var'] = np.var(harmony)

    # Percussive
    y_percussive, _ = librosa.effects.hpss(y)
    perceptr = librosa.feature.spectral_contrast(y=y_percussive, sr=sr)
    features['perceptr_mean'] = np.mean(perceptr)
    features['perceptr_var'] = np.var(perceptr)

    # Tempo
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)
    features['tempo'] = tempo[0]

    # MFCCs
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    for i in range(1, 21):
        mfcc = mfccs[i-1]
        features[f'mfcc{i}_mean'] = np.mean(mfcc)
        features[f'mfcc{i}_var'] = np.var(mfcc)

    # Label
    features['label'] = label

    return features

In [1]:
def load_and_extract_audio_features(file_path, label):
    """
    Extracts a comprehensive set of audio features from a WAV file.

    Parameters:
    - file_path (str): Path to the WAV audio file.
    - label (str): Genre label of the audio file.

    Returns:
    - features (dict): A dictionary containing the extracted audio features.
    """
    try:
        # Load the audio file
        y, sr = librosa.load(file_path, duration=30)  # Ensures consistency in duration
        features = {}

        # Basic Metadata
        features['filename'] = os.path.basename(file_path)
        features['length'] = len(y)

        # Chroma STFT
        chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
        features['chroma_stft_mean'] = np.mean(chroma_stft)
        features['chroma_stft_var'] = np.var(chroma_stft)

        # RMS Energy
        rms = librosa.feature.rms(y=y)
        features['rms_mean'] = np.mean(rms)
        features['rms_var'] = np.var(rms)

        # Spectral Centroid
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        features['spectral_centroid_mean'] = np.mean(spectral_centroid)
        features['spectral_centroid_var'] = np.var(spectral_centroid)

        # Spectral Bandwidth
        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        features['spectral_bandwidth_mean'] = np.mean(spectral_bandwidth)
        features['spectral_bandwidth_var'] = np.var(spectral_bandwidth)

        # Spectral Rolloff
        rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        features['rolloff_mean'] = np.mean(rolloff)
        features['rolloff_var'] = np.var(rolloff)

        # Zero Crossing Rate
        zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
        features['zero_crossing_rate_mean'] = np.mean(zero_crossing_rate)
        features['zero_crossing_rate_var'] = np.var(zero_crossing_rate)

        # Harmony
        y_harmonic, _ = librosa.effects.hpss(y)
        harmony = librosa.feature.spectral_contrast(y=y_harmonic, sr=sr)
        features['harmony_mean'] = np.mean(harmony)
        features['harmony_var'] = np.var(harmony)

        # Percussive
        y_percussive, _ = librosa.effects.hpss(y)
        perceptr = librosa.feature.spectral_contrast(y=y_percussive, sr=sr)
        features['perceptr_mean'] = np.mean(perceptr)
        features['perceptr_var'] = np.var(perceptr)

        # Tempo
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)
        features['tempo'] = tempo[0]

        # MFCCs
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
        for i in range(1, 21):
            mfcc = mfccs[i-1]
            features[f'mfcc{i}_mean'] = np.mean(mfcc)
            features[f'mfcc{i}_var'] = np.var(mfcc)

        # Label
        features['label'] = label

        return features

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None


In [None]:
# Define the path to your dataset: may be root or user home directory

#DATASET_PATH = '/root/.cache/kagglehub/datasets/andradaolteanu/gtzan-dataset-music-genre-classification/versions/1/Data/genres_original'
DATASET_PATH = os.path.expanduser('~/.cache/kagglehub/datasets/andradaolteanu/gtzan-dataset-music-genre-classification/versions/1/Data/genres_original')

# Define the genres (ensure these match your dataset's genres)
GENRES = ['blues', 'classical', 'country', 'disco', 'hiphop',
          'jazz', 'metal', 'pop', 'reggae', 'rock']

# Initialize a list to hold feature dictionaries
features_list = []

# Iterate through each genre and extract features from each file
for genre in GENRES:
    genre_dir = Path(DATASET_PATH) / genre
    if not genre_dir.exists():
        print(f"Genre directory {genre_dir} does not exist. Skipping...")
        continue

    # List all WAV files in the genre directory
    wav_files = list(genre_dir.glob('*.wav'))

    print(f"Processing genre: {genre} ({len(wav_files)} files)")
    for wav_file in tqdm(wav_files, desc=f"Processing {genre}"):
        fragment_list, sr= split_audio(str(wav_file))
        for idx, fragment in enumerate(fragment_list):
            print(f"Processing fragment {idx}...")
            feature_dict = extract_audio_features(str(wav_file), fragment, sr, genre)
            if feature_dict:
                features_list.append(feature_dict)

# Create a DataFrame from the list of feature dictionaries
df = pd.DataFrame(features_list)

# Display the first few rows of the DataFrame
print("Sample of extracted features:")
print(df.head())

# Save the DataFrame to a CSV file
output_csv = 'audio_features_extracted.csv'
df.to_csv(output_csv, index=False)
print(f"Feature extraction complete. Saved to {output_csv}")

Processing genre: blues (100 files)


Processing blues:   0%|          | 0/100 [00:00<?, ?it/s]

30.0
Processing fragment 0...


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)


Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:   1%|          | 1/100 [00:05<08:26,  5.11s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:   2%|▏         | 2/100 [00:09<07:59,  4.89s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:   3%|▎         | 3/100 [00:13<06:40,  4.13s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:   4%|▍         | 4/100 [00:16<05:59,  3.75s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:   5%|▌         | 5/100 [00:19<05:35,  3.53s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:   6%|▌         | 6/100 [00:22<05:18,  3.39s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:   7%|▋         | 7/100 [00:25<05:07,  3.30s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:   8%|▊         | 8/100 [00:29<05:11,  3.39s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:   9%|▉         | 9/100 [00:32<05:09,  3.40s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  10%|█         | 10/100 [00:37<05:35,  3.73s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  11%|█         | 11/100 [00:41<05:57,  4.01s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  12%|█▏        | 12/100 [00:44<05:31,  3.77s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  13%|█▎        | 13/100 [00:48<05:25,  3.75s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  14%|█▍        | 14/100 [00:51<05:08,  3.58s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  15%|█▌        | 15/100 [00:55<05:01,  3.55s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  16%|█▌        | 16/100 [00:58<04:49,  3.45s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  17%|█▋        | 17/100 [01:01<04:42,  3.40s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  18%|█▊        | 18/100 [01:05<04:36,  3.37s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  19%|█▉        | 19/100 [01:08<04:29,  3.33s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  20%|██        | 20/100 [01:11<04:24,  3.31s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  21%|██        | 21/100 [01:14<04:18,  3.27s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  22%|██▏       | 22/100 [01:17<04:11,  3.22s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  23%|██▎       | 23/100 [01:20<04:04,  3.18s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  24%|██▍       | 24/100 [01:24<04:09,  3.29s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  25%|██▌       | 25/100 [01:27<04:07,  3.30s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  26%|██▌       | 26/100 [01:31<04:08,  3.36s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...
Processing fragment 3...
Processing fragment 4...
Processing fragment 5...
Processing fragment 6...
Processing fragment 7...
Processing fragment 8...
Processing fragment 9...


Processing blues:  27%|██▋       | 27/100 [01:34<04:01,  3.31s/it]

30.0
Processing fragment 0...
Processing fragment 1...
Processing fragment 2...


In [3]:
print(f"Contents of the dataset at: {path}\n")

for root, dirs, files in os.walk(path):
    level = root.replace(path, '').count(os.sep)
    indent = ' ' * 4 * level
    print(f"{indent}{os.path.basename(root)}/")
    subindent = ' ' * 4 * (level + 1)
    for f in files:
        print(f"{subindent}{f}")

Contents of the dataset at: /home/madojo/.cache/kagglehub/datasets/andradaolteanu/gtzan-dataset-music-genre-classification/versions/1

1/
    Data/
        features_30_sec.csv
        features_3_sec.csv
        genres_original/
            metal/
                metal.00061.wav
                metal.00062.wav
                metal.00073.wav
                metal.00050.wav
                metal.00095.wav
                metal.00066.wav
                metal.00012.wav
                metal.00034.wav
                metal.00037.wav
                metal.00094.wav
                metal.00055.wav
                metal.00057.wav
                metal.00007.wav
                metal.00031.wav
                metal.00081.wav
                metal.00093.wav
                metal.00020.wav
                metal.00077.wav
                metal.00004.wav
                metal.00092.wav
                metal.00021.wav
                metal.00052.wav
                metal.00045.wav
                m

In [6]:
dfDATASET_PATH = os.path.expanduser('~/.cache/kagglehub/datasets/andradaolteanu/gtzan-dataset-music-genre-classification/versions/1/Data/genres_original')

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00034.wav,661500,0.262119,0.099052,0.135918,0.003228,1536.731078,3.411545e+05,2157.058612,161035.529489,...,66.560493,-15.326929,116.120934,-6.845921,93.477928,-9.827670,90.838371,-13.943093,92.893188,blues
1,blues.00071.wav,661500,0.305249,0.080075,0.120495,0.000327,2429.335959,9.769592e+04,2286.694983,25051.139843,...,39.768433,-10.848846,36.446468,3.321632,41.193359,-6.187531,42.513241,2.999421,40.246086,blues
2,blues.00003.wav,661500,0.404848,0.094018,0.141139,0.006340,1070.110059,1.845067e+05,1596.244204,166637.568358,...,44.456993,-3.320055,50.236965,0.637263,37.351917,-0.617507,37.291168,-3.406940,31.988441,blues
3,blues.00073.wav,661500,0.342027,0.084189,0.259655,0.005487,2094.775834,1.086419e+05,2158.105747,42017.307458,...,38.748482,-2.830827,45.266598,4.726117,44.094547,0.482423,51.535992,1.477394,70.009857,blues
4,blues.00037.wav,661500,0.248683,0.098003,0.069130,0.002232,1188.101159,3.697985e+05,1682.707390,200378.602726,...,194.047531,-7.189154,158.227753,-9.366816,176.350723,-4.469685,142.500168,-3.683124,175.982468,blues
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,rock.00056.wav,661500,0.488519,0.072430,0.175698,0.002058,3303.904680,4.609732e+05,2914.147802,76963.771617,...,29.190754,-0.688878,26.353708,1.526084,24.003740,-0.467646,26.583841,0.503203,25.119268,rock
995,rock.00011.wav,661500,0.343224,0.088863,0.123377,0.000945,1536.987180,4.917425e+05,2036.922074,173140.946072,...,56.059555,-4.372727,71.118393,1.172764,96.851227,0.487429,80.008102,-5.829122,75.735825,rock
996,rock.00063.wav,661500,0.329628,0.093699,0.053099,0.000365,1711.662556,1.535022e+06,2310.122655,689970.984576,...,63.449406,-0.816013,97.788795,-2.373943,86.863861,0.376118,75.303490,-1.867185,125.579407,rock
997,rock.00083.wav,661500,0.378793,0.081320,0.101675,0.000519,2115.983895,1.894329e+05,2245.276674,61790.276599,...,39.681881,-8.042082,42.630722,3.527468,34.146507,-7.682268,51.044621,-1.345104,43.839245,rock
