In [2]:
# Cell 1: imports & settings
import os
import glob

import numpy as np
import pandas as pd
import librosa

# Project root. Override with the SONG_CLUSTER_ROOT environment variable;
# the default is the checkout location this notebook was written against.
PROJECT_ROOT = os.environ.get(
    "SONG_CLUSTER_ROOT",
    "C:\\Users\\lucac\\Documents\\GitHub\\song-cluster",
)
if os.path.isdir(PROJECT_ROOT):
    # The relative paths used below and when saving the CSV are resolved
    # against the current working directory, so move to the project root.
    os.chdir(PROJECT_ROOT)
else:
    # Do not crash on machines where the default path does not exist:
    # keep the current working directory and say so explicitly.
    print(f"PROJECT_ROOT {PROJECT_ROOT!r} not found; staying in {os.getcwd()!r}")

# Directory containing your .wav files
AUDIO_DIR = './data/raw_data/musicgpt_(small)_files'

# Audio settings
SR = 22050        # sampling rate
N_MFCC = 16       # number of MFCCs

In [4]:
def extract_features(filepath):
    """
    Extract a flat feature record from a single audio file.

    The file is loaded with ``librosa.load(filepath, sr=SR)`` and summarised as
    the per-coefficient mean of ``N_MFCC`` MFCCs plus the mean and variance of
    the spectral centroid, spectral rolloff and zero-crossing rate.

    Parameters
    ----------
    filepath : str
        Path to an audio file named ``<prefix>_<genre>.<ext>``,
        e.g. ``musicgpt_rock.wav``.

    Returns
    -------
    dict
        Keys, in order: ``artist_name`` (always ``'musicgpt'``), ``genre_top``
        (the file name after the first underscore, without its extension),
        ``mfcc_1`` .. ``mfcc_<N_MFCC>``, then ``<feature>_mean`` and
        ``<feature>_variance`` for ``centroid``, ``rolloff`` and ``zcr``.

    Raises
    ------
    IndexError
        If the file name contains no underscore.
    """
    y, sr = librosa.load(filepath, sr=SR)

    # MFCCs (n_mfcc x frames); average along axis 1 -> one value per coefficient
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC)
    mfcc_means = mfccs.mean(axis=1)

    # Spectral features
    centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)

    # Genre comes from the file name: musicgpt_[genre].wav
    # splitext removes only the real extension (any case); a plain
    # str.replace('.wav', '') would also eat '.wav' inside the genre itself.
    stem = os.path.splitext(os.path.basename(filepath))[0]
    genre = stem.split('_', 1)[1]

    features = {
        'artist_name': 'musicgpt',
        'genre_top': genre,
    }

    # add MFCC means (1-based column names)
    for i, m in enumerate(mfcc_means, start=1):
        features[f'mfcc_{i}'] = m

    # add mean & variance for each spectral feature, in a fixed key order
    for name, values in (('centroid', centroid),
                         ('rolloff', rolloff),
                         ('zcr', zcr)):
        features[f'{name}_mean'] = values.mean()
        features[f'{name}_variance'] = values.var()

    return features


In [7]:
# Collect the input files. glob returns matches in arbitrary order, so sort
# them to make the row order (and the saved CSV) reproducible across runs.
all_files = sorted(glob.glob(os.path.join(AUDIO_DIR, 'musicgpt_*.wav')))
if not all_files:
    # Fail early with a clear message; otherwise the column selection below
    # raises a KeyError on the empty DataFrame.
    raise FileNotFoundError(
        f"No files matching 'musicgpt_*.wav' found in {AUDIO_DIR!r}"
    )

# One feature dict per file, assembled into a DataFrame in a single step.
rows = []
for fp in all_files:
    feat = extract_features(fp)
    rows.append(feat)

df = pd.DataFrame(rows)

# reorder columns to match your head example
col_order = (
    ['artist_name', 'genre_top'] +
    [f'mfcc_{i}' for i in range(1, N_MFCC+1)] +
    ['centroid_mean','centroid_variance',
     'rolloff_mean','rolloff_variance',
     'zcr_mean','zcr_variance']
)
df = df[col_order]

# save to csv (create the output directory first if it is missing)
output_path = './data/analysis_data/musicgpt_features.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)

# Keep the preview as the last expression of the cell so the notebook
# actually renders it.
df.head(15)