In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import entropy
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
df_chords = pd.read_csv("../data/df_jazz_chords.csv")
df_chords.head()

Unnamed: 0,Title,Composer,Key,Rhythm,Time Signature,Section,Measure,Chord Index,Chord,Year,Roman Numeral,Root,Root_pc,Chord_clean,Chord Type,Chord Extension
0,9.20 Special,Earl Warren,,Medium Swing,4/4,A,0,0,D9,1941,,D,2.0,D9,Triad,9
1,9.20 Special,Earl Warren,,Medium Swing,4/4,A,1,1,Fm6,1941,,F,5.0,Fm6,Triad,6
2,9.20 Special,Earl Warren,,Medium Swing,4/4,A,2,2,D9,1941,,D,2.0,D9,Triad,9
3,9.20 Special,Earl Warren,,Medium Swing,4/4,A,3,3,Fm6,1941,,F,5.0,Fm6,Triad,6
4,9.20 Special,Earl Warren,,Medium Swing,4/4,A,4,4,C,1941,,C,0.0,C,Triad,No extension


In [3]:
#define helper functions for computing model features

def compute_root_motion(root_pcs):
    diffs = root_pcs.diff().dropna()
    return np.minimum(np.abs(diffs), 12 - np.abs(diffs))

def pitch_class_entropy(root_pcs):
    counts = root_pcs.value_counts(normalize=True)
    return entropy(counts)

In [4]:
# define function for extracting song features for our model

def extract_song_features(song_df):
    features = {}

    # ------------------------
    # Basic structure
    # ------------------------
    features["n_chords"] = len(song_df)
    features["n_measures"] = song_df["Measure"].nunique()
    features["chords_per_measure"] = (
        features["n_chords"] / features["n_measures"]
        if features["n_measures"] > 0 else 0
    )

    # ------------------------
    # Harmonic vocabulary
    # ------------------------
    chord_types = song_df["Chord Type"].value_counts(normalize=True)

    features["pct_triad"] = chord_types.get("Triad", 0)
    features["pct_maj7"] = chord_types.get("Maj7", 0)
    features["pct_m7"] = chord_types.get("m7", 0)
    features["pct_dominant"] = chord_types.get("7", 0)
    features["pct_diminished"] = chord_types.get("o7", 0) + chord_types.get("Ã¸7", 0)

    # Extensions
    extensions = pd.to_numeric(song_df["Chord Extension"], errors="coerce")
    features["pct_extensions_9plus"] = (extensions >= 9).mean()

    # ------------------------
    # Root motion
    # ------------------------
    root_pcs = song_df["Root_pc"].dropna()

    if len(root_pcs) > 1:
        motion = compute_root_motion(root_pcs)
        features["mean_root_motion"] = motion.mean()

        # Descending fifth = +5 mod 12
        features["pct_desc_fifth"] = (motion == 5).mean()
    else:
        features["mean_root_motion"] = 0
        features["pct_desc_fifth"] = 0

    # ------------------------
    # Tonal stability
    # ------------------------
    features["root_pc_entropy"] = (
        pitch_class_entropy(root_pcs)
        if len(root_pcs) > 0 else 0
    )

    # ------------------------
    # Metadata (single-valued)
    # ------------------------
    features["Rhythm"] = song_df["Rhythm"].iloc[0]
    features["TimeSignature"] = song_df["Time Signature"].iloc[0]
    features["Year"] = song_df["Year"].iloc[0]

    return features


In [5]:
song_features = (
    df_chords
    .groupby("Title")
    .apply(extract_song_features)
    .apply(pd.Series)
    .reset_index()
)

  .apply(extract_song_features)


In [6]:
song_features.head()

Unnamed: 0,Title,n_chords,n_measures,chords_per_measure,pct_triad,pct_maj7,pct_m7,pct_dominant,pct_diminished,pct_extensions_9plus,mean_root_motion,pct_desc_fifth,root_pc_entropy,Rhythm,TimeSignature,Year
0,26-2,58,8,7.25,0.0,0.37931,0.155172,0.465517,0.0,0.0,3.929825,0.578947,2.191928,Medium Up Swing,4/4,1947
1,500 Miles High,22,18,1.222222,0.0,0.181818,0.545455,0.136364,0.136364,0.136364,1.52381,0.142857,2.161287,Bossa Nova,4/4,1972
2,502 Blues,22,20,1.1,0.0,0.227273,0.272727,0.318182,0.181818,0.227273,3.571429,0.52381,1.943998,Waltz,3/4,1959
3,52nd Street Theme,38,8,4.75,0.315789,0.0,0.315789,0.368421,0.0,0.0,3.945946,0.675676,1.489767,Up Tempo Swing,4/4,1947
4,9.20 Special,37,10,3.7,0.459459,0.0,0.0,0.513514,0.0,0.189189,2.685714,0.371429,1.859337,Medium Swing,4/4,1941


In [7]:
song_features.dtypes

Title                    object
n_chords                  int64
n_measures                int64
chords_per_measure      float64
pct_triad               float64
pct_maj7                float64
pct_m7                  float64
pct_dominant            float64
pct_diminished          float64
pct_extensions_9plus    float64
mean_root_motion        float64
pct_desc_fifth          float64
root_pc_entropy         float64
Rhythm                   object
TimeSignature            object
Year                     object
dtype: object

In [8]:
song_features['Year'] = pd.to_numeric(song_features['Year'], errors='coerce')

In [9]:
song_features.describe()

Unnamed: 0,n_chords,n_measures,chords_per_measure,pct_triad,pct_maj7,pct_m7,pct_dominant,pct_diminished,pct_extensions_9plus,mean_root_motion,pct_desc_fifth,root_pc_entropy,Year
count,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1308.0
mean,40.539074,12.776411,3.626327,0.1593,0.134307,0.244574,0.3949,0.056156,0.11914,3.292028,0.50009,1.870712,1949.574159
std,14.037484,5.229277,1.724451,0.177627,0.120581,0.138959,0.172154,0.061448,0.166236,0.818532,0.179606,0.303576,15.610793
min,7.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.064516,0.0,0.450561,1800.0
25%,32.0,8.0,2.2375,0.029412,0.044697,0.15,0.305556,0.0,0.0,2.898883,0.4,1.735601,1937.0
50%,40.0,10.0,3.5,0.1,0.117647,0.25,0.37931,0.042553,0.060606,3.413418,0.526316,1.908178,1950.0
75%,49.0,16.0,4.75,0.226906,0.194953,0.333333,0.465417,0.083333,0.156076,3.868297,0.633943,2.063049,1961.0
max,101.0,44.0,10.0,1.0,1.0,0.863636,1.0,0.382353,1.0,5.0,1.0,2.484907,1999.0


In [10]:
# Convert categorical columns to dummies 

numeric_features = [
    "n_chords", "n_measures", "chords_per_measure",
    "pct_triad", "pct_maj7", "pct_m7", "pct_dominant", "pct_diminished",
    "pct_extensions_9plus", "mean_root_motion", "pct_desc_fifth", "root_pc_entropy", "Year"
]

categorical_features = ["Rhythm", "TimeSignature"]

song_features_processed = pd.get_dummies(
    song_features,
    columns=categorical_features,
    drop_first=True  
)

In [11]:
# Standardize numeric columns (after dealing with null values)

song_features_processed[numeric_features] = song_features_processed[numeric_features].replace('na', pd.NA)
song_features_processed[numeric_features] = song_features_processed[numeric_features].apply(pd.to_numeric, errors='coerce')
song_features_processed[numeric_features] = song_features_processed[numeric_features].fillna(song_features_processed[numeric_features].mean())

scaler = StandardScaler()
song_features_processed[numeric_features] = scaler.fit_transform(
    song_features_processed[numeric_features]
)

In [14]:
import joblib

joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [12]:
song_features_processed.to_csv(
    "../data/song_features_processed.csv",
    index=False
)

In [13]:
# Split into training and testing datasets

X = song_features_processed.drop(columns=["Title"])

# Since we plan to do unsuperervised clustering, we only need an X set (and not a y)
X_train, X_test = train_test_split(
    X,
    test_size=0.2,   
    random_state=42, 
    shuffle=True
)
