## PREAMBLE ##
# Author: Luca Carnegie
# Date: 2025-04-23

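# This code runs basic sanity checks on the classifier dataset: it must be non-empty, contain no missing values, have a plausible number of unique values in each categorical column, and use only the ten chosen genres in 'genre_top'.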

# Prerequisites: 01-clean-fma-data

In [2]:
import pandas as pd
import os

# Move into the repository so the relative data path below can be used.
# NOTE(review): this is an absolute, machine-specific path; it will need
# editing on any other machine.
REPO_DIR = "C:\\Users\\lucac\\Documents\\GitHub\\song-cluster"
os.chdir(REPO_DIR)

# Read the classifier dataset into a DataFrame.
DATA_PATH = 'data/analysis_data/classifier_data.csv'
df = pd.read_csv(DATA_PATH)



In [None]:
# Basic data sanity tests
#
# Every failure is raised as an explicit AssertionError instead of using the
# `assert` statement: `assert` is stripped when Python runs with -O (or with
# PYTHONOPTIMIZE set), which would silently disable all of these checks.

# 1. DataFrame is not empty
if df.empty:
    raise AssertionError("DataFrame is empty.")

# 2. No missing values in any column
missing = df.isnull().sum()
if missing.sum() != 0:
    # Report only the columns that actually contain missing values.
    raise AssertionError(f"Missing values found:\n{missing[missing > 0]}")

# 3. Categorical columns should have a reasonable number of unique values
#    (fewer than half the number of rows).
cat_cols = df.select_dtypes(include=['object', 'category']).columns
max_unique = len(df) * 0.5  # loop-invariant threshold
for col in cat_cols:
    n_unique = df[col].nunique()
    if n_unique >= max_unique:
        raise AssertionError(
            f"Column '{col}' has {n_unique} unique values (too many?)."
        )

# 4. Should have one of the ten chosen genres in the genre_top column
valid_genres = [
    'Classical', 'Electronic',
    'Experimental', 'Folk',
    'Hip-Hop', 'Instrumental',
    'International', 'Jazz',
    'Pop', 'Rock',
]
genres = df['genre_top']
# Collect the distinct offending values so the error says what to fix.
invalid_genres = genres[~genres.isin(valid_genres)].drop_duplicates().tolist()
if invalid_genres:
    raise AssertionError(
        f"Invalid genre in 'genre_top' column. Unexpected values: {invalid_genres}"
    )

print("All basic data checks passed.")

All basic data checks passed.
