# 01: MERGE Preprocessing

This notebook preprocesses the MERGE balanced dataset for downstream embedding and clustering tasks. 

The goal is to load metadata and arousal-valence annotations, filter for complete records, and produce a clean subset with validated audio paths.

## Setup

In [None]:
import sys
from pathlib import Path

import pandas as pd

project_root = Path.cwd().parent.parent
sys.path.insert(0, str(project_root))

from configs.dataset import load_config
from dataset.filter import METADATA_COLUMNS, AV_COLUMNS

In [None]:
# Read the project configuration and pull out the entry for the MERGE dataset;
# the cells below take their file locations from `merge`.
config_file = project_root.joinpath("configs", "config.yaml")
config = load_config(config_file)
merge = config.datasets["merge"]

print(f"Dataset root: {merge.root}")
print(f"Audio dir: {merge.audio.dir}")

## Data Preparation

### Load raw CSVs

The MERGE dataset provides two CSV files:
- `merge_audio_balanced_metadata.csv` with 14 columns of track metadata and tags
- `merge_audio_balanced_av_values.csv` with arousal-valence annotations

In [None]:
# Both CSV locations come from the dataset config rather than being hard-coded.
metadata_path = merge.metadata["metadata_file"]
av_path = merge.metadata["av_values_file"]

# Read the metadata table first, then the arousal-valence table.
meta_df, av_df = (pd.read_csv(csv_path) for csv_path in (metadata_path, av_path))

# Report the raw table sizes before any renaming or filtering.
print(f"Metadata: {meta_df.shape[0]} rows, {meta_df.shape[1]} columns")
print(f"AV values: {av_df.shape[0]} rows, {av_df.shape[1]} columns")

### Standardize column names

We use the column mappings from `dataset.filter` to convert raw headers to standardized names.

In [None]:
# Rename the raw headers through the mapping, then keep only the mapped
# columns, in the order the mapping lists them. A raw column missing from the
# CSV surfaces here as a KeyError.
meta_df = meta_df.rename(columns=METADATA_COLUMNS)[[*METADATA_COLUMNS.values()]]
av_df = av_df.rename(columns=AV_COLUMNS)[[*AV_COLUMNS.values()]]

print("Metadata columns:", meta_df.columns.tolist())
print("AV columns:", av_df.columns.tolist())

### Merge metadata with AV values

Inner join on `song_id` to combine metadata with arousal-valence annotations.

In [None]:
# Inner join: only songs present in both tables survive.
# NOTE(review): a song_id repeated in either table would multiply rows here;
# uniqueness of song_id is not checked in this notebook.
df = pd.merge(meta_df, av_df, how="inner", on="song_id")
print(f"Merged: {len(df)} rows")

### Filter for complete data

Require all tag fields to be populated (non-empty strings) and both arousal and valence values to be present (non-null).

In [None]:
tag_cols = ["mood_all", "genre", "theme", "style"]

# Build one boolean mask and slice once, instead of re-slicing per column.
# Start from the annotation check: both arousal and valence must be present.
keep = df["arousal"].notna() & df["valence"].notna()

for col in tag_cols:
    # A tag is populated when it is non-null and not blank after trimming.
    # astype(str) keeps the .str accessor usable even if pandas inferred a
    # non-string dtype for the column (e.g. float64 when every value is
    # missing); the notna() term still rejects the resulting "nan" strings.
    keep &= df[col].notna() & df[col].astype(str).str.strip().ne("")

df = df[keep]

print(f"After filtering: {len(df)} rows")

### Filter excluded genres

Remove tracks where all genres are non-sonic categories (Children's, Holiday, Religious, etc.).

In [None]:
from dataset.filter import EXCLUDED_GENRES

def has_valid_genre(genre_str: str, excluded=None) -> bool:
    """Check whether a track has at least one genre outside the excluded list.

    Args:
        genre_str: Comma-separated genre names; whitespace around each name
            is ignored.
        excluded: Container of genre names to treat as excluded. Defaults to
            ``EXCLUDED_GENRES`` when omitted.

    Returns:
        True if at least one non-empty genre name is not in ``excluded``,
        otherwise False.
    """
    if excluded is None:
        excluded = EXCLUDED_GENRES
    genres = [g.strip() for g in genre_str.split(",")]
    # Skip empty tokens (trailing or doubled commas): an empty name is never
    # in the excluded list and would otherwise make an excluded-only track
    # look valid.
    return any(g and g not in excluded for g in genres)

before = len(df)

# Keep only tracks with at least one non-excluded genre. astype(bool) makes
# sure the result is always treated as a row mask, including the case where
# the frame is empty and apply() has no values to infer a dtype from.
genre_mask = df["genre"].apply(has_valid_genre).astype(bool)
df = df[genre_mask]

print(f"Filtered excluded-only genres: {before} → {len(df)} tracks")

### Build and validate audio paths

Construct audio file paths and filter to tracks with existing files.

In [None]:
# Path() leaves Path values unchanged and also accepts a plain string, so the
# "/" joins in build_audio_path work either way.
audio_dir = Path(merge.audio.dir)


def build_audio_path(row: pd.Series) -> Path:
    """Return the expected audio file location for one track.

    The file is looked up at ``<audio_dir>/<quadrant>/<song_id>.mp3``.

    Args:
        row: One row of the merged frame; must provide ``quadrant`` and
            ``song_id``.

    Returns:
        The candidate path. Existence is not checked here.
    """
    return audio_dir / row["quadrant"] / f"{row['song_id']}.mp3"


# Plain lists instead of df.apply(axis=1): apply can hand back a DataFrame
# rather than a Series when the frame has no rows, which breaks the column
# assignment.
audio_paths = [build_audio_path(row) for _, row in df.iterrows()]
audio_exists = pd.Series(
    [path.exists() for path in audio_paths], index=df.index, dtype=bool
)

n_missing = int((~audio_exists).sum())
print(f"Missing audio files: {n_missing}")

# assign() returns a new frame, so no column is written onto the filtered
# slice produced by the earlier cells (avoids SettingWithCopyWarning). The
# final copy() gives later cells an independent frame to modify.
df = df.assign(audio_path=audio_paths)[audio_exists].copy()
print(f"Final: {len(df)} tracks with valid audio")

In [None]:
# Convert the Path objects in audio_path to plain strings, then preview the
# first rows of the result.
df = df.assign(audio_path=df["audio_path"].astype(str))
df.head()

## Save Data

In [None]:
output_dir = project_root / "notebooks" / "data"
# parents=True also creates any missing intermediate directory, so the save
# does not fail if "notebooks/" is absent under the computed project root.
output_dir.mkdir(parents=True, exist_ok=True)

# Write the cleaned subset without the DataFrame index.
output_path = output_dir / "merge_preprocessed.csv"
df.to_csv(output_path, index=False)

print(f"Saved: {output_path}")
print(f"Shape: {df.shape}")