# 01 - Week 1: Data Setup & EDA

Tasks:
1. Load raw Spotify datasets (tracks + Top 100 list).
2. Merge and label tracks as HIT/NON-HIT.
3. Explore class imbalance and feature distributions.
4. Save the processed dataset for modeling.

## 1. Imports and settings

In [None]:
import os
import json
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Seed constant; not used in this notebook — presumably consumed by later
# modeling notebooks (TODO confirm).
RANDOM_STATE = 42

# Suppress every warning category to keep cell output short, then switch
# matplotlib to the seaborn-like style sheet.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')


## 2. Configuration
Update paths here if your files live elsewhere.

In [None]:
# Input CSVs (raw) and the output CSV (processed), relative to the working
# directory the notebook is run from.
RAW_TRACKS_PATH = Path('data') / 'raw' / 'tracks.csv'
TOP100_PATH = Path('data') / 'raw' / 'top100_tracks.csv'
PROCESSED_PATH = Path('data') / 'processed' / 'hits_dataset.csv'

# Audio feature columns to keep; columns missing from the tracks file are
# filtered out later, so this list may be a superset of what is available.
AUDIO_FEATURES = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]


## 3. Load datasets
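Adjust column names if your source files differ. The labeling step expects both files to have a track-name column (`name`, `track_name`, or `track`) and an artist column (`artist`, `artists`, or `artists_name`); an `id` column is used for matching only when both files have one.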

In [None]:
# Read both raw CSVs first, then preview the first rows of each.
tracks_df, top100_df = (
    pd.read_csv(csv_path) for csv_path in (RAW_TRACKS_PATH, TOP100_PATH)
)

for preview in (tracks_df.head(), top100_df.head()):
    print(preview)


## 4. Label hits
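Priority order: (1) Spotify track ID match, (2) exact match on lowercased, whitespace-stripped name + artist, (3) fuzzy match fallback (token-sort ratio of at least 90 on the combined "name - artist" string).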

In [None]:
def label_hits(tracks: pd.DataFrame, hits: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``tracks`` with a boolean ``is_hit`` column.

    A track is labeled as a hit when it matches any row of ``hits`` by, in
    priority order:

    1. Spotify ``id`` (only when both frames have an ``id`` column),
    2. exact ``(name, artist)`` pair after lowercasing and stripping,
    3. fuzzy match of ``"name - artist"`` with a token-sort ratio >= 90.

    Column aliases (``track_name``/``track`` -> ``name``,
    ``artists``/``artists_name`` -> ``artist``) are renamed in the returned
    copy, and ``name``/``artist`` are returned in normalized form. Neither
    input frame is modified.

    Args:
        tracks: Candidate tracks; needs a name and an artist column.
        hits: Reference list of hit tracks; needs a name and an artist column.

    Returns:
        The normalized copy of ``tracks`` with ``is_hit`` set for every row.

    Raises:
        KeyError: If either frame lacks a name or artist column.
    """
    tracks = tracks.copy()
    hits = hits.copy()

    # Standardize column names
    rename_map = {
        'track_name': 'name',
        'track': 'name',
        'artists': 'artist',
        'artists_name': 'artist',
    }
    tracks.rename(columns=rename_map, inplace=True)
    hits.rename(columns=rename_map, inplace=True)

    for frame in (tracks, hits):
        for col in ('name', 'artist'):
            frame[col] = frame[col].str.lower().str.strip()

    # All matching below is positional (plain boolean arrays), so it works for
    # empty frames and does not depend on the index labels being unique.

    # Strategy 1: Spotify ID
    if 'id' in tracks.columns and 'id' in hits.columns:
        hit_ids = set(hits['id'].dropna())
        id_match = tracks['id'].isin(hit_ids).to_numpy(dtype=bool)
    else:
        id_match = np.zeros(len(tracks), dtype=bool)

    # Strategy 2: exact name + artist match. Checking every row is equivalent
    # to checking only the unmatched ones because the masks are OR-ed.
    track_pairs = list(zip(tracks['name'], tracks['artist']))
    hit_pairs = set(zip(hits['name'], hits['artist']))
    exact_match = np.fromiter(
        (pair in hit_pairs for pair in track_pairs),
        dtype=bool,
        count=len(track_pairs),
    )
    is_hit = id_match | exact_match

    # Strategy 3: fuzzy match for rows that are still unmatched.
    # The choice list is built once; results are cached per candidate string
    # so duplicated tracks are scored a single time.
    choices = [f"{n} - {a}" for n, a in zip(hits['name'], hits['artist'])]
    if choices:
        verdicts = {}
        for pos in np.flatnonzero(~is_hit):
            name, artist = track_pairs[pos]
            candidate = f"{name} - {artist}"
            if candidate not in verdicts:
                best = process.extractOne(
                    candidate, choices, scorer=fuzz.token_sort_ratio
                )
                # best is a (choice, score) pair; treat "no result" as no match.
                verdicts[candidate] = best is not None and best[1] >= 90
            is_hit[pos] = verdicts[candidate]

    tracks['is_hit'] = is_hit
    return tracks

# Label every track, then show the share of each class (fractions, not counts).
tracks_labeled = label_hits(tracks=tracks_df, hits=top100_df)
class_shares = tracks_labeled['is_hit'].value_counts(normalize=True)
print(class_shares)


## 5. Feature selection and cleaning

In [None]:
# Keep audio features plus identifiers for reference
available_features = [c for c in AUDIO_FEATURES if c in tracks_labeled.columns]

# 'id' is optional in the labeling step, so only select it when it exists;
# 'release_date' is required for the temporal features below.
reference_cols = ['name', 'artist']
if 'id' in tracks_labeled.columns:
    reference_cols.append('id')
selected_cols = reference_cols + ['release_date', 'is_hit'] + available_features

# Drop rows missing any audio feature; copy so the column assignments below
# write to an independent frame rather than a derived selection.
processed = (
    tracks_labeled[selected_cols]
    .dropna(subset=available_features)
    .copy()
)

# Ensure correct dtypes: unparseable dates become NaT and are dropped.
processed['release_date'] = pd.to_datetime(processed['release_date'], errors='coerce')
processed = processed.dropna(subset=['release_date']).copy()

# Temporal features
processed['release_year'] = processed['release_date'].dt.year
processed['release_month'] = processed['release_date'].dt.month


## 6. Class balance analysis

In [None]:
# Share of tracks labeled as hits, together with the raw counts behind it.
n_tracks = len(processed)
n_hits = processed['is_hit'].sum()
hit_rate = processed['is_hit'].mean()
print(f"Hit rate: {hit_rate:.3%} ({n_hits} hits / {n_tracks} tracks)")


## 7. Visualizations

In [None]:
# Make sure the output directory exists before the first savefig call.
figures_dir = Path('figures')
figures_dir.mkdir(parents=True, exist_ok=True)

# The per-feature plots need at least one audio feature: plt.subplots rejects
# zero rows and a heatmap of an empty correlation matrix cannot be drawn.
if available_features:
    # One box plot per audio feature, split by hit status.
    fig, axes = plt.subplots(
        len(available_features), 1,
        figsize=(8, 4 * len(available_features)),
        constrained_layout=True,
        squeeze=False,  # always a 2-D array, even for a single feature
    )
    axes = axes.ravel()
    for ax, feat in zip(axes, available_features):
        sns.boxplot(data=processed, x='is_hit', y=feat, ax=ax)
        ax.set_title(f"{feat} by hit status")
    fig.savefig(figures_dir / 'feature_distributions.png', dpi=300)
    plt.close(fig)

# Track counts per release year, split by hit status.
plt.figure(figsize=(10, 6))
sns.countplot(data=processed, x='release_year', hue='is_hit')
plt.title('Tracks by year and hit status')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(figures_dir / 'tracks_by_year.png', dpi=300)
plt.close()

if available_features:
    # Pairwise correlation between the audio features.
    corr = processed[available_features].corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Feature correlation')
    plt.tight_layout()
    plt.savefig(figures_dir / 'correlation_matrix.png', dpi=300)
    plt.close()


## 8. Save processed data

In [None]:
# Create the output directory if needed, then write the dataset without the
# DataFrame index and report the absolute path that was written.
output_dir = PROCESSED_PATH.parent
output_dir.mkdir(parents=True, exist_ok=True)
processed.to_csv(PROCESSED_PATH, index=False)
print(f"Saved processed dataset to {PROCESSED_PATH.resolve()}")


Proceed to `02_Week2_Baseline_Modeling.ipynb`.