# Week 1 – Data Setup and EDA

This notebook loads Spotify audio features and Top 100 charts, labels tracks as hits/non-hits, and performs exploratory data analysis. Outputs are saved to `data/processed/` and `figures/`.

In [None]:
import json
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from fuzzywuzzy import fuzz

# Seed the stdlib and NumPy global RNGs with the same fixed value.
np.random.seed(42)
random.seed(42)

# Directory layout, relative to the working directory the notebook runs in.
DATA_DIR = Path('data')
FIG_DIR = Path('figures')
RAW_DIR = DATA_DIR.joinpath('raw')
PROC_DIR = DATA_DIR.joinpath('processed')

# Two input CSVs and the labeled CSV written by the loading cell.
TRACKS_PATH = RAW_DIR.joinpath('tracks.csv')
TOP_PATH = RAW_DIR.joinpath('top100_tracks.csv')
OUTPUT_PATH = PROC_DIR.joinpath('hits_dataset.csv')

# Fail fast, before any loading work, if either input file is absent.
assert TRACKS_PATH.exists(), 'tracks.csv missing in data/raw'
assert TOP_PATH.exists(), 'top100_tracks.csv missing in data/raw'

## Load and harmonize datasets

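Tracks are matched on Spotify IDs when both files contain an `id` column; otherwise the notebook falls back to fuzzy name+artist matching, where both the title and the artist `token_set_ratio` scores must be at least 92. The result is a labeled dataset with `is_hit` = 1 for Top 100 tracks.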

In [None]:
def normalize_text(text: object) -> str:
    """Return ``text`` lower-cased with whitespace trimmed and collapsed.

    Missing values (``None``, ``NaN``, ``pd.NA``) map to the empty string
    rather than the literal words ``'none'`` / ``'nan'``, so a missing name
    or artist is not turned into text that can be compared or saved as if
    it were real data.

    Args:
        text: Value to normalize; non-string values are converted with ``str``.

    Returns:
        The normalized string, or ``''`` when ``text`` is a missing value.
    """
    # is_scalar guards pd.isna, which returns an array for list-like input.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''
    # split() with no argument already drops leading/trailing whitespace
    # and collapses internal runs, so no separate strip() is needed.
    return ' '.join(str(text).lower().split())


# Minimum fuzzywuzzy token_set_ratio score (0-100) that BOTH the title and
# the artist string must reach for a track to be labeled a hit.
FUZZY_MATCH_THRESHOLD = 92

tracks_df = pd.read_csv(TRACKS_PATH)
hits_df = pd.read_csv(TOP_PATH)

# Lower-case the headers so both files are addressed with the same column names.
tracks_df.columns = [c.lower() for c in tracks_df.columns]
hits_df.columns = [c.lower() for c in hits_df.columns]

# Normalize the text columns used for matching, wherever they exist.
for col in ['name', 'artists']:
    if col in tracks_df.columns:
        tracks_df[col] = tracks_df[col].apply(normalize_text)
    if col in hits_df.columns:
        hits_df[col] = hits_df[col].apply(normalize_text)

if 'id' in tracks_df.columns and 'id' in hits_df.columns:
    # Exact matching on the id column when both files provide one.
    hits = set(hits_df['id'].dropna().unique())
    tracks_df['is_hit'] = tracks_df['id'].isin(hits).astype(int)
else:
    # Fuzzy fallback needs both text columns in the chart file; say so
    # explicitly instead of failing with a bare KeyError deep in a loop.
    missing_cols = [c for c in ('name', 'artists') if c not in hits_df.columns]
    if missing_cols:
        raise KeyError(
            f'top100_tracks.csv has no usable id column and lacks {missing_cols}; '
            'cannot label hits'
        )
    hit_pairs = set(zip(hits_df['name'], hits_df['artists']))

    # A text column absent from tracks.csv is compared as empty strings.
    blank = pd.Series('', index=tracks_df.index)
    track_names = tracks_df['name'] if 'name' in tracks_df.columns else blank
    track_artists = tracks_df['artists'] if 'artists' in tracks_df.columns else blank

    # A track is a hit if any chart entry clears the threshold on both fields;
    # any() stops at the first match.
    labels = [
        int(any(
            fuzz.token_set_ratio(name, h_name) >= FUZZY_MATCH_THRESHOLD
            and fuzz.token_set_ratio(artist, h_artist) >= FUZZY_MATCH_THRESHOLD
            for h_name, h_artist in hit_pairs
        ))
        for name, artist in zip(track_names, track_artists)
    ]
    tracks_df['is_hit'] = labels

print('Labeled hits:', tracks_df['is_hit'].sum())
print('Total tracks:', len(tracks_df))
# The processed directory may not exist on a fresh checkout; to_csv does not create it.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
tracks_df.to_csv(OUTPUT_PATH, index=False)
print(f'Saved labeled dataset to {OUTPUT_PATH}')

## Class balance summary

We inspect hit vs. non-hit counts to quantify the class imbalance challenge and guide model choices.

In [None]:
# One row per is_hit value with its count and its share of all tracks.
class_counts = tracks_df['is_hit'].value_counts().rename_axis('is_hit').reset_index(name='count')
class_counts['fraction'] = class_counts['count'] / len(tracks_df)
print(class_counts)

fig, ax = plt.subplots(figsize=(4, 3))
# Passing `palette` without `hue` is deprecated in recent seaborn releases.
# Assigning the x variable to hue keeps the per-bar colors; dodge=False keeps
# each bar centered on its category.
sns.barplot(
    data=class_counts, x='is_hit', y='count',
    hue='is_hit', dodge=False, palette='muted', ax=ax,
)
# Some seaborn versions add a legend for the redundant hue; drop it if present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title('Class Balance (Hit vs Non-hit)')
ax.set_xlabel('is_hit (1 = hit)')
fig.tight_layout()
FIG_DIR.mkdir(exist_ok=True)
fig.savefig(FIG_DIR / 'class_balance.png', dpi=300)
plt.close(fig)

## Feature distributions and correlations

Visualize how key audio features differ between hits and non-hits and inspect correlations to guide feature selection.

In [None]:
audio_features = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo'
]

# Only plot the features that are actually present in the loaded data.
available_features = [f for f in audio_features if f in tracks_df.columns]
print('Using audio features:', available_features)

# Make sure the output directory exists even if this cell is run on its own.
FIG_DIR.mkdir(exist_ok=True)

for feature in available_features:
    fig, ax = plt.subplots(figsize=(4, 3))
    # Passing `palette` without `hue` is deprecated in recent seaborn releases.
    # Assigning the x variable to hue keeps the per-box colors; dodge=False
    # keeps each box centered on its category.
    sns.boxplot(
        data=tracks_df, x='is_hit', y=feature,
        hue='is_hit', dodge=False, palette='muted', ax=ax,
    )
    # Some seaborn versions add a legend for the redundant hue; drop it if present.
    if ax.get_legend() is not None:
        ax.get_legend().remove()
    ax.set_title(f'{feature} by hit status')
    fig.tight_layout()
    fig.savefig(FIG_DIR / f'{feature}_by_hit.png', dpi=300)
    # Close each figure so the loop does not accumulate open figures.
    plt.close(fig)

if available_features:
    # Pairwise correlation between the available audio features.
    corr = tracks_df[available_features].corr()
    fig, ax = plt.subplots(figsize=(6, 5))
    # center=0 puts zero correlation at the midpoint of the diverging colormap.
    sns.heatmap(corr, cmap='coolwarm', center=0, ax=ax)
    ax.set_title('Correlation matrix (audio features)')
    fig.tight_layout()
    fig.savefig(FIG_DIR / 'correlation_matrix.png', dpi=300)
    plt.close(fig)

## Temporal analysis

If a `year` column is present, plot how many tracks appear per year and the hit rate per year, to understand coverage and potential drift.

In [None]:
if 'year' not in tracks_df.columns:
    print('No year column found; skipping temporal analysis.')
else:
    # Per-year number of tracks ('count') and mean of is_hit ('mean', the hit rate).
    yearly = (
        tracks_df
        .groupby('year')['is_hit']
        .agg(['count', 'mean'])
        .reset_index()
    )

    # Figure 1: how many tracks each year contributes.
    fig, ax1 = plt.subplots(figsize=(6, 4))
    sns.barplot(x='year', y='count', data=yearly, color='lightsteelblue', ax=ax1)
    ax1.set(ylabel='Track count', title='Tracks by year')
    ax1.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    fig.savefig(FIG_DIR / 'tracks_by_year.png', dpi=300)
    plt.close(fig)

    # Figure 2: share of tracks labeled as hits in each year.
    fig, ax2 = plt.subplots(figsize=(6, 4))
    sns.lineplot(x='year', y='mean', data=yearly, marker='o', ax=ax2)
    ax2.set(ylabel='Hit rate', title='Hit rate by year')
    ax2.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    fig.savefig(FIG_DIR / 'hit_rate_by_year.png', dpi=300)
    plt.close(fig)