# Spotify Dataset - Exploratory Data Analysis

This notebook explores the Spotify dataset (1921-2020) containing 169k+ songs.

## Objectives
1. Load and inspect the dataset
2. Understand data quality and missing values
3. Analyze feature distributions
4. Explore temporal trends in music
5. Investigate feature correlations
6. Identify patterns and insights

In [None]:
# Setup
import sys
from pathlib import Path

# Add src to path so the project-local package imported below can be found.
# Assumes the kernel's working directory is a direct child of the project
# root (e.g. <root>/notebooks) — TODO confirm.
project_root = Path.cwd().parent
src_dir = str(project_root / 'src')
# Guard the insert so re-running this cell does not stack duplicate entries
# on sys.path.
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from featurebeats.utils.data_loader import SpotifyDataLoader, quick_load
from featurebeats.utils.visualization import (
    plot_feature_distributions,
    plot_temporal_trends,
    plot_correlation_matrix,
    plot_top_artists,
    plot_key_mode_distribution,
    plot_popularity_vs_features
)
from featurebeats.utils.config import get_config

# Settings
%matplotlib inline
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("Setup complete!")

## 1. Load Data

In [None]:
# Initialize loader (constructed without arguments; where it reads the data
# from is decided inside SpotifyDataLoader and is not visible in this notebook)
loader = SpotifyDataLoader()

# Load data into `df`, the raw frame reused by the inspection and quality
# checks in the following cells
print("Loading dataset...")
df = loader.load_csv()

# Quick sanity check of the dimensions and the available column names
print(f"\nDataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

## 2. Initial Inspection

In [None]:
# First few rows (head() shows 5 by default) to eyeball the raw values;
# left as the cell's last expression so the notebook renders it as a table
df.head()

In [None]:
# Data types and info: per-column dtype and non-null count, plus memory usage
# (assumes `df` is a pandas DataFrame, as its use elsewhere suggests)
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); with default
# arguments describe() summarises the numeric columns only
df.describe()

## 3. Data Quality Check

In [None]:
# Missing values: per-column null counts and their share of all rows
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)

# Keep only the columns that actually have gaps, worst offenders first
missing_df = (
    pd.DataFrame({'Missing Count': missing, 'Percentage': missing_pct})
    .loc[lambda frame: frame['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)

if missing_df.empty:
    print("No missing values found!")
else:
    print("Columns with missing values:")
    print(missing_df)

In [None]:
# Duplicates: rows whose track ID has already appeared earlier in the frame
repeated_id = df.duplicated(subset='id')
duplicates = repeated_id.sum()
print(f"Duplicate tracks (by ID): {duplicates}")

if duplicates:
    print("\nExample duplicates:")
    # keep=False flags every copy (including the first), so pairs show up together
    all_copies = df.duplicated(subset='id', keep=False)
    print(df.loc[all_copies, ['id', 'name', 'artists']].head(10))

## 4. Clean Data

In [None]:
# Clean the dataset
# NOTE(review): the cleaning rules live in SpotifyDataLoader.clean_data and are
# not visible here; whether `df` itself is modified in place should be confirmed.
df_clean = loader.clean_data(df)

# Row counts before and after show how many rows the cleaning step dropped
print(f"Original size: {len(df)}")
print(f"Cleaned size: {len(df_clean)}")
print(f"Removed: {len(df) - len(df_clean)} rows")

## 5. Dataset Statistics

In [None]:
# Get comprehensive statistics
# `stats` is whatever loader.get_statistics returns; only the attributes read
# below are relied on in this notebook.
stats = loader.get_statistics(df_clean)

print("Dataset Statistics:")
print(f"Total Tracks: {stats.total_tracks:,}")
print(f"Unique Artists: {stats.unique_artists:,}")
# year_range is indexed as a (first, last) pair
print(f"Year Range: {stats.year_range[0]} - {stats.year_range[1]}")
# 60000 ms per minute: convert the millisecond average to minutes
print(f"Average Duration: {stats.avg_duration_ms/60000:.2f} minutes")
print(f"Average Popularity: {stats.avg_popularity:.1f}/100")
print(f"Average Tempo: {stats.avg_tempo:.1f} BPM")
# NOTE(review): the key is printed as stored — presumably an integer pitch
# class rather than a note name; confirm.
print(f"Most Common Key: {stats.most_common_key}")
# Mode value 1 is reported as Major; any other value is reported as Minor
print(f"Most Common Mode: {'Major' if stats.most_common_mode == 1 else 'Minor'}")
print(f"Explicit Content: {stats.explicit_percentage:.1f}%")

## 6. Feature Distributions

In [None]:
# Plot audio feature distributions via the project helper
fig = plot_feature_distributions(df_clean)
# plt.suptitle targets the current figure (assumes the helper leaves its figure
# current — TODO confirm); y=1.02 places the title just above the figure's top edge
plt.suptitle('Audio Feature Distributions', fontsize=16, y=1.02)
plt.show()

## 7. Temporal Analysis

In [None]:
# Tracks per year: count rows for each release year, ordered chronologically
tracks_per_year = df_clean['year'].value_counts().sort_index()

# Line chart of catalogue size over time
fig, ax = plt.subplots(figsize=(14, 5))
tracks_per_year.plot.line(ax=ax, linewidth=2)
ax.set_title('Number of Tracks per Year', fontsize=14)
ax.set(xlabel='Year', ylabel='Number of Tracks')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Decade-level trends for energy, danceability and valence (happiness).
# One data-driven loop replaces three copy-pasted cells that differed only in
# the feature name and the figure title; add a feature by adding a dict entry.
TREND_TITLES = {
    'energy': 'Energy Trends Over Time',
    'danceability': 'Danceability Trends Over Time',
    'valence': 'Valence (Musical Happiness) Trends Over Time',
}

for trend_feature, trend_title in TREND_TITLES.items():
    fig = plot_temporal_trends(df_clean, feature=trend_feature, group_by='decade')
    # plt.suptitle targets the current figure; assumes the helper leaves the
    # figure it drew as the current one — TODO confirm.
    plt.suptitle(trend_title, fontsize=16, y=1.02)
    # Show inside the loop so each feature is rendered as its own figure.
    plt.show()

## 8. Correlation Analysis

In [None]:
# Feature correlation matrix drawn by the project helper (which columns it
# includes is decided inside plot_correlation_matrix, not in this notebook)
fig = plot_correlation_matrix(df_clean)
plt.show()

In [None]:
# Popularity correlations
audio_features = ['acousticness', 'danceability', 'energy', 'instrumentalness',
                  'liveness', 'speechiness', 'valence', 'tempo', 'loudness']

# Correlation of each audio feature with popularity (DataFrame.corr default
# method), strongest positive first; the trivial self-correlation is dropped.
pop_corr = (
    df_clean[audio_features + ['popularity']]
    .corr()['popularity']
    .drop('popularity')
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
pop_corr.plot(kind='barh', ax=ax, color='steelblue')
ax.set_title('Feature Correlation with Popularity', fontsize=14)
ax.set_xlabel('Correlation Coefficient')
# Zero reference line separates positive from negative correlations
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
plt.tight_layout()
plt.show()

# Split by sign before listing. With only nine features, head() and tail()
# (five rows each) always overlap and could print negative coefficients under
# the "positive" heading or vice versa.
print("\nTop positive correlations with popularity:")
print(pop_corr[pop_corr > 0].head())
print("\nTop negative correlations with popularity:")
# Re-sort ascending so the most negative coefficient is listed first
print(pop_corr[pop_corr < 0].sort_values().head())

## 9. Musical Key and Mode Analysis

In [None]:
# Key and mode distributions, drawn by the project helper from the cleaned frame
fig = plot_key_mode_distribution(df_clean)
plt.show()

## 10. Artist Analysis

In [None]:
# Top artists by track count; top_n=20 limits the chart to twenty artists
# NOTE(review): how multi-artist tracks in `artists` are counted is decided
# inside plot_top_artists — confirm.
fig = plot_top_artists(df_clean, top_n=20)
plt.show()

## 11. Popularity Analysis

In [None]:
# Popularity vs features, drawn by the project helper
fig = plot_popularity_vs_features(df_clean)
# plt.suptitle targets the current figure (assumes the helper leaves its figure
# current — TODO confirm); y=1.02 lifts the title just above the figure's top edge
plt.suptitle('Popularity vs Audio Features', fontsize=16, y=1.02)
plt.show()

In [None]:
# Popularity by decade: one box per decade on a shared axis
fig, ax = plt.subplots(figsize=(12, 6))
# NOTE(review): relies on a 'decade' column in df_clean — presumably added
# during cleaning; confirm.
df_clean.boxplot(column='popularity', by='decade', ax=ax)

ax.set_title('Popularity Distribution by Decade')
ax.set(xlabel='Decade', ylabel='Popularity')
fig.suptitle('')  # blank out the automatic "grouped by" super-title
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

## 12. Insights and Findings

### Key Observations

1. **Temporal Trends**:
   - **TODO:** Document trends in energy, danceability, etc. over decades.
   
2. **Feature Correlations**:
   - **TODO:** Note the strongest positive/negative correlations.
   
3. **Popularity Patterns**:
   - **TODO:** Identify features that correlate with popularity.
   
4. **Musical Characteristics**:
   - **TODO:** Note the most common keys, modes, etc.

### Next Steps

1. Feature engineering for ML models
2. Build popularity prediction model
3. Cluster songs by audio features
4. Analyze artist evolution over time
5. Build recommendation system

## 13. Export Cleaned Data

In [None]:
# Export cleaned dataset
# NOTE(review): `df_clean` is not passed in, so this presumably relies on the
# loader holding (or recomputing) the cleaned frame internally — confirm that
# the exported file matches `df_clean` from the cleaning cell.
output_path = loader.export_cleaned_data()
print(f"Cleaned data exported to: {output_path}")