In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from spotify_cleaner import clean_data

# Apply seaborn's 'whitegrid' style once, up front, so it is in effect for the figures drawn in later cells.
sns.set_style('whitegrid')

## Introduction


## Basics of the Dataset
* **Important note:** apart from the objective metadata (album names, artist names, tempo, key, etc.), most of the features are generated by the Spotify software. Popularity is based directly on a concrete value (the number and recency of track plays), whereas features like danceability and valence are defined by a combination of (often opaque) criteria.

Initial problems:

* track duplication
* genre "distribution" (pre-clean: exactly 1000 tracks/genre)

In [None]:
# Read the raw track table (the first CSV column becomes the index) and pass it
# through clean_data from spotify_cleaner; the cleaning steps themselves live in
# that module and are not visible in this notebook.
df = pd.read_csv('data/dataset.csv', index_col=0)
df = clean_data(df)

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare: wrapping it in print() would add a stray "None" line to the output.
df.info()
display(df.describe())
df.head()

## General Distributions and Trends
**TODO:** add commentary on the general distributions and trends shown below.

In [None]:
# One 30-bin histogram per column that DataFrame.hist can plot, laid out on a
# single 14x10 figure with a shared headline.
df.hist(figsize=(14, 10), bins=30)
plt.suptitle('Feature Distributions', fontsize=20, fontweight='bold')

# Re-pack the grid so per-panel titles and tick labels do not overlap their
# neighbours. Ending the cell on this call (it returns None) also keeps the
# Text(...) repr of the suptitle out of the cell output.
plt.tight_layout()


## Feature Correlations and Interactions

In [None]:
# Correlation matrix of the numeric columns (DataFrame.corr with its default
# settings), drawn as an annotated heatmap on a 12x6 figure.
numeric_corr = df.select_dtypes(include=['number']).corr()

plt.figure(figsize=(12, 6))
corr_ax = sns.heatmap(numeric_corr, annot=True)  # heatmap returns the Axes it drew on
corr_ax.set_title('Feature Correlations', fontsize=20, fontweight='bold')

In [None]:
# Full feature scatterplot overview: every pairwise scatter of the selected
# columns, with KDE curves on the diagonal. The very low alpha lets dense
# regions show up where many points overlap.

scatter_features = [
    'danceability', 'popularity', 'energy', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness', 'valence', 'tempo',
]
sns.pairplot(
    df[scatter_features],
    diag_kind='kde',
    plot_kws={'alpha': 0.01},
)

In [None]:
# Scatter grid: one row per target, one column per related feature.
related_features = ['tempo', 'energy', 'valence']
targets = ['danceability', 'popularity']
fig, ax = plt.subplots(
    nrows=len(targets),
    ncols=len(related_features),
    figsize=(15, 10),
)

# `ax` is a 2-D array of Axes; walk it row by row alongside the matching names.
for row_panels, target in zip(ax, targets):
    for panel, feature in zip(row_panels, related_features):
        sns.scatterplot(data=df, x=feature, y=target, alpha=0.1, ax=panel)
        panel.set_title(f'{target.capitalize()} vs {feature.capitalize()}')

plt.tight_layout()

## Categorical Feature Details

In [None]:
# Box-plot grid: distribution of each target (rows) split by each categorical
# feature (columns).
categorical_features = ['key', 'mode', 'explicit']
targets = ['danceability', 'popularity']
fig, axes = plt.subplots(
    nrows=len(targets),
    ncols=len(categorical_features),
    figsize=(15, 10),
)

# `axes` is a 2-D array of Axes; pair each row with its target and each cell
# with its categorical feature.
for target, row_panels in zip(targets, axes):
    for feature, panel in zip(categorical_features, row_panels):
        sns.boxplot(data=df, x=feature, y=target, ax=panel)
        panel.set(title=f'{target.capitalize()} vs {feature.capitalize()}')

plt.tight_layout()