In [8]:
import json
import pandas as pd
import numpy as np
import datetime
from datetime import datetime as dt
import altair as alt
import dash
from dash import dcc, html, Input, Output, ClientsideFunction
alt.data_transformers.enable("vegafusion")
app = dash.Dash(
    __name__,
    meta_tags=[{"name": "viewport", "content": "width=device-width, initial-scale=1"}],
    external_stylesheets=['https://codepen.io/chriddyp/pen/bWLwgP.css']
)
app.title = "Spotify Song Popularity"

server = app.server
app.config.suppress_callback_exceptions = True


# Read data
df = pd.read_csv('../data/processed/spotify_songs_processed.csv', index_col=0)
object_columns = df.select_dtypes(include=['object']).columns
for column in object_columns:
    df[column] = df[column].astype('string')
features=['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms']
df['duration_min'] = df['duration_ms'] / 60000
genre_list = df['playlist_genre'].unique().tolist()
subgenre_list = df['playlist_subgenre'].unique().tolist()
artist_list = df['track_artist'].unique().tolist()

# Format release date
def parse_date(x):
    try:
        if len(x)==10:
            return dt.strptime(x, "%Y-%m-%d")
        elif len(x)==7:
            return dt.strptime(x, "%Y-%m")
        elif len(x)==4:
            return dt.strptime(x, "%Y")
    except ValueError:
        return None

df["track_album_release_date"] = df["track_album_release_date"].apply(parse_date)
# String -> Datetime

In [21]:
df.info()

df = pd.read_csv('../data/processed/spotify_songs_processed.csv', index_col=0)
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 32833 entries, 0 to 32832
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   track_id                  32833 non-null  object 
 1   track_name                32828 non-null  object 
 2   track_artist              32828 non-null  object 
 3   track_popularity          32833 non-null  int64  
 4   track_album_id            32833 non-null  object 
 5   track_album_name          32828 non-null  object 
 6   track_album_release_date  32833 non-null  object 
 7   playlist_name             32833 non-null  object 
 8   playlist_id               32833 non-null  object 
 9   playlist_genre            32833 non-null  object 
 10  playlist_subgenre         32833 non-null  object 
 11  danceability              32833 non-null  float64
 12  energy                    32833 non-null  float64
 13  key                       32833 non-null  int64  
 14  loudness   

track_id                    0
track_name                  5
track_artist                5
track_popularity            0
track_album_id              0
track_album_name            5
track_album_release_date    0
playlist_name               0
playlist_id                 0
playlist_genre              0
playlist_subgenre           0
danceability                0
energy                      0
key                         0
loudness                    0
mode                        0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
duration_ms                 0
nominal_popularity          0
dtype: int64

In [9]:
selected_features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_min']

In [18]:
def create_feature_distribution_charts(df, selected_features):
    charts = []
    
    # Determine the layout based on the number of selected features
    layout_columns = 2 if len(selected_features) > 1 else 1
    
    for feature in selected_features:
        # Check if the feature is 'key' or 'mode' for categorical encoding, else treat as numerical
        if feature in ['key', 'mode']:  # Categorical features
            chart = alt.Chart(df).mark_bar().encode(
                alt.X(f"{feature}:N", sort='-y'),
                alt.Y('count()', stack=None),
                alt.Color('nominal_popularity:N', legend=alt.Legend(title="Popularity"), scale=alt.Scale(scheme='set2'))
            ).properties(
                title=f'Distribution of {feature.capitalize()}'
            )
        else:  # Numerical features
            chart = alt.Chart(df).mark_bar().encode(
                alt.X(f"{feature}:Q", bin=True),
                alt.Y('count()', stack=None),
                alt.Color('nominal_popularity:N', legend=alt.Legend(title="Popularity"), scale=alt.Scale(scheme='set2'))
            ).properties(
                title=f'Distribution of {feature.capitalize()}'
            )
        
        charts.append(chart)
    
    # Combine all charts into a single chart, adjusting the layout based on the number of charts
    if len(charts) > 1:
        combined_chart = alt.hconcat(*[alt.vconcat(*charts[i::layout_columns]).resolve_scale(y='independent') for i in range(layout_columns)]).configure_view(
            strokeWidth=0
        ).properties(
            title='Distribution of Selected Features'
        ).configure_range(category={'scheme': 'set2'})
    else:
        combined_chart = charts[0].configure_range(category={'scheme': 'set2'})  # If only one chart, just use it directly
    
    return combined_chart.to_html()




In [17]:
# Example usage
selected_features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_min']
one_feature = ['danceability']
# Assuming df is your dataframe
chart_all = create_feature_distribution_charts(df, selected_features)
chart_one = create_feature_distribution_charts(df, one_feature)
chart_all.display()
chart_one.display()

In [None]:
chart = alt.Chart(df).mark_bar().encode(
    x='nominal_popularity',
    y='count()',
    color='nominal_popularity'
).properties(
    title='Population Level Distribution for All Songs'
)
chart