In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from spotify_cleaner import clean_data

In [None]:

# Read the raw dataset; the first CSV column is used as the row index.
# NOTE: converting track_genre to the 'category' dtype was tried by the
# original author (who counted 114 genres) and is currently disabled, so the
# column keeps whatever dtype read_csv infers.
df = pd.read_csv('data/dataset.csv', index_col=0)

# Apply the project's "basic" cleaning function (per the original note it
# removes audiobooks and rows with NaN in the target variables).
df = clean_data(df)

# General overview of the cleaned frame.
# df.info() prints its report itself and returns None, so it must not be
# wrapped in display() -- that would render an extra "None" in the output.
df.info()
display(df.describe())
display(df.loc[:, 'track_genre'].value_counts(sort=True))
df.head()

In [None]:
# Histogram grid for the frame's columns, followed by every row that still
# holds at least one missing value (shown as the cell output).
df.hist(figsize=(12, 8))
df.loc[df.isna().any(axis='columns')]

In [None]:
# General correlation: annotated heatmap of the pairwise correlations between
# all numeric columns.
numeric_corr = df.select_dtypes(include=['number']).corr()

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(numeric_corr, annot=True, ax=ax)
ax.set_title('Feature Correlations')

# Create the output folder on demand so that a fresh checkout (where
# 'visuals/' may not exist yet) does not fail inside savefig.
heatmap_path = Path('visuals') / 'EDA number heatmap.png'
heatmap_path.parent.mkdir(parents=True, exist_ok=True)

# Save before plt.show() so the file is written from the fully drawn figure.
fig.savefig(heatmap_path, dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Correlation of every numeric column with popularity, strongest positive
# relationship first.
(
    df.select_dtypes(include='number')
    .corr()
    .loc[:, 'popularity']
    .sort_values(ascending=False)
)

In [None]:
# Danceability relationships: pairwise scatter grid (KDE on the diagonal) of
# danceability against the other selected features and popularity.

dance_features = [
    'danceability', 'energy', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'valence', 'tempo', 'popularity',
]

# Very low alpha so heavily overlapping points show up as density.
dance_plot = sns.pairplot(df.loc[:, dance_features], diag_kind='kde', plot_kws=dict(alpha=0.01))

# Create the output folder on demand so that a fresh checkout (where
# 'visuals/' may not exist yet) does not fail inside savefig.
pairplot_path = Path('visuals') / 'EDA danceability features.png'
pairplot_path.parent.mkdir(parents=True, exist_ok=True)

# Save before plt.show(): depending on the backend, show() may close the
# figure, so writing the file first is the order that works everywhere.
dance_plot.savefig(pairplot_path)
plt.show()

In [None]:
# Basic histograms for danceability and popularity.
# A new, explicitly sized figure is created for each histogram: plt.show()
# finishes the current figure, so a single plt.figure(figsize=...) call up
# front would only size the first plot and leave the second at the default.
histogram_specs = (
    ('danceability', 'Distribution of Danceability'),
    ('popularity', 'Distribution of Popularity'),
)

for column, title in histogram_specs:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.histplot(df.loc[:, column], bins=30, ax=ax)
    ax.set_title(title)
    plt.show()

In [None]:
# Number of distinct values in each non-numeric column (common-sense check
# before looking at target variation).
categorical_cols = df.select_dtypes(exclude=['number']).columns
print(df[categorical_cols].nunique())

# Duplicated tracks check: keep=False marks every row whose track_id occurs
# more than once, so the printed figure is a row count, not a count of ids.
duplicated_tracks = df[df.duplicated(subset=['track_id'], keep=False)]
display(duplicated_tracks.sort_values('track_id').head(15))
print('Duplicated track_ids:', duplicated_tracks.shape[0])

# Rows per track_id. The plot treats this as "genres per track", i.e. it
# assumes each repeated row carries a different genre -- TODO confirm.
genre_counts = df.loc[:, 'track_id'].value_counts()

# One bin per integer count, centred on that integer. Using
# range(1, max + 1) as bin edges would merge the two largest counts into one
# bar (the last histogram bin is closed on both sides) and would leave only a
# single edge -- no valid bin -- when no track_id repeats.
max_count = int(genre_counts.max())
bin_edges = [edge - 0.5 for edge in range(1, max_count + 2)]

plt.figure(figsize=(8, 6))
plt.hist(genre_counts, bins=bin_edges)
plt.xticks(range(1, max_count + 1))
plt.xlabel('Number of Genres per track_id')
plt.ylabel('Count of Tracks')
plt.title('Genre Number Distribution')
plt.show()

In [None]:
# Check whether multi-genre duplicates skew the target variables.

# Distinct genres per track. nunique() is used instead of count(): count()
# tallies rows, so a track that appears twice under the same genre would be
# reported as having two genres.
track_genre_counts = df.groupby('track_id')['track_genre'].nunique().reset_index()
track_genre_counts.columns = ['track_id', 'num_genres']

# One row per track_id (first occurrence kept) with its genre count attached.
# validate='one_to_one' makes the merge fail loudly if either side ever
# contains a repeated track_id.
df_unique = df.drop_duplicates(subset=['track_id']).merge(
    track_genre_counts, on='track_id', validate='one_to_one'
)

# Does the number of genres impact popularity/danceability?
# One boxplot panel per target, side by side.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
for axis, target in zip(ax, ('popularity', 'danceability')):
    label = target.capitalize()
    sns.boxplot(x='num_genres', y=target, data=df_unique, ax=axis)
    axis.set_xlabel('Number of Genres')
    axis.set_ylabel(label)
    axis.set_title('{} vs Number of Genres'.format(label))

# Spearman rank correlation: a linear relationship is unlikely here.
corr_popularity = df_unique['num_genres'].corr(df_unique['popularity'], method='spearman')
corr_danceability = df_unique['num_genres'].corr(df_unique['danceability'], method='spearman')

print('Correlation: Popularity vs Genre Count: {:.3f}'.format(corr_popularity))
print('Correlation: Danceability vs Genre Count: {:.3f}'.format(corr_danceability))

In [None]:
# Genre distribution: number of rows per genre, most frequent first.
df['track_genre'].value_counts()


In [None]:
# Rows whose danceability is exactly zero.
dance_mask = df['danceability'].eq(0)
df[dance_mask]

In [None]:
# Rows labelled with the 'sleep' genre.
genre_mask = df['track_genre'].eq('sleep')
df[genre_mask]

In [None]:
# All distinct genre labels on one comma-separated line, in order of first
# appearance in the frame.
print(', '.join(df['track_genre'].drop_duplicates()))