In [1]:
import pandas as pd
import altair as alt

In [4]:
hit_songs = pd.read_csv('../songs_data/hit_songs/spotify_hits_dataset_complete.csv', delimiter='\t')
hit_songs.columns

Index(['song_id', 'song_name', 'artist_id', 'artist_name', 'popularity',
       'explicit', 'song_type', 'track_number', 'num_artists',
       'num_available_markets', 'release_date', 'duration_ms', 'key', 'mode',
       'time_signature', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'valence',
       'tempo'],
      dtype='object')

In [7]:
hit_songs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13880 entries, 0 to 13879
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   song_id                13880 non-null  object 
 1   song_name              13855 non-null  object 
 2   artist_id              13880 non-null  object 
 3   artist_name            13880 non-null  object 
 4   popularity             13880 non-null  int64  
 5   explicit               13880 non-null  bool   
 6   song_type              13880 non-null  object 
 7   track_number           13880 non-null  int64  
 8   num_artists            13880 non-null  int64  
 9   num_available_markets  13880 non-null  int64  
 10  release_date           13880 non-null  object 
 11  duration_ms            13880 non-null  int64  
 12  key                    13880 non-null  int64  
 13  mode                   13880 non-null  int64  
 14  time_signature         13880 non-null  int64  
 15  ac

In [18]:
hit_songs['release_date'].describe

<bound method NDFrame.describe of 0        2018-08-03
1        2019-06-21
2        2017-05-12
3        2011-04-08
4        2017-10-20
            ...    
13875    2019-08-23
13876    2017-05-19
13877    2018-10-15
13878    2017-04-28
13879    2019-08-30
Name: release_date, Length: 13880, dtype: object>

In [45]:
hit_songs['release_date'] = pd.to_datetime(hit_songs['release_date'], errors='coerce')
hit_songs['year_release'] = hit_songs['release_date'].dt.year.where(hit_songs['release_date'].notnull(), hit_songs['release_date'].astype(str).str[:4])
hit_songs['year_release'] =  pd.to_numeric(hit_songs['year_release'], errors='coerce', downcast='integer')
hit_songs = hit_songs.dropna(subset=['year_release'])
hit_songs['year_release'] = hit_songs['year_release'].astype(int)
hit_songs.columns


Index(['song_id', 'song_name', 'artist_id', 'artist_name', 'popularity',
       'explicit', 'song_type', 'track_number', 'num_artists',
       'num_available_markets', 'release_date', 'duration_ms', 'key', 'mode',
       'time_signature', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'valence',
       'tempo', 'year', 'year_release'],
      dtype='object')

In [61]:
colunas_desejadas = ['year_release', 'song_name', 'artist_name', 'popularity', 'danceability', 'energy', 'duration_ms', 'valence']
data = hit_songs[colunas_desejadas]

In [64]:
data = hit_songs.copy().head(5000)

dropdown_xcol = alt.binding_select(
    options=['danceability', 'energy', 'duration_ms', 'valence'],
    name='X-axis column '
)
xcol_param = alt.param(
    value='danceability',
    bind=dropdown_xcol
)


list_years = sorted(data['year_release'].unique().tolist())
dropdown_year_release = alt.binding_select(
    options= list_years,
    name='Ano de lançamento: '
)
year_release_param = alt.param(
    value=2017,
    bind=dropdown_year_release
)

points = alt.Chart(data, width=500, height=500).mark_circle().encode(
    alt.X('x', type='quantitative'),
    alt.Y('popularity', type='quantitative', title='Popularidade da Música'),
    color=alt.Color('year_release:N', title='Ano de Lançamento').scale(domain=list_years)
).transform_calculate(
    x=f'datum[{xcol_param.name}]'
).add_params(
    xcol_param,
    year_release_param
).transform_filter(
     alt.FieldEqualPredicate(field='year', equal=year_release_param)
)

points 