In [1]:
import pandas as pd
import altair as alt

In [2]:
hit_songs = pd.read_csv('../songs_data/hit_songs/spotify_hits_dataset_complete.csv', delimiter='\t')
hit_songs.columns

Index(['song_id', 'song_name', 'artist_id', 'artist_name', 'popularity',
       'explicit', 'song_type', 'track_number', 'num_artists',
       'num_available_markets', 'release_date', 'duration_ms', 'key', 'mode',
       'time_signature', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'valence',
       'tempo'],
      dtype='object')

In [3]:
hit_songs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13880 entries, 0 to 13879
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   song_id                13880 non-null  object 
 1   song_name              13855 non-null  object 
 2   artist_id              13880 non-null  object 
 3   artist_name            13880 non-null  object 
 4   popularity             13880 non-null  int64  
 5   explicit               13880 non-null  bool   
 6   song_type              13880 non-null  object 
 7   track_number           13880 non-null  int64  
 8   num_artists            13880 non-null  int64  
 9   num_available_markets  13880 non-null  int64  
 10  release_date           13880 non-null  object 
 11  duration_ms            13880 non-null  int64  
 12  key                    13880 non-null  int64  
 13  mode                   13880 non-null  int64  
 14  time_signature         13880 non-null  int64  
 15  ac

In [4]:
hit_songs['release_date'].describe

<bound method NDFrame.describe of 0        2018-08-03
1        2019-06-21
2        2017-05-12
3        2011-04-08
4        2017-10-20
            ...    
13875    2019-08-23
13876    2017-05-19
13877    2018-10-15
13878    2017-04-28
13879    2019-08-30
Name: release_date, Length: 13880, dtype: object>

In [5]:
hit_songs['release_date'] = pd.to_datetime(hit_songs['release_date'], errors='coerce')
hit_songs['year_release'] = hit_songs['release_date'].dt.year.where(hit_songs['release_date'].notnull(), hit_songs['release_date'].astype(str).str[:4])
hit_songs['year_release'] =  pd.to_numeric(hit_songs['year_release'], errors='coerce', downcast='integer')
hit_songs = hit_songs.dropna(subset=['year_release'])
hit_songs['year_release'] = hit_songs['year_release'].astype(int)
hit_songs.columns


Index(['song_id', 'song_name', 'artist_id', 'artist_name', 'popularity',
       'explicit', 'song_type', 'track_number', 'num_artists',
       'num_available_markets', 'release_date', 'duration_ms', 'key', 'mode',
       'time_signature', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'valence',
       'tempo', 'year_release'],
      dtype='object')

In [31]:
hit_songs['interval'] = pd.cut(hit_songs['year_release'], bins=range(1940, 2021, 5), right=False, labels=[f'{i}-{i+4}' for i in range(1940, 2020, 5)])
hit_songs['interval'].value_counts()


interval
2015-2019    12983
2010-2014      254
2005-2009      124
2000-2004       91
1990-1994       61
1995-1999       56
1985-1989       37
1975-1979       29
1980-1984       26
1970-1974       22
1965-1969       14
1960-1964        9
1955-1959        8
1945-1949        4
1940-1944        3
1950-1954        0
Name: count, dtype: int64

In [57]:
hit_songs_2015_2019 = hit_songs[hit_songs['interval'] == '2015-2019']
hit_songs_not_2015_2019 = hit_songs[hit_songs['interval'] != '2015-2019']

hit_songs_2015_2019 = hit_songs_2015_2019.sample(n=250, random_state=42)

sample = pd.concat([hit_songs_2015_2019, hit_songs_not_2015_2019])
sample['interval'].value_counts()

interval
2010-2014    254
2015-2019    250
2005-2009    124
2000-2004     91
1990-1994     61
1995-1999     56
1985-1989     37
1975-1979     29
1980-1984     26
1970-1974     22
1965-1969     14
1960-1964      9
1955-1959      8
1945-1949      4
1940-1944      3
1950-1954      0
Name: count, dtype: int64

In [58]:
colunas_desejadas = ['year_release', 'song_name', 'artist_name', 'popularity', 'danceability', 'energy', 'duration_ms', 'valence', 'interval']
data = sample[colunas_desejadas]

In [61]:
data_sample = data

dropdown_xcol = alt.binding_select(
    options=['danceability', 'energy', 'duration_ms', 'valence'],
    name='X-axis column '
)
xcol_param = alt.param(
    value='danceability',
    bind=dropdown_xcol
)

list_years = sorted(data_sample['interval'].unique().tolist())
input_dropdown = alt.binding_select(options=list_years, name='Ano de lançamento: ')
selection = alt.selection_single(fields=['interval'], bind=input_dropdown)
color = alt.condition(
    selection,
    alt.Color('interval:N', title='Ano de Lançamento', scale=alt.Scale(domain=list_years,  range=['red', 'green', 'blue', 'purple']), legend=None),
    alt.value('lightgray')
)

points = alt.Chart(data_sample, width=400, height=400).mark_circle().encode(
    alt.X('x', type='quantitative'),
    alt.Y('popularity', type='quantitative', title='Popularidade da Música'),
    color=color,
    tooltip = ['year_release', 'song_name']
).transform_calculate(
    x=f'datum[{xcol_param.name}]'
).add_params(
    xcol_param,
).add_selection(
    selection
)

points 

