In [1]:
import ast

import altair as alt
import pandas as pd

In [2]:
from datetime import datetime, timedelta
 
def descobre_mes(ano, num_semana):
    """Return the month name of the Monday that opens a given chart week.

    The week number is interpreted with ``%W`` semantics: weeks start on
    Monday and the days before the first Monday of the year form week 0.
    This is not ISO-8601 week numbering.

    Parameters
    ----------
    ano : int, float or str
        Year. Coerced with ``int`` so that float values such as ``2017.0``
        (what pandas yields for a float column) are accepted.
    num_semana : int, float or str
        Week number, coerced the same way. A fractional float is truncated.

    Returns
    -------
    str
        Month name produced by ``strftime("%B")``; it follows the active
        locale.

    Raises
    ------
    ValueError
        If a value cannot be converted to an integer (e.g. NaN) or the
        resulting week number is rejected by ``strptime``.
    """
    # str(35.0) would give "35.0", which "%W" cannot parse; normalise first.
    ano_int = int(ano)
    semana_int = int(num_semana)
    # The trailing "-1" pins the weekday to Monday ("%w" == 1).
    inicio_semana = datetime.strptime(f"{ano_int}-W{semana_int}-1", "%Y-W%W-%w")
    return inicio_semana.strftime("%B")

In [3]:
def extrair_elementos(lista_str):
    """Turn the textual form of a list into a comma-separated string.

    Parameters
    ----------
    lista_str : str
        Python literal of a list or tuple, e.g. ``"['a', 'b']"``.

    Returns
    -------
    str or None
        The items joined with ``', '``. ``None`` when the input cannot be
        parsed as a Python literal or the literal is not a list/tuple.
    """
    try:
        valor = ast.literal_eval(lista_str)
    except (SyntaxError, ValueError, TypeError):
        # Not a Python literal at all (plain text, None, NaN, ...).
        return None
    # Joining a bare string literal would split it into single characters,
    # so only real sequences are accepted.
    if not isinstance(valor, (list, tuple)):
        return None
    # str() keeps non-string items (e.g. numeric ids) from raising TypeError.
    return ', '.join(str(item) for item in valor)

In [4]:
def remover_duplicatas(lista_str):
    """Drop repeated items from an iterable, keeping first-seen order.

    Parameters
    ----------
    lista_str : iterable
        Items to de-duplicate (despite the name, a list is expected).

    Returns
    -------
    list or None
        The distinct items in order of first appearance, or ``None`` when
        the input is not iterable (e.g. ``None`` or NaN).
    """
    try:
        itens = list(lista_str)
    except TypeError:
        # Non-iterable input.
        return None
    try:
        # dict preserves insertion order, so this de-duplicates in one pass.
        return list(dict.fromkeys(itens))
    except TypeError:
        # Unhashable items (e.g. nested lists): compare by equality instead.
        unicos = []
        for item in itens:
            if item not in unicos:
                unicos.append(item)
        return unicos

In [5]:
import ast

# Hit songs: tab-separated file, one row per song.
# NOTE(review): DataFrame.explode only splits list-like cells; if these two
# columns arrive from the CSV as plain strings this step leaves the rows
# unchanged -- confirm against the data.
hit_songs = (
    pd.read_csv(
        '../songs_data/hit_songs/spotify_hits_dataset_complete.csv',
        sep='\t',
    )
    .explode(['artist_id', 'artist_name'])
    .reset_index(drop=True)
)
# Rewrite the stringified id list as a comma-separated string.
hit_songs = hit_songs.assign(
    artist_id=lambda df: df['artist_id'].apply(extrair_elementos)
)

# Artist metadata (tab-separated) and the weekly chart entries.
artists = pd.read_csv(
    '../songs_data/artists/spotify_artists_info_complete_reduced_genres.csv',
    sep='\t',
)
total_charts = pd.read_csv('../my_data/charts/total_charts.csv')

# Inner joins: songs -> artists, then chart entries -> songs.
songs_and_artists = hit_songs.merge(artists, on='artist_id')
charts = total_charts.merge(songs_and_artists, on='song_id')


In [6]:
# Adjusting Japan: shift the 2017 Japan week numbers by a fixed offset.
# After the shift the weeks span 35..52 (see the describe() check in the
# following cell), i.e. the raw values are at most 52 - 34 = 18.
# Restricting the update to that raw range makes the cell idempotent:
# re-running it no longer adds the offset a second time.
JAPAN_2017_WEEK_OFFSET = 34
JAPAN_2017_MAX_RAW_WEEK = 52 - JAPAN_2017_WEEK_OFFSET

japao_2017 = (charts['country'] == 'japan') & (charts['year'] == 2017)
ainda_nao_ajustado = charts['week'] <= JAPAN_2017_MAX_RAW_WEEK
charts.loc[japao_2017 & ainda_nao_ajustado, 'week'] += JAPAN_2017_WEEK_OFFSET

In [7]:
# Sanity check: distribution of the week numbers for Japan in 2017.
japan_2017_mask = (charts['country'] == 'japan') & (charts['year'] == 2017)
charts.loc[japan_2017_mask, 'week'].describe()

count    2453.000000
mean       43.642886
std         5.183625
min        35.000000
25%        39.000000
50%        44.000000
75%        48.000000
max        52.000000
Name: week, dtype: float64

In [8]:
# Label each chart row with the month of its (year, week) pair.
# Iterating over just the two needed columns avoids the per-row Series that
# DataFrame.apply(axis=1) builds, while yielding the same values in the same
# row order.
charts['month'] = [
    descobre_mes(ano, semana)
    for ano, semana in zip(charts['year'], charts['week'])
]

In [9]:
# Inspect the columns of the merged frame; the `_x` / `_y` suffixes mark
# column names that existed on both sides of a merge.
charts.columns

Index(['position', 'song_id', 'song_name_x', 'artist', 'streams',
       'last_week_position', 'weeks_on_chart', 'peak_position',
       'position_status', 'year', 'week', 'country', 'song_name_y',
       'artist_id', 'artist_name', 'popularity_x', 'explicit', 'song_type',
       'track_number', 'num_artists', 'num_available_markets', 'release_date',
       'duration_ms', 'key', 'mode', 'time_signature', 'acousticness',
       'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness',
       'speechiness', 'valence', 'tempo', 'name', 'followers', 'popularity_y',
       'genres', 'image_url', 'month'],
      dtype='object')

In [24]:
# Columns needed to rank artists per year and country.
colunas_de_interesse = [
    'artist_id', 'artist_name', 'year', 'streams',
    'song_name_x', 'genres', 'country', 'image_url',
]
data = charts[colunas_de_interesse]

# How each column is collapsed within one (artist, year, country) group.
agg_func = dict(
    artist_name='first',
    streams='sum',
    song_name_x=list,
    genres=list,
    image_url='first',
)


def _generos_unicos(generos):
    """Flatten the collected genre values and keep each distinct one once."""
    return list(pd.Series(generos).explode().unique())


data_analysis = (
    data
    .groupby(['artist_id', 'year', 'country'])
    .agg(agg_func)
    .reset_index()
    .assign(
        genres=lambda df: df['genres'].apply(_generos_unicos),
        artist_name=lambda df: df['artist_name'].apply(extrair_elementos),
        song_name_x=lambda df: df['song_name_x'].apply(remover_duplicatas),
    )
)

# Keep the five most-streamed artists of every (year, country) pair.
data_analysis = (
    data_analysis
    .sort_values(by=['year', 'country', 'streams'], ascending=[True, True, False])
    .groupby(['year', 'country'])
    .head(5)
)

data_analysis.info()

<class 'pandas.core.frame.DataFrame'>
Index: 135 entries, 5324 to 793
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   artist_id    135 non-null    object
 1   year         135 non-null    int64 
 2   country      135 non-null    object
 3   artist_name  135 non-null    object
 4   streams      135 non-null    int64 
 5   song_name_x  135 non-null    object
 6   genres       135 non-null    object
 7   image_url    135 non-null    object
dtypes: int64(2), object(6)
memory usage: 9.5+ KB


In [25]:
# Spot check: the artists kept for Brazil in 2017.
brasil_2017 = data_analysis['year'].eq(2017) & data_analysis['country'].eq('brazil')
data_analysis.loc[brasil_2017, 'artist_name']

1990       Matheus & Kauan
3007    Henrique & Juliano
1246        Jorge & Mateus
1467      Marília Mendonça
5325            Ed Sheeran
Name: artist_name, dtype: object

<h1>Gráfico</h1>

Tentando fazer um tooltip que mostre a imagem do artista.

In [26]:
# Trying to attach images: expose the artist picture URL as a string column
# named `image` on a separate frame used by the chart.
data2 = (
    data_analysis
    .rename(columns={'image_url': 'image'})
    .astype({'image': str})
)
data2['image'].dtype

dtype('O')

GRÁFICO DE BARRAS

In [77]:
# Options offered by each dropdown, in order of first appearance in data2.
anos_disponiveis = data2['year'].unique().tolist()
paises_disponiveis = data2['country'].unique().tolist()

# Dropdown widgets bound to the chart selections below.
dropdown_year = alt.binding_select(options=anos_disponiveis, name='Escolha o ano ')
dropdown_country = alt.binding_select(
    options=paises_disponiveis,
    name='Escolha um país ou visão global ',
)

# Point selections driven by the dropdowns, each with an initial value.
year_select = alt.selection_point(
    bind=dropdown_year,
    fields=['year'],
    value=2017,
)
country_select = alt.selection_point(
    bind=dropdown_country,
    fields=['country'],
    value='global',
)

def make_scale(country_select):
  """Build the y-axis scale for the chart.

  The domain is (0, 5000000000) when ``country_select`` compares equal to
  ``'global'`` and (0, 100) otherwise. This helper is not called in the
  visible cells.

  NOTE(review): the (0, 100) upper bound looks too small for stream counts
  (a commented-out variant nearby uses 10000000) -- confirm before use.
  """
  if country_select == 'global':
    limites = (0, 5000000000)
  else:
    limites = (0, 100)
  return alt.Scale(domain=limites)

# y_scale = alt.Scale(domain=(0, 5000000000) if country_select == 'global' else (0, 10000000))
# Bar chart of the top artists, filtered by the two dropdown selections.
grafico = alt.Chart(data2, width=600, height=300).mark_bar(
    width=50,
    height=50
).encode(
    alt.X('artist_name', type='nominal', title='Artistas'),
    alt.Y('streams', type='quantitative', title='Número de Streams durante o ano'),
    # The tooltip shows the `image` column (the artist picture URL).
    tooltip=['image']
).add_params(
    # `add_selection` is deprecated in Altair 5; `add_params` replaces it.
    year_select,
    country_select
).transform_filter(
    # Keep only the rows matching both the chosen year and the chosen country.
    year_select & country_select
).properties(title='Top 5 Artistas do ano do país e ano selecionados')

# Text layer derived from the bars: prints the genres just above each bar.
generos = grafico.mark_text(align='center', baseline='middle', dy=-10).encode(
    text='genres'
)

(grafico + generos).properties(padding=20)

