In [26]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import time
from pytz import timezone
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import plotly.express as px

from matplotlib.dates import DateFormatter

In [60]:
# Folder holding the Spotify data export; the streaming history is split
# into numbered JSON parts (StreamingHistory0.json ... StreamingHistory4.json).
data_dir = '/Users/Karolina/ProjectsDataScience/data_science_environment/data/'
file_list = [f'MyDataSpotify/StreamingHistory{part}.json' for part in range(5)]

# Read each part into its own frame, keeping the parts in file order.
dfs = []
for file in file_list:
    data = pd.read_json(data_dir + file)
    dfs.append(data)

# Stack the parts into one frame with a fresh 0..n-1 index and preview it.
df = pd.concat(dfs, ignore_index=True)
df.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2020-11-21 08:21,"Does It Offend You, Yeah?",We Are Rockstars,13744
1,2020-11-21 08:25,"Does It Offend You, Yeah?",With A Heavy Heart (I Regret To Inform You),256170
2,2020-11-21 08:27,"Does It Offend You, Yeah?",We Are Rockstars,69343
3,2020-11-21 08:30,"Does It Offend You, Yeah?",Doomed Now,221133
4,2020-11-21 08:31,"Does It Offend You, Yeah?",Being Bad Feels Pretty Good,2603


In [61]:
# Parse the 'YYYY-MM-DD HH:MM' strings into naive timestamps.
# (`infer_datetime_format` is deprecated in pandas 2.x, where format
# inference is already the default, so it is no longer passed.)
df['endTime'] = pd.to_datetime(df['endTime'])

# The parsed timestamps carry no offset; label them as UTC.
# NOTE(review): this assumes the export is written in UTC -- confirm against
# the Spotify data-export documentation.
df['endTimeTz'] = df['endTime'].dt.tz_localize(tz='UTC')

# Convert from UTC to Europe/Berlin local time (DST-aware).
df['endTimeTzEU'] = df['endTimeTz'].dt.tz_convert(tz='Europe/Berlin')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47735 entries, 0 to 47734
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype                        
---  ------       --------------  -----                        
 0   endTime      47735 non-null  datetime64[ns]               
 1   artistName   47735 non-null  object                       
 2   trackName    47735 non-null  object                       
 3   msPlayed     47735 non-null  int64                        
 4   endTimeTz    47735 non-null  datetime64[ns, UTC]          
 5   endTimeTzEU  47735 non-null  datetime64[ns, Europe/Berlin]
dtypes: datetime64[ns, Europe/Berlin](1), datetime64[ns, UTC](1), datetime64[ns](1), int64(1), object(2)
memory usage: 2.2+ MB


In [62]:
# Europe/Berlin wall-clock time with the tz offset stripped. Every calendar
# bucket below is derived from this one series so that `hour`, `date`, `week`
# and `month` all describe the same local moment. Previously `date` and `week`
# were taken from the UTC `endTime`, so plays just before or after local
# midnight were paired with the wrong day. Working on naive local timestamps
# also avoids the "will drop timezone information" warning from `to_period`.
local_end = df['endTimeTzEU'].dt.tz_localize(None)

df['hour'] = local_end.dt.hour
df['date'] = local_end.dt.normalize()                      # midnight of the local day
df['week'] = local_end.dt.to_period('W').dt.start_time     # start of the weekly period
df['month'] = local_end.dt.to_period('M').dt.start_time    # first day of the local month

# Play duration in seconds, minutes and hours (msPlayed is in milliseconds).
df['sPlayed'] = df['msPlayed'] / 1000
df['mPlayed'] = df['sPlayed'] / 60
df['hPlayed'] = df['sPlayed'] / (60 * 60)


Converting to PeriodArray/Index representation will drop timezone information.



In [63]:
# Preview the first rows to check the derived time-bucket and duration columns.
df.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed,endTimeTz,endTimeTzEU,hour,date,week,month,sPlayed,mPlayed,hPlayed
0,2020-11-21 08:21:00,"Does It Offend You, Yeah?",We Are Rockstars,13744,2020-11-21 08:21:00+00:00,2020-11-21 09:21:00+01:00,9,2020-11-21,2020-11-16,2020-11-01,13.744,0.229067,0.003818
1,2020-11-21 08:25:00,"Does It Offend You, Yeah?",With A Heavy Heart (I Regret To Inform You),256170,2020-11-21 08:25:00+00:00,2020-11-21 09:25:00+01:00,9,2020-11-21,2020-11-16,2020-11-01,256.17,4.2695,0.071158
2,2020-11-21 08:27:00,"Does It Offend You, Yeah?",We Are Rockstars,69343,2020-11-21 08:27:00+00:00,2020-11-21 09:27:00+01:00,9,2020-11-21,2020-11-16,2020-11-01,69.343,1.155717,0.019262
3,2020-11-21 08:30:00,"Does It Offend You, Yeah?",Doomed Now,221133,2020-11-21 08:30:00+00:00,2020-11-21 09:30:00+01:00,9,2020-11-21,2020-11-16,2020-11-01,221.133,3.68555,0.061426
4,2020-11-21 08:31:00,"Does It Offend You, Yeah?",Being Bad Feels Pretty Good,2603,2020-11-21 08:31:00+00:00,2020-11-21 09:31:00+01:00,9,2020-11-21,2020-11-16,2020-11-01,2.603,0.043383,0.000723


In [70]:
# Total listened hours per (calendar day, hour-of-day) bucket.
agg_dict = {'hPlayed': 'sum'}
hourly_totals = df.groupby(['date', 'hour'], as_index=False).agg(agg_dict)

# Calendar attributes of each day: weekday index (Mon=0), day of month and
# abbreviated weekday name.
df_agg = hourly_totals.assign(
    dow=hourly_totals['date'].dt.dayofweek,
    dom=hourly_totals['date'].dt.day,
    day=hourly_totals['date'].dt.strftime('%a'),
)

# The sort only determines the order of `cols`, i.e. the order in which the
# hour columns are unpivoted below.
df_agg = df_agg.sort_values(by=['dow', 'hour'], ascending=False)
cols = df_agg['hour'].unique().tolist()

# Wide layout: one row per day, one column per hour; hours without any
# playback on a given day become 0 instead of NaN.
df_agg = (
    pd.pivot_table(df_agg, index=['date', 'dow', 'day', 'dom'], columns='hour',
                   values='hPlayed', aggfunc='sum')
    .fillna(0)
    .reset_index()
)

# Back to long layout, now with an explicit (zero-filled) row for every
# day/hour combination; the listened hours end up in the `value` column.
hour_df = pd.melt(df_agg, id_vars=['date', 'dow', 'day', 'dom'], value_vars=cols)

# Start of the week / month each day belongs to.
hour_df['week'] = hour_df['date'].dt.to_period('W').dt.start_time
hour_df['month'] = hour_df['date'].dt.to_period('M').dt.start_time

hour_df.head(24)


Unnamed: 0,date,hour,hPlayed,trackName
0,2020-11-21,9,0.581994,65
1,2020-11-21,10,0.407470,20
2,2020-11-21,11,0.994231,29
3,2020-11-21,12,0.406045,9
4,2020-11-21,14,0.092658,3
...,...,...,...,...
4247,2021-11-21,17,0.643222,24
4248,2021-11-21,20,0.330321,6
4249,2021-11-21,21,0.755962,10
4250,2021-11-21,22,0.580919,18


In [128]:
# Monthly usage: total listened hours (`value`) and the number of distinct
# days present in the data (`date`) for each month.
df_monthly = hour_df.groupby(['month'], as_index=False).agg({'value': 'sum', 'date': 'nunique'})

# Dividing by 24 turns listened hours into listened days; dividing by the
# number of days with data gives the share of that time spent listening.
df_monthly['perc'] = 100 * (df_monthly['value'] / 24) / df_monthly['date']

# Label only the months with the lowest and the highest share; all other
# rows get an empty label so the chart annotates just the extremes.
perc = df_monthly['perc']
is_extreme = (perc == perc.min()) | (perc == perc.max())
df_monthly['text'] = df_monthly['month'].dt.strftime('%b').where(is_extreme, '')

df_monthly

Unnamed: 0,month,value,date,perc,text
0,2020-11-01,57.256078,10,23.856699,
1,2020-12-01,180.001789,31,24.193789,
2,2021-01-01,140.831439,31,18.928957,
3,2021-02-01,145.021871,28,21.580636,
4,2021-03-01,176.210461,31,23.684202,
5,2021-04-01,145.486167,30,20.206412,
6,2021-05-01,172.591502,31,23.197783,
7,2021-06-01,119.853254,30,16.646285,
8,2021-07-01,135.555843,31,18.219871,
9,2021-08-01,137.436762,31,18.472683,


In [129]:
# Colour sequence for the line charts. Defined here because this is the first
# cell that needs it; a later cell assigns the same value, and without this
# line a top-to-bottom run of the notebook fails with a NameError.
spotify_colors = ['#1DB954', '#191414']

# Share of each month spent listening, with only the extreme months labelled
# (the `text` column is empty for every other month).
fig = px.line(df_monthly, x='month', y='perc', line_shape='spline', text='text',
              color_discrete_sequence=spotify_colors)
fig.update_traces(mode='lines+markers+text', textposition='top center', line_width=2.5)
fig.update_yaxes(rangemode='tozero', ticksuffix='%')
fig.update_xaxes(tickformat='%b %y', nticks=5)
fig.update_layout(
    # The italic tag is now closed, matching the other chart titles.
    title='<b>Spotify</b>: <i> How much time did I spend on Spotify each month?</i>',
    yaxis_title='% of total month time listening to music',
    xaxis_title=''
)
fig.show()

In [66]:
# Display the wide day-by-hour table (one row per day, one column per hour of the day).
df_agg

hour,date,dow,day,dom,0,1,2,3,4,5,...,14,15,16,17,18,19,20,21,22,23
0,2020-11-21,5,Sat,21,0.0,0.0,0.0,0.0,0.0,0.0,...,0.092658,0.875201,0.461119,0.432448,0.000000,0.001484,0.149162,0.447485,0.000000,0.000000
1,2020-11-22,6,Sun,22,0.0,0.0,0.0,0.0,0.0,0.0,...,0.235185,0.376365,0.000000,0.094099,0.013161,0.316239,0.049721,0.000000,0.357879,0.362365
2,2020-11-23,0,Mon,23,0.0,0.0,0.0,0.0,0.0,0.0,...,0.654687,0.085726,1.031896,0.890592,0.663798,0.000000,0.000000,0.826353,0.401900,0.271380
3,2020-11-24,1,Tue,24,0.0,0.0,0.0,0.0,0.0,0.0,...,0.149162,0.807964,0.000000,0.000000,0.041846,0.000000,0.000000,0.000000,0.000000,0.000000
4,2020-11-25,2,Wed,25,0.0,0.0,0.0,0.0,0.0,0.0,...,0.347016,0.967682,0.801217,0.000000,0.371095,0.475291,0.678444,0.317439,0.524991,0.613533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,2021-11-17,2,Wed,17,0.0,0.0,0.0,0.0,0.0,0.0,...,0.312070,0.289874,0.251244,0.459358,0.002144,0.194176,0.000000,0.317527,0.154941,0.222518
362,2021-11-18,3,Thu,18,0.0,0.0,0.0,0.0,0.0,0.0,...,0.180989,0.114858,0.162406,0.080933,0.512751,0.006260,0.156647,0.344175,0.020769,0.000000
363,2021-11-19,4,Fri,19,0.0,0.0,0.0,0.0,0.0,0.0,...,0.664811,0.435628,0.627551,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.320508
364,2021-11-20,5,Sat,20,0.0,0.0,0.0,0.0,0.0,0.0,...,0.356630,0.837122,1.110942,0.535390,0.067806,0.827087,0.938616,0.113237,0.254808,0.806508


In [41]:
# One row per calendar day with the total hours listened that day.
df = hour_df.groupby(['date', 'dow', 'day'], as_index=False).agg({'value': 'sum'})

# Start of the week / month each day belongs to.
df['week_dt'] = df['date'].dt.to_period('W').dt.start_time
df['month_dt'] = df['date'].dt.to_period('M').dt.start_time

# Start of the following week.
df['next'] = df['week_dt'] + pd.Timedelta(7, 'day')

# Day of month and its successor (previously computed from `dow` by mistake).
df['dom'] = df['date'].dt.day
df['next_dom'] = df['dom'] + 1

# Each day spans [dow, dow + 1) on a weekday axis; `delta` is that width.
df['next_dow'] = df['dow'] + 1
df['delta'] = df['next_dow'] - df['dow']

# Human-readable labels. The date format previously contained a stray '%'
# ('%d% %b %Y'), which is not a valid strftime directive.
df['week'] = df['week_dt'].dt.strftime('%d %b %Y')
df['date_str'] = df['date'].dt.strftime('%d %b %Y')

# Hover configuration passed as `hover_data` to the timeline figure.
hover_dict = {'week_dt': False, 'next': False, 'week': True, 'value': ':.3g'}

In [42]:
# Two-colour sequence used for the line charts.
SPOTIFY_GREEN = '#1DB954'
SPOTIFY_BLACK = '#191414'
spotify_colors = [SPOTIFY_GREEN, SPOTIFY_BLACK]

# Sequential yellow-to-green palette taken from plotly.
heat_pal = px.colors.sequential.YlGn

In [43]:
# Calendar-style heat map: one tile per day, placed by weekday on x
# (from `dow` to `next_dow`) and by week start on y, coloured by the hours
# listened that day (`value`).
fig = px.timeline(df.sort_values(by='dom', ascending=False), x_start='dow', x_end='next_dow', y='week_dt', 
                  hover_name='date_str', color='value', hover_data=hover_dict, 
                  color_continuous_scale='YlGn'
                 )

# Reverse the y axis and put one tick per month, formatted like 'Nov/20'.
fig.update_yaxes(
    autorange='reversed',
    dtick='M1',
    tickformat='%b/%y'
)
# fig.update_xaxes(dtick=5, range=[0.5, 32])

# The x values here are plain weekday numbers rather than dates, so the x axis
# is switched to a linear scale and the trace's x values are replaced with
# `delta` (next_dow - dow).
# NOTE(review): the figure is built from a frame sorted by `dom`, while
# `delta` is read from the unsorted `df`. This looks harmless only because
# `delta` is the same for every row (next_dow is dow + 1) -- confirm.
fig.layout.xaxis.type = 'linear'
fig.data[0].x = df['delta'].tolist()


# Thin light-grey outline around each tile.
fig.update_traces(marker_line_color='lightgrey', marker_line_width=0.5)

# Tick positions at k + 0.5 sit in the middle of tiles spanning [k, k + 1],
# so each weekday name is centred under its column.
fig.update_xaxes(tickvals=[0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5], #tickangle=45,
                 showgrid=False,
                 ticktext=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])

# Title, margins, empty axis titles and colour-bar labelling.
fig.update_layout(
    title='<b>Spotify (un)Wrapped</b>:<i> When did I use Spotify?</i>',
    margin_t=80,
    yaxis_title='',
    xaxis_title='',
    coloraxis_colorbar_title='Listened<br>hours',
    coloraxis_colorbar_ticksuffix='h'
)

# fig.update_layout(coloraxis_colorbar_tickfont_size=18, height=600, width=600)
# write('insta', fig, 'heatmap')

# Final sizing for display in the notebook.
fig.update_layout(coloraxis_colorbar_tickfont_size=12, width=500)
fig.show()

Unnamed: 0,month,value,date
0,2020-11-01,57.256078,10
1,2020-12-01,180.001789,31
2,2021-01-01,140.831439,31
3,2021-02-01,145.021871,28
4,2021-03-01,176.210461,31
5,2021-04-01,145.486167,30
6,2021-05-01,172.591502,31
7,2021-06-01,119.853254,30
8,2021-07-01,135.555843,31
9,2021-08-01,137.436762,31


In [45]:
# Average hours listened per day of the week.
# Step 1: total hours per calendar day.
daily_totals = hour_df.groupby(['date', 'dow', 'day'], as_index=False).agg({'value': 'sum'})

# Step 2: mean of those daily totals for each weekday, ordered Monday first.
df = (
    daily_totals
    .groupby(['dow', 'day'], as_index=False)
    .agg({'value': 'mean'})
    .sort_values(by='dow')
)

# Smoothed line with a point and a value label for each weekday.
fig = px.line(df, x='day', y='value', line_shape='spline', color_discrete_sequence=spotify_colors)
fig.update_traces(textposition='top center', texttemplate='%{y:.1f}h', mode='lines+markers+text')
fig.update_yaxes(nticks=2, ticksuffix='h', rangemode='tozero')

fig.update_layout(
    xaxis_title='Day of the week',
    yaxis_title='Average hours listened per day',
    title='<b>Spotify (un)Wrapped</b>: <i>When in the week do I listen to music the most?</i>',
)
fig.show()

In [33]:
# Width (in hours) of the time-of-day buckets.
HOUR_BIN = 1

# Tag every day/hour row with its time-of-day bucket and week period.
df = hour_df.copy()
df['hour_disc'] = np.floor(df['hour'] / HOUR_BIN) * HOUR_BIN
df['week_period'] = np.where(df['dow'] >= 5, 'Weekend', 'Weekdays')

# Mean listened hours per bucket, separately for weekdays and weekends.
df = (
    df
    .groupby(['hour_disc', 'week_period'], as_index=False)
    .agg({'value': 'mean'})
    .sort_values(by=['week_period', 'hour_disc'])
)

# Hours -> minutes for the y axis.
df['hour_percentage'] = df['value'] * 60

# Annotate only a few hand-picked hours on each curve; all other points get
# an empty label.
hour_labels = df['hour_disc'].map("{:.0f}h".format)
weekday_marks = (df['week_period'] == 'Weekdays') & df['hour_disc'].isin([1, 14, 18, 21])
weekend_marks = (df['week_period'] == 'Weekend') & df['hour_disc'].isin([16, 21])
df['text'] = hour_labels.where(weekday_marks | weekend_marks, '')

# One smoothed line per week period.
fig = px.line(df, x='hour_disc', y='hour_percentage', color='week_period', text='text',
              color_discrete_sequence=spotify_colors, line_shape='spline')
fig.update_traces(line_width=2.5, textposition='top center', mode='lines+markers+text')
fig.update_yaxes(nticks=5, ticksuffix='min', rangemode='tozero')
fig.update_xaxes(tickangle=45, ticksuffix='h', title='')
fig.update_layout(
    title='<b>Spotify (un)Wrapped</b>: <i>At what time did I listen to music?</i>',
    yaxis_title='Average time listening to music (min)',
    legend_title='Week period',
    legend_orientation='h',
)
fig.show()