In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import time
from pytz import timezone
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go


from matplotlib.dates import DateFormatter

In [12]:
data_dir = '/Users/Karolina/ProjectsDataScience/data_science_environment/data/'
file_list = ['MyDataSpotify/StreamingHistory0.json',
             'MyDataSpotify/StreamingHistory1.json',
             'MyDataSpotify/StreamingHistory2.json',
             'MyDataSpotify/StreamingHistory3.json',
             'MyDataSpotify/StreamingHistory4.json',
             'MyDataSpotify/StreamingHistory5.json',
             'MyDataSpotify/StreamingHistory6.json']

dfs = [] # an empty list to store the data frames
for file in file_list:
    data = pd.read_json(data_dir+file) # read data frame from json file
    dfs.append(data) # append the data frame to the list

df = pd.concat(dfs, ignore_index=True) # concatenate all the data frames in the list.
df.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2020-11-21 08:21,"Does It Offend You, Yeah?",We Are Rockstars,13744
1,2020-11-21 08:25,"Does It Offend You, Yeah?",With A Heavy Heart (I Regret To Inform You),256170
2,2020-11-21 08:27,"Does It Offend You, Yeah?",We Are Rockstars,69343
3,2020-11-21 08:30,"Does It Offend You, Yeah?",Doomed Now,221133
4,2020-11-21 08:31,"Does It Offend You, Yeah?",Being Bad Feels Pretty Good,2603


In [13]:
# remove duplicates
df.drop_duplicates(inplace=True)

# double checking for duplicates
df.duplicated().sum()

0

In [14]:
df.endTime = pd.to_datetime(df.endTime, infer_datetime_format=True) 

#Set the current time to US Pacific timezone
df['endTimeTz']= df.endTime.dt.tz_localize(tz='UTC')

#convert US Pacific to Europe/Berlin Timezone
df['endTimeTzEU']=df.endTimeTz.dt.tz_convert(tz='Europe/Berlin')

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59053 entries, 0 to 63059
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype                        
---  ------       --------------  -----                        
 0   endTime      59053 non-null  datetime64[ns]               
 1   artistName   59053 non-null  object                       
 2   trackName    59053 non-null  object                       
 3   msPlayed     59053 non-null  int64                        
 4   endTimeTz    59053 non-null  datetime64[ns, UTC]          
 5   endTimeTzEU  59053 non-null  datetime64[ns, Europe/Berlin]
dtypes: datetime64[ns, Europe/Berlin](1), datetime64[ns, UTC](1), datetime64[ns](1), int64(1), object(2)
memory usage: 3.2+ MB


In [15]:
df['hour'] = df.endTimeTzEU.dt.hour
df['date'] = df['endTime'].dt.to_period('D').apply(lambda r: r.start_time)
df['week'] = df['endTime'].dt.to_period('W').apply(lambda r: r.start_time)
df['month'] = df['endTimeTzEU'].dt.to_period('M').apply(lambda r: r.start_time)


df['sPlayed'] = df['msPlayed']/(1000)
df['mPlayed'] = df['sPlayed']/(60)
df['hPlayed'] = df['sPlayed']/(60*60)


Converting to PeriodArray/Index representation will drop timezone information.



In [16]:
df.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed,endTimeTz,endTimeTzEU,hour,date,week,month,sPlayed,mPlayed,hPlayed
0,2020-11-21 08:21:00,"Does It Offend You, Yeah?",We Are Rockstars,13744,2020-11-21 08:21:00+00:00,2020-11-21 09:21:00+01:00,9,2020-11-21,2020-11-16,2020-11-01,13.744,0.229067,0.003818
1,2020-11-21 08:25:00,"Does It Offend You, Yeah?",With A Heavy Heart (I Regret To Inform You),256170,2020-11-21 08:25:00+00:00,2020-11-21 09:25:00+01:00,9,2020-11-21,2020-11-16,2020-11-01,256.17,4.2695,0.071158
2,2020-11-21 08:27:00,"Does It Offend You, Yeah?",We Are Rockstars,69343,2020-11-21 08:27:00+00:00,2020-11-21 09:27:00+01:00,9,2020-11-21,2020-11-16,2020-11-01,69.343,1.155717,0.019262
3,2020-11-21 08:30:00,"Does It Offend You, Yeah?",Doomed Now,221133,2020-11-21 08:30:00+00:00,2020-11-21 09:30:00+01:00,9,2020-11-21,2020-11-16,2020-11-01,221.133,3.68555,0.061426
4,2020-11-21 08:31:00,"Does It Offend You, Yeah?",Being Bad Feels Pretty Good,2603,2020-11-21 08:31:00+00:00,2020-11-21 09:31:00+01:00,9,2020-11-21,2020-11-16,2020-11-01,2.603,0.043383,0.000723


In [17]:
agg_dict = {'hPlayed':'sum', 'trackName':'count'}
df_agg = df.groupby(['date', 'hour'], as_index=False).agg(agg_dict)
df_agg


df_agg.loc[:, 'dow'] = df_agg['date'].dt.dayofweek
df_agg.loc[:, 'dom'] = df_agg['date'].dt.day
df_agg.loc[:, 'day'] = df_agg['date'].dt.strftime('%a')

df_agg = df_agg.sort_values(by=['dow', 'hour'], ascending=False)

cols = df_agg['hour'].unique().tolist()
df_agg = pd.pivot_table(df_agg, index=['date', 'dow', 'day', 'dom'], columns='hour',
                    values='hPlayed', aggfunc='sum')\
        .fillna(0).reset_index()


hour_df = pd.melt(df_agg, id_vars=['date', 'dow', 'day', 'dom'], value_vars=cols)

hour_df['week'] = hour_df['date'].dt.to_period('W').apply(lambda r: r.start_time)
hour_df['month'] = hour_df['date'].dt.to_period('M').apply(lambda r: r.start_time)

hour_df.head(24)


Unnamed: 0,date,dow,day,dom,hour,value,week,month
0,2020-11-21,5,Sat,21,23,0.0,2020-11-16,2020-11-01
1,2020-11-22,6,Sun,22,23,0.362365,2020-11-16,2020-11-01
2,2020-11-23,0,Mon,23,23,0.27138,2020-11-23,2020-11-01
3,2020-11-24,1,Tue,24,23,0.0,2020-11-23,2020-11-01
4,2020-11-25,2,Wed,25,23,0.613533,2020-11-23,2020-11-01
5,2020-11-26,3,Thu,26,23,0.501301,2020-11-23,2020-11-01
6,2020-11-27,4,Fri,27,23,0.204901,2020-11-23,2020-11-01
7,2020-11-28,5,Sat,28,23,0.564695,2020-11-23,2020-11-01
8,2020-11-29,6,Sun,29,23,0.977205,2020-11-23,2020-11-01
9,2020-11-30,0,Mon,30,23,0.053934,2020-11-30,2020-11-01


In [18]:
hour_df.groupby(['month'], as_index=False).agg({'value':'sum', 'date':'nunique'})

# Monthly usage
df_monthly = hour_df.sort_values(by='date')
df_monthly = df_monthly.groupby(['month'], as_index=False).agg({'value':'sum', 'date':'nunique'})

#dividing by 24 to get from hours to days
df_monthly.loc[:, 'perc'] = 100*(df_monthly['value']/(24))/df_monthly['date']

min_perc = (df_monthly['perc'] == df_monthly['perc'].min())
max_perc = (df_monthly['perc'] == df_monthly['perc'].max())

df_monthly.loc[min_perc | max_perc, 'text'] = df_monthly['month'].dt.strftime('%b')
df_monthly.fillna('', inplace=True)

df_monthly

Unnamed: 0,month,value,date,perc,text
0,2020-11-01,57.256078,10,23.856699,
1,2020-12-01,180.001789,31,24.193789,
2,2021-01-01,140.831439,31,18.928957,
3,2021-02-01,145.021871,28,21.580636,
4,2021-03-01,176.210461,31,23.684202,
5,2021-04-01,145.486167,30,20.206412,
6,2021-05-01,172.591502,31,23.197783,
7,2021-06-01,119.853254,30,16.646285,
8,2021-07-01,135.555843,31,18.219871,
9,2021-08-01,137.436762,31,18.472683,


In [19]:
spotify_colors = ['#1DB954', '#191414']
heat_pal = px.colors.sequential.YlGn

In [20]:
fig = px.line(df_monthly, x='month', y='perc', line_shape='spline', text='text',
              color_discrete_sequence=spotify_colors)
fig.update_traces(mode='lines+markers+text', textposition='top center', line_width=2.5)
fig.update_yaxes(rangemode='tozero', ticksuffix='%')
fig.update_xaxes(tickformat='%b %y', nticks=5)
fig.update_layout(
    title='<b>Spotify</b>: <i> How much time did I spend on Spotify each month?</i>',
    yaxis_title='% of total month time listening to music',
    xaxis_title=''
)
fig.show()

In [21]:
# Dow x Hours
df_daily_sum = hour_df\
                .groupby(['date', 'dow', 'day'], as_index=False)\
                .agg({'value':'sum'})

df_daily_sum = df_daily_sum\
                .groupby(['dow', 'day'], as_index=False)\
                .agg({'value':'mean'})\
                .sort_values(by='dow')

In [22]:
fig = px.line(df_daily_sum, x='day', y='value', color_discrete_sequence=spotify_colors, line_shape='spline')
fig.update_traces(mode='lines+markers+text', texttemplate='%{y:.1f}h', textposition='top center')
fig.update_yaxes(rangemode='tozero', ticksuffix='h', nticks=2)

fig.update_layout(
    title='<b>Spotify</b>: <i>When in the week do I listen to music the most?</i>',
    yaxis_title='Average hours listened per day',
    yaxis_range=[0,7],
    xaxis_title='Day of the week'
    
)
fig.show()

In [23]:
df_hour = hour_df\
                .groupby(['hour'], as_index=False)\
                .agg({'value':'mean'})\
                .sort_values(by=['hour'])

df_hour['hour_to_minutes'] = df_hour['value']*60

In [24]:
palette = sns.color_palette("YlGn", 24).as_hex() #RdYlBu

fig = go.Figure(go.Barpolar(
    r=df_hour.hour_to_minutes,
    theta=np.linspace(0, 360, 24, endpoint=False),
    width=15,
    marker_color=palette,
    marker_line_color='white',
    marker_line_width=0.10,
    #opacity=0.9,
    offset=0
))

num_slices=24
angular_tickvals = [(i) * 360 / num_slices for i in range(num_slices)]


labels = [time(i, 0).strftime("%H:%M") for i in df_hour.index]

fig.update_layout(
    title='<b>Spotify</b>: <i> Listening clock</i>',
    template=None,
    #polar_angularaxis_tickvals=angular_tickvals,
    polar = dict(
        bgcolor='#e5ecf6',
        radialaxis = dict(range=[0, 25], 
                          showticklabels=False,
                          #showgrid=False
                          ticks='', 
                          nticks=1,
                          color='grey', 
                          linewidth=0),
        angularaxis = dict(showticklabels=True, 
                           ticks='',
                           tickfont_size = 8,
                           tickfont_color = 'black',
                           rotation = 90,
                           direction = "clockwise",
                           color='white', 
                           tickmode='array',
                           tickvals=angular_tickvals,
                           ticktext=labels,
                           #tickcolor='yellow'
                           )
        
    )
)

fig.show()

In [25]:
hour_df['week_period'] = hour_df['dow'].apply(lambda x: 'Weekend' if x >= 5 else 'Weekdays')

df_weekend = hour_df\
                .groupby(['hour', 'week_period'], as_index=False)\
                .agg({'value':'mean'})\
                .sort_values(by=['week_period', 'hour'])

df_weekend['hour_to_minutes'] = df_weekend['value']*60
df_weekend['text'] = ''

df_weekend.loc[(df_weekend['week_period'] == 'Weekdays') & (df_weekend['hour'].isin([8, 10, 18])), 'text'] = df_weekend['hour'].apply(lambda x: "{:.0f}h".format(x))
df_weekend.loc[(df_weekend['week_period'] == 'Weekend') & (df_weekend['hour'].isin([10, 18, 21])), 'text'] = df_weekend['hour'].apply(lambda x: "{:.0f}h".format(x))

In [26]:
fig = px.line(df_weekend, x='hour', y='hour_to_minutes', color='week_period', text='text',
              line_shape='spline', color_discrete_sequence=spotify_colors)

fig.update_traces(mode='lines+markers+text', textposition='top center', line_width=2.5)
fig.update_yaxes(rangemode='tozero', ticksuffix='min', nticks=5)
fig.update_xaxes(title='', ticksuffix=':00', tickangle=45)

fig.update_layout(
    title='<b>Spotify</b>: <i>Do my habits change during the weekend?</i>',
    legend_title='Week period',
    legend_orientation='h',
    yaxis_title='Average time listening to music (min)',
)

fig.show()

In [27]:
df['sec_disc'] = np.floor(df['sPlayed'])
df_skip = df.groupby(['sec_disc'], as_index=False).agg({'trackName':'count'}).sort_values(by='sec_disc')
df_skip['percent'] = df_skip.sort_values(by='sec_disc')['trackName'].cumsum()/df_skip['trackName'].sum()
df_skip = df_skip.loc[df_skip['percent'] < 0.99]

In [28]:
fig = px.histogram(df_skip, x='sec_disc', y='trackName', histfunc='sum', histnorm='percent', nbins=50, 
                   color_discrete_sequence=spotify_colors)
fig.update_yaxes(ticksuffix='%')
fig.update_xaxes(dtick=30)
fig.update_layout(
    title='<b>Spotify</b>: <i>Do I skip songs a lot?</i>',
    yaxis_title='Share of streams',
    xaxis_title='Seconds of song'
)