# Pandas Datetime Capabilities and Visualizations

Pandas' datetime objects make it possible to accomplish complex datetime operations in just a few lines of code.

In [None]:
# importing the libraries for data processing
import numpy as np 
import pandas as pd 

#matplotlib for visualizations
import matplotlib.pyplot as plt


### 1. Data Preparation
Merge the charts and the tracks datasets. Repeat the process from the previous notebook

In [None]:
# Load the daily charts and convert the `date` column to datetimes so that
# time-based indexing and resampling work in the later sections.
charts_df = pd.read_csv('data/spotify_daily_charts.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
charts_df.head()

In [None]:
# Load the per-track table that accompanies the daily charts.
tracks_df = pd.read_csv('data/spotify_daily_charts_tracks.csv')
# Preview the first rows to check which columns were loaded.
tracks_df.head(n=5)

Merge the charts and tracks datasets on `track_id`, keeping a single `track_name` column

In [None]:
# Left-join track details onto every chart row. Both inputs carry a
# `track_name` column, so the merge yields `_x`/`_y` copies: drop the one
# from the tracks table and restore the plain name on the charts one.
df = (
    charts_df
    .merge(tracks_df, on='track_id', how='left')
    .drop(columns='track_name_y')
    .rename(columns={'track_name_x': 'track_name'})
)

df.head()

### 2. Pandas Time Series Aggregation Capabilities
A *time series* is any data that is indexed by some measure of time. 

A *time series plot* is a graph where some measure of time is the unit on the x-axis, often called the time-axis. The y-axis is for the variable that is being measured. 

Pandas has some quick one-liners to help you operate on time series data.

#### 2.1 Resampling

With a datetime object set as index, a pandas dataframe can be aggregated in time with one simple line!

In [None]:
# Use the chart date as the index so the frame can be resampled in time.
# The guard keeps this cell safe to re-run: once `date` has become the index
# it is no longer a column, and a second set_index('date') would raise KeyError.
if df.index.name != 'date':
    df = df.set_index('date')
df.head()

In [None]:
# Total streams per calendar year ('Y' bins by year; use 'M' to bin by month)
yr_df = df.resample('Y')['streams'].sum()
yr_df

In [None]:
# Bar chart of yearly streams
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111)
# Draw one bar per year (the default kind would be a line), and scale the
# values to billions so they agree with the y-axis label below.
(yr_df / 1e9).plot(ax=ax, kind='bar')

# Show only the year on each tick instead of the full timestamp of the bin
ax.set_xticklabels([x.strftime('%Y') for x in yr_df.index], rotation=0)

plt.xlabel('year')
plt.ylabel('Streams (in billions)')
plt.title('Spotify Yearly Total Streams')

In [None]:
# Total streams per calendar month.
# 'MS' labels each bin with the first day of the month ('M' would use the last day).
mon_df = df.resample('MS')['streams'].sum()
mon_df

In [None]:
# Line chart of the monthly stream totals
fig, ax = plt.subplots(figsize=(8, 6))
# Line is the default plot kind; it is spelled out here only for clarity
mon_df.plot.line(ax=ax)

# Uncomment for cleaner x labels
#ax.set_xticklabels([x.strftime('%Y-%m') for x in mon_df.index])

ax.set_ylabel('streams (in hundred millions)')
ax.set_title('Spotify Monthly Total Streams')

Q: Compute the artist *BLACKPINK*'s total yearly streams from 2018 to 2020

In [None]:
# Keep only BLACKPINK rows, then total their streams per calendar year
df.loc[df['artist'] == 'BLACKPINK', 'streams'].resample('Y').sum()

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.resample.html

#### 2.2 First-Order differencing

At times, we might be more interested in the change of a quantity in time rather than the absolute quantities. We use `df.diff()` for this

In [None]:
# Month-on-month change: each month's total minus the previous month's total
delta_mon_df = mon_df.diff(periods=1)
delta_mon_df

Q: How did Spotify streams grow month-on-month from 2018 to 2020?

In [None]:
# Line chart of the month-on-month change in streams
fig, ax = plt.subplots(figsize=(8, 6))

# Leave out the final (incomplete) month; line is the default plot kind
delta_mon_df.iloc[:-1].plot(ax=ax)

# Uncomment for cleaner x labels
#ax.set_xticklabels([x.strftime('%Y-%m') for x in mon_df.index])

# Dashed reference line at y=0 to separate growth from decline
ax.axhline(0, color='k', ls='--')

ax.set_ylabel('streams (in ten millions)')
ax.set_title('Spotify Month-on-Month Stream Growth')

> Q: Can you show the above month-on-month growth plot as a percentage of the previous month?

#### 2.3 Cumulative sum

We compute the cumulative sum using `df.cumsum()`.

In [None]:
# Get the cumulative (running) total of Spotify monthly streams
mon_df.cumsum()

Q: Compute the cumulative total monthly streams of Ben&Ben's 'Kathang Isip'

In [None]:
# Monthly stream totals for the track, accumulated into a running sum
df.loc[df['track_name'] == 'Kathang Isip', 'streams'].resample('M').sum().cumsum()

In [None]:
# Line chart of the track's cumulative monthly streams
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111)
# Monthly totals for the track, accumulated into a running sum
data = df[df['track_name']=='Kathang Isip']['streams'].resample('M').sum().cumsum()

data.plot(ax=ax, marker='o')
# Uncomment for cleaner x labels (labels must come from the plotted series)
#ax.set_xticklabels([x.strftime('%Y-%m') for x in data.index])

# The curve is a running total for a single track, so label it as such
# rather than as the overall monthly totals.
plt.ylabel('cumulative streams')
plt.title('Spotify Cumulative Monthly Streams: Ben&Ben- Kathang isip')

Q: How do the cumulative total monthly streams of Ed Sheeran's 'Shape of You' compare with those of Ben&Ben's 'Kathang Isip'?

In [None]:
# Line chart comparing the cumulative monthly streams of two tracks
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111)
# Monthly totals per track, accumulated into running sums
data1 = df[df['track_name']=='Kathang Isip']['streams'].resample('M').sum().cumsum()
data2 = df[df['track_name']=='Shape of You']['streams'].resample('M').sum().cumsum()

data1.plot(ax=ax, label='Ben&Ben- Kathang isip')
data2.plot(ax=ax, label='Ed Sheeran- Shape of You')

plt.legend()
# Both curves are running totals, so label them as cumulative rather than
# as plain monthly totals.
plt.ylabel('cumulative streams')
plt.title('Spotify Cumulative Monthly Streams')

#### 2.4 Rolling window

We can use the function `.rolling()` to define a time series window where we could aggregate the target variable

Q: Smooth out daily streams of Ben and Ben's Kathang Isip over a 1-week window

In [None]:
# Mean of the track's streams over a sliding window of 7 consecutive rows
df.loc[df['track_name'] == 'Kathang Isip', 'streams'].rolling(window=7).mean()

In [None]:
fig, ax = plt.subplots(figsize=(13, 4))

# Raw streams for the track, and the same series averaged over 7-row windows
data1 = df.loc[df['track_name'] == 'Kathang Isip', 'streams']
data2 = data1.rolling(window=7).mean()

data1.plot(ax=ax, label='raw')
data2.plot(ax=ax, label='smoothed')

ax.legend()
ax.set_ylabel('streams')
ax.set_title('Spotify Daily Streams: Ben&Ben- Kathang isip')


Q: Compare the 7-day smoothed daily streams of Ben and Ben's Kathang Isip and Taylor Swift's Lover

In [None]:
fig, ax = plt.subplots(figsize=(13, 4))

# Restrict to rows dated 2019 or later, then smooth each track's streams
# with a 7-row rolling mean.
recent = df[df.index.year >= 2019]
data1 = recent.loc[recent['track_name'] == 'Kathang Isip', 'streams'].rolling(window=7).mean()
data2 = recent.loc[recent['track_name'] == 'Lover', 'streams'].rolling(window=7).mean()

data1.plot(ax=ax, label='Ben&Ben- Kathang isip')
data2.plot(ax=ax, label='Taylor Swift- Lover')

ax.legend()
ax.set_ylabel('streams')
ax.set_title('Spotify Daily Streams')


>Q: What insights can you infer from the chart above?

Q: Compare the 7-day top chart **position** for Ben & Ben and Ariana Grande

In [None]:
fig = plt.figure(figsize=(13,4))
ax = fig.add_subplot(111)

# Best (lowest) position among all of the artist's charting songs per day.
# 'position' is selected as a Series rather than a one-column DataFrame so
# that the `label=` given to .plot() is what appears in the legend; with a
# DataFrame the legend entry would be the column name for both artists.
data1 = df[df['artist']=='Ben&Ben'].groupby('date')['position'].min()
# Rolling 7-row minimum of the daily best positions
data1 = data1.rolling(7).min()
data2 = df[df['artist']=='Ariana Grande'].groupby('date')['position'].min()
data2 = data2.rolling(7).min()

data1.plot(ax=ax, label='Ben&Ben')
data2.plot(ax=ax, label='Ariana Grande')

plt.yticks([1]+np.arange(25,201,25).tolist())
# Reverse the y-axis so that position 1 is drawn at the top
plt.ylim([200,0])

plt.legend()
plt.ylabel('Chart Position')
plt.title('Spotify Weekly Top Chart Positions')

>Q: What insights can you infer from the chart above?

### 3. Interactive Time-Series plot in Bokeh


In [None]:
# Bokeh imports used by the interactive charts in this section
from bokeh.io import output_notebook, show
from bokeh.plotting import figure

from bokeh.models.tools import HoverTool

from bokeh.models import ColumnDataSource, Range1d
# Configure Bokeh so that show() renders its output inline in the notebook
output_notebook()

>Q: Compare the contribution of the top 1-50 tracks vs the top 51-200 tracks to the yearly streams of Spotify

In [None]:
# Flag every chart row whose position is 50 or better
df['is_in_top50'] = df['position'].le(50)
df.head()

In [None]:
# Total yearly streams, computed separately for each value of `is_in_top50`;
# reset_index() turns the group keys back into ordinary columns.
yr_seg_df = (
    df.groupby('is_in_top50')[['streams']]
    .resample('Y')
    .sum()
    .reset_index()
)
yr_seg_df

In [None]:
# Grouped bar chart: yearly streams of top-50 rows vs the remaining chart rows
p = figure(title="Spotify Yearly Streams for Charting Tracks",plot_width=500, plot_height=300)
data = yr_seg_df
# Split the yearly totals into the two segments being compared. Each segment
# supplies its own years for the x positions, so bars and heights always come
# from the same rows.
top50 = data[data['is_in_top50']==True]
rest = data[data['is_in_top50']==False]

# Offset the two series left/right of each year so the bars sit side by side
p.vbar(x=top50['date'].dt.year.values-0.15,
       top=top50['streams'].values/1000000000,
       width=0.2, bottom=0, color="crimson", legend_label='Top50')
# The second series must use the rows that are NOT in the top 50; the chart
# covers positions up to 200, so those rows are positions 51-200.
p.vbar(x=rest['date'].dt.year.values+0.15,
       top=rest['streams'].values/1000000000,
       width=0.2, bottom=0, color="dodgerblue", legend_label='Top51-200')

p.xaxis.axis_label = 'date'
p.yaxis.axis_label = 'Streams (in billions)'

p.legend.label_text_font_size = '8pt'
# Clicking a legend entry toggles the visibility of that series
p.legend.click_policy="hide"

show(p)

>Q: What insights can you infer from the chart above?

Convert the time series static plots into Bokeh charts

In [None]:
# Simple interactive time series plot of weekly best chart positions
from bokeh.models import Range1d

p = figure(x_axis_type="datetime", title="Spotify Daily Charts", plot_height=350, plot_width=800)

# Grid and axis styling
p.xgrid.grid_line_color = None
p.ygrid.grid_line_alpha = 0.5
p.xaxis.axis_label = 'date'
p.yaxis.axis_label = 'Position'

# Put position 1 at the top: the explicit range below runs from 200 down to 0.5
p.y_range.flipped = True
p.y_range = Range1d(200, 0.5)

p.yaxis.ticker = [1, 50, 100, 150, 200]
#p.xaxis.major_label_overrides = {1: 'A', 2: 'B', 3: 'C'}

artists = ['Ariana Grande', 'Ben&Ben']
colors = ['orange', 'blue']
for artist, color in zip(artists, colors):
    # Best daily position across the artist's songs, then a rolling 7-row minimum
    positions = (
        df[df['artist'] == artist]
        .groupby('date')[['position']]
        .min()
        .rolling(7)
        .min()
    )
    # Align onto every date present in the charts; dates with no value stay NaN
    artist_df = pd.DataFrame({'date': pd.unique(df.index)}).set_index('date')
    artist_df['position'] = positions
    artist_df = artist_df.reset_index()

    p.line(artist_df['date'], artist_df['position'], color=color, legend_label=artist)


p.legend.location = "bottom_right"
p.legend.click_policy = "hide"

show(p)

In [None]:
# Time series plot with a selectable legend and hover tooltips

p = figure(x_axis_type="datetime", title="Spotify Daily Charts", plot_height=350, plot_width=800)

artists = ['Ariana Grande','Ben&Ben']
colors = ['blue','orange']
for artist, color in zip(artists, colors):
    # Frame holding every date present in the charts, used to align positions
    artist_df = pd.DataFrame({'date':pd.unique(df.index)}).set_index('date')
    # Names of the artist's five most-streamed tracks
    top5songs = df[(df['artist']==artist)].groupby(['track_name'])[['streams']].sum()\
                                            .sort_values(by='streams', ascending=False)[:5].reset_index()['track_name']
    # Best daily position among those tracks, then a rolling 7-row minimum
    positions = df[(df['artist']==artist)&(df['track_name'].isin(top5songs))].groupby('date')[['position']].min().rolling(7).min()
    artist_df['position'] = positions
    artist_df = artist_df.reset_index()
    artist_df['artist'] = artist

    # Reformat the data to suit ColumnDataSource so the hover tool can read
    # the fields by name
    source = ColumnDataSource(data = {'date': artist_df['date'].values,
                                      'position': artist_df['position'].values,
                                      'artist': artist_df['artist'].values})

    p.line(x='date', y='position', color=color, source=source, legend_label=artist)

# Add the hover tool once, after all lines are drawn. Adding it inside the
# loop registered one tool per artist, each applying to every line, which
# duplicated the toolbar entry and the tooltips.
p.add_tools(HoverTool(
    tooltips=[
            ( 'date',   '@date{%F}'),
            ( 'position',  '@position' ), # use @{ } for field names with spaces
            ( 'artist', '@artist'      ),
        ],
    formatters={
        '@date' : 'datetime',
    }
))

# Plot formatting
p.xgrid.grid_line_color=None
p.ygrid.grid_line_alpha=0.5
p.xaxis.axis_label = 'date'
p.yaxis.axis_label = 'Position'
# A range that starts at 200 and ends at 0 draws position 1 near the top
p.y_range=Range1d(200,0)
p.yaxis.ticker = [1,50,100,150,200]

p.legend.location = "bottom_right"
p.legend.click_policy="hide"

show(p)

## Try it yourself!

1. Among those included in the Spotify charts, pick 1 artist you like to analyze. 

   a. Plot the streams and positions of their top 5 streamed songs.
   
   b. Compare these charts with streams and positions of what you feel to be a possible collaborator/competitor/related artist. 

   What insights can you draw from the data?