# More Visualization Types

In [None]:
# importing the libraries for data processing
import numpy as np 
import pandas as pd 

#These two modules will be used to create some basic visualizations
import seaborn as sns
import matplotlib.pyplot as plt

#This is a jupyter magic command that embeds the image generated by matplotlib right after the code cell
%matplotlib inline

In [None]:
# Load the merged daily-charts dataset and preview its first rows
streams_csv = 'data/spotify_daily_charts.csv'
df_streams = pd.read_csv(streams_csv)
df_streams.head()

In [None]:
# Parse the 'date' column as datetimes and use it as the index.
# The guard keeps this cell re-runnable: once 'date' has become the index it is
# no longer a column, and selecting it again would raise a KeyError.
if 'date' in df_streams.columns:
    df_streams = (
        df_streams
        .assign(date=pd.to_datetime(df_streams['date']))
        .set_index('date')
    )
df_streams.head()

### 1. Histograms

Put simply, a histogram is a graphical representation of tallies: each bar counts how many values fall within a given range (bin).
Read more about histograms here: [Understanding histograms](https://statistics.laerd.com/statistical-guides/understanding-histograms.php).

These are very useful in EDA because at a glance, you could already see how the data is spread over its range.

In particular, you should look out for:
1. Skewness - Do the values peak around the mean, over lower values with a long tail toward higher values (right-skewed), or over higher values with a long tail toward lower values (left-skewed)?
2. Mode - Does it have one peak (unimodal)? two peaks (bimodal)? How many peaks?
3. Outliers - Are there a few data points that are substantially distant from the bulk of all values?

It is strongly advised that you look at histograms before you do any aggregations.


**Q**: Compare the chart positions earned by Ariana Grande, Dua Lipa and Lady Gaga songs in 2019

In [None]:
# Show the docstring of plt.hist (IPython `?` help syntax) to review its parameters
plt.hist?

In [None]:
# Overlaid histograms of the 2019 chart positions of three artists
fig, ax = plt.subplots(figsize=(8, 6))

# Assumes positions are integers from 1 to 200 (Top 200 chart).
# Half-integer edges (0.5, 10.5, ..., 200.5) give twenty bins that each hold
# exactly ten positions: 1-10, 11-20, ..., 191-200. Edges at 0, 10, ..., 200
# would put 9 positions in the first bin and 11 in the last one.
position_bins = np.arange(0.5, 201, 10)

for artist_name in ["Ariana Grande", "Dua Lipa", "Lady Gaga"]:
    # rows of this artist charted during 2019 (the index holds the chart date)
    data = df_streams[df_streams['artist'] == artist_name].loc['2019-01-01':'2019-12-31']
    ax.hist(data['position'].values, bins=position_bins,
            histtype='stepfilled', label=artist_name, alpha=0.35)

# Label position 1 and then every tenth position
ax.set_xticks([1] + np.arange(10, 210, 10).tolist())
# Reversed limits put position 1 (the top of the chart) on the right-hand side
ax.set_xlim([200.5, 0.5])

ax.set_ylabel('Frequency')
ax.set_xlabel('Position')
ax.legend(frameon=False);



### 2. Stacked Bar/Area chart

A stacked bar/area chart shows how much one or a few parts contribute to the whole.
It is advised to divide the total into only 2-3 major contributing parts and lump everything else into another category ("others").

**Q**: How many streams did the songs from Ed Sheeran's 2 latest albums *No.6 Collaborations Project* (album id: 3oIFxDIo2fwuk4lwCmFZCx) and *Divide* (album id: 3T4tUhGYeRNVUGevb0wThu) contribute to his monthly total streams in 2019?

In [None]:
# Load the per-track attributes that accompany the daily charts and preview them
tracks_csv = 'data/spotify_daily_charts_tracks.csv'
df_tracks = pd.read_csv(tracks_csv)
df_tracks.head()

In [None]:
# Attach the track attributes to every chart row. The left join keeps all chart
# rows; 'date' is turned into a column for the merge and restored as index after.
df_streams_tracks = (
    df_streams
    .reset_index()
    .merge(df_tracks, how='left', on=['track_id', 'track_name'])
    .set_index('date')
)
df_streams_tracks.head()

In [None]:
# Ed Sheeran chart rows from 2019 (the index holds the chart date)
ed_2019 = df_streams_tracks[df_streams_tracks['artist'] == 'Ed Sheeran'].loc['2019-01-01':'2019-12-31']

# Stage 1: monthly stream totals per album id
df_ed = (
    ed_2019
    .groupby(['album_id'])['streams']
    .resample('M').sum()
    .reset_index()
    .sort_values('streams', ascending=False)
)

# Name the two albums of interest; every other album id is lumped into 'others'
album_labels = {
    '3oIFxDIo2fwuk4lwCmFZCx': 'No.6',
    '3T4tUhGYeRNVUGevb0wThu': 'Divide',
}
df_ed['album_name'] = df_ed['album_id'].map(album_labels).fillna('others')

# Stage 2: monthly stream totals per album group
df_ed = (
    df_ed
    .set_index('date')
    .groupby(['album_name'])['streams']
    .resample('M').sum()
    .reset_index()
    .sort_values('streams', ascending=False)
)
df_ed

In [None]:
data.plot.area?

In [None]:
# Stacked area chart of Ed Sheeran's monthly streams per album group
fig, ax = plt.subplots(figsize=(8, 6))

# one colour per column of the reshaped frame
color_list = ['skyblue', 'C7', '0.8']

# reshape to wide form: one row per month, one column per album group
# (append /1000000 to plot the streams in millions)
data = df_ed.pivot(index='date', columns='album_name', values='streams')
data.plot.area(ax=ax, lw=0, color=color_list)

# tick labels to go with the optional rescaling to millions:
#plt.yticks(np.arange(0,25,5),[str(x)+'M' if x>0 else str(x) for x in np.arange(0,25,5)])
ax.set_ylabel('streams')
ax.set_xlabel('')
ax.legend(frameon=False)


In [None]:
# Stacked area chart of each album group's share (in %) of the monthly streams
fig, ax = plt.subplots(figsize=(8, 6))

color_list = ['skyblue', 'C7', '0.8']

# reshape to wide form: one row per month, one column per album group
monthly_streams = df_ed.pivot(index='date', columns='album_name', values='streams')
# divide every row by its monthly total and express the result in percent
data = 100 * monthly_streams.div(monthly_streams.sum(axis=1), axis=0)
data.plot.area(ax=ax, lw=0, color=color_list)

# optional custom percent ticks:
#plt.yticks(np.arange(0,120,20),[str(x)+'%' for x in np.arange(0,120,20)])
#plt.ylabel('streams')
#plt.ylim([0,100])

ax.set_xlabel('')

# place the legend below the axes, laid out in a single row of three entries
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), frameon=False, ncol=3)



>Q: Extend the stacked area plots above to cover up to December 2020

### 3. Boxplot

Box plots are useful as they provide a visual summary of the data to quickly identify the median, the dispersion of the data set, and signs of skewness.

Here are the parts of a box plot:
![Parts of a box plot](https://www.simplypsychology.org/boxplot.jpg?ezimgfmt=rs:279x143/rscb18/ng:webp/ngcb18)

**Q**: How does loudness, tempo, danceability, energy, and valence of KPOP girl group songs compare with the rest of the dataset?

In [None]:
# Artists treated as K-pop girl groups in the comparison
kpop_girl_grps = [
    "BLACKPINK",
    "Girls' Generation-Oh!GG",
    "Girls' Generation-TTS",
    "ITZY",
    "IZ*ONE",
    "MOMOLAND",
    "Red Velvet",
    "TWICE",
]

# Track metrics compared in the box plot
metrics = [
    "loudness",
    "tempo",
    "danceability",
    "energy",
    "valence",
]

In [None]:
# Average every metric per (track, artist) pair
track_means = (
    df_streams_tracks
    .groupby(['track_name', 'artist'])[metrics]
    .mean()
    .reset_index()
)

# Flag the rows whose artist is one of the K-pop girl groups
track_means['is_gg'] = np.where(
    track_means['artist'].isin(kpop_girl_grps), 'kpop girl group', 'all else'
)

# Optional: rescale tempo and loudness before reshaping
#max_tempo = track_means['tempo'].max()
#max_loudness = track_means['loudness'].min()
#track_means['tempo'] = track_means['tempo']/max_tempo
#track_means['loudness'] = track_means['loudness']/max_loudness

# Reshape to the long form used by the box plot: one row per
# (track, artist, group, metric) with the number in a 'value' column.
# stack() leaves the metric level unnamed, so reset_index() calls it 'level_3'.
df_metrics = (
    track_means
    .set_index(['track_name', 'artist', 'is_gg'])
    .stack()
    .rename('value')
    .reset_index()
    .rename(columns={'level_3': 'metric'})
)
df_metrics.head()

In [None]:
# Side-by-side box plots per metric: K-pop girl groups versus everything else
fig, ax = plt.subplots(figsize=(8, 6))

group_order = ['kpop girl group', 'all else']
group_colors = ['pink', 'C7']
sns.boxplot(x='metric', y='value', hue='is_gg', data=df_metrics,
            hue_order=group_order, palette=group_colors, ax=ax)

# place the legend below the axes
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), frameon=False, ncol=3)


### 4. Heatmap
The heatmap is a way of representing the data in a 2-dimensional form. The data values are represented as colors in the graph. The goal of the heatmap is to provide a colored visual summary of information.

**Q**: Visualize Ben&Ben songs in the top 200 as a heatmap of their monthly streams

In [None]:
# Monthly stream totals for every Ben&Ben track
bb_daily = df_streams[df_streams['artist'] == "Ben&Ben"]
df_bb = (
    bb_daily
    .groupby('track_name')[['streams']]
    .resample('M').sum()
    .reset_index()
)
# 'M' bins are labelled with the month-end date, so this strict comparison drops
# the bin labelled 2021-01-31 and everything after it
df_bb = df_bb[df_bb['date'] < '2021-01-31']
# optional: shorten long titles
#df_bb['track_name'] = df_bb['track_name'].apply(lambda x: x.split('(')[0])\
#                                        .apply(lambda x: x.split(' - ')[0])
df_bb

In [None]:
# Wide form for the heatmap: one row per track, one column per month
arr_df = (
    df_bb
    .pivot(index='track_name', columns='date', values='streams')
    .div(1000000)   # show streams in millions
    .fillna(0)      # a missing (track, month) pair counts as zero streams
)
# row total, appended as the last column
arr_df['total_streams'] = arr_df.sum(axis=1)
# optional: order the rows by total streams
#arr_df = arr_df.sort_values('total_streams',ascending=False)



In [None]:

# Heatmap of monthly streams (in millions) per Ben&Ben track
fig, ax = plt.subplots(figsize=(8, 6))

# every column except the last one ('total_streams') is a month;
# format the months as YYYY-MM for the x tick labels
moncols = arr_df.columns[:-1]
yymm_cols = [month.strftime('%Y-%m') for month in moncols]

colorbar_options = {'label': 'million streams', 'ticks': np.arange(0, 8, 1)}
sns.heatmap(arr_df[moncols], ax=ax, cmap='viridis',
            cbar_kws=colorbar_options,
            xticklabels=yymm_cols, yticklabels=True)

# the tick labels speak for themselves, so blank out both axis titles
ax.set_ylabel('')
ax.set_xlabel('')

**Q**: Visualize Jose Mari Chan songs as a heatmap of their monthly streams

In [None]:
# Monthly stream totals for every Jose Mari Chan track, with shortened titles
jmc_daily = df_streams[df_streams['artist'] == "Jose Mari Chan"]

# Shorten long titles BEFORE aggregating: keep the text in front of the first
# '(' or ' - ' and trim the whitespace that is left behind.
# Shortening after the aggregation can leave several rows with the same
# (track_name, date) pair when different versions of a song collapse to one
# title, and DataFrame.pivot raises a ValueError on such duplicates.
short_titles = (
    jmc_daily['track_name']
    .str.split('(').str[0]
    .str.split(' - ').str[0]
    .str.strip()
)

df_jmc = (
    jmc_daily
    .assign(track_name=short_titles)
    .groupby('track_name')[['streams']]
    .resample('M').sum()
    .reset_index()
)
# 'M' bins are labelled with the month-end date, so this strict comparison drops
# the bin labelled 2021-01-31 and everything after it
df_jmc = df_jmc[df_jmc['date'] < '2021-01-31']
df_jmc

In [None]:
# Wide form for the heatmap: one row per track, one column per month,
# rows ordered from most to least streamed
arr_df = (
    df_jmc
    .pivot(index='track_name', columns='date', values='streams')
    .div(100000)    # show streams in hundred thousands
    .fillna(0)      # a missing (track, month) pair counts as zero streams
    .assign(total_streams=lambda wide: wide.sum(axis=1))
    .sort_values('total_streams', ascending=False)
)

In [None]:

# Heatmap of monthly streams (in units of 100k) per Jose Mari Chan track
fig, ax = plt.subplots(figsize=(8, 6))

# every column except the last one ('total_streams') is a month;
# format the months as YYYY-MM for the x tick labels
moncols = arr_df.columns[:-1]
yymm_cols = [month.strftime('%Y-%m') for month in moncols]

colorbar_options = {'label': 'streams x 100k'}
sns.heatmap(arr_df[moncols], ax=ax, cmap='viridis',
            cbar_kws=colorbar_options,
            xticklabels=yymm_cols, yticklabels=True)

# the tick labels speak for themselves, so blank out both axis titles
ax.set_ylabel('')
ax.set_xlabel('')