## Import Libraries

In [1]:
import pandas as pd
from cm_api import *
from helper_funct import *
from helper_funct1 import *
from cm_config import config
import matplotlib.pyplot as plt
from tabulate import tabulate 

## Request API Token

In [2]:
# Obtain the API token that the data-collection calls in later cells pass along,
# using the refresh token stored in cm_config's `config`.
api_token = get_api_token(config['refresh_token'])

## Collect Top 200 Spotify Weekly Charts (US) for 2020

### Create list of dates

In [3]:
# Build the list of chart dates between 2020-01-01 and 2020-12-11.
# NOTE(review): 'W' is presumably a weekly frequency understood by get_date_range — confirm in the helper module.
date_list = get_date_range('2020-01-01', '2020-12-11', 'W')

In [4]:
# Remember both ends of the date range, then show how many dates were generated.
first_date, recent_date = date_list[0], date_list[-1]
len(date_list)

49

### Collect top tracks for 2020

In [5]:
# Fetch the weekly US regional chart for every date and flatten each track
# into one row; the printed number is a simple progress indicator.
chart_fields = ('added_at', 'rank', 'name', 'cm_track', 'artist_names',
                'cm_artist', 'current_plays', 'time_on_chart', 'velocity')
spotify_chart_data = []
for counter, date in enumerate(date_list, start=1):
    data = get_spotify_charts(api_token, date, 'US', 'regional', 'weekly')
    print(counter)
    spotify_chart_data.extend(
        tuple(track[field] for field in chart_fields) for track in data
    )

# Column labels are listed in the same order as chart_fields above.
chart_columns = ['add_date','rank', 'title', 'cm_track_id', 'artist_names', 'cm_artist_ids', 'current_plays', 'time_on_chart', 'velocity']
spotify2020 = pd.DataFrame(spotify_chart_data, columns=chart_columns)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


### Save raw data

In [6]:
# Persist the freshly collected chart rows before any feature engineering.
# The row index is written too, since index=False is not passed.
spotify2020.to_csv('spotify2020_raw.csv')

In [18]:
# Preview the first rows of the chart table.
# NOTE(review): the saved output for this cell already shows columns that are only
# created in later cells, so it was produced out of order — re-run from a fresh kernel to confirm.
spotify2020.head()

Unnamed: 0,add_date,rank,title,cm_track_id,artist_names,cm_artist_ids,current_plays,time_on_chart,velocity,primary_artist,secondary_artist,primary_artist ID,secondfary_artist ID
0,2020-01-09T00:00:00.000Z,1,The Box,27599255,[Roddy Ricch],[839964],18952305,55,0.0,Roddy Ricch,,839964,
1,2020-01-09T00:00:00.000Z,2,ROXANNE,27228348,[Arizona Zervas],[64150],9671478,57,0.0,Arizona Zervas,,64150,
2,2020-01-09T00:00:00.000Z,3,Yummy,28077991,[Justin Bieber],[3479],9648561,6,,Justin Bieber,,3479,
3,2020-01-09T00:00:00.000Z,4,Circles,25021708,[Post Malone],[135326],8244725,67,-0.142857,Post Malone,,135326,
4,2020-01-09T00:00:00.000Z,5,BOP,25358331,[DaBaby],[398544],7985170,65,-0.142857,DaBaby,,398544,


In [None]:
#convert artist id to int64
# NOTE(review): 'primary_artist ID' is only created in a later cell (which then casts it
# to str), and this cell has no execution count — confirm whether it is still needed.
spotify2020['primary_artist ID'] = spotify2020['primary_artist ID'].astype('int64')

In [None]:
# List the column labels currently present on the frame.
spotify2020.keys()

In [10]:
# Primary artist = the first name listed in each track's artist_names entry.
spotify2020['primary_artist'] = [names[0] for names in spotify2020['artist_names']]

In [11]:
def _second_name(names):
    """Return the second listed artist name, or None when only one is listed."""
    if len(names) > 1:
        return names[1]
    return None

# Secondary artist = the second listed name, when the track has one.
spotify2020['secondary_artist'] = spotify2020['artist_names'].apply(_second_name)

In [12]:
# Primary artist ID = first entry of cm_artist_ids, stored as a string.
first_listed_ids = spotify2020['cm_artist_ids'].apply(lambda ids: ids[0])
spotify2020['primary_artist ID'] = first_listed_ids.astype('str')

In [13]:
def _second_id(ids):
    """Return the second listed artist ID, or None when only one is listed."""
    if len(ids) > 1:
        return ids[1]
    return None

# Secondary artist ID = the second entry of cm_artist_ids, when present.
# NOTE(review): the column label is misspelled ('secondfary'); it is kept unchanged
# because this frame is written to CSV under that name.
spotify2020['secondfary_artist ID'] = spotify2020['cm_artist_ids'].apply(_second_id)

In [14]:
#how many unique artists?
# Counts distinct primary artist names (not artist IDs).
spotify2020['primary_artist'].nunique()

336

In [15]:
# Distinct primary artist names, in order of first appearance.
unique_partists = spotify2020['primary_artist'].unique().tolist()

In [16]:
# Distinct primary artist IDs, in order of first appearance.
unique_pids = list(spotify2020['primary_artist ID'].drop_duplicates())

In [17]:
#create a dictionary of artist : artist id
# Build the mapping from row-aligned (name, id) pairs. Zipping two separately
# de-duplicated lists only lines up when names and IDs are strictly one-to-one;
# a single repeated name or ID would shift every later pair onto the wrong artist.
# Each name keeps the ID from its first appearance.
artist_id_pairs = spotify2020[['primary_artist', 'primary_artist ID']].drop_duplicates('primary_artist')
primary_artist_dict = dict(zip(artist_id_pairs['primary_artist'], artist_id_pairs['primary_artist ID']))

In [19]:
# For every primary artist, request the Spotify 'listeners' metric between
# first_date and recent_date and keep the first and last returned values.
# NOTE(review): assumes the returned list is in chronological order — confirm against the API helper.
artist_listener_data = []

for artist_name, artist_id in primary_artist_dict.items():
    metrics = get_fan_metrics(api_token, artist_id, 'spotify', first_date, recent_date, field='listeners')
    listener_data = metrics['listeners']
    earliest, latest = listener_data[0], listener_data[-1]
    artist_listener_data.append((artist_name, artist_id, earliest['value'], latest['value']))
    
    

In [20]:
#split each (name, id, first, recent) tuple into parallel lists:
#primary artist ids, first spotify listeners, recent spotify listeners
artist_ids = [entry[1] for entry in artist_listener_data]
first_listeners = [entry[2] for entry in artist_listener_data]
recent_listeners = [entry[3] for entry in artist_listener_data]

In [21]:
#lookup tables keyed by artist id: first and recent listener counts
prim_artist_firstlist_di = {
    artist_id: count for artist_id, count in zip(artist_ids, first_listeners)
}
prim_artist_recentlist_di = {
    artist_id: count for artist_id, count in zip(artist_ids, recent_listeners)
}

In [22]:
#attach first and recent listener counts to every row via the primary artist id
artist_id_column = spotify2020['primary_artist ID']
spotify2020['Prim: Jan. 5-Listeners'] = artist_id_column.map(prim_artist_firstlist_di)
spotify2020['Prim: Dec. 6-Listeners'] = artist_id_column.map(prim_artist_recentlist_di)

In [67]:
# Display the frame with the new listener columns (the notebook shows a truncated view).
spotify2020

Unnamed: 0,add_date,rank,title,cm_track_id,artist_names,cm_artist_ids,current_plays,time_on_chart,velocity,primary_artist,secondary_artist,primary_artist ID,secondfary_artist ID,Prim: Jan. 5-Listeners,Prim: Dec. 6-Listeners
0,2020-01-09T00:00:00.000Z,1,The Box,27599255,[Roddy Ricch],[839964],18952305,55,0.000000,Roddy Ricch,,839964,,19014642,21923722
1,2020-01-09T00:00:00.000Z,2,ROXANNE,27228348,[Arizona Zervas],[64150],9671478,57,0.000000,Arizona Zervas,,64150,,28519561,12499023
2,2020-01-09T00:00:00.000Z,3,Yummy,28077991,[Justin Bieber],[3479],9648561,6,,Justin Bieber,,3479,,59053556,72539216
3,2020-01-09T00:00:00.000Z,4,Circles,25021708,[Post Malone],[135326],8244725,67,-0.142857,Post Malone,,135326,,59360500,41224314
4,2020-01-09T00:00:00.000Z,5,BOP,25358331,[DaBaby],[398544],7985170,65,-0.142857,DaBaby,,398544,,21300480,42133931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9795,2020-12-10T00:00:00.000Z,196,SICKO MODE,20318958,[Travis Scott],[4215],1707997,123,-1.857143,Travis Scott,,4215,,38669161,39381880
9796,2020-12-10T00:00:00.000Z,197,Beautiful Crazy,19173498,[Luke Combs],[207759],1707684,125,-3.142857,Luke Combs,,207759,,7904711,11328621
9797,2020-12-10T00:00:00.000Z,198,Just The Two Of Us (feat. Bill Withers),15275075,"[Grover Washington, Jr., Bill Withers]","[11492, 822]",1704552,1,,"Grover Washington, Jr.",Bill Withers,11492,822.0,1036884,4072160
9798,2020-12-10T00:00:00.000Z,199,The Less I Know The Better,11020231,[Tame Impala],[3315],1702908,65,-0.428571,Tame Impala,,3315,,10849722,13643806


In [24]:
# Checkpoint the frame after the artist and listener columns were added.
# The row index is written too, since index=False is not passed.
spotify2020.to_csv('spotify2020_v1.csv')

## Artist Type Feature
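This feature labels each primary artist as "emerging", "established", or "superstar" based on
the number of Spotify listeners early in the year (January 5th): at or below the 25th percentile is "emerging", above the 75th percentile is "superstar", and everything in between is "established".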

In [30]:
# Summary statistics of the numeric columns, including the two listener-count columns.
spotify2020.describe()

Unnamed: 0,rank,cm_track_id,current_plays,velocity,secondfary_artist ID,Prim: Jan. 5-Listeners,Prim: Dec. 6-Listeners
count,9800.0,9800.0,9800.0,8389.0,3807.0,9800.0,9800.0
mean,100.5,25484590.0,2715603.0,-0.887693,400950.4,23482140.0,27627130.0
std,57.737251,4876548.0,1739207.0,3.332286,518020.6,17535920.0,16470630.0
min,1.0,10988390.0,1350029.0,-23.428571,60.0,13.0,1.0
25%,50.75,22358950.0,1724710.0,-2.142857,4215.0,7992604.0,13643810.0
50%,100.5,27430930.0,2096916.0,-0.428571,209693.0,19704530.0,24785160.0
75%,150.25,29155500.0,3004683.0,0.571429,731530.0,37770330.0,39885900.0
max,200.0,32198060.0,24966900.0,27.714286,3985882.0,64948950.0,72539220.0


In [68]:
#create a new feature
# Compute the quartile cut-offs once: evaluating .quantile() inside the lambda
# would re-scan the whole column for every single row.
jan_listeners = spotify2020['Prim: Jan. 5-Listeners']
emerging_cutoff = jan_listeners.quantile(.25)
superstar_cutoff = jan_listeners.quantile(.75)

# <= 25th percentile -> 'emerging'; <= 75th percentile -> 'established'; otherwise 'superstar'.
# Missing values fail both comparisons and therefore fall into 'superstar'.
spotify2020['artist_popularity_type'] = jan_listeners.apply(
    lambda x: 'emerging' if x <= emerging_cutoff
    else ('established' if x <= superstar_cutoff else 'superstar'))

In [69]:
# Checkpoint the frame after adding artist_popularity_type.
# The row index is written too, since index=False is not passed.
spotify2020.to_csv('spotify2020_v2.csv')

## EDA

In [28]:
import seaborn as sns

In [None]:
# Display the per-artist frame.
# NOTE(review): unique_artist_df is not created in any cell shown here — confirm where it is built.
unique_artist_df

In [None]:
#look at listener distribution for January 5th
# NOTE(review): unique_artist_df is not created in any cell shown here — confirm where it is built.

unique_artist_df.describe()

In [None]:
#new dataframe of artists below the 25th percentile of Jan. 5 listeners
# .copy() makes lower25 an independent frame, so the columns added to it in
# later cells do not raise SettingWithCopyWarning or write into a temporary.
# NOTE(review): this uses a strict '<', while artist_popularity_type uses '<=' for 'emerging' — confirm which is intended.
jan5_q25 = unique_artist_df['Jan. 5-Listeners'].quantile(.25)
lower25 = unique_artist_df.loc[unique_artist_df['Jan. 5-Listeners'] < jan5_q25].copy()

In [None]:
#absolute change in listeners: December count minus January count
lower25['listener_change'] = lower25['Dec. 6-Listeners'].sub(lower25['Jan. 5-Listeners'])

In [None]:
#new feature of percent change in artist listeners
def percent_change(x):
    """Return the percent change from the first to the second element of ``x``.

    Parameters
    ----------
    x : sequence or pandas.Series
        Two values ordered (start, end). For a Series they are taken by
        position, regardless of its index labels.

    Returns
    -------
    ``(end - start) / start * 100``
    """
    # Integer keys on a label-indexed Series are label lookups; pandas has
    # deprecated falling back to positions, so read a Series through .iloc.
    values = x.iloc if hasattr(x, 'iloc') else x
    start, end = values[0], values[1]
    return ((end - start) / start) * 100

In [None]:
# Hand percent_change the two columns as a (start, end) pair so the formula is
# evaluated once on whole columns instead of once per row through apply(axis=1).
lower25['listener percent change %'] = percent_change(
    (lower25['Jan. 5-Listeners'], lower25['Dec. 6-Listeners']))

In [None]:
# Renumber the rows from 0 and discard the old index.
lower25 = lower25.reset_index(drop=True)

In [None]:
#rank artists by percent gain in listeners, biggest gain first
ranked_by_gain = lower25.sort_values(by='listener percent change %', ascending=False)
top_emerging_artists = ranked_by_gain.reset_index(drop=True)

In [None]:


# Preview the January listener counts with thousands separators.
# Each element handed to the formatter is already a single value, so it is
# formatted directly (indexing it by column name would raise).
top_emerging_artists['Jan. 5-Listeners'].apply(lambda x: "{:,}".format(x))

In [None]:
#save top emerging artists to csv
# index_label='index' names the index column in the written file.
top_emerging_artists.to_csv('top_emerging_artists2020.csv', index_label='index')

# Commented-out reload of the saved file (reads the first column back as the index):
# top_emerging_artists = pd.read_csv('top_emerging_artists2020.csv', index_col=0)

In [None]:
# Order by percent gain (largest first) and keep the five leading rows.
listener_sorted = top_emerging_artists.sort_values(by='listener percent change %', ascending=False)
top5 = listener_sorted.head(5)

In [None]:
# List the column labels available on the top-5 frame.
top5.keys()

In [None]:
# Bar chart of percent listener gain for the five top emerging artists.
# x and y are passed by keyword: current seaborn accepts only `data` positionally,
# so the positional (x, y) form fails there.
sns.barplot(data=top5, x='primary_artist', y='listener percent change %')
plt.xticks(rotation=90)  # artist names are long; rotate so they don't overlap
plt.tight_layout()
plt.show()

In [None]:
# Keep only the columns shown in the summary table.
bridge_columns = ['title', 'primary_artist', 'Jan. 5-Listeners','Dec. 6-Listeners', 'listener_change', 'listener percent change %']
top5brdg = top5.loc[:, bridge_columns]

In [None]:
# tabulate() returns a plain string; print it so the table renders with real
# line breaks instead of the notebook showing the string repr with '\n' escapes.
print(tabulate(top5brdg, headers='keys', tablefmt = 'pretty'))

In [None]:
# Display the same top-5 summary as a rich DataFrame.
top5brdg