# Import Libraries

In [1]:
import spotipy
import sys
import json
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import yaml
import os
import glob
import time
from sklearn.preprocessing import MinMaxScaler
from collections import Counter


# Get Track Information

In [3]:


def get_spotify_track_info(df_input,path_config):
    df = df_input.copy()
    cred = read_yaml_file(path_config)
    client_id = cred['client_id']
    client_secret = cred['client_secret']

    client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

    for x in df.index:
        artist = df.loc[x,'artist_name']
        song = df.loc[x,'track_name']
        query = "artist:{} track:{}".format(artist,song)
        result = sp.search(q=query,type='track',limit=5)        
        file_name = "/Users/jo186027/Desktop/track_json/track_{}.txt".format(str(x))
        with open(file_name, 'w') as outfile:
            json.dump(result, outfile)
        
        

In [4]:

def parse_result_spotify(result):
    df = pd.DataFrame()
    for x in range(len(result['tracks']['items'])):
        df.loc[x,'artistName'] = result['tracks']['items'][x]['artists'][0]['name']
        df.loc[x,'artistId'] = result['tracks']['items'][x]['artists'][0]['id']
        df.loc[x,'songName'] = result['tracks']['items'][x]['name']
        df.loc[x,'songId'] = result['tracks']['items'][x]['id']
        df.loc[x,'availableMarket'] = ','.join(result['tracks']['items'][x]['available_markets'])
        df.loc[x,'songPopularity'] = result['tracks']['items'][x]['popularity']
    df.reset_index(drop=True,inplace=True)
    return df
        
def read_yaml_file(path):
    try:
        f = open(path, 'r')
        try:
            return yaml.load(f)
        finally:
            f.close()
    except (IOError, EOFError) as e:
        raise Exception("Unable to open pipeline: " + path)       

In [103]:
# Get Track Information

In [5]:
# Get top songs per top artist
top_songs_artist = pd.read_csv('/Users/jo186027/Desktop/fp_top_songs_artist_100.csv')
top_songs_artist.head()

Unnamed: 0,track_name,artist_name,count
0,Midnight City,M83,2609
1,Get Lucky - Radio Edit,Daft Punk,2341
2,Radioactive,Imagine Dragons,2336
3,Little Talks,Of Monsters and Men,2254
4,Wake Me Up,Avicii,2242


In [8]:
# Get spotify track information
path_config = '/Users/jo186027/Desktop/spotify_credentials.yaml'
df_input = top_songs_artist
get_spotify_track_info(df_input,path_config)


In [17]:
path = "/Users/jo186027/Desktop/track_json/*.txt"
json_path = glob.glob(path)
df_track_complete = pd.DataFrame()
for x in json_path:
    with open(x) as json_file:
        data = json.load(json_file)
    df = parse_result_spotify(data)
    df_track_complete = df_track_complete.append(df)

In [22]:
df_track_complete.drop_duplicates(subset='artistName',inplace=True)
df_track_complete.reset_index(drop=True,inplace=True)

In [23]:
df_track_complete.head()

Unnamed: 0,artistName,artistId,songName,songId,availableMarket,songPopularity
0,M83,63MQldklfxkjYDoUE4Tppz,Midnight City,1eyzqe2QqGZUmfcPZtrIyt,"CA,US",71.0
1,Daft Punk,4tZwfgrHOc3mvqYlEYSvVi,Get Lucky (feat. Pharrell Williams & Nile Rodg...,2Foc5Q5nqNiosCNqttzHof,"AD,AE,AR,AT,AU,BE,BG,BH,BO,BR,CA,CH,CL,CO,CR,C...",77.0
2,Foster The People,7gP3bB2nilZXLfPHJhMdvc,Pumped Up Kicks,7w87IxuO7BDcJ3YUqCyMTT,"AD,AE,AR,AT,AU,BE,BG,BH,BO,BR,CA,CH,CL,CO,CR,C...",84.0
3,Kanye West,5K4W6rqBFWDnAN6FQUkS6x,Black Skinhead,722tgOgdIbNe3BEyLnejw4,"AD,AE,AR,AT,AU,BE,BG,BH,BO,BR,CA,CH,CL,CO,CR,C...",76.0
4,fun.,5nCi3BB41mBaMH9gfr6Su0,We Are Young (feat. Janelle Monáe),7a86XRg84qjasly9f6bPSD,"AT,BG,CH,CZ,DE,DK,EE,FI,HU,IL,IS,LT,LV,NO,PL,R...",72.0


In [60]:
# save track infortion
df_track_complete.to_csv('/Users/jo186027/Desktop/df_track_complete.csv',index=False)

# Get Audio Features

In [24]:
def get_spotify_audio_feature(df_input,path_config):
    df = df_input.copy()
    cred = read_yaml_file(path_config)
    client_id = cred['client_id']
    client_secret = cred['client_secret']
    client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    for x in df.index:
        song_id = df.loc[x,'songId']
        result = sp.audio_features(song_id)        
        file_name = "/Users/jo186027/Desktop/song_features_json/song_{}.txt".format(str(x))
        with open(file_name, 'w') as outfile:
            json.dump(result, outfile)
            

In [28]:
path_config = '/Users/jo186027/Desktop/spotify_credentials.yaml'
get_spotify_audio_feature(df_track_complete,path_config)

In [48]:
def parse_result_audio(result):
    df = pd.DataFrame()
    df.loc[0,'danceability'] = result[0]['danceability']
    df.loc[0,'energy'] = result[0]['energy']
    df.loc[0,'key'] = result[0]['key']
    df.loc[0,'loudness'] = result[0]['loudness']
    df.loc[0,'mode'] = result[0]['mode']    
    df.loc[0,'speechiness'] = result[0]['speechiness']
    df.loc[0,'acousticness'] = result[0]['acousticness']
    df.loc[0,'instrumentalness'] = result[0]['instrumentalness']
    df.loc[0,'liveness'] = result[0]['liveness']
    df.loc[0,'valence'] = result[0]['valence']    
    df.loc[0,'tempo'] = result[0]['tempo']

    return df

In [49]:
# read and parse json audio features
path = "/Users/jo186027/Desktop/song_features_json/*.txt"
json_path = glob.glob(path)

df_track_audio = pd.DataFrame()
for x in json_path:
    with open(x) as json_file:
        data = json.load(json_file)
    df = parse_result_audio(data)
    df_track_audio = df_track_audio.append(df)

In [52]:
df_track_audio.reset_index(drop=True,inplace=True)
df_track_audio.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.526,0.712,11.0,-6.525,0.0,0.0356,0.0161,0.0,0.179,0.32,105.009
1,0.794,0.811,6.0,-8.966,0.0,0.038,0.0426,1e-06,0.101,0.862,116.047
2,0.531,0.821,2.0,-6.812,1.0,0.0439,0.0701,0.171,0.124,0.34,129.139
3,0.299,0.613,7.0,-9.012,1.0,0.0347,0.0163,0.00359,0.368,0.306,166.96
4,0.671,0.373,9.0,-18.064,1.0,0.0323,0.257,7.9e-05,0.0481,0.732,92.717


In [62]:
# save track audio features
df_track_audio.to_csv('/Users/jo186027/Desktop/df_track_audio.csv',index=False)

# Get Artist Information

In [6]:

def get_spotify_artist_info(df_input,path_config):
    df = df_input.copy()
    cred = read_yaml_file(path_config)
    client_id = cred['client_id']
    client_secret = cred['client_secret']
    client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    
    for x in df.index:
        artist_name = df.loc[x,'artistName']
        result = sp.search(q='artist:' + artist_name, type='artist')        
        file_name = "/Users/jo186027/Desktop/artist_json/artist_{}.txt".format(str(x))
        with open(file_name, 'w') as outfile:
            json.dump(result, outfile)
            
        if x%10==0:
            time.sleep(100)

In [7]:
path_config = '/Users/jo186027/Desktop/spotify_credentials.yaml'
get_spotify_artist_info(df_track_complete.loc[80:,:],path_config)

In [21]:
def parse_result_artist(result):
    df = pd.DataFrame()
    df.loc[0,'aritstPopularity'] = result['popularity']
    df.loc[0,'artistGenre'] = ','.join(result['genres'])
    return df

In [22]:
# read and parse json
path = "/Users/jo186027/Desktop/artist_json/*.txt"
json_path = glob.glob(path)

df_track_artist = pd.DataFrame()
for x in json_path:
    with open(x) as json_file:
        data = json.load(json_file)
    items = data['artists']['items'][0]
    df = parse_result_artist(items)
    df_track_artist = df_track_artist.append(df)


In [32]:
df_track_artist.reset_index(drop=True,inplace=True)
df_track_artist.head()

Unnamed: 0,aritstPopularity,artistGenre
0,71.0,"french shoegaze,indie pop,indietronica,metropo..."
1,80.0,"electro,filter house"
2,64.0,"alternative dance,australian indie,indie pop,i..."
3,74.0,"irish rock,pop rock,rock"
4,75.0,"album rock,mellow gold,rock,soft rock,yacht rock"


In [33]:
# save df artist information
df_track_artist.to_csv("/Users/jo186027/Desktop/df_track_artist.csv",index=False)

# Merge All Track Information

In [2]:
# # open all dataframes spotify 
# df_track_audio = pd.read_csv("/Users/jo186027/Desktop/df_track_audio.csv")
# df_track_artist = pd.read_csv("/Users/jo186027/Desktop/df_track_artist.csv")
# df_track_complete = pd.read_csv("/Users/jo186027/Desktop/df_track_complete.csv")



In [70]:
df1 = pd.merge(df_track_audio, df_track_artist, right_index=True, left_index=True)
df_track_all_information = pd.merge(df_track_complete,df1, right_index=True, left_index=True)
#replace Nan with ""
df_track_all_information.fillna(value='',inplace=True)
# save complete scraped track information
df_track_all_information.to_csv("/Users/jo186027/Desktop/df_track_all_inforamtion.csv")
df_track_all_information.head()

Unnamed: 0,artistName,artistId,songName,songId,availableMarket,songPopularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,aritstPopularity,artistGenre
0,M83,63MQldklfxkjYDoUE4Tppz,Midnight City,1eyzqe2QqGZUmfcPZtrIyt,"CA,US",71.0,0.526,0.712,11.0,-6.525,0.0,0.0356,0.0161,0.0,0.179,0.32,105.009,71.0,"french shoegaze,indie pop,indietronica,metropo..."
1,Daft Punk,4tZwfgrHOc3mvqYlEYSvVi,Get Lucky (feat. Pharrell Williams & Nile Rodg...,2Foc5Q5nqNiosCNqttzHof,"AD,AE,AR,AT,AU,BE,BG,BH,BO,BR,CA,CH,CL,CO,CR,C...",77.0,0.794,0.811,6.0,-8.966,0.0,0.038,0.0426,1e-06,0.101,0.862,116.047,80.0,"electro,filter house"
2,Foster The People,7gP3bB2nilZXLfPHJhMdvc,Pumped Up Kicks,7w87IxuO7BDcJ3YUqCyMTT,"AD,AE,AR,AT,AU,BE,BG,BH,BO,BR,CA,CH,CL,CO,CR,C...",84.0,0.531,0.821,2.0,-6.812,1.0,0.0439,0.0701,0.171,0.124,0.34,129.139,64.0,"alternative dance,australian indie,indie pop,i..."
3,Kanye West,5K4W6rqBFWDnAN6FQUkS6x,Black Skinhead,722tgOgdIbNe3BEyLnejw4,"AD,AE,AR,AT,AU,BE,BG,BH,BO,BR,CA,CH,CL,CO,CR,C...",76.0,0.299,0.613,7.0,-9.012,1.0,0.0347,0.0163,0.00359,0.368,0.306,166.96,74.0,"irish rock,pop rock,rock"
4,fun.,5nCi3BB41mBaMH9gfr6Su0,We Are Young (feat. Janelle Monáe),7a86XRg84qjasly9f6bPSD,"AT,BG,CH,CZ,DE,DK,EE,FI,HU,IL,IS,LT,LV,NO,PL,R...",72.0,0.671,0.373,9.0,-18.064,1.0,0.0323,0.257,8e-05,0.0481,0.732,92.717,75.0,"album rock,mellow gold,rock,soft rock,yacht rock"


In [7]:
# basic statistics of data
df_track_all_information.describe()

Unnamed: 0,songPopularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,aritstPopularity
count,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0
mean,62.752137,0.600897,0.666262,4.615385,-7.021829,0.74359,0.065958,0.162937,0.08976,0.173596,0.494068,119.619231,66.666667
std,26.097267,0.166676,0.195478,3.640875,3.15173,0.438529,0.069141,0.244345,0.240054,0.134091,0.242117,27.335732,25.248079
min,0.0,0.209,0.0637,0.0,-18.064,0.0,0.025,2.9e-05,0.0,0.0344,0.104,72.847,0.0
25%,67.0,0.486,0.532,1.0,-8.771,0.0,0.0334,0.0102,0.0,0.0926,0.287,101.006,67.0
50%,73.0,0.609,0.71,5.0,-6.383,1.0,0.0402,0.0372,7.7e-05,0.124,0.485,121.986,75.0
75%,77.0,0.737,0.802,7.0,-4.995,1.0,0.0663,0.209,0.00212,0.199,0.684,130.041,80.0
max,86.0,0.912,0.995,11.0,-1.355,1.0,0.456,0.957,0.937,0.731,0.965,195.154,96.0


*** Scale Audio Features ***

In [20]:
scaler = MinMaxScaler()
scaled_audio = scaler.fit_transform(df_track_audio)

In [68]:
scaled_audio_df = pd.DataFrame(data=scaled_audio, columns=df_track_audio.columns)
df1 = pd.merge(scaled_audio_df, df_track_artist, right_index=True, left_index=True)
df_track_all_information_scaled = pd.merge(df_track_complete,df1, right_index=True, left_index=True)
#replace Nan with ""
df_track_all_information_scaled.fillna(value='',inplace=True)
# save complete scraped track information
df_track_all_information_scaled.to_csv("/Users/jo186027/Desktop/df_track_all_inforamtion_scaled.csv")
df_track_all_information_scaled.head()


Unnamed: 0,artistName,artistId,songName,songId,availableMarket,songPopularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,aritstPopularity,artistGenre
0,M83,63MQldklfxkjYDoUE4Tppz,Midnight City,1eyzqe2QqGZUmfcPZtrIyt,"CA,US",71.0,0.450925,0.696124,1.0,0.690586,0.0,0.024594,0.016794,0.0,0.20758,0.250871,0.262961,71.0,"french shoegaze,indie pop,indietronica,metropo..."
1,Daft Punk,4tZwfgrHOc3mvqYlEYSvVi,Get Lucky (feat. Pharrell Williams & Nile Rodg...,2Foc5Q5nqNiosCNqttzHof,"AD,AE,AR,AT,AU,BE,BG,BH,BO,BR,CA,CH,CL,CO,CR,C...",77.0,0.832148,0.802427,0.545455,0.544497,0.0,0.030162,0.044486,1e-06,0.095607,0.880372,0.35321,80.0,"electro,filter house"
2,Foster The People,7gP3bB2nilZXLfPHJhMdvc,Pumped Up Kicks,7w87IxuO7BDcJ3YUqCyMTT,"AD,AE,AR,AT,AU,BE,BG,BH,BO,BR,CA,CH,CL,CO,CR,C...",84.0,0.458037,0.813164,0.181818,0.67341,1.0,0.043852,0.073222,0.182497,0.128625,0.2741,0.460252,64.0,"alternative dance,australian indie,indie pop,i..."
3,Kanye West,5K4W6rqBFWDnAN6FQUkS6x,Black Skinhead,722tgOgdIbNe3BEyLnejw4,"AD,AE,AR,AT,AU,BE,BG,BH,BO,BR,CA,CH,CL,CO,CR,C...",76.0,0.128023,0.589821,0.636364,0.541744,1.0,0.022506,0.017003,0.003831,0.478898,0.234611,0.769482,74.0,"irish rock,pop rock,rock"
4,fun.,5nCi3BB41mBaMH9gfr6Su0,We Are Young (feat. Janelle Monáe),7a86XRg84qjasly9f6bPSD,"AT,BG,CH,CZ,DE,DK,EE,FI,HU,IL,IS,LT,LV,NO,PL,R...",72.0,0.657183,0.332116,0.818182,0.0,1.0,0.016937,0.268526,8.5e-05,0.019667,0.729384,0.16246,75.0,"album rock,mellow gold,rock,soft rock,yacht rock"


# Get Distribution Available Market Track

In [52]:
country_label = pd.read_csv("/Users/jo186027/Desktop/countrycode_list.csv")
country_label.set_index(keys='Code',inplace=True)
country_label_dict = country_label.to_dict()['Name']
country_label_dict


{'AD': 'Andorra',
 'AE': 'United Arab Emirates',
 'AF': 'Afghanistan',
 'AG': 'Antigua and Barbuda',
 'AI': 'Anguilla',
 'AL': 'Albania',
 'AM': 'Armenia',
 'AO': 'Angola',
 'AQ': 'Antarctica',
 'AR': 'Argentina',
 'AS': 'American Samoa',
 'AT': 'Austria',
 'AU': 'Australia',
 'AW': 'Aruba',
 'AX': 'Ã…land Islands',
 'AZ': 'Azerbaijan',
 'BA': 'Bosnia and Herzegovina',
 'BB': 'Barbados',
 'BD': 'Bangladesh',
 'BE': 'Belgium',
 'BF': 'Burkina Faso',
 'BG': 'Bulgaria',
 'BH': 'Bahrain',
 'BI': 'Burundi',
 'BJ': 'Benin',
 'BL': 'Saint BarthÃ©lemy',
 'BM': 'Bermuda',
 'BN': 'Brunei Darussalam',
 'BO': 'Bolivia, Plurinational State of',
 'BQ': 'Bonaire, Sint Eustatius and Saba',
 'BR': 'Brazil',
 'BS': 'Bahamas',
 'BT': 'Bhutan',
 'BV': 'Bouvet Island',
 'BW': 'Botswana',
 'BY': 'Belarus',
 'BZ': 'Belize',
 'CA': 'Canada',
 'CC': 'Cocos (Keeling) Islands',
 'CD': 'Congo, the Democratic Republic of the',
 'CF': 'Central African Republic',
 'CG': 'Congo',
 'CH': 'Switzerland',
 'CI': "CÃ´te d

In [62]:
# Aggregate Countries from Available Market List
collect_countries = []
available_market_df = pd.DataFrame()
cnt = 0

for x in df_track_all_information.availableMarket:
    collect_countries.extend(x.split(','))
aggregated_countries = Counter(collect_countries)

for k,v in aggregated_countries.items():
    available_market_df.loc[cnt,'Country'] = country_label_dict[k]
    available_market_df.loc[cnt,'Count'] = v
    cnt+=1
available_market_df.head()

Unnamed: 0,Country,Count
0,Indonesia,80.0
1,Brazil,85.0
2,United States,117.0
3,Slovakia,78.0
4,Peru,83.0


In [64]:
# save available market distribution
available_market_df.to_csv("/Users/jo186027/Desktop/available_market_df.csv", index=False)


# Get Distribution Genre from Artist

In [80]:
# Aggregate Countries from Available Market List
collect_genre = []
genre_df = pd.DataFrame()
cnt = 0
for x in df_track_all_information.artistGenre:
    collect_genre.extend(x.split(','))
aggregated_genre = Counter(collect_genre)

for k,v in aggregated_genre.items():
    genre_df.loc[cnt,'Genre'] = k
    genre_df.loc[cnt,'Count'] = v
    cnt+=1
genre_df.sort_values(by='Count', ascending=False, inplace=True)
genre_df.reset_index(drop=True,inplace=True)
genre_df.head()

Unnamed: 0,Genre,Count
0,pop,45.0
1,modern rock,37.0
2,rock,30.0
3,dance pop,23.0
4,indie pop,18.0


In [81]:
# save available market distribution
genre_df.to_csv("/Users/jo186027/Desktop/genre_df.csv", index=False)
