# Testing Basic ML Models
**Authors:** Martin Ziran Xu

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

## Cleaning Data

In [27]:
#Read Data
# Can't read 1st data set
df_countries = pd.read_csv('combined_all_countries_2.csv')
df_countries.head()

Unnamed: 0.1,Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,...,pt_100,ca_100,co_100,nz_100,tr_100,cz_100,hn_100,be_100,id_100,bo_100
0,0,0.581,https://api.spotify.com/v1/audio-analysis/7qiZ...,0.825,233713,0.652,True,0.0,1,0.0931,...,True,True,True,True,True,True,True,True,True,True
1,1,0.00902,https://api.spotify.com/v1/audio-analysis/12VW...,0.785,173987,0.617,True,0.00246,1,0.351,...,True,True,True,True,True,True,True,True,False,True
2,2,0.415,https://api.spotify.com/v1/audio-analysis/7crM...,0.736,245507,0.541,True,0.0,8,0.11,...,True,True,True,True,True,True,True,True,True,True
3,3,0.00346,https://api.spotify.com/v1/audio-analysis/4vS8...,0.723,176561,0.809,True,0.00123,7,0.565,...,True,False,True,True,True,True,False,True,False,False
4,4,0.474,https://api.spotify.com/v1/audio-analysis/34gC...,0.781,281560,0.445,True,0.0,2,0.184,...,True,True,True,True,False,True,True,True,True,True


In [28]:
features = list(df_countries.columns)
print(features)

['Unnamed: 0', 'acousticness', 'analysis_url', 'danceability', 'duration_ms', 'energy', 'id', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'track_href', 'type', 'uri', 'valence', 'popularity', 'song', 'artist', 'genre', 'sv', 'cr', 'de', 'co', 'dk', 'at', 'pa', 'es', 'gb', 'sk', 'global', 'nl', 'br', 'cl', 'ec', 'lt', 'us', 'it', 'be', 'lu', 'gr', 'mx', 'uy', 'ca', 'jp', 'sg', 'tr', 'cz', 'lv', 'pe', 'ch', 'se', 'ar', 'gt', 'py', 'pl', 'ph', 'nz', 'fr', 'fi', 'hn', 'ie', 'do', 'no', 'hu', 'pt', 'is', 'bo', 'au', 'hk', 'tw', 'my', 'ee', 'py_top', 'lt_top', 'au_top', 'se_top', 'ph_top', 'ee_top', 'gt_top', 'my_top', 'gb_top', 'cr_top', 'ec_top', 'ar_top', 'is_top', 'global_top', 'uy_top', 'nl_top', 'us_top', 'sk_top', 'do_top', 'de_top', 'hu_top', 'gr_top', 'pl_top', 'cl_top', 'at_top', 'tw_top', 'fr_top', 'pe_top', 'mx_top', 'no_top', 'sg_top', 'it_top', 'sv_top', 'lv_top', 'hk_top', 'es_top', 'pa_top', 'lu_top', 'br_top', 'ch_top'

In [29]:
# Dataset with one country (global) and overall popularity
df = df_countries.loc[:, ['song', 'analysis_url','track_href','uri','artist','acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'valence','genre','global', 'global_100', 'global_top', 'popularity']]
df.head()

Unnamed: 0,song,analysis_url,track_href,uri,artist,acousticness,danceability,duration_ms,energy,instrumentalness,...,loudness,mode,speechiness,tempo,valence,genre,global,global_100,global_top,popularity
0,Shape of You,https://api.spotify.com/v1/audio-analysis/7qiZ...,https://api.spotify.com/v1/tracks/7qiZfU4dY1lW...,spotify:track:7qiZfU4dY1lWllzX7mPBI3,Ed Sheeran,0.581,0.825,233713,0.652,0.0,...,-3.183,0,0.0802,95.977,0.931,pop,True,True,True,92
1,One Dance,https://api.spotify.com/v1/audio-analysis/12VW...,https://api.spotify.com/v1/tracks/12VWzyPDBCc8...,spotify:track:12VWzyPDBCc8fqeWCAfNwR,Drake,0.00902,0.785,173987,0.617,0.00246,...,-5.871,1,0.0522,103.981,0.382,hip hop,True,True,False,74
2,Closer,https://api.spotify.com/v1/audio-analysis/7crM...,https://api.spotify.com/v1/tracks/7crMiinWx373...,spotify:track:7crMiinWx373rNBZBaVske,The Chainsmokers,0.415,0.736,245507,0.541,0.0,...,-5.597,1,0.0297,94.962,0.662,house,True,True,False,65
3,Lean On (feat. MØ & DJ Snake),https://api.spotify.com/v1/audio-analysis/4vS8...,https://api.spotify.com/v1/tracks/4vS8VaBwJJV5...,spotify:track:4vS8VaBwJJV5Ry7UFIQuoo,Major Lazer,0.00346,0.723,176561,0.809,0.00123,...,-3.081,0,0.0625,98.007,0.274,electronic,True,True,False,20
4,Thinking Out Loud,https://api.spotify.com/v1/audio-analysis/34gC...,https://api.spotify.com/v1/tracks/34gCuhDGsG4b...,spotify:track:34gCuhDGsG4bRPIf9bb02f,Ed Sheeran,0.474,0.781,281560,0.445,0.0,...,-6.061,1,0.0295,78.998,0.591,pop,True,True,False,87


In [31]:
# clean genre: International
list_genre = pd.unique(df['genre'])
print(list_genre)
df['genre'] = df['genre']\
.replace(['French', 'German', 'Korean'], 'International')
print(pd.unique(df['genre']))

['pop' 'hip hop' 'house' 'electronic' 'latino' 'rap' 'punk' 'Unknown'
 'folk' 'r&b' 'indie' 'rock' 'French' 'German' 'metal' 'International']
['pop' 'hip hop' 'house' 'electronic' 'latino' 'rap' 'punk' 'Unknown'
 'folk' 'r&b' 'indie' 'rock' 'International' 'metal']


In [37]:
# clean genre: NaN
df_unknown = df[df['genre']=='Unknown']
print(df_unknown.size)
print(df.size)

220
12078


In [38]:
# Genre Dummy mapping
binary_encoded = pd.get_dummies(df['genre'])
newcols = binary_encoded.columns
df[newcols] = binary_encoded


In [44]:
df = df.drop(['genre'], axis=1)

In [50]:
# Rearrange dataset
df.head()
print(df.columns)
new_col = ['song', 'analysis_url', 'track_href', 'uri', 'artist', 'acousticness',
       'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'valence', 'International',
       'Unknown', 'electronic', 'folk', 'hip hop', 'house', 'indie', 'latino',
       'metal', 'pop', 'punk', 'r&b', 'rap', 'rock',
       'global', 'global_100', 'global_top', 'popularity']
df = df[new_col]
df.head()

Index(['song', 'analysis_url', 'track_href', 'uri', 'artist', 'acousticness',
       'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'valence',
       'global', 'global_100', 'global_top', 'popularity', 'International',
       'Unknown', 'electronic', 'folk', 'hip hop', 'house', 'indie', 'latino',
       'metal', 'pop', 'punk', 'r&b', 'rap', 'rock'],
      dtype='object')


Unnamed: 0,song,analysis_url,track_href,uri,artist,acousticness,danceability,duration_ms,energy,instrumentalness,...,metal,pop,punk,r&b,rap,rock,global,global_100,global_top,popularity
0,Shape of You,https://api.spotify.com/v1/audio-analysis/7qiZ...,https://api.spotify.com/v1/tracks/7qiZfU4dY1lW...,spotify:track:7qiZfU4dY1lWllzX7mPBI3,Ed Sheeran,0.581,0.825,233713,0.652,0.0,...,0,1,0,0,0,0,True,True,True,92
1,One Dance,https://api.spotify.com/v1/audio-analysis/12VW...,https://api.spotify.com/v1/tracks/12VWzyPDBCc8...,spotify:track:12VWzyPDBCc8fqeWCAfNwR,Drake,0.00902,0.785,173987,0.617,0.00246,...,0,0,0,0,0,0,True,True,False,74
2,Closer,https://api.spotify.com/v1/audio-analysis/7crM...,https://api.spotify.com/v1/tracks/7crMiinWx373...,spotify:track:7crMiinWx373rNBZBaVske,The Chainsmokers,0.415,0.736,245507,0.541,0.0,...,0,0,0,0,0,0,True,True,False,65
3,Lean On (feat. MØ & DJ Snake),https://api.spotify.com/v1/audio-analysis/4vS8...,https://api.spotify.com/v1/tracks/4vS8VaBwJJV5...,spotify:track:4vS8VaBwJJV5Ry7UFIQuoo,Major Lazer,0.00346,0.723,176561,0.809,0.00123,...,0,0,0,0,0,0,True,True,False,20
4,Thinking Out Loud,https://api.spotify.com/v1/audio-analysis/34gC...,https://api.spotify.com/v1/tracks/34gCuhDGsG4b...,spotify:track:34gCuhDGsG4bRPIf9bb02f,Ed Sheeran,0.474,0.781,281560,0.445,0.0,...,0,1,0,0,0,0,True,True,False,87


In [59]:
# Clean Ranking
# not needed, since True==1 in Python

In [62]:
# Shuffle dataframe
from sklearn.utils import shuffle
df = shuffle(df)
df.head()

Unnamed: 0,song,analysis_url,track_href,uri,artist,acousticness,danceability,duration_ms,energy,instrumentalness,...,metal,pop,punk,r&b,rap,rock,global,global_100,global_top,popularity
113,Fake Love,https://api.spotify.com/v1/audio-analysis/6NMN...,https://api.spotify.com/v1/tracks/6NMNgWgEAzde...,spotify:track:6NMNgWgEAzde5M8U3lc6FN,Drake,0.182,0.924,207813,0.543,0.0,...,0,0,0,0,0,0,True,True,False,18
158,Roar,https://api.spotify.com/v1/audio-analysis/3XSc...,https://api.spotify.com/v1/tracks/3XSczvk4MRte...,spotify:track:3XSczvk4MRteOw4Yx3lqMU,Katy Perry,0.00778,0.667,222667,0.791,1.2e-05,...,0,1,0,0,0,0,False,False,False,57
516,Vacaciones,https://api.spotify.com/v1/audio-analysis/1rXo...,https://api.spotify.com/v1/tracks/1rXojdsUqqxG...,spotify:track:1rXojdsUqqxGj2WCmJGWHP,Wisin,0.246,0.777,238813,0.911,0.0,...,0,0,0,0,0,0,True,True,False,78
531,The Sound Of Silence,https://api.spotify.com/v1/audio-analysis/0eZB...,https://api.spotify.com/v1/tracks/0eZBeB2xFIS6...,spotify:track:0eZBeB2xFIS65jQHerispi,Disturbed,0.468,0.322,248467,0.28,1e-06,...,1,0,0,0,0,0,False,False,False,72
240,Classic,https://api.spotify.com/v1/audio-analysis/5x9V...,https://api.spotify.com/v1/tracks/5x9VIW2fS21J...,spotify:track:5x9VIW2fS21JMswOt6AORI,MKTO,0.0353,0.713,175333,0.781,0.0,...,0,1,0,0,0,0,False,False,False,67


## Define Input and Output

In [60]:
print(df.columns)

Index(['song', 'analysis_url', 'track_href', 'uri', 'artist', 'acousticness',
       'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'valence',
       'International', 'Unknown', 'electronic', 'folk', 'hip hop', 'house',
       'indie', 'latino', 'metal', 'pop', 'punk', 'r&b', 'rap', 'rock',
       'global', 'global_100', 'global_top', 'popularity'],
      dtype='object')


In [65]:
# Define Input
X = df.loc[:,['acousticness',
       'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'valence',
       'International', 'Unknown', 'electronic', 'folk', 'hip hop', 'house',
       'indie', 'latino', 'metal', 'pop', 'punk', 'r&b', 'rap', 'rock']]
X.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,...,hip hop,house,indie,latino,metal,pop,punk,r&b,rap,rock
113,0.182,0.924,207813,0.543,0.0,9,0.103,-7.474,0,0.298,...,1,0,0,0,0,0,0,0,0,0
158,0.00778,0.667,222667,0.791,1.2e-05,7,0.415,-4.329,0,0.0319,...,0,0,0,0,0,1,0,0,0,0
516,0.246,0.777,238813,0.911,0.0,6,0.257,-3.223,0,0.173,...,0,0,0,1,0,0,0,0,0,0
531,0.468,0.322,248467,0.28,1e-06,6,0.102,-9.367,0,0.0281,...,0,0,0,0,1,0,0,0,0,0
240,0.0353,0.713,175333,0.781,0.0,1,0.155,-4.859,1,0.121,...,0,0,0,0,0,1,0,0,0,0


In [67]:
# Define Output
Y = df.loc[:, ['global', 'global_100', 'global_top', 'popularity']]
Y.head()

Unnamed: 0,global,global_100,global_top,popularity
113,True,True,False,18
158,False,False,False,57
516,True,True,False,78
531,False,False,False,72
240,False,False,False,67


In [72]:
Y.describe()

Unnamed: 0,popularity
count,549.0
mean,68.983607
std,19.17899
min,2.0
25%,67.0
50%,74.0
75%,80.0
max,100.0


In [75]:
# Split in Regression and Classification
Y_reg = Y.loc[:,'popularity']
Y_class = Y.iloc[:,0:3]
print(Y_reg.head())
print(Y_class.head())

113    18
158    57
516    78
531    72
240    67
Name: popularity, dtype: int64
     global  global_100  global_top
113    True        True       False
158   False       False       False
516    True        True       False
531   False       False       False
240   False       False       False


In [86]:
# Normalize Data
print(X)
from sklearn.preprocessing import normalize
X = normalize(X)
print(X)

[[8.75787183e-07 4.44630416e-06 9.99999791e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [3.49400646e-08 2.99550425e-06 9.99999918e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.03009459e-06 3.25359146e-06 9.99999919e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [2.09869841e-07 3.10950009e-06 9.99999809e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [2.34819095e-06 1.80878533e-06 9.99999888e-01 ... 0.00000000e+00
  0.00000000e+00 3.22997380e-06]
 [1.35300038e-08 2.82027445e-06 9.99999842e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]
[[8.75787183e-07 4.44630416e-06 9.99999791e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [3.49400646e-08 2.99550425e-06 9.99999918e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.03009459e-06 3.25359146e-06 9.99999919e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [2.09869841e-07 3.10950009e-06 9.99999809e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]

## Perform Machine Learning

### a) Classification on Charts

In [76]:
#Split Data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,Y_class, test_size=0.15, random_state=100)

In [107]:
print(x_train.shape)
print(y_train.shape)
y_train.iloc[:,0]
y_train.columns[1]

(466, 26)
(466, 3)


'global_100'

In [110]:
# Perform Ridge Regression on charts
from sklearn.linear_model import RidgeClassifier
print('Ridge Classifier on regional charts: ')
for i in range(0,3):
    #Train
    y_train_curr = y_train.iloc[:,i]
    y_test_curr = y_test.iloc[:,i]
    ridgeModel = RidgeClassifier()
    ridgeModel.fit(x_train, y_train_curr)
    # Accuracy
    print(y_train.columns[i])
    print('Train Accuracy:')
    print(ridgeModel.score(x_train, y_train_curr))
    print('Test Accuracy:')
    print(ridgeModel.score(x_test, y_test_curr))

Ridge Classifier on regional charts: 
global
Train Accuracy:
0.648068669527897
Test Accuracy:
0.5180722891566265
global_100
Train Accuracy:
0.703862660944206
Test Accuracy:
0.6506024096385542
global_top
Train Accuracy:
0.9871244635193133
Test Accuracy:
0.963855421686747


## b1) Perform Linear Regression on Popularity - very low accuracy

In [120]:
#Split Data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,Y_reg, test_size=0.15, random_state=100)

In [112]:
# Perform Linear Regression
from sklearn import linear_model
linreg_model = linear_model.LinearRegression()
print('Training a linear Regression Model...')
linreg_model.fit(x_train, y_train)

Training a linear Regression Model...


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [115]:
# Accuracy:
print('Linear Regression on song popularity')
print('Training Data')
print(linreg_model.score(x_train, y_train))
print('Testing Data')
print(linreg_model.score(x_test, y_test))

Linear Regression on song popularity
Training Data
0.10217204348736908
Testing Data
0.010571942955733271


### b2) Ridge Regression on Popularity - Very weird accuracy

In [123]:
from sklearn.linear_model import Ridge
ridgeModel = Ridge(alpha = 100)
ridgeModel.fit(x_train, y_train)

Ridge(alpha=100, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [124]:
# Accuracy
print('Ridge Regression on song popularity')
print('Training Data')
print(ridgeModel.score(x_train, y_train))
print('Testing Data')
print(ridgeModel.score(x_test, y_test))

Ridge Regression on song popularity
Training Data
1.20656151736398e-10
Testing Data
-0.00050589650331756
