# ML Models on Raw Positions

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

## 1. Preprocessing

### a) Import data

In [2]:
df = pd.read_csv('workingDataset_timbrePosition.csv', encoding = 'latin1')
df.head()

Unnamed: 0.1,Unnamed: 0,song,analysis_url,track_href,uri,artist,acousticness,danceability,duration_ms,energy,...,bo_position,no_position,id_position,es_position,fi_position,nz_position,global_position,ee_position,ch_position,ec_position
0,0,Shape of You,https://api.spotify.com/v1/audio-analysis/7qiZ...,https://api.spotify.com/v1/tracks/7qiZfU4dY1lW...,spotify:track:7qiZfU4dY1lWllzX7mPBI3,Ed Sheeran,0.581,0.825,233713,0.652,...,42.432836,,,66.076923,,,100.380952,,119.324074,52.651685
1,1,One Dance,https://api.spotify.com/v1/audio-analysis/12VW...,https://api.spotify.com/v1/tracks/12VWzyPDBCc8...,spotify:track:12VWzyPDBCc8fqeWCAfNwR,Drake,0.00902,0.785,173987,0.617,...,65.158879,115.357143,115.032258,87.482517,129.234375,144.111111,98.850394,42.0,88.46875,79.851562
2,2,Closer,https://api.spotify.com/v1/audio-analysis/7crM...,https://api.spotify.com/v1/tracks/7crMiinWx373...,spotify:track:7crMiinWx373rNBZBaVske,The Chainsmokers,0.415,0.736,245507,0.541,...,28.352941,,,73.657407,,,114.163793,,148.209877,32.95
3,3,Lean On (feat. MØ & DJ Snake),https://api.spotify.com/v1/audio-analysis/4vS8...,https://api.spotify.com/v1/tracks/4vS8VaBwJJV5...,spotify:track:4vS8VaBwJJV5Ry7UFIQuoo,Major Lazer,0.00346,0.723,176561,0.809,...,64.307692,,,91.511811,,,121.038095,,136.555556,83.41791
4,4,Thinking Out Loud,https://api.spotify.com/v1/audio-analysis/34gC...,https://api.spotify.com/v1/tracks/34gCuhDGsG4b...,spotify:track:34gCuhDGsG4bRPIf9bb02f,Ed Sheeran,0.474,0.781,281560,0.445,...,59.852273,,,91.032787,,,120.951456,,123.412844,78.459677


In [3]:
popularity = df.popularity
df = df.drop(['Unnamed: 0', 'popularity'], axis = 1)

In [4]:
df_columns = list(df.columns)
df_columns

['song',
 'analysis_url',
 'track_href',
 'uri',
 'artist',
 'acousticness',
 'danceability',
 'duration_ms',
 'energy',
 'instrumentalness',
 'key',
 'liveness',
 'loudness',
 'mode',
 'speechiness',
 'tempo',
 'valence',
 'International',
 'Unknown',
 'electronic',
 'folk',
 'hip hop',
 'house',
 'indie',
 'latino',
 'metal',
 'pop',
 'punk',
 'r&b',
 'rap',
 'rock',
 'mean_timbre1',
 'median_timbre1',
 'std_timbre1',
 'min_timbre1',
 'max_timbre1',
 'range_timbre1',
 '80Percentile_timbre1',
 'mean_timbre2',
 'median_timbre2',
 'std_timbre2',
 'min_timbre2',
 'max_timbre2',
 'range_timbre2',
 '80Percentile_timbre2',
 'mean_timbre3',
 'median_timbre3',
 'std_timbre3',
 'min_timbre3',
 'max_timbre3',
 'range_timbre3',
 '80Percentile_timbre3',
 'mean_timbre4',
 'median_timbre4',
 'std_timbre4',
 'min_timbre4',
 'max_timbre4',
 'range_timbre4',
 '80Percentile_timbre4',
 'mean_timbre5',
 'median_timbre5',
 'std_timbre5',
 'min_timbre5',
 'max_timbre5',
 'range_timbre5',
 '80Percentile_tim

In [5]:
df.shape

(549, 169)

In [6]:
features_startIndex = df_columns.index('acousticness')
output_startIndex = df_columns.index('pt_position')

### b) Define Output

In [147]:
Y = df.iloc[:, output_startIndex:]
Y.head()
Y.shape

(549, 54)

In [148]:
# Number of nul values
df_isNull = Y.iloc[:, ].isnull().sum()
print(df.shape[0])
df_isNull

549


pt_position        269
au_position        339
pe_position         62
us_position        366
fr_position        364
lt_position        444
is_position        433
lv_position        447
ph_position        360
sk_position        446
sg_position        333
cl_position         86
uy_position        136
lu_position        519
gb_position        332
se_position        330
br_position        330
hk_position        353
co_position         80
do_position        190
ar_position        116
pa_position        168
de_position        355
gt_position        102
at_position        327
ca_position        340
gr_position        416
ie_position        325
hn_position        233
jp_position        359
hu_position        328
pl_position        311
dk_position        355
it_position        300
tw_position        350
tr_position        354
mx_position        125
my_position        344
sv_position        241
nl_position        345
cz_position        309
py_position        174
be_position        312
cr_position

In [149]:
list_isNull = list(df_isNull)
threshold = df.shape[0]/2
print('Threshold: {}'.format(threshold))
toKeep = list()
for i in range(len(list_isNull)):
    if list_isNull[i]<threshold:
        toKeep.append(i)
print(toKeep)
print(len(toKeep))

Threshold: 274.5
[0, 2, 11, 12, 18, 19, 20, 21, 23, 28, 36, 38, 41, 43, 44, 47, 50, 53]
18


In [150]:
Y = Y.iloc[:, toKeep]
Y.head()

Unnamed: 0,pt_position,pe_position,cl_position,uy_position,co_position,do_position,ar_position,pa_position,gt_position,hn_position,mx_position,sv_position,py_position,cr_position,bo_position,es_position,global_position,ec_position
0,123.957627,43.814286,42.649351,48.04878,49.886076,84.764228,44.540541,65.621359,52.348315,55.414894,56.680851,55.301075,74.133929,42.987013,42.432836,66.076923,100.380952,52.651685
1,81.709677,57.912088,64.320755,75.792,93.047619,79.330827,59.19,79.261905,90.243421,57.297872,59.424242,61.414141,74.781818,88.291667,65.158879,87.482517,98.850394,79.851562
2,159.025316,26.391304,25.659574,48.268657,24.116279,41.445946,39.733333,37.884058,28.365385,32.20339,33.433333,31.694915,42.130435,25.106383,28.352941,73.657407,114.163793,32.95
3,121.372881,66.531915,75.62931,89.298387,97.92517,88.067227,69.789474,78.783333,94.649007,62.265306,60.196078,60.443299,76.57265,95.629139,64.307692,91.511811,121.038095,83.41791
4,123.943182,64.810526,78.214876,84.792793,66.418182,76.066667,88.966387,70.266055,82.204545,51.625,81.837209,56.848485,68.686869,68.355932,59.852273,91.032787,120.951456,78.459677


In [103]:
Y.describe()

Unnamed: 0,pt_position,pe_position,cl_position,uy_position,co_position,do_position,ar_position,pa_position,gt_position,hn_position,mx_position,sv_position,py_position,cr_position,bo_position,es_position,global_position,ec_position
count,280.0,487.0,463.0,413.0,469.0,359.0,433.0,381.0,447.0,316.0,424.0,308.0,375.0,465.0,309.0,408.0,296.0,549.0
mean,133.995952,127.058255,128.391632,124.273267,129.563948,112.047714,129.163084,93.482514,119.23341,77.65315,128.148685,71.714263,95.809203,127.044351,71.555838,128.47734,134.940675,131.109842
std,31.751765,37.047994,37.246898,34.901789,37.949184,28.335455,35.932511,20.844399,30.357347,17.679588,35.413495,16.507015,21.94467,36.920208,17.16468,33.43957,30.505466,35.084311
min,70.5,26.391304,16.444444,20.578947,24.116279,12.454545,20.884615,18.842105,14.592593,14.571429,33.433333,12.478261,18.821429,25.106383,8.285714,43.080645,62.784615,30.594595
25%,109.61953,105.46875,107.975,100.465347,110.209302,94.727099,107.0,83.767442,105.404255,67.755501,106.774373,63.182796,82.795918,105.833333,62.670213,103.494845,112.859375,110.132231
50%,129.526074,130.0,126.445455,121.73,129.405405,112.634921,129.460317,91.6,120.649123,79.055556,127.396179,70.626263,96.375,127.966292,71.833333,123.64,132.707071,133.634615
75%,159.533333,152.777778,155.9,154.78125,158.307692,131.433333,153.859155,105.412844,140.880392,89.272727,154.149639,79.617647,110.291667,153.714286,79.262626,155.54,157.695652,157.487179
max,200.0,200.0,200.0,198.0,200.0,187.0,193.4,186.0,197.0,131.0,199.0,137.0,190.0,197.0,130.9,199.0,199.0,198.0


In [151]:
# Replace NaN Values by very low chart score: 
lowPos = 300 # set arbitrarily
var = 0.01
print('Before:')
print(Y.isnull().sum())
for i in range(Y.shape[0]):
    Y.iloc[i,:] = Y.iloc[i,:].fillna(lowPos)
    lowPos += var
print('After:')
print(Y.isnull().sum())

Before:
pt_position        269
pe_position         62
cl_position         86
uy_position        136
co_position         80
do_position        190
ar_position        116
pa_position        168
gt_position        102
hn_position        233
mx_position        125
sv_position        241
py_position        174
cr_position         84
bo_position        240
es_position        141
global_position    253
ec_position          0
dtype: int64
After:
pt_position        0
pe_position        0
cl_position        0
uy_position        0
co_position        0
do_position        0
ar_position        0
pa_position        0
gt_position        0
hn_position        0
mx_position        0
sv_position        0
py_position        0
cr_position        0
bo_position        0
es_position        0
global_position    0
ec_position        0
dtype: int64


In [152]:
# Get binary values
Y_test = Y.copy()
for i in range(Y.shape[1]):
    Y_test.iloc[:,i] = pd.qcut(Y_test.iloc[:,i], 2, labels = False)

### c) Define Input/Features

In [153]:
X = df.iloc[:, features_startIndex:output_startIndex]
X.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,...,max_timbre11,range_timbre11,80Percentile_timbre11,mean_timbre12,median_timbre12,std_timbre12,min_timbre12,max_timbre12,range_timbre12,80Percentile_timbre12
0,0.581,0.825,233713,0.652,0.0,1,0.0931,-3.183,0,0.0802,...,57.744,157.995,3.6902,-3.411885,-2.479,13.339611,-53.845,37.774,91.619,7.7588
1,0.00902,0.785,173987,0.617,0.00246,1,0.351,-5.871,1,0.0522,...,87.01,199.987,1.6932,-2.246651,-2.442,15.645132,-44.506,36.627,81.133,11.5428
2,0.415,0.736,245507,0.541,0.0,8,0.11,-5.597,1,0.0297,...,104.427,217.686,0.6802,1.795072,1.944,13.29221,-37.643,41.531,79.174,12.892
3,0.00346,0.723,176561,0.809,0.00123,7,0.565,-3.081,0,0.0625,...,49.5,123.525,5.583,0.437129,1.104,17.708769,-48.071,54.298,102.369,16.377
4,0.474,0.781,281560,0.445,0.0,2,0.184,-6.061,1,0.0295,...,67.121,158.016,6.8418,-6.117562,-5.8825,15.538641,-57.848,61.165,119.013,7.052


In [154]:
from sklearn.preprocessing import normalize
X_norm = normalize(X)

### d) Constructs Dataframe

In [155]:
df_work = pd.concat([X,Y], axis=1)
df_work.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,...,gt_position,hn_position,mx_position,sv_position,py_position,cr_position,bo_position,es_position,global_position,ec_position
0,0.581,0.825,233713,0.652,0.0,1,0.0931,-3.183,0,0.0802,...,52.348315,55.414894,56.680851,55.301075,74.133929,42.987013,42.432836,66.076923,100.380952,52.651685
1,0.00902,0.785,173987,0.617,0.00246,1,0.351,-5.871,1,0.0522,...,90.243421,57.297872,59.424242,61.414141,74.781818,88.291667,65.158879,87.482517,98.850394,79.851562
2,0.415,0.736,245507,0.541,0.0,8,0.11,-5.597,1,0.0297,...,28.365385,32.20339,33.433333,31.694915,42.130435,25.106383,28.352941,73.657407,114.163793,32.95
3,0.00346,0.723,176561,0.809,0.00123,7,0.565,-3.081,0,0.0625,...,94.649007,62.265306,60.196078,60.443299,76.57265,95.629139,64.307692,91.511811,121.038095,83.41791
4,0.474,0.781,281560,0.445,0.0,2,0.184,-6.061,1,0.0295,...,82.204545,51.625,81.837209,56.848485,68.686869,68.355932,59.852273,91.032787,120.951456,78.459677


## 2. Perform Machine Learning

### Import packages/Define functions

In [156]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc

import operator

from sklearn.preprocessing import normalize


In [157]:
# Confusion Matrix
def printConfusionMatrix (y_true, y_pred):
    cf=pd.DataFrame(confusion_matrix(y_true, y_pred))
    print(cf)

def plotROCCurve (y_true, y_pred):
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)
    
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

def featureImportance(x_train, y_train, features):
    random_forest = RandomForestClassifier(n_estimators=190)
    random_forest.fit(x_train, y_train)
    values = random_forest.feature_importances_
    dict_featureImportance = dict(zip(features,values))
    sorted_dict = sorted(dict_featureImportance.items(), key=operator.itemgetter(1), reverse = True)
    return sorted_dict

def performsMLModelComparison(x_train, x_test, y_train, y_test, printing=True):
    '''Tries standard ML models, reports average accuracy and best model (with its accuracy and confusion matrix)'''
    MLModels = [LogisticRegression(), SVC(), Perceptron(), KNeighborsClassifier(n_neighbors = 3), xgb.XGBClassifier(n_estimators = 140), RandomForestClassifier(n_estimators = 140)]
    MLModelsStrings = ["Logistic Regression", "Support Vector Machine", "Perceptron", "KN Neighbors", "Gradient Boosting", "Random Forest"]
    nModels = len(MLModels)
    trainingAccuracy = list()
    testAccuracy = list()
    aucScore = list()

    # Train models
    for i in range(nModels):
        MLModels[i].fit(x_train, y_train)
        trainingAccuracy.append (MLModels[i].score(x_train, y_train))
        testAccuracy.append(MLModels[i].score(x_test, y_test))
        
        fpr, tpr, thresholds = roc_curve(y_test, MLModels[i].predict(x_test))
        roc_auc = auc(fpr, tpr)
        aucScore.append(roc_auc)
    
    # Report average test accuracy
    if (printing):
        '''print('Used the following models:')
        print(MLModelsStrings)
        print('Test Scores')
        print(testAccuracy)
        print('AUC Scores')
        print(aucScore)
        print('Average test accuracies of all models is: {}'.format(np.mean(testAccuracy)))'''
    
    # Find best model
    max_testAccuracy = max(testAccuracy)
    max_index = testAccuracy.index(max_testAccuracy)
    bestModel = MLModels[max_index]
    if (printing):
        print('{} is the best model'.format(MLModelsStrings[max_index]))
        print('Test Accuracy: {}'.format(max_testAccuracy))
        print('AUC Score: {}'.format(aucScore[max_index]))
        print('Confusion Matrix')
        printConfusionMatrix(y_test, bestModel.predict(x_test))
    
    return bestModel

In [158]:
def runsModel(x_train, x_test, y_train, y_test, model):

    # Train model
    model.fit(x_train, y_train)
    trainingAccuracy = (model.score(x_train, y_train))
    testAccuracy = (model.score(x_test, y_test))

    fpr, tpr, thresholds = roc_curve(y_test, model.predict(x_test))
    roc_auc = auc(fpr, tpr)
    aucScore = (roc_auc)
    
    print('Test Accuracy: {}'.format(testAccuracy))
    print('Training Accuracy: {}'.format(trainingAccuracy))
    print('AUC Score: {}'.format(aucScore))
    print('Confusion Matrix')
    printConfusionMatrix(y_test, model.predict(x_test))
    return model

### 1. Test out random models for all countries

In [159]:
a = list(df_work.columns)
index = a.index('pt_position')
output =a[index:]
print("Number of outputs: {}".format(len(output)))
print(output)

Number of outputs: 18
['pt_position', 'pe_position', 'cl_position', 'uy_position', 'co_position', 'do_position', 'ar_position', 'pa_position', 'gt_position', 'hn_position', 'mx_position', 'sv_position', 'py_position', 'cr_position', 'bo_position', 'es_position', 'global_position', 'ec_position']


In [160]:
features = a[:index]
print("Number of features: {}".format(len(features)))
print(features)

Number of features: 110
['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'valence', 'International', 'Unknown', 'electronic', 'folk', 'hip hop', 'house', 'indie', 'latino', 'metal', 'pop', 'punk', 'r&b', 'rap', 'rock', 'mean_timbre1', 'median_timbre1', 'std_timbre1', 'min_timbre1', 'max_timbre1', 'range_timbre1', '80Percentile_timbre1', 'mean_timbre2', 'median_timbre2', 'std_timbre2', 'min_timbre2', 'max_timbre2', 'range_timbre2', '80Percentile_timbre2', 'mean_timbre3', 'median_timbre3', 'std_timbre3', 'min_timbre3', 'max_timbre3', 'range_timbre3', '80Percentile_timbre3', 'mean_timbre4', 'median_timbre4', 'std_timbre4', 'min_timbre4', 'max_timbre4', 'range_timbre4', '80Percentile_timbre4', 'mean_timbre5', 'median_timbre5', 'std_timbre5', 'min_timbre5', 'max_timbre5', 'range_timbre5', '80Percentile_timbre5', 'mean_timbre6', 'median_timbre6', 'std_timbre6', 'min_timbre6', 'max_timbre6', 'range_tim

In [161]:
testAccuracy = list()
Y = df_work[output]
for i in range(Y.shape[1]):
    print('Performs prediction at Country: {}'.format(Y_test.columns[i]))
    Y_curr = Y_test.iloc[:,i]
    x_train, x_test, y_train, y_test = train_test_split(X_norm,Y_curr, test_size=0.15, random_state=100)
    bestModel = performsMLModelComparison(x_train, x_test, y_train, y_test)
    testAccuracy.append(bestModel.score(x_test, y_test))

Performs prediction at Country: pt_position


  if diff:
  if diff:
  if diff:


Gradient Boosting is the best model
Test Accuracy: 0.5662650602409639
AUC Score: 0.5622093023255814
Confusion Matrix
    0   1
0  29  14
1  22  18
Performs prediction at Country: pe_position


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.5662650602409639
AUC Score: 0.48409363745498196
Confusion Matrix
    0  1
0  46  3
1  33  1
Performs prediction at Country: cl_position


  if diff:
  if diff:
  if diff:


KN Neighbors is the best model
Test Accuracy: 0.5903614457831325
AUC Score: 0.5874269005847954
Confusion Matrix
    0   1
0  28  17
1  17  21
Performs prediction at Country: uy_position


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.5301204819277109
AUC Score: 0.47380952380952385
Confusion Matrix
    0  1
0  40  8
1  31  4
Performs prediction at Country: co_position


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.5421686746987951
AUC Score: 0.502046783625731
Confusion Matrix
    0  1
0  44  1
1  37  1
Performs prediction at Country: do_position


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.5662650602409639
AUC Score: 0.5283625730994153
Confusion Matrix
    0  1
0  44  1
1  35  3
Performs prediction at Country: ar_position


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.5783132530120482
AUC Score: 0.4897959183673469
Confusion Matrix
    0  1
0  48  1
1  34  0
Performs prediction at Country: pa_position


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.5783132530120482
AUC Score: 0.5
Confusion Matrix
    0  1
0  48  0
1  35  0
Performs prediction at Country: gt_position


  if diff:
  if diff:
  if diff:


KN Neighbors is the best model
Test Accuracy: 0.5903614457831325
AUC Score: 0.5956140350877193
Confusion Matrix
    0   1
0  24  21
1  13  25
Performs prediction at Country: hn_position


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.6024096385542169
AUC Score: 0.5285714285714286
Confusion Matrix
    0  1
0  48  0
1  33  2
Performs prediction at Country: mx_position


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.5542168674698795
AUC Score: 0.48936170212765956
Confusion Matrix
    0  1
0  46  1
1  36  0
Performs prediction at Country: sv_position


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.5662650602409639
AUC Score: 0.5
Confusion Matrix
    0  1
0  47  0
1  36  0
Performs prediction at Country: py_position


  if diff:
  if diff:
  if diff:


Gradient Boosting is the best model
Test Accuracy: 0.5301204819277109
AUC Score: 0.5390716803760283
Confusion Matrix
    0   1
0  21  25
1  14  23
Performs prediction at Country: cr_position


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.5542168674698795
AUC Score: 0.5
Confusion Matrix
    0  1
0  46  0
1  37  0
Performs prediction at Country: bo_position


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.5421686746987951
AUC Score: 0.4917743830787309
Confusion Matrix
    0  1
0  44  2
1  36  1
Performs prediction at Country: es_position


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.5301204819277109
AUC Score: 0.4782608695652174
Confusion Matrix
    0  1
0  44  2
1  37  0
Performs prediction at Country: global_position


  if diff:
  if diff:
  if diff:


KN Neighbors is the best model
Test Accuracy: 0.5421686746987951
AUC Score: 0.5419580419580419
Confusion Matrix
    0   1
0  24  20
1  18  21
Performs prediction at Country: ec_position


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.5301204819277109
AUC Score: 0.4888888888888889
Confusion Matrix
    0  1
0  44  1
1  38  0


In [162]:
print(testAccuracy)
print(np.mean(testAccuracy))

[0.5662650602409639, 0.5662650602409639, 0.5903614457831325, 0.5301204819277109, 0.5421686746987951, 0.5662650602409639, 0.5783132530120482, 0.5783132530120482, 0.5903614457831325, 0.6024096385542169, 0.5542168674698795, 0.5662650602409639, 0.5301204819277109, 0.5542168674698795, 0.5421686746987951, 0.5301204819277109, 0.5421686746987951, 0.5301204819277109]
0.5589022757697457


### 2. Try with an gap in between

In [69]:
a = list(df_work.columns)
index = a.index('pt_position')
output =a[index:]
print("Number of outputs: {}".format(len(output)))
print(output)

Number of outputs: 18
['pt_position', 'pe_position', 'cl_position', 'uy_position', 'co_position', 'do_position', 'ar_position', 'pa_position', 'gt_position', 'hn_position', 'mx_position', 'sv_position', 'py_position', 'cr_position', 'bo_position', 'es_position', 'global_position', 'ec_position']


In [68]:
features = a[:index]
print("Number of features: {}".format(len(features)))
print(features)

Number of features: 110
['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'valence', 'International', 'Unknown', 'electronic', 'folk', 'hip hop', 'house', 'indie', 'latino', 'metal', 'pop', 'punk', 'r&b', 'rap', 'rock', 'mean_timbre1', 'median_timbre1', 'std_timbre1', 'min_timbre1', 'max_timbre1', 'range_timbre1', '80Percentile_timbre1', 'mean_timbre2', 'median_timbre2', 'std_timbre2', 'min_timbre2', 'max_timbre2', 'range_timbre2', '80Percentile_timbre2', 'mean_timbre3', 'median_timbre3', 'std_timbre3', 'min_timbre3', 'max_timbre3', 'range_timbre3', '80Percentile_timbre3', 'mean_timbre4', 'median_timbre4', 'std_timbre4', 'min_timbre4', 'max_timbre4', 'range_timbre4', '80Percentile_timbre4', 'mean_timbre5', 'median_timbre5', 'std_timbre5', 'min_timbre5', 'max_timbre5', 'range_timbre5', '80Percentile_timbre5', 'mean_timbre6', 'median_timbre6', 'std_timbre6', 'min_timbre6', 'max_timbre6', 'range_tim

In [163]:
# Get binary values
gapSize = 0.2
binSize = (1-gapSize)/2.0
print(binSize)
testAccuracy = list()
for i in range(len(output)):
    print('Performs prediction at Country: {}'.format(output[i]))
    
    df_test = df_work.copy()
    df_test[output[i]] = pd.qcut(df_test[output[i]], [0, 0.3, 0.7,1], duplicates ='raise', labels=False)
    df_test = df_test[df_test[output[i]]!=1]
    df_test[output[i]] = df_test[output[i]].replace(2,1)
    print('Reduced data set to {} from {}:'.format(df_test.shape[0], df.shape[0]))

    X = df_test[features]
    X = normalize(X)
    Y = df_test[output[i]]
    x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.15, random_state=100)
    bestModel = performsMLModelComparison(x_train, x_test, y_train, y_test)
    testAccuracy.append(bestModel.score(x_test, y_test))
    

0.4
Performs prediction at Country: pt_position
Reduced data set to 330 from 549:


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.56
AUC Score: 0.5
Confusion Matrix
    0  1
0  28  0
1  22  0
Performs prediction at Country: pe_position
Reduced data set to 330 from 549:


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.62
AUC Score: 0.5
Confusion Matrix
    0  1
0  31  0
1  19  0
Performs prediction at Country: cl_position
Reduced data set to 332 from 549:


  if diff:
  if diff:
  if diff:


Gradient Boosting is the best model
Test Accuracy: 0.46
AUC Score: 0.46223316912972084
Confusion Matrix
    0   1
0  10  11
1  16  13
Performs prediction at Country: uy_position
Reduced data set to 332 from 549:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.5
AUC Score: 0.5273752012882448
Confusion Matrix
    0  1
0  20  3
1  22  5
Performs prediction at Country: co_position
Reduced data set to 330 from 549:


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.62
AUC Score: 0.5
Confusion Matrix
    0  1
0  31  0
1  19  0
Performs prediction at Country: do_position
Reduced data set to 330 from 549:


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.62
AUC Score: 0.5
Confusion Matrix
    0  1
0  31  0
1  19  0
Performs prediction at Country: ar_position
Reduced data set to 328 from 549:


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.58
AUC Score: 0.5
Confusion Matrix
    0  1
0  29  0
1  21  0
Performs prediction at Country: pa_position
Reduced data set to 330 from 549:


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.64
AUC Score: 0.5779967159277505
Confusion Matrix
    0  1
0  28  1
1  17  4
Performs prediction at Country: gt_position
Reduced data set to 330 from 549:


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.5
AUC Score: 0.4561688311688311
Confusion Matrix
    0  1
0  23  5
1  20  2
Performs prediction at Country: hn_position
Reduced data set to 330 from 549:


  if diff:
  if diff:
  if diff:


KN Neighbors is the best model
Test Accuracy: 0.52
AUC Score: 0.5160256410256411
Confusion Matrix
    0   1
0  16  10
1  14  10
Performs prediction at Country: mx_position
Reduced data set to 328 from 549:


  if diff:
  if diff:
  if diff:


KN Neighbors is the best model
Test Accuracy: 0.54
AUC Score: 0.5289855072463768
Confusion Matrix
    0  1
0  18  9
1  14  9
Performs prediction at Country: sv_position
Reduced data set to 331 from 549:


  if diff:
  if diff:
  if diff:


Random Forest is the best model
Test Accuracy: 0.48
AUC Score: 0.487012987012987
Confusion Matrix
    0   1
0  12  16
1  10  12
Performs prediction at Country: py_position
Reduced data set to 330 from 549:


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.58
AUC Score: 0.5
Confusion Matrix
    0  1
0  27  3
1  18  2
Performs prediction at Country: cr_position
Reduced data set to 328 from 549:


  if diff:
  if diff:
  if diff:


Random Forest is the best model
Test Accuracy: 0.52
AUC Score: 0.5314091680814941
Confusion Matrix
    0   1
0  15  16
1   8  11
Performs prediction at Country: bo_position
Reduced data set to 332 from 549:


  if diff:
  if diff:
  if diff:


Gradient Boosting is the best model
Test Accuracy: 0.62
AUC Score: 0.6288244766505636
Confusion Matrix
    0   1
0  14  13
1   6  17
Performs prediction at Country: es_position
Reduced data set to 331 from 549:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.5
AUC Score: 0.4376026272577997
Confusion Matrix
    0  1
0  24  5
1  20  1
Performs prediction at Country: global_position
Reduced data set to 331 from 549:


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.54
AUC Score: 0.5
Confusion Matrix
   0   1
0  0  23
1  0  27
Performs prediction at Country: ec_position
Reduced data set to 330 from 549:


  if diff:
  if diff:
  if diff:


Perceptron is the best model
Test Accuracy: 0.56
AUC Score: 0.5
Confusion Matrix
    0  1
0  28  0
1  22  0


In [165]:
np.mean(testAccuracy)

0.5533333333333332

In [171]:
testAccuracy

[0.56,
 0.62,
 0.46,
 0.5,
 0.62,
 0.62,
 0.58,
 0.64,
 0.5,
 0.52,
 0.54,
 0.48,
 0.58,
 0.52,
 0.62,
 0.5,
 0.54,
 0.56]

### 3. Feature Importance

#### a) Are those the same for all outputs?

In [166]:
Y = df_work[output]
X = df_work[features]
X_norm = normalize(X)

In [167]:
for i in range(Y.shape[1]):
    Y_curr = pd.qcut(Y.iloc[:, i], 2, labels= False)
    x_train, x_test, y_train, y_test = train_test_split(X,Y_curr, test_size=0.15, random_state=100)

    featureRanking = featureImportance(x_train, y_train, features) # List
    features_sorted = list()
    for j in range(len(featureRanking)):
        features_sorted.append(featureRanking[j][0])
    print('Top 5 features for output {}'.format(i))
    print(features_sorted[:6])

Top 5 features for output 0
['min_timbre10', 'min_timbre9', 'range_timbre10', 'max_timbre3', 'range_timbre7', 'min_timbre8']
Top 5 features for output 1
['max_timbre10', 'range_timbre10', 'range_timbre2', '80Percentile_timbre9', 'min_timbre6', 'median_timbre7']
Top 5 features for output 2
['max_timbre10', 'max_timbre2', 'range_timbre2', 'min_timbre12', 'max_timbre1', 'min_timbre10']
Top 5 features for output 3
['range_timbre2', 'max_timbre10', 'max_timbre9', 'range_timbre10', 'median_timbre9', 'min_timbre6']
Top 5 features for output 4
['range_timbre10', 'max_timbre10', 'max_timbre1', 'tempo', 'min_timbre2', 'range_timbre9']
Top 5 features for output 5
['max_timbre10', 'max_timbre9', 'min_timbre6', 'max_timbre4', 'valence', 'range_timbre2']
Top 5 features for output 6
['max_timbre2', '80Percentile_timbre5', 'range_timbre7', 'max_timbre10', 'mean_timbre4', 'max_timbre1']
Top 5 features for output 7
['min_timbre11', 'min_timbre6', 'range_timbre6', 'min_timbre12', 'std_timbre2', 'range_ti

#### b)Try to reduce features

In [169]:
features = features_sorted[:]
print(features)

['min_timbre6', 'max_timbre10', 'max_timbre1', 'range_timbre1', 'median_timbre7', '80Percentile_timbre5', 'range_timbre2', 'range_timbre6', 'mean_timbre1', 'liveness', 'loudness', 'max_timbre5', '80Percentile_timbre7', 'range_timbre10', 'tempo', 'median_timbre1', 'range_timbre7', 'mean_timbre9', 'min_timbre2', 'mean_timbre7', '80Percentile_timbre9', 'min_timbre12', 'min_timbre11', 'std_timbre1', 'std_timbre2', 'danceability', 'range_timbre9', 'max_timbre9', 'median_timbre8', 'median_timbre10', 'max_timbre4', 'range_timbre4', 'range_timbre11', 'median_timbre9', 'mean_timbre6', 'median_timbre3', 'min_timbre4', 'std_timbre5', '80Percentile_timbre11', 'min_timbre3', 'min_timbre5', 'mean_timbre8', 'mean_timbre10', 'std_timbre9', 'mean_timbre2', 'std_timbre8', 'mean_timbre12', 'median_timbre5', 'range_timbre8', 'mean_timbre4', '80Percentile_timbre6', 'max_timbre3', 'median_timbre12', 'mean_timbre11', 'range_timbre12', 'range_timbre5', 'mean_timbre5', 'max_timbre2', 'min_timbre8', 'speechines

In [144]:
testAccuracy = list()
Y = df_work[output]
X = df_work[features]
X_norm = normalize(X)
model = RandomForestClassifier(n_estimators=190)
for i in range(Y.shape[1]):
    Y_curr = pd.qcut(Y.iloc[:, i], 2, labels= False)

    print('Performs prediction at Country: {}'.format(Y_test.columns[i]))
    x_train, x_test, y_train, y_test = train_test_split(X_norm,Y_curr, test_size=0.15, random_state=100)
    bestModel = runsModel(x_train, x_test, y_train, y_test, model)
    testAccuracy.append(bestModel.score(x_test, y_test))

Performs prediction at Country: pt_position
Test Accuracy: 0.5180722891566265
Training Accuracy: 0.9978540772532188
AUC Score: 0.5130813953488372
Confusion Matrix
    0   1
0  28  15
1  25  15
Performs prediction at Country: pe_position
Test Accuracy: 0.3614457831325301
Training Accuracy: 0.9978540772532188
AUC Score: 0.36914765906362546
Confusion Matrix
    0   1
0  16  33
1  20  14
Performs prediction at Country: cl_position
Test Accuracy: 0.5180722891566265
Training Accuracy: 0.9978540772532188
AUC Score: 0.5228070175438597
Confusion Matrix
    0   1
0  21  24
1  16  22
Performs prediction at Country: uy_position
Test Accuracy: 0.4819277108433735
Training Accuracy: 0.9978540772532188
AUC Score: 0.5095238095238096
Confusion Matrix
    0   1
0  16  32
1  11  24
Performs prediction at Country: co_position
Test Accuracy: 0.4939759036144578
Training Accuracy: 0.9978540772532188
AUC Score: 0.5005847953216375
Confusion Matrix
    0   1
0  19  26
1  16  22
Performs prediction at Country: do

In [145]:
np.mean(testAccuracy)

0.4899598393574297

In [None]:
testAccuac