# Testing Basic ML Models
**Authors:** Martin Ziran Xu

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

## Cleaning Data

In [3]:
#Read Data
# Can't read 1st data set
df_countries = pd.read_csv('combined_all_countries_2.csv')
df_countries.head()

Unnamed: 0.1,Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,...,pt_100,ca_100,co_100,nz_100,tr_100,cz_100,hn_100,be_100,id_100,bo_100
0,0,0.581,https://api.spotify.com/v1/audio-analysis/7qiZ...,0.825,233713,0.652,True,0.0,1,0.0931,...,True,True,True,True,True,True,True,True,True,True
1,1,0.00902,https://api.spotify.com/v1/audio-analysis/12VW...,0.785,173987,0.617,True,0.00246,1,0.351,...,True,True,True,True,True,True,True,True,False,True
2,2,0.415,https://api.spotify.com/v1/audio-analysis/7crM...,0.736,245507,0.541,True,0.0,8,0.11,...,True,True,True,True,True,True,True,True,True,True
3,3,0.00346,https://api.spotify.com/v1/audio-analysis/4vS8...,0.723,176561,0.809,True,0.00123,7,0.565,...,True,False,True,True,True,True,False,True,False,False
4,4,0.474,https://api.spotify.com/v1/audio-analysis/34gC...,0.781,281560,0.445,True,0.0,2,0.184,...,True,True,True,True,False,True,True,True,True,True


In [4]:
features = list(df_countries.columns)
print(features)

['Unnamed: 0', 'acousticness', 'analysis_url', 'danceability', 'duration_ms', 'energy', 'id', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'track_href', 'type', 'uri', 'valence', 'popularity', 'song', 'artist', 'genre', 'sv', 'cr', 'de', 'co', 'dk', 'at', 'pa', 'es', 'gb', 'sk', 'global', 'nl', 'br', 'cl', 'ec', 'lt', 'us', 'it', 'be', 'lu', 'gr', 'mx', 'uy', 'ca', 'jp', 'sg', 'tr', 'cz', 'lv', 'pe', 'ch', 'se', 'ar', 'gt', 'py', 'pl', 'ph', 'nz', 'fr', 'fi', 'hn', 'ie', 'do', 'no', 'hu', 'pt', 'is', 'bo', 'au', 'hk', 'tw', 'my', 'ee', 'py_top', 'lt_top', 'au_top', 'se_top', 'ph_top', 'ee_top', 'gt_top', 'my_top', 'gb_top', 'cr_top', 'ec_top', 'ar_top', 'is_top', 'global_top', 'uy_top', 'nl_top', 'us_top', 'sk_top', 'do_top', 'de_top', 'hu_top', 'gr_top', 'pl_top', 'cl_top', 'at_top', 'tw_top', 'fr_top', 'pe_top', 'mx_top', 'no_top', 'sg_top', 'it_top', 'sv_top', 'lv_top', 'hk_top', 'es_top', 'pa_top', 'lu_top', 'br_top', 'ch_top'

In [5]:
# Dataset with one country (global) and overall popularity
df = df_countries.loc[:, ['song', 'analysis_url','track_href','uri','artist','acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'valence','genre','global', 'global_100', 'global_top', 'popularity']]
df.head()

Unnamed: 0,song,analysis_url,track_href,uri,artist,acousticness,danceability,duration_ms,energy,instrumentalness,...,loudness,mode,speechiness,tempo,valence,genre,global,global_100,global_top,popularity
0,Shape of You,https://api.spotify.com/v1/audio-analysis/7qiZ...,https://api.spotify.com/v1/tracks/7qiZfU4dY1lW...,spotify:track:7qiZfU4dY1lWllzX7mPBI3,Ed Sheeran,0.581,0.825,233713,0.652,0.0,...,-3.183,0,0.0802,95.977,0.931,pop,True,True,True,92
1,One Dance,https://api.spotify.com/v1/audio-analysis/12VW...,https://api.spotify.com/v1/tracks/12VWzyPDBCc8...,spotify:track:12VWzyPDBCc8fqeWCAfNwR,Drake,0.00902,0.785,173987,0.617,0.00246,...,-5.871,1,0.0522,103.981,0.382,hip hop,True,True,False,74
2,Closer,https://api.spotify.com/v1/audio-analysis/7crM...,https://api.spotify.com/v1/tracks/7crMiinWx373...,spotify:track:7crMiinWx373rNBZBaVske,The Chainsmokers,0.415,0.736,245507,0.541,0.0,...,-5.597,1,0.0297,94.962,0.662,house,True,True,False,65
3,Lean On (feat. MØ & DJ Snake),https://api.spotify.com/v1/audio-analysis/4vS8...,https://api.spotify.com/v1/tracks/4vS8VaBwJJV5...,spotify:track:4vS8VaBwJJV5Ry7UFIQuoo,Major Lazer,0.00346,0.723,176561,0.809,0.00123,...,-3.081,0,0.0625,98.007,0.274,electronic,True,True,False,20
4,Thinking Out Loud,https://api.spotify.com/v1/audio-analysis/34gC...,https://api.spotify.com/v1/tracks/34gCuhDGsG4b...,spotify:track:34gCuhDGsG4bRPIf9bb02f,Ed Sheeran,0.474,0.781,281560,0.445,0.0,...,-6.061,1,0.0295,78.998,0.591,pop,True,True,False,87


In [6]:
# clean genre: International
list_genre = pd.unique(df['genre'])
print(list_genre)
df['genre'] = df['genre']\
.replace(['French', 'German', 'Korean'], 'International')
print(pd.unique(df['genre']))

['pop' 'hip hop' 'house' 'electronic' 'latino' 'rap' 'punk' 'Unknown'
 'folk' 'r&b' 'indie' 'rock' 'French' 'German' 'metal' 'Korean']
['pop' 'hip hop' 'house' 'electronic' 'latino' 'rap' 'punk' 'Unknown'
 'folk' 'r&b' 'indie' 'rock' 'International' 'metal']


In [7]:
# clean genre: NaN
df_unknown = df[df['genre']=='Unknown']
print(df_unknown.size)
print(df.size)

220
12078


In [8]:
# Genre Dummy mapping
binary_encoded = pd.get_dummies(df['genre'])
newcols = binary_encoded.columns
df[newcols] = binary_encoded


In [9]:
df = df.drop(['genre'], axis=1)

In [10]:
# Rearrange dataset
df.head()
print(df.columns)
new_col = ['song', 'analysis_url', 'track_href', 'uri', 'artist', 'acousticness',
       'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'valence', 'International',
       'Unknown', 'electronic', 'folk', 'hip hop', 'house', 'indie', 'latino',
       'metal', 'pop', 'punk', 'r&b', 'rap', 'rock',
       'global', 'global_100', 'global_top', 'popularity']
df = df[new_col]
df.head()

Index(['song', 'analysis_url', 'track_href', 'uri', 'artist', 'acousticness',
       'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'valence',
       'global', 'global_100', 'global_top', 'popularity', 'International',
       'Unknown', 'electronic', 'folk', 'hip hop', 'house', 'indie', 'latino',
       'metal', 'pop', 'punk', 'r&b', 'rap', 'rock'],
      dtype='object')


Unnamed: 0,song,analysis_url,track_href,uri,artist,acousticness,danceability,duration_ms,energy,instrumentalness,...,metal,pop,punk,r&b,rap,rock,global,global_100,global_top,popularity
0,Shape of You,https://api.spotify.com/v1/audio-analysis/7qiZ...,https://api.spotify.com/v1/tracks/7qiZfU4dY1lW...,spotify:track:7qiZfU4dY1lWllzX7mPBI3,Ed Sheeran,0.581,0.825,233713,0.652,0.0,...,0,1,0,0,0,0,True,True,True,92
1,One Dance,https://api.spotify.com/v1/audio-analysis/12VW...,https://api.spotify.com/v1/tracks/12VWzyPDBCc8...,spotify:track:12VWzyPDBCc8fqeWCAfNwR,Drake,0.00902,0.785,173987,0.617,0.00246,...,0,0,0,0,0,0,True,True,False,74
2,Closer,https://api.spotify.com/v1/audio-analysis/7crM...,https://api.spotify.com/v1/tracks/7crMiinWx373...,spotify:track:7crMiinWx373rNBZBaVske,The Chainsmokers,0.415,0.736,245507,0.541,0.0,...,0,0,0,0,0,0,True,True,False,65
3,Lean On (feat. MØ & DJ Snake),https://api.spotify.com/v1/audio-analysis/4vS8...,https://api.spotify.com/v1/tracks/4vS8VaBwJJV5...,spotify:track:4vS8VaBwJJV5Ry7UFIQuoo,Major Lazer,0.00346,0.723,176561,0.809,0.00123,...,0,0,0,0,0,0,True,True,False,20
4,Thinking Out Loud,https://api.spotify.com/v1/audio-analysis/34gC...,https://api.spotify.com/v1/tracks/34gCuhDGsG4b...,spotify:track:34gCuhDGsG4bRPIf9bb02f,Ed Sheeran,0.474,0.781,281560,0.445,0.0,...,0,1,0,0,0,0,True,True,False,87


In [16]:
df.to_csv('cleanGlobalPrediction1.csv')

In [None]:
# Clean Ranking
# not needed, since True==1 in Python

In [None]:
# Shuffle dataframe
from sklearn.utils import shuffle
df = shuffle(df)
df.head()

## Define Input and Output

In [None]:
print(df.columns)

In [None]:
# Define Input
X = df.loc[:,['acousticness',
       'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'valence',
       'International', 'Unknown', 'electronic', 'folk', 'hip hop', 'house',
       'indie', 'latino', 'metal', 'pop', 'punk', 'r&b', 'rap', 'rock']]
X.head()

In [None]:
# Define Output
Y = df.loc[:, ['global', 'global_100', 'global_top', 'popularity']]
Y.head()

## Analyze dataset

In [None]:
# Check distribution of data:
for i in range(0,3):
    #Value Counts
    print(Y.columns[i])
    print(Y.iloc[:,i].value_counts())

In [None]:
# Distribution of popularity
plt.figure()
Y.loc[:, 'popularity'].hist()
plt.show()

In [None]:
# Feature Correlation
X.loc[:,['acousticness',
       'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'valence']].corr()

In [None]:
# Create big data frame
df = pd.concat([X, Y], axis=1)
df.head()
df.popularity = pd.qcut(Y.loc[:, 'popularity'], 4, labels=False)

### Modifying Output

In [None]:
# Devide popularity into 4 quantiles
Y_popularity = pd.qcut(Y.loc[:, 'popularity'], 4, labels=False)
Y_popularity.head()
# Check if balanced data set
Y_popularity.value_counts() 

In [None]:
# Split in Regression and Classification
Y_reg = Y.loc[:,'popularity']
Y_class = Y.iloc[:,0:3]
print(Y_reg.head())
print(Y_class.head())

In [None]:
# Normalize Data
from sklearn.preprocessing import normalize
X = normalize(X)

## Perform Machine Learning

### a) Classification on the individual categories

In [None]:
#Binary Confusion Matrix function: y_true is a pd data frame, and y_pred an array
def binaryConfusionMatrix(y_true, y_pred):
    n = y_pred.size
    confusionMatrix = np.zeros([2,2])
    for i in range(n):
        confusionMatrix[int(y_true.iloc[i]), int(y_pred[i])] += 1
    return confusionMatrix

In [None]:
#Split Data
from sklearn.metrics import confusion_matrix
def printConfusionMatrix (y_true, y_pred):
    cf=pd.DataFrame(confusion_matrix(y_true, y_pred))
    print(cf)

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,Y_class, test_size=0.15, random_state=100)

In [None]:
print(x_train.shape)
print(y_train.shape)
y_train.iloc[:,0]
y_train.columns

In [None]:
# Perform Ridge Regression on charts: for all 3 columns
from sklearn.linear_model import RidgeClassifier
print('Ridge Classifier on regional charts: ')
for i in range(0,3):
    #Train
    y_train_curr = y_train.iloc[:,i]
    y_test_curr = y_test.iloc[:,i]
    ridgeModel = RidgeClassifier()
    ridgeModel.fit(x_train, y_train_curr)
    # Accuracy
    print(y_train.columns[i])
    print('Train Accuracy:')
    print(ridgeModel.score(x_train, y_train_curr))
    print('Test Accuracy:')
    print(ridgeModel.score(x_test, y_test_curr))
    print('Confusion Matrix')
    print(binaryConfusionMatrix(y_test_curr, ridgeModel.predict(x_test)))

In [None]:
# Perform Ridge Regression on popularity (with quantile bins):
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,Y_popularity, test_size=0.15, random_state=100)
print('Logistic Regression on overall popularity: ')
logRegModel = LogisticRegression()
logRegModel.fit(x_train, y_train)
# Accuracy
print('Train Accuracy:')
print(logRegModel.score(x_train, y_train))
print('Test Accuracy:')
print(logRegModel.score(x_test, y_test))

In [None]:
printConfusionMatrix(y_test, logRegModel.predict(x_test))

## b1) Perform Linear Regression on Popularity - very low accuracy

In [None]:
#Split Data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,Y_reg, test_size=0.15, random_state=100)

In [None]:
# Perform Linear Regression
from sklearn import linear_model
linreg_model = linear_model.LinearRegression()
print('Training a linear Regression Model...')
linreg_model.fit(x_train, y_train)

In [None]:
# Accuracy:
print('Linear Regression on song popularity')
print('Training Data')
print(linreg_model.score(x_train, y_train))
print('Testing Data')
print(linreg_model.score(x_test, y_test))

### b2) Ridge Regression on Popularity - Very weird accuracy

In [None]:
from sklearn.linear_model import Ridge
ridgeModel = Ridge(alpha = 1e-5)
ridgeModel.fit(x_train, y_train)

In [None]:
# Accuracy
print('Ridge Regression on song popularity')
print('Training Data')
print(ridgeModel.score(x_train, y_train))
print('Testing Data')
print(ridgeModel.score(x_test, y_test))

### c) Combine outputs y into a overall score

In [None]:
Y.head()

In [None]:
Y.loc[:, 'popularity'].hist()

In [None]:
for i in range(0,3):
    #Value Counts
    print(Y.columns[i])
    print(Y.iloc[:,i].value_counts())

In [None]:
# Define cost model:
a = 10
b = 20
c = 100

Y['successFactor'] = Y.loc[:,'global']*a+Y.global_100*b+Y.global_top*c+Y.popularity

In [None]:
Y.head()

In [None]:
Y.loc[:, 'successFactor'].hist()

In [None]:
# Create bins for successFactor
Y_sF = pd.qcut(Y.loc[:, 'successFactor'], 4, labels=False) # Doesnt change anything with normal cut
Y_sF.head()

In [None]:
Y_sF.value_counts()

In [None]:
#Perform logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,Y_sF, test_size=0.15, random_state=100)
print('Logistic Regression on success Factor: ')
logRegModel = LogisticRegression()
logRegModel.fit(x_train, y_train)
# Accuracy
print('Train Accuracy:')
print(logRegModel.score(x_train, y_train))
print('Test Accuracy:')
print(logRegModel.score(x_test, y_test))

In [None]:
printConfusionMatrix(y_test, logRegModel.predict(x_test))

### Try different models

In [None]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=140)
random_forest.fit(x_train, y_train)
acc_rf1 = random_forest.score(x_train, y_train)
acc_rf2 = random_forest.score(x_test, y_test)
print("Random Forest: ")
print("Training: ", acc_rf1)
print("Test: ", acc_rf2)

In [None]:
random_forest.feature_importances_

In [None]:
printConfusionMatrix(y_test, random_forest.predict(x_test))

In [None]:
#import packages
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron

import xgboost as xgb

In [None]:
x_train.shape

In [None]:
# Support Vector Machine
svc = SVC()
svc.fit(x_train, y_train)
acc_svc1 = round(svc.score(x_train, y_train) * 100, 2)
acc_svc2 = round(svc.score(x_test, y_test) * 100, 2)
print("Support Vector Machine: ")
print("Training: ", acc_svc1, "%")
print("Test: ", acc_svc2, "%")

In [None]:
printConfusionMatrix(y_test, svc(x_test))

In [None]:
#Perceptron
perceptron = Perceptron()
perceptron.fit(x_train, y_train)
acc_perceptron1 = round(perceptron.score(x_train, y_train) * 100, 2)
acc_perceptron2 = round(perceptron.score(x_test, y_test) * 100, 2)
print("Perceptron: ")
print("Training: ", acc_perceptron1, "%")
print("Test: ", acc_perceptron2, "%")
printConfusionMatrix(y_test, perceptron.predict(x_test))

In [None]:
# KNN
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(x_train, y_train)
acc_knn1 = round(knn.score(x_train, y_train) * 100, 2)
acc_knn2 = round(knn.score(x_test, y_test) * 100, 2)
print("K-Neighbour: ")
print("Training: ", acc_knn1, "%")
print("Test: ", acc_knn2, "%")
printConfusionMatrix(y_test, knn.predict(x_test))

In [None]:
#XGBoost
gradboost = xgb.XGBClassifier(n_estimators=110)
gradboost.fit(x_train, y_train)
acc_gb1 = round(gradboost.score(x_train, y_train) * 100, 2)
acc_gb2 = round(gradboost.score(x_test, y_test) * 100, 2)
print("XGBoost: ")
print("Training: ", acc_gb1, "%")
print("Test: ", acc_gb2, "%")
printConfusionMatrix(y_test, gradboost.predict(x_test))