In [None]:
#https://www.kaggle.com/huanntran100/spotify-song-popularity-prediction/data

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
from scipy import stats
import plotly
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import plotnine
from plotnine import *
import itertools
import math
from prettytable import PrettyTable
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import plot_roc_curve, balanced_accuracy_score
from xgboost import XGBClassifier
import xgboost as xgb
from scipy.spatial import distance
from imblearn.over_sampling import SMOTE
import copy
import warnings
# Silence every Python warning for the rest of the session so library
# warnings do not clutter the cell outputs.
# NOTE(review): this also hides warnings that may matter (e.g. from pandas or
# scikit-learn) - consider narrowing the filter.
warnings.filterwarnings("ignore")
# Initialise plotly's offline notebook mode.
plotly.offline.init_notebook_mode(connected = True)
# Show matplotlib figures inline in the notebook output.
%matplotlib inline
# Global seaborn look: "ticks" style with the "talk" plotting context.
sns.set(style = "ticks", context = "talk")

In [None]:
# Location of the Spotify features CSV read by this notebook.
DATA_PATH = "/content/SpotifyFeatures.csv"

data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Opera,Giuseppe Verdi,"Stiffelio, Act III: Ei fugge! … Lina, pensai c...",7EsKYeHtTc4H4xWiTqSVZA,21,0.986,0.313,490867,0.231,0.000431,C#,0.0964,-14.287,Major,0.0547,86.001,4/4,0.0886
1,Opera,Giacomo Puccini,Madama Butterfly / Act 1: ... E soffitto e pareti,7MfmRBvqaW0I6UTxXnad8p,18,0.972,0.36,176797,0.201,0.028,D#,0.133,-19.794,Major,0.0581,131.798,4/4,0.369
2,Opera,Giacomo Puccini,"Turandot / Act 2: Gloria, gloria, o vincitore",7pBo1GDhIysyUMFXiDVoON,10,0.935,0.168,266184,0.47,0.0204,C,0.363,-8.415,Major,0.0383,75.126,3/4,0.0696
3,Opera,Giuseppe Verdi,"Rigoletto, Act IV: Venti scudi hai tu detto?",02mvYZX5aKNzdqEo6jF20m,17,0.961,0.25,288573,0.00605,0.0,D,0.12,-33.44,Major,0.048,76.493,4/4,0.038
4,Opera,Giuseppe Verdi,"Don Carlo / Act 4: ""Ella giammai m'amò!""",03TW0jwGMGhUabAjOpB1T9,19,0.985,0.142,629760,0.058,0.146,D,0.0969,-23.625,Major,0.0493,172.935,4/4,0.0382


# Feature Engineering

### Drop unneeded data

In [None]:
# The data set also contains podcasts, so discard tracks that are too long or
# too short: keep only rows whose duration z-score is within 3 in absolute value.
duration_z = stats.zscore(data['duration_ms'])
data = data[np.abs(duration_z) <= 3]

# Remove the columns that are not used as predictors:
#   instrumentalness                  - most values are close to 0
#   time_signature                    - most values are "4/4"
#   energy                            - energy and loudness are highly correlated,
#                                       and loudness is more correlated with popularity
#   artist_name, track_name, track_id - other features that are not needed
data = data.drop(
    columns=[
        "instrumentalness",
        "time_signature",
        "energy",
        "artist_name",
        "track_name",
        "track_id",
    ]
)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 225869 entries, 0 to 228158
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   genre         225869 non-null  object 
 1   popularity    225869 non-null  int64  
 2   acousticness  225869 non-null  float64
 3   danceability  225869 non-null  float64
 4   duration_ms   225869 non-null  int64  
 5   key           225869 non-null  object 
 6   liveness      225869 non-null  float64
 7   loudness      225869 non-null  float64
 8   mode          225869 non-null  object 
 9   speechiness   225869 non-null  float64
 10  tempo         225869 non-null  float64
 11  valence       225869 non-null  float64
dtypes: float64(7), int64(2), object(3)
memory usage: 22.4+ MB


### Normalize Numeric features

In [None]:
# Standardize every numeric predictor in place; the target column `popularity`
# is excluded and therefore keeps its original scale.
# NOTE(review): the scaler is fitted on all rows here, i.e. before any
# train/test split is made further down in the notebook.
num_data = data.select_dtypes(exclude=['object']).drop(columns=['popularity'])

sc = StandardScaler()
# overwrite the numeric columns with their standardized values
data[num_data.columns] = sc.fit_transform(num_data)
data

Unnamed: 0,genre,popularity,acousticness,danceability,duration_ms,key,liveness,loudness,mode,speechiness,tempo,valence
0,Opera,21,1.826793,-1.330878,3.351118,C#,-0.600486,-0.850499,Major,-0.363464,-1.027817,-1.407142
1,Opera,18,1.786747,-1.073710,-0.682017,D#,-0.413724,-1.786201,Major,-0.345010,0.463625,-0.306297
2,Opera,10,1.680909,-2.124269,0.465848,C,0.759918,0.147221,Major,-0.452474,-1.381976,-1.481735
3,Opera,17,1.755281,-1.675593,0.753356,D,-0.480060,-4.104812,Major,-0.399828,-1.337458,-1.605796
5,Opera,20,1.838235,-1.888988,1.345954,G#,-0.719891,-1.934364,Major,-0.370519,-1.177557,-1.597944
...,...,...,...,...,...,...,...,...,...,...,...,...
228154,Soundtrack,32,1.832514,-0.499185,-0.620172,F#,1.632495,-1.172141,Minor,-0.499693,-1.380446,-1.185716
228155,Soundtrack,36,1.452071,-2.709737,-0.691391,A,-0.623448,-2.171900,Major,-0.447047,-1.970906,-1.628959
228156,Soundtrack,30,-0.988803,-2.698247,1.234528,C,-0.602017,-0.104758,Minor,-0.466586,-1.613588,-1.584596
228157,Soundtrack,31,1.105953,-2.080496,0.102548,E,-0.622428,-1.071214,Minor,-0.464415,-0.147776,-1.614433


### Encode categorical features

In [None]:
# The categorical features (genre, key, mode) are nominal, so they are
# one-hot encoded; the numeric columns pass through unchanged.
categorical_cols = ["genre", "key", "mode"]
data_all_genre = pd.get_dummies(data, columns=categorical_cols)
data_all_genre

Unnamed: 0,popularity,acousticness,danceability,duration_ms,liveness,loudness,speechiness,tempo,valence,genre_A Capella,genre_Alternative,genre_Anime,genre_Blues,genre_Children’s Music,genre_Classical,genre_Comedy,genre_Country,genre_Dance,genre_Electronic,genre_Folk,genre_Hip-Hop,genre_Indie,genre_Jazz,genre_Movie,genre_Opera,genre_Pop,genre_R&B,genre_Rap,genre_Reggae,genre_Reggaeton,genre_Rock,genre_Ska,genre_Soul,genre_Soundtrack,genre_World,key_A,key_A#,key_B,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,key_G,key_G#,mode_Major,mode_Minor
0,21,1.826793,-1.330878,3.351118,-0.600486,-0.850499,-0.363464,-1.027817,-1.407142,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
1,18,1.786747,-1.073710,-0.682017,-0.413724,-1.786201,-0.345010,0.463625,-0.306297,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
2,10,1.680909,-2.124269,0.465848,0.759918,0.147221,-0.452474,-1.381976,-1.481735,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
3,17,1.755281,-1.675593,0.753356,-0.480060,-4.104812,-0.399828,-1.337458,-1.605796,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
5,20,1.838235,-1.888988,1.345954,-0.719891,-1.934364,-0.370519,-1.177557,-1.597944,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228154,32,1.832514,-0.499185,-0.620172,1.632495,-1.172141,-0.499693,-1.380446,-1.185716,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
228155,36,1.452071,-2.709737,-0.691391,-0.623448,-2.171900,-0.447047,-1.970906,-1.628959,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
228156,30,-0.988803,-2.698247,1.234528,-0.602017,-0.104758,-0.466586,-1.613588,-1.584596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
228157,31,1.105953,-2.080496,0.102548,-0.622428,-1.071214,-0.464415,-0.147776,-1.614433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1


# Modeling without subsetting by genre (i.e., includes genre as a feature)

### Train test split

In [None]:
# Predictors: every encoded column except the target.
X = data_all_genre.drop(columns=['popularity'])
# Target: popularity as an (n_samples, 1) column vector.
y = data_all_genre[['popularity']].to_numpy()

# Hold out 25% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

### Linear Regression

In [None]:
# Linear-regression baseline on the all-genre split.
# The two "accuracy" figures printed below are computed with r2_score.
reg = LinearRegression()
reg.fit(X_train, y_train)

# Fit on the training split itself.
y_pred_train = reg.predict(X_train)
print("Training accuracy: ", r2_score(y_train, y_pred_train))

# 10-fold cross-validation restricted to the training split.
scores = cross_val_score(reg, X_train, y_train, cv=10)
print("Mean cross-validation score: ", scores.mean())

# Performance on the held-out split.
y_pred_test = reg.predict(X_test)
print("Testing accuracy: ", r2_score(y_test, y_pred_test))

Training accuracy:  0.6976818730340925
Mean cross-validation score:  0.6974866483175111
Testing accuracy:  0.694708285182922


### Support Vector Regression

In [None]:
# NOTE(review): the linear-kernel SVR experiment below is disabled - every
# line is commented out, so this cell does nothing when run. The reason is not
# recorded; presumably fitting time on the full training split - TODO confirm,
# then either restore the experiment or remove the cell.
# svr_lin = SVR(kernel="linear", C=100).fit(X_train, y_train)
# y_pred_train = svr_lin.predict(X_train)
# y_pred_test = svr_lin.predict(X_test)
# print("Training accuracy is: ", r2_score(y_train, y_pred_train))
# print("Testing accuracy is: ", r2_score(y_test, y_pred_test))


In [None]:
# XGBoost regressor (library defaults) on the all-genre split.
# The figures printed are R^2 values: the regressor's own `score` method and
# the default scorer used by cross_val_score for a regressor.
xgbr = xgb.XGBRegressor().fit(X_train, y_train)

# Fit on the training split itself.
score = xgbr.score(X_train, y_train)
print("Training accuracy: ", score)

# 10-fold cross-validation restricted to the training split.
scores = cross_val_score(xgbr, X_train, y_train,cv=10)
print("Mean cross-validation score: ", scores.mean())

# Also score the held-out split, so this model is reported with the same three
# figures (train / cross-validation / test) as the linear regression.
print("Testing accuracy: ", xgbr.score(X_test, y_test))

Training accuracy is:  0.6784416129038908
Mean cross-validation score: 0.68


# Modeling with subsetting by genre

In [None]:
# Subset by genre
genres = data.genre.unique()

# Map each genre name to the DataFrame holding only that genre's rows.
# One groupby pass replaces a full-frame boolean scan per genre, and no
# placeholder values are needed. sort=False keeps the genres in order of first
# appearance - the same order as data.genre.unique() - so the printed keys
# are unchanged.
genre_subset_dict = {
    genre: subset for genre, subset in data.groupby("genre", sort=False)
}

print(genre_subset_dict.keys())

dict_keys(['Opera', 'A Capella', 'Alternative', 'Blues', 'Dance', 'Pop', 'Electronic', 'R&B', 'Children’s Music', 'Folk', 'Anime', 'Rap', 'Classical', 'Reggae', 'Hip-Hop', 'Comedy', 'Country', 'Reggaeton', 'Ska', 'Indie', 'Rock', 'Soul', 'Soundtrack', 'Jazz', 'World', 'Movie'])


In [None]:
# Sanity check: inspect the rows stored for a single genre ('Rap').
genre_subset_dict['Rap']

Unnamed: 0,genre,popularity,acousticness,danceability,duration_ms,key,liveness,loudness,mode,speechiness,tempo,valence
68784,Rap,99,-0.527379,1.514387,-1.032295,B,-0.577013,0.319851,Minor,0.305744,-0.573646,-0.243482
68785,Rap,96,-0.567426,1.536274,-0.209488,G#,0.290461,-0.413147,Major,0.837636,0.209151,0.062745
68786,Rap,97,0.596788,1.114956,-0.922885,D,-0.733669,0.629939,Major,-0.407426,-0.900483,1.829435
68787,Rap,94,-0.978963,1.519859,1.064725,G#,-0.459649,0.945975,Major,0.544552,1.219490,-0.003997
68788,Rap,95,-0.252773,1.820801,-0.634182,E,0.193508,0.495539,Minor,-0.169160,-1.027752,0.380749
...,...,...,...,...,...,...,...,...,...,...,...,...
103764,Rap,51,-0.856620,0.983636,-0.055145,A,-0.240229,0.268708,Minor,-0.282052,0.275000,0.129486
103765,Rap,58,-0.763941,-0.012207,1.800710,G,-0.326976,1.004934,Major,2.069672,1.780446,0.789051
103766,Rap,49,-0.687566,0.994579,-0.637123,C,-0.638757,-0.142478,Major,-0.126283,-1.384777,0.785125
103767,Rap,51,-0.976560,1.060239,-0.787189,F#,0.831357,0.395971,Minor,1.423803,-1.371653,-0.035405


In [None]:
# One-hot encode all subsets.
# Encoding a subset on its own only creates indicator columns for the key/mode
# values that actually occur in that genre, so different genres could end up
# with different feature columns. The column layout is therefore taken from
# the full data set and imposed on every subset; a level that is absent from a
# genre becomes an all-zero column.
encoded_columns = pd.get_dummies(data.drop(columns=["genre"])).columns

for genre, subset in genre_subset_dict.items():
    # errors="ignore" keeps the cell re-runnable once "genre" has been dropped.
    features = pd.get_dummies(subset.drop(columns=["genre"], errors="ignore"))
    genre_subset_dict[genre] = features.reindex(columns=encoded_columns, fill_value=0)

In [None]:
# Inspect the 'Rap' subset again after encoding: the genre column has been
# dropped and key/mode are now key_* / mode_* indicator columns.
genre_subset_dict['Rap']

Unnamed: 0,popularity,acousticness,danceability,duration_ms,liveness,loudness,speechiness,tempo,valence,key_A,key_A#,key_B,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,key_G,key_G#,mode_Major,mode_Minor
68784,99,-0.527379,1.514387,-1.032295,-0.577013,0.319851,0.305744,-0.573646,-0.243482,0,0,1,0,0,0,0,0,0,0,0,0,0,1
68785,96,-0.567426,1.536274,-0.209488,0.290461,-0.413147,0.837636,0.209151,0.062745,0,0,0,0,0,0,0,0,0,0,0,1,1,0
68786,97,0.596788,1.114956,-0.922885,-0.733669,0.629939,-0.407426,-0.900483,1.829435,0,0,0,0,0,1,0,0,0,0,0,0,1,0
68787,94,-0.978963,1.519859,1.064725,-0.459649,0.945975,0.544552,1.219490,-0.003997,0,0,0,0,0,0,0,0,0,0,0,1,1,0
68788,95,-0.252773,1.820801,-0.634182,0.193508,0.495539,-0.169160,-1.027752,0.380749,0,0,0,0,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103764,51,-0.856620,0.983636,-0.055145,-0.240229,0.268708,-0.282052,0.275000,0.129486,1,0,0,0,0,0,0,0,0,0,0,0,0,1
103765,58,-0.763941,-0.012207,1.800710,-0.326976,1.004934,2.069672,1.780446,0.789051,0,0,0,0,0,0,0,0,0,0,1,0,1,0
103766,49,-0.687566,0.994579,-0.637123,-0.638757,-0.142478,-0.126283,-1.384777,0.785125,0,0,0,1,0,0,0,0,0,0,0,0,1,0
103767,51,-0.976560,1.060239,-0.787189,0.831357,0.395971,1.423803,-1.371653,-0.035405,0,0,0,0,0,0,0,0,0,1,0,0,0,1


## Modeling   

In [None]:
# Model a single genre: predictors and target come from the encoded 'Rap' subset.
rap_subset = genre_subset_dict['Rap']
X = rap_subset.drop(columns=['popularity'])
# Target: popularity as an (n_samples, 1) column vector.
y = rap_subset[['popularity']].to_numpy()

# Hold out 25% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Linear regression on the current single-genre split.
# The two "accuracy" figures printed below are computed with r2_score.
reg = LinearRegression()
reg.fit(X_train, y_train)

# Fit on the training split itself.
y_pred_train = reg.predict(X_train)
print("Training accuracy: ", r2_score(y_train, y_pred_train))

# 10-fold cross-validation restricted to the training split.
scores = cross_val_score(reg, X_train, y_train, cv=10)
print("Mean cross-validation score: ", scores.mean())

# Performance on the held-out split.
y_pred_test = reg.predict(X_test)
print("Testing accuracy: ", r2_score(y_test, y_pred_test))

Training accuracy:  0.022769681967641175
Mean cross-validation score:  0.014157098415342628
Testing accuracy:  0.01269154032148978


In [None]:
# XGBoost regressor (library defaults) on the current single-genre split.
# The figures printed are R^2 values: the regressor's own `score` method and
# the default scorer used by cross_val_score for a regressor.
xgbr = xgb.XGBRegressor().fit(X_train, y_train)

# Fit on the training split itself.
score = xgbr.score(X_train, y_train)
print("Training accuracy: ", score)

# 10-fold cross-validation restricted to the training split.
scores = cross_val_score(xgbr, X_train, y_train,cv=10)
print("Mean cross-validation score: ", scores.mean())

# Also score the held-out split, so this model is reported with the same three
# figures (train / cross-validation / test) as the linear regression.
print("Testing accuracy: ", xgbr.score(X_test, y_test))

Training accuracy:  0.11132158639299594
Mean cross-validation score:  0.01195860039118717


In [None]:
# Model a single genre: predictors and target come from the encoded 'Alternative' subset.
alternative_subset = genre_subset_dict['Alternative']
X = alternative_subset.drop(columns=['popularity'])
# Target: popularity as an (n_samples, 1) column vector.
y = alternative_subset[['popularity']].to_numpy()

# Hold out 25% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Linear regression on the current single-genre split.
# The two "accuracy" figures printed below are computed with r2_score.
reg = LinearRegression()
reg.fit(X_train, y_train)

# Fit on the training split itself.
y_pred_train = reg.predict(X_train)
print("Training accuracy: ", r2_score(y_train, y_pred_train))

# 10-fold cross-validation restricted to the training split.
scores = cross_val_score(reg, X_train, y_train, cv=10)
print("Mean cross-validation score: ", scores.mean())

# Performance on the held-out split.
y_pred_test = reg.predict(X_test)
print("Testing accuracy: ", r2_score(y_test, y_pred_test))

Training accuracy:  0.01706966807332877
Mean cross-validation score:  0.011058275907117299
Testing accuracy:  0.005164688703012343


In [None]:
# XGBoost regressor (library defaults) on the current single-genre split.
# The figures printed are R^2 values: the regressor's own `score` method and
# the default scorer used by cross_val_score for a regressor.
xgbr = xgb.XGBRegressor().fit(X_train, y_train)

# Fit on the training split itself.
score = xgbr.score(X_train, y_train)
print("Training accuracy: ", score)

# 10-fold cross-validation restricted to the training split.
scores = cross_val_score(xgbr, X_train, y_train,cv=10)
print("Mean cross-validation score: ", scores.mean())

# Also score the held-out split, so this model is reported with the same three
# figures (train / cross-validation / test) as the linear regression.
print("Testing accuracy: ", xgbr.score(X_test, y_test))

Training accuracy:  0.1080608173699763
Mean cross-validation score:  0.001309703810595142
