# 5. Classification Modeling - Song Attributes

In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import time
import re

# bs4, nltk, and sklearn imports
from bs4 import BeautifulSoup   
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, BaggingClassifier, RandomForestRegressor, RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor
from sklearn.svm import SVC, SVR
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

#pd.set_option("display.max_rows", None, "display.max_columns", None)

In [2]:
giant_ordered_df = pd.read_csv('giant_ordered_df.csv')

In [3]:
giant_ordered_df.head()

Unnamed: 0.1,Unnamed: 0,artist,track_name,popularity,danceability,energy,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,acousticness,genre,track_id
0,0,24kGoldn,Mood (feat. Iann Dior),100.0,0.7,0.722,-3.558,0,0.0369,0.0,0.272,0.756,90.989,140526,4,0.221,,
1,1,DaBaby,ROCKSTAR (feat. Roddy Ricch),97.0,0.746,0.69,-7.956,1,0.164,0.0,0.101,0.497,89.977,181733,4,0.247,,
2,2,Justin Bieber,Holy (feat. Chance The Rapper),92.0,0.673,0.704,-8.056,1,0.36,0.0,0.0898,0.372,86.919,212093,4,0.196,,
3,3,Ariana Grande,Stuck with U (with Justin Bieber),91.0,0.597,0.45,-6.658,1,0.0418,0.0,0.382,0.537,178.765,228482,3,0.223,,
4,4,salem ilese,Mad at Disney,88.0,0.738,0.621,-7.313,1,0.0486,7e-06,0.692,0.715,113.968,136839,4,0.424,,


In [5]:
giant_ordered_df.drop(columns=['Unnamed: 0'], inplace = True)

In [7]:
giant_ordered_df.loc[giant_ordered_df['popularity'] < 75, 'popularity'] = 0 
giant_ordered_df.loc[giant_ordered_df['popularity'] >= 75, 'popularity'] = 1
giant_ordered_df.loc[giant_ordered_df['popularity'] == 1]

Unnamed: 0,artist,track_name,popularity,danceability,energy,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,acousticness,genre,track_id
0,24kGoldn,Mood (feat. Iann Dior),1.0,0.700,0.722,-3.558,0,0.0369,0.000000,0.2720,0.756,90.989,140526,4,0.22100,,
1,DaBaby,ROCKSTAR (feat. Roddy Ricch),1.0,0.746,0.690,-7.956,1,0.1640,0.000000,0.1010,0.497,89.977,181733,4,0.24700,,
2,Justin Bieber,Holy (feat. Chance The Rapper),1.0,0.673,0.704,-8.056,1,0.3600,0.000000,0.0898,0.372,86.919,212093,4,0.19600,,
3,Ariana Grande,Stuck with U (with Justin Bieber),1.0,0.597,0.450,-6.658,1,0.0418,0.000000,0.3820,0.537,178.765,228482,3,0.22300,,
4,salem ilese,Mad at Disney,1.0,0.738,0.621,-7.313,1,0.0486,0.000007,0.6920,0.715,113.968,136839,4,0.42400,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146777,Barry White,Let The Music Play - Single Version,1.0,0.546,0.794,-6.134,1,0.0577,0.000000,0.1740,0.774,99.354,210387,4/4,0.02280,Soul,38N1GZG7AMdFaXLRTbwFc4
146834,Mura Masa,What If I Go? (feat. Bonzai),1.0,0.560,0.533,-7.804,1,0.3880,0.000014,0.0919,0.908,200.066,195467,4/4,0.19400,Soul,2tHfNQnj50VoMZga2rpfdA
147063,Seinabo Sey,For You - Recorded At Spotify Studios Stockholm,1.0,0.409,0.379,-11.461,1,0.0329,0.027500,0.1010,0.080,117.946,205584,4/4,0.45300,Soul,4AHIgOApMmqVfpvc1hxK6x
147574,James Brown,Get Up (I Feel Like Being A) Sex Machine - Pts...,1.0,0.833,0.661,-7.595,0,0.1010,0.000149,0.2940,0.858,108.314,318800,4/4,0.27300,Soul,6hpmTwgNCz81H2bFEREx29


In [9]:
song_features = ['danceability','energy','loudness','speechiness',
                 'instrumentalness', 'liveness', 'valence', 'tempo',
                'mode','duration_ms','acousticness']

In [10]:
X = giant_ordered_df[song_features]
y = giant_ordered_df['popularity']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

In [12]:
ss = StandardScaler()

ss.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [13]:
ss.mean_

array([ 5.35325609e-01,  5.52412494e-01, -1.03837744e+01,  1.30586814e-01,
        1.81814032e-01,  2.28840493e-01,  4.48077030e-01,  1.16875990e+02,
        6.59435697e-01,  2.38089986e+05,  4.15185764e-01])

In [14]:
ss.scale_

array([1.92945673e-01, 2.80765585e-01, 6.61821203e+00, 2.09619451e-01,
       3.30021555e-01, 2.15407953e-01, 2.69815537e-01, 3.13832880e+01,
       4.73898996e-01, 1.35739670e+05, 3.70701445e-01])

In [15]:
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

### Logistic Regression

In [16]:
logreg = LogisticRegression()

In [17]:
logreg.fit(X_train_sc,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
print(f'Training Score: {logreg.score(X_train_sc, y_train)}')
print(f'Testing Score: {logreg.score(X_test_sc, y_test)}')

Training Score: 0.9181267662494954
Testing Score: 0.919364823038622


In [19]:
logregpred = logreg.predict(X)
logregtrainpred = logreg.predict(X_train)
logregtestpred = logreg.predict(X_test)

In [20]:
print('RMSE:', metrics.mean_squared_error(y, logregpred, squared=False))
print('Train RMSE:', metrics.mean_squared_error(y_train, logregtrainpred, squared=False))
print('Test RMSE:', metrics.mean_squared_error(y_test, logregtestpred, squared=False))

RMSE: 0.2855582844967111
Train RMSE: 0.28608795758032207
Test RMSE: 0.28396333735427537


In [53]:
list(zip(logreg.coef_,X_train.columns))

[(array([ 0.48014278, -0.45208466,  0.97244694, -0.10224009, -0.65194876,
         -0.2298196 , -0.26349343,  0.06483289, -0.04195999, -0.16050297,
         -0.25052967]),
  'danceability')]

### KNN

In [21]:
knn = KNeighborsClassifier()

In [22]:
knn.fit(X_train_sc,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [23]:
print(f'Training Score: {knn.score(X_train_sc, y_train)}')
print(f'Testing Score: {knn.score(X_test_sc, y_test)}')

Training Score: 0.9233212219082223
Testing Score: 0.9102139685102947


In [24]:
knnpred = knn.predict(X)
knntrainpred = knn.predict(X_train)
knntestpred = knn.predict(X_test)

In [25]:
print('RMSE:', metrics.mean_squared_error(y, knnpred, squared=False))
print('Train RMSE:', metrics.mean_squared_error(y_train, knntrainpred, squared=False))
print('Test RMSE:', metrics.mean_squared_error(y_test, knntestpred, squared=False))

RMSE: 0.2855582844967111
Train RMSE: 0.28608795758032207
Test RMSE: 0.28396333735427537


### Decision Tree

In [26]:
dt = DecisionTreeClassifier(random_state=42)

In [27]:
dt.fit(X_train_sc, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [28]:
print(f'Training Score: {dt.score(X_train_sc, y_train)}')
print(f'Testing Score: {dt.score(X_test_sc, y_test)}')

Training Score: 0.9996680572376979
Testing Score: 0.8595074687121518


In [29]:
dtpred = dt.predict(X)
dttrainpred = dt.predict(X_train)
dttestpred = dt.predict(X_test)

In [30]:
print('RMSE:', metrics.mean_squared_error(y, dtpred, squared=False))
print('Train RMSE:', metrics.mean_squared_error(y_train, dttrainpred, squared=False))
print('Test RMSE:', metrics.mean_squared_error(y_test, dttestpred, squared=False))

RMSE: 0.28923399639327885
Train RMSE: 0.28974920980806607
Test RMSE: 0.28768282000375495


### Bagged Decision Trees

In [31]:
bag = BaggingClassifier(random_state=42, n_estimators=100) 

In [32]:
bag.fit(X_train_sc, y_train)

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=100,
                  n_jobs=None, oob_score=False, random_state=42, verbose=0,
                  warm_start=False)

In [33]:
print(f'Training Score: {bag.score(X_train_sc, y_train)}')
print(f'Testing Score: {bag.score(X_test_sc, y_test)}')

Training Score: 0.9996411429596734
Testing Score: 0.9189072803122056


In [34]:
bagpred = bag.predict(X)
bagtrainpred = bag.predict(X_train)
bagtestpred = bag.predict(X_test)

In [35]:
print('RMSE:', metrics.mean_squared_error(y, bagpred, squared=False))
print('Train RMSE:', metrics.mean_squared_error(y_train, bagtrainpred, squared=False))
print('Test RMSE:', metrics.mean_squared_error(y_test, bagtestpred, squared=False))

RMSE: 0.2855582844967111
Train RMSE: 0.28608795758032207
Test RMSE: 0.28396333735427537


### Random Forest

In [36]:
rfc = RandomForestClassifier(random_state=42) 

In [37]:
rfc.fit(X_train_sc, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [38]:
print(f'Training Score: {rfc.score(X_train_sc, y_train)}')
print(f'Testing Score: {rfc.score(X_test_sc, y_test)}')

Training Score: 0.9996321715336652
Testing Score: 0.919310994482573


In [39]:
rfcpred = rfc.predict(X)
rfctrainpred = rfc.predict(X_train)
rfctestpred = rfc.predict(X_test)

In [40]:
print('RMSE:', metrics.mean_squared_error(y, rfcpred, squared=False))
print('Train RMSE:', metrics.mean_squared_error(y_train, rfctrainpred, squared=False))
print('Test RMSE:', metrics.mean_squared_error(y_test, rfctestpred, squared=False))

RMSE: 0.2855582844967111
Train RMSE: 0.28608795758032207
Test RMSE: 0.28396333735427537


### AdaBoost

In [41]:
AdaBoost = AdaBoostClassifier(random_state=42)

In [42]:
AdaBoost.fit(X_train_sc, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=42)

In [43]:
print(f'Training Score: {AdaBoost.score(X_train_sc, y_train)}')
print(f'Testing Score: {AdaBoost.score(X_test_sc, y_test)}')

Training Score: 0.9181447091015117
Testing Score: 0.9193379087605975


In [44]:
AdaBoostpred = AdaBoost.predict(X)
AdaBoosttrainpred = AdaBoost.predict(X_train)
AdaBoosttestpred = AdaBoost.predict(X_test)

In [45]:
print('RMSE:', metrics.mean_squared_error(y, AdaBoostpred, squared=False))
print('Train RMSE:', metrics.mean_squared_error(y_train, AdaBoosttrainpred, squared=False))
print('Test RMSE:', metrics.mean_squared_error(y_test, AdaBoosttestpred, squared=False))

RMSE: 0.2855582844967111
Train RMSE: 0.28608795758032207
Test RMSE: 0.28396333735427537
