# Preprocessing and Modeling

The following notebook is dedicated to training, testing, and validating various preprocessing and modeling methods. The goal is to attempt to formulate the best possible model for classifying the genre of an EDM song.

## Imports

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, PowerTransformer, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from xgboost import XGBClassifier

## Reading In The Data

In [2]:
# Read in the data
songs = pd.read_csv('data/songs_clean.csv')
val = pd.read_csv('data/val_clean.csv')

## Setting Features And Target Variables

In [3]:
# Set features and target for modeling
X = songs.drop('genre', axis=1)
y = songs['genre']

## Train Test Split

In [4]:
# Split the modeling data into 80% training and 20% testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=72, stratify=y)

## Preprocessing

Several options for preprocessing will be compared when testing subsequent models.

### Polynomial Features

Training the models with more complex interaction columns and polynomial features will improve the accuracy of the results.

In [5]:
# Create polynomial features up to 5 degrees
pf = PolynomialFeatures(degree=5)
# Training data
X_train = pf.fit_transform(X_train)
# Test data
X_test = pf.transform(X_test)
# Validation data
X_val = pf.transform(val.drop('genre', axis=1))

In [6]:
# Create a list to store all the new polynomial feature names
poly_feat = pf.get_feature_names(X.columns)

### Scaling Options

Depending on the type of model, standardizing or using a power transformer to scale the variables may improve the performance of the algorithm. Both a standard scaler and a power transformer are set up here so that they can be compared with each other.

In [7]:
# Run the data through a standard scaler
ss = StandardScaler()
# Training data
X_tr_sc = ss.fit_transform(X_train)
# Test data
X_te_sc = ss.transform(X_test)
# Validation data
X_val_sc = ss.transform(X_val)

In [8]:
# Run the data through a power transformer
pt = PowerTransformer()
# Training data
X_tr_pt = pt.fit_transform(X_train)
# Test data
X_te_pt = pt.transform(X_test)
# Validation data
X_val_pt = pt.transform(X_val)

  loglike = -n_samples / 2 * np.log(x_trans.var())
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)


## Baseline Models

A series of baseline modeling tactics will be deployed with scaled, unscaled and power transformed data. This will give a general idea of which preprocessing method works the best and which models show the most potential and should be further tuned.

### Logistic Regression

In [9]:
# Logistic regression baseline model
lr = LogisticRegression(random_state=72)
lr.fit(X_train, y_train)
print(f"Training Score: {lr.score(X_train, y_train)}")
print(f"Test Score: {lr.score(X_test, y_test)}")
print(f"Validation Score: {lr.score(X_val, val['genre'])}")



Training Score: 0.37481525273425953
Test Score: 0.35845896147403683
Validation Score: 0.28


In [10]:
# Logistic regression baseline model with standard scaler
lr.fit(X_tr_sc, y_train)
print(f"Training Score: {lr.score(X_tr_sc, y_train)}")
print(f"Test Score: {lr.score(X_te_sc, y_test)}")
print(f"Validation Score: {lr.score(X_val_sc, val['genre'])}")



Training Score: 0.7360331067100206
Test Score: 0.6834170854271356
Validation Score: 0.85


In [11]:
# Logistic regression baseline model with power transformer
lr.fit(X_tr_pt, y_train)
print(f"Training Score: {lr.score(X_tr_pt, y_train)}")
print(f"Test Score: {lr.score(X_te_pt, y_test)}")
print(f"Validation Score: {lr.score(X_val_pt, val['genre'])}")



Training Score: 0.7375110848359444
Test Score: 0.6666666666666666
Validation Score: 0.8


### K-Nearest Neighbor

In [12]:
# KNN baseline model
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print(f"Training Score: {knn.score(X_train, y_train)}")
print(f"Test Score: {knn.score(X_test, y_test)}")
print(f"Validation Score: {knn.score(X_val, val['genre'])}")

Training Score: 0.5900088678687555
Test Score: 0.37018425460636517
Validation Score: 0.33


In [13]:
# KNN baseline model with standard scaler
knn.fit(X_tr_sc, y_train)
print(f"Training Score: {knn.score(X_tr_sc, y_train)}")
print(f"Test Score: {knn.score(X_te_sc, y_test)}")
print(f"Validation Score: {knn.score(X_val_sc, val['genre'])}")

Training Score: 0.7783032811114395
Test Score: 0.6515912897822446
Validation Score: 0.84


In [14]:
# KNN baseline model with power transformer
knn.fit(X_tr_pt, y_train)
print(f"Training Score: {knn.score(X_tr_pt, y_train)}")
print(f"Test Score: {knn.score(X_te_pt, y_test)}")
print(f"Validation Score: {knn.score(X_val_pt, val['genre'])}")

Training Score: 0.7821460242388413
Test Score: 0.661641541038526
Validation Score: 0.78


### Decision Tree

In [15]:
# Decision tree baseline model
dt = DecisionTreeClassifier(random_state=72)
dt.fit(X_tr_sc, y_train)
print(f"Training Score: {dt.score(X_tr_sc, y_train)}")
print(f"Test Score: {dt.score(X_te_sc, y_test)}")
print(f"Validation Score: {dt.score(X_val_sc, val['genre'])}")

Training Score: 0.9234407330771505
Test Score: 0.6197654941373534
Validation Score: 0.76


In [16]:
# Decision tree baseline model with standard scaler
dt.fit(X_tr_sc, y_train)
print(f"Training Score: {dt.score(X_tr_sc, y_train)}")
print(f"Test Score: {dt.score(X_te_sc, y_test)}")
print(f"Validation Score: {dt.score(X_val_sc, val['genre'])}")

Training Score: 0.9234407330771505
Test Score: 0.6197654941373534
Validation Score: 0.76


In [17]:
# Decision tree baseline model with power transformer
dt.fit(X_tr_pt, y_train)
print(f"Training Score: {dt.score(X_tr_pt, y_train)}")
print(f"Test Score: {dt.score(X_te_pt, y_test)}")
print(f"Validation Score: {dt.score(X_val_pt, val['genre'])}")

Training Score: 0.9234407330771505
Test Score: 0.6164154103852596
Validation Score: 0.75


### Bagging

In [18]:
# Bagging baseline model
bag = BaggingClassifier(random_state=72)
bag.fit(X_tr_sc, y_train)
print(f"Training Score: {bag.score(X_tr_sc, y_train)}")
print(f"Test Score: {bag.score(X_te_sc, y_test)}")
print(f"Validation Score: {bag.score(X_val_sc, val['genre'])}")

Training Score: 0.9154596511971623
Test Score: 0.7035175879396985
Validation Score: 0.88


In [19]:
# Bagging baseline model with standard scaler
bag.fit(X_tr_sc, y_train)
print(f"Training Score: {bag.score(X_tr_sc, y_train)}")
print(f"Test Score: {bag.score(X_te_sc, y_test)}")
print(f"Validation Score: {bag.score(X_val_sc, val['genre'])}")

Training Score: 0.9154596511971623
Test Score: 0.7035175879396985
Validation Score: 0.88


In [20]:
# Bagging baseline model with power transformer
bag.fit(X_tr_pt, y_train)
print(f"Training Score: {bag.score(X_tr_pt, y_train)}")
print(f"Test Score: {bag.score(X_te_pt, y_test)}")
print(f"Validation Score: {bag.score(X_val_pt, val['genre'])}")

Training Score: 0.9160508424475318
Test Score: 0.7035175879396985
Validation Score: 0.91


### Random Forest

In [21]:
# Random forest baseline model
rf = RandomForestClassifier(random_state=72)
rf.fit(X_train, y_train)
print(f"Training Score: {rf.score(X_train, y_train)}")
print(f"Test Score: {rf.score(X_test, y_test)}")
print(f"Validation Score: {rf.score(X_val, val['genre'])}")



Training Score: 0.9160508424475318
Test Score: 0.6968174204355109
Validation Score: 0.86


In [22]:
# Random forest baseline model with standard scaler
rf.fit(X_tr_sc, y_train)
print(f"Training Score: {rf.score(X_tr_sc, y_train)}")
print(f"Test Score: {rf.score(X_te_sc, y_test)}")
print(f"Validation Score: {rf.score(X_val_sc, val['genre'])}")

Training Score: 0.9160508424475318
Test Score: 0.6968174204355109
Validation Score: 0.86


In [23]:
# Random forest baseline model with power transformer
rf.fit(X_tr_pt, y_train)
print(f"Training Score: {rf.score(X_tr_pt, y_train)}")
print(f"Test Score: {rf.score(X_te_pt, y_test)}")
print(f"Validation Score: {rf.score(X_val_pt, val['genre'])}")

Training Score: 0.914572864321608
Test Score: 0.6901172529313233
Validation Score: 0.82


In [24]:
# params = {
#     'rf__max_depth': [12,13,14],
#     'rf__min_samples_leaf': [3,4,5],
#     'rf__n_estimators': [50,51,52],
#     'rf__max_features': ['auto',25,50,100]
# }

### AdaBoost

In [25]:
# Adaboost baseline model
ada = AdaBoostClassifier(random_state=72)
ada.fit(X_train, y_train)
print(f"Training Score: {ada.score(X_train, y_train)}")
print(f"Test Score: {ada.score(X_test, y_test)}")
print(f"Validation Score: {ada.score(X_val, val['genre'])}")

Training Score: 0.7652970736033107
Test Score: 0.7169179229480737
Validation Score: 0.84


In [26]:
# Adaboost baseline model with standard scaler
ada.fit(X_tr_sc, y_train)
print(f"Training Score: {ada.score(X_tr_sc, y_train)}")
print(f"Test Score: {ada.score(X_te_sc, y_test)}")
print(f"Validation Score: {ada.score(X_val_sc, val['genre'])}")

Training Score: 0.7652970736033107
Test Score: 0.7169179229480737
Validation Score: 0.84


In [27]:
# Adaboost forest baseline model with power transformer
ada.fit(X_tr_pt, y_train)
print(f"Training Score: {ada.score(X_tr_pt, y_train)}")
print(f"Test Score: {ada.score(X_te_pt, y_test)}")
print(f"Validation Score: {ada.score(X_val_pt, val['genre'])}")

Training Score: 0.7611587348507242
Test Score: 0.7386934673366834
Validation Score: 0.91


### Gradient Boost

In [28]:
# Gradient Boost baseline model
gb = GradientBoostingClassifier(random_state=72)
gb.fit(X_train, y_train)
print(f"Training Score: {gb.score(X_train, y_train)}")
print(f"Test Score: {gb.score(X_test, y_test)}")
print(f"Validation Score: {gb.score(X_val, val['genre'])}")

Training Score: 0.8604788649127993
Test Score: 0.7420435510887772
Validation Score: 0.9


In [29]:
# Gradient Boost baseline model with standard scaler
gb.fit(X_tr_sc, y_train)
print(f"Training Score: {gb.score(X_tr_sc, y_train)}")
print(f"Test Score: {gb.score(X_te_sc, y_test)}")
print(f"Validation Score: {gb.score(X_val_sc, val['genre'])}")

Training Score: 0.8604788649127993
Test Score: 0.7420435510887772
Validation Score: 0.9


In [30]:
# Gradient Boost forest baseline model with power transformer
gb.fit(X_tr_pt, y_train)
print(f"Training Score: {gb.score(X_tr_pt, y_train)}")
print(f"Test Score: {gb.score(X_te_pt, y_test)}")
print(f"Validation Score: {gb.score(X_val_pt, val['genre'])}")

Training Score: 0.857227313035767
Test Score: 0.7353433835845896
Validation Score: 0.91


### XGBoost

In [31]:
# XGBoost baseline model
xgb = XGBClassifier(random_state=72)
xgb.fit(X_train, y_train)
print(f"Training Score: {xgb.score(X_train, y_train)}")
print(f"Test Score: {xgb.score(X_test, y_test)}")
print(f"Validation Score: {xgb.score(X_val, val['genre'])}")

Training Score: 0.8294413242684008
Test Score: 0.7487437185929648
Validation Score: 0.91


In [32]:
# XGBoost baseline model with standard scaler
xgb.fit(X_tr_sc, y_train)
print(f"Training Score: {xgb.score(X_tr_sc, y_train)}")
print(f"Test Score: {xgb.score(X_te_sc, y_test)}")
print(f"Validation Score: {xgb.score(X_val_sc, val['genre'])}")

Training Score: 0.8294413242684008
Test Score: 0.7487437185929648
Validation Score: 0.91


In [33]:
# XGBoost forest baseline model with power transformer
xgb.fit(X_tr_pt, y_train)
print(f"Training Score: {xgb.score(X_tr_pt, y_train)}")
print(f"Test Score: {xgb.score(X_te_pt, y_test)}")
print(f"Validation Score: {xgb.score(X_val_pt, val['genre'])}")

Training Score: 0.8323972805202483
Test Score: 0.7403685092127303
Validation Score: 0.9


Based on the results above, particularly when it comes to validation, it appears that the models perform best overall when the data is run through a power transformer. In addition, the most promising models seem to be Bagging, Gradient Boost and XGBoost. The next step will be to focus on improving these five algorithms with parameter tuning.

## Model Tuning

### Bagging

In [34]:
# Set parameter distributions for bagging with randomized search
params = {
    'n_estimators': range(1,100)
}

In [35]:
# Instantiate and fit a randomized search with 3 folds
rs = RandomizedSearchCV(bag, params, 5, cv=3, random_state=72)
rs.fit(X_tr_pt, y_train)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=None, oob_score=False, random_state=72,
         verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=5, n_jobs=None,
          param_distributions={'n_estimators': range(1, 100)},
          pre_dispatch='2*n_jobs', random_state=72, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [36]:
# Display the best performing parameters
rs.best_params_

{'n_estimators': 60}

In [37]:
# Score the best bagging estimator on training, test, and validation
print(f"Training Score: {rs.best_estimator_.score(X_tr_pt, y_train)}")
print(f"Test Score: {rs.best_estimator_.score(X_te_pt, y_test)}")
print(f"Validation Score: {rs.best_estimator_.score(X_val_pt, val['genre'])}")

Training Score: 0.9234407330771505
Test Score: 0.7018425460636516
Validation Score: 0.93


In [38]:
# Set parameter distributions for bagging with grid search
params = {
    'n_estimators': range(58,63)
}

In [39]:
# Instantiate and fit a grid search with 3 folds
gs = GridSearchCV(bag, params, cv=3)
gs.fit(X_tr_pt, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=None, oob_score=False, random_state=72,
         verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': range(58, 63)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [40]:
# Display the best performing parameters
gs.best_params_

{'n_estimators': 59}

In [41]:
# Score the best bagging estimator on training, test, and validation
print(f"Training Score: {gs.best_estimator_.score(X_tr_pt, y_train)}")
print(f"Test Score: {gs.best_estimator_.score(X_te_pt, y_test)}")
print(f"Validation Score: {gs.best_estimator_.score(X_val_pt, val['genre'])}")

Training Score: 0.9234407330771505
Test Score: 0.7035175879396985
Validation Score: 0.92


In [42]:
# Save the best estimator as the final bagging model
bag = rs.best_estimator_

### AdaBoost

In [43]:
# Set parameter distributions for adaboost with randomized search
params = {
    'learning_rate': np.linspace(.001,5,100),
    'n_estimators': range(1,100),
}

In [44]:
# Instantiate and fit a randomized search with 3 folds
rs = RandomizedSearchCV(ada, params, 5, cv=3, random_state=72)
rs.fit(X_tr_pt, y_train)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=72),
          fit_params=None, iid='warn', n_iter=5, n_jobs=None,
          param_distributions={'learning_rate': array([1.00000e-03, 5.14949e-02, ..., 4.94951e+00, 5.00000e+00]), 'n_estimators': range(1, 100)},
          pre_dispatch='2*n_jobs', random_state=72, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [45]:
# Display the best performing parameters
rs.best_params_

{'n_estimators': 15, 'learning_rate': 1.465353535353535}

In [46]:
# Score the best adaboost estimator on training, test, and validation
print(f"Training Score: {rs.best_estimator_.score(X_tr_pt, y_train)}")
print(f"Test Score: {gs.best_estimator_.score(X_te_pt, y_test)}")
print(f"Validation Score: {gs.best_estimator_.score(X_val_pt, val['genre'])}")

Training Score: 0.7245048773278155
Test Score: 0.7035175879396985
Validation Score: 0.92


In [47]:
# Set parameter distributions for adaboost with grid search
params = {
    'n_estimators': range(13,18)
}

In [48]:
# Instantiate and fit a grid search with 3 folds
gs = GridSearchCV(ada, params, cv=3)
gs.fit(X_tr_pt, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=72),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': range(13, 18)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [50]:
# Display the best performing parameters
gs.best_params_

{'n_estimators': 14}

In [51]:
# Score the best adaboost estimator on training, test, and validation
print(f"Training Score: {gs.best_estimator_.score(X_tr_pt, y_train)}")
print(f"Test Score: {gs.best_estimator_.score(X_te_pt, y_test)}")
print(f"Validation Score: {gs.best_estimator_.score(X_val_pt, val['genre'])}")

Training Score: 0.7655926692284954
Test Score: 0.7269681742043551
Validation Score: 0.89


In [52]:
# Save the best estimator as the final adaboost model
ada = rs.best_estimator_

### Gradient Boost

In [53]:
# Set parameter distributions for gradient boost with randomized search
params = {
    'learning_rate': np.linspace(.001,.5,100),
    'n_estimators': range(1,200),
    'subsample': np.linspace(0,1,100),
    'min_samples_split': range(2,10),
    'min_samples_leaf': range(1,10),
    'max_depth': range(1,50),
    'min_impurity_decrease': np.linspace(.001,1)
}

In [54]:
# Instantiate and fit a randomized search with 3 folds
rs = RandomizedSearchCV(gb, params, 5, cv=3, random_state=72)
rs.fit(X_tr_pt, y_train)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=5, n_jobs=None,
          param_distributions={'learning_rate': array([0.001  , 0.00604, ..., 0.49496, 0.5    ]), 'n_estimators': range(1, 200), 'subsample': array([0.    , 0.0101, ..., 0.9899, 1.    ]), 'min_samples_split': range(2, 10), 'min_samples_leaf': range(1, 10), 'max_depth': range(1, 50), 'min_impurity_decrease': a...51, 0.8369 ,
       0.85729, 0.87767, 0.89806, 0.91845, 0.93884, 0.95922, 0.97961,
       1.     ])},
          pre_dispatch='2*n_jobs', random_state=72

In [55]:
# Display the best performing parameters
rs.best_params_

{'subsample': 0.29292929292929293,
 'n_estimators': 131,
 'min_samples_split': 4,
 'min_samples_leaf': 6,
 'min_impurity_decrease': 0.18448979591836737,
 'max_depth': 18,
 'learning_rate': 0.046363636363636364}

In [57]:
# Score the best gradient boost estimator on training, test, and validation
print(f"Training Score: {rs.best_estimator_.score(X_tr_pt, y_train)}")
print(f"Test Score: {rs.best_estimator_.score(X_te_pt, y_test)}")
print(f"Validation Score: {rs.best_estimator_.score(X_val_pt, val['genre'])}")

Training Score: 0.9074785693171741
Test Score: 0.7085427135678392
Validation Score: 0.93


In [58]:
# Save the best estimator as the final gradient boost model
gb = rs.best_estimator_

### XGBoost

In [84]:
# Set parameter distributions for XGBoost with randomized search
params = {
    'learning_rate': np.linspace(.001,.5,25),
    'n_estimators': range(1,200),
    'gamma': range(10),
    'min_child_weight': range(1,10),
    'max_delta_step': range(10),
    'subsample': np.linspace(0,1,25),
    'colsample_bytree': np.linspace(0,1,25),
    'colsample_bylevel': np.linspace(0,1,25),
    'colsample_bynode': np.linspace(0,1,25),
    'max_depth': range(1,25),
    'reg_alpha': range(5),
    'reg_lambda': range(5),
    'scale_pos_weight': np.linspace(0,1,25),
    'base_score': np.linspace(0,1,25)
}

In [96]:
# Instantiate and fit a randomized search with 3 folds
rs = RandomizedSearchCV(xgb, params, 50, cv=3, random_state=72)
rs.fit(X_tr_pt, y_train)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='multi:softprob', random_state=72, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
          fit_params=None, iid='warn', n_iter=50, n_jobs=None,
          param_distributions={'learning_rate': array([0.001  , 0.02179, 0.04258, 0.06338, 0.08417, 0.10496, 0.12575,
       0.14654, 0.16733, 0.18812, 0.20892, 0.22971, 0.2505 , 0.27129,
       0.29208, 0.31288, 0.33367, 0.35446, 0.37525, 0.39604, 0.41683,
       0.43762, 0.45842, 0.47921, 0.5    ]), 'n_esti..., 0.625  , 0.66667, 0.70833, 0.75   , 0.79167, 0.83333,
       0.875  , 0.91667, 0.95833, 1.     ])},
          pre_d

In [97]:
# Display the best performing parameters
rs.best_params_

{'subsample': 0.41666666666666663,
 'scale_pos_weight': 0.4583333333333333,
 'reg_lambda': 0,
 'reg_alpha': 2,
 'n_estimators': 91,
 'min_child_weight': 9,
 'max_depth': 15,
 'max_delta_step': 3,
 'learning_rate': 0.021791666666666668,
 'gamma': 0,
 'colsample_bytree': 0.6666666666666666,
 'colsample_bynode': 0.3333333333333333,
 'colsample_bylevel': 1.0,
 'base_score': 0.08333333333333333}

In [98]:
# Score the best XGBoost estimator on training, test, and validation
print(f"Training Score: {rs.best_estimator_.score(X_tr_pt, y_train)}")
print(f"Test Score: {rs.best_estimator_.score(X_te_pt, y_test)}")
print(f"Validation Score: {rs.best_estimator_.score(X_val_pt, val['genre'])}")

Training Score: 0.8016553355010346
Test Score: 0.7470686767169179
Validation Score: 0.92


In [99]:
xgb = rs.best_estimator_

### Voting

In [100]:
# Set a list of estimators for the voting classifier
models = [
    ('bag', bag),
    ('ada', ada),
    ('gb', gb),
    ('xgb', xgb)
]

In [101]:
vc = VotingClassifier(models)
vc.fit(X_tr_pt, y_train)
print(f"Training Score: {vc.score(X_tr_pt, y_train)}")
print(f"Test Score: {vc.score(X_te_pt, y_test)}")
print(f"Validation Score: {vc.score(X_val_pt, val['genre'])}")

Training Score: 0.8675731599172333
Test Score: 0.7319932998324958
Validation Score: 0.93


In [61]:
# feat_imp = pd.DataFrame(xgb.feature_importances_, index=poly_feat).sort_values(0, ascending=False)
# feat_imp

Unnamed: 0,0
duration_ms^3 loudness,0.084715
duration_ms^4 loudness,0.061337
energy tempo^4,0.047956
energy loudness tempo,0.040677
energy^3 loudness tempo,0.033887
energy loudness^2 tempo,0.030042
energy^2 tempo^3,0.021533
loudness tempo^4,0.019903
duration_ms tempo,0.018461
duration_ms^3 energy loudness,0.016275


In [48]:
# plt.figure(figsize=(10,10))
# plt.barh(feat_imp.index, feat_imp[0])
# plt.gca().invert_yaxis()

In [None]:
# xgb.fit(X_tr_pt, y_train)

In [None]:
# xgb.score(X_tr_pt, y_train)

In [None]:
# xgb.score(X_te_pt, y_test)

In [None]:
# xgb.predict(X_val_pt)

In [None]:
# feat_imp = pd.DataFrame(xgb.feature_importances_, index=poly_feat).sort_values(0, ascending=False)
# feat_imp

In [None]:
# plt.figure(figsize=(10,10))
# plt.barh(feat_imp.index, feat_imp[0])
# plt.gca().invert_yaxis()

In [None]:
# imp_feat = feat_imp[feat_imp[0] > 0.01].index

In [None]:
# X_tr_imp_feat = pd.DataFrame(X_train, columns=poly_feat)[imp_feat]
# X_te_imp_feat = pd.DataFrame(X_test, columns=poly_feat)[imp_feat]
# X_val_imp_feat = pd.DataFrame(X_val, columns=poly_feat)[imp_feat]

In [None]:
# ss = StandardScaler()
# X_tr_sc = ss.fit_transform(X_tr_imp_feat)
# X_te_sc = ss.transform(X_te_imp_feat)
# X_val_sc = ss.transform(X_val_imp_feat)

In [None]:
# xgb.fit(X_tr_sc, y_train)

In [None]:
# xgb.score(X_tr_sc, y_train)

In [None]:
# xgb.score(X_te_sc, y_test)

In [None]:
# xgb.predict(X_val_sc)

In [None]:
# xgb.fit(X_tr_pt, y_train)

In [None]:
# xgb.score(X_tr_pt, y_train)

In [None]:
# xgb.score(X_te_pt, y_test)

In [None]:
# xgb.predict(X_val_pt)

In [None]:
# xgb = XGBClassifier(
#     max_depth=20,
#     learning_rate=.01,
#     n_estimators=25,
#     objective='multi:softprobs',
#     num_class=5,
#     gamma=6,
#     min_child_weight=3,
#     subsample=.75,
#     colsample_bytree=.75,
#     reg_lambda=1,
#     reg_alpha=0,
#     seed=42
# )

In [None]:
# dt = DecisionTreeClassifier(max_depth=7, min_samples_split=2, min_samples_leaf=5, random_state=42)
# dt.fit(X_tr_sc, y_train)
# print(dt.score(X_tr_sc, y_train))
# print(dt.score(X_te_sc, y_test))

In [None]:
# dt.predict(X_val_sc)

In [None]:
# dt = DecisionTreeClassifier(max_depth=7, min_samples_split=2, min_samples_leaf=5, random_state=42)
# dt.fit(X_tr_pt, y_train)
# print(dt.score(X_tr_pt, y_train))
# print(dt.score(X_te_pt, y_test))

In [None]:
# dt.predict(X_val_pt)

In [None]:
# knn = KNeighborsClassifier(n_neighbors=14, metric='manhattan')
# knn.fit(X_tr_sc, y_train)
# print(knn.score(X_tr_sc, y_train))
# print(knn.score(X_te_sc, y_test))

In [None]:
# knn.predict(X_val_sc)

In [None]:
# knn = KNeighborsClassifier(n_neighbors=14, metric='manhattan')
# knn.fit(X_tr_pt, y_train)
# print(knn.score(X_tr_pt, y_train))
# print(knn.score(X_te_pt, y_test))

In [None]:
# knn.predict(X_val_pt)

In [None]:
# lr = LogisticRegression(solver='lbfgs', multi_class='multinomial')
# lr.fit(X_tr_sc, y_train)
# print(lr.score(X_tr_sc, y_train))
# print(lr.score(X_te_sc, y_test))

In [None]:
# lr.predict(X_val_sc)

In [None]:
# lr = LogisticRegression(solver='lbfgs', multi_class='multinomial')
# lr.fit(X_tr_pt, y_train)
# print(lr.score(X_tr_pt, y_train))
# print(lr.score(X_te_pt, y_test))

In [None]:
# lr.predict(X_val_pt)