In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
# import Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
# Check accuracy score 
from sklearn.metrics import accuracy_score, classification_report

import xgboost as xgb
from sklearn.metrics import mean_squared_error

from sklearn import svm

# training the model on training set
from sklearn.naive_bayes import GaussianNB

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder, MinMaxScaler, MaxAbsScaler, RobustScaler, Normalizer, QuantileTransformer, PowerTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.model_selection import train_test_split
import re
import pickle

import matplotlib.pyplot as plt
import plotly.express as px


In [None]:
# Read the cleaned CSV; missing entries are still encoded as sentinel values.
df_missing = pd.read_csv('../../../data/clean_data/data_clean_missing.csv')

# Outlier guard: keep only rows whose duration_ms is at most 700,000 ms.
df_constrain = df_missing.loc[df_missing['duration_ms'] <= 700000]

# Convert the sentinels (-1 and '?') to NaN in every column.
# NOTE(review): the replacement is not limited to specific columns, so a genuine
# value of -1 in any numeric column would also become NaN — confirm this is intended.
df_constrain = df_constrain.replace(to_replace=[-1, '?'], value=np.nan)

In [None]:
# Fill the gaps in duration_ms and tempo, one column at a time.
# Despite the variable names, both imputers are IterativeImputer instances
# initialised with the *median* strategy (SimpleImputer mean/median and
# KNNImputer were tried earlier as alternatives).
imp_mean = IterativeImputer(initial_strategy='median', random_state=0)
imp_median = IterativeImputer(initial_strategy='median', random_state=0)

duration_only = df_constrain[['duration_ms']]
df_constrain['duration_ms_imp'] = imp_mean.fit_transform(duration_only)

tempo_only = df_constrain[['tempo']]
df_constrain['tempo_imp'] = imp_median.fit_transform(tempo_only)

# The raw columns are superseded by their *_imp counterparts.
df_constrain = df_constrain.drop(columns=['duration_ms', 'tempo'])

In [None]:
# Merge the 'Rap' label into 'Hip-Hop'.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the column selection: that form is chained
# assignment, which warns in recent pandas and does not update the frame
# once Copy-on-Write is active.
df_constrain['music_genre'] = df_constrain['music_genre'].replace({'Rap': 'Hip-Hop'})

In [None]:
df_test = df_constrain

# NOTE(review): the encoding, dummy-variable and scaling cells below all read
# `df_subset`, but nothing in this notebook defines it, so Restart & Run All
# stops with a NameError. Define it here as an independent copy of the cleaned
# frame so those cells can run and can be re-run without mutating df_constrain.
# Confirm whether a narrower subset of rows/columns was originally intended.
df_subset = df_constrain.copy()

In [None]:
# Feature groups used by the encoding / scaling steps.

# Categorical columns, and how each is encoded.
cat_feats = [
    'artist_name',
    'key',
    'mode',
    'music_genre',
]
cat_feats_ohe = ['artist_name', 'mode']   # one-hot encoded
cat_feats_le = ['music_genre']            # label encoded (target)

# Numerical columns (original column names).
num_feats = [
    'popularity',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]
# Columns passed to the scaler (imputed variants of duration and tempo).
num_feats_scale = [
    'popularity',
    'duration_ms_imp',
    'loudness',
    'tempo_imp',
]
# Raw columns that receive imputation.
num_feats_imp_mean = ['duration_ms']
num_feats_imp_median = ['tempo']

In [None]:
# Candidate model features (audio descriptors plus the one-hot mode columns).
feats = [
    'popularity',
    'acousticness',
    'danceability',
    'duration_ms_imp',
    'energy',
    'instrumentalness',
    'loudness',
    'speechiness',
    'tempo_imp',
    'valence',
    'mode_Major',
    'mode_Minor',
]

In [None]:
# Label-encode the target column(s) listed in cat_feats_le.
# `le` stays at notebook scope because later cells read le.classes_ to
# label the classification reports and the confusion matrix.
le = LabelEncoder()
df_subset[cat_feats_le] = df_subset[cat_feats_le].apply(le.fit_transform)

In [None]:
# One-hot encode the columns listed in cat_feats_ohe; all other columns pass through.
df_out = pd.get_dummies(data=df_subset, columns=cat_feats_ohe)

In [None]:
# Swap '[', ']' and '<' in the column names (they can occur in the artist-name
# dummies) for '{', '}' and '^'.
# NOTE(review): presumably needed because XGBoost rejects these characters in
# feature names — confirm.
df_out.columns = df_out.columns.str.translate(
    str.maketrans({"[": "{", "]": "}", "<": "^"})
)

In [None]:
# Map each musical key name to an integer 0-11, in chromatic order starting at C.
dic = dict(zip(
    ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'],
    range(12),
))

# Apply the mapping to the 'key' column only; values not in the map are left as they are.
df_out = df_out.replace({"key": dic})

In [None]:
# Standardise the columns in num_feats_scale and write them into df_out.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test-set statistics influence the scaling — confirm this is acceptable.
scaler = StandardScaler()
scaler.fit(df_subset[num_feats_scale])
df_out[num_feats_scale] = scaler.transform(df_subset[num_feats_scale])

In [None]:
# Target: the label-encoded genre column.
df_target = df_out['music_genre']

# Inputs: every other column of the encoded frame.
# (Columns once considered for removal: 'key', 'mode_Major', 'mode_Minor', 'liveness'.)
df_input = df_out.drop(columns=['music_genre'])

# Earlier feature-selection variants, kept for reference only:
# df_input_selection = df_subset.drop(['music_genre', 'liveness', 'key', 'energy', 'valence', 'duration_ms_imp', 'tempo_imp'], axis=1)
# df_input_selection = df_subset[feats]

In [None]:
# Preview the first rows of the model input frame.
df_input.head()

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_input,
    df_target,
    test_size=0.3,
    random_state=1001,
)

In [None]:
# Baseline XGBoost classifier with hand-picked hyperparameters.
# NOTE(review): num_class is hard-coded to 9 — confirm it matches the number of
# encoded genres in y_train.
xg_clf = xgb.XGBClassifier(
    objective='multi:softmax',
    colsample_bytree=0.3,
    learning_rate=0.2,
    max_depth=5,
    alpha=0.5,
    n_estimators=100,
    num_class=9,
    use_label_encoder=False,
)

xg_clf.fit(X_train, y_train)
pred_clf = xg_clf.predict(X_test)

print(xg_clf.feature_importances_)

# Overall accuracy on the held-out split.
print(f'Model accuracy score: {accuracy_score(y_test, pred_clf):0.4f}')

# Per-class precision (average=None returns one score per class).
print("Precision:", metrics.precision_score(y_test, pred_clf, average=None))

# Per-class recall.
print("Recall:", metrics.recall_score(y_test, pred_clf, average=None))

In [None]:
# Print the baseline model's feature importances (same call as in the training cell above).
print(xg_clf.feature_importances_)

In [None]:
# Hyperparameters for the tuned model. The numeric values match the
# best-hyperparameters dict recorded further down in this notebook.
params_opt = dict(
    objective='multi:softmax',
    use_label_encoder=False,
    colsample_bytree=0.6258986644606082,
    gamma=1.4497823418113986,
    learning_rate=0.47434885295765056,
    max_depth=7,
    min_child_weight=0,
    n_estimators=200,
    reg_alpha=0.1,
    reg_lambda=0.733163084214365,
)

# Alternative parameter set, kept for reference:
# params_opt = {'objective': 'multi:softmax', 'use_label_encoder': False, 'colsample_bytree': 0.7715942963067712, 'gamma': 0.10686394019950783, 'learning_rate': 0.09710945761615764,
#               'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 300, 'reg_alpha': 0.3027723368940669, 'reg_lambda': 1.5894437328567554}

In [None]:
# Build the tuned classifier from the params_opt dictionary (not fitted yet).
xg_clf_opt = xgb.XGBClassifier(**params_opt)


In [None]:
# Train the tuned classifier and score it on the held-out split.
xg_clf_opt.fit(X_train, y_train)
pred_clf_opt = xg_clf_opt.predict(X_test)

print(xg_clf_opt.feature_importances_)

# Overall accuracy on the held-out split.
print(f'Model accuracy score: {accuracy_score(y_test, pred_clf_opt):0.4f}')

# Per-class precision (average=None returns one score per class).
print("Precision:", metrics.precision_score(y_test, pred_clf_opt, average=None))

# Per-class recall.
print("Recall:", metrics.recall_score(y_test, pred_clf_opt, average=None))

In [None]:
# Per-class report for the tuned model; le.classes_ supplies the class labels
# seen by the label encoder as display names.
print(classification_report(y_test, pred_clf_opt, target_names=le.classes_))

In [None]:
# save the model to disk
filename = 'xgb_opt_model2.sav'
# Write inside a context manager so the file handle is flushed and closed
# deterministically; a bare open() passed to pickle.dump leaves it open.
with open(filename, 'wb') as model_file:
    pickle.dump(xg_clf_opt, model_file)

In [None]:
# load the model from disk
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# model files that were produced by this project.
# The context manager closes the file handle once the model has been read.
with open('xgb_opt_model2.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.predict(X_test)

# Check accuracy score
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, result)))


In [None]:
# load the baseline model from disk
# NOTE(review): 'xgboost_baseline_model.sav' is not written anywhere in this
# notebook, so it has to exist already in the working directory.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# model files that were produced by this project.
# The context manager closes the file handle once the model has been read.
with open('xgboost_baseline_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.predict(X_test)

# Check accuracy score
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, result)))

In [None]:
# Feature-importance chart for the model currently held in loaded_model,
# limited to the 14 highest-ranked features.
xgb.plot_importance(loaded_model, max_num_features=14)
plt.show()

In [None]:
# Feature-importance chart for the tuned classifier, limited to the 8
# highest-ranked features.
xgb.plot_importance(xg_clf_opt, max_num_features=8)
plt.show()

In [None]:
# Per-class report for the baseline model's predictions (pred_clf).
print(classification_report(y_test, pred_clf, target_names=le.classes_))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

In [None]:
def plot_confusion_matrix(cm, classes, normalized=True, cmap='bone'):
    """Draw a confusion matrix as an annotated heatmap on a new figure.

    Parameters
    ----------
    cm : numpy.ndarray
        Square matrix of counts (rows are summed along axis 1 for normalisation).
    classes : sequence
        Tick labels used for both axes.
    normalized : bool, default True
        If True, cell colours show each count divided by its row total;
        if False, colours show the raw counts.
    cmap : str, default 'bone'
        Colormap passed to seaborn.

    The cell annotations always show the raw counts from ``cm``. The function
    does not call ``plt.show()``; the caller decides when to render.
    """
    plt.figure(figsize=[7, 6])
    norm_cm = cm
    if normalized:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows with a zero total are left at 0 instead of producing NaN
        # through a division by zero.
        norm_cm = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )
    # Draw the heatmap in both modes; previously this call sat inside the
    # `if normalized:` branch, so normalized=False produced an empty figure.
    sns.heatmap(norm_cm, annot=cm, fmt='g', xticklabels=classes, yticklabels=classes, cmap=cmap)

In [None]:
# Confusion matrix for the tuned model on the held-out split.
# NOTE(review): this cell originally referenced `preds`, which is not defined
# anywhere in the notebook (NameError on a fresh kernel). The tuned model's
# predictions are used here; switch to pred_clf if the baseline was intended.
xgb_cm = confusion_matrix(y_test, pred_clf_opt)
plot_confusion_matrix(xgb_cm, classes=le.classes_)
plt.show()

In [None]:
from xgboost import cv

# Parameters for the native cross-validation API (same values as the baseline model).
params = {'objective':'multi:softmax', 'colsample_bytree': 0.3, 'learning_rate': 0.2,
                'max_depth': 5, 'alpha': 0.5, 'n_estimators': 100, 'num_class': 9, 'use_label_encoder': False}

# xgb.cv needs a DMatrix. `data_dmatrix` was referenced but never created in
# this notebook, so build it from the training split; this keeps the held-out
# rows out of cross-validation.
# NOTE(review): confirm the training split (rather than the full data set) is
# what the cross-validation was meant to cover.
data_dmatrix = xgb.DMatrix(data=X_train, label=y_train)

# 3-fold CV, at most 50 boosting rounds, stopping after 10 rounds without improvement.
xgb_cv = cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50, early_stopping_rounds=10, metrics="auc", as_pandas=True, seed=123)

In [None]:
# Show the first rows of the cross-validation results table.
xgb_cv.head()

In [None]:
from hyperopt import hp, Trials, fmin, tpe, STATUS_OK

In [None]:
# Hyperopt search space. quniform entries yield floats on a fixed step, which
# the objective function casts to int where needed.
space = dict(
    max_depth=hp.quniform("max_depth", 3, 10, 1),
    gamma=hp.uniform('gamma', 0, 3),
    reg_alpha=hp.uniform('reg_alpha', 0, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 2),
    colsample_bytree=hp.uniform('colsample_bytree', 0, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 5, 1),
    n_estimators=hp.quniform('n_estimators', 50, 350, 100),
    seed=0,
    learning_rate=hp.uniform('learning_rate', 0.01, 1),
)


In [None]:
# Recorded hyperparameter values; they match the numbers used in params_opt above.
# NOTE(review): presumably pasted from an earlier hyperopt run — confirm the source.
{'colsample_bytree': 0.6258986644606082, 'gamma': 1.4497823418113986, 'learning_rate': 0.47434885295765056, 'max_depth': 7.0, 'min_child_weight': 0.0,
 'n_estimators': 200.0, 'reg_alpha': 0.1, 'reg_lambda': 0.733163084214365}

In [None]:
def objective(space):
    """Hyperopt objective: train one XGBoost classifier and report its accuracy.

    Parameters
    ----------
    space : dict
        One sampled point of the search space. Must provide 'n_estimators',
        'max_depth', 'gamma', 'reg_alpha', 'min_child_weight',
        'colsample_bytree' and 'learning_rate'; 'reg_lambda' and 'seed' are
        applied when present.

    Returns
    -------
    dict
        ``{'loss': -accuracy, 'status': STATUS_OK}``; the accuracy is negated
        because hyperopt minimises the loss.

    Notes
    -----
    Reads X_train, y_train, X_test and y_test from the notebook namespace.
    NOTE(review): early stopping and the returned score both use the test
    split, so the selected hyperparameters are tuned against the same data the
    final model is later scored on — consider a separate validation split.
    """
    model_kwargs = dict(
        # quniform samples arrive as floats; these three must be integers.
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        min_child_weight=int(space['min_child_weight']),
        gamma=space['gamma'],
        reg_alpha=space['reg_alpha'],
        # Keep the fraction as a float: casting to int truncated every sampled
        # value in [0, 1) to 0, so the search never varied this parameter.
        colsample_bytree=space['colsample_bytree'],
        learning_rate=space['learning_rate'],
        use_label_encoder=False,
    )
    # The search space samples reg_lambda and fixes a seed, but neither was
    # passed to the model before; forward them when they are supplied.
    if 'reg_lambda' in space:
        model_kwargs['reg_lambda'] = space['reg_lambda']
    if 'seed' in space:
        model_kwargs['random_state'] = int(space['seed'])

    clf = xgb.XGBClassifier(**model_kwargs)

    evaluation = [(X_train, y_train), (X_test, y_test)]

    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10, verbose=False)

    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
# Run 100 TPE-guided evaluations of `objective` over `space`; `trials`
# collects the history of every evaluation.
trials = Trials()
best_hyperparams = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)

In [None]:
# Display the best point returned by fmin.
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Columns handled by the numeric pre-processing branch.
# The raw 'duration_ms' and 'tempo' columns were dropped after imputation, so
# the frame behind X_train only has the '*_imp' versions. Naming the raw
# columns here made the ColumnTransformer fail with a missing-column error
# when the grid search was fitted.
numeric_features = ['popularity', 'duration_ms_imp', 'loudness', 'tempo_imp']
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

# define which transformer applies to which columns
# NOTE(review): ColumnTransformer's documented default is remainder='drop', so
# only the four numeric columns above reach the classifier — confirm that is
# intended. These columns were also already imputed and scaled earlier, so this
# imputer/scaler re-processes prepared data.
preprocess = ColumnTransformer([('numeric_transformer', numeric_transformer, numeric_features)
])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
# SelectKBest(k='all') keeps every feature, so it acts as a pass-through here.
clf = Pipeline(
    steps=[("preprocess", preprocess), ('f_classif', SelectKBest(k='all')), ("xgb", xg_clf_opt)]
)

In [None]:
# Candidate feature counts (defined here but not referenced by the grid below).
n_features_to_test = np.arange(1, 6652)

# Scalers to swap into the numeric branch of the pipeline.
scalers_to_test = [
    StandardScaler(),
    RobustScaler(),
    QuantileTransformer(),
    MinMaxScaler(),
    MaxAbsScaler(),
    Normalizer(),
    PowerTransformer(),
]

# 2 imputation strategies x 7 scalers = 14 candidate pipelines.
param_grid = {
    "preprocess__numeric_transformer__imputer__strategy": ["mean", "median"],
    "preprocess__numeric_transformer__scaler": scalers_to_test,
}

# 10-fold cross-validated search over the grid; fitting happens in a later cell.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)
grid_search

In [None]:
# Show the first 14 column names of the training inputs.
X_train.columns[0:14]

In [None]:
# Run the cross-validated grid search on the training split, then report the
# winning parameter combination.
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_, sep="\n")