# Introduction

The goal of this project is to determine the factors that directly contribute to student success for this online program. Previous EDA showed that the distributions of numeric data did not differ between passing and failing students. Since there are both continuous and categorical values to predict the student outcome, 

## Loading libraries

In [3]:
# Standard
import os
import random
from datetime import datetime, timedelta
import itertools
from subprocess import call
from io import StringIO
import time
import pickle

# Manipulation
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
from scipy import stats, optimize, spatial

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.colors
import matplotlib.cm as cm
import seaborn as sns
import plotly.graph_objects as go
from IPython.display import Image
import graphviz
import pydotplus

# Modeling
from sklearn import datasets, svm, decomposition
from sklearn.ensemble import RandomForestClassifier, ExtraTreesRegressor, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.cluster import KMeans, AffinityPropagation, SpectralClustering, DBSCAN, AgglomerativeClustering
from sklearn.neighbors import KNeighborsClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold, cross_val_score, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, Normalizer, RobustScaler
from sklearn.datasets import make_classification

# Metrics
from sklearn.metrics import precision_recall_curve, f1_score, auc
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.metrics import roc_curve, roc_auc_score, log_loss
from sklearn.metrics import silhouette_score, silhouette_samples

## Loading statistical functions

In [5]:
def ecdf(data):
    """Compute ECDF for a one-dimensional array of measurements."""
    n = len(data)
    x = np.sort(data)
    y = np.arange(1, n + 1) / n
    return x, y

def pearson_r(x , y):
    """Compute Pearson correlation coefficient between two arrays."""
    corr_mat = np.corrcoef(x, y)
    return corr_mat[0, 1]

def bootstrap_replicate_1d(data, func):
    """Generate bootstrap replicate of 1D data."""
    bs_sample = np.random.choice(data, len(data))
    return func(bs_sample)

def draw_bs_reps(data, func, size = 1):
    """Draw bootstrap replicates."""
    bs_replicates = np.empty(size)
    for i in range(size):
        bs_replicates[i] = bootstrap_replicate_1d(data, func)
    return bs_replicates

def draw_bs_pairs_linreg(x, y, size = 1):
    """perform pairs bootstrap for linear regression"""
    inds = np.arange(len(x))
    bs_slope_reps = np.empty(size)
    bs_intercept_reps = np.empty(size)
    for i in range(size):
        bs_inds = np.random.choice(inds, size = len(inds))
        bs_x, bs_y = x[bs_inds], y[bs_inds]
        bs_slope_reps[i], bs_intercept_reps[i] = np.polyfit(bs_x, bs_y, 1)
    return bs_slope_reps, bs_intercept_reps

def draw_bs_pairs(x, y, func, size = 1):
    """Perform pairs bootstrap for a single statistic."""
    inds = np.arange(len(x))
    bs_replicates = np.empty(size)
    for i in range(size):
        bs_inds = np.random.choice(inds, size = len(inds))
        bs_x, bs_y = x[bs_inds], y[bs_inds]
        bs_replicates[i] = func(bs_x, bs_y)
    return bs_replicates

def permutation_sample(data1, data2):
    """Generate a permutation sample from two data sets."""
    data = np.concatenate((data1, data2))
    permuted_data = np.random.permutation(data)
    perm_sample_1 = permuted_data[:len(data1)]
    perm_sample_2 = permuted_data[len(data1):]
    return perm_sample_1, perm_sample_2

def draw_perm_reps(data_1, data_2, func, size=1):
    """Generate multiple permutation replicates."""
    perm_replicates = np.empty(size)
    for i in range(size):
        perm_sample_1, perm_sample_2 = permutation_sample(data_1, data_2)
        perm_replicates[i] = func(perm_sample_1, perm_sample_2)
    return perm_replicates

def diff_of_means(data_1, data_2):
    """Difference in means of two arrays."""
    diff = np.mean(data_1) - np.mean(data_2)
    return diff

def diff_frac(data_A, data_b):
    frac_A = np.sum(data_A) / len(data_A)
    frac_B = np.sum(data_B) / len(data_B)
    return frac_B - frac_A

def rmse(pred, obs):
    return np.sqrt(((pred - obs) ** 2).mean())

def mse(pred, obs):
    return ((pred - obs) ** 2).mean()

def bon_correct(alpha, n):
    return (alpha/n)

# Loading Data

In [7]:
df = pd.read_csv('capstone_2_modeling_v2.csv')
with open('region_map.pkl', 'rb') as f:
    region_map = pickle.load(f)

with open('activity_map.pkl', 'rb') as file:
    activity_map = pickle.load(file)

In [8]:
df.drop(columns = 'Unnamed: 0', inplace = True)

In [9]:
df.head()

Unnamed: 0,id_student,gender,disability,age_band,region,imd_band,highest_education,date_registration,num_of_prev_attempts,final_result,studied_credits,activity_type,mean_score,mean_assessment_length,max_assessment_length,mean_active,total_active,mean_clicks,clicks
0,2412002,0,0,2,5686,5,2,-38,0,Distinction,90,9783,84.25,63.625,171,4.0,4,11.0,11
1,2412002,0,0,2,5686,5,2,-38,0,Distinction,90,15664,84.25,63.625,171,1.0,3,5.333333,16
2,2412002,0,0,2,5686,5,2,-38,0,Distinction,90,11403,84.25,63.625,171,3.0,3,6.0,6
3,2412002,0,0,2,5686,5,2,-38,0,Distinction,90,10272,84.25,63.625,171,2.5,5,11.5,23
4,2412002,0,0,2,5686,5,2,-38,0,Distinction,90,5981,84.25,63.625,171,2.0,2,3.0,3


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72305 entries, 0 to 72304
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id_student              72305 non-null  int64  
 1   gender                  72305 non-null  int64  
 2   disability              72305 non-null  int64  
 3   age_band                72305 non-null  int64  
 4   region                  72305 non-null  int64  
 5   imd_band                72305 non-null  int64  
 6   highest_education       72305 non-null  int64  
 7   date_registration       72305 non-null  int64  
 8   num_of_prev_attempts    72305 non-null  int64  
 9   final_result            72305 non-null  object 
 10  studied_credits         72305 non-null  int64  
 11  activity_type           72305 non-null  int64  
 12  mean_score              72305 non-null  float64
 13  mean_assessment_length  72305 non-null  float64
 14  max_assessment_length   72305 non-null

In [11]:
df.describe()

Unnamed: 0,id_student,gender,disability,age_band,region,imd_band,highest_education,date_registration,num_of_prev_attempts,studied_credits,activity_type,mean_score,mean_assessment_length,max_assessment_length,mean_active,total_active,mean_clicks,clicks
count,72305.0,72305.0,72305.0,72305.0,72305.0,72305.0,72305.0,72305.0,72305.0,72305.0,72305.0,72305.0,72305.0,72305.0,72305.0,72305.0,72305.0,72305.0
mean,713637.3,0.392158,0.080658,1.316838,6256.934251,4.503409,2.822184,-64.656649,0.13923,75.746836,9807.248696,75.50787,103.06103,164.890851,2.089066,7.119203,6.297755,23.47514
std,568525.1,0.488235,0.272312,0.483412,1767.818949,2.822728,0.734119,40.937335,0.439077,32.128242,4126.4589,14.974022,42.343092,59.488002,2.006748,13.171022,8.463833,51.89753
min,6516.0,0.0,0.0,1.0,2035.0,0.0,1.0,-172.0,0.0,30.0,2.0,0.0,-6.0,-6.0,1.0,1.0,1.0,1.0
25%,500112.0,0.0,0.0,1.0,4912.0,2.0,2.0,-93.0,0.0,60.0,7666.0,67.56,76.923077,131.0,1.0,2.0,2.0,3.0
50%,584567.0,0.0,0.0,1.0,6262.0,4.0,3.0,-53.0,0.0,60.0,10272.0,78.222222,103.380952,178.0,1.25,4.0,3.4,8.0
75%,634833.0,1.0,0.0,2.0,7558.0,7.0,3.0,-29.0,0.0,90.0,11403.0,86.421053,127.906977,209.0,2.5,8.0,7.0,23.0
max,2698535.0,1.0,1.0,3.0,8989.0,9.0,5.0,-10.0,6.0,180.0,15664.0,100.0,236.857143,243.0,42.909091,472.0,232.375,1859.0


In [12]:
df.shape

(72305, 19)

# Splitting the Data

Now that the data has been encoded, it can now be split into its respective x and y variables. The 'final_result' feature was left untouched since it was the response variable to all the other features. For x data, 'id_student' will be dropped since it serves as more of a categorical variable.

Since the data points were originally ordered by assessment and student, I will need to shuffle the train_test to make sure students at the bottom of the df do not unnecessarily get cut.

In [14]:
x = df.drop(columns = ['id_student', 'final_result'])
y = df[['final_result']]

features = x.columns.tolist()

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 6022, stratify = y, test_size = 0.25, shuffle = True)

# Scaling

Despite encoding, a few columns are still beyond a comparable scope of eachother. A StandardScaler will be applied to ensure that all values are comparable.

In [16]:
scaler = Normalizer()
scaler_model = scaler.fit(x_train)

In [17]:
x_train_scale = scaler_model.transform(x_train)
x_test_scale = scaler_model.transform(x_test)

In [18]:
cv_k = StratifiedKFold(5)
n_est_first = [2**i for i in range(7)]
n_est_second = [int(x) for x in np.linspace(start = 100, stop = 1500, num = 15)]
n_est = n_est_first + n_est_second
n_iter = 200
verb = 4 #Extremely long training times for hypertuning required me to make sure that progress was being made

# Simple Decision Tree

In [20]:
dt_params = {
    'criterion': ['entropy', 'gini'],
    'max_depth': [int(x) for x in np.linspace(10, 90, num = 9)],
    'min_samples_split': [x for x in np.linspace(0.01, 0.5, 10, endpoint = True)]
}

cv_dt = DecisionTreeClassifier(max_features = 'sqrt', random_state = 6022)

In [21]:
start_time = time.time()
dt_rand = RandomizedSearchCV(
    estimator = cv_dt,
    param_distributions = dt_params,
    cv = cv_k,
    n_iter = n_iter,
    verbose = verb,
    return_train_score = True,
    random_state = 6022
)
dt_rand_cv = dt_rand.fit(x_train_scale, y_train)
end_time = time.time()
dt_cv_time = end_time - start_time



Fitting 5 folds for each of 180 candidates, totalling 900 fits
[CV 1/5] END criterion=entropy, max_depth=10, min_samples_split=0.01;, score=(train=0.661, test=0.654) total time=   0.1s
[CV 2/5] END criterion=entropy, max_depth=10, min_samples_split=0.01;, score=(train=0.657, test=0.653) total time=   0.1s
[CV 3/5] END criterion=entropy, max_depth=10, min_samples_split=0.01;, score=(train=0.652, test=0.643) total time=   0.1s
[CV 4/5] END criterion=entropy, max_depth=10, min_samples_split=0.01;, score=(train=0.653, test=0.643) total time=   0.1s
[CV 5/5] END criterion=entropy, max_depth=10, min_samples_split=0.01;, score=(train=0.654, test=0.650) total time=   0.1s
[CV 1/5] END criterion=entropy, max_depth=10, min_samples_split=0.06444444444444444;, score=(train=0.649, test=0.647) total time=   0.1s
[CV 2/5] END criterion=entropy, max_depth=10, min_samples_split=0.06444444444444444;, score=(train=0.635, test=0.637) total time=   0.1s
[CV 3/5] END criterion=entropy, max_depth=10, min_sam

KeyboardInterrupt: 

In [None]:
print("Best Score:" + str(dt_rand_cv.best_score_))
print("Best Parameters: " + str(dt_rand_cv.best_params_))

In [None]:
dt = DecisionTreeClassifier(**dt_rand_cv.best_params_, max_features = 'sqrt', random_state = 6022)

start_time = time.time()
dt_model = dt.fit(x_train_scale, y_train)
end_time = time.time()
dt_fit_time = end_time - start_time

start_time = time.time()
dt_pred = dt_model.predict(x_test_scale)
end_time = time.time()
dt_pred_time = end_time - start_time

dt_cv_scores_test = cross_val_score(dt_model, x_test_scale, y_test, cv = cv_k, scoring = 'roc_auc')
dt_cv_scores_train = cross_val_score(dt_model, x_train_scale, y_train, cv = cv_k, scoring = 'roc_auc')
print(f'Training CV Score: {dt_cv_scores_train.mean()} +- {dt_cv_scores_train.std()}')
print(f'Testing CV Score: {dt_cv_scores_test.mean()} +- {dt_cv_scores_test.std()}')

Because of the heavy class imbalance, the models cannot be compared using cross-validation scores. The metrics obtained from the confusion matrix will be how the final model is selected.

In [None]:
dt_confusion = confusion_matrix(y_test, dt_pred, labels = dt_model.classes_)
dt_confusion_matrix = ConfusionMatrixDisplay(confusion_matrix = dt_confusion, display_labels = dt_model.classes_)
dt_confusion_matrix.plot()
plt.show()

In [None]:
print(classification_report(y_test, dt_pred))

In [None]:
dt_imp = dt_model.feature_importances_
dt_feat_imp_df = pd.DataFrame({'feature': features, 'importance': dt_imp})
dt_sorted = dt_feat_imp_df.sort_values(by = 'importance', ascending = False)
dt_feat_plot = sns.catplot(data = dt_sorted.head(20), kind = 'bar', x = 'feature', y = 'importance', height = 5, aspect = 2)
dt_feat_plot = plt.xticks(rotation = 90)
plt.show()

In [None]:
print(f'CV time: {dt_cv_time}')
print(f'Fit time: {dt_fit_time}')
print(f'Predict time: {dt_pred_time}')

# Random Forest

In [None]:
rf_params = {
    'n_estimators': n_est,
    'criterion': ['gini', 'entropy'],
    'max_depth': [int(x) for x in np.linspace(10, 90, num = 9)],
    'min_samples_split': [x for x in np.linspace(0.01, 0.5, 10, endpoint = True)]
}

cv_rf = RandomForestClassifier(max_features = 'sqrt', n_jobs = 6, random_state = 6022)

In [None]:
start_time = time.time()
rf_rand = RandomizedSearchCV(
    estimator = cv_rf,
    param_distributions = rf_params,
    cv = cv_k,
    n_iter = n_iter,
    verbose = verb,
    return_train_score = True,
    random_state = 6022
)
rf_rand_cv = rf_rand.fit(x_train_scale, y_train)
end_time = time.time()
rf_cv_time = end_time - start_time

In [None]:
print("Best Score:" + str(rf_rand_cv.best_score_))
print("Best Parameters: " + str(rf_rand_cv.best_params_))

In [None]:
rf = RandomForestClassifier(**rf_rand_cv.best_params_, max_features = 'sqrt', random_state = 6022)

start_time = time.time()
rf_model = rf.fit(x_train_scale, y_train)
end_time = time.time()
rf_fit_time = end_time - start_time

start_time = time.time()
rf_pred = rf_model.predict(x_test_scale)
end_time = time.time()
rf_pred_time = end_time - start_time

In [None]:
rf_confusion = confusion_matrix(y_test, rf_pred, labels = rf_model.classes_)
rf_confusion_matrix = ConfusionMatrixDisplay(confusion_matrix = rf_confusion, display_labels = rf_model.classes_)
rf_confusion_matrix.plot()
plt.show()

In [None]:
print(classification_report(y_test, rf_pred))

In [None]:
rf_imp = rf_model.feature_importances_
rf_feat_imp_df = pd.DataFrame({'feature': features, 'importance': rf_imp})
rf_sorted = rf_feat_imp_df.sort_values(by = 'importance', ascending = False)
rf_feat_plot = sns.catplot(data = rf_sorted.head(20), kind = 'bar', x = 'feature', y = 'importance', height = 5, aspect = 2)
rf_feat_plot = plt.xticks(rotation = 90)
plt.show()

In [None]:
print(f'CV time: {rf_cv_time}')
print(f'Fit time: {rf_fit_time}')
print(f'Predict time: {rf_pred_time}')

# K-Nearest Neighbors

In [None]:
knn_params = {
    'n_neighbors': n_est_first,
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

cv_knn = KNeighborsClassifier(n_jobs = 6)

In [None]:
start_time = time.time()
knn_rand = RandomizedSearchCV(
    estimator = cv_knn,
    param_distributions = knn_params,
    cv = cv_k,
    n_iter = n_iter,
    verbose = verb,
    return_train_score = True,
    random_state = 6022
)
knn_rand_cv = knn_rand.fit(x_train_scale, y_train)
end_time = time.time()
knn_cv_time = end_time - start_time

In [None]:
print("Best Score:" + str(knn_rand_cv.best_score_))
print("Best Parameters: " + str(knn_rand_cv.best_params_))

In [None]:
knn = KNeighborsClassifer(**knn_rand_cv.best_params_, random_state = 6022)

start_time = time.time()
knn_model = knn.fit(x_train_scale, y_train)
end_time = time.time()
knn_fit_time = end_time - start_time

start_time = time.time()
knn_pred = knn_model.predict(x_test_scale)
end_time = time.time()
knn_pred_time = end_time - start_time

In [None]:
knn_confusion = confusion_matrix(y_test, knn_pred, labels = knn_model.classes_)
knn_confusion_matrix = ConfusionMatrixDisplay(confusion_matrix = knn_confusion, display_labels = knn_model.classes_)
knn_confusion_matrix.plot()
plt.show()

In [None]:
print(classification_report(y_test, knn_pred))

In [None]:
knn_imp = knn_model.feature_importances_
knn_feat_imp_df = pd.DataFrame({'feature': features, 'importance': knn_imp})
knn_sorted = knn_feat_imp_df.sort_values(by = 'importance', ascending = False)
knn_feat_plot = sns.catplot(data = knn_sorted.head(20), kind = 'bar', x = 'feature', y = 'importance', height = 5, aspect = 2)
knn_feat_plot = plt.xticks(rotation = 90)
plt.show()

In [None]:
print(f'CV time: {knn_cv_time}')
print(f'Fit time: {knn_fit_time}')
print(f'Predict time: {knn_pred_time}')

# Logistic Regression

In [None]:
lr_params = {
    'penalty': ['l1', 'l2', 'elasticnet', 'None'],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'C': [0.001, 0.1, 1, 10, 100]
}

cv_lr = LogisticRegression(n_jobs = 6, random_state = 6022)

In [None]:
start_time = time.time()
lr_rand = RandomizedSearchCV(
    estimator = cv_lr,
    param_distributions = lr_params,
    cv = cv_k,
    n_iter = n_iter,
    verbose = verb,
    return_train_score = True,
    random_state = 6022
)
lr_rand_cv = lr_rand.fit(x_train_scale, y_train)
end_time = time.time()
lr_cv_time = end_time - start_time

In [None]:
print("Best Score:" + str(lr_rand_cv.best_score_))
print("Best Parameters: " + str(lr_rand_cv.best_params_))

In [None]:
lr = LogisticRegression(**lr_rand_cv.best_params_, random_state = 6022)

start_time = time.time()
lr_model = lr.fit(x_train_scale, y_train)
end_time = time.time()
lr_fit_time = end_time - start_time

start_time = time.time()
lr_pred = lr_model.predict(x_test_scale)
end_time = time.time()
lr_pred_time = end_time - start_time

In [None]:
lr_confusion = confusion_matrix(y_test, lr_pred, labels = lr_model.classes_)
lr_confusion_matrix = ConfusionMatrixDisplay(confusion_matrix = lr_confusion, display_labels = lr_model.classes_)
lr_confusion_matrix.plot()
plt.show()

In [None]:
print(classification_report(y_test, lr_pred))

In [None]:
lr_imp = lr_model.feature_importances_
lr_feat_imp_df = pd.DataFrame({'feature': features, 'importance': lr_imp})
lr_sorted = lr_feat_imp_df.sort_values(by = 'importance', ascending = False)
lr_feat_plot = sns.catplot(data = lr_sorted.head(20), kind = 'bar', x = 'feature', y = 'importance', height = 5, aspect = 2)
lr_feat_plot = plt.xticks(rotation = 90)
plt.show()

In [None]:
print(f'CV time: {lr_cv_time}')
print(f'Fit time: {lr_fit_time}')
print(f'Predict time: {lr_pred_time}')

# Gradient Boosting

In [None]:
gb_params = {
    'n_estimators': n_est_second,
    'learning_rate': [0.1, 0.25, 0.5, 1],
    'criterion': ['friedman_mse', 'squared_error'],
    'max_depth': [int(x) for x in np.linspace(10, 90, num = 9)],
    'max_features': ['log2', 'sqrt'],
    'min_samples_split': [x for x in np.linspace(0.01, 0.5, 10, endpoint = True)]
}

cv_gb = GradientBoostingClassifier(random_state = 6022)

In [None]:
start_time = time.time()
gb_rand = RandomizedSearchCV(
    estimator = cv_gb,
    param_distributions = gb_params,
    cv = cv_k,
    n_iter = n_iter,
    verbose = verb,
    return_train_score = True,
    random_state = 6022
)
gb_rand_cv = gb_rand.fit(x_train_scale, y_train)
end_time = time.time()
gb_cv_time = end_time - start_time

In [None]:
print("Best Score:" + str(gb_rand_cv.best_score_))
print("Best Parameters: " + str(gb_rand_cv.best_params_))

In [None]:
gb = GradientBoostingClassifier(**gb_rand_cv.best_params_, random_state = 6022)

start_time = time.time()
gb_model = gb.fit(x_train_scale, y_train)
end_time = time.time()
gb_fit_time = end_time - start_time

start_time = time.time()
gb_pred = gb_model.predict(x_test_scale)
end_time = time.time()
gb_pred_time = end_time - start_time

In [None]:
gb_confusion = confusion_matrix(y_test, gb_pred, labels = gb_model.classes_)
gb_confusion_matrix = ConfusionMatrixDisplay(confusion_matrix = gb_confusion, display_labels = gb_model.classes_)
gb_confusion_matrix.plot()
plt.show()

In [None]:
print(classification_report(y_test, gb_pred))

In [None]:
gb_imp = gb_model.feature_importances_
gb_feat_imp_df = pd.DataFrame({'feature': features, 'importance': gb_imp})
gb_sorted = gb_feat_imp_df.sort_values(by = 'importance', ascending = False)
gb_feat_plot = sns.catplot(data = gb_sorted.head(20), kind = 'bar', x = 'feature', y = 'importance', height = 5, aspect = 2)
gb_feat_plot = plt.xticks(rotation = 90)
plt.show()

In [None]:
print(f'CV time: {gb_cv_time}')
print(f'Fit time: {gb_fit_time}')
print(f'Predict time: {gb_pred_time}')

# KMeans Clustering

In [None]:
km_params = {
    'init': ['k-means++', 'random'],
    'max_iter': n_est
}

cv_km = KMeans(n_clusters = 4, random_state = 6022)

In [None]:
start_time = time.time()
km_rand = RandomizedSearchCV(
    estimator = cv_km,
    param_distributions = km_params,
    cv = cv_k,
    n_iter = n_iter,
    verbose = verb,
    return_train_score = True,
    random_state = 6022
)
km_rand_cv = km_rand.fit(x_train_scale, y_train)
end_time = time.time()
km_cv_time = end_time - start_time

In [None]:
print("Best Score:" + str(km_rand_cv.best_score_))
print("Best Parameters: " + str(km_rand_cv.best_params_))

In [None]:
km = KMeans(**km_rand_cv.best_params_, n_clusters = 4, random_state = 6022)

start_time = time.time()
km_model = km.fit(x_train_scale, y_train)
end_time = time.time()
km_fit_time = end_time - start_time

start_time = time.time()
km_pred = km_model.predict(x_test_scale)
end_time = time.time()
km_pred_time = end_time - start_time

In [None]:
km_confusion = confusion_matrix(y_test, km_pred, labels = km_model.classes_)
km_confusion_matrix = ConfusionMatrixDisplay(confusion_matrix = km_confusion, display_labels = km_model.classes_)
km_confusion_matrix.plot()
plt.show()

In [None]:
print(classification_report(y_test, km_pred))

In [None]:
km_imp = km_model.feature_importances_
km_feat_imp_df = pd.DataFrame({'feature': features, 'importance': km_imp})
km_sorted = km_feat_imp_df.sort_values(by = 'importance', ascending = False)
km_feat_plot = sns.catplot(data = km_sorted.head(20), kind = 'bar', x = 'feature', y = 'importance', height = 5, aspect = 2)
km_feat_plot = plt.xticks(rotation = 90)
plt.show()

In [None]:
print(f'CV time: {km_cv_time}')
print(f'Fit time: {km_fit_time}')
print(f'Predict time: {km_pred_time}')