# Introduction

The goal of this project is to determine the factors that directly contribute to student success for this online program. Previous EDA showed that the distributions of numeric data did not differ between passing and failing students. Since both continuous and categorical values are used to predict the student outcome, this notebook scales the encoded features and then fits and evaluates a classification model, starting with a simple decision tree.

## Loading libraries

In [None]:
# Standard
import os
import random
from datetime import datetime, timedelta
import itertools
from subprocess import call
from io import StringIO

# Manipulation
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
from scipy import stats, optimize, spatial

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.colors
import matplotlib.cm as cm
import seaborn as sns
import plotly.graph_objects as go
from IPython.display import Image
import graphviz
import pydotplus

# Modeling
from sklearn import datasets, svm, decomposition
from sklearn.ensemble import RandomForestClassifier, ExtraTreesRegressor, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.cluster import KMeans, AffinityPropagation, SpectralClustering, DBSCAN, AgglomerativeClustering
from sklearn.neighbors import KNeighborsClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold, cross_val_score, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.datasets import make_classification

# Metrics
from sklearn.metrics import precision_recall_curve, f1_score, auc
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.metrics import roc_curve, roc_auc_score, log_loss
from sklearn.metrics import silhouette_score, silhouette_samples

## Loading statistical functions

In [None]:
def ecdf(data):
    """Return the empirical CDF of a one-dimensional sample.

    Returns a pair ``(x, y)``: ``x`` is the sample sorted in ascending
    order and ``y`` holds the cumulative fractions ``1/n, 2/n, ..., 1``.
    """
    count = len(data)
    return np.sort(data), np.arange(1, count + 1) / count

def pearson_r(x, y):
    """Return the Pearson correlation coefficient of ``x`` and ``y``.

    The value is the off-diagonal entry of the 2x2 correlation matrix
    produced by ``np.corrcoef``.
    """
    return np.corrcoef(x, y)[0, 1]

def bootstrap_replicate_1d(data, func):
    """Compute one bootstrap replicate of ``func`` on 1D ``data``.

    The data are resampled with replacement to the original length and
    ``func`` is applied to the resample.
    """
    return func(np.random.choice(data, size=len(data)))

def draw_bs_reps(data, func, size=1):
    """Return ``size`` bootstrap replicates of ``func`` applied to ``data``.

    Each replicate comes from one call to ``bootstrap_replicate_1d``; the
    results are collected into a float array of length ``size``.
    """
    replicate_stream = (bootstrap_replicate_1d(data, func) for _ in range(size))
    return np.fromiter(replicate_stream, dtype=float, count=size)

def draw_bs_pairs_linreg(x, y, size=1):
    """Perform pairs bootstrap for a degree-1 linear regression.

    Parameters:
        x, y: paired observations (array, list, or pandas Series).
        size: number of bootstrap replicates to draw.

    Returns:
        A pair ``(slopes, intercepts)`` of float arrays, each of length
        ``size``, holding the ``np.polyfit`` coefficients of every resample.
    """
    # Coerce to ndarrays so the index-array lookups below are positional:
    # plain lists do not support array indexing, and a pandas Series would
    # otherwise look the integers up as labels.
    x = np.asarray(x)
    y = np.asarray(y)

    inds = np.arange(len(x))
    bs_slope_reps = np.empty(size)
    bs_intercept_reps = np.empty(size)
    for i in range(size):
        # Resample (x, y) pairs together, with replacement.
        bs_inds = np.random.choice(inds, size=len(inds))
        bs_x, bs_y = x[bs_inds], y[bs_inds]
        bs_slope_reps[i], bs_intercept_reps[i] = np.polyfit(bs_x, bs_y, 1)
    return bs_slope_reps, bs_intercept_reps

def draw_bs_pairs(x, y, func, size=1):
    """Perform pairs bootstrap for a single statistic.

    Parameters:
        x, y: paired observations (array, list, or pandas Series).
        func: callable taking the resampled ``(x, y)`` and returning a scalar.
        size: number of bootstrap replicates to draw.

    Returns:
        Float array of length ``size`` with one value of ``func`` per resample.
    """
    # Coerce to ndarrays so the index-array lookups below are positional:
    # plain lists do not support array indexing, and a pandas Series would
    # otherwise look the integers up as labels.
    x = np.asarray(x)
    y = np.asarray(y)

    inds = np.arange(len(x))
    bs_replicates = np.empty(size)
    for i in range(size):
        # Resample (x, y) pairs together, with replacement.
        bs_inds = np.random.choice(inds, size=len(inds))
        bs_replicates[i] = func(x[bs_inds], y[bs_inds])
    return bs_replicates

def permutation_sample(data1, data2):
    """Pool two data sets, shuffle them, and split them back apart.

    Returns two arrays: the first has ``len(data1)`` elements of the
    shuffled pool and the second has the remaining elements.
    """
    shuffled = np.random.permutation(np.concatenate((data1, data2)))
    n_first = len(data1)
    return shuffled[:n_first], shuffled[n_first:]

def draw_perm_reps(data_1, data_2, func, size=1):
    """Return ``size`` permutation replicates of ``func``.

    For every replicate the two data sets are re-shuffled with
    ``permutation_sample`` and ``func`` is evaluated on the resulting pair.
    """
    replicates = np.empty(size)
    for slot in range(size):
        replicates[slot] = func(*permutation_sample(data_1, data_2))
    return replicates

def diff_of_means(data_1, data_2):
    """Return mean(data_1) minus mean(data_2)."""
    return np.mean(data_1) - np.mean(data_2)

def diff_frac(data_A, data_b):
    """Return the difference in fractions between two samples.

    Each fraction is ``sum(sample) / len(sample)``; the result is the
    fraction of ``data_b`` minus the fraction of ``data_A``.
    """
    # The body previously read an undefined name ``data_B`` instead of the
    # ``data_b`` parameter.
    frac_A = np.sum(data_A) / len(data_A)
    frac_B = np.sum(data_b) / len(data_b)
    return frac_B - frac_A

def rmse(pred, obs):
    """Return the root-mean-square error between ``pred`` and ``obs``."""
    residuals = pred - obs
    mean_squared = (residuals ** 2).mean()
    return np.sqrt(mean_squared)

def mse(pred, obs):
    """Return the mean squared error between ``pred`` and ``obs``."""
    residuals = pred - obs
    return (residuals ** 2).mean()

def bon_correct(alpha, n):
    """Return ``alpha`` divided by ``n`` (Bonferroni-style correction)."""
    return alpha / n

# Loading Data

In [None]:
# Read the modeling data set from the CSV in the working directory.
DATA_PATH = 'capstone_2_modeling.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Remove the 'Unnamed: 0' column in place. errors='ignore' keeps this cell
# safe to re-run (or to run on a CSV that lacks the column) instead of
# raising KeyError once the column is already gone.
df.drop(columns='Unnamed: 0', inplace=True, errors='ignore')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print the column names, dtypes and non-null counts.
df.info()

In [None]:
# Show summary statistics for the frame's columns.
df.describe()

In [None]:
# Show the (rows, columns) dimensions of the frame.
df.shape

# Splitting the Data

Now that the data has been encoded, it can be split into its x (feature) and y (response) variables. The 'final_result' column was left untouched during encoding because it is the response variable that all the other features are used to predict. For the x data, 'id_student' will also be dropped, since it is an identifier (effectively a categorical label) rather than a predictive feature.

Since the data points were originally ordered by assessment and student, the rows need to be shuffled during the train/test split so that students at the bottom of the df do not all end up in the same partition.

In [None]:
# Predictors: every column except the student identifier and the target.
x = df.drop(['id_student', 'final_result'], axis=1)
# Target, kept as a single-column DataFrame.
y = df.loc[:, ['final_result']]

# Column names of the predictors, in their original order.
features = x.columns.tolist()

# Shuffled 75/25 split, stratified on the target.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    shuffle=True,
    stratify=y,
    random_state=6022,
)

# Scaling

Despite encoding, a few columns are still on very different scales from each other. A StandardScaler will be applied so that all features are on a comparable scale.

In [None]:
# Fit the scaler on the training predictors only; fit() returns the
# estimator itself, so scaler_model refers to the same fitted object.
scaler = StandardScaler()
scaler.fit(x_train)
scaler_model = scaler

In [None]:
# Apply the fitted scaler to the training and test predictors.
x_train_scale, x_test_scale = (
    scaler_model.transform(partition) for partition in (x_train, x_test)
)

In [None]:
# Five-fold stratified splitter shared by the searches and CV scoring below.
cv_k = StratifiedKFold(n_splits=5)

# Simple Decision Tree

In [None]:
# Hyper-parameter search space for the decision tree.
dt_params = {
    'criterion': ['entropy', 'gini'],
    # Depths 10, 20, ..., 110 (the same integers the original linspace gave).
    'max_depth': list(range(10, 111, 10)),
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for
    # classifiers it was only an alias of 'sqrt', so the old list held one
    # effective value. 'log2' is offered as a genuinely different option.
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': np.arange(2, 40),
}

# Base estimator handed to the hyper-parameter search.
cv_dt = DecisionTreeClassifier(random_state=6022)

In [None]:
# Hyper-parameter search space containing an 'n_estimators' entry
# (ten values: 200, 2400, ..., 20000 — the same integers the original
# linspace produced).
rf_params = {
    'n_estimators': list(range(200, 20001, 2200)),
    # The comma after this entry was missing, which made the whole cell a
    # SyntaxError.
    'criterion': ['gini', 'entropy'],
    # Depths 10, 20, ..., 110.
    'max_depth': list(range(10, 111, 10)),
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for
    # classifiers it was only an alias of 'sqrt'.
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': np.arange(2, 40),
}

# NOTE(review): this re-creates the same decision-tree estimator as the
# previous cell although the grid above carries 'n_estimators'; presumably a
# different estimator was intended to go with rf_params — confirm.
cv_dt = DecisionTreeClassifier(random_state=6022)

In [None]:
# Randomized search over dt_params: 200 sampled candidates, each scored with
# the shared stratified folds.
dt_rand = RandomizedSearchCV(
    estimator=cv_dt,
    param_distributions=dt_params,
    n_iter=200,
    cv=cv_k,
    random_state=6022,
)
# fit() returns the fitted search object itself.
dt_rand_cv = dt_rand.fit(x_train_scale, y_train)

In [None]:
# Report the best cross-validated score and the parameters that produced it.
print(f"Best Score:{dt_rand_cv.best_score_}")
print(f"Best Parameters: {dt_rand_cv.best_params_}")

In [None]:
# Rebuild the tree with the best parameters found by the search, adding
# balanced class weights, then fit on the training set and predict the
# test set.
dt = DecisionTreeClassifier(
    class_weight='balanced',
    random_state=6022,
    **dt_rand_cv.best_params_,
)
dt_model = dt.fit(x_train_scale, y_train)
dt_pred = dt_model.predict(x_test_scale)

In [None]:
# ROC-AUC cross-validation scores over the shared stratified folds.
# NOTE(review): cross_val_score re-fits the estimator on the folds of
# whatever data it is given, so the "testing" score comes from models trained
# on test-set folds, not from dt_model as fitted on the training set.
dt_cv_scores_test = cross_val_score(
    dt_model, x_test_scale, y_test, scoring='roc_auc', cv=cv_k
)
dt_cv_scores_train = cross_val_score(
    dt_model, x_train_scale, y_train, scoring='roc_auc', cv=cv_k
)
for _label, _scores in (
    ('Training', dt_cv_scores_train),
    ('Testing', dt_cv_scores_test),
):
    print(f'{_label} CV Score: {_scores.mean()} +- {_scores.std()}')

Because of the heavy class imbalance, the models cannot be compared reliably using cross-validation scores alone. The final model will instead be selected using the metrics derived from the confusion matrix.

In [None]:
# Confusion matrix of the test-set predictions, with rows and columns
# ordered by the fitted model's class labels.
dt_class_labels = dt_model.classes_
dt_confusion = confusion_matrix(y_test, dt_pred, labels=dt_class_labels)
dt_confusion_matrix = ConfusionMatrixDisplay(
    confusion_matrix=dt_confusion,
    display_labels=dt_class_labels,
)
dt_confusion_matrix.plot()
plt.show()

In [None]:
# Per-class precision, recall and F1 for the test-set predictions.
dt_report = classification_report(y_test, dt_pred)
print(dt_report)

In [None]:
# Pair each predictor name with the fitted tree's importance for it.
dt_imp = dt_model.feature_importances_
df_feat_imp_df = pd.DataFrame({'feature': features, 'importance': dt_imp})
dt_sorted = df_feat_imp_df.sort_values(by='importance', ascending=False)

# Bar chart of the 20 most important features.
dt_feat_plot = sns.catplot(
    data=dt_sorted.head(20),
    kind='bar',
    x='feature',
    y='importance',
    height=5,
    aspect=2,
)
# Rotate the tick labels without assigning the result: the return value of
# plt.xticks previously overwrote dt_feat_plot, discarding the plot handle.
plt.xticks(rotation=90)
plt.show()