# Predictive Modeling

In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import httpimport

from pathlib import Path
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [59]:
# Import personal library
# Loads the `jcds.metrics` module from the GitHub repository junclemente/jcds
# via httpimport and binds it to `jm` (used later for `jm.mc_confusion`).
# NOTE(review): ref="master" is a moving branch, so the imported code can change
# between runs, and a network connection is presumably required — consider
# pinning to a tag or commit for reproducibility.
with httpimport.github_repo("junclemente", "jcds", ref="master"):
    import jcds.metrics as jm

In [78]:
# Read the training, validation and testing splits from the datasets folder
datasets = Path("../datasets")
train_data, val_data, test_data = (
    "training_data.csv",
    "validation_data.csv",
    "testing_data.csv",
)
train_df, val_df, test_df = (
    pd.read_csv(datasets / file_name)
    for file_name in (train_data, val_data, test_data)
)

# Preview the first rows of each split, in train / val / test order
display(train_df.head(), val_df.head(), test_df.head())

Unnamed: 0,Undergrad_Degree,Work_Experience,Employability_Before,Status,Status_enc
0,Computer Science,No,185.174286,Placed,1
1,Engineering,No,206.867959,Not Placed,0
2,Art,No,234.881837,Not Placed,0
3,Finance,No,173.900408,Placed,1
4,Art,No,184.06398,Not Placed,0


Unnamed: 0,Undergrad_Degree,Work_Experience,Employability_Before,Status,Status_enc
0,Business,Yes,261.272959,Placed,1
1,Engineering,No,173.558776,Not Placed,0
2,Finance,No,205.074388,Placed,1
3,Business,Yes,230.52602,Placed,1
4,Business,No,229.0,Not Placed,0


Unnamed: 0,Undergrad_Degree,Work_Experience,Employability_Before,Status,Status_enc
0,Finance,No,168.775918,Placed,1
1,Business,Yes,195.508673,Placed,1
2,Computer Science,No,260.76051,Placed,1
3,Art,No,231.892551,Not Placed,0
4,Computer Science,Yes,400.0,Placed,1


## Set Up Training, Validation, and Test DataFrames

In [61]:
# Column holding the encoded label to predict, followed by the predictor
# columns used as model inputs
target = "Status_enc"
variables = [
    "Undergrad_Degree",
    "Work_Experience",
    "Employability_Before",
]

In [79]:
# Split each dataframe into the predictor columns (X) and the encoded target (y)
X_train = train_df[variables]
y_train = train_df[target]
X_val = val_df[variables]
y_val = val_df[target]
X_test = test_df[variables]
y_test = test_df[target]

# One-hot encode categorical variables.
# The training split defines the feature columns; drop_first removes the first
# level of each categorical so the dummies are not redundant.
X_train = pd.get_dummies(X_train, drop_first=True)

# Encode validation/test with every level kept, then reindex to the training
# columns. Encoding each split on its own with drop_first=True would produce
# different columns whenever a split lacks a category (or contains one the
# training data never saw), so the model would be fed features that do not
# match what it was fitted on.
# Levels absent from a split are filled with False; levels unknown to the
# training data are dropped.
X_val = pd.get_dummies(X_val).reindex(columns=X_train.columns, fill_value=False)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=False)

display(X_train.head())
display(X_val.head())
display(X_test.head())

Unnamed: 0,Employability_Before,Undergrad_Degree_Business,Undergrad_Degree_Computer Science,Undergrad_Degree_Engineering,Undergrad_Degree_Finance,Work_Experience_Yes
0,185.174286,False,True,False,False,False
1,206.867959,False,False,True,False,False
2,234.881837,False,False,False,False,False
3,173.900408,False,False,False,True,False
4,184.06398,False,False,False,False,False


Unnamed: 0,Employability_Before,Undergrad_Degree_Business,Undergrad_Degree_Computer Science,Undergrad_Degree_Engineering,Undergrad_Degree_Finance,Work_Experience_Yes
0,261.272959,True,False,False,False,True
1,173.558776,False,False,True,False,False
2,205.074388,False,False,False,True,False
3,230.52602,True,False,False,False,True
4,229.0,True,False,False,False,False


Unnamed: 0,Employability_Before,Undergrad_Degree_Business,Undergrad_Degree_Computer Science,Undergrad_Degree_Engineering,Undergrad_Degree_Finance,Work_Experience_Yes
0,168.775918,False,False,False,True,False
1,195.508673,True,False,False,False,True
2,260.76051,False,True,False,False,False
3,231.892551,False,False,False,False,False
4,400.0,False,True,False,False,True


# Decision Tree

## RandomizedSearchCV

In [63]:
# Base estimator for the hyperparameter search. It is seeded because the tree
# draws random feature subsets when max_features is "sqrt" or "log2"; without a
# seed the cross-validation scores, and therefore best_params_, can change
# from run to run.
# NOTE: the name `tree` shadows the `tree` module imported from sklearn at the
# top of the notebook; it is kept so later cells that reference `tree` keep
# working.
tree = DecisionTreeClassifier(random_state=42)

# Candidate hyperparameter values (2 * 5 * 4 * 4 * 3 = 480 combinations)
param_dist = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 10, 20, 30, 40],
    "min_samples_split": [2, 5, 10, 15],
    "min_samples_leaf": [1, 2, 4, 6],
    "max_features": [None, "sqrt", "log2"],
}

# Try 50 sampled combinations, each scored by 5-fold cross-validated accuracy.
# random_state here fixes which combinations are sampled.
random_search = RandomizedSearchCV(
    estimator=tree,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring="accuracy",
    random_state=42,
)

random_search.fit(X_train, y_train)

print(random_search.best_params_)

{'min_samples_split': 15, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 10, 'criterion': 'gini'}


## GridSearchCV

In [72]:
# Narrower grid around the values picked by the randomized search
param_grid = {
    "min_samples_split": [10, 15, 20],
    "min_samples_leaf": [1, 2, 3],
    "max_features": [None, "sqrt", "log2"],
    "max_depth": [4, 6, 8, 10],
    "criterion": ["gini", "entropy"],
}

# Build the estimator here instead of reading the notebook-level `tree` name:
# that name is only an estimator if the randomized-search cell has already run
# (otherwise it is the sklearn `tree` module). Seeding the estimator makes the
# cross-validation scores repeatable for the "sqrt"/"log2" max_features options.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
)

# Exhaustively evaluate every combination with 5-fold cross-validated accuracy
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

{'criterion': 'entropy', 'max_depth': 6, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}


## Prediction Model

In [73]:
# Hyperparameters for the final tree (the values printed by the grid search)
dt_params = {
    "criterion": "entropy",
    "max_depth": 6,
    "max_features": None,
    "min_samples_leaf": 2,
    "min_samples_split": 10,
}

# Fit the seeded final model on the training split
dt_model = DecisionTreeClassifier(random_state=42, **dt_params).fit(X_train, y_train)

# Predicted classes for the validation split
y_pred = dt_model.predict(X_val)

In [77]:
# Validation-set confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_true=y_val, y_pred=y_pred)

# Report the matrix and the per-class metrics table via the jcds helper
jm.mc_confusion(cm)

Confusion Matrix:
[[154   7]
 [  5 218]]


Unnamed: 0,Class 0,Class 1
Accuracy,0.96875,0.96875
Error rate,0.03125,0.03125
Sensitivity (Recall),0.95652,0.97758
Specificity,0.97758,0.95652
Precision,0.96855,0.96889
F1,0.9625,0.97321
F2,0.9589,0.97583
F0.5,0.96612,0.97061


## Test

In [81]:
# Score the fitted model on the held-out test split
test_pred = dt_model.predict(X_test)

# Test-set confusion matrix (rows: actual class, columns: predicted class)
cm_test = confusion_matrix(y_true=y_test, y_pred=test_pred)

# Report the matrix and the per-class metrics table via the jcds helper
jm.mc_confusion(cm_test)

Confusion Matrix:
[[ 93   2]
 [  6 139]]


Unnamed: 0,Class 0,Class 1
Accuracy,0.96667,0.96667
Error rate,0.03333,0.03333
Sensitivity (Recall),0.97895,0.95862
Specificity,0.95862,0.97895
Precision,0.93939,0.98582
F1,0.95876,0.97203
F2,0.97077,0.96394
F0.5,0.94705,0.98025
