# Predictive Modeling

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import httpimport
import joblib

from pathlib import Path
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [3]:
# Import personal library
# httpimport loads the `jcds` package straight from the GitHub repo
# junclemente/jcds while the `with` block is active; `jm` is used further down
# for the confusion-matrix metric reports (`jm.mc_confusion`).
# NOTE(review): ref="master" is a moving branch, so the imported code (and the
# reported metrics) may change between runs, and remote code is executed as-is.
# Consider pinning a tag or commit. Presumably requires network access — confirm.
with httpimport.github_repo("junclemente", "jcds", ref="master"):
    import jcds.metrics as jm

In [4]:
# Load the pre-split training / validation / testing CSV files
datasets = Path("../datasets")
train_data, val_data, test_data = (
    "training_data.csv",
    "validation_data.csv",
    "testing_data.csv",
)
train_df, val_df, test_df = (
    pd.read_csv(datasets / csv_name)
    for csv_name in (train_data, val_data, test_data)
)

# Preview the first rows of each split, in train / val / test order
display(train_df.head(), val_df.head(), test_df.head())

Unnamed: 0,Undergrad_Degree,Work_Experience,Employability_Before,Status,Status_enc
0,Computer Science,No,185.174286,Placed,1
1,Engineering,No,206.867959,Not Placed,0
2,Art,No,234.881837,Not Placed,0
3,Finance,No,173.900408,Placed,1
4,Art,No,184.06398,Not Placed,0


Unnamed: 0,Undergrad_Degree,Work_Experience,Employability_Before,Status,Status_enc
0,Business,Yes,261.272959,Placed,1
1,Engineering,No,173.558776,Not Placed,0
2,Finance,No,205.074388,Placed,1
3,Business,Yes,230.52602,Placed,1
4,Business,No,229.0,Not Placed,0


Unnamed: 0,Undergrad_Degree,Work_Experience,Employability_Before,Status,Status_enc
0,Finance,No,168.775918,Placed,1
1,Business,Yes,195.508673,Placed,1
2,Computer Science,No,260.76051,Placed,1
3,Art,No,231.892551,Not Placed,0
4,Computer Science,Yes,400.0,Placed,1


## Set Up Training, Validation, and Test DataFrames

In [5]:
# Column holding the encoded label the model predicts
target = "Status_enc"

# Predictor columns used for predictive modeling
variables = [
    "Undergrad_Degree",
    "Work_Experience",
    "Employability_Before",
]

In [6]:
# Setup train, val, and test dataframes
# .copy() so the dtype changes below never write back into the source frames
X_train = train_df[variables].copy()
y_train = train_df[target]

X_val = val_df[variables].copy()
y_val = val_df[target]

X_test = test_df[variables].copy()
y_test = test_df[target]

# Continuous columns get standardized; every other predictor is categorical
std_cols = ["Employability_Before"]
cat_cols = [col for col in variables if col not in std_cols]

# One-hot encode categorical variables
# The category levels are taken from the TRAINING split and applied to all
# three splits before encoding. Encoding each split independently would let a
# split that is missing a level (or whose first level differs) produce
# different dummy columns / a different dropped baseline than the model was
# trained on. Levels never seen in training become NaN and encode as all-zero.
for col in cat_cols:
    train_levels = pd.CategoricalDtype(sorted(X_train[col].dropna().unique()))
    X_train[col] = X_train[col].astype(train_levels)
    X_val[col] = X_val[col].astype(train_levels)
    X_test[col] = X_test[col].astype(train_levels)

X_train = pd.get_dummies(X_train, drop_first=True)
X_val = pd.get_dummies(X_val, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# Guard: the model is fit on X_train's column layout, so the others must match
assert list(X_val.columns) == list(X_train.columns), "val columns differ from train"
assert list(X_test.columns) == list(X_train.columns), "test columns differ from train"

# Standardize continuous columns: fit the scaler on training data only, then
# reuse the training mean/std for validation and test to avoid leakage
scaler = StandardScaler()
X_train[std_cols] = scaler.fit_transform(X_train[std_cols])
X_val[std_cols] = scaler.transform(X_val[std_cols])
X_test[std_cols] = scaler.transform(X_test[std_cols])

display(X_train.head())
display(X_val.head())
display(X_test.head())

Unnamed: 0,Employability_Before,Undergrad_Degree_Business,Undergrad_Degree_Computer Science,Undergrad_Degree_Engineering,Undergrad_Degree_Finance,Work_Experience_Yes
0,-0.80073,False,True,False,False,False
1,-0.244034,False,False,True,False,False
2,0.474848,False,False,False,False,False
3,-1.090036,False,False,False,True,False
4,-0.829222,False,False,False,False,False


Unnamed: 0,Employability_Before,Undergrad_Degree_Business,Undergrad_Degree_Computer Science,Undergrad_Degree_Engineering,Undergrad_Degree_Finance,Work_Experience_Yes
0,1.152088,True,False,False,False,True
1,-1.098803,False,False,True,False,False
2,-0.29006,False,False,False,True,False
3,0.363071,True,False,False,False,True
4,0.323911,True,False,False,False,False


Unnamed: 0,Employability_Before,Undergrad_Degree_Business,Undergrad_Degree_Computer Science,Undergrad_Degree_Engineering,Undergrad_Degree_Finance,Work_Experience_Yes
0,-1.221539,False,False,False,True,False
1,-0.535532,True,False,False,False,True
2,1.138938,False,True,False,False,False
3,0.398138,False,False,False,False,False
4,4.712054,False,True,False,False,True


# K-Nearest Neighbor

## RandomizedSearchCV

In [7]:
# Coarse search: randomly sample 20 candidate KNN configurations from the
# distributions below and score each with 5-fold cross-validated accuracy
knn = KNeighborsClassifier()
param_dist = dict(
    n_neighbors=list(range(2, 13, 2)),
    weights=["uniform", "distance"],
    metric=["euclidean", "manhattan", "chebyshev"],
)

random_search = RandomizedSearchCV(
    knn,
    param_dist,
    scoring="accuracy",
    cv=5,
    n_iter=20,
    random_state=42,  # fixed seed so the sampled candidates are repeatable
).fit(X_train, y_train)

print(random_search.best_params_)

{'weights': 'uniform', 'n_neighbors': 10, 'metric': 'euclidean'}


## GridSearchCV

In [8]:
# Fine search: exhaustively try every neighbor count from 5 through 11 with
# uniform weights and the euclidean metric, scored by 5-fold CV accuracy
param_grid = dict(
    n_neighbors=list(range(5, 12)),
    weights=["uniform"],
    metric=["euclidean"],
)

grid_search = GridSearchCV(knn, param_grid, scoring="accuracy", cv=5).fit(
    X_train, y_train
)
print(grid_search.best_params_)

{'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}


## Prediction Model

In [9]:
# Build the final classifier from the hyperparameters GridSearchCV selected,
# instead of re-typing them by hand: the model then stays in sync with the
# search whenever the data or the parameter grid changes.
knn_model = KNeighborsClassifier(**grid_search.best_params_)
knn_model.fit(X_train, y_train)

# Predictions on the validation split, evaluated in the next cell
y_pred = knn_model.predict(X_val)

In [10]:
# Validation-set confusion matrix, followed by the per-class metric report
cm = confusion_matrix(y_true=y_val, y_pred=y_pred)
jm.mc_confusion(cm)

Confusion Matrix:
[[152   9]
 [  5 218]]


Unnamed: 0,Class 0,Class 1
Accuracy,0.96354,0.96354
Error rate,0.03646,0.03646
Sensitivity (Recall),0.9441,0.97758
Specificity,0.97758,0.9441
Precision,0.96815,0.96035
F1,0.95597,0.96889
F2,0.94881,0.97408
F0.5,0.96324,0.96375


## Test

In [11]:
# Score the fitted model on the test split and report it the same way as validation
test_pred = knn_model.predict(X_test)
cm_test = confusion_matrix(y_true=y_test, y_pred=test_pred)
jm.mc_confusion(cm_test)

Confusion Matrix:
[[ 91   4]
 [  3 142]]


Unnamed: 0,Class 0,Class 1
Accuracy,0.97083,0.97083
Error rate,0.02917,0.02917
Sensitivity (Recall),0.95789,0.97931
Specificity,0.97931,0.95789
Precision,0.96809,0.9726
F1,0.96296,0.97595
F2,0.95992,0.97796
F0.5,0.96603,0.97394


# Export Model

In [13]:
# Persist the fitted KNN classifier to ../models
models = Path("../models")
# joblib.dump does not create missing folders; make sure the target exists so
# the export also works on a fresh checkout
models.mkdir(parents=True, exist_ok=True)
# NOTE(review): only the classifier is saved. The fitted `scaler` and the
# one-hot column layout used to build X_train are not persisted, so whoever
# loads this model must reproduce that preprocessing — confirm this is intended.
joblib.dump(knn_model, models / "k_nearest_neighbor_model.pkl")

['../models/k_nearest_neighbor_model.pkl']