In [2]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

<font color=yellow> Load Dataset </font>

In [3]:
# Read the training and test CSV files from the local data/ folder,
# in that order, into two separate DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/noshow_train.csv', 'data/noshow_test.csv')
)

In [4]:
# Show the column labels of the training frame as a plain Python list.
df_train.columns.tolist()

['PatientId',
 'AppointmentID',
 'Gender',
 'ScheduledDay',
 'AppointmentDay',
 'Age',
 'Neighbourhood',
 'Scholarship',
 'Hipertension',
 'Diabetes',
 'Alcoholism',
 'Handcap',
 'SMS_received',
 'No-show']

<font color=yellow> Missing values </font>

In [5]:
# Count missing entries per column in the training frame.
df_train.isna().sum()

PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64

In [6]:
# Count missing entries per column in the test frame.
df_test.isna().sum()

PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
dtype: int64

<font color=yellow> Drop non-numeric and identifier features </font>

In [7]:
# Columns excluded from the model input in both frames.
# NOTE(review): AppointmentID also looks like an identifier but is still kept
# as a feature — confirm that this is intended.
feature_drops = ['Gender', 'ScheduledDay',
                 'AppointmentDay', 'Neighbourhood',
                 'PatientId']
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
df_train = df_train.drop(feature_drops, axis=1, errors='ignore')
df_test = df_test.drop(feature_drops, axis=1, errors='ignore')

<font color=yellow> Extract features from training </font>

In [8]:
# Everything except the 'No-show' label column becomes the training
# feature matrix (NumPy array).
train_features = df_train.drop('No-show', axis=1)
X_train = train_features.values

<font color=yellow> Collect the training labels </font>

In [9]:
# The 'No-show' column alone is the training target vector.
y_train = df_train.loc[:, 'No-show'].values

<font color=yellow> Extract test features </font>

In [10]:
# Test-set feature matrix as a NumPy array. df_test carries no 'No-show'
# column (see its missing-value listing above), so nothing else is removed here.
X_test = df_test.values

Scale features

In [11]:
from sklearn import preprocessing

# Estimate the scaling statistics on the training matrix only, then apply
# that same fitted transformation to the test matrix.
std_scaler = preprocessing.StandardScaler()
X_train_std_sk = std_scaler.fit_transform(X_train)
X_test_std_sk = std_scaler.transform(X_test)

<font color=red> Use Cross-Validation to optimize **n_neighbors**</font>

scoring parameter:
https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [43]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Candidate hyper-parameters for the k-NN search. A smaller grid tried before:
#   {'n_neighbors': [3, 5, 10], 'metric': ['euclidean', 'chebyshev']}
tuned_parameters = dict(
    n_neighbors=[20, 25, 30, 40],
    metric=['euclidean'],
)

# GridSearchCV fits every candidate with 3-fold cross-validation (cv=3),
# scores each by ROC AUC and keeps the best one; n_jobs=2 uses two workers.
_mykNN = KNeighborsClassifier()
mykNN = GridSearchCV(
    estimator=_mykNN,
    param_grid=tuned_parameters,
    scoring='roc_auc',
    cv=3,
    n_jobs=2,
    verbose=10,
)
mykNN.fit(X_train_std_sk, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    8.1s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   15.2s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:   35.4s
[Parallel(n_jobs=2)]: Done  12 out of  12 | elapsed:   43.8s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=2,
             param_grid={'metric': ['euclidean'],
                         'n_neighbors': [20, 25, 30, 40]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=10)

<font color=red> Cross-validation results </font>

In [44]:
# Disabled on purpose; uncomment to inspect the full per-candidate
# cross-validation results of the k-NN search:
# mykNN.cv_results_

In [45]:
# Parameter combination with the highest mean cross-validated ROC AUC.
# NOTE(review): the reported n_neighbors (40) is the largest value in the
# searched grid, so even larger k may score better — consider widening the grid.
mykNN.best_params_

{'metric': 'euclidean', 'n_neighbors': 40}

In [46]:
# Mean ROC AUC of the best candidate, obtained by cross-validation on the
# training data — this is NOT the AUC on the test data.
# Test-set AUC values noted so far (presumably recorded by hand — verify):
#   {'metric': 'euclidean', 'n_neighbors': 10} -> 0.618
#   current best parameters                    -> 0.6488186552632167
mykNN.best_score_

0.6486190814574605

<font color=yellow> Best model from Quiz ~ 0.64 </font>

<font color=red> EVALUATION </font>

<font color=red> 0. Load real labels </font>

In [39]:
# True labels for the test set, read from their own CSV file.
# .values on a DataFrame gives a 2-D array (rows x columns).
test_labels = pd.read_csv('data/noshow_test_labels.csv')
y_real = test_labels.values

<font color=red> 1. True/False Positive/Negative and ROC curve </font>

<font color=red>2a. Get y_score</font>

In [48]:
# mykNN is the fitted GridSearchCV; with refit=True (shown in its repr above)
# predict_proba delegates to the best estimator refit on the training data.
y_prob = mykNN.predict_proba(X_test_std_sk)
# Keep the second probability column as the score for the ROC analysis.
# NOTE(review): this assumes column 1 corresponds to the positive ("no-show")
# class, i.e. that it is the second entry of mykNN.classes_ — confirm.
y_score = y_prob[:, 1]

<font color=red> 2b. ROC curve</font>

In [49]:
from sklearn.metrics import roc_curve

# False-positive rates, true-positive rates and the score thresholds that
# produce them, computed from the test labels and the k-NN scores.
fpr, tpr, thresholds = roc_curve(y_true=y_real, y_score=y_score)

<font color=red> 2c. AUC Score </font>

In [50]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the test-set scores; displayed as cell output.
auc = roc_auc_score(y_true=y_real, y_score=y_score)
auc

0.6488186552632167

<font color=red>3. Optimize Logistic Regression</font>

In [25]:
from sklearn.linear_model import LogisticRegression

# Base estimator for the grid search that follows: L2-penalised logistic
# regression fitted with the SAGA solver. SAGA shuffles the data internally,
# so random_state is pinned to make repeated runs give the same scores.
_logreg = LogisticRegression(penalty='l2', solver='saga', random_state=0)

In [51]:
# Regularisation constants C to try for the logistic regression.
tuned_parameters = dict(C=[0.1, 0.5, 1])

# Same search settings as the k-NN search: 3-fold cross-validation scored by
# ROC AUC, run on two workers with verbose progress output.
logreg = GridSearchCV(
    estimator=_logreg,
    param_grid=tuned_parameters,
    scoring='roc_auc',
    cv=3,
    n_jobs=2,
    verbose=10,
)
logreg.fit(X_train_std_sk, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    1.6s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    1.8s
[Parallel(n_jobs=2)]: Done   7 out of   9 | elapsed:    2.2s remaining:    0.6s
[Parallel(n_jobs=2)]: Done   9 out of   9 | elapsed:    2.4s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   9 out of   9 | elapsed:    2.4s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='saga',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=2, param_grid={'C': [0.1, 0.5, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=10)

In [52]:
# Mean cross-validated ROC AUC of the best logistic-regression candidate
# (measured on the training data, not on the test set).
logreg.best_score_

0.6471392234854692

In [53]:
# Regularisation constant selected by the search.
# NOTE(review): the reported C (0.1) is the smallest value in the searched
# grid, so smaller values may be worth adding to the grid.
logreg.best_params_

{'C': 0.1}