# key words
- model selection: train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
- classification: KNeighborsClassifier
- metrics: confusion_matrix, recall_score, make_scorer, roc_auc_score

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, recall_score, make_scorer, roc_auc_score


data_address = os.path.join(os.getcwd(), "pima-indians-diabetes.csv")
column_names = ['pregnancy_x', # number of times pregnant
                'plasma_con', # plasma glucose concentration a 2 hours in an oral glucose tolerance test
                'blood_pressure', # diastolic blood pressure (mm Hg)
                'skin_mm', # triceps skin fold thickness (mm)
                'insulin', # 2-hour serum insulin (mu U/ml)
                'bmi', # body mass index (weight in kg/(height in m)^2)
                'pedigree_func', # diabetes pedigree function
                'age', # age (years)
                'target'] # class varible (0 or 1)
feature_names = column_names[:-1]
target_name = column_names[-1]

data = pd.read_csv(data_address, names=column_names, skiprows=1)

X, y = data[feature_names], data[target_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=7)

recall_scorer = make_scorer(recall_score, greater_is_better=True)
roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True)

knn_clf = KNeighborsClassifier()

# RandomizedSearchCV
param_dist = {'n_neighbors': list(range(3, 20, 1))}
# rs = RandomizedSearchCV(knn_clf, param_dist, cv=10, n_iter=17, iid=False, scoring=recall_scorer)
rs = RandomizedSearchCV(knn_clf, param_dist, cv=10, n_iter=17, iid=False, scoring=roc_auc_scorer)
%timeit rs.fit(X_train, y_train)
print("RandomizedSearchCV best_params: %s, best_score: %.4f" % (rs.best_params_, rs.best_score_))

y_pred = rs.predict(X_test)
print("confusion_matrix:\n%s" % confusion_matrix(y_test, y_pred))

print("recall_score:", recall_score(y_test, y_pred))
print("roc_auc_score:", roc_auc_score(y_test, y_pred))

1.32 s ± 67.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
RandomizedSearchCV best_params: {'n_neighbors': 15}, best_score: 0.7099
confusion_matrix:
[[84 16]
 [27 27]]
recall_score: 0.5
roc_auc_score: 0.67
