Data source: Breast Cancer Wisconsin (Diagnostic) dataset — https://www.kaggle.com/uciml/breast-cancer-wisconsin-data/kernels

# Preparing the data

In [1]:
from sklearn.datasets import load_breast_cancer

In [3]:
# Load the breast-cancer dataset bundled with scikit-learn; the returned
# object exposes the feature matrix as `.data` and the labels as `.target`.
data = load_breast_cancer()


In [4]:
# Feature matrix and class labels, pulled out of the loaded dataset object.
X = data.data
y = data.target

In [5]:
from sklearn.preprocessing import scale
# Standardise every feature column (zero mean, unit variance).
# NOTE(review): the scaling statistics are computed on the FULL data set,
# before any train/test split below, so information from the test rows leaks
# into the features used for training. Consider fitting a StandardScaler on
# the training portion only (e.g. inside a Pipeline) — this needs coordinated
# changes in the later cells, so the code is left as-is here.
X= scale(X)

In [6]:
X.shape

(569, 30)

In [7]:
from sklearn.model_selection import train_test_split

# Hold out a test set that is used only for the final evaluation.
# random_state is fixed so that "Restart Kernel -> Run All" reproduces the
# same split (and therefore the same scores); without it every run shuffles
# the rows differently.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, random_state=0)

# K Nearest Neighbors

In [8]:
# Carve a validation set out of the train+validation portion; it is used to
# compare models while the test set stays untouched.
# random_state is fixed so the split is reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, random_state=0
)

In [9]:
from sklearn.neighbors import KNeighborsClassifier

In [10]:
# Baseline: a 5-nearest-neighbours classifier fitted on the training split,
# then scored (accuracy) on the validation and test splits.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
val_accuracy = knn.score(X_val, y_val)
test_accuracy = knn.score(X_test, y_test)
print('The validation score is : {:.3f}'.format(val_accuracy))
print('The score on the test set is : {:.3f}'.format(test_accuracy))

The validation score is : 0.972
The score on the test set is : 0.965


# Trying different values for n_neighbors

In [13]:
import numpy as np

# Candidate values of k: the odd numbers 1, 3, ..., 13.
neighbors = np.arange(1, 15, 2)

# Validation accuracy for each candidate, fitted on the training split only.
val_scores = [
    KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train).score(X_val, y_val)
    for k in neighbors
]

best_position = np.argmax(val_scores)  # first maximum wins on ties
print("best validation score: {:.3f}".format(np.max(val_scores)))
best_n_neighbors = neighbors[best_position]
print("best n_neighbors: {}".format(best_n_neighbors))

# Refit the winning configuration on train + validation data before the
# single final check against the held-out test set.
knn = KNeighborsClassifier(n_neighbors=best_n_neighbors)
knn.fit(X_train_val, y_train_val)
test_accuracy = knn.score(X_test, y_test)
print("test-set score: {:.3f}".format(test_accuracy))

best validation score: 0.972
best n_neighbors: 3
test-set score: 0.979


# Cross-validation

In [14]:
from sklearn.model_selection import cross_val_score

In [15]:
# Train/test split for the cross-validation sections: no separate validation
# set is needed because cross-validation on X_train plays that role.
# Note: this rebinds X_train / X_test / y_train / y_test from the sections above.
# random_state is fixed so the split (and the reported scores) are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [18]:
# Mean 10-fold cross-validation accuracy on the training data for every
# candidate k in `neighbors` (defined in the validation-sweep cell above).
cross_val_scores = [
    np.mean(cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=10))
    for k in neighbors
]

top_cv_score = np.max(cross_val_scores)
print('Best cross_val_score is :{}'.format(top_cv_score))
best_n_neighbors = neighbors[np.argmax(cross_val_scores)]  # first maximum wins on ties
print('Best number of neighbors is:{}'.format(best_n_neighbors))

# Refit the selected model on the whole training set and evaluate once on
# the held-out test set.
knn = KNeighborsClassifier(n_neighbors=best_n_neighbors)
knn.fit(X_train, y_train)
test_accuracy = knn.score(X_test, y_test)
print('The score on the test set is :{}'.format(test_accuracy))

Best cross_val_score is :0.9647840531561462
Best number of neighbors is:7
The score on the test set is :0.9790209790209791


# GridSearchCV

In [19]:
from sklearn.model_selection import GridSearchCV

In [21]:
# Same search as the manual cross-validation loop, delegated to GridSearchCV:
# odd k from 1 to 13, 10-fold cross-validation on the training data.
param_grid = {'n_neighbors': np.arange(1, 15, 2)}

# return_train_score=True is requested explicitly: the cells below inspect
# cv_results_ and the recorded output lists the *_train_score columns, but
# since scikit-learn 0.21 those columns are omitted by default.
grid = GridSearchCV(
    KNeighborsClassifier(),
    param_grid=param_grid,
    cv=10,
    return_train_score=True,
)
grid.fit(X_train, y_train)

print('best mean cross_validation score is :{:.3f}'.format(grid.best_score_))
print('Best parameters: {}'.format(grid.best_params_))

# grid.score uses the best estimator found by the search.
print('test score: {:.3f}'.format(grid.score(X_test, y_test)))

best mean cross_validation score is :0.965
Best parameters: {'n_neighbors': 7}
test score: 0.979


In [23]:
import pandas as pd

In [24]:
# Tabulate the grid-search bookkeeping (timings, per-split scores, ranks)
# as a DataFrame, then list which columns are available.
cv_results = grid.cv_results_
results = pd.DataFrame(cv_results)
results.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_n_neighbors', 'params', 'split0_test_score', 'split1_test_score',
       'split2_test_score', 'split3_test_score', 'split4_test_score',
       'split5_test_score', 'split6_test_score', 'split7_test_score',
       'split8_test_score', 'split9_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score', 'split0_train_score',
       'split1_train_score', 'split2_train_score', 'split3_train_score',
       'split4_train_score', 'split5_train_score', 'split6_train_score',
       'split7_train_score', 'split8_train_score', 'split9_train_score',
       'mean_train_score', 'std_train_score'],
      dtype='object')

In [25]:
# Parameter setting evaluated in each row of the results table.
results['params']

0     {'n_neighbors': 1}
1     {'n_neighbors': 3}
2     {'n_neighbors': 5}
3     {'n_neighbors': 7}
4     {'n_neighbors': 9}
5    {'n_neighbors': 11}
6    {'n_neighbors': 13}
Name: params, dtype: object

In [26]:
# Display the estimator selected by the grid search, with all of its
# hyper-parameters (n_neighbors=7 in the recorded output below).
grid.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')