In [19]:
import numpy as np

import pandas as pd

from pandas import Series,DataFrame

from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split

# grid网格，search搜索，cv：cross_validation
# 搜索算法最合适的参数
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score

### 加载数据和提取数据和目标值

In [24]:
cancer = pd.read_csv('./cancer.csv',sep = '\t')
cancer.drop('ID',axis = 1,inplace=True)

X = cancer.iloc[:,1:]

y = cancer['Diagnosis']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)

### 网格搜索GridSearchCV进行最佳参数的查找

In [25]:
knn = KNeighborsClassifier()

params = {'n_neighbors':[i for i in range(1,30)],
          'weights':['distance','uniform'],
          'p':[1,2]}

# cross_val_score类似
gcv = GridSearchCV(knn,params,scoring='accuracy',cv = 6)

gcv.fit(X_train,y_train)

GridSearchCV(cv=6, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], 'weights': ['distance', 'uniform'], 'p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

### 查看了GridSearchCV最佳的参数组合

In [15]:
gcv.best_params_

{'n_neighbors': 12, 'p': 1, 'weights': 'distance'}

In [13]:
gcv.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=12, p=1,
           weights='distance')

In [14]:
gcv.best_score_

0.9494505494505494

### 使用GridSearchCV进行预测，计算准确率

In [27]:
y_ = gcv.predict(X_test)

(y_ == y_test).mean()

0.9122807017543859

In [28]:
gcv.score(X_test,y_test)

0.9122807017543859

In [29]:
accuracy_score(y_test,y_)

0.9122807017543859

In [26]:
# 取出了最好的模型，进行预测
# 也可以直接使用gcv进行预测，结果一样的
knn_best = gcv.best_estimator_
y_ = knn_best.predict(X_test)
accuracy_score(y_test,y_)

0.9122807017543859

### 交叉表

In [33]:
X_test.shape

(114, 30)

In [32]:
pd.crosstab(index = y_test,columns=y_,rownames=['True'],colnames=['Predict'],
            margins=True)

Predict,B,M,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B,77,2,79
M,8,27,35
All,85,29,114


In [34]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test,y_)

array([[77,  2],
       [ 8, 27]], dtype=int64)

In [38]:
# 真实的
y_test.value_counts()

B    79
M    35
Name: Diagnosis, dtype: int64

In [40]:
# 预测
Series(y_).value_counts()

B    85
M    29
dtype: int64

In [41]:
confusion_matrix(y_test,y_)

array([[77,  2],
       [ 8, 27]], dtype=int64)

In [46]:
confusion_matrix(y_,y_test)

array([[77,  8],
       [ 2, 27]], dtype=int64)

In [44]:
np.round(77/79,2)

0.97

In [45]:
np.round(27/35,2)

0.77

In [47]:
# precision
np.round(77/85,2)

0.91

In [48]:
np.round(27/29,2)

0.93

In [49]:
np.round(2*0.97*0.91/(0.97 + 0.91),2)

0.94

In [50]:
np.round(2*0.93*0.77/(0.77 + 0.93),2)

0.84

In [36]:
# 精确率、召回率、f1-score调和平均值
from sklearn.metrics import classification_report

print(classification_report(y_test,y_,target_names = ['B','M']))

              precision    recall  f1-score   support

           B       0.91      0.97      0.94        79
           M       0.93      0.77      0.84        35

   micro avg       0.91      0.91      0.91       114
   macro avg       0.92      0.87      0.89       114
weighted avg       0.91      0.91      0.91       114



### 提升准确率，提升精确率，提升召回率

In [51]:
X.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_mean,symmetry_mean,fractal_mean,radius_sd,texture_sd,perimeter_sd,area_sd,smoothness_sd,compactness_sd,concavity_sd,concave_sd,symmetry_sd,fractal_sd,radius_max,texture_max,perimeter_max,area_max,smoothness_max,compactness_max,concavity_max,concave_max,symmetry_max,fractal_max
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [53]:
# 归一化操作
X_norm1 = (X - X.min())/(X.max() - X.min())
X_norm1.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_mean,symmetry_mean,fractal_mean,radius_sd,texture_sd,perimeter_sd,area_sd,smoothness_sd,compactness_sd,concavity_sd,concave_sd,symmetry_sd,fractal_sd,radius_max,texture_max,perimeter_max,area_max,smoothness_max,compactness_max,concavity_max,concave_max,symmetry_max,fractal_max
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.70314,0.731113,0.686364,0.605518,0.356147,0.120469,0.369034,0.273811,0.159296,0.351398,0.135682,0.300625,0.311645,0.183042,0.620776,0.141525,0.66831,0.450698,0.601136,0.619292,0.56861,0.912027,0.598462,0.418864
1,0.643144,0.272574,0.615783,0.501591,0.28988,0.181768,0.203608,0.348757,0.379798,0.141323,0.156437,0.082589,0.12444,0.12566,0.119387,0.081323,0.04697,0.253836,0.084539,0.09111,0.606901,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.23359,0.222878
2,0.601496,0.39026,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,0.229622,0.094303,0.18037,0.162922,0.150831,0.283955,0.096768,0.389847,0.20569,0.127006,0.556386,0.360075,0.508442,0.374508,0.48359,0.385375,0.359744,0.835052,0.403706,0.213433
3,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,0.139091,0.175875,0.126655,0.038155,0.251453,0.543215,0.142955,0.353665,0.728148,0.287205,0.24831,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711
4,0.629893,0.156578,0.630986,0.48929,0.430351,0.347893,0.463918,0.51839,0.378283,0.186816,0.233822,0.093065,0.220563,0.163688,0.332359,0.167918,0.143636,0.357075,0.136179,0.1458,0.519744,0.123934,0.506948,0.341575,0.437364,0.172415,0.319489,0.558419,0.1575,0.142595


In [54]:
X_train,X_test,y_train,y_test = train_test_split(X_norm1,y,test_size  = 0.2)

knn = KNeighborsClassifier()

params = {'n_neighbors':[i for i in range(1,30)],
          'weights':['uniform','distance'],
          'p':[1,2]}
gcv = GridSearchCV(knn,params,scoring='accuracy',cv = 6)
gcv.fit(X_train,y_train)

GridSearchCV(cv=6, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], 'weights': ['uniform', 'distance'], 'p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [55]:
y_ = gcv.predict(X_test)
accuracy_score(y_test,y_)

0.9824561403508771

In [57]:
print(classification_report(y_test,y_,target_names=['B','M']))

              precision    recall  f1-score   support

           B       0.99      0.99      0.99        72
           M       0.98      0.98      0.98        42

   micro avg       0.98      0.98      0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



In [58]:
# Z-Score归一化，标准化
X_norm2 = (X - X.mean())/X.std()
X_norm2.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_mean,symmetry_mean,fractal_mean,radius_sd,texture_sd,perimeter_sd,area_sd,smoothness_sd,compactness_sd,concavity_sd,concave_sd,symmetry_sd,fractal_sd,radius_max,texture_max,perimeter_max,area_max,smoothness_max,compactness_max,concavity_max,concave_max,symmetry_max,fractal_max
0,1.0961,-2.071512,1.268817,0.98351,1.567087,3.280628,2.650542,2.530249,2.215566,2.253764,2.487545,-0.564768,2.83054,2.485391,-0.213814,1.315704,0.72339,0.660239,1.147747,0.906286,1.885031,-1.358098,2.301575,1.999478,1.306537,2.614365,2.107672,2.294058,2.748204,1.935312
1,1.828212,-0.353322,1.684473,1.90703,-0.826235,-0.486643,-0.023825,0.547662,0.001391,-0.867889,0.498816,-0.875473,0.263095,0.741749,-0.604819,-0.692317,-0.440393,0.259933,-0.804742,-0.099356,1.80434,-0.368879,1.533776,1.888827,-0.375282,-0.430066,-0.14662,1.086129,-0.243675,0.280943
2,1.578499,0.455786,1.565126,1.557513,0.941382,1.052,1.36228,2.03544,0.938859,-0.397658,1.227596,-0.779398,0.85018,1.180298,-0.296744,0.814257,0.212889,1.423575,0.236827,0.293301,1.510541,-0.023953,1.346291,1.455004,0.526944,1.08198,0.854222,1.953282,1.151242,0.201214
3,-0.768233,0.253509,-0.592166,-0.763792,3.280667,3.399917,1.914213,1.450431,2.864862,4.906602,0.326087,-0.110312,0.286341,-0.288125,0.689095,2.741868,0.818798,1.114027,4.72852,2.045711,-0.281217,0.133866,-0.24972,-0.549538,3.391291,3.889975,1.987839,2.173873,6.040726,4.930672
4,1.748758,-1.150804,1.775011,1.824624,0.280125,0.538866,1.369806,1.427237,-0.009552,-0.561956,1.269426,-0.789549,1.27207,1.18931,1.481763,-0.048477,0.827742,1.143199,-0.360775,0.498889,1.297434,-1.465481,1.337363,1.219651,0.220362,-0.313119,0.61264,0.728618,-0.86759,-0.396751


In [60]:
X_norm2.mean()

radius_mean        -3.136331e-15
texture_mean       -6.558316e-15
perimeter_mean     -7.012551e-16
area_mean          -8.339355e-16
smoothness_mean     6.120470e-15
compactness_mean   -1.074321e-15
concavity_mean     -4.031144e-16
concave_mean        1.006030e-15
symmetry_mean      -1.888550e-15
fractal_mean        4.860396e-16
radius_sd          -9.998836e-16
texture_sd         -9.782255e-16
perimeter_sd        1.903086e-15
area_sd            -7.423275e-16
smoothness_sd      -7.449006e-16
compactness_sd     -3.976511e-16
concavity_sd        1.098906e-15
concave_sd         -2.766777e-16
symmetry_sd        -3.459447e-16
fractal_sd         -3.512129e-18
radius_max         -2.346102e-15
texture_max         1.761138e-15
perimeter_max      -1.214416e-15
area_max            5.919889e-16
smoothness_max     -5.033661e-15
compactness_max    -2.118204e-15
concavity_max       6.649631e-16
concave_max        -1.795088e-16
symmetry_max       -2.481734e-15
fractal_max         2.424930e-15
dtype: flo

In [59]:
X_norm2.std()

radius_mean         1.0
texture_mean        1.0
perimeter_mean      1.0
area_mean           1.0
smoothness_mean     1.0
compactness_mean    1.0
concavity_mean      1.0
concave_mean        1.0
symmetry_mean       1.0
fractal_mean        1.0
radius_sd           1.0
texture_sd          1.0
perimeter_sd        1.0
area_sd             1.0
smoothness_sd       1.0
compactness_sd      1.0
concavity_sd        1.0
concave_sd          1.0
symmetry_sd         1.0
fractal_sd          1.0
radius_max          1.0
texture_max         1.0
perimeter_max       1.0
area_max            1.0
smoothness_max      1.0
compactness_max     1.0
concavity_max       1.0
concave_max         1.0
symmetry_max        1.0
fractal_max         1.0
dtype: float64

In [61]:
X_train,X_test,y_train,y_test = train_test_split(X_norm2,y,test_size  = 0.2)

knn = KNeighborsClassifier()

params = {'n_neighbors':[i for i in range(1,30)],
          'weights':['uniform','distance'],
          'p':[1,2]}
gcv = GridSearchCV(knn,params,scoring='accuracy',cv = 6)
gcv.fit(X_train,y_train)

y_ = gcv.predict(X_test)
accuracy_score(y_test,y_)

0.9912280701754386

In [62]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler

In [63]:
# MinMaxScaler 和最大值最小值归一化效果一样
mms = MinMaxScaler()

In [69]:
((X - X.min())/(X.max() - X.min())).head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_mean,symmetry_mean,fractal_mean,radius_sd,texture_sd,perimeter_sd,area_sd,smoothness_sd,compactness_sd,concavity_sd,concave_sd,symmetry_sd,fractal_sd,radius_max,texture_max,perimeter_max,area_max,smoothness_max,compactness_max,concavity_max,concave_max,symmetry_max,fractal_max
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.70314,0.731113,0.686364,0.605518,0.356147,0.120469,0.369034,0.273811,0.159296,0.351398,0.135682,0.300625,0.311645,0.183042,0.620776,0.141525,0.66831,0.450698,0.601136,0.619292,0.56861,0.912027,0.598462,0.418864
1,0.643144,0.272574,0.615783,0.501591,0.28988,0.181768,0.203608,0.348757,0.379798,0.141323,0.156437,0.082589,0.12444,0.12566,0.119387,0.081323,0.04697,0.253836,0.084539,0.09111,0.606901,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.23359,0.222878
2,0.601496,0.39026,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,0.229622,0.094303,0.18037,0.162922,0.150831,0.283955,0.096768,0.389847,0.20569,0.127006,0.556386,0.360075,0.508442,0.374508,0.48359,0.385375,0.359744,0.835052,0.403706,0.213433
3,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,0.139091,0.175875,0.126655,0.038155,0.251453,0.543215,0.142955,0.353665,0.728148,0.287205,0.24831,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711
4,0.629893,0.156578,0.630986,0.48929,0.430351,0.347893,0.463918,0.51839,0.378283,0.186816,0.233822,0.093065,0.220563,0.163688,0.332359,0.167918,0.143636,0.357075,0.136179,0.1458,0.519744,0.123934,0.506948,0.341575,0.437364,0.172415,0.319489,0.558419,0.1575,0.142595


In [70]:
mms.fit(X)
X2 = mms.transform(X)
X2.round(6)

array([[0.521037, 0.022658, 0.545989, ..., 0.912027, 0.598462, 0.418864],
       [0.643144, 0.272574, 0.615783, ..., 0.639175, 0.23359 , 0.222878],
       [0.601496, 0.39026 , 0.595743, ..., 0.835052, 0.403706, 0.213433],
       ...,
       [0.455251, 0.621238, 0.445788, ..., 0.487285, 0.128721, 0.151909],
       [0.644564, 0.66351 , 0.665538, ..., 0.910653, 0.497142, 0.452315],
       [0.036869, 0.501522, 0.02854 , ..., 0.      , 0.257441, 0.100682]])

In [72]:
# DataFrame,默认保留6位
# z = (x - u) / s
((X - X.mean())/X.std()).head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_mean,symmetry_mean,fractal_mean,radius_sd,texture_sd,perimeter_sd,area_sd,smoothness_sd,compactness_sd,concavity_sd,concave_sd,symmetry_sd,fractal_sd,radius_max,texture_max,perimeter_max,area_max,smoothness_max,compactness_max,concavity_max,concave_max,symmetry_max,fractal_max
0,1.0961,-2.071512,1.268817,0.98351,1.567087,3.280628,2.650542,2.530249,2.215566,2.253764,2.487545,-0.564768,2.83054,2.485391,-0.213814,1.315704,0.72339,0.660239,1.147747,0.906286,1.885031,-1.358098,2.301575,1.999478,1.306537,2.614365,2.107672,2.294058,2.748204,1.935312
1,1.828212,-0.353322,1.684473,1.90703,-0.826235,-0.486643,-0.023825,0.547662,0.001391,-0.867889,0.498816,-0.875473,0.263095,0.741749,-0.604819,-0.692317,-0.440393,0.259933,-0.804742,-0.099356,1.80434,-0.368879,1.533776,1.888827,-0.375282,-0.430066,-0.14662,1.086129,-0.243675,0.280943
2,1.578499,0.455786,1.565126,1.557513,0.941382,1.052,1.36228,2.03544,0.938859,-0.397658,1.227596,-0.779398,0.85018,1.180298,-0.296744,0.814257,0.212889,1.423575,0.236827,0.293301,1.510541,-0.023953,1.346291,1.455004,0.526944,1.08198,0.854222,1.953282,1.151242,0.201214
3,-0.768233,0.253509,-0.592166,-0.763792,3.280667,3.399917,1.914213,1.450431,2.864862,4.906602,0.326087,-0.110312,0.286341,-0.288125,0.689095,2.741868,0.818798,1.114027,4.72852,2.045711,-0.281217,0.133866,-0.24972,-0.549538,3.391291,3.889975,1.987839,2.173873,6.040726,4.930672
4,1.748758,-1.150804,1.775011,1.824624,0.280125,0.538866,1.369806,1.427237,-0.009552,-0.561956,1.269426,-0.789549,1.27207,1.18931,1.481763,-0.048477,0.827742,1.143199,-0.360775,0.498889,1.297434,-1.465481,1.337363,1.219651,0.220362,-0.313119,0.61264,0.728618,-0.86759,-0.396751


In [74]:
nd = X.get_values()
nd

  """Entry point for launching an IPython kernel.


array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [75]:
(nd - nd.mean(axis = 0))/nd.std(axis = 0)

array([[ 1.09706398, -2.07333501,  1.26993369, ...,  2.29607613,
         2.75062224,  1.93701461],
       [ 1.82982061, -0.35363241,  1.68595471, ...,  1.0870843 ,
        -0.24388967,  0.28118999],
       [ 1.57988811,  0.45618695,  1.56650313, ...,  1.95500035,
         1.152255  ,  0.20139121],
       ...,
       [ 0.70228425,  2.0455738 ,  0.67267578, ...,  0.41406869,
        -1.10454895, -0.31840916],
       [ 1.83834103,  2.33645719,  1.98252415, ...,  2.28998549,
         1.91908301,  2.21963528],
       [-1.80840125,  1.22179204, -1.81438851, ..., -1.74506282,
        -0.04813821, -0.75120669]])

In [76]:
ss = StandardScaler()

X3 = ss.fit_transform(X)
X3

array([[ 1.09706398, -2.07333501,  1.26993369, ...,  2.29607613,
         2.75062224,  1.93701461],
       [ 1.82982061, -0.35363241,  1.68595471, ...,  1.0870843 ,
        -0.24388967,  0.28118999],
       [ 1.57988811,  0.45618695,  1.56650313, ...,  1.95500035,
         1.152255  ,  0.20139121],
       ...,
       [ 0.70228425,  2.0455738 ,  0.67267578, ...,  0.41406869,
        -1.10454895, -0.31840916],
       [ 1.83834103,  2.33645719,  1.98252415, ...,  2.28998549,
         1.91908301,  2.21963528],
       [-1.80840125,  1.22179204, -1.81438851, ..., -1.74506282,
        -0.04813821, -0.75120669]])