### XB1850《一書貫通-從資料科學橫入人工智慧領域》教學範例檔
### CH11KNN


In [None]:
%matplotlib inline
import os
import numpy as np
from scipy import stats
import pandas as pd
import sklearn.model_selection
import matplotlib.pyplot as plt

# The notebook reads its CSV via a relative path, so start the kernel in the
# folder that holds the data (call os.chdir(...) here if it lives elsewhere).
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.options.display.max_columns = None

### 載入資料

In [None]:
# Read the dating data set from the current working directory.
orgData = pd.read_csv(filepath_or_buffer='date_data2.csv')
# Summary statistics of every numeric column, shown as the cell output.
orgData.describe()

Unnamed: 0,income,attractive,assets,edueduclass,Dated,income_rank,attractive_rank,assets_rank
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,9010.0,50.5,96.0063,3.71,0.5,1.55,1.56,1.51
std,5832.675288,28.810948,91.082226,1.225116,0.502519,1.140397,1.103896,1.123621
min,3000.0,1.0,3.7284,1.0,0.0,0.0,0.0,0.0
25%,5000.0,28.0,31.665269,3.0,0.0,1.0,1.0,0.75
50%,7500.0,51.0,70.746924,4.0,0.5,2.0,2.0,2.0
75%,11500.0,68.875,131.481061,4.0,1.0,3.0,2.25,2.25
max,34000.0,99.5,486.311758,6.0,1.0,3.0,3.0,3.0


### 選取資料欄位

In [None]:
# Features: every column EXCEPT the label. The previous `orgData.loc[:,]`
# selected all columns, so 'Dated' (the value being predicted) was also fed to
# the model as an input feature -- target leakage that makes any classifier
# look perfect. Re-run all following cells after this change; outputs recorded
# with the leaked column are no longer valid.
X = orgData.drop(columns=['Dated'])
# Label: kept as a one-column DataFrame; later cells flatten it before fitting.
Y = orgData[['Dated']]
assert 'Dated' not in X.columns, "label column must not be part of the features"
X.head()

Unnamed: 0,income,attractive,assets,edueduclass,Dated,income_rank,attractive_rank,assets_rank
0,3000,9.0,5.145476,1,0,0,0,0
1,3000,14.5,40.643781,4,1,0,0,1
2,3000,6.0,5.145476,1,0,0,0,0
3,3000,1.0,7.067434,1,0,0,0,0
4,3500,14.5,3.7284,2,0,0,0,0


### 標準化（Min-Max 縮放至 0–1 區間）

In [None]:
from sklearn import preprocessing

# Rescale each column of X linearly so that its minimum maps to 0 and its maximum to 1.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the
# train/test split performed further down, so test rows influence the scaling.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(X)
X_scaled = min_max_scaler.transform(X)
# Peek at rows 1-4 of the scaled array.
X_scaled[1:5]

array([[0.        , 0.13705584, 0.07649535, 0.6       , 1.        ,
        0.        , 0.        , 0.33333333],
       [0.        , 0.05076142, 0.00293644, 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.00691908, 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.01612903, 0.13705584, 0.        , 0.2       , 0.        ,
        0.        , 0.        , 0.        ]])

### 切割資料:訓練集、測試集

In [None]:
import sklearn.model_selection as model_selection
from sklearn.model_selection import train_test_split

# Split into training (80%) and test (20%) sets; the fixed seed makes the
# partition reproducible across runs.
(train_data, test_data,
 train_target, test_target) = train_test_split(
    X_scaled,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=123,
)

### 建模

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbour classifier with the default (Euclidean) distance.
# The label DataFrame is flattened to a 1-D array, the form fit() expects.
model = KNeighborsClassifier(n_neighbors=3).fit(
    train_data,
    train_target.values.ravel(),
)
# Predicted labels for the held-out rows.
test_est = model.predict(test_data)

### 驗證

In [None]:
import sklearn.metrics as metrics

# Confusion matrix with classes ordered [0, 1] (rows: true label, columns: predicted label).
print(metrics.confusion_matrix(y_true=test_target, y_pred=test_est, labels=[0, 1]))
# Per-class precision / recall / F1 plus overall accuracy.
print(metrics.classification_report(y_true=test_target, y_pred=test_est))

[[ 9  0]
 [ 0 11]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00        11

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20



In [None]:
# Accuracy of the fitted KNN model on the held-out test split.
model.score(test_data, test_target)

1.0

### 選擇 K值

In [None]:
# Try every neighbour count from 1 to 29 and print the test-set accuracy of each.
# NOTE(review): K is being compared on the test split here; the
# cross-validation section that follows avoids tuning on test data.
train_labels = train_target.values.ravel()  # same 1-D label vector for every K
for k in range(1, 30):
    k_model = KNeighborsClassifier(n_neighbors=k).fit(train_data, train_labels)
    score = k_model.score(test_data, test_target)
    print(k, '\t', score)

1 	 1.0
2 	 1.0
3 	 1.0
4 	 1.0
5 	 1.0
6 	 1.0
7 	 1.0
8 	 1.0
9 	 1.0
10 	 1.0
11 	 1.0
12 	 1.0
13 	 1.0
14 	 1.0
15 	 1.0
16 	 1.0
17 	 1.0
18 	 1.0
19 	 1.0
20 	 1.0
21 	 1.0
22 	 1.0
23 	 1.0
24 	 1.0
25 	 1.0
26 	 1.0
27 	 1.0
28 	 1.0
29 	 1.0


### 交叉驗證, 選擇 K值

In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Number of training rows.
n_samples = train_data.shape[0]

# 3-fold cross-validation over candidate neighbour counts 1..9.
kf = KFold(n_splits=3)
grid = {'n_neighbors': list(range(1, 10))}
estimator = KNeighborsClassifier()
gridSearchCV = GridSearchCV(estimator=estimator, param_grid=grid, cv=kf)

# Per-candidate results are available afterwards in gridSearchCV.cv_results_.
# The fitted search object is the cell's displayed output.
gridSearchCV.fit(train_data, train_target.values.ravel())

GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=False),
             error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [None]:
# Names of the hyper-parameters included in the search grid.
gridSearchCV.param_grid.keys()

dict_keys(['n_neighbors'])

In [None]:
import seaborn as sb

# Copy the mean cross-validated test score of every grid candidate into a NumPy array.
grid_visualization = np.array(list(gridSearchCV.cv_results_['mean_test_score']))


In [None]:
# Mean cross-validated test scores, one entry per grid candidate.
grid_visualization

array([1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [None]:
# Parameter setting selected by the grid search.
gridSearchCV.best_params_

{'n_neighbors': 1}

In [None]:
# Take the estimator chosen by the grid search and score it on the held-out test split.
best = gridSearchCV.best_estimator_ 
best.score(test_data, test_target)
# Exercise: experiment to find out which parameters affect the result.
###################################################################################################

1.0