## 9.2	KNN算法

### 9.2.2	使用sklearn 实现KNN算法

In [38]:
import numpy as np
data_x = [[3.3423,3.1234], [3.1243,5.1234], [2.5634,4.5672],
            [2.2123,4.5332], [4.3495,4.4321], [4.4523,4.6712],
            [3.6323,5.9381], [5.2931,3.4592]]
data_y = [0, 0, 0, 0, 1, 1, 1, 1]
X_train = np.array(data_x)
y_train = np.array(data_y)

In [39]:
from sklearn.neighbors import KNeighborsClassifier
# 创建KNeighborsClassifier类的对象，并指定K值为3
model = KNeighborsClassifier(n_neighbors=3)
# 拟合从训练数据集得到的分类器
model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [40]:
x = np.array([4.1212,4.9342])
x_test = x.reshape(1, -1) 
# 预测测试数据集的标签   
model.predict(x_test)

array([1])

### 9.2.3	超参数

In [41]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# 获取肿瘤数据集
data = load_breast_cancer()
X = data["data"]
y = data["target"]
# 将数据集分割为训练数据集和测试数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                         test_size=0.2, shuffle=False)
X_train.shape

(455, 30)

In [42]:
X_test.shape

(114, 30)

In [43]:
model = KNeighborsClassifier(n_neighbors= 3)
model.fit(X_train, y_train)
model.score(X_test,y_test)

0.9298245614035088

In [44]:
best_score = 0.0     # 最高准确率
best_k = -1           # 最好k值
for k in range(1, 11):
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(X_train, y_train)
    score = knn_model.score(X_test, y_test)
    if score > best_score:
        best_k = k
        best_score = score
print("best_k=", best_k)
print("best_score=", best_score)

best_k= 5
best_score= 0.9385964912280702


In [45]:
best_score = 0.0
best_k = -1
best_weight = ""
for w in ["uniform", "distance"]:
    for k in range(1, 11):
        knn_model=KNeighborsClassifier(n_neighbors=k, weights=w)
        knn_model.fit(X_train, y_train)
        score=knn_model.score(X_test, y_test)
        if score > best_score:
            best_k = k
            best_score = score
            best_weight = w
print("best_k=", best_k)
print("best_score=", best_score)
print("best_weight=", w)

best_k= 5
best_score= 0.9385964912280702
best_weight= distance


In [48]:
best_score = 0.0
best_k = -1
best_p = -1
for k in range(1, 11):
    for p in range(1, 6):
        knn_model = KNeighborsClassifier(n_neighbors=k,
                        weights="distance", p=p)
        knn_model.fit(X_train, y_train)
        score=knn_model.score(X_test, y_test)
        if score > best_score:
            best_k = k
            best_score = score
            best_p = p
print("best_k=", best_k)
print("best_score=", best_score)
print("best_p=", best_p)

best_k= 9
best_score= 0.956140350877193
best_p= 1


### 9.2.4	网格搜索与交叉验证

In [50]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
data = load_breast_cancer()
X = data["data"]
y = data["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2)

In [51]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

In [52]:
param_grid = [ 
    {
        "weights": ['uniform'], 
        "n_neighbors": [i for i in range(1, 11)]
    }, 
    {
        "weights": ['distance'], 
        "n_neighbors": [i for i in range(1, 11)],
        'p':[i for i in range(1, 6)]
    }
]

In [53]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(model, param_grid)
grid_search.fit(X_train, y_train)
grid_search.best_estimator_

KNeighborsClassifier(p=1, weights='distance')

In [54]:
grid_search.best_score_

0.945054945054945

### 9.2.5	归一化

#### 1.	最值归一化

In [55]:
import numpy as np
data = np.random.randint(0, 1000, size = 10)
data

array([727, 935, 498, 556, 891, 848, 991, 954, 687, 386])

In [56]:
data_scale = (data - np.min(data)) / (np.max(data) - np.min(data))
data_scale

array([0.56363636, 0.90743802, 0.18512397, 0.28099174, 0.83471074,
       0.76363636, 1.        , 0.93884298, 0.49752066, 0.        ])

In [57]:
import numpy as np
data = np.array([10000000, 100, 20, 320, 111])
data

array([10000000,      100,       20,      320,      111])

In [58]:
data_scale = (data - np.min(data)) / (np.max(data) - np.min(data))
data_scale

array([1.0000000e+00, 8.0000160e-06, 0.0000000e+00, 3.0000060e-05,
       9.1000182e-06])

#### 2.	均值方差归一化

In [59]:
import numpy as np 
data = np.array( [10000000, 100, 20, 320, 111])
data

array([10000000,      100,       20,      320,      111])

In [60]:
data_scale = (data - np.mean(data)) / (np.std(data))
data_scale

array([ 2.        , -0.50000944, -0.50002944, -0.49995444, -0.50000669])

In [61]:
print("原始数据均值：", np.mean(data))
print("原始数据标准差：", np.std(data))
print("进行均值方差归一化数据均值：", np.mean(data_scale))
print("进行均值方差归一化数据标准差：", np.std(data_scale))

原始数据均值： 2000110.2
原始数据标准差： 3999944.901230536
进行均值方差归一化数据均值： 0.0
进行均值方差归一化数据标准差： 1.0


### 9.2.6	使用sklearn实现归一化

In [62]:
from sklearn.datasets import load_breast_cancer  
from sklearn.model_selection import train_test_split
data = load_breast_cancer()
X = data["data"]
y = data["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2)
print(X_train)

[[1.199e+01 2.489e+01 7.761e+01 ... 1.202e-01 2.599e-01 8.251e-02]
 [1.585e+01 2.395e+01 1.037e+02 ... 1.119e-01 2.809e-01 6.287e-02]
 [1.114e+01 1.407e+01 7.124e+01 ... 3.922e-02 2.576e-01 7.018e-02]
 ...
 [1.128e+01 1.339e+01 7.300e+01 ... 8.611e-02 2.102e-01 6.784e-02]
 [1.025e+01 1.618e+01 6.652e+01 ... 9.744e-02 2.608e-01 9.702e-02]
 [1.113e+01 1.662e+01 7.047e+01 ... 4.044e-02 2.383e-01 7.083e-02]]


In [63]:
from sklearn.preprocessing import StandardScaler
ssc = StandardScaler()
ssc.fit(X_train)
X_train_ssc = ssc.transform(X_train)
print(X_train_ssc)

[[-0.61812653  1.33600314 -0.60414544 ...  0.07267722 -0.50891521
  -0.10869358]
 [ 0.51698091  1.1165345   0.5065806  ... -0.05676805 -0.16862222
  -1.15874444]
 [-0.86808542 -1.19022096 -0.87533459 ... -1.1902719  -0.5461854
  -0.76791594]
 ...
 [-0.82691572 -1.3489855  -0.80040634 ... -0.45898412 -1.31427531
  -0.89302383]
 [-1.12980708 -0.69758391 -1.07627851 ... -0.28228354 -0.49433123
   0.66708229]
 [-0.87102611 -0.59485391 -0.90811569 ... -1.17124501 -0.85893087
  -0.73316374]]


In [64]:
X_test_ssc = ssc.fit_transform(X_test)
print(X_test_ssc)

[[ 0.15187827 -0.71909026  0.22067981 ...  1.2855905   1.18332163
   1.67388678]
 [ 1.47946262  1.96190113  1.41828809 ...  0.72694558 -0.45123674
  -1.01975621]
 [-0.83750018  0.36441922 -0.76285967 ...  0.44132261  0.18128733
   0.97312344]
 ...
 [-0.27718891 -0.25836721 -0.24497501 ...  1.37099737  1.6915999
   2.43908813]
 [ 0.17711752 -0.00369618  0.10518564 ... -0.74681293 -0.33183168
  -1.39027476]
 [ 0.3184573   3.19126765  0.40679461 ...  1.26738904  1.06553016
   2.60689544]]


In [65]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
from sklearn.model_selection import GridSearchCV
param_grid = [ {"weights": ['uniform', 'distance'], 
               "n_neighbors": [i for i in range(1, 11)]}, 
               ]
grid_search = GridSearchCV(model, param_grid)
grid_search.fit(X_train_ssc, y_train)
grid_search.best_score_

0.9604395604395604

## 9.3	案例：预测签到位置

### 1．获取数据集

In [24]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')   # 过滤警告信息
all_data = pd.read_csv(r'C:\Users\itcast\Desktop\train.csv')
all_data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949


In [25]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29118021 entries, 0 to 29118020
Data columns (total 6 columns):
 #   Column    Dtype  
---  ------    -----  
 0   row_id    int64  
 1   x         float64
 2   y         float64
 3   accuracy  int64  
 4   time      int64  
 5   place_id  int64  
dtypes: float64(2), int64(4)
memory usage: 1.3 GB


### 2.处理数据

In [26]:
# 缩小数据范围,区域限定在2.0-2.5*2.0-2.5大小的地方
signin_data = all_data.query('x>2.0 & x<2.5 & y>2.0 & y<2.5')
signin_data.shape

(71664, 6)

In [27]:
# 选取时间特征
signin_data['time'] = pd.to_datetime(signin_data['time'], unit='s')
signin_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71664 entries, 163 to 29117203
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   row_id    71664 non-null  int64         
 1   x         71664 non-null  float64       
 2   y         71664 non-null  float64       
 3   accuracy  71664 non-null  int64         
 4   time      71664 non-null  datetime64[ns]
 5   place_id  71664 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(3)
memory usage: 3.8 MB


In [28]:
# 获取天、小时、星期，并给所有样本增加这些特征
signin_data['day'] = signin_data['time'].dt.day
signin_data['hour'] = signin_data['time'].dt.hour
signin_data['weekday'] = signin_data['time'].dt.weekday
print(signin_data.head())

      row_id       x       y  accuracy                time    place_id  day  \
163      163  2.1663  2.3755        84 1970-01-08 18:02:17  3869813743    8   
310      310  2.3695  2.2034         3 1970-01-03 17:11:59  2636621520    3   
658      658  2.3236  2.1768        66 1970-01-06 19:32:23  7877745055    6   
1368    1368  2.2613  2.3392        73 1970-01-04 16:50:22  9775192577    4   
1627    1627  2.3331  2.0011        66 1970-01-07 21:18:04  6731326909    7   

      hour  weekday  
163     18        3  
310     17        5  
658     19        1  
1368    16        6  
1627    21        2  


In [29]:
# 去掉签到比较少的地方
place_count = signin_data.groupby('place_id').count()
place_count = place_count[place_count['x'] > 3]  # 筛选出x值大于3的数据
place_count

            row_id     x     y  accuracy  time   day  hour  weekday
place_id                                                           
1008823061       4     4     4         4     4     4     4        4
1025585791      21    21    21        21    21    21    21       21
1026507711     220   220   220       220   220   220   220      220
1032417180      10    10    10        10    10    10    10       10
1040557418     123   123   123       123   123   123   123      123
...            ...   ...   ...       ...   ...   ...   ...      ...
9966115681      15    15    15        15    15    15    15       15
9970566102       5     5     5         5     5     5     5        5
9983648790    1056  1056  1056      1056  1056  1056  1056     1056
9995108787      23    23    23        23    23    23    23       23
9998968845      99    99    99        99    99    99    99       99

[929 rows x 8 columns]


In [30]:
signin_data = signin_data[signin_data['place_id'].isin(place_count.index)]
signin_data

            row_id       x       y  accuracy                time    place_id  \
163            163  2.1663  2.3755        84 1970-01-08 18:02:17  3869813743   
310            310  2.3695  2.2034         3 1970-01-03 17:11:59  2636621520   
658            658  2.3236  2.1768        66 1970-01-06 19:32:23  7877745055   
1368          1368  2.2613  2.3392        73 1970-01-04 16:50:22  9775192577   
1627          1627  2.3331  2.0011        66 1970-01-07 21:18:04  6731326909   
...            ...     ...     ...       ...                 ...         ...   
29116142  29116142  2.0804  2.0657       168 1970-01-03 12:31:26  1247398579   
29116267  29116267  2.4309  2.4646        33 1970-01-04 15:19:20  1951613663   
29116295  29116295  2.1797  2.1707        89 1970-01-01 20:49:14  4724115005   
29116475  29116475  2.3924  2.2704        62 1970-01-03 09:17:37  2819110495   
29117203  29117203  2.4942  2.2430        11 1970-01-02 20:34:43  2634419689   

          day  hour  weekday  
163     

In [31]:
# 确定特征值和目标值
x = signin_data[["x", "y", "accuracy", "day", "hour", "weekday"]]
y = signin_data["place_id"]

In [32]:
# 分割数据集
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=22)

### 3.建立特征工程

In [36]:
from sklearn.preprocessing import StandardScaler
# 创建一个转换器
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

### 4.建立机器学习模型

In [37]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
estimator = KNeighborsClassifier()
# 进行网格搜索和交叉验证
param_dict ={"n_neighbors": [1,3,5,7,9]}
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=5)
# 拟合从训练数据集得到的分类器
estimator.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 3, 5, 7, 9]})

### 5.模型评估

In [15]:
# 预测结果
estimator.predict(x_test)

array([9983648790, 9216778377, 8914246980, ..., 8980163153, 6232319657,
       7243158436], dtype=int64)

In [16]:
# 准确率输出
estimator.score(x_test,y_test)

0.4383166101205515

In [17]:
# 最好的结果
estimator.best_score_

0.436086741795671