In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import datetime

In [61]:
start = datetime.datetime.now()
print(start)

2022-12-18 18:27:55.307139


## 获取数据集

In [62]:
facebook = pd.read_csv(r'D:\file\python\黑马AI教程\05阶段五 人工智能经典算法编程\第五阶段 人工智能经典算法编程\5-1 人工智能基础v5.0-代码和笔记\第6节 K-邻近算法\2. 其他资料\预习数据\FBlocation\train.csv')

In [63]:
print(facebook.head())

   row_id       x       y  accuracy    time    place_id
0       0  0.7941  9.0809        54  470702  8523065625
1       1  5.9567  4.7968        13  186555  1757726713
2       2  8.3078  7.0407        74  322648  1137537235
3       3  7.3665  2.5165        65  704587  6567393236
4       4  4.0961  1.1307        31  472130  7440663949


## 数据基本处理

### 筛选一个区域

In [64]:
facebook = facebook[(2<facebook['x'])&(3>facebook['x'])&(2<facebook['y'])&(3>facebook['y'])]

In [65]:
print(facebook.head())

     row_id       x       y  accuracy    time    place_id
82       82  2.9265  2.3590        55  415119  9313893105
159     159  2.4166  2.7762        66   74327  3432339087
163     163  2.1663  2.3755        84  669737  3869813743
191     191  2.7878  2.5728       144  330072  7194654578
310     310  2.3695  2.2034         3  234719  2636621520


### 将数据转为time类型

In [66]:
facebook_time = pd.to_datetime(facebook['time'],unit='s')

In [67]:
print(facebook_time.head())

82    1970-01-05 19:18:39
159   1970-01-01 20:38:47
163   1970-01-08 18:02:17
191   1970-01-04 19:41:12
310   1970-01-03 17:11:59
Name: time, dtype: datetime64[ns]


### 将时间转为Datetime类型

In [68]:
facebook_time = pd.DatetimeIndex(facebook_time)

In [69]:
print('weekday:',weekday:=facebook_time.weekday)
print('day:',day:=facebook_time.day)
print('hour:',hour:=facebook_time.hour)
print('minute:',minute:=facebook_time.minute)

weekday: Int64Index([0, 3, 3, 6, 5, 1, 3, 3, 2, 4,
            ...
            5, 0, 4, 1, 4, 4, 3, 3, 3, 5],
           dtype='int64', name='time', length=314785)
day: Int64Index([ 5,  1,  8,  4,  3,  6,  8,  8,  7,  2,
            ...
             3,  5,  2,  6,  9,  9,  1,  1,  8, 10],
           dtype='int64', name='time', length=314785)
hour: Int64Index([19, 20, 18, 19, 17, 15,  0, 22, 18, 21,
            ...
             9, 13, 20,  1, 22,  7, 22, 17,  0,  2],
           dtype='int64', name='time', length=314785)
minute: Int64Index([18, 38,  2, 41, 11,  2, 40,  2, 26, 40,
            ...
            17, 21, 34, 46, 50, 35, 20, 47, 57, 17],
           dtype='int64', name='time', length=314785)


### 将周和天和小时写入

In [70]:
facebook['weekday'] = weekday
facebook['day'] = day
facebook['hour'] = hour
facebook['minute'] = minute

In [71]:
print(facebook.head())

     row_id       x       y  accuracy    time    place_id  weekday  day  hour  \
82       82  2.9265  2.3590        55  415119  9313893105        0    5    19   
159     159  2.4166  2.7762        66   74327  3432339087        3    1    20   
163     163  2.1663  2.3755        84  669737  3869813743        3    8    18   
191     191  2.7878  2.5728       144  330072  7194654578        6    4    19   
310     310  2.3695  2.2034         3  234719  2636621520        5    3    17   

     minute  
82       18  
159      38  
163       2  
191      41  
310      11  


### 原地删除掉之前的time列

In [72]:
facebook.drop('time',axis=1,inplace=True)

In [73]:
print(facebook.head())
print(facebook.shape)

     row_id       x       y  accuracy    place_id  weekday  day  hour  minute
82       82  2.9265  2.3590        55  9313893105        0    5    19      18
159     159  2.4166  2.7762        66  3432339087        3    1    20      38
163     163  2.1663  2.3755        84  3869813743        3    8    18       2
191     191  2.7878  2.5728       144  7194654578        6    4    19      41
310     310  2.3695  2.2034         3  2636621520        5    3    17      11
(314785, 9)


### 去掉签到较少的地方

In [74]:
facebook_group = facebook.groupby('place_id').count()
facebook_count = facebook_group[facebook_group['row_id']>3]
print(facebook_count)

            row_id    x    y  accuracy  weekday  day  hour  minute
place_id                                                          
1000383269      49   49   49        49       49   49    49      49
1000616752       7    7    7         7        7    7     7       7
1008823061      18   18   18        18       18   18    18      18
1012023972       5    5    5         5        5    5     5       5
1012580558       5    5    5         5        5    5     5       5
...            ...  ...  ...       ...      ...  ...   ...     ...
9990596754     252  252  252       252      252  252   252     252
9994611236       5    5    5         5        5    5     5       5
9995108787      28   28   28        28       28   28    28      28
9998057926     117  117  117       117      117  117   117     117
9998968845     101  101  101       101      101  101   101     101

[3012 rows x 8 columns]


In [75]:
facebook = facebook[facebook['place_id'].isin(facebook_count.index)]

In [76]:
print(facebook.head())
print(facebook.shape)

     row_id       x       y  accuracy    place_id  weekday  day  hour  minute
82       82  2.9265  2.3590        55  9313893105        0    5    19      18
159     159  2.4166  2.7762        66  3432339087        3    1    20      38
163     163  2.1663  2.3755        84  3869813743        3    8    18       2
191     191  2.7878  2.5728       144  7194654578        6    4    19      41
310     310  2.3695  2.2034         3  2636621520        5    3    17      11
(309342, 9)


### 确定特征值和目标值

In [77]:
feature = facebook[["x", "y", "accuracy", "day", "hour", "weekday"]]
# feature = facebook[["x", "y", "accuracy", "day", "hour", "weekday",'minute']]
target = facebook["place_id"]

## 划分训练集和测试集

In [78]:
feature_train, feature_test, target_train, target_test = train_test_split(feature, target)

## 特征预处理,标准化

In [79]:
transfer = StandardScaler()
feature_train = transfer.fit_transform(feature_train)
feature_test = transfer.fit_transform(feature_test)

## 机器学习和交叉验证网格搜索knn-CV

In [80]:
k = [1, 2, 3, 4, 5]
cv = 10
estimator = KNeighborsClassifier(n_jobs=)
estimator_cv = GridSearchCV(estimator, param_grid={'n_neighbors': k}, cv=cv)
estimator_cv.fit(feature_train, target_train)



In [81]:
print(f'k={k},cv={cv}时交叉验证的训练集最高准确度:\n', estimator_cv.best_score_)
print(f'k={k},cv={cv}时交叉验证的最好的参数模型:\n', estimator_cv.best_estimator_)
print(f'k={k},cv={cv}时交叉验证的最好的参数模型的k值:\n', k_best := estimator_cv.best_estimator_.n_neighbors)

k=[1, 2, 3, 4, 5],cv=10时交叉验证的训练集最高准确度:
 0.32313388803098564
k=[1, 2, 3, 4, 5],cv=10时交叉验证的最好的参数模型:
 KNeighborsClassifier(n_jobs=-1, n_neighbors=1)
k=[1, 2, 3, 4, 5],cv=10时交叉验证的最好的参数模型的k值:
 1


In [82]:
# 6、用最好的模型训练数据
# 实例化估计器
estimator_best = KNeighborsClassifier(n_neighbors=k_best)
estimator_best.fit(feature_train, target_train)
# 预测测试集，并返回测试集的准确率
score_best = estimator_best.score(feature_test, target_test)
print('最好的模型测试集的准确度:\n', score_best)

最好的模型测试集的准确度:
 0.32794558808316954


In [83]:
end = datetime.datetime.now()
print(end-start)

0:00:43.380488
