In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import datetime

In [2]:
start = datetime.datetime.now()

## 获取数据集

In [3]:
facebook = pd.read_csv(r'D:\file\python\黑马AI教程\05阶段五 人工智能经典算法编程\第五阶段 人工智能经典算法编程\5-1 人工智能基础v5.0-代码和笔记\第6节 K-邻近算法\2. 其他资料\预习数据\FBlocation\train.csv')

In [4]:
print(facebook.head())

   row_id       x       y  accuracy    time    place_id
0       0  0.7941  9.0809        54  470702  8523065625
1       1  5.9567  4.7968        13  186555  1757726713
2       2  8.3078  7.0407        74  322648  1137537235
3       3  7.3665  2.5165        65  704587  6567393236
4       4  4.0961  1.1307        31  472130  7440663949


## 数据基本处理

### 筛选一个区域

In [5]:
# facebook = facebook[(2<facebook['x'])&(3>facebook['x'])&(2<facebook['y'])&(3>facebook['y'])]

In [6]:
print(facebook.head())

   row_id       x       y  accuracy    time    place_id
0       0  0.7941  9.0809        54  470702  8523065625
1       1  5.9567  4.7968        13  186555  1757726713
2       2  8.3078  7.0407        74  322648  1137537235
3       3  7.3665  2.5165        65  704587  6567393236
4       4  4.0961  1.1307        31  472130  7440663949


### 将数据转为time类型

In [7]:
facebook_time = pd.to_datetime(facebook['time'],unit='s')

In [8]:
print(facebook_time.head())

0   1970-01-06 10:45:02
1   1970-01-03 03:49:15
2   1970-01-04 17:37:28
3   1970-01-09 03:43:07
4   1970-01-06 11:08:50
Name: time, dtype: datetime64[ns]


### 将时间转为Datetime类型

In [9]:
facebook_time = pd.DatetimeIndex(facebook_time)

In [10]:
print('weekday:',weekday:=facebook_time.weekday)
print('day:',day:=facebook_time.day)
print('hour:',hour:=facebook_time.hour)
print('minute:',minute:=facebook_time.minute)

weekday: Int64Index([1, 5, 6, 4, 1, 5, 3, 0, 4, 0,
            ...
            4, 2, 1, 3, 2, 0, 4, 4, 4, 4],
           dtype='int64', name='time', length=29118021)
day: Int64Index([6, 3, 4, 9, 6, 3, 8, 5, 2, 5,
            ...
            2, 7, 6, 1, 7, 5, 2, 9, 9, 2],
           dtype='int64', name='time', length=29118021)
hour: Int64Index([10,  3, 17,  3, 11,  1, 17,  6, 22, 15,
            ...
             5,  2, 22, 11,  8, 15, 10, 12, 20,  4],
           dtype='int64', name='time', length=29118021)
minute: Int64Index([45, 49, 37, 43,  8, 27, 13, 30, 13,  7,
            ...
            57, 44, 33, 47, 40,  2, 51, 55, 29, 34],
           dtype='int64', name='time', length=29118021)


### 将周和天和小时写入

In [11]:
facebook['weekday'] = weekday
facebook['day'] = day
facebook['hour'] = hour
facebook['minute'] = minute

In [12]:
print(facebook.head())

   row_id       x       y  accuracy    time    place_id  weekday  day  hour  \
0       0  0.7941  9.0809        54  470702  8523065625        1    6    10   
1       1  5.9567  4.7968        13  186555  1757726713        5    3     3   
2       2  8.3078  7.0407        74  322648  1137537235        6    4    17   
3       3  7.3665  2.5165        65  704587  6567393236        4    9     3   
4       4  4.0961  1.1307        31  472130  7440663949        1    6    11   

   minute  
0      45  
1      49  
2      37  
3      43  
4       8  


### 原地删除掉之前的time列

In [13]:
facebook.drop('time',axis=1,inplace=True)

In [14]:
print(facebook.head())
print(facebook.shape)

   row_id       x       y  accuracy    place_id  weekday  day  hour  minute
0       0  0.7941  9.0809        54  8523065625        1    6    10      45
1       1  5.9567  4.7968        13  1757726713        5    3     3      49
2       2  8.3078  7.0407        74  1137537235        6    4    17      37
3       3  7.3665  2.5165        65  6567393236        4    9     3      43
4       4  4.0961  1.1307        31  7440663949        1    6    11       8
(29118021, 9)


### 去掉签到较少的地方

In [15]:
facebook_group = facebook.groupby('place_id').count()
facebook_count = facebook_group[facebook_group['row_id']>3]
print(facebook_count)

            row_id    x    y  accuracy  weekday  day  hour  minute
place_id                                                          
1000015801      78   78   78        78       78   78    78      78
1000017288      95   95   95        95       95   95    95      95
1000025138     563  563  563       563      563  563   563     563
1000052096     961  961  961       961      961  961   961     961
1000063498      60   60   60        60       60   60    60      60
...            ...  ...  ...       ...      ...  ...   ...     ...
9999851158      60   60   60        60       60   60    60      60
9999855083     212  212  212       212      212  212   212     212
9999862567      63   63   63        63       63   63    63      63
9999916757     508  508  508       508      508  508   508     508
9999932225     218  218  218       218      218  218   218     218

[107814 rows x 8 columns]


In [16]:
facebook = facebook[facebook['place_id'].isin(facebook_count.index)]

In [17]:
print(facebook.head())
print(facebook.shape)

   row_id       x       y  accuracy    place_id  weekday  day  hour  minute
0       0  0.7941  9.0809        54  8523065625        1    6    10      45
1       1  5.9567  4.7968        13  1757726713        5    3     3      49
2       2  8.3078  7.0407        74  1137537235        6    4    17      37
3       3  7.3665  2.5165        65  6567393236        4    9     3      43
4       4  4.0961  1.1307        31  7440663949        1    6    11       8
(29116952, 9)


### 确定特征值和目标值

In [18]:
feature = facebook[["x", "y", "accuracy", "day", "hour", "weekday"]]
# feature = facebook[["x", "y", "accuracy", "day", "hour", "weekday",'minute']]
target = facebook["place_id"]

## 划分训练集和测试集

In [19]:
feature_train, feature_test, target_train, target_test = train_test_split(feature, target)

## 特征预处理,标准化

In [20]:
transfer = StandardScaler()
feature_train = transfer.fit_transform(feature_train)
feature_test = transfer.fit_transform(feature_test)

## 机器学习和交叉验证网格搜索knn-CV

In [21]:
k = [1, 2, 3, 4, 5]
cv = 5
estimator = KNeighborsClassifier()
estimator_cv = GridSearchCV(estimator, param_grid={'n_neighbors': k}, cv=cv)
estimator_cv.fit(feature_train, target_train)



GridSearchCV(cv=15, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 10]})

In [22]:
print(f'k={k},cv={cv}时交叉验证的训练集最高准确度:\n', estimator_cv.best_score_)
print(f'k={k},cv={cv}时交叉验证的最好的参数模型:\n', estimator_cv.best_estimator_)
print(f'k={k},cv={cv}时交叉验证的最好的参数模型的k值:\n', k_best := estimator_cv.best_estimator_.n_neighbors)

k=[1, 2, 3, 4, 5, 10],cv=15时交叉验证的训练集最高准确度:
 0.11630168798090439
k=[1, 2, 3, 4, 5, 10],cv=15时交叉验证的最好的参数模型:
 KNeighborsClassifier(n_neighbors=1)
k=[1, 2, 3, 4, 5, 10],cv=15时交叉验证的最好的参数模型的k值:
 1


In [23]:
# 6、用最好的模型训练数据
# 实例化估计器
estimator_best = KNeighborsClassifier(n_neighbors=k_best)
estimator_best.fit(feature_train, target_train)
# 预测测试集，并返回测试集的准确率
score_best = estimator_best.score(feature_test, target_test)
print('最好的模型测试集的准确度:\n', score_best)

最好的模型测试集的准确度:
 0.1197533313239655


In [24]:
end = datetime.datetime.now()
print(end-start)

9:41:56.418093
