In [1]:
import pandas as pd

In [2]:
# 1. Load the data: read the training CSV (path is relative to the
#    notebook's working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="./FBlocation/train.csv")

In [4]:
# Preview the first five rows of the freshly loaded table.
print(data.head(n=5))

   row_id       x       y  accuracy    time    place_id
0       0  0.7941  9.0809        54  470702  8523065625
1       1  5.9567  4.7968        13  186555  1757726713
2       2  8.3078  7.0407        74  322648  1137537235
3       3  7.3665  2.5165        65  704587  6567393236
4       4  4.0961  1.1307        31  472130  7440663949


In [5]:
# 2. Basic data processing
# 1) Narrow the data to a small spatial window (2 < x < 2.5, 1.0 < y < 1.5).
#    `.copy()` turns the filtered subset into an independent DataFrame, so
#    assigning new feature columns to `data` afterwards does not trigger
#    pandas' SettingWithCopyWarning.
data = data.query("x < 2.5 & x > 2 & y < 1.5 & y > 1.0").copy()

In [6]:
# Preview the first five rows that survived the spatial filter.
print(data.head(n=5))

      row_id       x       y  accuracy    time    place_id
112      112  2.2360  1.3655        66  623174  7663031065
180      180  2.2003  1.2541        65  610195  2358558474
367      367  2.4108  1.3213        74  579667  6644108708
874      874  2.0822  1.1973       320  143566  3229876087
1022    1022  2.0160  1.1659        65  207993  3244363975


In [11]:
# 2) Process the time feature: interpret the integer "time" column as a
#    number of seconds (unit="s") and convert it to pandas timestamps.
time_value = pd.to_datetime(arg=data["time"], unit="s")
print(time_value.head(n=5))

112    1970-01-08 05:06:14
180    1970-01-08 01:29:55
367    1970-01-07 17:01:07
874    1970-01-02 15:52:46
1022   1970-01-03 09:46:33
Name: time, dtype: datetime64[ns]


In [14]:
# Wrap the timestamps in a DatetimeIndex so calendar fields (day, weekday,
# hour) can be read as attributes; show the day-of-month values as a check.
date = pd.DatetimeIndex(data=time_value)
print(date.day)

Int64Index([8, 8, 7, 2, 3, 6, 6, 2, 4, 7,
            ...
            2, 1, 9, 2, 4, 9, 7, 9, 2, 1],
           dtype='int64', name='time', length=83197)


In [13]:
# Add the day-of-month of each check-in as a feature column.
# NOTE(review): relies on `date` having been built from this same `data`
# frame (same row order); run the cells top to bottom.
data["day"] = date.day

In [15]:
# Add the day-of-week of each check-in as a feature column
# (pandas convention: Monday=0 ... Sunday=6).
data["weekday"] = date.weekday

In [16]:
# Add the hour-of-day of each check-in as a feature column.
data["hour"] = date.hour

In [17]:
# Preview the table with the new day / weekday / hour columns attached.
print(data.head(n=5))

      row_id       x       y  accuracy    time    place_id  day  weekday  hour
112      112  2.2360  1.3655        66  623174  7663031065    8        3     5
180      180  2.2003  1.2541        65  610195  2358558474    8        3     1
367      367  2.4108  1.3213        74  579667  6644108708    7        2    17
874      874  2.0822  1.1973       320  143566  3229876087    2        4    15
1022    1022  2.0160  1.1659        65  207993  3244363975    3        5     9


In [22]:
# 3) Filter out places with few check-ins.
#    Count the check-ins per place_id. The "row_id" column is selected
#    *before* aggregating so that only one column is counted instead of
#    counting every column and then discarding all but one; the resulting
#    Series (indexed by place_id, named "row_id") is the same.
place_count = data.groupby("place_id")["row_id"].count()
print(place_count.head())

place_id
1012165853     1
1013991737     3
1014605271    28
1015645743     4
1017236154    31
Name: row_id, dtype: int64


In [24]:
# Preview the places that have more than 3 check-ins.
print(place_count[place_count.gt(3)].head(n=5))

place_id
1014605271    28
1015645743     4
1017236154    31
1024951487     5
1028119817     4
Name: row_id, dtype: int64


In [28]:
# Keep only the rows whose place_id has more than 3 check-ins: build a
# boolean mask with `isin` against the ids of the frequent places and use
# it to select rows from `data`.
data_final = data[
    data["place_id"].isin(
        place_count[place_count.gt(3)].index.values
    )
]

In [29]:
# Preview the first five rows that remain after dropping rare places.
print(data_final.head(n=5))

      row_id       x       y  accuracy    time    place_id  day  weekday  hour
112      112  2.2360  1.3655        66  623174  7663031065    8        3     5
367      367  2.4108  1.3213        74  579667  6644108708    7        2    17
874      874  2.0822  1.1973       320  143566  3229876087    2        4    15
1022    1022  2.0160  1.1659        65  207993  3244363975    3        5     9
1045    1045  2.3859  1.1660       498  503378  6438240873    6        1    19


In [30]:
# Separate the target from the features.
# Target: the place the check-in belongs to.
y = data_final["place_id"]
# Features: position, reported accuracy and the derived calendar fields.
x = data_final[
    [
        "x",
        "y",
        "accuracy",
        "day",
        "weekday",
        "hour",
    ]
]

In [32]:
# Preview the first five rows of the feature matrix.
print(x.head(n=5))

           x       y  accuracy  day  weekday  hour
112   2.2360  1.3655        66    8        3     5
367   2.4108  1.3213        74    7        2    17
874   2.0822  1.1973       320    2        4    15
1022  2.0160  1.1659        65    3        5     9
1045  2.3859  1.1660       498    6        1    19


In [34]:
# Preview the first five target labels.
print(y.head(n=5))

112     7663031065
367     6644108708
874     3229876087
1022    3244363975
1045    6438240873
Name: place_id, dtype: int64


In [35]:
# 数据集划分
from sklearn.model_selection import train_test_split

In [36]:
# Split into training and test sets (scikit-learn's default test share is
# used). A fixed random_state makes the shuffle deterministic, so the split,
# the selected hyper-parameters and the reported accuracy are reproducible
# on a fresh "Restart & Run All".
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [37]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [38]:
# 3) Feature engineering: standardisation.
#    The scaler is fitted on the training split only; the same fitted
#    scaler is then applied to both the training and the test split.
transfer = StandardScaler()
transfer.fit(x_train)
x_train = transfer.transform(x_train)
x_test = transfer.transform(x_test)

# 4) KNN estimator, wrapped in a grid search with cross-validation.
#    Candidate neighbour counts to try:
param_dict = {"n_neighbors": [3, 5, 7, 9]}
#    3-fold cross-validated search over those candidates.
estimator = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_dict,
    cv=3,
)
estimator.fit(x_train, y_train)

# 5) Model evaluation
# Method 1: compare the predictions with the true labels element-wise.
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("直接比对真实值和预测值:\n", y_test == y_predict)

# Method 2: compute the score (accuracy) on the test split.
score = estimator.score(x_test, y_test)
print("准确率为：\n", score)

# Grid-search diagnostics.
# Best parameter combination: best_params_
print("最佳参数：\n", estimator.best_params_)
# Best cross-validated score: best_score_
print("最佳结果：\n", estimator.best_score_)
# Estimator refitted with the best parameters: best_estimator_
print("最佳估计器:\n", estimator.best_estimator_)
# Full cross-validation results: cv_results_
print("交叉验证结果:\n", estimator.cv_results_)



y_predict:
 [1540382716 1533408099 1533408099 ... 2367979052 6644108708 1430541006]
直接比对真实值和预测值:
 24167233     True
13720787     True
11237195     True
13031721     True
5671499      True
14884363     True
10682396     True
24127214    False
17709468     True
27958906    False
17916554    False
29007542     True
24992416     True
10458362    False
25138793     True
10150121    False
12150630    False
27161435    False
19473046    False
8478596     False
20517337    False
27151477    False
19926232     True
9184790     False
25623816    False
21910497    False
21820410    False
5674027      True
5697133     False
6927799      True
            ...  
2736363     False
9980674      True
12734797     True
3131471     False
22231951     True
4036397     False
4262669     False
18336177     True
26331661     True
2061546      True
9668776     False
4782477     False
5035875      True
19804862     True
26278729    False
20453516    False
29029618    False
8859393      True
22959337    False
44