In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Predict Facebook check-in place_id with a KNN classifier tuned by grid search.

# 1. Load the data
data = pd.read_csv("./fb_location/train.csv")
# 2. Preprocess: narrow the coordinate range and derive time features.
# .copy() detaches the filtered rows from the original frame so the column
# assignments below write to an independent DataFrame instead of a slice
# (avoids pandas' SettingWithCopyWarning).
data = data.query("x < 5.5 & x > 5.0 & y < 3.5 & y > 3.0").copy()
# The "time" column is interpreted as a number of seconds (unit="s").
time_value = pd.to_datetime(data["time"], unit="s")
date = pd.DatetimeIndex(time_value)
data["day"] = date.day
data["weekday"] = date.weekday
data["hour"] = date.hour
# 3. Drop places with few check-ins (keep place_ids with more than 3 rows)
place_count = data.groupby("place_id").count()["row_id"]
data_new = data[data["place_id"].isin(place_count[place_count > 3].index.values)]
# 4. Select the feature columns and the target column
x = data_new[["x", "y", "accuracy", "day", "weekday", "hour"]]
y = data_new["place_id"]
# 5. Split into training and test sets (default split ratio, unseeded shuffle)
x_train, x_test, y_train, y_test = train_test_split(x, y)
# 6. Feature engineering (standardization): fit on the training set only and
# reuse the same statistics for the test set.
# NOTE(review): the scaler is fitted on all of x_train before the grid search,
# so each cross-validation fold sees scaling statistics that include its own
# validation rows.
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
# 7. KNN estimator (p=2 selects the Euclidean form of the Minkowski metric)
estimator = KNeighborsClassifier(p=2)
# 8. Model selection and tuning (grid search with 3-fold cross-validation)
param_dict = {"n_neighbors": [1, 3, 5, 7, 9, 11]}
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)
# 9. Model evaluation: predictions, element-wise comparison with the truth,
# test-set score, and the grid-search summary attributes.
y_predict = estimator.predict(x_test)
print("y_predict: ", y_predict)
print("对比真实值和预测值：", y_test == y_predict)
score = estimator.score(x_test, y_test)
print("准确率为：", score)
print("最佳参数：", estimator.best_params_)
print("最佳结果：", estimator.best_score_)
print("最佳预估器：", estimator.best_estimator_)
print("交叉验证结果：", estimator.cv_results_)



y_predict:  [6668114465 3873334833 4373429966 ... 1853867889 6284072854 6284072854]
对比真实值和预测值： 25862616    False
8622703      True
11091100    False
10670039    False
23130637    False
            ...  
5452709      True
7286332      True
28159569    False
13965343    False
11357540    False
Name: place_id, Length: 18231, dtype: bool
准确率为： 0.357797158685755
最佳参数： {'n_neighbors': 1}
最佳结果： 0.3449625159992686
最佳预估器： KNeighborsClassifier(n_neighbors=1)
交叉验证结果： {'mean_fit_time': array([0.07480812, 0.06316797, 0.06250111, 0.07180913, 0.06549207,
       0.07280493]), 'std_fit_time': array([0.01345018, 0.00046384, 0.00092276, 0.00777671, 0.00384322,
       0.00776772]), 'mean_score_time': array([0.50465131, 0.57345891, 0.62266811, 0.66687298, 0.7134165 ,
       0.72738767]), 'std_score_time': array([0.00987267, 0.00487732, 0.00791522, 0.00204762, 0.00367248,
       0.00367241]), 'param_n_neighbors': masked_array(data=[1, 3, 5, 7, 9, 11],
             mask=[False, False, False, False, False, Fa