In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# 1.获取数据集
# 2.基本数据处理
# 2.1 缩⼩数据范围
# 2.2 选择时间特征
# 2.3 去掉签到较少的地⽅
# 2.4 确定特征值和⽬标值
# 2.5 分割数据集
# 3.特征⼯程 -- 特征预处理(标准化)
# 4.机器学习 -- knn+cv
# 5.模型评估

In [3]:
# 1.获取数据集
data = pd.read_csv("./data/FBlocation/train.csv")

In [4]:
data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949


In [5]:
data.describe()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
count,29118020.0,29118020.0,29118020.0,29118020.0,29118020.0,29118020.0
mean,14559010.0,4.99977,5.001814,82.84912,417010.4,5493787000.0
std,8405649.0,2.857601,2.887505,114.7518,231176.1,2611088000.0
min,0.0,0.0,0.0,1.0,1.0,1000016000.0
25%,7279505.0,2.5347,2.4967,27.0,203057.0,3222911000.0
50%,14559010.0,5.0091,4.9883,62.0,433922.0,5518573000.0
75%,21838520.0,7.4614,7.5103,75.0,620491.0,7764307000.0
max,29118020.0,10.0,10.0,1033.0,786239.0,9999932000.0


In [6]:
data.shape

(29118021, 6)

In [7]:
# 2.基本数据处理
# 2.1 缩⼩数据范围
partial_data = data.query("x>2.0 & x<2.5 & y>2.0 & y< 2.5")

In [8]:
partial_data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
163,163,2.1663,2.3755,84,669737,3869813743
310,310,2.3695,2.2034,3,234719,2636621520
658,658,2.3236,2.1768,66,502343,7877745055
1368,1368,2.2613,2.3392,73,319822,9775192577
1627,1627,2.3331,2.0011,66,595084,6731326909


In [9]:
partial_data.shape

(71664, 6)

In [10]:
# 2.2 选择时间特征
partial_data["time"].head()

163     669737
310     234719
658     502343
1368    319822
1627    595084
Name: time, dtype: int64

In [11]:
time = pd.to_datetime(partial_data["time"], unit="s")
# 脱敏
time.head()

163    1970-01-08 18:02:17
310    1970-01-03 17:11:59
658    1970-01-06 19:32:23
1368   1970-01-04 16:50:22
1627   1970-01-07 21:18:04
Name: time, dtype: datetime64[ns]

In [12]:
time = pd.DatetimeIndex(time)
time

DatetimeIndex(['1970-01-08 18:02:17', '1970-01-03 17:11:59',
               '1970-01-06 19:32:23', '1970-01-04 16:50:22',
               '1970-01-07 21:18:04', '1970-01-02 03:14:59',
               '1970-01-07 03:45:16', '1970-01-05 03:28:43',
               '1970-01-01 18:59:03', '1970-01-09 07:50:12',
               ...
               '1970-01-09 20:03:34', '1970-01-08 09:26:50',
               '1970-01-07 04:45:59', '1970-01-07 22:36:18',
               '1970-01-06 23:29:43', '1970-01-03 12:31:26',
               '1970-01-04 15:19:20', '1970-01-01 20:49:14',
               '1970-01-03 09:17:37', '1970-01-02 20:34:43'],
              dtype='datetime64[ns]', name='time', length=71664, freq=None)

In [13]:
time.hour

Int64Index([18, 17, 19, 16, 21,  3,  3,  3, 18,  7,
            ...
            20,  9,  4, 22, 23, 12, 15, 20,  9, 20],
           dtype='int64', name='time', length=71664)

In [14]:
time.day

Int64Index([8, 3, 6, 4, 7, 2, 7, 5, 1, 9,
            ...
            9, 8, 7, 7, 6, 3, 4, 1, 3, 2],
           dtype='int64', name='time', length=71664)

In [15]:
partial_data["hour"] = time.hour
partial_data["day"] = time.day
partial_data["weekday"] = time.weekday

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  partial_data["hour"] = time.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  partial_data["day"] = time.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  partial_data["weekday"] = time.weekday


In [16]:
partial_data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,hour,day,weekday
163,163,2.1663,2.3755,84,669737,3869813743,18,8,3
310,310,2.3695,2.2034,3,234719,2636621520,17,3,5
658,658,2.3236,2.1768,66,502343,7877745055,19,6,1
1368,1368,2.2613,2.3392,73,319822,9775192577,16,4,6
1627,1627,2.3331,2.0011,66,595084,6731326909,21,7,2


In [17]:
# 2.3 去掉签到较少的地⽅
place_count = partial_data.groupby("place_id").count()

In [18]:
place_count.head()

Unnamed: 0_level_0,row_id,x,y,accuracy,time,hour,day,weekday
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1006234733,1,1,1,1,1,1,1,1
1008823061,4,4,4,4,4,4,4,4
1012580558,3,3,3,3,3,3,3,3
1025585791,21,21,21,21,21,21,21,21
1026507711,220,220,220,220,220,220,220,220


In [19]:
place_count = place_count[place_count["row_id"]>3]
place_count.head()

Unnamed: 0_level_0,row_id,x,y,accuracy,time,hour,day,weekday
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1008823061,4,4,4,4,4,4,4,4
1025585791,21,21,21,21,21,21,21,21
1026507711,220,220,220,220,220,220,220,220
1032417180,10,10,10,10,10,10,10,10
1040557418,123,123,123,123,123,123,123,123


In [20]:
partial_data = partial_data[partial_data["place_id"]. isin(place_count.index)]
partial_data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,hour,day,weekday
163,163,2.1663,2.3755,84,669737,3869813743,18,8,3
310,310,2.3695,2.2034,3,234719,2636621520,17,3,5
658,658,2.3236,2.1768,66,502343,7877745055,19,6,1
1368,1368,2.2613,2.3392,73,319822,9775192577,16,4,6
1627,1627,2.3331,2.0011,66,595084,6731326909,21,7,2


In [21]:
partial_data.shape

(69264, 9)

In [22]:
# 2.4 确定特征值和⽬标值
x = partial_data[["x", "y", "accuracy", "hour", "day", "weekday"]]
y = partial_data["place_id"]

In [23]:
x.head()

Unnamed: 0,x,y,accuracy,hour,day,weekday
163,2.1663,2.3755,84,18,8,3
310,2.3695,2.2034,3,17,3,5
658,2.3236,2.1768,66,19,6,1
1368,2.2613,2.3392,73,16,4,6
1627,2.3331,2.0011,66,21,7,2


In [24]:
y.head()

163     3869813743
310     2636621520
658     7877745055
1368    9775192577
1627    6731326909
Name: place_id, dtype: int64

In [25]:
# 2.5 分割数据集
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2, test_size=0.25)
x_train.head()

Unnamed: 0,x,y,accuracy,hour,day,weekday
19509166,2.3217,2.2029,1,0,8,3
20577315,2.48,2.2129,175,20,7,2
24044078,2.2389,2.3447,79,3,8,3
11279021,2.085,2.2789,119,1,9,4
19491154,2.2408,2.0092,168,0,8,3


In [26]:
# 3.特征⼯程 -- 特征预处理(标准化)
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

In [27]:
# 4.机器学习 -- knn+cv
# 4.1 实例化一个训练器
estimator = KNeighborsClassifier()

# 4.2 交叉验证，网络搜索实现
param_grid = {"n_neighbors": [3, 5, 7, 9]}
estimator = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=10, n_jobs=-1) # n_jobs:cpu的数量

# 4.3 模型训练
estimator.fit(x_train, y_train)



In [28]:
# 5.模型评估
# 5.1 准确率输出
score_ret = estimator.score(x_test, y_test)
print("准确率为:\n", score_ret)

准确率为:
 0.3693116193116193


In [29]:
# 5.2 预测结果
y_pre = estimator.predict(x_test)
print("预测值:\n", y_pre)

预测值:
 [2225211839 8980163153 1247398579 ... 1891783132 8169595806 3661555534]


In [30]:
# 5.3 其他结果输出
print("最好的模型是:\n", estimator.best_estimator_)
print("最好的结果是:\n", estimator.best_score_)
print("所有的结果 是:\n", estimator.cv_results_)

最好的模型是:
 KNeighborsClassifier()
最好的结果是:
 0.36103403905372417
所有的结果 是:
 {'mean_fit_time': array([0.13590701, 0.13994305, 0.13146608, 0.13414617]), 'std_fit_time': array([0.02319974, 0.03845817, 0.02040376, 0.03042105]), 'mean_score_time': array([0.62977064, 0.71646326, 0.77553704, 0.63035402]), 'std_score_time': array([0.05780875, 0.0549796 , 0.08020924, 0.19707421]), 'param_n_neighbors': masked_array(data=[3, 5, 7, 9],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_neighbors': 3}, {'n_neighbors': 5}, {'n_neighbors': 7}, {'n_neighbors': 9}], 'split0_test_score': array([0.34725698, 0.36438884, 0.36612127, 0.3599615 ]), 'split1_test_score': array([0.34687199, 0.35938402, 0.35919153, 0.35688162]), 'split2_test_score': array([0.35052936, 0.35899904, 0.3574591 , 0.35784408]), 'split3_test_score': array([0.34898941, 0.36458133, 0.36246391, 0.35899904]), 'split4_test_score': array([0.35264678, 0.36381136, 0.36458133, 0.35803657]