In [123]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

In [126]:
# Load the raw GPS fixes; the first CSV column is used as the row index.
gps = pd.read_csv(
    '../data/train_mGps1.csv',
    index_col=0,
)
# Preview the first five rows.
gps.head(5)

Unnamed: 0,timestamp,lat,lon,accuracy,userId,date
0,39,37.544851,127.054421,34.077,user01,2020-08-30
1,43,37.544952,127.054391,37.184,user01,2020-08-30
2,53,37.544954,127.054413,34.264,user01,2020-08-30
3,59,37.544954,127.054413,34.264,user01,2020-08-30
4,3,37.544825,127.054485,68.169,user01,2020-08-30


In [82]:
# Summary statistics of the numeric GPS columns.
# NOTE: in the saved output, `accuracy` has a 75th percentile of 18.51 but a
# maximum of 18797.02, i.e. a small number of fixes carry very large values.
gps.describe()

Unnamed: 0,timestamp,lat,lon,accuracy
count,5538301.0,5538301.0,5538301.0,5538301.0
mean,29.51425,37.42538,127.0224,52.48133
std,17.31624,0.2805442,0.2342028,273.0639
min,0.0,35.05041,126.3092,1.0
25%,15.0,37.38709,126.9295,9.78
50%,30.0,37.48707,127.0084,14.108
75%,45.0,37.53498,127.0917,18.51
max,59.0,38.22056,129.197,18797.02


In [83]:
# Number of GPS rows per (user, day), ordered from the sparsest day to the densest.
(
    gps
    .groupby(['userId', 'date'])
    .count()
    .sort_values(by='lat')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,lat,lon,accuracy
userId,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
user21,2020-09-19,10,10,10,10
user22,2020-09-01,56,56,56,56
user21,2020-09-20,137,137,137,137
user05,2020-09-11,266,266,266,266
user27,2020-09-15,542,542,542,542
...,...,...,...,...,...
user07,2020-09-10,24266,24266,24266,24266
user07,2020-09-11,24491,24491,24491,24491
user07,2020-09-26,25715,25715,25715,25715
user06,2020-09-15,25918,25918,25918,25918


## Kalman Filter로 Smoothing 후 일자별 이동거리 계산

In [84]:
from pykalman import KalmanFilter
from geopy.distance import geodesic

In [85]:
def calculate_gps_distance(df, date, user, lat, lon):
    """Return the total distance (km) travelled along a Kalman-smoothed GPS track.

    The rows of ``df`` are treated as consecutive fixes in their existing row
    order; nothing is sorted here, so the caller is responsible for passing
    the rows in the order the track should be walked.

    Parameters
    ----------
    df : pandas.DataFrame
        GPS fixes of a single track. For a non-empty frame, the columns
        ``lat_smooth`` and ``lon_smooth`` are written to ``df`` in place, so
        pass a copy if the original frame must stay untouched.
    date, user : any
        Unused. Kept only so existing calls keep working.
    lat, lon : str
        Names of the latitude and longitude columns in ``df``.

    Returns
    -------
    float
        Sum of the geodesic distances, in kilometres, between consecutive
        smoothed points. ``0.0`` for an empty or single-row frame.
    """
    # Nothing to smooth or measure; avoids indexing the first row of an empty frame.
    if len(df) == 0:
        return 0.0

    coords = df[[lat, lon]].to_numpy(dtype=float)

    # Smooth the track with a Kalman filter whose state is the (lat, lon)
    # position itself: identity transition and identity observation.
    kf = KalmanFilter(
        initial_state_mean=[coords[0, 0], coords[0, 1]],
        transition_matrices=[[1, 0], [0, 1]],
        observation_matrices=[[1, 0], [0, 1]],
        observation_covariance=1e-4 * np.eye(2),  # observation noise covariance
        transition_covariance=1e-5 * np.eye(2),  # state (process) noise covariance
    )

    # smooth() returns (state means, state covariances); only the means are needed.
    smoothed = kf.smooth(coords)[0]
    df.loc[:, 'lat_smooth'] = smoothed[:, 0]
    df.loc[:, 'lon_smooth'] = smoothed[:, 1]

    # Sum the geodesic length of every consecutive pair of smoothed points.
    # Iterating over the array avoids one Series lookup per coordinate per step.
    points = [tuple(point) for point in smoothed.tolist()]
    return sum(
        (geodesic(start, end).km for start, end in zip(points, points[1:])),
        0.0,
    )

In [95]:
# Daily travel distance per user, computed from all GPS fixes (no accuracy filter).
#
# Rows are collected in a list and turned into a DataFrame once, instead of
# growing the frame with pd.concat on every iteration. groupby(sort=False)
# visits users, and dates within each user, in order of first appearance,
# which matches looping over .unique() values.
gps1_records = []
for user_id, user_rows in gps.groupby('userId', sort=False):
    for day, day_rows in user_rows.groupby('date', sort=False):
        gps1_records.append({
            'timestamp': day,
            'subject_id': user_id,
            # A copy is passed because calculate_gps_distance adds columns to its input.
            'gps_distance': calculate_gps_distance(day_rows.copy(), 'date', 'userId', 'lat', 'lon'),
        })

gps1 = pd.DataFrame(gps1_records, columns=['timestamp', 'subject_id', 'gps_distance'])
gps1

Unnamed: 0,timestamp,subject_id,gps_distance
0,2020-08-30,user01,28.088965
1,2020-08-31,user01,114.502223
2,2020-09-05,user01,48.058927
3,2020-09-07,user01,52.759719
4,2020-09-08,user01,28.673744
...,...,...,...
529,2020-09-23,user30,20.145071
530,2020-09-24,user30,16.149133
531,2020-09-25,user30,25.236322
532,2020-09-26,user30,0.913059


GPS accuracy 임계값 = 53 (accuracy 값이 53 이하인 측정값만 사용)

In [97]:
# Keep only the fixes whose `accuracy` value is at most 53.
gps_cut = gps.loc[gps['accuracy'].le(53)]

In [100]:
# Daily travel distance per user, computed from the accuracy-filtered fixes (gps_cut).
#
# Rows are collected in a list and turned into a DataFrame once, instead of
# growing the frame with pd.concat on every iteration. groupby(sort=False)
# visits users, and dates within each user, in order of first appearance,
# which matches looping over .unique() values.
gps2_records = []
for user_id, user_rows in gps_cut.groupby('userId', sort=False):
    for day, day_rows in user_rows.groupby('date', sort=False):
        gps2_records.append({
            'timestamp': day,
            'subject_id': user_id,
            # A copy is passed because calculate_gps_distance adds columns to its input.
            'gps_distance': calculate_gps_distance(day_rows.copy(), 'date', 'userId', 'lat', 'lon'),
        })

gps2 = pd.DataFrame(gps2_records, columns=['timestamp', 'subject_id', 'gps_distance'])
gps2

Unnamed: 0,timestamp,subject_id,gps_distance
0,2020-08-30,user01,27.959994
1,2020-08-31,user01,93.417040
2,2020-09-05,user01,28.073488
3,2020-09-07,user01,24.984973
4,2020-09-08,user01,23.955756
...,...,...,...
529,2020-09-23,user30,18.020900
530,2020-09-24,user30,15.249634
531,2020-09-25,user30,24.320630
532,2020-09-26,user30,0.824783


### 결과 확인

In [96]:
# Distribution of the daily distances (km) computed from all GPS fixes.
gps1['gps_distance'].describe()

count    534.000000
mean      46.815547
std       72.611226
min        0.000325
25%        6.470641
50%       23.990926
75%       60.801333
max      781.269146
Name: gps_distance, dtype: float64

In [101]:
# Distribution of the daily distances (km) computed from the accuracy-filtered fixes.
gps2['gps_distance'].describe()

count    534.000000
mean      41.927825
std       67.513350
min        0.000325
25%        5.972970
50%       19.972144
75%       53.726366
max      742.698716
Name: gps_distance, dtype: float64

In [None]:
# Per-user mean / min / max of the daily distance (km) from the unfiltered fixes.
# Only 'gps_distance' is aggregated: gps1 also carries the string-valued
# 'timestamp' column, for which a mean is meaningless (recent pandas raises
# TypeError when 'mean' is applied to an object column).
gps1.groupby('subject_id')['gps_distance'].agg(['mean', 'min', 'max']).reset_index()

In [104]:
# Display the accuracy-filtered daily distances (one row per user and day).
gps2

Unnamed: 0,timestamp,subject_id,gps_distance
0,2020-08-30,user01,27.959994
1,2020-08-31,user01,93.417040
2,2020-09-05,user01,28.073488
3,2020-09-07,user01,24.984973
4,2020-09-08,user01,23.955756
...,...,...,...
529,2020-09-23,user30,18.020900
530,2020-09-24,user30,15.249634
531,2020-09-25,user30,24.320630
532,2020-09-26,user30,0.824783


### 결과 저장

In [120]:
# Express each daily distance as a fraction of that user's largest daily distance.
distance_max = (
    gps2
    .groupby(['subject_id'])['gps_distance']
    .max()
    .reset_index()
    .rename(columns={'gps_distance': 'max_distance'})
)

# Attach every user's maximum to each of that user's rows, then take the ratio.
final_gps = gps2.merge(distance_max, on='subject_id')
final_gps['distance_ratio'] = final_gps['gps_distance'].div(final_gps['max_distance'])
final_gps

Unnamed: 0,timestamp,subject_id,gps_distance,max_distance,distance_ratio
0,2020-08-30,user01,27.959994,104.800651,0.266792
1,2020-08-31,user01,93.417040,104.800651,0.891378
2,2020-09-05,user01,28.073488,104.800651,0.267875
3,2020-09-07,user01,24.984973,104.800651,0.238405
4,2020-09-08,user01,23.955756,104.800651,0.228584
...,...,...,...,...,...
529,2020-09-23,user30,18.020900,36.363621,0.495575
530,2020-09-24,user30,15.249634,36.363621,0.419365
531,2020-09-25,user30,24.320630,36.363621,0.668818
532,2020-09-26,user30,0.824783,36.363621,0.022682


In [121]:
# The 'timestamp' column holds the calendar date of each row, so rename it to 'date'.
final_gps = final_gps.rename(columns={'timestamp':'date'})

In [122]:
# Save the per-day distance features. The DataFrame index is written as the
# first (unnamed) CSV column because to_csv writes the index by default.
final_gps.to_csv('../data/train_gps_distance.csv')