In [160]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

In [161]:
# Raw GPS samples for the test and validation splits.
test_gps_path = '../data/test dataset/ch2024_test_m_gps.parquet'
val_gps_path = '../data/val dataset/ch2024_val__m_gps.parquet'

test_gps = pd.read_parquet(test_gps_path)
val_gps = pd.read_parquet(val_gps_path)

In [162]:
def _timestamp_to_date_str(gps):
    """Return a copy of `gps` whose 'timestamp' column holds the calendar date as a string.

    The column is first passed through `pd.to_datetime`, so the cell can be
    re-run safely: on the second run the column already contains date strings
    (no `.dt` accessor), and they are simply parsed back before being
    truncated to the date again.
    """
    day = pd.to_datetime(gps['timestamp']).dt.date
    # assign() builds a new frame, leaving the frame that was passed in untouched.
    return gps.assign(timestamp=day).astype({'timestamp': str})


test_gps = _timestamp_to_date_str(test_gps)
val_gps = _timestamp_to_date_str(val_gps)

display(test_gps.head(), val_gps.head())

Unnamed: 0,subject_id,timestamp,altitude,latitude,longitude,speed
0,5,2023-11-05,95.734328,0.03837,0.028696,0.126473
1,5,2023-11-05,95.734328,0.038373,0.028697,0.03271
2,5,2023-11-05,95.734328,0.038373,0.028704,0.237968
3,5,2023-11-05,95.734328,0.038367,0.028711,0.146265
4,5,2023-11-05,95.734328,0.038362,0.028713,0.051259


Unnamed: 0,subject_id,timestamp,altitude,latitude,longitude,speed
0,1,2023-08-20,144.217651,0.016095,0.926485,0.143791
1,1,2023-08-20,144.217651,0.01609,0.926477,0.160771
2,1,2023-08-20,144.217651,0.016091,0.926478,0.006571
3,1,2023-08-20,144.217651,0.016091,0.926474,0.05931
4,1,2023-08-20,144.217651,0.016092,0.926477,0.049454


In [163]:
# Number of GPS samples per subject and day (test split), sparsest days first.
(
    test_gps
    .groupby(['subject_id', 'timestamp'])
    .count()
    .sort_values(by='latitude')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,altitude,latitude,longitude,speed
subject_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7,2023-11-26,2,2,2,2
5,2023-11-15,2,2,2,2
5,2023-11-23,4,4,4,4
6,2023-10-31,5,5,5,5
6,2023-10-29,5,5,5,5
...,...,...,...,...,...
5,2023-11-07,20541,20541,20541,20541
5,2023-11-12,21249,21249,21249,21249
5,2023-11-06,21305,21305,21305,21305
5,2023-11-09,21338,21338,21338,21338


In [164]:
# Number of GPS samples per subject and day (validation split), sparsest days first.
(
    val_gps
    .groupby(['subject_id', 'timestamp'])
    .count()
    .sort_values(by='latitude')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,altitude,latitude,longitude,speed
subject_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2023-09-28,5,5,5,5
1,2023-09-23,8,8,8,8
1,2023-09-18,10,10,10,10
4,2023-10-15,12,12,12,12
4,2023-10-09,13,13,13,13
...,...,...,...,...,...
2,2023-09-15,21045,21045,21045,21045
2,2023-09-27,21448,21448,21448,21448
2,2023-09-26,21633,21633,21633,21633
2,2023-09-14,21648,21648,21648,21648


## Kalman Filter로 Smoothing 후 일자별 이동거리 계산

In [165]:
from pykalman import KalmanFilter
from geopy.distance import geodesic

In [166]:
def calculate_gps_distance(df, date, user, lat, lon,
                           observation_var=1e-4, transition_var=1e-5):
    """Return the total path length (km) of a GPS track after Kalman smoothing.

    The (lat, lon) series is smoothed with a 2-D Kalman filter whose
    transition and observation matrices are both the identity, then the
    geodesic distance between every pair of consecutive smoothed points is
    summed.

    Parameters
    ----------
    df : pandas.DataFrame
        Track points, one row per sample, in the order they should be
        connected. Must contain the columns named by `lat` and `lon`.
        The frame is not modified.
    date, user : object
        Unused. Kept only so existing positional calls keep working.
    lat, lon : str
        Names of the latitude and longitude columns in `df`.
    observation_var : float, optional
        Diagonal value of the observation-noise covariance (default 1e-4).
    transition_var : float, optional
        Diagonal value of the state-noise covariance (default 1e-5).

    Returns
    -------
    float
        Sum of geodesic segment lengths in kilometres; 0.0 when `df` has
        fewer than two rows.
    """
    # With fewer than two points there is no segment to measure, and an empty
    # frame has no first row to seed the filter with.
    if len(df) < 2:
        return 0.0

    coords = df[[lat, lon]].to_numpy(dtype=float)

    # Kalman smoothing of the raw coordinates.
    kf = KalmanFilter(
        initial_state_mean=coords[0],
        transition_matrices=np.eye(2),
        observation_matrices=np.eye(2),
        observation_covariance=observation_var * np.eye(2),  # observation-noise covariance
        transition_covariance=transition_var * np.eye(2),  # state-noise covariance
    )
    # smooth() returns (state means, state covariances); only the means are needed.
    smoothed = np.asarray(kf.smooth(coords)[0])

    # Plain (lat, lon) tuples of Python floats, extracted once instead of
    # indexing the frame row by row inside the loop.
    points = [tuple(point) for point in smoothed.tolist()]

    # Travelled distance: sum over consecutive point pairs.
    return float(sum(
        geodesic(start, end).km
        for start, end in zip(points[:-1], points[1:])
    ))

##### validation

In [167]:
# Daily travelled distance per subject for the validation split.
# Rows are collected in a list and turned into a DataFrame once, instead of
# growing the frame with pd.concat on every iteration.
val_records = []

# sort=False keeps subjects and days in order of first appearance in val_gps.
for subject_id, subject_gps in val_gps.groupby('subject_id', sort=False):
    for day, day_gps in subject_gps.groupby('timestamp', sort=False):
        val_records.append({
            'timestamp': day,
            'subject_id': subject_id,
            # .copy(): give the helper its own frame so it can never write into val_gps.
            'gps_distance': calculate_gps_distance(
                day_gps.copy(), 'timestamp', 'subject_id', 'latitude', 'longitude'
            ),
        })

val_gps1 = pd.DataFrame(val_records, columns=['timestamp', 'subject_id', 'gps_distance'])
val_gps1

Unnamed: 0,timestamp,subject_id,gps_distance
0,2023-08-20,1,12.728039
1,2023-08-21,1,10.879238
2,2023-08-22,1,11.372360
3,2023-08-23,1,18.676591
4,2023-08-24,1,7.164520
...,...,...,...
96,2023-10-27,4,21.067014
97,2023-10-28,4,14.722594
98,2023-10-29,4,31.145407
99,2023-10-30,4,35.936848


##### test

In [168]:
# Daily travelled distance per subject for the test split.
# Rows are collected in a list and turned into a DataFrame once, instead of
# growing the frame with pd.concat on every iteration.
test_records = []

# sort=False keeps subjects and days in order of first appearance in test_gps.
for subject_id, subject_gps in test_gps.groupby('subject_id', sort=False):
    for day, day_gps in subject_gps.groupby('timestamp', sort=False):
        test_records.append({
            'timestamp': day,
            'subject_id': subject_id,
            # .copy(): give the helper its own frame so it can never write into test_gps.
            'gps_distance': calculate_gps_distance(
                day_gps.copy(), 'timestamp', 'subject_id', 'latitude', 'longitude'
            ),
        })

test_gps1 = pd.DataFrame(test_records, columns=['timestamp', 'subject_id', 'gps_distance'])
test_gps1

Unnamed: 0,timestamp,subject_id,gps_distance
0,2023-11-05,5,19.189911
1,2023-11-06,5,23.507009
2,2023-11-07,5,19.830692
3,2023-11-08,5,18.805130
4,2023-11-09,5,21.914918
...,...,...,...
108,2023-11-05,8,6.616197
109,2023-11-06,8,58.828139
110,2023-11-07,8,20.485850
111,2023-11-08,8,13.449348


### 결과 확인

In [169]:
# Summary statistics of the daily distances (validation split).
val_gps1.loc[:, 'gps_distance'].describe()

count    101.000000
mean      16.278450
std       29.643772
min        0.000489
25%        1.479076
50%       10.184801
75%       18.752844
max      225.699928
Name: gps_distance, dtype: float64

In [170]:
# Summary statistics of the daily distances (test split).
test_gps1.loc[:, 'gps_distance'].describe()

count    113.000000
mean      17.482268
std       51.680716
min        0.000368
25%        0.009102
50%        2.577064
75%       14.836563
max      385.594827
Name: gps_distance, dtype: float64

In [171]:
# Aggregate per subject/day; mean, min and max coincide when each day has a single row.
(
    val_gps1
    .groupby(['subject_id', 'timestamp'])
    .agg(['mean', 'min', 'max'])
    .reset_index()
)

Unnamed: 0_level_0,subject_id,timestamp,gps_distance,gps_distance,gps_distance
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,min,max
0,1,2023-08-20,12.728039,12.728039,12.728039
1,1,2023-08-21,10.879238,10.879238,10.879238
2,1,2023-08-22,11.372360,11.372360,11.372360
3,1,2023-08-23,18.676591,18.676591,18.676591
4,1,2023-08-24,7.164520,7.164520,7.164520
...,...,...,...,...,...
96,4,2023-10-27,21.067014,21.067014,21.067014
97,4,2023-10-28,14.722594,14.722594,14.722594
98,4,2023-10-29,31.145407,31.145407,31.145407
99,4,2023-10-30,35.936848,35.936848,35.936848


In [172]:
# Aggregate per subject/day; mean, min and max coincide when each day has a single row.
(
    test_gps1
    .groupby(['subject_id', 'timestamp'])
    .agg(['mean', 'min', 'max'])
    .reset_index()
)

Unnamed: 0_level_0,subject_id,timestamp,gps_distance,gps_distance,gps_distance
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,min,max
0,5,2023-11-05,19.189911,19.189911,19.189911
1,5,2023-11-06,23.507009,23.507009,23.507009
2,5,2023-11-07,19.830692,19.830692,19.830692
3,5,2023-11-08,18.805130,18.805130,18.805130
4,5,2023-11-09,21.914918,21.914918,21.914918
...,...,...,...,...,...
108,8,2023-11-05,6.616197,6.616197,6.616197
109,8,2023-11-06,58.828139,58.828139,58.828139
110,8,2023-11-07,20.485850,20.485850,20.485850
111,8,2023-11-08,13.449348,13.449348,13.449348


### 결과 저장

In [184]:
# Daily distance divided by each subject's own maximum daily distance (validation split).
distance_max1 = (
    val_gps1
    .groupby(['subject_id'])['gps_distance']
    .max()
    .reset_index()
    .rename(columns={'gps_distance': 'max_distance'})
)

final_gps1 = (
    pd.merge(val_gps1, distance_max1, on='subject_id')
    .assign(distance_ratio=lambda merged: merged['gps_distance'] / merged['max_distance'])
    .rename(columns={'timestamp': 'date'})
)
final_gps1

Unnamed: 0,date,subject_id,gps_distance,max_distance,distance_ratio
0,2023-08-20,1,12.728039,118.121587,0.107754
1,2023-08-21,1,10.879238,118.121587,0.092102
2,2023-08-22,1,11.372360,118.121587,0.096277
3,2023-08-23,1,18.676591,118.121587,0.158113
4,2023-08-24,1,7.164520,118.121587,0.060654
...,...,...,...,...,...
96,2023-10-27,4,21.067014,225.699928,0.093341
97,2023-10-28,4,14.722594,225.699928,0.065231
98,2023-10-29,4,31.145407,225.699928,0.137995
99,2023-10-30,4,35.936848,225.699928,0.159224


In [185]:
# Daily distance divided by each subject's own maximum daily distance (test split).
distance_max2 = (
    test_gps1
    .groupby(['subject_id'])['gps_distance']
    .max()
    .reset_index()
    .rename(columns={'gps_distance': 'max_distance'})
)

final_gps2 = (
    pd.merge(test_gps1, distance_max2, on='subject_id')
    .assign(distance_ratio=lambda merged: merged['gps_distance'] / merged['max_distance'])
    .rename(columns={'timestamp': 'date'})
)
final_gps2

Unnamed: 0,date,subject_id,gps_distance,max_distance,distance_ratio
0,2023-11-05,5,19.189911,23.507009,0.816349
1,2023-11-06,5,23.507009,23.507009,1.000000
2,2023-11-07,5,19.830692,23.507009,0.843608
3,2023-11-08,5,18.805130,23.507009,0.799980
4,2023-11-09,5,21.914918,23.507009,0.932272
...,...,...,...,...,...
108,2023-11-05,8,6.616197,338.004687,0.019574
109,2023-11-06,8,58.828139,338.004687,0.174045
110,2023-11-07,8,20.485850,338.004687,0.060608
111,2023-11-08,8,13.449348,338.004687,0.039790


In [186]:
# Persist the per-day distance features. No `index` argument is passed, so
# pandas' default applies and the row index is written as the first CSV column.
val_distance_csv = '../data/val_gps_distance.csv'
test_distance_csv = '../data/test_gps_distance.csv'

final_gps1.to_csv(val_distance_csv)
final_gps2.to_csv(test_distance_csv)