<a href="https://colab.research.google.com/github/kyochanpy/Google_Smartphone_Decimeter_Challenge/blob/main/PP/outlier_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
from pathlib import Path 
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [19]:
# data
# Root folder holding the competition CSVs. This is an absolute path that looks
# like a Google Drive mount inside Colab — adjust it when running elsewhere.
path = Path("/content/drive/MyDrive/GSDC")
# Baseline position estimates for the test set; cleaned and smoothed further down.
test_base = pd.read_csv(path / "baseline_locations_test.csv")
# Submission template; its latDeg/lngDeg columns are replaced in the final cell.
sub = pd.read_csv(path / "sample_submission.csv")

In [20]:
# Collect every ground-truth file under train/. rglob() returns a one-shot
# generator, so it is materialised into a list: the loading cell below can then
# be re-run without finding the iterator already exhausted.
truths = list((path / "train").rglob("ground_truth.csv"))

In [21]:
# Only the three key columns and the two coordinates are needed from each file.
cols = ["collectionName", "phoneName", "millisSinceGpsEpoch", "latDeg", "lngDeg"]

# Read each ground-truth file and stack them into one frame.
# The progress-bar total (73) is hard-coded.
df_list = [pd.read_csv(t, usecols=cols) for t in tqdm(truths, total=73)]
df_truth = pd.concat(df_list, ignore_index=True)

# Pair every ground-truth fix with the baseline estimate that has the same
# collection, phone and timestamp; the suffixes tell the two coordinate sets apart.
train_base = pd.read_csv(path / "baseline_locations_train.csv", usecols=cols)
all_df = df_truth.merge(
    train_base,
    how="inner",
    on=cols[:3],
    suffixes=("_truth", '_train_base'),
)

HBox(children=(FloatProgress(value=0.0, max=73.0), HTML(value='')))




In [39]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point, in degrees.
    lat2, lon2 : latitude / longitude of the second point, in degrees.
        Anything ``np.radians`` accepts works (scalars, arrays, pandas
        Series); the computation is element-wise.

    Returns
    -------
    Distance on a sphere of radius ``RADIUS`` (6,367,000, i.e. metres for an
    Earth-sized sphere), with the same shape/type as the inputs.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    d = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Mathematically d <= 1, but rounding can push it a hair above 1 for
    # (near-)antipodal points, which would make arcsin return NaN.
    d = np.minimum(d, 1.0)
    dist = 2 * RADIUS * np.arcsin(d**0.5)
    return dist

In [40]:
# Error of the baseline: distance between each ground-truth fix and the
# baseline estimate matched to it.
all_df['dist'] = calc_haversine(all_df.latDeg_truth, all_df.lngDeg_truth,
                                all_df.latDeg_train_base, all_df.lngDeg_train_base)

In [41]:
# Summary statistics of the baseline-vs-truth distance.
all_df["dist"].describe()

count    131342.000000
mean          3.846848
std          30.739767
min           0.001338
25%           1.210976
50%           2.065769
75%           3.560001
max        8340.257976
Name: dist, dtype: float64

In [42]:
# The 50 rows with the largest baseline error, and the collection each belongs to.
all_df[["collectionName", "dist"]].sort_values(by="dist", ascending=False).head(50)

Unnamed: 0,collectionName,dist
59220,2020-09-04-US-SF-1,8340.257976
47991,2020-07-17-US-MTV-2,5050.995543
110383,2021-04-26-US-SVL-1,2254.344928
46999,2020-07-17-US-MTV-2,2026.294654
46997,2020-07-17-US-MTV-2,1934.676643
126704,2021-04-29-US-SJC-2,1599.570433
12409,2020-05-29-US-MTV-1,1128.348831
46998,2020-07-17-US-MTV-2,1044.316856
90154,2021-01-05-US-SVL-1,653.703379
97486,2021-04-15-US-MTV-1,549.061548


In [43]:
# Create the two neighbour-distance columns up front, initialised to 0:
# "dist_pre" is filled with the distance to the previous row and "dist_pro"
# with the distance to the following row when those are computed.
test_base["dist_pre"] = 0
test_base["dist_pro"] = 0

In [44]:
# Look up the previous / next fix *within each phone*, so a shift never pulls a
# coordinate from another phone's track. Rows that have no such neighbour (the
# first / last fix of a phone) get NaN in the *_pre / *_pro columns.
phone_groups = test_base.groupby('phone', sort=False)
is_first_fix = phone_groups.cumcount() == 0
is_last_fix = phone_groups.cumcount(ascending=False) == 0

test_base['latDeg_pre'] = phone_groups['latDeg'].shift(1)
test_base['lngDeg_pre'] = phone_groups['lngDeg'].shift(1)
test_base['latDeg_pro'] = phone_groups['latDeg'].shift(-1)
test_base['lngDeg_pro'] = phone_groups['lngDeg'].shift(-1)

# Distance from the previous fix to this one, and from this one to the next.
test_base['dist_pre'] = calc_haversine(test_base.latDeg_pre, test_base.lngDeg_pre, test_base.latDeg, test_base.lngDeg)
test_base['dist_pro'] = calc_haversine(test_base.latDeg, test_base.lngDeg, test_base.latDeg_pro, test_base.lngDeg_pro)

# A phone's first fix has no predecessor and its last fix has no successor:
# record those distances as 0 so they can never exceed an outlier threshold.
test_base.loc[is_first_fix, 'dist_pre'] = 0
test_base.loc[is_last_fix, 'dist_pro'] = 0

In [45]:
# Summary statistics of the distance to the previous row.
test_base["dist_pre"].describe()

count    91486.000000
mean        16.937410
std         12.526582
min          0.000000
25%          5.200745
50%         14.842604
75%         28.551707
max        391.394578
Name: dist_pre, dtype: float64

In [48]:
# Cut-offs: mean + 2 * standard deviation of each neighbour-distance column.
pre_95 = test_base['dist_pre'].mean() + 2 * test_base['dist_pre'].std()
pro_95 = test_base['dist_pro'].mean() + 2 * test_base['dist_pro'].std()

# A row counts as an outlier only when it is far from BOTH of its neighbours.
is_spike = (test_base['dist_pre'] > pre_95) & (test_base['dist_pro'] > pro_95)
ind = test_base.loc[is_spike].index

# Replace each outlier with the midpoint of the rows labelled i-1 and i+1.
# Rows are handled one at a time, in index order, so a later outlier sees the
# already-replaced coordinates of an earlier neighbouring one.
for i in ind:
    for coord in ('latDeg', 'lngDeg'):
        test_base.loc[i, coord] = (test_base.loc[i-1, coord] + test_base.loc[i+1, coord])/2

# Kalman Filter

In [49]:
!pip install simdkalman
import simdkalman

Collecting simdkalman
  Downloading https://files.pythonhosted.org/packages/d9/8a/6a4508231837fa2e5af4c6f0d6fa3d987a27f7aeb287b4e990c29b4e1815/simdkalman-1.0.1-py2.py3-none-any.whl
Installing collected packages: simdkalman
Successfully installed simdkalman-1.0.1


In [53]:
# Kalman filter over a 6-dimensional state. The observation model exposes
# state components 0 and 1 (the two measured coordinates); per the transition
# matrix, components 2-3 act as their first derivatives and 4-5 as their second
# derivatives, i.e. a constant-acceleration motion model with step T.
T = 1.0
state_transition = np.array([
    [1, 0, T, 0, 0.5 * T ** 2, 0],
    [0, 1, 0, T, 0, 0.5 * T ** 2],
    [0, 0, 1, 0, T, 0],
    [0, 0, 0, 1, 0, T],
    [0, 0, 0, 0, 1, 0],
    [0, 0, 0, 0, 0, 1],
])
# Diagonal noise levels plus a small constant (1e-9) added to every entry.
process_noise = np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6]) + np.full((6, 6), 1e-9)
observation_model = np.array([
    [1, 0, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0],
])
observation_noise = np.diag([7e-5, 7e-5]) + np.full((2, 2), 1e-9)

kf = simdkalman.KalmanFilter(
    state_transition=state_transition,
    process_noise=process_noise,
    observation_model=observation_model,
    observation_noise=observation_noise,
)

In [54]:
def apply_kf_smoothing(df, kf_=kf):
    """Smooth latDeg/lngDeg with a Kalman smoother, one track at a time.

    A track is one distinct (collectionName, phoneName) pair. For each track
    the two coordinate columns are passed to ``kf_.smooth`` as a single series
    of shape (1, n_rows, 2), and the first two components of the smoothed
    state means are written back over ``latDeg`` and ``lngDeg``.

    Parameters
    ----------
    df : DataFrame with columns collectionName, phoneName, latDeg, lngDeg.
        It is modified in place.
    kf_ : object providing ``smooth(data)`` whose result exposes
        ``states.mean``; defaults to the notebook-level ``kf``.

    Returns
    -------
    The same ``df`` object that was passed in.
    """
    track_keys = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
    for collection, phone in tqdm(track_keys):
        in_track = (df['collectionName'] == collection) & (df['phoneName'] == phone)
        # One batch entry holding the whole track: (1, n_rows, 2).
        observations = df.loc[in_track, ['latDeg', 'lngDeg']].to_numpy().reshape(1, -1, 2)
        state_means = kf_.smooth(observations).states.mean
        df.loc[in_track, 'latDeg'] = state_means[0, :, 0]
        df.loc[in_track, 'lngDeg'] = state_means[0, :, 1]
    return df

In [55]:
# apply_kf_smoothing overwrites the coordinates of the frame it is given, so a
# copy is smoothed: test_base keeps its pre-smoothing values and re-running
# this cell does not smooth already-smoothed data.
kf_smoothed_baseline = apply_kf_smoothing(test_base.copy())

# assign() aligns on the index; mismatched indexes would silently yield NaNs.
assert sub.index.equals(kf_smoothed_baseline.index), \
    "sample submission and smoothed baseline are not row-aligned"
sub = sub.assign(
    latDeg=kf_smoothed_baseline.latDeg,
    lngDeg=kf_smoothed_baseline.lngDeg,
)
sub.to_csv('submission_08.csv', index=False)

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))


