<a href="https://colab.research.google.com/github/kyochanpy/Google_Smartphone_Decimeter_Challenge/blob/main/note_book/submission_18.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install optuna > /dev/null
!pip install pyproj > /dev/null
!pip install simdkalman > /dev/null
    
import os
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns

import optuna
import plotly
import plotly.express as px
import pyproj
from pathlib import Path
from pyproj import Proj, transform
from tqdm.notebook import tqdm
import simdkalman

In [3]:
path = Path("/content/drive/MyDrive/GSDC")
test_base = pd.read_csv(path / "baseline_locations_test.csv")
sub = pd.read_csv(path / "sample_submission.csv")

truths = (path / "train").rglob("ground_truth.csv")

df_list = []
cols = ["collectionName", "phoneName", "millisSinceGpsEpoch", "latDeg", "lngDeg"]

for t in tqdm(truths, total=73):
    df_phone = pd.read_csv(t, usecols=cols)
    df_list.append(df_phone)
df_truth = pd.concat(df_list, ignore_index=True)

train_base = pd.read_csv(path / "baseline_locations_train.csv")
all_df = df_truth.merge(train_base, how="inner", on=cols[:3], suffixes=("_truth", '_train_base'))

HBox(children=(FloatProgress(value=0.0, max=73.0), HTML(value='')))




In [4]:
def get_groundtruth(path: Path) -> pd.DataFrame:
        output_df = pd.DataFrame()
        
        for path in glob(str(path / 'train/*/*/ground_truth.csv')):
            _df = pd.read_csv(path)
            output_df = pd.concat([output_df, _df])
        output_df = output_df.reset_index(drop=True)
        
        _columns = ['latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM']
        output_df[['t_'+col for col in _columns]] = output_df[_columns]
        output_df = output_df.drop(columns=_columns, axis=1)
        return output_df


In [5]:
train_base = train_base.merge(
    get_groundtruth(path),
    on=['collectionName', 'phoneName', 'millisSinceGpsEpoch']
)

# calc_haversine

In [12]:
def calc_haversine(lat1, lon1, lat2, lon2):
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    d = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(d**0.5)
    return dist

# Check Score

In [13]:
def check_score(input_df: pd.DataFrame) -> pd.DataFrame:
    output_df = input_df.copy()
    
    output_df['meter'] = input_df.apply(
        lambda r: calc_haversine(
            r.latDeg, r.lngDeg, r.t_latDeg, r.t_lngDeg
        ),
        axis=1
    )

    meter_score = output_df['meter'].mean()
    print(f'error meter: {meter_score}')

    scores = []
    for phone in output_df['phone'].unique():
        _index = output_df['phone']==phone
        p_50 = np.percentile(output_df.loc[_index, 'meter'], 50)
        p_95 = np.percentile(output_df.loc[_index, 'meter'], 95)
        scores.append(p_50)
        scores.append(p_95)

    score = sum(scores) / len(scores)
    print(f'score: {score}')
    
    return output_df

In [14]:
#mean_prediction
def mean_prediction_train(input_df):
    def make_lerp_data(input_df):
        org_colus = input_df.columns

        time_list = input_df[["collectionName", "millisSinceGpsEpoch"]].drop_duplicates()
        phone_list = input_df[["collectionName", "phoneName"]].drop_duplicates()
        tmp = time_list.merge(phone_list, on="collectionName", how="outer")

        output_df = tmp.merge(input_df, on=["collectionName", "millisSinceGpsEpoch", "phoneName"], how="left")
        output_df["phone"] = output_df["collectionName"] + "_" + output_df["phoneName"]
        output_df = output_df.sort_values(["phone", "millisSinceGpsEpoch"])

        output_df["latDeg_pre"] = output_df["latDeg"].shift(1)
        output_df["latDeg_pro"] = output_df["latDeg"].shift(-1)
        output_df["lngDeg_pre"] = output_df["lngDeg"].shift(1)
        output_df["lngDeg_pro"] = output_df["lngDeg"].shift(-1)
        output_df["phone_pre"] = output_df["phone"].shift(1)
        output_df["phone_pro"] = output_df["phone"].shift(-1)
        output_df["millisSinceGpsEpoch_pre"] = output_df["millisSinceGpsEpoch"].shift(1)
        output_df["millisSinceGpsEpoch_pro"] = output_df["millisSinceGpsEpoch"].shift(-1)

        output_df = output_df[(output_df["latDeg"].isnull())&(output_df["phone"] == output_df["phone_pre"])&
                            (output_df["phone"] == output_df["phone_pro"])].copy()

        output_df["latDeg"] = output_df["latDeg_pre"] + ((output_df["latDeg_pro"] - output_df["latDeg_pre"]) * 
                                                        ((output_df["millisSinceGpsEpoch"] - output_df["millisSinceGpsEpoch_pre"]) /
                                                        (output_df["millisSinceGpsEpoch_pro"] - output_df["millisSinceGpsEpoch_pre"])))
        output_df["lngDeg"] = output_df["lngDeg_pre"] + ((output_df["lngDeg_pro"] - output_df["lngDeg_pre"]) * 
                                                        ((output_df["millisSinceGpsEpoch"] - output_df["millisSinceGpsEpoch_pre"]) /
                                                        (output_df["millisSinceGpsEpoch_pro"] - output_df["millisSinceGpsEpoch_pre"])))
        
        output_df = output_df[~output_df['latDeg'].isnull()]

        return output_df[org_colus]

    
    def calc_mean_pred(input_df, lerp_df):
        input_df["phone"] = input_df["collectionName"] + "_" + input_df["phoneName"]
        add_lerp = pd.concat([input_df, lerp_df])
        mean_pred_result = add_lerp.groupby(["collectionName", "millisSinceGpsEpoch"])[["latDeg", "lngDeg"]].mean().reset_index()
        output_df = input_df[["collectionName", "phoneName", "millisSinceGpsEpoch"]].copy()
        output_df = output_df.merge(mean_pred_result[["collectionName", "millisSinceGpsEpoch", "latDeg", "lngDeg"]],
                                        on=["collectionName", "millisSinceGpsEpoch"], how="left")
        output_df["phone"] = output_df["collectionName"] + "_" + output_df["phoneName"]
        return output_df

    
    def add_distance_diff(df):
        df['latDeg_pre'] = df['latDeg'].shift(1)
        df['latDeg_pro'] = df['latDeg'].shift(-1)
        df['lngDeg_pre'] = df['lngDeg'].shift(1)
        df['lngDeg_pro'] = df['lngDeg'].shift(-1)
        df['phone_pre'] = df['phone'].shift(1)
        df['phone_pro'] = df['phone'].shift(-1)
        
        df['dist_pre'] = calc_haversine(df['latDeg'], df['lngDeg'], df['latDeg_pre'], df['lngDeg_pre'])
        df['dist_pro'] = calc_haversine(df['latDeg'], df['lngDeg'], df['latDeg_pro'], df['lngDeg_pro'])
        
        df.loc[df['phone']!=df['phone_pre'], ['latDeg_pre', 'lngDeg_pre', 'dist_pre']] = np.nan
        df.loc[df['phone']!=df['phone_pro'], ['latDeg_next', 'lngDeg_pro', 'dist_pro']] = np.nan
        
        return df

    input_df_ = add_distance_diff(input_df)
    th = 50
    input_df_.loc[((input_df_['dist_pre'] > th) & (input_df_['dist_pro'] > th)), ['latDeg', 'lngDeg']] = np.nan

    test_lerp = make_lerp_data(input_df_)
    test_mean_pred  = calc_mean_pred(input_df_, test_lerp)

    output_df = sub.copy()

    output_df["latDeg"] = test_mean_pred["latDeg"]
    output_df["lngDeg"] = test_mean_pred["lngDeg"]

    output_df["phoneName"] = input_df["phoneName"]
    output_df["collectionName"] = input_df["collectionName"]
    output_df["millisSinceGpsEpoch"] = input_df["millisSinceGpsEpoch"]
    output_df["t_latDeg"] = input_df["t_latDeg"]
    output_df["t_lngDeg"] = input_df["t_lngDeg"]

    return output_df

In [15]:
#outlier
def outlier_train(input_df):
    output_df = input_df

    def calc_haversine(lat1, lon1, lat2, lon2):
        RADIUS = 6_367_000
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        d = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        dist = 2 * RADIUS * np.arcsin(d**0.5)
        return dist
    
    output_df['dist'] = calc_haversine(output_df.t_latDeg, output_df.t_lngDeg,
                                        output_df.latDeg, output_df.lngDeg)
    
    per_95 = output_df['dist'].mean() + (output_df['dist'].std() * 2)
    ind = output_df[(output_df['dist'] > per_95)][['dist']].index

    for i in ind:
        output_df.loc[i,'latDeg'] = (output_df.loc[i-1,'latDeg'] + output_df.loc[i+1,'latDeg'])/2
        output_df.loc[i,'lngDeg'] = (output_df.loc[i-1,'lngDeg'] + output_df.loc[i+1,'lngDeg'])/2

    return output_df

In [16]:
#position_shift
def position_shift_train(input_df):
    sub_cols = sub.columns

    train_p_s = pd.read_csv(path / "baseline_locations_train.csv")
    train_b = train_p_s[sub_cols]
    test_b = test_base[sub_cols]

    msge = "millisSinceGpsEpoch"

    testdir = path / 'test'
    traindir = path / 'train'

    g_t = pd.DataFrame()
    for d in os.listdir(traindir):
        for p in os.listdir(traindir/d):
            g_t = g_t.append(pd.read_csv(traindir/d/p/'ground_truth.csv'))

    g_t["phone"] = g_t["collectionName"] + "_" + g_t["phoneName"]
    g_t_sub_cols = g_t[sub_cols]


    def compute_dist(fname_1, fname_2=g_t):
        oof = fname_1
        g_t = fname_2
        df = oof.merge(g_t, on=["phone", "millisSinceGpsEpoch"])
        dist_oof = calc_haversine(df.latDeg_x, df.lngDeg_x, df.latDeg_y, df.lngDeg_y)
        scores = pd.DataFrame({"phone":df.phone, "dist":dist_oof})
        scores_grp = scores.groupby("phone")
        d_50 = scores_grp.quantile(.50).reset_index()
        d_50.columns = ["phone", "q_50"]
        d_95 = scores_grp.quantile(.95).reset_index()
        d_95.columns = ["phone", "q_95"]
        return (scores_grp.quantile(.50).mean() + scores_grp.quantile(.95).mean()) / 2, d_50.merge(d_95)


    def WGS84_to_ECEF(lat, lng, alt):
        rad_lat = lat * (np.pi / 180.0)
        rad_lng = lng * (np.pi / 180.0)
        a = 6378137.0
        finv = 298.257223563
        f = 1 / finv   
        e2 = 1 - (1 - f) * (1 - f)    
        N = a / np.sqrt(1 - e2 * np.sin(rad_lat) * np.sin(rad_lat))
        x = (N + alt) * np.cos(rad_lat) * np.cos(rad_lng)
        y = (N + alt) * np.cos(rad_lat) * np.sin(rad_lng)
        z = (N * (1 - e2) + alt)        * np.sin(rad_lat)
        return x, y, z    


    transformer = pyproj.Transformer.from_crs(
        {"proj":'geocent', "ellps":'WGS84', "datum":'WGS84'},
        {"proj":'latlong', "ellps":'WGS84', "datum":'WGS84'},)


    def ECEF_to_WGS84(x, y, z):
        lng, lat, alt = transformer.transform(x, y, z, radians=False)
        return lng, lat, alt


    def position_shift_(input_df, a):
        output_df = input_df.copy()
        output_df["heightAboveWgs84EllipsoidM"] = 63.5
        output_df["x"], output_df["y"], output_df["z"] = zip(*output_df.apply(lambda x: WGS84_to_ECEF(x.latDeg, x.lngDeg, 
                                                                                                    x.heightAboveWgs84EllipsoidM), axis=1))
        output_df.sort_values(["phone", msge], inplace=True)
        for fi in ["x", "y", "z"]:
            output_df[[fi + "_p"]] = output_df[fi].shift().where(output_df["phone"].eq(output_df["phone"].shift()))
            output_df[[fi + "_diff"]] = output_df[fi] - output_df[fi + "_p"]

        output_df[["dist"]] = np.sqrt(output_df["x_diff"]**2 + output_df["y_diff"]**2 + output_df["z_diff"]**2)
        for fi in ["x", "y", "z"]:
            output_df[[fi + "_new"]] = output_df[fi + "_p"] + output_df[fi + "_diff"] * (1 - a/output_df["dist"])
        lng, lat, alt = ECEF_to_WGS84(output_df["x_new"].values, output_df["y_new"].values, output_df["z_new"].values)

        lat[np.isnan(lat)] = output_df.loc[np.isnan(lat), "latDeg"]
        lng[np.isnan(lng)] = output_df.loc[np.isnan(lng), "lngDeg"]
        output_df["latDeg"] = lat
        output_df["lngDeg"] = lng

        output_df.sort_values(["phone", msge], inplace=True)

        return output_df[sub_cols]


    def objective(trial):
        a = trial.suggest_uniform("a", -1, 1)
        score, scores = compute_dist(position_shift_(train_b, a), g_t)
        return score


    study = optuna.create_study()
    study.optimize(objective, n_trials=30)

    output_df = position_shift_(input_df, a=study.best_params["a"])

    output_df["phoneName"] = input_df["phoneName"]
    output_df["collectionName"] = input_df["collectionName"]
    output_df["millisSinceGpsEpoch"] = input_df["millisSinceGpsEpoch"]
    output_df["t_latDeg"] = input_df["t_latDeg"]
    output_df["t_lngDeg"] = input_df["t_lngDeg"]

    return output_df

In [32]:
#remove_device
def remove_device_train(input_df):

    def get_removedevice(input_df: pd.DataFrame, divece: str) -> pd.DataFrame:
        input_df['index'] = input_df.index
        input_df = input_df.sort_values('millisSinceGpsEpoch')
        input_df.index = input_df['millisSinceGpsEpoch'].values

        output_df = pd.DataFrame() 
        for _, subdf in input_df.groupby('collectionName'):

            phones = subdf['phoneName'].unique()

            if (len(phones) == 1) or (not divece in phones):
                output_df = pd.concat([output_df, subdf])
                continue

            origin_df = subdf.copy()
            
            _index = subdf['phoneName']==divece
            subdf.loc[_index, 'latDeg'] = np.nan
            subdf.loc[_index, 'lngDeg'] = np.nan
            subdf = subdf.interpolate(method='index', limit_area='inside')

            _index = subdf['latDeg'].isnull()
            subdf.loc[_index, 'latDeg'] = origin_df.loc[_index, 'latDeg'].values
            subdf.loc[_index, 'lngDeg'] = origin_df.loc[_index, 'lngDeg'].values

            output_df = pd.concat([output_df, subdf])

        output_df.index = output_df['index'].values
        output_df = output_df.sort_index()

        del output_df['index']
        
        return output_df

    output_df = get_removedevice(input_df, 'SamsungS20Ultra')

    return output_df

In [33]:
#kalman filter
def kalman_filter_train(input_df):
    T = 1.0
    state_transition = np.array([[1, 0, T, 0, 0.5 * T ** 2, 0], [0, 1, 0, T, 0, 0.5 * T ** 2], [0, 0, 1, 0, T, 0],
                                [0, 0, 0, 1, 0, T], [0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1]])
    process_noise = np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6]) + np.ones((6, 6)) * 1e-9
    observation_model = np.array([[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0]])
    observation_noise = np.diag([1e-4, 1e-4]) + np.ones((2, 2)) * 1e-9

    kf = simdkalman.KalmanFilter(
            state_transition = state_transition,
            process_noise = process_noise,
            observation_model = observation_model,
            observation_noise = observation_noise)
    
    def apply_kf_smoothing(df, kf_=kf):
        unique_paths = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
        for collection, phone in tqdm(unique_paths):
            cond = np.logical_and(df['collectionName'] == collection, df['phoneName'] == phone)
            data = df[cond][['latDeg', 'lngDeg']].to_numpy()
            data = data.reshape(1, len(data), 2)
            smoothed = kf_.smooth(data)
            df.loc[cond, 'latDeg'] = smoothed.states.mean[0, :, 0]
            df.loc[cond, 'lngDeg'] = smoothed.states.mean[0, :, 1]
        return df

    kf_smoothed_baseline = apply_kf_smoothing(input_df)
    output_df = sub.assign(latDeg = kf_smoothed_baseline.latDeg, lngDeg = kf_smoothed_baseline.lngDeg)
    output_df["phoneName"] = input_df["phoneName"]
    output_df["collectionName"] = input_df["collectionName"]
    output_df["millisSinceGpsEpoch"] = input_df["millisSinceGpsEpoch"]
    output_df["t_latDeg"] = input_df["t_latDeg"]
    output_df["t_lngDeg"] = input_df["t_lngDeg"]
    
    return output_df

In [37]:
#check_score
CV_2 = outlier_train(train_base)
CV_3 = mean_prediction_train(CV_2)
CV_4 = remove_device_train(CV_3)
CV_5 = position_shift_train(CV_4)

[32m[I 2021-07-01 14:22:25,880][0m A new study created in memory with name: no-name-0782e3a7-c118-441c-8280-43133ac30bd0[0m
[32m[I 2021-07-01 14:22:32,162][0m Trial 0 finished with value: 5.5001511352447645 and parameters: {'a': -0.5405892213573176}. Best is trial 0 with value: 5.5001511352447645.[0m
[32m[I 2021-07-01 14:22:38,482][0m Trial 1 finished with value: 5.421250422256026 and parameters: {'a': -0.36888627298886223}. Best is trial 1 with value: 5.421250422256026.[0m
[32m[I 2021-07-01 14:22:44,691][0m Trial 2 finished with value: 5.678008922513721 and parameters: {'a': -0.8699115456817548}. Best is trial 1 with value: 5.421250422256026.[0m
[32m[I 2021-07-01 14:22:51,021][0m Trial 3 finished with value: 5.364930515828075 and parameters: {'a': -0.227338219376217}. Best is trial 3 with value: 5.364930515828075.[0m
[32m[I 2021-07-01 14:22:57,348][0m Trial 4 finished with value: 5.188735259077459 and parameters: {'a': 0.6223120198482182}. Best is trial 4 with value: 

In [38]:
#check_score(CV_1)
print("----------------------")
check_score(CV_2)
print("----------------------")
check_score(CV_3)
print("----------------------")
check_score(CV_4)
print("----------------------")
check_score(CV_5)
print("----------------------")

----------------------
error meter: 3.4588279128326707
score: nan
----------------------
error meter: 2.435973575116457
score: nan
----------------------
error meter: 2.435973575116457
score: nan
----------------------
error meter: 2.410539685302506
score: nan
----------------------


# Mean Prediction

In [39]:
def mean_prediction(input_df):
    def make_lerp_data(input_df):
        org_colus = input_df.columns

        time_list = input_df[["collectionName", "millisSinceGpsEpoch"]].drop_duplicates()
        phone_list = input_df[["collectionName", "phoneName"]].drop_duplicates()
        tmp = time_list.merge(phone_list, on="collectionName", how="outer")

        output_df = tmp.merge(input_df, on=["collectionName", "millisSinceGpsEpoch", "phoneName"], how="left")
        output_df["phone"] = output_df["collectionName"] + "_" + output_df["phoneName"]
        output_df = output_df.sort_values(["phone", "millisSinceGpsEpoch"])

        output_df["latDeg_pre"] = output_df["latDeg"].shift(1)
        output_df["latDeg_pro"] = output_df["latDeg"].shift(-1)
        output_df["lngDeg_pre"] = output_df["lngDeg"].shift(1)
        output_df["lngDeg_pro"] = output_df["lngDeg"].shift(-1)
        output_df["phone_pre"] = output_df["phone"].shift(1)
        output_df["phone_pro"] = output_df["phone"].shift(-1)
        output_df["millisSinceGpsEpoch_pre"] = output_df["millisSinceGpsEpoch"].shift(1)
        output_df["millisSinceGpsEpoch_pro"] = output_df["millisSinceGpsEpoch"].shift(-1)

        output_df = output_df[(output_df["latDeg"].isnull())&(output_df["phone"] == output_df["phone_pre"])&
                            (output_df["phone"] == output_df["phone_pro"])].copy()

        output_df["latDeg"] = output_df["latDeg_pre"] + ((output_df["latDeg_pro"] - output_df["latDeg_pre"]) * 
                                                        ((output_df["millisSinceGpsEpoch"] - output_df["millisSinceGpsEpoch_pre"]) /
                                                        (output_df["millisSinceGpsEpoch_pro"] - output_df["millisSinceGpsEpoch_pre"])))
        output_df["lngDeg"] = output_df["lngDeg_pre"] + ((output_df["lngDeg_pro"] - output_df["lngDeg_pre"]) * 
                                                        ((output_df["millisSinceGpsEpoch"] - output_df["millisSinceGpsEpoch_pre"]) /
                                                        (output_df["millisSinceGpsEpoch_pro"] - output_df["millisSinceGpsEpoch_pre"])))
        
        output_df = output_df[~output_df['latDeg'].isnull()]

        return output_df[org_colus]

    
    def calc_mean_pred(input_df, lerp_df):
        input_df["phone"] = input_df["collectionName"] + "_" + input_df["phoneName"]
        add_lerp = pd.concat([input_df, lerp_df])
        mean_pred_result = add_lerp.groupby(["collectionName", "millisSinceGpsEpoch"])[["latDeg", "lngDeg"]].mean().reset_index()
        output_df = input_df[["collectionName", "phoneName", "millisSinceGpsEpoch"]].copy()
        output_df = output_df.merge(mean_pred_result[["collectionName", "millisSinceGpsEpoch", "latDeg", "lngDeg"]],
                                        on=["collectionName", "millisSinceGpsEpoch"], how="left")
        output_df["phone"] = output_df["collectionName"] + "_" + output_df["phoneName"]
        return output_df

    
    def add_distance_diff(df):
        df['latDeg_pre'] = df['latDeg'].shift(1)
        df['latDeg_pro'] = df['latDeg'].shift(-1)
        df['lngDeg_pre'] = df['lngDeg'].shift(1)
        df['lngDeg_pro'] = df['lngDeg'].shift(-1)
        df['phone_pre'] = df['phone'].shift(1)
        df['phone_pro'] = df['phone'].shift(-1)
        
        df['dist_pre'] = calc_haversine(df['latDeg'], df['lngDeg'], df['latDeg_pre'], df['lngDeg_pre'])
        df['dist_pro'] = calc_haversine(df['latDeg'], df['lngDeg'], df['latDeg_pro'], df['lngDeg_pro'])
        
        df.loc[df['phone']!=df['phone_pre'], ['latDeg_pre', 'lngDeg_pre', 'dist_pre']] = np.nan
        df.loc[df['phone']!=df['phone_pro'], ['latDeg_next', 'lngDeg_pro', 'dist_pro']] = np.nan
        
        return df

    input_df_ = add_distance_diff(input_df)
    th = 50
    input_df_.loc[((input_df_['dist_pre'] > th) & (input_df_['dist_pro'] > th)), ['latDeg', 'lngDeg']] = np.nan

    test_lerp = make_lerp_data(input_df_)
    test_mean_pred  = calc_mean_pred(input_df_, test_lerp)

    output_df = sub.copy()

    output_df["latDeg"] = test_mean_pred["latDeg"]
    output_df["lngDeg"] = test_mean_pred["lngDeg"]

    output_df["phoneName"] = input_df["phoneName"]
    output_df["collectionName"] = input_df["collectionName"]
    output_df["millisSinceGpsEpoch"] = input_df["millisSinceGpsEpoch"]
    #output_df["t_latDeg"] = input_df["t_latDeg"]
    #output_df["t_lngDeg"] = input_df["t_lngDeg"]

    return output_df

# Outlier 

In [40]:
def outlier(input_df):
    output_df = input_df

    output_df["dist_pre"] = 0
    output_df["dist_pro"] = 0

    output_df['latDeg_pre'] = output_df['latDeg'].shift(periods=1,fill_value=0)
    output_df['lngDeg_pre'] = output_df['lngDeg'].shift(periods=1,fill_value=0)
    output_df['latDeg_pro'] = output_df['latDeg'].shift(periods=-1,fill_value=0)
    output_df['lngDeg_pro'] = output_df['lngDeg'].shift(periods=-1,fill_value=0)
    output_df['dist_pre'] = calc_haversine(output_df.latDeg_pre, output_df.lngDeg_pre, output_df.latDeg, output_df.lngDeg)
    output_df['dist_pro'] = calc_haversine(output_df.latDeg, output_df.lngDeg, output_df.latDeg_pro, output_df.lngDeg_pro)

    list_phone = output_df['phone'].unique()
    for phone in list_phone:
        ind_s = output_df[test_base['phone'] == phone].index[0]
        ind_e = output_df[test_base['phone'] == phone].index[-1]
        output_df.loc[ind_s,'dist_pre'] = 0
        output_df.loc[ind_e,'dist_pro'] = 0

    pro_95 = output_df['dist_pro'].mean() + (output_df['dist_pro'].std() * 2)
    pre_95 = output_df['dist_pre'].mean() + (output_df['dist_pre'].std() * 2)
    ind = output_df[(output_df['dist_pro'] > pro_95)&(output_df['dist_pre'] > pre_95)][['dist_pre','dist_pro']].index

    for i in ind:
        output_df.loc[i,'latDeg'] = (output_df.loc[i-1,'latDeg'] + output_df.loc[i+1,'latDeg'])/2
        output_df.loc[i,'lngDeg'] = (output_df.loc[i-1,'lngDeg'] + output_df.loc[i+1,'lngDeg'])/2

    output_df["phoneName"] = input_df["phoneName"]
    output_df["collectionName"] = input_df["collectionName"]
    output_df["millisSinceGpsEpoch"] = input_df["millisSinceGpsEpoch"]

    return output_df

# Position Shift

In [41]:
def position_shift(input_df):
    sub_cols = sub.columns

    train_p_s = pd.read_csv(path / "baseline_locations_train.csv")
    train_b = train_p_s[sub_cols]
    test_b = test_base[sub_cols]

    msge = "millisSinceGpsEpoch"

    testdir = path / 'test'
    traindir = path / 'train'

    g_t = pd.DataFrame()
    for d in os.listdir(traindir):
        for p in os.listdir(traindir/d):
            g_t = g_t.append(pd.read_csv(traindir/d/p/'ground_truth.csv'))

    g_t["phone"] = g_t["collectionName"] + "_" + g_t["phoneName"]
    g_t_sub_cols = g_t[sub_cols]


    def compute_dist(fname_1, fname_2=g_t):
        oof = fname_1
        g_t = fname_2
        df = oof.merge(g_t, on=["phone", "millisSinceGpsEpoch"])
        dist_oof = calc_haversine(df.latDeg_x, df.lngDeg_x, df.latDeg_y, df.lngDeg_y)
        scores = pd.DataFrame({"phone":df.phone, "dist":dist_oof})
        scores_grp = scores.groupby("phone")
        d_50 = scores_grp.quantile(.50).reset_index()
        d_50.columns = ["phone", "q_50"]
        d_95 = scores_grp.quantile(.95).reset_index()
        d_95.columns = ["phone", "q_95"]
        return (scores_grp.quantile(.50).mean() + scores_grp.quantile(.95).mean()) / 2, d_50.merge(d_95)


    def WGS84_to_ECEF(lat, lng, alt):
        rad_lat = lat * (np.pi / 180.0)
        rad_lng = lng * (np.pi / 180.0)
        a = 6378137.0
        finv = 298.257223563
        f = 1 / finv   
        e2 = 1 - (1 - f) * (1 - f)    
        N = a / np.sqrt(1 - e2 * np.sin(rad_lat) * np.sin(rad_lat))
        x = (N + alt) * np.cos(rad_lat) * np.cos(rad_lng)
        y = (N + alt) * np.cos(rad_lat) * np.sin(rad_lng)
        z = (N * (1 - e2) + alt)        * np.sin(rad_lat)
        return x, y, z    


    transformer = pyproj.Transformer.from_crs(
        {"proj":'geocent', "ellps":'WGS84', "datum":'WGS84'},
        {"proj":'latlong', "ellps":'WGS84', "datum":'WGS84'},)


    def ECEF_to_WGS84(x, y, z):
        lng, lat, alt = transformer.transform(x, y, z, radians=False)
        return lng, lat, alt


    def position_shift_(input_df, a):
        output_df = input_df.copy()
        output_df["heightAboveWgs84EllipsoidM"] = 63.5
        output_df["x"], output_df["y"], output_df["z"] = zip(*output_df.apply(lambda x: WGS84_to_ECEF(x.latDeg, x.lngDeg, 
                                                                                                    x.heightAboveWgs84EllipsoidM), axis=1))
        output_df.sort_values(["phone", msge], inplace=True)
        for fi in ["x", "y", "z"]:
            output_df[[fi + "_p"]] = output_df[fi].shift().where(output_df["phone"].eq(output_df["phone"].shift()))
            output_df[[fi + "_diff"]] = output_df[fi] - output_df[fi + "_p"]

        output_df[["dist"]] = np.sqrt(output_df["x_diff"]**2 + output_df["y_diff"]**2 + output_df["z_diff"]**2)
        for fi in ["x", "y", "z"]:
            output_df[[fi + "_new"]] = output_df[fi + "_p"] + output_df[fi + "_diff"] * (1 - a/output_df["dist"])
        lng, lat, alt = ECEF_to_WGS84(output_df["x_new"].values, output_df["y_new"].values, output_df["z_new"].values)

        lat[np.isnan(lat)] = output_df.loc[np.isnan(lat), "latDeg"]
        lng[np.isnan(lng)] = output_df.loc[np.isnan(lng), "lngDeg"]
        output_df["latDeg"] = lat
        output_df["lngDeg"] = lng

        output_df.sort_values(["phone", msge], inplace=True)

        return output_df[sub_cols]


    def objective(trial):
        a = trial.suggest_uniform("a", -1, 1)
        score, scores = compute_dist(position_shift_(train_b, a), g_t)
        return score


    study = optuna.create_study()
    study.optimize(objective, n_trials=30)

    output_df = position_shift_(input_df, a=study.best_params["a"])

    output_df["phoneName"] = input_df["phoneName"]
    output_df["collectionName"] = input_df["collectionName"]
    output_df["millisSinceGpsEpoch"] = input_df["millisSinceGpsEpoch"]
    #output_df["t_latDeg"] = input_df["t_latDeg"]
    #output_df["t_lngDeg"] = input_df["t_lngDeg"]

    return output_df

# Remove Device

In [42]:
def remove_device(input_df):

    def get_removedevice(input_df: pd.DataFrame, divece: str) -> pd.DataFrame:
        input_df['index'] = input_df.index
        input_df = input_df.sort_values('millisSinceGpsEpoch')
        input_df.index = input_df['millisSinceGpsEpoch'].values

        output_df = pd.DataFrame() 
        for _, subdf in input_df.groupby('collectionName'):

            phones = subdf['phoneName'].unique()

            if (len(phones) == 1) or (not divece in phones):
                output_df = pd.concat([output_df, subdf])
                continue

            origin_df = subdf.copy()
            
            _index = subdf['phoneName']==divece
            subdf.loc[_index, 'latDeg'] = np.nan
            subdf.loc[_index, 'lngDeg'] = np.nan
            subdf = subdf.interpolate(method='index', limit_area='inside')

            _index = subdf['latDeg'].isnull()
            subdf.loc[_index, 'latDeg'] = origin_df.loc[_index, 'latDeg'].values
            subdf.loc[_index, 'lngDeg'] = origin_df.loc[_index, 'lngDeg'].values

            output_df = pd.concat([output_df, subdf])

        output_df.index = output_df['index'].values
        output_df = output_df.sort_index()

        del output_df['index']
        
        return output_df

    output_df = get_removedevice(input_df, 'SamsungS20Ultra')

    return output_df

In [None]:
remove_device(test_base)

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,dist_pre,dist_pro,latDeg_pre,lngDeg_pre,latDeg_pro,lngDeg_pro
0,2020-05-15-US-MTV-1,Pixel4,1273608785432,37.416554,-122.082085,-30.69,2020-05-15-US-MTV-1_Pixel4,0.000000,2.314801,0.000000,0.000000,37.416646,-122.082040
1,2020-05-15-US-MTV-1,Pixel4,1273608786432,37.416589,-122.082072,-31.76,2020-05-15-US-MTV-1_Pixel4,2.314801,0.720397,37.416628,-122.082053,37.416653,-122.082039
2,2020-05-15-US-MTV-1,Pixel4,1273608787432,37.416571,-122.082071,-31.65,2020-05-15-US-MTV-1_Pixel4,0.720397,5.475268,37.416646,-122.082040,37.416607,-122.082063
3,2020-05-15-US-MTV-1,Pixel4,1273608788432,37.416577,-122.082073,-31.52,2020-05-15-US-MTV-1_Pixel4,5.475268,0.883032,37.416653,-122.082039,37.416609,-122.082073
4,2020-05-15-US-MTV-1,Pixel4,1273608789432,37.416626,-122.082041,-28.95,2020-05-15-US-MTV-1_Pixel4,0.883032,1.298255,37.416607,-122.082063,37.416601,-122.082083
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91481,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763185000,37.334539,-121.899383,-8.39,2021-04-29-US-SJC-3_SamsungS20Ultra,1.048893,0.758992,37.334547,-121.899389,37.334545,-121.899380
91482,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763186000,37.334545,-121.899380,-7.36,2021-04-29-US-SJC-3_SamsungS20Ultra,0.758992,1.053741,37.334539,-121.899383,37.334551,-121.899371
91483,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763187000,37.334551,-121.899371,-4.08,2021-04-29-US-SJC-3_SamsungS20Ultra,1.053741,1.233520,37.334545,-121.899380,37.334540,-121.899371
91484,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763188000,37.334540,-121.899371,-5.70,2021-04-29-US-SJC-3_SamsungS20Ultra,1.233520,2.840957,37.334551,-121.899371,37.334562,-121.899354


In [None]:
get_removedevice(test_base, 'SamsungS20Ultra')

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,dist_pre,dist_pro,latDeg_pre,lngDeg_pre,latDeg_pro,lngDeg_pro
0,2020-05-15-US-MTV-1,Pixel4,1273608785432,37.416628,-122.082053,-30.69,2020-05-15-US-MTV-1_Pixel4,0.000000,2.314801,0.000000,0.000000,37.416646,-122.082040
1,2020-05-15-US-MTV-1,Pixel4,1273608786432,37.416646,-122.082040,-31.76,2020-05-15-US-MTV-1_Pixel4,2.314801,0.720397,37.416628,-122.082053,37.416653,-122.082039
2,2020-05-15-US-MTV-1,Pixel4,1273608787432,37.416653,-122.082039,-31.65,2020-05-15-US-MTV-1_Pixel4,0.720397,5.475268,37.416646,-122.082040,37.416607,-122.082063
3,2020-05-15-US-MTV-1,Pixel4,1273608788432,37.416607,-122.082063,-31.52,2020-05-15-US-MTV-1_Pixel4,5.475268,0.883032,37.416653,-122.082039,37.416609,-122.082073
4,2020-05-15-US-MTV-1,Pixel4,1273608789432,37.416609,-122.082073,-28.95,2020-05-15-US-MTV-1_Pixel4,0.883032,1.298255,37.416607,-122.082063,37.416601,-122.082083
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91481,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763185000,37.334539,-121.899383,-8.39,2021-04-29-US-SJC-3_SamsungS20Ultra,1.048893,0.758992,37.334547,-121.899389,37.334545,-121.899380
91482,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763186000,37.334545,-121.899380,-7.36,2021-04-29-US-SJC-3_SamsungS20Ultra,0.758992,1.053741,37.334539,-121.899383,37.334551,-121.899371
91483,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763187000,37.334551,-121.899371,-4.08,2021-04-29-US-SJC-3_SamsungS20Ultra,1.053741,1.233520,37.334545,-121.899380,37.334540,-121.899371
91484,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763188000,37.334540,-121.899371,-5.70,2021-04-29-US-SJC-3_SamsungS20Ultra,1.233520,2.840957,37.334551,-121.899371,37.334562,-121.899354


# Kalman Filter

In [46]:
def kalman_filter(input_df):

    T = 1.0
    state_transition = np.array([[1, 0, T, 0, 0.5 * T ** 2, 0], [0, 1, 0, T, 0, 0.5 * T ** 2], [0, 0, 1, 0, T, 0],
                                [0, 0, 0, 1, 0, T], [0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1]])
    process_noise = np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6]) + np.ones((6, 6)) * 1e-9
    observation_model = np.array([[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0]])
    observation_noise = np.diag([1e-4, 1e-4]) + np.ones((2, 2)) * 1e-9

    kf = simdkalman.KalmanFilter(
            state_transition = state_transition,
            process_noise = process_noise,
            observation_model = observation_model,
            observation_noise = observation_noise)
    
    def apply_kf_smoothing(df, kf_=kf):
        unique_paths = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
        for collection, phone in tqdm(unique_paths):
            cond = np.logical_and(df['collectionName'] == collection, df['phoneName'] == phone)
            data = df[cond][['latDeg', 'lngDeg']].to_numpy()
            data = data.reshape(1, len(data), 2)
            smoothed = kf_.smooth(data)
            df.loc[cond, 'latDeg'] = smoothed.states.mean[0, :, 0]
            df.loc[cond, 'lngDeg'] = smoothed.states.mean[0, :, 1]
        return df

    kf_smoothed_baseline = apply_kf_smoothing(input_df)
    output_df = sub.assign(latDeg = kf_smoothed_baseline.latDeg, lngDeg = kf_smoothed_baseline.lngDeg)
    output_df["phoneName"] = input_df["phoneName"]
    output_df["collectionName"] = input_df["collectionName"]
    output_df["millisSinceGpsEpoch"] = input_df["millisSinceGpsEpoch"]
    #output_df["t_latDeg"] = input_df["t_latDeg"]
    #output_df["t_lngDeg"] = input_df["t_lngDeg"]
    
    return output_df

In [None]:
a = kalman_filter(test_base)

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [None]:
submission_5.iloc[77716, 2] = submission_5.iloc[77715, 2] 
submission_5.iloc[77717, 2] = submission_5.iloc[77715, 2]

submission_5.iloc[77716, 3] = submission_5.iloc[77715, 3]
submission_5.iloc[77717, 3] = submission_5.iloc[77715, 3]

In [None]:
test_base.iloc[77710:77720]

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,dist_pre,dist_pro,latDeg_pre,lngDeg_pre,latDeg_pro,lngDeg_pro,index
77710,2021-04-26-US-SVL-2,SamsungS20Ultra,1303516812000,37.371014,-122.044308,3.57,2021-04-26-US-SVL-2_SamsungS20Ultra,0.0,0.0,37.371014,-122.044308,37.371014,-122.044308,77710
77711,2021-04-26-US-SVL-2,SamsungS20Ultra,1303516813000,37.371014,-122.044308,3.57,2021-04-26-US-SVL-2_SamsungS20Ultra,0.0,0.0,37.371014,-122.044308,37.371014,-122.044308,77711
77712,2021-04-26-US-SVL-2,SamsungS20Ultra,1303516814000,37.371014,-122.044308,3.57,2021-04-26-US-SVL-2_SamsungS20Ultra,0.0,0.0,37.371014,-122.044308,37.371014,-122.044308,77712
77713,2021-04-26-US-SVL-2,SamsungS20Ultra,1303516815000,37.371014,-122.044308,3.57,2021-04-26-US-SVL-2_SamsungS20Ultra,0.0,0.0,37.371014,-122.044308,37.371014,-122.044308,77713
77714,2021-04-26-US-SVL-2,SamsungS20Ultra,1303516816000,37.371014,-122.044308,3.57,2021-04-26-US-SVL-2_SamsungS20Ultra,0.0,0.0,37.371014,-122.044308,37.371014,-122.044308,77714
77715,2021-04-26-US-SVL-2,SamsungS20Ultra,1303516817000,37.371014,-122.044308,3.57,2021-04-26-US-SVL-2_SamsungS20Ultra,0.0,0.0,37.371014,-122.044308,37.371014,-122.044308,77715
77716,2021-04-26-US-SVL-2,SamsungS20Ultra,1303516818000,37.371014,-122.044308,3.57,2021-04-26-US-SVL-2_SamsungS20Ultra,0.0,306.230082,37.371014,-122.044308,37.371728,-122.047656,77716
77717,2021-04-26-US-SVL-2,SamsungS20Ultra,1303516819000,37.371728,-122.047656,4.96,2021-04-26-US-SVL-2_SamsungS20Ultra,306.230082,11.177992,37.371014,-122.044308,37.371745,-122.047781,77717
77718,2021-04-26-US-SVL-2,SamsungS20Ultra,1303516820000,37.371745,-122.047781,5.51,2021-04-26-US-SVL-2_SamsungS20Ultra,11.177992,8.375912,37.371728,-122.047656,37.371762,-122.047874,77718
77719,2021-04-26-US-SVL-2,SamsungS20Ultra,1303516821000,37.371762,-122.047874,6.35,2021-04-26-US-SVL-2_SamsungS20Ultra,8.375912,9.948056,37.371745,-122.047781,37.371777,-122.047985,77719


In [None]:
submission_5.iloc[77710:77720]

Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg,phoneName,collectionName
77710,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516812000,37.370989,-122.04419,SamsungS20Ultra,2021-04-26-US-SVL-2
77711,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516813000,37.370987,-122.044182,SamsungS20Ultra,2021-04-26-US-SVL-2
77712,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516814000,37.370993,-122.044207,SamsungS20Ultra,2021-04-26-US-SVL-2
77713,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516815000,37.371019,-122.044328,SamsungS20Ultra,2021-04-26-US-SVL-2
77714,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516816000,37.371071,-122.044574,SamsungS20Ultra,2021-04-26-US-SVL-2
77715,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516817000,37.371159,-122.04499,SamsungS20Ultra,2021-04-26-US-SVL-2
77716,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516818000,37.371159,-122.04499,SamsungS20Ultra,2021-04-26-US-SVL-2
77717,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516819000,37.371159,-122.04499,SamsungS20Ultra,2021-04-26-US-SVL-2
77718,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516820000,37.371602,-122.047109,SamsungS20Ultra,2021-04-26-US-SVL-2
77719,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516821000,37.371698,-122.047584,SamsungS20Ultra,2021-04-26-US-SVL-2


# SUB

In [44]:
submission_1 = outlier(test_base)

submission_2 = remove_device(submission_1)

submission_3 = position_shift(submission_2)

submission_4 = mean_prediction(submission_3)

submission_5 = kalman_filter(submission_4)

[32m[I 2021-07-01 14:29:55,872][0m A new study created in memory with name: no-name-834359e9-fb13-4892-9707-b5f5834cbd10[0m
[32m[I 2021-07-01 14:30:02,312][0m Trial 0 finished with value: 5.196431741807936 and parameters: {'a': 0.43121018858565696}. Best is trial 0 with value: 5.196431741807936.[0m
[32m[I 2021-07-01 14:30:08,739][0m Trial 1 finished with value: 5.201704692701228 and parameters: {'a': 0.37958016972193653}. Best is trial 0 with value: 5.196431741807936.[0m
[32m[I 2021-07-01 14:30:14,947][0m Trial 2 finished with value: 5.191197107674712 and parameters: {'a': 0.724299272581165}. Best is trial 2 with value: 5.191197107674712.[0m
[32m[I 2021-07-01 14:30:21,319][0m Trial 3 finished with value: 5.4495098522188075 and parameters: {'a': -0.43334331875542564}. Best is trial 2 with value: 5.191197107674712.[0m
[32m[I 2021-07-01 14:30:27,661][0m Trial 4 finished with value: 5.3693382700332 and parameters: {'a': -0.23876427655946908}. Best is trial 2 with value: 5.

KeyError: ignored

In [47]:
submission_5 = kalman_filter(submission_4)

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [53]:
display(submission_1[sub.columns].head())
display(submission_2[sub.columns].head())
display(submission_3[sub.columns].head())
display(submission_4[sub.columns].head())
display(submission_5[sub.columns].head())

Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg
0,2020-05-15-US-MTV-1_Pixel4,1273608785432,37.416628,-122.082053
1,2020-05-15-US-MTV-1_Pixel4,1273608786432,37.416646,-122.08204
2,2020-05-15-US-MTV-1_Pixel4,1273608787432,37.416653,-122.082039
3,2020-05-15-US-MTV-1_Pixel4,1273608788432,37.416607,-122.082063
4,2020-05-15-US-MTV-1_Pixel4,1273608789432,37.416609,-122.082073


Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg
0,2020-05-15-US-MTV-1_Pixel4,1273608785432,37.416628,-122.082053
1,2020-05-15-US-MTV-1_Pixel4,1273608786432,37.416646,-122.08204
2,2020-05-15-US-MTV-1_Pixel4,1273608787432,37.416653,-122.082039
3,2020-05-15-US-MTV-1_Pixel4,1273608788432,37.416607,-122.082063
4,2020-05-15-US-MTV-1_Pixel4,1273608789432,37.416609,-122.082073


Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg
0,2020-05-15-US-MTV-1_Pixel4,1273608785432,37.416628,-122.082053
1,2020-05-15-US-MTV-1_Pixel4,1273608786432,37.416641,-122.082044
2,2020-05-15-US-MTV-1_Pixel4,1273608787432,37.416647,-122.08204
3,2020-05-15-US-MTV-1_Pixel4,1273608788432,37.416613,-122.08206
4,2020-05-15-US-MTV-1_Pixel4,1273608789432,37.416608,-122.082066


Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg
0,2020-05-15-US-MTV-1_Pixel4,1273608785432,37.416549,-122.081906
1,2020-05-15-US-MTV-1_Pixel4,1273608786432,37.416576,-122.08198
2,2020-05-15-US-MTV-1_Pixel4,1273608787432,37.416593,-122.082028
3,2020-05-15-US-MTV-1_Pixel4,1273608788432,37.416601,-122.082056
4,2020-05-15-US-MTV-1_Pixel4,1273608789432,37.416606,-122.08207


Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg
0,2020-05-15-US-MTV-1_Pixel4,1273608785432,37.416549,-122.081906
1,2020-05-15-US-MTV-1_Pixel4,1273608786432,37.416576,-122.08198
2,2020-05-15-US-MTV-1_Pixel4,1273608787432,37.416593,-122.082028
3,2020-05-15-US-MTV-1_Pixel4,1273608788432,37.416601,-122.082056
4,2020-05-15-US-MTV-1_Pixel4,1273608789432,37.416606,-122.08207


In [54]:
submission_5[sub.columns].to_csv('submission_18.csv', index=False)

In [None]:
display(submission_1.iloc[77717:77719])
display(submission_2.iloc[77717:77719])
display(submission_3.iloc[77717:77719])
display(submission_4.iloc[77717:77719])
display(submission_5.iloc[77717:77719])

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,dist_pre,dist_pro,latDeg_pre,lngDeg_pre,latDeg_pro,lngDeg_pro,index
77717,2021-04-26-US-SVL-2,SamsungS20Ultra,1303516819000,37.371728,-122.047656,4.96,2021-04-26-US-SVL-2_SamsungS20Ultra,306.230082,11.177992,37.371014,-122.044308,37.371745,-122.047781,77717
77718,2021-04-26-US-SVL-2,SamsungS20Ultra,1303516820000,37.371745,-122.047781,5.51,2021-04-26-US-SVL-2_SamsungS20Ultra,11.177992,8.375912,37.371728,-122.047656,37.371762,-122.047874,77718


Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg,phoneName,collectionName,latDeg_pre,latDeg_pro,lngDeg_pre,lngDeg_pro,phone_pre,phone_pro,dist_pre,dist_pro,latDeg_next
77717,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516819000,,,SamsungS20Ultra,2021-04-26-US-SVL-2,37.371291,37.371602,-122.045617,-122.047109,2021-04-26-US-SVL-2_SamsungS20Ultra,2021-04-26-US-SVL-2_SamsungS20Ultra,75.886834,60.345386,
77718,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516820000,37.371602,-122.047109,SamsungS20Ultra,2021-04-26-US-SVL-2,37.371465,37.3717,-122.046448,-122.047592,2021-04-26-US-SVL-2_SamsungS20Ultra,2021-04-26-US-SVL-2_SamsungS20Ultra,60.345386,43.954314,


Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg,phoneName,collectionName,index
77717,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516819000,,,SamsungS20Ultra,2021-04-26-US-SVL-2,77717
77718,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516820000,37.371602,-122.047109,SamsungS20Ultra,2021-04-26-US-SVL-2,77718


Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg,phoneName,collectionName
77717,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516819000,,,SamsungS20Ultra,2021-04-26-US-SVL-2
77718,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516820000,37.371602,-122.047109,SamsungS20Ultra,2021-04-26-US-SVL-2


Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg,phoneName,collectionName
77717,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516819000,,,SamsungS20Ultra,2021-04-26-US-SVL-2
77718,2021-04-26-US-SVL-2_SamsungS20Ultra,1303516820000,37.371602,-122.047109,SamsungS20Ultra,2021-04-26-US-SVL-2


In [None]:
submission_5.isna().count()

phone                  91486
millisSinceGpsEpoch    91486
latDeg                 91486
lngDeg                 91486
phoneName              91486
collectionName         91486
dtype: int64

In [None]:
submission_5

Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg,phoneName,collectionName
0,2020-05-15-US-MTV-1_Pixel4,1273608785432,37.416575,-122.081987,Pixel4,2020-05-15-US-MTV-1
1,2020-05-15-US-MTV-1_Pixel4,1273608786432,37.416587,-122.082015,Pixel4,2020-05-15-US-MTV-1
2,2020-05-15-US-MTV-1_Pixel4,1273608787432,37.416595,-122.082039,Pixel4,2020-05-15-US-MTV-1
3,2020-05-15-US-MTV-1_Pixel4,1273608788432,37.416600,-122.082054,Pixel4,2020-05-15-US-MTV-1
4,2020-05-15-US-MTV-1_Pixel4,1273608789432,37.416602,-122.082061,Pixel4,2020-05-15-US-MTV-1
...,...,...,...,...,...,...
91481,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763185000,37.334543,-121.899398,SamsungS20Ultra,2021-04-29-US-SJC-3
91482,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763186000,37.334543,-121.899389,SamsungS20Ultra,2021-04-29-US-SJC-3
91483,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763187000,37.334544,-121.899380,SamsungS20Ultra,2021-04-29-US-SJC-3
91484,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763188000,37.334547,-121.899371,SamsungS20Ultra,2021-04-29-US-SJC-3


In [None]:
submission_1.head()

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,latDeg_pre,latDeg_pro,lngDeg_pre,lngDeg_pro,phone_pre,phone_pro,dist_pre,dist_pro,latDeg_next
0,2020-05-15-US-MTV-1,Pixel4,1273608785432,37.416462,-122.081497,-30.69,2020-05-15-US-MTV-1_Pixel4,0.0,37.416552,0.0,-122.081793,,2020-05-15-US-MTV-1_Pixel4,0.0,16.280497,
1,2020-05-15-US-MTV-1,Pixel4,1273608786432,37.416525,-122.081708,-31.76,2020-05-15-US-MTV-1_Pixel4,37.416502,37.416589,-122.08162,-122.081922,2020-05-15-US-MTV-1_Pixel4,2020-05-15-US-MTV-1_Pixel4,16.280497,12.084129,
2,2020-05-15-US-MTV-1,Pixel4,1273608787432,37.416573,-122.081868,-31.65,2020-05-15-US-MTV-1_Pixel4,37.416552,37.416614,-122.081793,-122.082009,2020-05-15-US-MTV-1_Pixel4,2020-05-15-US-MTV-1_Pixel4,12.084129,8.205756,
3,2020-05-15-US-MTV-1,Pixel4,1273608788432,37.416607,-122.08198,-31.52,2020-05-15-US-MTV-1_Pixel4,37.416589,37.41663,-122.081922,-122.082061,2020-05-15-US-MTV-1_Pixel4,2020-05-15-US-MTV-1_Pixel4,8.205756,4.936569,
4,2020-05-15-US-MTV-1,Pixel4,1273608789432,37.416629,-122.082052,-28.95,2020-05-15-US-MTV-1_Pixel4,37.416614,37.416639,-122.082009,-122.082086,2020-05-15-US-MTV-1_Pixel4,2020-05-15-US-MTV-1_Pixel4,4.936569,2.424999,


In [None]:
test_base.head()

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,latDeg_pre,latDeg_pro,lngDeg_pre,lngDeg_pro,phone_pre,phone_pro,dist_pre,dist_pro,latDeg_next
0,2020-05-15-US-MTV-1,Pixel4,1273608785432,37.416462,-122.081497,-30.69,2020-05-15-US-MTV-1_Pixel4,0.0,37.416552,0.0,-122.081793,,2020-05-15-US-MTV-1_Pixel4,0.0,16.280497,
1,2020-05-15-US-MTV-1,Pixel4,1273608786432,37.416525,-122.081708,-31.76,2020-05-15-US-MTV-1_Pixel4,37.416502,37.416589,-122.08162,-122.081922,2020-05-15-US-MTV-1_Pixel4,2020-05-15-US-MTV-1_Pixel4,16.280497,12.084129,
2,2020-05-15-US-MTV-1,Pixel4,1273608787432,37.416573,-122.081868,-31.65,2020-05-15-US-MTV-1_Pixel4,37.416552,37.416614,-122.081793,-122.082009,2020-05-15-US-MTV-1_Pixel4,2020-05-15-US-MTV-1_Pixel4,12.084129,8.205756,
3,2020-05-15-US-MTV-1,Pixel4,1273608788432,37.416607,-122.08198,-31.52,2020-05-15-US-MTV-1_Pixel4,37.416589,37.41663,-122.081922,-122.082061,2020-05-15-US-MTV-1_Pixel4,2020-05-15-US-MTV-1_Pixel4,8.205756,4.936569,
4,2020-05-15-US-MTV-1,Pixel4,1273608789432,37.416629,-122.082052,-28.95,2020-05-15-US-MTV-1_Pixel4,37.416614,37.416639,-122.082009,-122.082086,2020-05-15-US-MTV-1_Pixel4,2020-05-15-US-MTV-1_Pixel4,4.936569,2.424999,
