<a href="https://colab.research.google.com/github/kyochanpy/Google_Smartphone_Decimeter_Challenge/blob/main/note_book/submission_58.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ライブラリ (Libraries)

In [27]:
!pip install optuna > /dev/null
!pip install pyproj > /dev/null
!pip install simdkalman > /dev/null
!pip install osmnx momepy geopandas > /dev/null
    
import os
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from scipy.ndimage import gaussian_filter1d
from scipy.interpolate import interp1d

from shapely.geometry import Point
import osmnx as ox
import momepy
import geopandas as gpd
import optuna
import plotly
import plotly.express as px
import pyproj
from pathlib import Path
from pyproj import Proj, transform
from tqdm.notebook import tqdm
import simdkalman

# Data

In [42]:
# Root folder of the competition data on the mounted Google Drive.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# because other cells in this notebook refer to it.
dir = Path("/content/drive/MyDrive/GSDC")

# Baseline positions for the training collections.
train_base = pd.read_csv(dir.joinpath("baseline_locations_train.csv"))

# The stock test baseline is disabled; a pre-processed file is loaded instead.
#test_base = pd.read_csv(dir / "baseline_locations_test.csv")
test_base = pd.read_csv("/content/drive/MyDrive/GSDC/test_predict_next_point_SJC_2.csv")

# Sample submission; its column list is reused by the position-shift cells.
sub = pd.read_csv(dir.joinpath("sample_submission.csv"))


def get_groundtruth(path: Path) -> pd.DataFrame:
    """Load every training ``ground_truth.csv`` below ``path`` into one frame.

    Files are searched with the pattern ``<path>/train/*/*/ground_truth.csv``.
    The columns ``latDeg``, ``lngDeg`` and ``heightAboveWgs84EllipsoidM`` are
    renamed with a ``t_`` prefix (appended as the last columns) so they do not
    clash with the baseline columns of the same name when merged.

    Args:
        path: Root data directory (the folder that contains ``train``).

    Returns:
        The concatenated ground truth with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If no ground-truth file matches the pattern.
    """
    # Use the argument rather than the notebook-global data directory; the
    # original ignored `path` and shadowed it with the loop variable.
    pattern = str(Path(path) / 'train/*/*/ground_truth.csv')

    # Read everything first and concatenate once instead of growing a frame
    # inside the loop (which copies all previous rows on every iteration).
    frames = [pd.read_csv(csv_path) for csv_path in sorted(glob(pattern))]
    if not frames:
        raise FileNotFoundError(f"no ground truth file matches {pattern}")
    output_df = pd.concat(frames).reset_index(drop=True)

    _columns = ['latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM']
    for col in _columns:
        output_df['t_' + col] = output_df[col]
    output_df = output_df.drop(columns=_columns)
    return output_df

# Attach the ground-truth columns (t_*) to the training baseline; rows are
# matched on collection, phone and timestamp.
train_base = pd.merge(
    train_base,
    get_groundtruth(dir),
    on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'],
)

In [3]:
# Test collections that get the SJC-specific treatment.
test_SJC_list = ["2021-04-22-US-SJC-2", "2021-04-29-US-SJC-3"]

# Collect the rows of each listed collection, in list order, then stack them.
test_SJCs = []
for SJC in test_SJC_list:
    SJC_df = test_base.loc[test_base["collectionName"].eq(SJC)]
    test_SJCs.append(SJC_df)
test_SJC = pd.concat(test_SJCs)

# 共通関数 (Common functions)

In [4]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Haversine great-circle distance between two points given in degrees.

    Works element-wise on scalars, NumPy arrays or pandas Series. The result
    is expressed in the unit of ``RADIUS`` (6,367,000, i.e. an Earth radius in
    metres).
    """
    RADIUS = 6_367_000
    phi1, lam1, phi2, lam2 = (
        np.radians(angle) for angle in (lat1, lon1, lat2, lon2)
    )
    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1
    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    hav = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2
    return 2 * RADIUS * np.arcsin(hav**0.5)

# Function definitions

## make triangle

In [5]:
# Add the features needed by make_triangle
def add_triangle_features(input_df):
    """Add look-ahead, interpolated-point and turning-angle columns for one track.

    The frame is expected to hold the rows of a single track in time order
    (that is how ``make_triangle`` calls it). The input is not modified.

    Added columns:
        latDeg_pro_1/2, lngDeg_pro_1/2, millisSinceGpsEpoch_pro_1/2:
            the value one / two rows ahead.
        latDeg_mean_point, lngDeg_mean_point:
            for row j, the point on the straight line between rows j-1 and
            j+1, linearly interpolated at the timestamp of row j. NaN on the
            first and last row.
        degree:
            for row j, the angle in degrees at row j between the vectors to
            row j-1 and to row j+1, computed directly on (latDeg, lngDeg).
            180 means the three points are in a straight line. The first and
            last row are set to 180; so is any row where the angle is
            undefined (duplicate or missing neighbours), where the previous
            implementation produced NaN plus RuntimeWarnings.

    An empty frame is returned with the extra (empty) columns instead of
    raising IndexError.
    """
    output_df = input_df.copy()
    for col in ("latDeg", "lngDeg", "millisSinceGpsEpoch"):
        output_df[f"{col}_pro_1"] = output_df[col].shift(-1)
        output_df[f"{col}_pro_2"] = output_df[col].shift(-2)

    # Fraction of the time between row i and row i+2 that has elapsed at i+1.
    elapsed_fraction = (
        (output_df["millisSinceGpsEpoch_pro_1"] - output_df["millisSinceGpsEpoch"])
        / (output_df["millisSinceGpsEpoch_pro_2"] - output_df["millisSinceGpsEpoch"])
    )
    for col in ("latDeg", "lngDeg"):
        # Computed on row i for the middle point i+1, then shifted down by
        # one so the value sits on the row it belongs to.
        output_df[f"{col}_mean_point"] = (
            output_df[col] + ((output_df[f"{col}_pro_2"] - output_df[col]) * elapsed_fraction)
        ).shift(1)

    n_rows = len(output_df)
    lat = output_df["latDeg"].to_numpy(dtype=float)
    lng = output_df["lngDeg"].to_numpy(dtype=float)
    lat_1 = output_df["latDeg_pro_1"].to_numpy(dtype=float)
    lng_1 = output_df["lngDeg_pro_1"].to_numpy(dtype=float)
    lat_2 = output_df["latDeg_pro_2"].to_numpy(dtype=float)
    lng_2 = output_df["lngDeg_pro_2"].to_numpy(dtype=float)

    # Vectors from the middle point (i+1) back to i and forward to i+2.
    back_lat, back_lng = lat - lat_1, lng - lng_1
    fwd_lat, fwd_lng = lat_2 - lat_1, lng_2 - lng_1
    inner = back_lat * fwd_lat + back_lng * fwd_lng
    norm_product = np.hypot(back_lat, back_lng) * np.hypot(fwd_lat, fwd_lng)

    # Default cosine of -1 (= 180 degrees) wherever the angle is undefined.
    cosine = np.full(n_rows, -1.0)
    with np.errstate(invalid="ignore"):
        np.divide(inner, norm_product, out=cosine, where=norm_product > 0)
    # Clip guards against |cos| creeping past 1 through rounding.
    angle = np.rad2deg(np.arccos(np.clip(cosine, -1.0, 1.0)))

    # angle[i] belongs to point i+1; the two end points have no angle.
    degree = np.full(n_rows, 180.0)
    if n_rows > 2:
        degree[1:-1] = angle[:-2]
    output_df["degree"] = degree
    return output_df

In [6]:
# Main processing
def make_triangle(input_df, degree_threshold=155):
    """Pull sharply turning points towards the line through their neighbours.

    For every (collectionName, phoneName) track the features from
    ``add_triangle_features`` are computed. A point whose ``degree`` is below
    ``degree_threshold`` is replaced by the midpoint between itself and its
    ``*_mean_point``; all other points are left unchanged.

    Results are written back to the exact rows they were computed from. The
    previous implementation appended results in track order and assigned the
    list positionally, which scrambled rows whenever the tracks were not
    stored contiguously in the input. (It also carried an unreachable
    ``else`` branch behind a test that was always true.)

    Args:
        input_df: Frame with collectionName, phoneName, millisSinceGpsEpoch,
            latDeg and lngDeg. Not modified.
        degree_threshold: Angle (degrees) below which a point is corrected.

    Returns:
        A copy of ``input_df`` with corrected latDeg / lngDeg.
    """
    output_df = input_df.copy()
    lat_values = output_df["latDeg"].to_numpy(dtype=float, copy=True)
    lng_values = output_df["lngDeg"].to_numpy(dtype=float, copy=True)

    # Positional row numbers of every track, in original row order.
    track_positions = output_df.groupby(
        ["collectionName", "phoneName"], sort=False
    ).indices

    for positions in track_positions.values():
        triangle_df = add_triangle_features(output_df.iloc[positions])
        # NaN angles compare False, i.e. such points are left untouched.
        is_sharp = (triangle_df["degree"] < degree_threshold).to_numpy()

        lat = triangle_df["latDeg"].to_numpy(dtype=float)
        lng = triangle_df["lngDeg"].to_numpy(dtype=float)
        lat_mp = triangle_df["latDeg_mean_point"].to_numpy(dtype=float)
        lng_mp = triangle_df["lngDeg_mean_point"].to_numpy(dtype=float)

        lat_values[positions] = np.where(is_sharp, (lat + lat_mp) / 2, lat)
        lng_values[positions] = np.where(is_sharp, (lng + lng_mp) / 2, lng)

    output_df["latDeg"] = lat_values
    output_df["lngDeg"] = lng_values
    return output_df

## outlier

In [7]:
def outlier_train(input_df, n_sigma=2):
    """Replace isolated position jumps by the midpoint of their neighbours.

    A row is an outlier when both the distance to the previous row and the
    distance to the next row exceed ``mean + n_sigma * std`` of the respective
    distance column. Distances across a change of phone are set to 0 so the
    first / last row of a track is never flagged.

    Track boundaries are derived from the frame being processed. The previous
    implementation indexed it with a mask built from the global ``train_base``,
    which only works when ``input_df`` is that very frame, and it addressed
    neighbours by label ``i-1`` / ``i+1``, which needs a 0..n-1 index.
    Neighbours are now addressed by position.

    Args:
        input_df: Frame with collectionName, phoneName, latDeg, lngDeg, with
            the rows of each phone stored consecutively in time order.
            Not modified.
        n_sigma: Number of standard deviations above the mean used as limit.

    Returns:
        A copy with corrected latDeg / lngDeg plus the helper columns phone,
        dist_pre, dist_pro, latDeg_pre, lngDeg_pre, latDeg_pro, lngDeg_pro
        (the helper columns reflect the positions before correction).
    """
    output_df = input_df.copy()
    output_df["phone"] = output_df["collectionName"] + "_" + output_df["phoneName"]

    # Placeholders keep the historical column order (dist_* before *_pre/_pro).
    output_df["dist_pre"] = 0
    output_df["dist_pro"] = 0

    output_df['latDeg_pre'] = output_df['latDeg'].shift(periods=1, fill_value=0)
    output_df['lngDeg_pre'] = output_df['lngDeg'].shift(periods=1, fill_value=0)
    output_df['latDeg_pro'] = output_df['latDeg'].shift(periods=-1, fill_value=0)
    output_df['lngDeg_pro'] = output_df['lngDeg'].shift(periods=-1, fill_value=0)
    output_df['dist_pre'] = calc_haversine(
        output_df.latDeg_pre, output_df.lngDeg_pre, output_df.latDeg, output_df.lngDeg)
    output_df['dist_pro'] = calc_haversine(
        output_df.latDeg, output_df.lngDeg, output_df.latDeg_pro, output_df.lngDeg_pro)

    # A distance measured across two different phones is meaningless.
    phone = output_df['phone']
    starts_track = (phone != phone.shift(1)).to_numpy()
    ends_track = (phone != phone.shift(-1)).to_numpy()
    output_df.loc[starts_track, 'dist_pre'] = 0
    output_df.loc[ends_track, 'dist_pro'] = 0

    pro_limit = output_df['dist_pro'].mean() + (output_df['dist_pro'].std() * n_sigma)
    pre_limit = output_df['dist_pre'].mean() + (output_df['dist_pre'].std() * n_sigma)
    flagged = (
        (output_df['dist_pro'] > pro_limit) & (output_df['dist_pre'] > pre_limit)
    ).to_numpy()

    lat = output_df['latDeg'].to_numpy(dtype=float, copy=True)
    lng = output_df['lngDeg'].to_numpy(dtype=float, copy=True)
    last = len(output_df) - 1
    # Sequential on purpose: a corrected point feeds into the correction of a
    # directly following outlier, exactly as before.
    for pos in np.flatnonzero(flagged):
        if 0 < pos < last:
            lat[pos] = (lat[pos - 1] + lat[pos + 1]) / 2
            lng[pos] = (lng[pos - 1] + lng[pos + 1]) / 2
    output_df['latDeg'] = lat
    output_df['lngDeg'] = lng

    return output_df

In [8]:
def outlier(input_df, n_sigma=2):
    """Replace isolated position jumps by the midpoint of their neighbours.

    Same rule as ``outlier_train``: a row is an outlier when both the distance
    to the previous and to the next row exceed ``mean + n_sigma * std`` of the
    respective distance column; distances across a change of phone are 0.

    Changes compared with the previous implementation:
      * track boundaries come from ``input_df`` itself, not from a mask built
        on the global ``test_base``;
      * the input frame is copied instead of being modified in place (the old
        ``output_df = input_df`` alias also made the trailing re-assignments
        of phoneName / collectionName / millisSinceGpsEpoch no-ops);
      * the ``phone`` column is created when missing;
      * neighbours are addressed by position rather than by label ``i±1``.

    Args:
        input_df: Frame with collectionName, phoneName, latDeg, lngDeg, with
            the rows of each phone stored consecutively in time order.
        n_sigma: Number of standard deviations above the mean used as limit.

    Returns:
        A copy with corrected latDeg / lngDeg plus the helper columns
        dist_pre, dist_pro, latDeg_pre, lngDeg_pre, latDeg_pro, lngDeg_pro.
    """
    output_df = input_df.copy()
    if "phone" not in output_df.columns:
        output_df["phone"] = output_df["collectionName"] + "_" + output_df["phoneName"]

    # Placeholders keep the historical column order (dist_* before *_pre/_pro).
    output_df["dist_pre"] = 0
    output_df["dist_pro"] = 0

    output_df['latDeg_pre'] = output_df['latDeg'].shift(periods=1, fill_value=0)
    output_df['lngDeg_pre'] = output_df['lngDeg'].shift(periods=1, fill_value=0)
    output_df['latDeg_pro'] = output_df['latDeg'].shift(periods=-1, fill_value=0)
    output_df['lngDeg_pro'] = output_df['lngDeg'].shift(periods=-1, fill_value=0)
    output_df['dist_pre'] = calc_haversine(
        output_df.latDeg_pre, output_df.lngDeg_pre, output_df.latDeg, output_df.lngDeg)
    output_df['dist_pro'] = calc_haversine(
        output_df.latDeg, output_df.lngDeg, output_df.latDeg_pro, output_df.lngDeg_pro)

    # A distance measured across two different phones is meaningless.
    phone = output_df['phone']
    starts_track = (phone != phone.shift(1)).to_numpy()
    ends_track = (phone != phone.shift(-1)).to_numpy()
    output_df.loc[starts_track, 'dist_pre'] = 0
    output_df.loc[ends_track, 'dist_pro'] = 0

    pro_limit = output_df['dist_pro'].mean() + (output_df['dist_pro'].std() * n_sigma)
    pre_limit = output_df['dist_pre'].mean() + (output_df['dist_pre'].std() * n_sigma)
    flagged = (
        (output_df['dist_pro'] > pro_limit) & (output_df['dist_pre'] > pre_limit)
    ).to_numpy()

    lat = output_df['latDeg'].to_numpy(dtype=float, copy=True)
    lng = output_df['lngDeg'].to_numpy(dtype=float, copy=True)
    last = len(output_df) - 1
    # Sequential on purpose: a corrected point feeds into the correction of a
    # directly following outlier, exactly as before.
    for pos in np.flatnonzero(flagged):
        if 0 < pos < last:
            lat[pos] = (lat[pos - 1] + lat[pos + 1]) / 2
            lng[pos] = (lng[pos - 1] + lng[pos + 1]) / 2
    output_df['latDeg'] = lat
    output_df['lngDeg'] = lng

    return output_df

## phones mean

In [9]:
def add_distance_diff(df):
    """Add previous/next position, phone and distance columns to ``df`` in place.

    For latDeg, lngDeg and phone a ``*_pre`` (previous row) and ``*_pro``
    (next row) column is added, followed by ``dist_pre`` / ``dist_pro``, the
    ``calc_haversine`` distance to that neighbour. Where the neighbouring row
    belongs to a different phone, the neighbour's coordinates and the distance
    are set to NaN. The same (mutated) frame is returned.
    """
    for column in ("latDeg", "lngDeg", "phone"):
        df[f"{column}_pre"] = df[column].shift(1)
        df[f"{column}_pro"] = df[column].shift(-1)

    for side in ("pre", "pro"):
        df[f"dist_{side}"] = calc_haversine(
            df["latDeg"], df["lngDeg"], df[f"latDeg_{side}"], df[f"lngDeg_{side}"]
        )

    for side in ("pre", "pro"):
        # Neighbour is on another track (or missing): blank out its values.
        crosses_track = df["phone"].ne(df[f"phone_{side}"])
        df.loc[crosses_track, [f"latDeg_{side}", f"lngDeg_{side}", f"dist_{side}"]] = np.nan

    return df


def make_lerp_data(input_df):
    """Create interpolated rows for timestamps a phone has no position for.

    Every timestamp seen in a collection is paired with every phone of that
    collection. Pairs without a latDeg that have a row of the same phone
    directly before and after them (after sorting by phone and time) receive
    a position linearly interpolated in time between those two rows. Only the
    rows whose interpolation succeeded are returned, restricted to the columns
    of ``input_df``; ``input_df`` itself is not modified.
    """
    original_columns = input_df.columns
    join_keys = ["collectionName", "millisSinceGpsEpoch", "phoneName"]

    # Full (timestamp x phone) grid per collection.
    timestamps = input_df[["collectionName", "millisSinceGpsEpoch"]].drop_duplicates()
    phones = input_df[["collectionName", "phoneName"]].drop_duplicates()
    grid = timestamps.merge(phones, on="collectionName", how="outer")

    filled = grid.merge(input_df, on=join_keys, how="left")
    filled["phone"] = filled["collectionName"] + "_" + filled["phoneName"]
    filled = filled.sort_values(["phone", "millisSinceGpsEpoch"])

    # Previous / next row values within the sorted frame.
    for column in ("latDeg", "lngDeg", "phone", "millisSinceGpsEpoch"):
        filled[column + "_pre"] = filled[column].shift(1)
        filled[column + "_pro"] = filled[column].shift(-1)

    is_gap = (
        filled["latDeg"].isnull()
        & (filled["phone"] == filled["phone_pre"])
        & (filled["phone"] == filled["phone_pro"])
    )
    gaps = filled[is_gap].copy()

    # Interpolate between pre and pro, weighted by the elapsed time.
    elapsed_share = (
        (gaps["millisSinceGpsEpoch"] - gaps["millisSinceGpsEpoch_pre"])
        / (gaps["millisSinceGpsEpoch_pro"] - gaps["millisSinceGpsEpoch_pre"])
    )
    for column in ("latDeg", "lngDeg"):
        gaps[column] = gaps[column + "_pre"] + (
            (gaps[column + "_pro"] - gaps[column + "_pre"]) * elapsed_share
        )

    gaps = gaps[gaps["latDeg"].notnull()]

    return gaps[original_columns]

def calc_mean_pred(input_df, lerp_df):
    """Replace each position by the mean over all phones at that timestamp.

    ``input_df`` and the interpolated rows in ``lerp_df`` are pooled and
    latDeg / lngDeg are averaged per (collectionName, millisSinceGpsEpoch).
    The result has one row per row of ``input_df`` with the columns
    collectionName, phoneName, millisSinceGpsEpoch, latDeg, lngDeg, phone.

    Side effect: a ``phone`` column is written to ``input_df``.
    """
    time_keys = ["collectionName", "millisSinceGpsEpoch"]

    input_df["phone"] = input_df["collectionName"] + "_" + input_df["phoneName"]

    pooled = pd.concat([input_df, lerp_df])
    averaged = pooled.groupby(time_keys)[["latDeg", "lngDeg"]].mean().reset_index()

    row_keys = input_df[["collectionName", "phoneName", "millisSinceGpsEpoch"]].copy()
    output_df = pd.merge(
        row_keys,
        averaged[["collectionName", "millisSinceGpsEpoch", "latDeg", "lngDeg"]],
        on=time_keys,
        how="left",
    )
    output_df["phone"] = output_df["collectionName"] + "_" + output_df["phoneName"]
    return output_df

In [34]:
def mean_prediction_train(input_df, th=43):
    """Phone-mean + Kalman pipeline for the training data.

    Steps: add neighbour distances, blank out positions whose distance to
    both neighbours exceeds ``th``, interpolate missing timestamps, average
    the positions of all phones per timestamp, Kalman-smooth the result and
    finally attach the ground-truth columns.

    Note: ``input_df`` is modified in place (helper columns are added and the
    blanked-out positions become NaN), as before.

    Args:
        input_df: Training frame including t_latDeg / t_lngDeg.
        th: Distance limit, in the unit returned by ``calc_haversine``.

    Returns:
        The smoothed frame with t_latDeg / t_lngDeg.
    """
    input_df["phone"] = input_df["collectionName"] + "_" + input_df["phoneName"]

    input_df_ = add_distance_diff(input_df)
    is_jump = (input_df_['dist_pre'] > th) & (input_df_['dist_pro'] > th)
    input_df_.loc[is_jump, ['latDeg', 'lngDeg']] = np.nan

    lerp = make_lerp_data(input_df_)
    mean_pred = calc_mean_pred(input_df_, lerp)

    output_df = kalman_filter(mean_pred)
    # calc_mean_pred returns one row per input row, in input order, on a
    # fresh 0..n-1 index. Copy the ground truth by position so the values
    # stay on the right rows even if input_df carries a different index.
    output_df["t_latDeg"] = input_df["t_latDeg"].to_numpy()
    output_df["t_lngDeg"] = input_df["t_lngDeg"].to_numpy()

    return output_df

In [11]:
def mean_prediction(input_df, th=43):
    """Phone-mean + Kalman pipeline for data without ground truth.

    Steps: add neighbour distances, blank out positions whose distance to
    both neighbours exceeds ``th``, interpolate missing timestamps, average
    the positions of all phones per timestamp and Kalman-smooth the result.

    The ``phone`` key is (re)built here, as in ``mean_prediction_train``, so
    the function no longer depends on an earlier cell having added it.

    Note: ``input_df`` is modified in place (helper columns are added and the
    blanked-out positions become NaN), as before.

    Args:
        input_df: Frame with collectionName, phoneName, millisSinceGpsEpoch,
            latDeg and lngDeg.
        th: Distance limit, in the unit returned by ``calc_haversine``.

    Returns:
        The smoothed frame.
    """
    input_df["phone"] = input_df["collectionName"] + "_" + input_df["phoneName"]

    input_df_ = add_distance_diff(input_df)
    is_jump = (input_df_['dist_pre'] > th) & (input_df_['dist_pro'] > th)
    input_df_.loc[is_jump, ['latDeg', 'lngDeg']] = np.nan

    test_lerp = make_lerp_data(input_df_)
    test_mean_pred = calc_mean_pred(input_df_, test_lerp)

    output_df = kalman_filter(test_mean_pred)

    return output_df

## remove device

In [12]:
def get_removedevice(input_df: pd.DataFrame, divece: str) -> pd.DataFrame:
    """Replace the positions of one device by values interpolated from the others.

    Within every collection that contains ``divece`` and at least one other
    phone, the latDeg / lngDeg of ``divece`` are discarded and re-created by
    interpolating, along the timestamp axis, between the surrounding rows of
    the collection. Rows that cannot be interpolated (before the first or
    after the last remaining position) keep their original values.
    Collections with a single phone, or without ``divece``, are untouched.

    Changes compared with the previous implementation:
      * ``input_df`` is no longer modified (it used to gain an ``index``
        column);
      * each collection is copied before being edited and the pieces are
        concatenated once;
      * interpolation is applied to the floating-point columns only, instead
        of to the whole mixed-dtype frame.

    Args:
        input_df: Frame with collectionName, phoneName, millisSinceGpsEpoch,
            latDeg and lngDeg.
        divece: Name of the phone whose positions are replaced (parameter
            name kept as is for compatibility).

    Returns:
        A new frame with the rows, index labels and columns of ``input_df``,
        sorted by index.
    """
    work_df = input_df.copy()
    # Remember the caller's index; the timestamp becomes the working index so
    # that interpolate(method='index') is weighted by time.
    work_df['index'] = work_df.index
    work_df = work_df.sort_values('millisSinceGpsEpoch')
    work_df.index = work_df['millisSinceGpsEpoch'].values

    pieces = []
    for _, subdf in work_df.groupby('collectionName'):

        phones = subdf['phoneName'].unique()

        if (len(phones) == 1) or (divece not in phones):
            pieces.append(subdf)
            continue

        subdf = subdf.copy()
        origin_lat = subdf['latDeg'].to_numpy(copy=True)
        origin_lng = subdf['lngDeg'].to_numpy(copy=True)

        is_device = (subdf['phoneName'] == divece).to_numpy()
        subdf.loc[is_device, 'latDeg'] = np.nan
        subdf.loc[is_device, 'lngDeg'] = np.nan

        # Per column, and assigned through to_numpy(), so duplicate
        # timestamps in the index cannot trigger an alignment error.
        for col in subdf.select_dtypes(include=[np.floating]).columns:
            subdf[col] = subdf[col].interpolate(
                method='index', limit_area='inside'
            ).to_numpy()

        # Whatever could not be interpolated falls back to the original fix.
        still_missing = subdf['latDeg'].isnull().to_numpy()
        subdf.loc[still_missing, 'latDeg'] = origin_lat[still_missing]
        subdf.loc[still_missing, 'lngDeg'] = origin_lng[still_missing]

        pieces.append(subdf)

    if not pieces:
        # Nothing to process (empty input).
        return input_df.copy()

    output_df = pd.concat(pieces)
    output_df.index = output_df['index'].values
    output_df = output_df.sort_index()

    del output_df['index']

    return output_df

In [13]:
# remove_device
def remove_device(input_df):
    """Apply ``get_removedevice`` for the 'SamsungS20Ultra' phone."""
    return get_removedevice(input_df, 'SamsungS20Ultra')

## position shift

In [14]:
def position_shift_train(input_df):
    """Position shift for the training data, keeping the ground-truth columns.

    The shift itself (offset tuned with Optuna on the training baseline, then
    applied to ``input_df``) is exactly what ``position_shift`` does; this
    function used to be a line-by-line copy of it. It now delegates to
    ``position_shift`` and only adds the two ground-truth columns.

    Args:
        input_df: Frame with phone, millisSinceGpsEpoch, latDeg, lngDeg,
            phoneName, collectionName, t_latDeg and t_lngDeg.

    Returns:
        The shifted frame (submission columns, phoneName, collectionName)
        plus t_latDeg / t_lngDeg, aligned on the index of ``input_df``.
    """
    output_df = position_shift(input_df)

    output_df["t_latDeg"] = input_df["t_latDeg"]
    output_df["t_lngDeg"] = input_df["t_lngDeg"]

    return output_df

In [15]:
def position_shift(input_df, n_trials=30):
    """Shift every position along its direction of travel by a tuned offset.

    Each point is moved on the straight line (in ECEF coordinates) through
    the previous point of the same phone and itself, so that its distance to
    the previous point becomes ``dist - a``. The offset ``a`` is searched in
    [-1, 1] with Optuna by minimising, on the training baseline, the mean of
    the per-phone 50th and 95th percentile ``calc_haversine`` errors against
    the ground truth.

    Reads the notebook globals ``dir`` (data folder) and ``sub`` (sample
    submission, for the column list) and the training files on disk.

    Changes compared with the previous implementation:
      * ground truth is collected with ``pd.concat`` (``DataFrame.append``
        was removed in pandas 2.0);
      * ``trial.suggest_float`` replaces the deprecated ``suggest_uniform``;
      * the objective returns a plain float instead of a one-element Series;
      * the WGS84 -> ECEF conversion is vectorised instead of row-by-row;
      * non-finite results of the back-transformation (not only NaN) fall
        back to the unshifted position;
      * unused locals were removed, among them one that required the global
        ``test_base``.

    Args:
        input_df: Frame with phone, millisSinceGpsEpoch, latDeg, lngDeg,
            phoneName and collectionName.
        n_trials: Number of Optuna trials.

    Returns:
        A frame sorted by phone and time with the submission columns plus
        phoneName and collectionName.
    """
    sub_cols = sub.columns
    msge = "millisSinceGpsEpoch"
    # Every point is placed at this ellipsoidal height before conversion.
    FIXED_HEIGHT = 63.5

    # Training baseline and ground truth are only used to tune the offset.
    train_b = pd.read_csv(dir / "baseline_locations_train.csv")[sub_cols]

    traindir = dir / 'train'
    g_t = pd.concat(
        [
            pd.read_csv(traindir / d / p / 'ground_truth.csv')
            for d in os.listdir(traindir)
            for p in os.listdir(traindir / d)
        ]
    )
    g_t["phone"] = g_t["collectionName"] + "_" + g_t["phoneName"]

    transformer = pyproj.Transformer.from_crs(
        {"proj": 'geocent', "ellps": 'WGS84', "datum": 'WGS84'},
        {"proj": 'latlong', "ellps": 'WGS84', "datum": 'WGS84'},
    )

    def compute_dist(oof, truth):
        """Mean over phones of (median + 95th percentile) / 2 of the error."""
        df = oof.merge(truth, on=["phone", msge])
        dist_oof = calc_haversine(df.latDeg_x, df.lngDeg_x, df.latDeg_y, df.lngDeg_y)
        per_phone = dist_oof.groupby(df["phone"])
        return float(
            (per_phone.quantile(.50).mean() + per_phone.quantile(.95).mean()) / 2
        )

    def WGS84_to_ECEF(lat, lng, alt):
        """Geodetic degrees + height -> ECEF x, y, z (works on arrays)."""
        rad_lat = lat * (np.pi / 180.0)
        rad_lng = lng * (np.pi / 180.0)
        a = 6378137.0
        finv = 298.257223563
        f = 1 / finv
        e2 = 1 - (1 - f) * (1 - f)
        N = a / np.sqrt(1 - e2 * np.sin(rad_lat) * np.sin(rad_lat))
        x = (N + alt) * np.cos(rad_lat) * np.cos(rad_lng)
        y = (N + alt) * np.cos(rad_lat) * np.sin(rad_lng)
        z = (N * (1 - e2) + alt) * np.sin(rad_lat)
        return x, y, z

    def position_shift_(df, a):
        """Return ``df`` (submission columns) with every point shifted by ``a``."""
        output_df = df.copy()
        output_df["heightAboveWgs84EllipsoidM"] = FIXED_HEIGHT
        x, y, z = WGS84_to_ECEF(
            output_df["latDeg"].to_numpy(dtype=float),
            output_df["lngDeg"].to_numpy(dtype=float),
            FIXED_HEIGHT,
        )
        output_df["x"], output_df["y"], output_df["z"] = x, y, z

        output_df.sort_values(["phone", msge], inplace=True)
        # Previous point of the same phone; NaN on the first row of a phone.
        same_phone = output_df["phone"].eq(output_df["phone"].shift())
        for fi in ["x", "y", "z"]:
            output_df[fi + "_p"] = output_df[fi].shift().where(same_phone)
            output_df[fi + "_diff"] = output_df[fi] - output_df[fi + "_p"]

        output_df["dist"] = np.sqrt(
            output_df["x_diff"]**2 + output_df["y_diff"]**2 + output_df["z_diff"]**2
        )
        for fi in ["x", "y", "z"]:
            # previous + step * (dist - a) / dist
            output_df[fi + "_new"] = (
                output_df[fi + "_p"] + output_df[fi + "_diff"] * (1 - a / output_df["dist"])
            )

        lng, lat, _alt = transformer.transform(
            output_df["x_new"].values,
            output_df["y_new"].values,
            output_df["z_new"].values,
            radians=False,
        )

        # First row of a phone, or a zero step: keep the original position.
        invalid = ~(np.isfinite(lat) & np.isfinite(lng))
        output_df["latDeg"] = np.where(invalid, output_df["latDeg"].to_numpy(), lat)
        output_df["lngDeg"] = np.where(invalid, output_df["lngDeg"].to_numpy(), lng)

        return output_df[sub_cols].copy()

    def objective(trial):
        a = trial.suggest_float("a", -1, 1)
        return compute_dist(position_shift_(train_b, a), g_t)

    study = optuna.create_study()
    study.optimize(objective, n_trials=n_trials)

    output_df = position_shift_(input_df, a=study.best_params["a"])

    # Index-aligned, so the values follow their rows through the sort above.
    output_df["phoneName"] = input_df["phoneName"]
    output_df["collectionName"] = input_df["collectionName"]
    output_df["millisSinceGpsEpoch"] = input_df["millisSinceGpsEpoch"]

    return output_df

## kalman filter

In [16]:
def apply_kf_smoothing(input_df, kf_):
    """Smooth latDeg / lngDeg of every (collectionName, phoneName) track.

    Each track is passed to ``kf_.smooth`` as an array of shape
    (1, n_points, 2) and the first two components of the smoothed state mean
    replace latDeg and lngDeg. Works on a copy; ``input_df`` is not modified.
    """
    smoothed_df = input_df.copy()
    track_keys = smoothed_df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()

    for collection, phone in tqdm(track_keys):
        on_track = (smoothed_df['collectionName'] == collection) & (smoothed_df['phoneName'] == phone)
        observations = smoothed_df.loc[on_track, ['latDeg', 'lngDeg']].to_numpy()
        # Add the leading "number of series" axis.
        result = kf_.smooth(observations[np.newaxis, :, :])
        state_mean = result.states.mean
        smoothed_df.loc[on_track, 'latDeg'] = state_mean[0, :, 0]
        smoothed_df.loc[on_track, 'lngDeg'] = state_mean[0, :, 1]

    return smoothed_df


In [17]:
def kalman_filter(input_df):
    """Build the simdkalman filter and smooth ``input_df`` with it.

    The state has six components; the observation model picks the first two
    (the observed latDeg / lngDeg). The transition matrix advances components
    0/1 by ``T`` times components 2/3 plus ``0.5 * T**2`` times components
    4/5, and components 2/3 by ``T`` times components 4/5.
    """
    T = 1.0
    half_T2 = 0.5 * T ** 2

    state_transition = np.array([
        [1, 0, T, 0, half_T2, 0],
        [0, 1, 0, T, 0, half_T2],
        [0, 0, 1, 0, T, 0],
        [0, 0, 0, 1, 0, T],
        [0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 1],
    ])
    observation_model = np.array([
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
    ])
    # Diagonal noise plus a small constant added to every entry.
    process_noise = np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6]) + np.ones((6, 6)) * 1e-9
    observation_noise = np.diag([3e-5, 3e-5]) + np.ones((2, 2)) * 1e-9

    kf = simdkalman.KalmanFilter(
        state_transition=state_transition,
        process_noise=process_noise,
        observation_model=observation_model,
        observation_noise=observation_noise,
    )

    return apply_kf_smoothing(input_df, kf)


## adaptive_gauss + phone_mean

In [28]:
def apply_gauss_smoothing(df, params):
    """Blend two Gaussian smoothings of every track, in place.

    For each (collectionName, phoneName) track, latDeg and lngDeg are
    smoothed with two Gaussian filters (sigma = sqrt(sz_1) and sqrt(sz_2)).
    The two results are mixed point by point with a weight ``crit`` in
    [0, 1]: the absolute value of the smoothed step divided by the smoothed
    absolute step (sigma = sqrt(sz_crit)); the last point gets weight 0.
    ``crit`` weights the sz_1 result, ``1 - crit`` the sz_2 result.

    ``df`` is modified and returned.
    """
    sigma_1 = np.sqrt(params['sz_1'])
    sigma_2 = np.sqrt(params['sz_2'])
    sigma_crit = np.sqrt(params['sz_crit'])

    def _blend(values):
        # Step between consecutive points (one element shorter than values).
        step = values[1:] - values[:-1]
        crit = np.append(
            np.abs(gaussian_filter1d(step, sigma_crit) / (1e-9 + gaussian_filter1d(np.abs(step), sigma_crit))),
            [0],
        )
        smooth_1 = gaussian_filter1d(values, sigma_1)
        smooth_2 = gaussian_filter1d(values, sigma_2)
        return smooth_1 * crit + smooth_2 * (1.0 - crit)

    track_keys = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
    for collection, phone in track_keys:
        on_track = (df['collectionName'] == collection) & (df['phoneName'] == phone)
        # Both columns are read before either one is written back.
        coords = df.loc[on_track, ['latDeg', 'lngDeg']].to_numpy()

        df.loc[on_track, 'latDeg'] = _blend(coords[:, 0])
        df.loc[on_track, 'lngDeg'] = _blend(coords[:, 1])

    return df

In [29]:
def mean_with_other_phones(df):
    """Average every phone's track with the other phones of its collection.

    For each phone, the positions of every other phone in the same collection
    are linearly interpolated (extrapolating where necessary) at the phone's
    own timestamps and averaged with the phone's own position. An other phone
    contributes to rows ``start_idx`` up to but excluding ``stop_idx``, where
    ``start_idx`` is the last row earlier than the other phone's first
    timestamp (0 if none) and ``stop_idx`` the last row earlier than its last
    timestamp (0 if none). These window rules are unchanged.

    Changes compared with the previous implementation:
      * ``np.float`` (removed in NumPy 1.24) is replaced by ``float``;
      * an other phone with fewer than two samples is skipped instead of
        making ``interp1d`` raise;
      * the window indices are found with array operations rather than a
        Python loop over all timestamps.

    Each track is assumed to be sorted by time (``assume_sorted=True``).
    ``df`` is modified in place and returned.
    """
    for collection in df['collectionName'].drop_duplicates().to_numpy():
        in_collection = df['collectionName'] == collection
        phone_names = df.loc[in_collection, 'phoneName'].drop_duplicates().to_numpy()

        # Read every track first; all writes happen after the averaging so
        # that no phone is averaged against already-corrected data.
        masks = {}
        phone_data = {}
        for phone in phone_names:
            mask = (in_collection & (df['phoneName'] == phone)).to_numpy()
            masks[phone] = mask
            phone_data[phone] = df.loc[
                mask, ['millisSinceGpsEpoch', 'latDeg', 'lngDeg']
            ].to_numpy(dtype=float)

        corrections = {}
        for current, track in phone_data.items():
            times = track[:, 0]
            # Column 0 counts the contributing phones, columns 1-2 hold the
            # running sums of latDeg / lngDeg.
            correction = np.ones(track.shape, dtype=float)
            correction[:, 1:] = track[:, 1:]

            # Phone timestamps do not match exactly, so interpolate.
            for other, other_track in phone_data.items():
                if other == current or len(other_track) < 2:
                    continue

                loc = interp1d(other_track[:, 0],
                               other_track[:, 1:],
                               axis=0,
                               kind='linear',
                               copy=False,
                               bounds_error=None,
                               fill_value='extrapolate',
                               assume_sorted=True)

                before_first = np.flatnonzero(times < other_track[0, 0])
                before_last = np.flatnonzero(times < other_track[-1, 0])
                start_idx = before_first[-1] if before_first.size else 0
                stop_idx = before_last[-1] if before_last.size else 0

                if stop_idx - start_idx > 0:
                    correction[start_idx:stop_idx, 0] += 1
                    correction[start_idx:stop_idx, 1:] += loc(times[start_idx:stop_idx])

            correction[:, 1:] /= correction[:, [0]]
            corrections[current] = correction

        for phone, mask in masks.items():
            df.loc[mask, ['latDeg', 'lngDeg']] = corrections[phone][:, 1:]

    return df

In [41]:
def gauss_filter(input_df):
    """Gaussian smoothing followed by averaging across phones, on a copy.

    Runs ``apply_gauss_smoothing`` with sz_1=0.85, sz_2=5.65, sz_crit=1.5 and
    then ``mean_with_other_phones``; ``input_df`` is left untouched.
    """
    smoothing_params = {'sz_1' : 0.85, 'sz_2' : 5.65, 'sz_crit' : 1.5}
    working_df = input_df.copy()
    smoothed_df = apply_gauss_smoothing(working_df, smoothing_params)
    return mean_with_other_phones(smoothed_df)

## Move closer to the road center (SJC)

In [18]:
def nearest_point(x, y, points):
    """Return the entry of ``points`` closest to ``(x, y)``.

    Distance is the plain Euclidean distance on the first two components.
    On ties the earliest entry wins. The result is a two-element list
    ``[px, py]``; for an empty ``points`` an empty dict is returned.
    """
    if len(points) == 0:
        return {}

    def _distance(candidate):
        return math.sqrt((candidate[0] - x) ** 2 + (candidate[1] - y) ** 2)

    # min() keeps the first of several equally close candidates.
    closest = min(points, key=_distance)
    return [closest[0], closest[1]]

In [19]:
def create_dataset_center(input_df):
    """Attach the nearest road point and a left/right flag to every row.

    For each collection the drivable road network inside the (padded)
    bounding box of its points is downloaded with osmnx, the road edges within
    ``tolerance`` of a point are kept, and ``split`` points are sampled along
    each kept edge. The sampled points of all collections are pooled, and for
    every row of ``input_df`` the closest pooled point is looked up with
    ``nearest_point``.

    Columns added to the returned copy:
        latDeg_center, lngDeg_center: the closest sampled road point.
        latDeg_center_pre, lngDeg_center_pre: the center of the previous row
            (shifted over the whole frame, not per phone).
        meter: ``calc_haversine`` distance between the row and its center.
        Right_or_left: "Right" when the 2-D cross product of
            (point - previous center) and (center - previous center) is
            positive, otherwise "Left".

    Prints progress information and requires network access for the osmnx
    download.
    """
    output_df = input_df.copy()
    collection_list = output_df["collectionName"].unique()

    line_points_dfs = []
    # Progress counter for the printout below.
    c = 1
    for collection in collection_list:
        print(f"now : {c}/{len(collection_list)}")
        c += 1
        print(collection)
        # Points of this collection as a GeoDataFrame (x = lngDeg, y = latDeg).
        target_df = output_df[output_df["collectionName"] == collection].reset_index(drop=True)
        target_df["geometry"] = [Point(p) for p in target_df[["lngDeg", "latDeg"]].to_numpy()]
        target_gdf = gpd.GeoDataFrame(target_df, geometry=target_df["geometry"])

        # Per-point bounding boxes, padded by offset + 0.01 on every side.
        offset = 0.1**5
        bbox = target_gdf.bounds + [-(offset+0.01), -(offset+0.01), offset+0.01, offset+0.01]
        # NOTE(review): `east` holds the minimum x and `west` the maximum x,
        # i.e. the names look swapped. Confirm how the installed osmnx
        # version's graph_from_bbox interprets these arguments.
        east = bbox["minx"].min()
        west = bbox["maxx"].max()
        south = bbox["miny"].min()
        north = bbox["maxy"].max()
        G = ox.graph.graph_from_bbox(north, south, east, west, network_type='drive')

        nodes, edges = momepy.nx_to_gdf(G)
        
        edges = edges.dropna(subset=["geometry"]).reset_index(drop=True)
        # Spatial-index lookup: candidate edges for every padded point box.
        hits = bbox.apply(lambda row: list(edges.sindex.intersection(row)), axis=1)
        # One row per (point, candidate edge) pair.
        tmp = pd.DataFrame({
            "pt_idx": np.repeat(hits.index, hits.apply(len)),
            "line_i": np.concatenate(hits.values)
        })
        tmp = tmp.join(edges.reset_index(drop=True), on="line_i")
        tmp = tmp.join(target_gdf.geometry.rename("point"), on="pt_idx")
        tmp = gpd.GeoDataFrame(tmp, geometry="geometry", crs=target_gdf.crs)

        # Distance between each point and its candidate edge.
        tmp["snap_dist"] = tmp.geometry.distance(gpd.GeoSeries(tmp.point))

        # Discard pairs farther apart than the tolerance, closest pairs first.
        tolerance = 0.0005  
        tmp = tmp.loc[tmp.snap_dist <= tolerance]
        tmp = tmp.sort_values(by=["snap_dist"])

        # Closest edge per point, then every such edge only once.
        closest = tmp.groupby("pt_idx").first()
        closest = gpd.GeoDataFrame(closest, geometry="geometry")
        closest = closest.drop_duplicates("line_i").reset_index(drop=True)

        line_points_list = []
        split = 200  # number of points sampled along each edge geometry
        for dist in range(0, split, 1):
            # Normalized position along the line: 0, 1/split, ..., (split-1)/split.
            dist = dist/split
            line_points = closest["geometry"].interpolate(dist, normalized=True)
            line_points_list.append(line_points)
        line_points = pd.concat(line_points_list).reset_index(drop=True)
        line_points = line_points.reset_index().rename(columns={0:"geometry"})
        line_points["lngDeg"] = line_points["geometry"].x
        line_points["latDeg"] = line_points["geometry"].y
        
        line_points_ = line_points.loc[:, ["lngDeg", "latDeg"]]
        line_points_dfs.append(line_points_)

    # Pool the sampled points of all collections as [lngDeg, latDeg] pairs.
    line_points_df = pd.concat(line_points_dfs)
    line_points_list = sorted(line_points_df.values.tolist())
    
    print(len(line_points_list))

    nearest_point_list = []
    
    # Every row is compared against the whole pool of sampled points.
    for lng, lat in zip(
        output_df["lngDeg"].to_numpy(),
        output_df["latDeg"].to_numpy()
    ):
        nearest_point_list.append(nearest_point(lng, lat, line_points_list))
        
    # Column 0 is lngDeg and column 1 is latDeg, as in the pooled pairs.
    nearest_point_df = pd.DataFrame(nearest_point_list)
    output_df["latDeg_center"] = nearest_point_df[1].values
    output_df["lngDeg_center"] = nearest_point_df[0].values
    
    
        
    # Center of the previous row; NaN on the very first row of the frame.
    output_df["latDeg_center_pre"] = output_df["latDeg_center"].shift(1)
    output_df["lngDeg_center_pre"] = output_df["lngDeg_center"].shift(1)

    output_df['meter'] = output_df.apply(
            lambda r: calc_haversine(
                r.latDeg, r.lngDeg, r.latDeg_center, r.lngDeg_center
            ),
            axis=1
        )
    
    rl_list = []
    # A per-collection / per-phone variant of the loop below was left disabled:
    #for collection in output_df["collectionName"].unique():
        #for phone in output_df["phoneName"].unique():
            #phone_df = output_df[output_df["phone"] == f"{collection}_{phone}"]
    for lat, lng, lat_c, lng_c, lat_c_pre, lng_c_pre in zip(
                output_df["latDeg"].to_numpy(),
                output_df["lngDeg"].to_numpy(),
                output_df["latDeg_center"].to_numpy(),
                output_df["lngDeg_center"].to_numpy(),
                output_df["latDeg_center_pre"].to_numpy(),
                output_df["lngDeg_center_pre"].to_numpy()
            ):
                # Both vectors start at the previous center.
                base_vec = np.array([lat-lat_c_pre, lng-lng_c_pre])
                center_vec = np.array([lat_c-lat_c_pre, lng_c-lng_c_pre])
                r_or_l = np.cross(base_vec,center_vec)
                # A zero or NaN cross product (e.g. the first row, which has
                # no previous center) falls into the "Left" branch.
                if r_or_l > 0:
                    rl_list.append("Right")
                else:
                    rl_list.append("Left")
    output_df["Right_or_left"] = rl_list
    
    return output_df

In [61]:
def check_meter_from_center(input_df):
    """Pull each position toward its matched centre point.

    Expects the columns ``latDeg``, ``lngDeg``, ``latDeg_center``,
    ``lngDeg_center``, ``meter`` and ``Right_or_left``.

    * Rows labelled ``"Left"`` are replaced by the centre point outright.
    * Every other row is blended toward the centre point by repeatedly
      averaging with it. The number of averaging passes depends on the
      ``meter`` value: more than 30 -> untouched, (10, 30] -> 4 passes,
      (7.5, 10] -> 3, (5, 7.5] -> 2, (2.5, 5] -> 1, anything else
      (including NaN) -> untouched. ``k`` passes leave ``1 / 2**k`` of the
      original offset from the centre point.

    Parameters
    ----------
    input_df : pd.DataFrame
        Frame with the columns listed above. It is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``input_df`` whose ``latDeg`` / ``lngDeg`` columns hold the
        adjusted coordinates.
    """
    # (exclusive lower bound on `meter`, number of halvings toward the centre),
    # checked from the largest bound down; the first match wins.
    halving_steps = ((30, 0), (10, 4), (7.5, 3), (5, 2), (2.5, 1))

    output_df = input_df.copy()
    lat_list = []
    lng_list = []
    for lat, lng, lat_c, lng_c, meter, rl in zip(
        output_df["latDeg"].to_numpy(),
        output_df["lngDeg"].to_numpy(),
        output_df["latDeg_center"].to_numpy(),
        output_df["lngDeg_center"].to_numpy(),
        output_df["meter"].to_numpy(),
        output_df["Right_or_left"].values
    ):
        if rl == "Left":
            lat, lng = lat_c, lng_c
        else:
            steps = 0
            for lower_bound, n_halvings in halving_steps:
                if meter > lower_bound:
                    steps = n_halvings
                    break
            # Repeated halving keeps the arithmetic identical to the
            # original nested (x + c) / 2 expressions.
            for _ in range(steps):
                lat = (lat + lat_c) / 2
                lng = (lng + lng_c) / 2

        lat_list.append(lat)
        lng_list.append(lng)

    # Write by column name: the previous positional write (iloc[:, 3] and
    # iloc[:, 4]) silently hit the wrong columns for any other column order.
    output_df["latDeg"] = lat_list
    output_df["lngDeg"] = lng_list
    return output_df

In [21]:
def move_closer_center_train(input_df):
    """Apply the centre-point correction to the SJC training collections.

    Rows are processed collection by collection, in order of first
    appearance. For the three listed SJC collections each phone is passed
    through ``create_dataset_center`` and then ``check_meter_from_center``;
    rows of every other collection are passed through unchanged. The pieces
    are concatenated in that order, keeping their original index labels.
    """
    sjc_collections = (
        "2021-04-22-US-SJC-1",
        "2021-04-28-US-SJC-1",
        "2021-04-29-US-SJC-2",
    )
    pieces = []
    for name in input_df["collectionName"].unique():
        subset = input_df[input_df["collectionName"] == name]
        if name not in sjc_collections:
            # Non-SJC collections are forwarded untouched.
            pieces.append(subset)
            continue
        corrected = [
            check_meter_from_center(
                create_dataset_center(subset[subset["phone"] == phone_id])
            )
            for phone_id in subset["phone"].unique()
        ]
        pieces.append(pd.concat(corrected))
    return pd.concat(pieces)

In [22]:
def move_closer_center(input_df):
    """Apply the centre-point correction to the SJC test collections.

    Rows are processed collection by collection, in order of first
    appearance. For the two listed SJC collections each phone is passed
    through ``create_dataset_center`` and then ``check_meter_from_center``;
    rows of every other collection are passed through unchanged. The pieces
    are concatenated in that order, keeping their original index labels.
    """
    sjc_collections = ("2021-04-22-US-SJC-2", "2021-04-29-US-SJC-3")
    pieces = []
    for name in input_df["collectionName"].unique():
        subset = input_df[input_df["collectionName"] == name]
        if name not in sjc_collections:
            # Non-SJC collections are forwarded untouched.
            pieces.append(subset)
            continue
        corrected = [
            check_meter_from_center(
                create_dataset_center(subset[subset["phone"] == phone_id])
            )
            for phone_id in subset["phone"].unique()
        ]
        pieces.append(pd.concat(corrected))
    return pd.concat(pieces)

##move closer truth SJC

In [23]:
def create_dataset_truth_train(input_df):
    """Attach, to every row, the nearest point of the SJC training ground truth.

    The candidate points are built from the module-level ``train_base``
    frame (not from ``input_df``): for each phone of the three listed SJC
    training collections, every ground-truth point ``(t_latDeg, t_lngDeg)``
    is followed by the midpoint between it and the next ground-truth point.
    ``nearest_point`` then picks one candidate per row of ``input_df``.

    Despite the ``_train`` suffix, ``move_closer_truth`` also calls this
    function for the test collections.

    Parameters
    ----------
    input_df : pd.DataFrame
        Frame with ``latDeg`` and ``lngDeg`` columns. It is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``input_df`` with these columns added or overwritten:
        ``latDeg_from_gt`` / ``lngDeg_from_gt`` (selected candidate),
        ``latDeg_from_gt_pre`` / ``lngDeg_from_gt_pre`` (the same values
        shifted down by one row) and ``meter`` (``calc_haversine`` between
        the row position and the selected candidate).
    """
    train_SJC_list = ["2021-04-22-US-SJC-1", "2021-04-28-US-SJC-1", "2021-04-29-US-SJC-2"]
    output_df = input_df.copy()

    # Candidate [lat, lng] pairs: each ground-truth point followed by the
    # midpoint to its successor.
    points_list = []
    for collection in train_SJC_list:
        collection_df = train_base[train_base["collectionName"] == collection]
        for phone in collection_df["phone"].unique():
            phone_df = collection_df[collection_df["phone"] == phone]
            # Shift the Series directly instead of assigning new columns to
            # the filtered slice, which raised SettingWithCopyWarning.
            for lat, lng, lat_next, lng_next in zip(
                phone_df["t_latDeg"].to_numpy(),
                phone_df["t_lngDeg"].to_numpy(),
                phone_df["t_latDeg"].shift(-1).to_numpy(),
                phone_df["t_lngDeg"].shift(-1).to_numpy()
            ):
                points_list.append([lat, lng])
                # The last point of each phone has no successor, so its
                # midpoint is NaN; it is kept so the candidate list handed to
                # nearest_point is exactly what it was before.
                points_list.append([(lat + lat_next) / 2, (lng + lng_next) / 2])

    nearest_points_list = [
        nearest_point(lat, lng, points_list)
        for lat, lng in zip(
            output_df["latDeg"].to_numpy(),
            output_df["lngDeg"].to_numpy()
        )
    ]

    nearest_points_df = pd.DataFrame(nearest_points_list)
    output_df["latDeg_from_gt"] = nearest_points_df[0].values
    output_df["lngDeg_from_gt"] = nearest_points_df[1].values

    # Selected candidate of the previous row (NaN for the first row).
    output_df["latDeg_from_gt_pre"] = output_df["latDeg_from_gt"].shift(1).to_numpy()
    output_df["lngDeg_from_gt_pre"] = output_df["lngDeg_from_gt"].shift(1).to_numpy()

    output_df["meter"] = output_df.apply(
        lambda r: calc_haversine(
            r.latDeg, r.lngDeg, r.latDeg_from_gt, r.lngDeg_from_gt
        ),
        axis=1
    )
    return output_df


In [62]:
def check_meter_from_truth(input_df):
    """Blend each position toward its matched ground-truth-derived point.

    Expects the columns ``latDeg``, ``lngDeg``, ``latDeg_from_gt``,
    ``lngDeg_from_gt`` and ``meter``.

    Each row is blended toward ``(latDeg_from_gt, lngDeg_from_gt)`` by
    repeatedly averaging with it. The number of averaging passes depends on
    the ``meter`` value: more than 30 -> untouched, (10, 30] -> 4 passes,
    (7.5, 10] -> 3, (5, 7.5] -> 2, (2.5, 5] -> 1, anything else (including
    NaN) -> untouched. ``k`` passes leave ``1 / 2**k`` of the original
    offset from the target point.

    Parameters
    ----------
    input_df : pd.DataFrame
        Frame with the columns listed above. It is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``input_df`` whose ``latDeg`` / ``lngDeg`` columns hold the
        adjusted coordinates.
    """
    # (exclusive lower bound on `meter`, number of halvings toward the target),
    # checked from the largest bound down; the first match wins.
    halving_steps = ((30, 0), (10, 4), (7.5, 3), (5, 2), (2.5, 1))

    output_df = input_df.copy()
    lat_list = []
    lng_list = []
    for lat, lng, lat_gt, lng_gt, meter in zip(
        output_df["latDeg"].to_numpy(),
        output_df["lngDeg"].to_numpy(),
        output_df["latDeg_from_gt"].to_numpy(),
        output_df["lngDeg_from_gt"].to_numpy(),
        output_df["meter"].to_numpy(),
    ):
        steps = 0
        for lower_bound, n_halvings in halving_steps:
            if meter > lower_bound:
                steps = n_halvings
                break
        # Repeated halving keeps the arithmetic identical to the original
        # nested (x + gt) / 2 expressions.
        for _ in range(steps):
            lat = (lat + lat_gt) / 2
            lng = (lng + lng_gt) / 2

        lat_list.append(lat)
        lng_list.append(lng)

    # Write by column name: the previous positional write (iloc[:, 3] and
    # iloc[:, 4]) silently hit the wrong columns for any other column order.
    output_df["latDeg"] = lat_list
    output_df["lngDeg"] = lng_list
    return output_df

In [63]:
def move_closer_truth_train(input_df):
    """Apply the ground-truth-based correction to the SJC training collections.

    Rows are processed collection by collection, in order of first
    appearance. For the three listed SJC collections each phone is passed
    through ``create_dataset_truth_train`` and then
    ``check_meter_from_truth``; rows of every other collection are passed
    through unchanged. The pieces are concatenated in that order, keeping
    their original index labels.
    """
    sjc_collections = (
        "2021-04-22-US-SJC-1",
        "2021-04-28-US-SJC-1",
        "2021-04-29-US-SJC-2",
    )
    pieces = []
    for name in input_df["collectionName"].unique():
        subset = input_df[input_df["collectionName"] == name]
        if name not in sjc_collections:
            # Non-SJC collections are forwarded untouched.
            pieces.append(subset)
            continue
        corrected = [
            check_meter_from_truth(
                create_dataset_truth_train(subset[subset["phone"] == phone_id])
            )
            for phone_id in subset["phone"].unique()
        ]
        pieces.append(pd.concat(corrected))
    return pd.concat(pieces)

In [64]:
def move_closer_truth(input_df):
    """Apply the ground-truth-based correction to the SJC test collections.

    Rows are processed collection by collection, in order of first
    appearance. For the two listed SJC collections each phone is passed
    through ``create_dataset_truth_train`` and then
    ``check_meter_from_truth``; rows of every other collection are passed
    through unchanged. The pieces are concatenated in that order, keeping
    their original index labels.
    """
    sjc_collections = ("2021-04-22-US-SJC-2", "2021-04-29-US-SJC-3")
    pieces = []
    for name in input_df["collectionName"].unique():
        subset = input_df[input_df["collectionName"] == name]
        if name not in sjc_collections:
            # Non-SJC collections are forwarded untouched.
            pieces.append(subset)
            continue
        corrected = [
            check_meter_from_truth(
                create_dataset_truth_train(subset[subset["phone"] == phone_id])
            )
            for phone_id in subset["phone"].unique()
        ]
        pieces.append(pd.concat(corrected))
    return pd.concat(pieces)

#check CV

In [43]:
def check_score(input_df: pd.DataFrame) -> pd.DataFrame:
    """Print error statistics of predictions against ground truth.

    For every row, ``calc_haversine`` is evaluated between
    ``(latDeg, lngDeg)`` and ``(t_latDeg, t_lngDeg)`` and stored in a
    ``meter`` column. Two figures are printed:

    * ``error meter`` - the mean of that column over all rows;
    * ``score`` - the mean, over all phones, of each phone's 50th and 95th
      percentile of that column.

    Returns a copy of ``input_df`` with the ``meter`` column added (or
    overwritten).
    """
    def _row_error(row):
        return calc_haversine(row.latDeg, row.lngDeg, row.t_latDeg, row.t_lngDeg)

    output_df = input_df.copy()
    output_df['meter'] = input_df.apply(_row_error, axis=1)

    meter_score = output_df['meter'].mean()
    print(f'error meter: {meter_score}')

    # Per phone, collect the 50th then the 95th percentile, in that order.
    scores = []
    for phone in output_df['phone'].unique():
        phone_errors = output_df.loc[output_df['phone'] == phone, 'meter']
        for quantile in (50, 95):
            scores.append(np.percentile(phone_errors, quantile))

    score = sum(scores) / len(scores)
    print(f'score: {score}')

    return output_df

In [47]:
# Step 1: run gauss_filter on the training baseline and print its score.
CV1 = gauss_filter(train_base)
print('<<<gauss filter>>>')
CV1 = check_score(CV1)
print('--------------------')


# Step 2: apply make_triangle once and then 100 more times (101 passes in
# total) on the filtered frame, then print the score.
# NOTE(review): the result is stored in CV3 and there is no CV2 in this cell;
# CV1 and CV3 are also reassigned by the longer CV pipeline cell further down,
# so their values depend on which cell ran last.
CV3 = make_triangle(CV1)
for i in range(100):
    CV3 = make_triangle(CV3)
print('<<<triangle_1>>>')
CV3 = check_score(CV3)
print('--------------------')

<<<gauss filter>>>
error meter: 2.8205612571399405
score: 3.7882711914857086
--------------------



invalid value encountered in double_scalars



<<<triangle_1>>>
error meter: 2.738735054276848
score: 3.768684622347603
--------------------


In [None]:
# Full CV pipeline on the training baseline. Each stage's output feeds the
# next one and its score is printed via check_score. Every "make triangle"
# stage applies make_triangle once and then 100 more times (101 passes).
# NOTE(review): this cell rebinds CV1 and CV3, which the gauss-filter cell
# above also defines, and it has no execution count in the saved notebook -
# confirm which of the two pipelines is the intended one before re-running.

# Stage 1: make_triangle on the raw baseline.
CV1 = make_triangle(train_base)
for i in range(100):
    CV1 = make_triangle(CV1)
print('<<<triangle_1>>>')
CV1 = check_score(CV1)
print('--------------------')

# Stage 2: outlier_train.
CV2 = outlier_train(CV1)
print('<<<outlier>>>')
CV2 = check_score(CV2)
print('--------------------')

# Stage 3: make_triangle again.
CV3 = make_triangle(CV2)
for i in range(100):
    CV3 = make_triangle(CV3)
print('<<<triangle_2>>>')
CV3 = check_score(CV3)
print('--------------------')

# Stage 4: mean_prediction_train (labelled "phones_mean" in the output).
CV4 = mean_prediction_train(CV3)
print('<<<phones_mean>>>')
CV4 = check_score(CV4)
print('--------------------')

# Stage 5: make_triangle again.
CV5 = make_triangle(CV4)
for i in range(100):
    CV5 = make_triangle(CV5)
print('<<<triangle_3>>>')
CV5 = check_score(CV5)
print('--------------------')

# Stage 6: remove_device_train.
CV6 = remove_device_train(CV5)
print('<<<remove_device>>>')
CV6 = check_score(CV6)
print('--------------------')

# Stage 7: make_triangle again.
CV7 = make_triangle(CV6)
for i in range(100):
    CV7 = make_triangle(CV7)
print('<<<triangle_4>>>')
CV7 = check_score(CV7)
print('--------------------')

# Stage 8: position_shift_train.
CV8 = position_shift_train(CV7)
print('<<<position_shift>>>')
CV8 = check_score(CV8)
print('--------------------')

# Stage 9: final make_triangle.
CV9 = make_triangle(CV8)
for i in range(100):
    CV9 = make_triangle(CV9)
print('<<<triangle_5>>>')
CV9 = check_score(CV9)
print('--------------------')



invalid value encountered in double_scalars


invalid value encountered in arccos



<<<triangle_1>>>
< from truth >
error meter: 3.3579582394393928
score: 4.753446293953488
--------------------
<<<outlier>>>
< from truth >
error meter: 3.357550751702286
score: 4.753387012675808
--------------------



invalid value encountered in double_scalars


invalid value encountered in arccos



<<<triangle_2>>>
< from truth >
error meter: 3.3458543163626406
score: 4.749936642584164
--------------------


HBox(children=(FloatProgress(value=0.0, max=73.0), HTML(value='')))


<<<phones_mean>>>
< from truth >
error meter: 2.888001174482605
score: 3.969272183371574
--------------------



invalid value encountered in double_scalars



<<<triangle_3>>>
< from truth >
error meter: 2.877919914830917
score: 3.952583078422353
--------------------
<<<remove_device>>>
< from truth >
error meter: 2.792666413318442
score: 3.8125002035665694
--------------------
<<<triangle_4>>>
< from truth >
error meter: 2.7903026843920595
score: 3.809055902507315
--------------------


[32m[I 2021-07-21 15:14:22,918][0m A new study created in memory with name: no-name-0e695bc1-9cf4-48e0-9936-d7bfdc0793e8[0m
[32m[I 2021-07-21 15:14:30,007][0m Trial 0 finished with value: 5.55789091378337 and parameters: {'a': -0.651006694425184}. Best is trial 0 with value: 5.55789091378337.[0m
[32m[I 2021-07-21 15:14:36,488][0m Trial 1 finished with value: 5.212820667647379 and parameters: {'a': 0.9857360336461825}. Best is trial 1 with value: 5.212820667647379.[0m
[32m[I 2021-07-21 15:14:43,397][0m Trial 2 finished with value: 5.249670535888447 and parameters: {'a': 0.13403957908322006}. Best is trial 1 with value: 5.212820667647379.[0m
[32m[I 2021-07-21 15:14:50,000][0m Trial 3 finished with value: 5.6768648818753675 and parameters: {'a': -0.8678790759023678}. Best is trial 1 with value: 5.212820667647379.[0m
[32m[I 2021-07-21 15:14:56,946][0m Trial 4 finished with value: 5.244536931286543 and parameters: {'a': 0.15647489822387328}. Best is trial 1 with value: 5.21

<<<position_shift>>>
< from truth >
error meter: 2.747748615901277
score: 3.75760914274848
--------------------



invalid value encountered in arccos



<<<triangle_5>>>
< from truth >
error meter: 2.7420579484803342
score: 3.7460382249735136
--------------------


#submission

In [49]:
# Step 1: run gauss_filter on the test predictions (no score is printed here;
# only the stage banners are).
sub1 = gauss_filter(test_base)
print('<<<gauss filter>>>')
print('--------------------')


# Step 2: apply make_triangle once and then 300 more times (301 passes).
# NOTE(review): the CV cells use 100 repeats for the same step - confirm the
# different repeat count for the submission is intentional.
sub2 = make_triangle(sub1)
for i in range(300):
    sub2 = make_triangle(sub2)
print('<<<triangle_1>>>')
print('--------------------')

<<<gauss filter>>>
--------------------



invalid value encountered in double_scalars



<<<triangle_1>>>
--------------------


In [65]:
# Apply move_closer_center to the smoothed test predictions; rows of
# collections outside its SJC list are passed through unchanged.
sub3 = move_closer_center(sub2)

now : 1/1
2021-04-22-US-SJC-2



Approach is not set. Defaulting to 'primal'.


Geometry is in a geographic CRS. Results from 'distance' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.



CRS mismatch between the CRS of left geometries and the CRS of right geometries.
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: None




23200
now : 1/1
2021-04-29-US-SJC-3



Approach is not set. Defaulting to 'primal'.


Geometry is in a geographic CRS. Results from 'distance' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.



CRS mismatch between the CRS of left geometries and the CRS of right geometries.
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: None




22600
now : 1/1
2021-04-29-US-SJC-3



Approach is not set. Defaulting to 'primal'.


Geometry is in a geographic CRS. Results from 'distance' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.



CRS mismatch between the CRS of left geometries and the CRS of right geometries.
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: None




22600


In [66]:
# Display sub3 to inspect the columns added by move_closer_center.
sub3

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,latDeg_center,lngDeg_center,latDeg_center_pre,lngDeg_center_pre,meter,Right_or_left
0,2020-05-15-US-MTV-1,Pixel4,1273608785432,37.416597,-122.082067,-30.69,2020-05-15-US-MTV-1_Pixel4,,,,,,
1,2020-05-15-US-MTV-1,Pixel4,1273608786432,37.416601,-122.082064,-31.76,2020-05-15-US-MTV-1_Pixel4,,,,,,
2,2020-05-15-US-MTV-1,Pixel4,1273608787432,37.416602,-122.082064,-31.65,2020-05-15-US-MTV-1_Pixel4,,,,,,
3,2020-05-15-US-MTV-1,Pixel4,1273608788432,37.416603,-122.082064,-31.52,2020-05-15-US-MTV-1_Pixel4,,,,,,
4,2020-05-15-US-MTV-1,Pixel4,1273608789432,37.416603,-122.082064,-28.95,2020-05-15-US-MTV-1_Pixel4,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91481,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763185000,37.334576,-121.899479,-8.39,2021-04-29-US-SJC-3_SamsungS20Ultra,37.334576,-121.899479,37.334570,-121.899492,3.459972,Left
91482,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763186000,37.334582,-121.899467,-7.36,2021-04-29-US-SJC-3_SamsungS20Ultra,37.334582,-121.899467,37.334576,-121.899479,3.870017,Left
91483,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763187000,37.334582,-121.899467,-4.08,2021-04-29-US-SJC-3_SamsungS20Ultra,37.334582,-121.899467,37.334582,-121.899467,3.865141,Left
91484,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763188000,37.334589,-121.899454,-5.70,2021-04-29-US-SJC-3_SamsungS20Ultra,37.334589,-121.899454,37.334582,-121.899467,4.072996,Left


In [67]:
# Apply move_closer_truth to sub3; rows of collections outside its SJC list
# are passed through unchanged.
sub4 = move_closer_truth(sub3)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [58]:
# Count missing values per column of sub4.
sub4.isna().sum()

collectionName                    0
phoneName                         0
millisSinceGpsEpoch               0
latDeg                            0
lngDeg                            0
heightAboveWgs84EllipsoidM        0
phone                             0
latDeg_center                 85122
lngDeg_center                 85122
latDeg_center_pre             85125
lngDeg_center_pre             85125
meter                         85122
Right_or_left                 85122
latDeg_from_gt                85122
lngDeg_from_gt                85122
latDeg_from_gt_pre            85125
lngDeg_from_gt_pre            85125
dtype: int64

In [68]:
# Scatter every row of sub3 on a map, one colour per `phone` value.
fig = px.scatter_mapbox(
    sub3,
    # Coordinate columns.
    lat="latDeg",
    lon="lngDeg",
    # One colour series per phone.
    color="phone",
    # `labels` is a mapping from column name to display label; the bare
    # string "phone" passed previously was not a valid mapping.
    labels={"phone": "phone"},
    # NOTE(review): 400 is far beyond the usual Mapbox zoom range and is
    # presumably clamped by the renderer - confirm the intended zoom level.
    zoom=400,
    height=600,
    width=800,
)
fig.update_layout(
    mapbox_style='stamen-terrain',
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    title_text="GPS trafic",
)
fig.show()

In [57]:
# Write the submission file, keeping only the columns of the sample
# submission, in its column order.
# NOTE(review): in the saved notebook this cell's execution count (57) is
# lower than that of the cell assigning sub4 (67) - re-run it after sub4 is
# rebuilt so the CSV is not stale.
sub4[sub.columns].to_csv("submission_58.csv", index=False)

In [72]:
# Count missing values per column of sub4 (same check as the earlier cell).
sub4.isna().sum()

collectionName                    0
phoneName                         0
millisSinceGpsEpoch               0
latDeg                            0
lngDeg                            0
heightAboveWgs84EllipsoidM        0
phone                             0
latDeg_center                 85122
lngDeg_center                 85122
latDeg_center_pre             85125
lngDeg_center_pre             85125
meter                         85122
Right_or_left                 85122
latDeg_from_gt                85122
lngDeg_from_gt                85122
latDeg_from_gt_pre            85125
lngDeg_from_gt_pre            85125
dtype: int64

In [71]:
# Display the sample submission frame for reference (its columns select what
# is written to the submission CSV).
sub

Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg
0,2020-05-15-US-MTV-1_Pixel4,1273608785432,37.904611,-86.481078
1,2020-05-15-US-MTV-1_Pixel4,1273608786432,37.904611,-86.481078
2,2020-05-15-US-MTV-1_Pixel4,1273608787432,37.904611,-86.481078
3,2020-05-15-US-MTV-1_Pixel4,1273608788432,37.904611,-86.481078
4,2020-05-15-US-MTV-1_Pixel4,1273608789432,37.904611,-86.481078
...,...,...,...,...
91481,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763185000,37.904611,-86.481078
91482,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763186000,37.904611,-86.481078
91483,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763187000,37.904611,-86.481078
91484,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763188000,37.904611,-86.481078
