In [69]:
import numpy as np
import pandas as pd
import glob
import os
import math
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
import multiprocessing
from scipy.spatial.distance import cdist
import yaml
from datetime import datetime

import warnings
warnings.simplefilter('ignore')

In [14]:
def init_logger(log_file='logger.log'):
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger
    
today = datetime.now().strftime('%Y-%m-%d')
logger = init_logger(log_file=f'./{today}.log')
logger.info('Start Logging...')

Start Logging...


In [15]:
def seed_everything(seed: int):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)

In [16]:
import pickle
def to_pickle(filename, obj):
    with open(filename, mode='wb') as f:
        pickle.dump(obj, f)
        
def from_pickle(filename):
    with open(filename, mode='rb') as f:
        obj = pickle.load(f)
    return obj

In [17]:
def get_ground_truth(args):
    (collectionName, phoneName), df = args
    
    path = data_dir / f"train/{collectionName}/{phoneName}/ground_truth.csv"
    target_df = pd.read_csv(path)
    output_df = pd.DataFrame()
    # merge derived and target by 'millisSinceGpsEpoch'
    for epoch, epoch_df in df.groupby('millisSinceGpsEpoch'):
        idx = (target_df['millisSinceGpsEpoch'] - epoch).abs().argmin()
        epoch_diff = epoch - target_df.loc[idx, 'millisSinceGpsEpoch']
        epoch_df['epoch_diff'] = epoch_diff
        epoch_df['target_latDeg'] = target_df.loc[idx, 'latDeg']
        epoch_df['target_lngDeg'] = target_df.loc[idx, 'lngDeg']
        epoch_df['speedMps'] = target_df.loc[idx, 'speedMps']
        output_df = pd.concat([output_df, epoch_df]).reset_index(drop=True)    
    return output_df

In [18]:
# metric
def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist

def check_score(input_df: pd.DataFrame) -> pd.DataFrame:
    if "phone" not in input_df.columns:
        input_df["phone"] = input_df["collectionName"] + "_" + input_df["phoneName"]

    if "target_latDeg" not in input_df.columns:
        processes = multiprocessing.cpu_count()
        with multiprocessing.Pool(processes=processes) as pool:
            gr = input_df.groupby(['collectionName','phoneName'])
            dfs = pool.imap_unordered(get_ground_truth, gr)
            dfs = tqdm(dfs, total=len(gr))
            dfs = list(dfs)
        input_df = pd.concat(dfs).sort_values(['collectionName', 'phoneName', 'millisSinceGpsEpoch']).reset_index(drop=True)     


    output_df = input_df.copy()
    
    output_df['error'] = input_df.apply(
        lambda r: calc_haversine(
            r.latDeg, r.lngDeg, r.target_latDeg, r.target_lngDeg
        ),
        axis=1
    )

    meter_score = output_df['error'].mean()
    logger.info(f'mean error: {meter_score}')

    scores = []
    p_50_scores = []
    p_95_scores = []
    mean_scores = []
    phones = []
    score_df = pd.DataFrame()
    for phone in output_df['phone'].unique():
        _index = output_df['phone']==phone
        p_50 = np.percentile(output_df.loc[_index, 'error'], 50)
        p_95 = np.percentile(output_df.loc[_index, 'error'], 95)
        # print(f"{phone} | 50:{p_50:.5g}| 95:{p_95:.5g}")
        p_50_scores.append(p_50)
        p_95_scores.append(p_95)
        mean_scores.append(np.mean([p_50, p_95]))
        phones.append(phone)

        scores.append(p_50)
        scores.append(p_95)

    score_df["phone"] = phones
    score_df["p_50_score"] = p_50_scores
    score_df["p_95_score"] = p_95_scores
    score_df["mean_score"] = mean_scores
    
    comp_score = sum(scores) / len(scores)
    logger.info(f"competition metric:{comp_score}")
    return output_df, score_df

In [19]:
road_points = pd.read_csv('../../input/road_points.csv')
road_points = road_points[["lngDeg", "latDeg"]]
road_points

Unnamed: 0,lngDeg,latDeg
0,-122.162258,37.453195
1,-122.143181,37.458301
2,-122.138501,37.459129
3,-122.141583,37.460767
4,-122.142622,37.461768
...,...,...
12795,-122.157943,37.449564
12796,-122.158041,37.449466
12797,-122.158688,37.448881
12798,-122.159135,37.448374


In [46]:
fig = px.scatter_mapbox(road_points,                              
                        # Here, plotly gets, (x,y) coordinates
                        lat="latDeg",
                        lon="lngDeg",
                            
                        zoom=9,
                        center={"lat":37.423576, "lon":-122.094132},
                        height=600,
                        width=800)
fig.update_layout(mapbox_style='stamen-terrain')
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
# fig.update_layout(title_text="{collection_name}")
fig.show()

In [26]:
def snap_to_grid(df, min_thr, max_thr):
    """
    Snap to grid if within a threshold.
    
    x, y are the predicted points.
    x_, y_ are the closest grid points.
    _x_, _y_ are the new predictions after post processing.
    """
    df['_latDeg_'] = df['latDeg']
    df['_lngDeg_'] = df['lngDeg']
    df["dist"] = calc_haversine(df["latDeg"], df["lngDeg"], df["latDeg_"], df["lngDeg_"])
    query = (min_thr <= df['dist'])&(df['dist'] < max_thr)
    df.loc[query, '_latDeg_'] = df.loc[query, 'latDeg_']
    df.loc[query, '_lngDeg_'] = df.loc[query, 'lngDeg_']

    df = df.drop(["latDeg_", "lngDeg_", "latDeg", "lngDeg"], axis=1).rename(columns={"_latDeg_":"latDeg", "_lngDeg_":"lngDeg"})
    return df

from scipy.spatial.distance import cdist

def add_xy(df):
    df['point'] = [(x, y) for x,y in zip(df['lngDeg'], df['latDeg'])]
    return df

def closest_point(point, points):
    """ Find closest point from a list of points. """
    return points[cdist([point], points).argmin()]


def matching_point(target_df, target_road_points=road_points):
    # place = target_df["place"].unique()[0]
    # target_road_points = road_points[road_points["place"]==place]
    target_df['matched_point'] = [closest_point(x, list(target_road_points['point'])) for x in target_df['point']]
    target_df['lngDeg_'] = target_df['matched_point'].apply(lambda x: x[0])
    target_df['latDeg_'] = target_df['matched_point'].apply(lambda x: x[1])
    return target_df


def apply_snap_to_grid(df, road_points, min_thr, max_thr):
    logger.info('[START] Snap to Grid')

    df = add_xy(df)
    road_points = add_xy(road_points)
    df_list = []
    ncpu = multiprocessing.cpu_count()
    for collection_name, target_df in tqdm(df.groupby("collectionName")):
        # if collection_type_dict[phase][collection_name] in target_type:
        if collection_name=="2021-03-25-US-PAO-1":
            target_df = target_df.reset_index(drop=True)

            k = round(len(target_df)/ncpu)
            if k == 0:
                k = 1
            target_dfs = [target_df.loc[i:i+k-1, :] for i in range(0, len(target_df), k)]
            with multiprocessing.Pool(processes=ncpu) as pool:
                dfs = pool.imap_unordered(matching_point, target_dfs)
                dfs = list(tqdm(dfs, total=len(target_dfs)))
                target_df = pd.concat(dfs)
            target_df = snap_to_grid(target_df, min_thr, max_thr)
            df_list.append(target_df)
        else:
            df_list.append(target_df)

    df = pd.concat(df_list).reset_index(drop=True)
    
    return df

In [45]:
def outlier_interpolate(input_df, thr=0.005):
    logger.info("outlier to nan")
    df_list = []
    for (collection_name, phone_name), df in input_df.groupby(["collectionName", "phoneName"]):
        if collection_name=="2021-03-25-US-PAO-1":
            df = df.sort_values("millisSinceGpsEpoch").reset_index(drop=True)
            df["dist"] = calc_haversine(df["latDeg"],df["lngDeg"],df["latDeg"].shift(1),df["lngDeg"].shift(1)).values
            df["delta_t"] = df["millisSinceGpsEpoch"] - df["millisSinceGpsEpoch"].shift(1)
            df["speed"] = df["dist"] / df["delta_t"]
            df["speed_pre_delta"] = (df["speed"] - df["speed"].shift(1)).abs()
            df["speed_post_delta"] = (df["speed"] - df["speed"].shift(-1)).abs()
            query = (df["speed_pre_delta"]>=thr)&(df["speed_post_delta"]>=thr)
            print(collection_name, phone_name, len(df[query]))
            df.loc[query, "latDeg"] = np.nan
            df.loc[query, "latDeg"] = np.nan
            # 時間に合わせて線形補間
            df['dummy_datetime'] = pd.to_datetime(df['millisSinceGpsEpoch'])
            df = df.set_index('dummy_datetime') 
            df = df.interpolate(method='time').reset_index(drop=True)
        df_list.append(df)
    output_df = pd.concat(df_list).reset_index(drop=True)
    return output_df

In [63]:
def check_time(_df):
    data = _df.copy().reset_index(drop=True)
    data['ms_dif'] = data['millisSinceGpsEpoch']-data['millisSinceGpsEpoch'].shift()
    mis_time = (len(data)-1-data['ms_dif'].value_counts()[1000.0])/len(data)
    return mis_time>0.0

def adjust_time(df, kf):
    data = df.copy().reset_index(drop=True)
    new_data = data.copy()
    t = 'millisSinceGpsEpoch'
    start = data['millisSinceGpsEpoch'].values[0]
    end = data['millisSinceGpsEpoch'].values[-1]
    ind = math.ceil((end-start)/1000)
    _data = pd.DataFrame(index=range(ind))
    _data['latDeg']=0
    _data['lngDeg']=0
    #_data['millisSinceGpsEpoch'] = 0
    _data['millisSinceGpsEpoch'] = start+1000*_data.index
    
    cnt = 0
    for i in _data.index:
        while _data.loc[i,t]<new_data.loc[cnt,t] or _data.loc[i,t]>new_data.loc[cnt+1,t]:
            cnt+=1
        if _data.loc[i,t]==new_data.loc[cnt,t]:
            _data.loc[i,'latDeg'] = new_data.loc[cnt,'latDeg']
            _data.loc[i,'lngDeg'] = new_data.loc[cnt,'lngDeg']
        else:
            w=(new_data.loc[cnt+1,t]-_data.loc[i,t])/(new_data.loc[cnt+1,t]-new_data.loc[cnt,t])
            if w<0 or w>1:
                print(w)
            _data.loc[i,'latDeg'] = new_data.loc[cnt,'latDeg']*w+new_data.loc[cnt+1,'latDeg']*(1-w)
            _data.loc[i,'lngDeg'] = new_data.loc[cnt,'lngDeg']*w+new_data.loc[cnt+1,'lngDeg']*(1-w)
    _data.head()
    __data = _data.loc[:,['latDeg', 'lngDeg']].to_numpy()
    __data = __data.reshape(1, len(__data), 2)
    smoothed = kf.smooth(__data)
    _data.loc[:,'latDeg_pred'] = smoothed.states.mean[0, :, 0]
    _data.loc[:,'lngDeg_pred'] = smoothed.states.mean[0, :, 1]
    _data.loc[:,'latDeg_cov_pred'] = smoothed.states.cov[0, :, 0,0]
    _data.loc[:,'lngDeg_cov_pred'] = smoothed.states.cov[0, :, 1,1]
    data['latDeg_cov'] = 1
    data['lngDeg_cov'] = 1

    cnt = 0
    for i in data.index:
        try:
            while data.loc[i,t]<_data.loc[cnt,t] or data.loc[i,t]>_data.loc[cnt+1,t]:
                cnt+=1
            if data.loc[i,t]==_data.loc[cnt,t]:
                data.loc[i,'latDeg'] = _data.loc[cnt,'latDeg_pred']
                data.loc[i,'lngDeg'] = _data.loc[cnt,'lngDeg_pred']
                data.loc[i,'latDeg_cov'] = _data.loc[cnt,'latDeg_cov_pred']
                data.loc[i,'lngDeg_cov'] = _data.loc[cnt,'lngDeg_cov_pred']
            else:
                w=(_data.loc[cnt+1,t]-data.loc[i,t])/(_data.loc[cnt+1,t]-_data.loc[cnt,t])
                if w<0 or w>1:
                    print(w)
                data.loc[i,'latDeg'] = _data.loc[cnt,'latDeg_pred']*w+_data.loc[cnt+1,'latDeg_pred']*(1-w)
                data.loc[i,'lngDeg'] = _data.loc[cnt,'lngDeg_pred']*w+_data.loc[cnt+1,'lngDeg_pred']*(1-w)
                data.loc[i,'latDeg_cov'] = _data.loc[cnt,'latDeg_cov_pred']*w+_data.loc[cnt+1,'latDeg_cov_pred']*(1-w)
                data.loc[i,'lngDeg_cov'] = _data.loc[cnt,'lngDeg_cov_pred']*w+_data.loc[cnt+1,'lngDeg_cov_pred']*(1-w)
        except:
            pass
    return data

In [81]:
def stop_mean(input_df, max_speed_thr = 0.95, min_second_thr=0, max_second_thr = 2, phase="train"):
    """
    # 予測した速度がmax_speed_thr以下のものがmax_second_thr(s)以上連続する区間を停止区間とみなして平均をとる。
    TODO 道路タイプで適用可否決めたほうがいいかも。都市のところは開始、終了だけにするとか。
    """
    logger.info('[START] stop mean')
    use_col = input_df.columns
    output_df = pd.DataFrame()
    for collection_name, df in input_df.groupby(['collectionName']):
        if collection_name == "2021-03-25-US-PAO-1":
            df = df.sort_values('millisSinceGpsEpoch')
            df['flag'] = df['pred_speedMps'] < max_speed_thr
            df['stop_group'] = (df['flag'] != df['flag'].shift()).cumsum()
            df['num_flaged_data'] = df.groupby('stop_group')['flag'].transform(sum)  # stop group内でのflag(stopしていると判定された)の数
            df.loc[(min_second_thr <= df['num_flaged_data'])&(df['num_flaged_data'] < max_second_thr), 'stop_group'] = 0

            for i in df['stop_group'].unique():
                if i == 0:
                    continue

                # when start or end
                if (i == 1)or(i == df['stop_group'].unique()[-1]):
                    df.loc[df['stop_group']==i, 'latDeg'] = df.loc[df['stop_group']==i, 'latDeg'].mean()
                    df.loc[df['stop_group']==i, 'lngDeg'] = df.loc[df['stop_group']==i, 'lngDeg'].mean()

        output_df = pd.concat([output_df, df])
    return output_df[use_col].sort_index()

In [64]:
import simdkalman

def apply_kf_smoothing(input_df):
    logger.info('[START] Kalman Smoothing')
    # define kf model
    df_list = []
    for (collection_name,phone_name), df in input_df.groupby(["collectionName", "phoneName"]):
        df = df.sort_values("millisSinceGpsEpoch")
        if collection_name == "2021-03-25-US-PAO-1":
            # score: 2.8204355271575277 -> 2.8193315973589166
            # Best hyperparameters: {'T': 2.0, 'p_noise': 2.705460166533897e-07, 'o_noise': 1.3966292122575907e-09}
            # defaultを使用
            T = 1.0
            p_noise = 1e-9
            o_noise = 1e-9

            state_transition = np.array([[1, 0, T, 0, 0.5 * T ** 2, 0], [0, 1, 0, T, 0, 0.5 * T ** 2], [0, 0, 1, 0, T, 0],
                                        [0, 0, 0, 1, 0, T], [0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1]])
            process_noise = np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6]) + np.ones((6, 6)) * p_noise
            observation_model = np.array([[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0]])    
            observation_noise = np.diag([5e-5, 5e-5]) + np.ones((2, 2)) * o_noise


            kf = simdkalman.KalmanFilter(
                state_transition=state_transition,
                process_noise=process_noise,
                observation_model=observation_model,
                observation_noise=observation_noise)

            if check_time(df):
                _df = adjust_time(df, kf)
                df['latDeg'] = _df['latDeg'].values
                df['lngDeg'] = _df['lngDeg'].values
                df['latDeg_cov'] = _df['latDeg_cov'].values
                df['lngDeg_cov'] = _df['lngDeg_cov'].values
            else:
                data = df[['latDeg', 'lngDeg']].to_numpy()
                data = data.reshape(1, len(data), 2)
                smoothed = kf.smooth(data)
                df['latDeg'] = smoothed.states.mean[0, :, 0]
                df['latDeg_cov'] = smoothed.states.cov[0, :, 0,0]
                df['lngDeg'] = smoothed.states.mean[0, :, 1]
                df['lngDeg_cov'] = smoothed.states.cov[0, :, 1,1]
            df_list.append(df)
        else:
            df_list.append(df)
    output_df = pd.concat(df_list).reset_index(drop=True) # .sort_index()
    return output_df

In [94]:
base_df = pd.read_csv("../../input/google-smartphone-decimeter-challenge/baseline_locations_test_with_speed.csv")[["phone", "millisSinceGpsEpoch", "pred_speedMps"]]
base_df

Unnamed: 0,phone,millisSinceGpsEpoch,pred_speedMps
0,2020-05-15-US-MTV-1_Pixel4,1273608785432,-0.000374
1,2020-05-15-US-MTV-1_Pixel4,1273608786432,0.006241
2,2020-05-15-US-MTV-1_Pixel4,1273608787432,0.015441
3,2020-05-15-US-MTV-1_Pixel4,1273608788432,0.035798
4,2020-05-15-US-MTV-1_Pixel4,1273608789432,-0.033386
...,...,...,...
91481,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763185000,0.004456
91482,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763186000,0.025227
91483,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763187000,0.010388
91484,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763188000,-0.103886


In [97]:
test_df = pd.read_csv("exp102_blend_submission2_additional_snap.csv")
test_df["collectionName"] = test_df["phone"].str.split("_", expand=True)[0]
test_df["phoneName"] = test_df["phone"].str.split("_", expand=True)[1]
test_df = test_df.merge(base_df, on=["phone", "millisSinceGpsEpoch"])
test_df

Unnamed: 0,phone,millisSinceGpsEpoch,latDeg,lngDeg,collectionName,phoneName,pred_speedMps
0,2020-05-15-US-MTV-1_Pixel4,1273608785432,37.416583,-122.082064,2020-05-15-US-MTV-1,Pixel4,-0.000374
1,2020-05-15-US-MTV-1_Pixel4,1273608786432,37.416583,-122.082064,2020-05-15-US-MTV-1,Pixel4,0.006241
2,2020-05-15-US-MTV-1_Pixel4,1273608787432,37.416583,-122.082064,2020-05-15-US-MTV-1,Pixel4,0.015441
3,2020-05-15-US-MTV-1_Pixel4,1273608788432,37.416583,-122.082064,2020-05-15-US-MTV-1,Pixel4,0.035798
4,2020-05-15-US-MTV-1_Pixel4,1273608789432,37.416583,-122.082064,2020-05-15-US-MTV-1,Pixel4,-0.033386
...,...,...,...,...,...,...,...
91481,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763185000,37.334563,-121.899467,2021-04-29-US-SJC-3,SamsungS20Ultra,0.004456
91482,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763186000,37.334563,-121.899467,2021-04-29-US-SJC-3,SamsungS20Ultra,0.025227
91483,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763187000,37.334563,-121.899467,2021-04-29-US-SJC-3,SamsungS20Ultra,0.010388
91484,2021-04-29-US-SJC-3_SamsungS20Ultra,1303763188000,37.334563,-121.899467,2021-04-29-US-SJC-3,SamsungS20Ultra,-0.103886


In [98]:
test_df["collectionName"].unique()

array(['2020-05-15-US-MTV-1', '2020-05-28-US-MTV-1',
       '2020-05-28-US-MTV-2', '2020-06-04-US-MTV-2',
       '2020-06-10-US-MTV-1', '2020-06-10-US-MTV-2',
       '2020-08-03-US-MTV-2', '2020-08-13-US-MTV-1',
       '2021-03-16-US-MTV-2', '2021-03-16-US-RWC-2',
       '2021-03-25-US-PAO-1', '2021-04-02-US-SJC-1',
       '2021-04-08-US-MTV-1', '2021-04-21-US-MTV-1',
       '2021-04-22-US-SJC-2', '2021-04-26-US-SVL-2',
       '2021-04-28-US-MTV-2', '2021-04-29-US-MTV-2',
       '2021-04-29-US-SJC-3'], dtype=object)

In [99]:
fig = px.scatter_mapbox(test_df[test_df["collectionName"]=="2021-03-25-US-PAO-1"],                              
                        # Here, plotly gets, (x,y) coordinates
                        lat="latDeg",
                        lon="lngDeg",
                            
                        zoom=9,
                        center={"lat":37.423576, "lon":-122.094132},
                        height=600,
                        width=800)
fig.update_layout(mapbox_style='stamen-terrain')
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
# fig.update_layout(title_text="{collection_name}")
fig.show()

In [100]:
test_df2 = apply_snap_to_grid(test_df, road_points, min_thr=2, max_thr=10)
test_df2 = outlier_interpolate(test_df2)
test_df2 = apply_kf_smoothing(test_df2)
test_df2 = stop_mean(test_df2)

[START] Snap to Grid


  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

outlier to nan
[START] Kalman Smoothing
2021-03-25-US-PAO-1 Mi8 17
2021-03-25-US-PAO-1 Pixel4 20
2021-03-25-US-PAO-1 Pixel4Modded 18
2021-03-25-US-PAO-1 Pixel5 17
2021-03-25-US-PAO-1 SamsungS20Ultra 17
[START] stop mean


In [101]:
test_df["type"] = "not snap"
test_df2["type"] = "snap"
df = pd.concat([test_df, test_df2])

In [102]:
fig = px.scatter_mapbox(df[df["collectionName"]=="2021-03-25-US-PAO-1"],                              
                        # Here, plotly gets, (x,y) coordinates
                        lat="latDeg",
                        lon="lngDeg",
                        color="type",
                        labels="type",
                        zoom=9,
                        center={"lat":37.423576, "lon":-122.094132},
                        height=600,
                        width=800)
fig.update_layout(mapbox_style='stamen-terrain')
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
# fig.update_layout(title_text="{collection_name}")
fig.show()

In [103]:
calc_haversine(test_df["latDeg"],test_df["lngDeg"],test_df2["latDeg"],test_df2["lngDeg"]).mean()

0.13873434631643458

In [106]:
sub_df = test_df.copy()
sub_df["latDeg"] = test_df["latDeg"]*0.5 + test_df2["latDeg"]*0.5
sub_df["lngDeg"] = test_df["lngDeg"]*0.5 + test_df2["lngDeg"]*0.5

In [107]:
calc_haversine(test_df["latDeg"],test_df["lngDeg"],sub_df["latDeg"],sub_df["lngDeg"]).mean()

0.06936717202717171

In [112]:
sub_df[["phone", "millisSinceGpsEpoch", "latDeg", "lngDeg"]].to_csv("exp102_blend_submission2_additional_snap_v3.csv", index=False)

In [108]:
fig = px.scatter_mapbox(sub_df[sub_df["collectionName"]=="2021-03-25-US-PAO-1"],                              
                        # Here, plotly gets, (x,y) coordinates
                        lat="latDeg",
                        lon="lngDeg",
                        color="type",
                        labels="type",
                        zoom=9,
                        center={"lat":37.423576, "lon":-122.094132},
                        height=600,
                        width=800)
fig.update_layout(mapbox_style='stamen-terrain')
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
# fig.update_layout(title_text="{collection_name}")
fig.show()