<a href="https://colab.research.google.com/github/kyochanpy/Google_Smartphone_Decimeter_Challenge/blob/main/PP/mean_prediction_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
from pathlib import Path
import plotly
import plotly.express as px
from glob import glob

In [None]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points using the haversine formula.

    All four arguments are angles in degrees and may be scalars or
    array-likes (NumPy arrays, pandas Series) that broadcast together.

    Returns:
        The distance along a sphere of radius ``RADIUS``, in the same unit
        as ``RADIUS`` (presumably metres -- callers print it as "meter").
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    d = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Floating-point rounding can push `d` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN.
    d = np.clip(d, 0.0, 1.0)
    dist = 2 * RADIUS * np.arcsin(d**0.5)
    return dist

In [None]:
def visualize_trafic(df, center, zoom=9):
    """Show GPS fixes on an interactive Plotly map, coloured by ``phoneName``.

    Args:
        df: DataFrame with ``latDeg``, ``lngDeg`` and ``phoneName`` columns.
        center: dict with ``"lat"`` and ``"lon"`` keys giving the map centre.
        zoom: initial map zoom level.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    fig = px.scatter_mapbox(
        df,
        lat="latDeg",
        lon="lngDeg",
        color="phoneName",
        # `labels` must be a dict mapping column names to display labels.
        # The bare string "phoneName" that used to be passed here is not a
        # valid mapping, so the argument is omitted.
        zoom=zoom,
        center=center,
        height=600,
        width=800,
    )
    # NOTE(review): with a zero top margin the title may not be visible, and
    # the 'stamen-terrain' tiles come from a third-party server -- confirm
    # both still render as intended.
    fig.update_layout(
        mapbox_style='stamen-terrain',
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        title_text="GPS trafic",
    )
    fig.show()
    
def visualize_collection(df, collection):
    """Map every row of one collection, centred on its mean position.

    Args:
        df: DataFrame with ``collectionName``, ``latDeg`` and ``lngDeg``
            columns (plus whatever ``visualize_trafic`` needs).
        collection: value of ``collectionName`` to select.
    """
    selected = df.loc[df['collectionName'] == collection].copy()
    # The map is centred on the arithmetic mean of the selected fixes.
    centroid = {
        "lat": selected['latDeg'].mean(),
        "lon": selected['lngDeg'].mean(),
    }
    visualize_trafic(selected, centroid)

In [None]:
path = Path("/content/drive/MyDrive/GSDC")
test_base = pd.read_csv(path / "baseline_locations_test.csv")
sub = pd.read_csv(path / "sample_submission.csv")

# Materialise (and sort) the ground-truth file list: tqdm can then size its
# progress bar from the data instead of a hard-coded count, the list can be
# re-used after the loop, and the load order is deterministic.
truths = sorted((path / "train").rglob("ground_truth.csv"))

df_list = []
cols = ["collectionName", "phoneName", "millisSinceGpsEpoch", "latDeg", "lngDeg"]

for t in tqdm(truths):
    df_phone = pd.read_csv(t, usecols=cols)
    df_list.append(df_phone)
df_truth = pd.concat(df_list, ignore_index=True)

train_base = pd.read_csv(path / "baseline_locations_train.csv", usecols=cols)
# Pair every baseline fix with its ground-truth fix; the first three entries
# of `cols` (collection, phone, epoch) form the join key.
all_df = df_truth.merge(train_base, how="inner", on=cols[:3], suffixes=("_truth", '_train_base'))

HBox(children=(FloatProgress(value=0.0, max=73.0), HTML(value='')))




In [None]:
def make_lerp_data(input_df):
    """Linearly interpolate positions for epochs a phone did not report.

    For every collection, each phone is expanded to all epochs observed by
    any phone of that collection. A missing position is filled by linear
    interpolation in time between the rows immediately before and after it
    (after sorting by phone and epoch).

    Only rows whose two direct neighbours belong to the same phone AND both
    carry a position are produced; longer gaps and gaps at the start/end of a
    track yield NaN and are dropped.

    Args:
        input_df: frame with ``collectionName``, ``phoneName``,
            ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg`` columns.

    Returns:
        A frame containing only the interpolated rows, restricted to the
        columns of ``input_df``.
    """
    original_columns = input_df.columns
    join_keys = ["collectionName", "millisSinceGpsEpoch", "phoneName"]

    # Every (epoch, phone) combination within each collection.
    epochs = input_df[["collectionName", "millisSinceGpsEpoch"]].drop_duplicates()
    phones = input_df[["collectionName", "phoneName"]].drop_duplicates()
    grid = epochs.merge(phones, on="collectionName", how="outer")

    # Attach the known positions; combinations without a fix stay NaN.
    track = grid.merge(input_df, on=join_keys, how="left")
    track["phone"] = track["collectionName"] + "_" + track["phoneName"]
    track = track.sort_values(["phone", "millisSinceGpsEpoch"])

    # Previous ("_pre") and next ("_pro") row values, side by side.
    for name in ("latDeg", "lngDeg", "phone", "millisSinceGpsEpoch"):
        track[name + "_pre"] = track[name].shift(1)
        track[name + "_pro"] = track[name].shift(-1)

    is_gap = track["latDeg"].isnull()
    prev_same_phone = track["phone"] == track["phone_pre"]
    next_same_phone = track["phone"] == track["phone_pro"]
    gaps = track[is_gap & prev_same_phone & next_same_phone].copy()

    # Fraction of the way from the previous to the next epoch.
    elapsed = gaps["millisSinceGpsEpoch"] - gaps["millisSinceGpsEpoch_pre"]
    span = gaps["millisSinceGpsEpoch_pro"] - gaps["millisSinceGpsEpoch_pre"]
    weight = elapsed / span

    for axis in ("latDeg", "lngDeg"):
        before = gaps[axis + "_pre"]
        after = gaps[axis + "_pro"]
        gaps[axis] = before + ((after - before) * weight)

    # A NaN here means at least one neighbour had no position either.
    gaps = gaps[gaps["latDeg"].notnull()]

    return gaps[original_columns]

In [None]:
def calc_mean_pred(input_df, lerp_df):
    """Replace each fix by the mean position of its collection at that epoch.

    The rows of ``input_df`` and ``lerp_df`` are pooled, averaged per
    (``collectionName``, ``millisSinceGpsEpoch``), and the averages are joined
    back onto the rows of ``input_df``.

    Args:
        input_df: frame with ``collectionName``, ``phoneName``,
            ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg`` columns.
            It is not modified.
        lerp_df: additional rows with the same columns that take part in the
            average but do not appear in the result.

    Returns:
        One row per row of ``input_df`` (same order) with the columns
        ``collectionName``, ``phoneName``, ``millisSinceGpsEpoch``, the
        averaged ``latDeg``/``lngDeg`` and ``phone``
        (``collectionName + "_" + phoneName``).
    """
    group_keys = ["collectionName", "millisSinceGpsEpoch"]

    # The caller's frame is deliberately left untouched: the "phone" column
    # is only needed on the returned frame, not for the averaging.
    add_lerp = pd.concat([input_df, lerp_df])
    mean_pred_result = add_lerp.groupby(group_keys)[["latDeg", "lngDeg"]].mean().reset_index()

    output_df = input_df[["collectionName", "phoneName", "millisSinceGpsEpoch"]].merge(
        mean_pred_result, on=group_keys, how="left"
    )
    output_df["phone"] = output_df["collectionName"] + "_" + output_df["phoneName"]
    return output_df

In [None]:
# Interpolate the epochs each phone is missing, then average all phones of a
# collection at every epoch to get the smoothed training predictions.
train_lerp = make_lerp_data(train_base)
train_mean_pred = calc_mean_pred(train_base, train_lerp)

In [None]:
# Display the averaged training predictions as the cell output.
train_mean_pred

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,phone
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423575,-122.094091,2020-05-14-US-MTV-1_Pixel4
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423578,-122.094101,2020-05-14-US-MTV-1_Pixel4
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423573,-122.094111,2020-05-14-US-MTV-1_Pixel4
3,2020-05-14-US-MTV-1,Pixel4,1273529466442,37.423583,-122.094121,2020-05-14-US-MTV-1_Pixel4
4,2020-05-14-US-MTV-1,Pixel4,1273529467442,37.423576,-122.094133,2020-05-14-US-MTV-1_Pixel4
...,...,...,...,...,...,...
131337,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760315000,37.334460,-121.899600,2021-04-29-US-SJC-2_SamsungS20Ultra
131338,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760316000,37.334472,-121.899583,2021-04-29-US-SJC-2_SamsungS20Ultra
131339,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760317000,37.334491,-121.899597,2021-04-29-US-SJC-2_SamsungS20Ultra
131340,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760318000,37.334495,-121.899583,2021-04-29-US-SJC-2_SamsungS20Ultra


In [None]:
# Overlay three tracks for one collection on the same map: the raw baseline,
# the averaged prediction ("_MEAN") and the ground truth ("_GT"). The suffix
# on phoneName gives each source its own colour in the legend.
tmp1 = train_base.copy()
tmp2 = train_mean_pred.assign(phoneName=train_mean_pred['phoneName'] + '_MEAN')
tmp3 = df_truth.assign(phoneName=df_truth['phoneName'] + '_GT')
tmp = pd.concat([tmp1, tmp2, tmp3])
visualize_collection(tmp, '2020-05-14-US-MTV-1')

In [None]:
def check_score(input_df: pd.DataFrame) -> pd.DataFrame:
    """Print error statistics of predictions against ground truth.

    Args:
        input_df: frame with predicted ``latDeg``/``lngDeg``, ground-truth
            ``t_latDeg``/``t_lngDeg`` and a ``phone`` column.

    Returns:
        A copy of ``input_df`` with an added ``meter`` column holding the
        haversine distance between prediction and ground truth.

    Two numbers are printed: the mean of ``meter`` over all rows, and the
    mean over all phones of each phone's 50th and 95th percentile of
    ``meter``.
    """
    output_df = input_df.copy()

    # calc_haversine is built from NumPy ufuncs, so whole columns can be
    # passed at once instead of calling it row by row via DataFrame.apply.
    output_df['meter'] = calc_haversine(
        input_df['latDeg'], input_df['lngDeg'],
        input_df['t_latDeg'], input_df['t_lngDeg'],
    )

    meter_score = output_df['meter'].mean()
    print(f'error meter: {meter_score}')

    # One pass over the groups instead of one boolean mask per phone;
    # sort=False keeps the phones in order of first appearance.
    # Rows with a missing `phone` are skipped by groupby.
    scores = []
    for _, meters in output_df.groupby('phone', sort=False)['meter']:
        scores.append(np.percentile(meters, 50))
        scores.append(np.percentile(meters, 95))

    # An empty frame has no phones; report NaN rather than dividing by zero.
    score = sum(scores) / len(scores) if scores else float('nan')
    print(f'score: {score}')

    return output_df

In [None]:
def get_groundtruth(path: Path) -> pd.DataFrame:
    """Load every ``train/*/*/ground_truth.csv`` below ``path`` into one frame.

    The columns ``latDeg``, ``lngDeg`` and ``heightAboveWgs84EllipsoidM`` are
    replaced by ``t_``-prefixed copies appended at the end of the frame, so
    the result can be merged with prediction frames that use the unprefixed
    names.

    Args:
        path: dataset root directory containing the ``train`` folder.

    Returns:
        The concatenated ground-truth rows with a fresh 0..n-1 index. Files
        are read in sorted path order so the row order is deterministic.

    Raises:
        FileNotFoundError: if no ground-truth file is found below ``path``.
    """
    csv_paths = sorted(glob(str(path / 'train/*/*/ground_truth.csv')))
    if not csv_paths:
        raise FileNotFoundError(
            f"no ground_truth.csv found under {path / 'train'}"
        )

    # Read everything first and concatenate once; concatenating inside the
    # loop re-copies all previously loaded rows on every iteration.
    frames = [pd.read_csv(csv_path) for csv_path in csv_paths]
    output_df = pd.concat(frames).reset_index(drop=True)

    _columns = ['latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM']
    output_df[['t_'+col for col in _columns]] = output_df[_columns]
    output_df = output_df.drop(columns=_columns)
    return output_df

In [None]:
# NOTE(review): the saved output of this cell shows ground-truth (`t_*`) and
# `meter` columns, which `train_base` only gains in the reload/merge/score
# cell further down -- this cell was last executed out of order. On a fresh
# top-to-bottom run it shows the frame as it exists at this point instead.
train_base

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,timeSinceFirstFixSeconds,hDop,vDop,speedMps,courseDegree,t_latDeg,t_lngDeg,t_heightAboveWgs84EllipsoidM,meter
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423575,-122.094091,-34.06,2020-05-14-US-MTV-1_Pixel4,551.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.21,3.586842
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423578,-122.094101,-33.29,2020-05-14-US-MTV-1_Pixel4,552.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.21,2.745901
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423573,-122.094111,-30.99,2020-05-14-US-MTV-1_Pixel4,553.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.21,1.888409
3,2020-05-14-US-MTV-1,Pixel4,1273529466442,37.423583,-122.094121,-32.83,2020-05-14-US-MTV-1_Pixel4,554.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.20,1.213483
4,2020-05-14-US-MTV-1,Pixel4,1273529467442,37.423579,-122.094114,-34.49,2020-05-14-US-MTV-1_Pixel4,555.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.20,1.650722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131337,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760315000,37.334460,-121.899600,-8.09,2021-04-29-US-SJC-2_SamsungS20Ultra,2632.00,1.1,0.0,0.0,60.0,37.334475,-121.899613,56.62,2.052491
131338,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760316000,37.334472,-121.899583,-7.59,2021-04-29-US-SJC-2_SamsungS20Ultra,2633.00,1.1,0.0,0.0,60.0,37.334475,-121.899613,56.62,2.671673
131339,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760317000,37.334491,-121.899597,-8.35,2021-04-29-US-SJC-2_SamsungS20Ultra,2634.00,0.9,0.0,0.0,60.0,37.334475,-121.899613,56.62,2.287458
131340,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760318000,37.334495,-121.899583,-8.73,2021-04-29-US-SJC-2_SamsungS20Ultra,2635.00,1.0,0.0,0.0,60.0,37.334475,-121.899613,56.63,3.454306


In [None]:
# Reload the baseline files with all of their columns; this replaces the
# `train_base` / `test_base` frames loaded earlier in the notebook.
base = Path('/content/drive/MyDrive/GSDC')
train_base = pd.read_csv(base / 'baseline_locations_train.csv')
test_base = pd.read_csv(base / 'baseline_locations_test.csv')

# Merge the ground truth onto the baseline rows (pandas' default inner join
# on collection, phone and epoch).
train_base = train_base.merge(
    get_groundtruth(base),
    on=['collectionName', 'phoneName', 'millisSinceGpsEpoch']
)
sub = pd.read_csv(base / 'sample_submission.csv')
# Print the baseline error and keep the frame returned by check_score, which
# carries the per-row `meter` column.
train_base = check_score(train_base)

error meter: 3.846848374990627
score: 5.287970649084159


In [None]:
# Attach the ground truth to the averaged predictions so they can be scored.
# NOTE(review): this overwrites `train_mean_pred` with its own merge result,
# so the cell is not idempotent -- running it a second time merges the
# ground-truth columns again and pandas renames the clashing columns with
# _x/_y suffixes. It also re-reads every ground-truth CSV via
# get_groundtruth(base).
train_mean_pred = train_mean_pred.merge(
    get_groundtruth(base),
    on=['collectionName', 'phoneName', 'millisSinceGpsEpoch']
)

In [None]:
# Print the error of the averaged predictions; the returned frame (with the
# per-row `meter` column) is displayed as the cell output.
check_score(train_mean_pred)

error meter: 3.5183998771309057
score: 4.771303002034734


Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,phone,timeSinceFirstFixSeconds,hDop,vDop,speedMps,courseDegree,t_latDeg,t_lngDeg,t_heightAboveWgs84EllipsoidM,meter
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423575,-122.094091,2020-05-14-US-MTV-1_Pixel4,551.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.21,3.586842
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423578,-122.094101,2020-05-14-US-MTV-1_Pixel4,552.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.21,2.745901
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423573,-122.094111,2020-05-14-US-MTV-1_Pixel4,553.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.21,1.888409
3,2020-05-14-US-MTV-1,Pixel4,1273529466442,37.423583,-122.094121,2020-05-14-US-MTV-1_Pixel4,554.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.20,1.213483
4,2020-05-14-US-MTV-1,Pixel4,1273529467442,37.423576,-122.094133,2020-05-14-US-MTV-1_Pixel4,555.44,1.1,0.0,0.0,12.7,37.423576,-122.094132,33.20,0.122650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131337,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760315000,37.334460,-121.899600,2021-04-29-US-SJC-2_SamsungS20Ultra,2632.00,1.1,0.0,0.0,60.0,37.334475,-121.899613,56.62,2.052491
131338,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760316000,37.334472,-121.899583,2021-04-29-US-SJC-2_SamsungS20Ultra,2633.00,1.1,0.0,0.0,60.0,37.334475,-121.899613,56.62,2.671673
131339,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760317000,37.334491,-121.899597,2021-04-29-US-SJC-2_SamsungS20Ultra,2634.00,0.9,0.0,0.0,60.0,37.334475,-121.899613,56.62,2.287458
131340,2021-04-29-US-SJC-2,SamsungS20Ultra,1303760318000,37.334495,-121.899583,2021-04-29-US-SJC-2_SamsungS20Ultra,2635.00,1.0,0.0,0.0,60.0,37.334475,-121.899613,56.63,3.454306


In [None]:
# Apply the same interpolate-then-average procedure to the test baseline.
test_lerp = make_lerp_data(test_base)
test_mean_pred = calc_mean_pred(test_base, test_lerp)

In [None]:
# The columns are copied by index alignment, so a length mismatch would
# silently leave NaN in the submission -- fail loudly instead.
# NOTE(review): this also assumes sample_submission.csv lists its rows in the
# same order as baseline_locations_test.csv -- confirm.
assert len(sub) == len(test_mean_pred), "submission and prediction row counts differ"
sub["latDeg"] = test_mean_pred["latDeg"]
sub["lngDeg"] = test_mean_pred["lngDeg"]
# The extension was previously misspelled as ".cev".
sub.to_csv("submission_10_02.csv", index=False)