In [4]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 100)
import scipy.optimize as opt
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import multiprocessing
from pathlib import Path
import pathlib
from datetime import datetime
from tqdm.notebook import tqdm
INPUT = '../../input/google-smartphone-decimeter-challenge/'
root = Path(INPUT)

In [5]:
EXP_NAME = str(Path().resolve()).split('/')[-1]

In [6]:
def init_logger(log_file='logger.log'):
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger
    
today = datetime.now().strftime('%Y-%m-%d')
logger = init_logger(log_file=f'./{today}.log')
logger.info('Start Logging...')

Start Logging...
Start Logging...


In [7]:
def ecef2lla(x, y, z):
    # x, y and z are scalars or vectors in meters
    x = np.array([x]).reshape(np.array([x]).shape[-1], 1)
    y = np.array([y]).reshape(np.array([y]).shape[-1], 1)
    z = np.array([z]).reshape(np.array([z]).shape[-1], 1)

    a=6378137
    a_sq=a**2
    e = 8.181919084261345e-2
    e_sq = 6.69437999014e-3

    f = 1/298.257223563
    b = a*(1-f)

    # calculations:
    r = np.sqrt(x**2 + y**2)
    ep_sq  = (a**2-b**2)/b**2
    ee = (a**2-b**2)
    f = (54*b**2)*(z**2)
    g = r**2 + (1 - e_sq)*(z**2) - e_sq*ee*2
    c = (e_sq**2)*f*r**2/(g**3)
    s = (1 + c + np.sqrt(c**2 + 2*c))**(1/3.)
    p = f/(3.*(g**2)*(s + (1./s) + 1)**2)
    q = np.sqrt(1 + 2*p*e_sq**2)
    r_0 = -(p*e_sq*r)/(1+q) + np.sqrt(0.5*(a**2)*(1+(1./q)) - p*(z**2)*(1-e_sq)/(q*(1+q)) - 0.5*p*(r**2))
    u = np.sqrt((r - e_sq*r_0)**2 + z**2)
    v = np.sqrt((r - e_sq*r_0)**2 + (1 - e_sq)*z**2)
    z_0 = (b**2)*z/(a*v)
    h = u*(1 - b**2/(a*v))
    phi = np.arctan((z + ep_sq*z_0)/r)
    lambd = np.arctan2(y, x)

    return phi*180/np.pi, lambd*180/np.pi, h

def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
      np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist

In [8]:
def make_gt(recal):
    if recal:
        p = pathlib.Path(INPUT)
        gt_files = list(p.glob('train/*/*/ground_truth.csv'))
        print('ground_truth.csv count : ', len(gt_files))

        gts = []
        for gt_file in tqdm(gt_files):
            gts.append(pd.read_csv(gt_file))
        ground_truth = pd.concat(gts)
        ground_truth.to_csv(root / 'gt.csv',index=False)
    else:
        ground_truth = pd.read_csv(root / 'gt.csv')
    return ground_truth
    
gt = make_gt(recal=True)

ground_truth.csv count :  73


  0%|          | 0/73 [00:00<?, ?it/s]

In [9]:
def percentile50(x):
    return np.percentile(x, 50)
def percentile95(x):
    return np.percentile(x, 95)

def get_train_score_df(df, gt):
    gt = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    df = df.merge(gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='left')
    # calc_distance_error
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])
    return df

def get_train_score(df, gt):
    gt = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    df = df.merge(gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
    # calc_distance_error
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])
    # calc_evaluate_score
    # df['phone'] = df['collectionName'] + '_' + df['phoneName']
    res = df.groupby(['collectionName', 'phoneName'])['err'].agg([percentile50, percentile95]).reset_index()
    res['score'] = (res['percentile50'] + res['percentile95']) / 2 
    return res

In [10]:
def gnss_log_to_dataframes(path):
    '''Load GNSS Log'''
    print('Loading ' + path, flush = True)
    gnss_section_names = {'Raw', 'UncalAccel', 'UncalGyro', 'UncalMag', 'Fix', 'Status', 'OrientationDeg'}
    with open(path) as f_open:
        datalines = f_open.readlines()

    datas = {k: [] for k in gnss_section_names}
    gnss_map = {k: [] for k in gnss_section_names}
    for dataline in datalines:
        is_header = dataline.startswith('#')
        dataline = dataline.strip('#').strip().split(',')
        # skip over notes, version numbers, etc
        if is_header and dataline[0] in gnss_section_names:
            gnss_map[dataline[0]] = dataline[1:]
        elif not is_header:
            datas[dataline[0]].append(dataline[1:])

    results = dict()
    for k, v in datas.items():
        results[k] = pd.DataFrame(v, columns=gnss_map[k])
    # pandas doesn't properly infer types from these lists by default
    for k, df in results.items():
        for col in df.columns:
            if col == 'CodeType':
                continue
            results[k][col] = pd.to_numeric(results[k][col])

    return results

In [11]:
# apply tips1
# _derivedのmillisSinceGpsEpochが次のepoch(rawの)を示しているので元に戻す
def apply_tips1(raw_df, derived_df):
    # Create a new column in df_raw that corresponds to derivedの['millisSinceGpsEpoch']
    raw_df['millisSinceGpsEpoch'] = np.floor((raw_df['TimeNanos'] - raw_df['FullBiasNanos']) / 1000000.0).astype(int)
        
    # Change each value in df_derived['MillisSinceGpsEpoch'] to be the prior epoch.
    raw_timestamps = raw_df['millisSinceGpsEpoch'].unique()
    derived_timestamps = derived_df['millisSinceGpsEpoch'].unique()

    # The timestamps in derived are one epoch ahead. We need to map each epoch
    # in derived to the prior one (in Raw).
    indexes = np.searchsorted(raw_timestamps, derived_timestamps)
    from_t_to_fix_derived = dict(zip(derived_timestamps, raw_timestamps[indexes-1]))
    derived_df['millisSinceGpsEpoch'] = np.array(list(map(lambda v: from_t_to_fix_derived[v], derived_df['millisSinceGpsEpoch'])))
    return derived_df

# apply tips5
# derivedの重複している行を削除
def apply_tips5(derived_df):
    delta_millis = derived_df['millisSinceGpsEpoch'] - derived_df['receivedSvTimeInGpsNanos'] / 1e6
    where_good_signals = (delta_millis > 0) & (delta_millis < 300)
    return derived_df[where_good_signals]

In [12]:
output_dir = f'../../input/baseline_{EXP_NAME}/'
os.makedirs(output_dir, exist_ok=True)

In [13]:
base_train = pd.read_csv(root/ 'baseline_locations_train_with_speed.csv')
base_train.loc[:,['px','py','pz']] = 0
base_train.head()

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,pred_speedMps,px,py,pz
0,2020-05-14-US-MTV-1,Pixel4,1273529463442,37.423575,-122.094091,-34.06,2020-05-14-US-MTV-1_Pixel4,0.170657,0,0,0
1,2020-05-14-US-MTV-1,Pixel4,1273529464442,37.423578,-122.094101,-33.29,2020-05-14-US-MTV-1_Pixel4,-0.0903,0,0,0
2,2020-05-14-US-MTV-1,Pixel4,1273529465442,37.423573,-122.094111,-30.99,2020-05-14-US-MTV-1_Pixel4,0.050574,0,0,0
3,2020-05-14-US-MTV-1,Pixel4,1273529466442,37.423583,-122.094121,-32.83,2020-05-14-US-MTV-1_Pixel4,-0.007225,0,0,0
4,2020-05-14-US-MTV-1,Pixel4,1273529467442,37.423579,-122.094114,-34.49,2020-05-14-US-MTV-1_Pixel4,0.05152,0,0,0


In [14]:
# kwargsはdict
def distance(x, **kwargs):
    # weight = kwargs["uncertaintyWeight"]
    weight = np.array(kwargs["uncertaintyWeight"])
    prm = kwargs["correctedPrM"]
    
    satx = kwargs['xSatPosMRotated'] - x[0]
    saty = kwargs['ySatPosMRotated'] - x[1]
    satz = kwargs['zSatPosMRotated'] - x[2]
    d = weight * (np.sqrt((satx**2 + saty**2 +satz**2)) + x[3] - prm)
    return d


# Set up least squares methods
# def distance(x, **kwargs):
#     satx = kwargs["xSatPosMRotated"] - x[0]
#     saty = kwargs["ySatPosMRotated"] - x[1]
#     satz = kwargs["zSatPosMRotated"] - x[2]
#     weight = kwargs["uncertaintyWeight"]
#     prm = kwargs["correctedPrM"]
#     # isrbm = kwargs["isrbM"]
#     isrbms = [k for k in kwargs.keys() if "_isrbM" in k]
#     N = len(isrbms)
#     isrbms_loss = 0
#     for i in range(N):
#         isrbms_loss += x[4+i] - kwargs[isrbms[i]]
#         # isrbms_loss += x[4+i]

#     d = weight * (np.sqrt(satx**2 + saty**2 +satz**2) + x[3] - prm + isrbms_loss)
#     return d


In [27]:
import multiprocessing

for (collection_name, phone_name), base_df in base_train.groupby(['collectionName','phoneName']):
    break

derived_df = pd.read_csv(root / f"train/{collection_name}/{phone_name}/{phone_name}_derived.csv")
gnss_df = gnss_log_to_dataframes(str(root / f"train/{collection_name}/{phone_name}/{phone_name}_GnssLog.txt"))
raw_df = gnss_df['Raw']
derived_df = apply_tips1(raw_df, derived_df)
derived_df = apply_tips5(derived_df)

derived_df = derived_df.sort_values('millisSinceGpsEpoch')   

Loading ../../input/google-smartphone-decimeter-challenge/train/2020-05-14-US-MTV-1/Pixel4/Pixel4_GnssLog.txt


In [28]:
derived_df['signalType'].unique()

array(['GLO_G1', 'GAL_E5A', 'GPS_L5', 'GAL_E1', 'GPS_L1'], dtype=object)

In [29]:
for epoch, df in derived_df.groupby(['millisSinceGpsEpoch', 'constellationType', 'svid']):
    if len(df) > 1:
        break
df

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,constellationType,svid,signalType,receivedSvTimeInGpsNanos,xSatPosM,ySatPosM,zSatPosM,xSatVelMps,ySatVelMps,zSatVelMps,satClkBiasM,satClkDriftMps,rawPrM,rawPrUncM,isrbM,ionoDelayM,tropoDelayM
26,2020-05-14-US-MTV-1,Pixel4,1273529463442,1,6,GPS_L5,1273529463367963648,6164666.631,-13585090.0,22024520.0,2357.977,1469.52,251.424,-81412.404,-0.002,22325460.0,2.998,-2352.94,9.864,4.095
15,2020-05-14-US-MTV-1,Pixel4,1273529463442,1,6,GPS_L1,1273529463367955840,6164666.612,-13585090.0,22024520.0,2357.977,1469.52,251.424,-81411.407,-0.002,22327800.0,2.398,0.0,5.5,4.095


In [30]:
# 電離層遅延の誤差項は除く
df['correctedPrM'] = df.apply(lambda r: r.rawPrM + r.satClkBiasM - r.isrbM - r.tropoDelayM,axis=1)  
df

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,constellationType,svid,signalType,receivedSvTimeInGpsNanos,xSatPosM,ySatPosM,zSatPosM,xSatVelMps,ySatVelMps,zSatVelMps,satClkBiasM,satClkDriftMps,rawPrM,rawPrUncM,isrbM,ionoDelayM,tropoDelayM,correctedPrM
26,2020-05-14-US-MTV-1,Pixel4,1273529463442,1,6,GPS_L5,1273529463367963648,6164666.631,-13585090.0,22024520.0,2357.977,1469.52,251.424,-81412.404,-0.002,22325460.0,2.998,-2352.94,9.864,4.095,22246400.0
15,2020-05-14-US-MTV-1,Pixel4,1273529463442,1,6,GPS_L1,1273529463367955840,6164666.612,-13585090.0,22024520.0,2357.977,1469.52,251.424,-81411.407,-0.002,22327800.0,2.398,0.0,5.5,4.095,22246390.0


In [19]:
# GPS_L1: f = 1575.42MHz
# GPS_L5: f = 1176.45MHz
# http://gnss.co.jp/wp-content/uploads/2016/07/ddd790b4eae745d43594c4f302b14761.pdf

f1 = 1575.42 * 10**6
f2 = 1176.45 * 10**6
alpha = f1**2/f2**2
beta = f1**2/(f1**2-f2**2)
gamma = f2**2/(f1**2-f2**2)

c = 299_792_458
lambda1 = c/f1
lambda2 = c/f2

# beta*lambda1*x[4]
# gamma*lambda2*x[5]

# fai_if = fai1 - (f2/f1)*fal2  # pseudorange
lambda_if = lambda1*lambda2/(lambda2 - (f2/f1)*lambda1)
Nif = N1 - (f2/f1)*N2

In [20]:
# # pseudorangeの修正
# derived_df['correctedPrM'] = derived_df.apply(lambda r: r.rawPrM + r.satClkBiasM - r.isrbM - r.ionoDelayM - r.tropoDelayM,axis=1)


# # 伝播時間=擬似距離/光速
# # 受信時刻と送信時刻の差分となる
# light_speed = 299_792_458
# derived_df['transmissionTimeSeconds'] = derived_df['correctedPrM'] / light_speed

# # Compute true sat positions at arrival time
# # 到着までに衛星位置が移動しているのでこれを補正
# omega_e = 7.2921151467e-5
# derived_df['xSatPosMRotated'] = \
#     np.cos(omega_e * derived_df['transmissionTimeSeconds']) * derived_df['xSatPosM'] \
#     + np.sin(omega_e * derived_df['transmissionTimeSeconds']) * derived_df['ySatPosM']

# derived_df['ySatPosMRotated'] = \
#     - np.sin(omega_e * derived_df['transmissionTimeSeconds']) * derived_df['xSatPosM'] \
#     + np.cos(omega_e * derived_df['transmissionTimeSeconds']) * derived_df['ySatPosM']

# derived_df['zSatPosMRotated'] = derived_df['zSatPosM']

# derived_df['uncertaintyWeight'] = 1 / derived_df['rawPrUncM']

# output_df = pd.DataFrame()
# d_list = []
# x_list = []
# y_list = []
# z_list = []
# epoch_list = []
# for epoch, df in derived_df.groupby('millisSinceGpsEpoch'): 

In [21]:
def estimate_position_by_derived(args):
    (collection_name, phone_name), base_df = args
    phase = base_df['phase'].unique()[0]
    # Train df here only contains one collection and one measurement
    # if os.path.exists(f'../input/derived/{collection_name}_{phone_name}_derived.csv'):
    #     derived_df = pd.read_csv(f'../input/derived/{collection_name}_{phone_name}_derived.csv')
    # else:
    derived_df = pd.read_csv(root / f"{phase}/{collection_name}/{phone_name}/{phone_name}_derived.csv")
    gnss_df = gnss_log_to_dataframes(str(root / f"{phase}/{collection_name}/{phone_name}/{phone_name}_GnssLog.txt"))
    raw_df = gnss_df['Raw']
    derived_df = apply_tips1(raw_df, derived_df)
    derived_df = apply_tips5(derived_df)       

    derived_df = derived_df.sort_values('millisSinceGpsEpoch')

    # pseudorangeの修正
    derived_df['correctedPrM'] = derived_df.apply(lambda r: r.rawPrM + r.satClkBiasM - r.isrbM - r.ionoDelayM - r.tropoDelayM,axis=1)


    # 伝播時間=擬似距離/光速
    # 受信時刻と送信時刻の差分となる
    light_speed = 299_792_458
    derived_df['transmissionTimeSeconds'] = derived_df['correctedPrM'] / light_speed

    # Compute true sat positions at arrival time
    # 到着までに衛星位置が移動しているのでこれを補正
    omega_e = 7.2921151467e-5
    derived_df['xSatPosMRotated'] = \
        np.cos(omega_e * derived_df['transmissionTimeSeconds']) * derived_df['xSatPosM'] \
        + np.sin(omega_e * derived_df['transmissionTimeSeconds']) * derived_df['ySatPosM']

    derived_df['ySatPosMRotated'] = \
        - np.sin(omega_e * derived_df['transmissionTimeSeconds']) * derived_df['xSatPosM'] \
        + np.cos(omega_e * derived_df['transmissionTimeSeconds']) * derived_df['ySatPosM']

    derived_df['zSatPosMRotated'] = derived_df['zSatPosM']

    # derived_df['uncertaintyWeight'] = 1 / derived_df['rawPrUncM']
    derived_df['uncertaintyWeight'] = 1/derived_df['rawPrUncM'].apply(lambda x:x*x)

    output_df = pd.DataFrame()
    x_list = []
    y_list = []
    z_list = []
    epoch_list = []
    for epoch, df in derived_df.groupby('millisSinceGpsEpoch'): 

        # Corrected pseudorange according to data instructions
        # Time it took for signal to travel
        # Start point for the optimiser
        # N = len([i for i in df["signalType"].unique() if i != "GPS_L1"])
        df = df[df['signalType'].isin(['GPS_L1', 'GPS_L5','GAL_E1', 'GAL_E5A', 'QZS_J1', 'QZS_J5'])]

        # 最小2乗法による座標の推定
        x0 = [0]*4
        opt_res = opt.least_squares(distance, x0, kwargs=df.to_dict(orient="list"))

        # Optimiser yields a position in the ECEF coordinates
        opt_res_pos = opt_res.x
        d = distance(opt_res_pos, **df.to_dict(orient="list"))

        # ECEF position to lat/long
        wls_estimated_pos = ecef2lla(*opt_res_pos[:3])
        wls_estimated_pos = np.squeeze(wls_estimated_pos)
        x_list.append(wls_estimated_pos[0])
        y_list.append(wls_estimated_pos[1])
        z_list.append(wls_estimated_pos[2])
        epoch_list.append(epoch)
    
    output_df["latDeg"] = x_list
    output_df["lngDeg"] = y_list
    output_df['heightAboveWgs84EllipsoidM'] = z_list
    output_df['millisSinceGpsEpoch'] = epoch_list
    output_df['collectionName'] = collection_name
    output_df['phoneName'] = phone_name

    output_df.to_csv(output_dir + f'{collection_name}_{phone_name}_derived.csv', index=False)
    return output_df

In [22]:
import multiprocessing

base_train['phase'] = 'train'
gr = base_train.groupby(['collectionName','phoneName'])
processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    dfs = pool.imap_unordered(estimate_position_by_derived, gr)
    dfs = tqdm(dfs, total=len(gr))
    dfs = list(dfs)
all_derived_df = pd.concat(dfs).sort_values(['collectionName', 'phoneName', 'millisSinceGpsEpoch']).reset_index(drop=True)     

  0%|          | 0/73 [00:00<?, ?it/s]

Loading ../../input/google-smartphone-decimeter-challenge/train/2020-05-14-US-MTV-2/Pixel4XLModded/Pixel4XLModded_GnssLog.txt
Loading ../../input/google-smartphone-decimeter-challenge/train/2020-05-14-US-MTV-1/Pixel4XLModded/Pixel4XLModded_GnssLog.txt
Loading ../../input/google-smartphone-decimeter-challenge/train/2020-05-29-US-MTV-1/Pixel4XL/Pixel4XL_GnssLog.txt
Loading ../../input/google-smartphone-decimeter-challenge/train/2020-06-05-US-MTV-1/Pixel4XLModded/Pixel4XLModded_GnssLog.txt
Loading ../../input/google-smartphone-decimeter-challenge/train/2020-05-21-US-MTV-2/Pixel4XL/Pixel4XL_GnssLog.txtLoading ../../input/google-smartphone-decimeter-challenge/train/2020-05-14-US-MTV-1/Pixel4/Pixel4_GnssLog.txt
Loading ../../input/google-smartphone-decimeter-challenge/train/2020-05-29-US-MTV-2/Pixel4/Pixel4_GnssLog.txt

Loading ../../input/google-smartphone-decimeter-challenge/train/2020-05-29-US-MTV-2/Pixel4XL/Pixel4XL_GnssLog.txt
Loading ../../input/google-smartphone-decimeter-challenge/tr

In [23]:
df_list = []
for (collection_name, phone_name), base_df in tqdm(base_train.groupby(['collectionName','phoneName'])):
    print(f"\n{collection_name} {phone_name}")
    base_df = base_df.sort_values('millisSinceGpsEpoch')
    target_gt = gt[(gt['collectionName']==collection_name)&(gt['phoneName']==phone_name)].sort_values('millisSinceGpsEpoch').reset_index(drop=True)
    
    derived_df = pd.read_csv(output_dir + f'{collection_name}_{phone_name}_derived.csv')
    derived_df = derived_df[~derived_df["millisSinceGpsEpoch"].duplicated()].sort_values("millisSinceGpsEpoch").reset_index(drop=True)
    derived_df = derived_df.rename(columns={"latDeg":"_latDeg", "lngDeg":"_lngDeg"})
    print(base_df.shape, target_gt.shape, derived_df.shape)
    base_score = get_train_score(base_df, target_gt)
    print("baseline:", base_score)

    derived_df = pd.merge_asof(base_df, derived_df[["millisSinceGpsEpoch", "_latDeg", "_lngDeg"]], on=["millisSinceGpsEpoch"], tolerance=10, direction='nearest')

    # replace if data is nan 
    derived_df.loc[derived_df["_latDeg"].isna(), "_latDeg"] = derived_df.loc[derived_df["_latDeg"].isna(), "latDeg"].values
    derived_df.loc[derived_df["_lngDeg"].isna(), "_lngDeg"] = derived_df.loc[derived_df["_lngDeg"].isna(), "lngDeg"].values

    derived_df = derived_df.drop(["latDeg", "lngDeg"], axis=1).rename(columns={"_latDeg":"latDeg","_lngDeg":"lngDeg"})
    _derived_df = get_train_score_df(derived_df, target_gt)
    df_list.append(_derived_df)
    derived_score = get_train_score(derived_df, target_gt)
    print("derived:", derived_score)
corrected_base_df = pd.concat(df_list).reset_index(drop=True)

  0%|          | 0/73 [00:00<?, ?it/s]

V-2    Pixel4      1.522371      4.233494  2.877933

2020-06-05-US-MTV-2 Pixel4XL
(1755, 12) (1755, 11) (1749, 6)
baseline:         collectionName phoneName  percentile50  percentile95     score
0  2020-06-05-US-MTV-2  Pixel4XL      1.178248      3.403874  2.291061
derived:         collectionName phoneName  percentile50  percentile95     score
0  2020-06-05-US-MTV-2  Pixel4XL      1.457614      3.946148  2.701881

2020-06-11-US-MTV-1 Pixel4
(1990, 12) (1990, 11) (1950, 6)
baseline:         collectionName phoneName  percentile50  percentile95     score
0  2020-06-11-US-MTV-1    Pixel4      1.356489      3.811824  2.584156
derived:         collectionName phoneName  percentile50  percentile95     score
0  2020-06-11-US-MTV-1    Pixel4      1.741077       4.30266  3.021869

2020-06-11-US-MTV-1 Pixel4XL
(1774, 12) (1774, 11) (1716, 6)
baseline:         collectionName phoneName  percentile50  percentile95     score
0  2020-06-11-US-MTV-1  Pixel4XL      1.017757      2.706992  1.862374
derive

In [24]:
pd.set_option('display.max_rows', 200)
logger.info('GPS/GALLILEO/')

# baseline
score_df = get_train_score(base_train, gt)
base_score= score_df['score'].mean()
logger.info(f"baseline:{base_score}")

# reproduce
score_df2 = get_train_score(corrected_base_df.drop(['latDeg_gt', 'lngDeg_gt'], axis=1), gt)
corrected_base_score = score_df2['score'].mean()
logger.info(f"corrected baseline:{corrected_base_score}")

GPS/GALLILEO/
GPS/GALLILEO/
baseline:5.287970649047861
baseline:5.287970649047861
corrected baseline:5.456514449054676
corrected baseline:5.456514449054676


In [25]:
score_df.groupby('phoneName').mean().sort_values("score", ascending=False).style.bar(subset=['percentile50', 'percentile95', 'score'], color=['teal'])

Unnamed: 0_level_0,percentile50,percentile95,score
phoneName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SamsungS20Ultra,5.11629,20.117212,12.616751
Pixel4,2.343907,8.603656,5.473782
Pixel5,3.007355,7.495669,5.251512
Pixel4Modded,2.410609,5.435563,3.923086
Pixel4XL,2.02796,5.524822,3.776391
Pixel4XLModded,2.009718,5.113532,3.561625
Mi8,2.015012,4.638771,3.326891


In [26]:
score_df2.groupby('phoneName').mean().sort_values("score", ascending=False).style.bar(subset=['percentile50', 'percentile95', 'score'], color=['teal'])

Unnamed: 0_level_0,percentile50,percentile95,score
phoneName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SamsungS20Ultra,4.589553,17.680005,11.134779
Pixel4,2.474404,9.587846,6.031125
Pixel5,3.172287,7.91644,5.544364
Pixel4XL,2.248535,5.987982,4.118259
Pixel4Modded,2.264562,5.518779,3.891671
Pixel4XLModded,2.070939,5.587581,3.82926
Mi8,1.894433,4.695327,3.29488


In [None]:
corrected_base_df.to_csv(f"baseline_locations_train_{EXP_NAME}.csv", index=False)

In [None]:
mean_df = base_train.copy()
mean_df["latDeg"] = (base_train["latDeg"]*0.6 + corrected_base_df["latDeg"]*0.4)
mean_df["lngDeg"] = (base_train["lngDeg"]*0.6 + corrected_base_df["lngDeg"]*0.4)
mean_score_df = get_train_score(mean_df, gt)
print("mean baseline:", mean_score_df['score'].mean())
# mean_df.to_csv(f"baseline_locations_train_fixed_{EXP_NAME}.csv", index=False)

In [None]:
base_test = pd.read_csv(root/ 'baseline_locations_test_with_speed.csv')
base_test.loc[:,['px','py','pz']] = 0
base_test.head()

In [None]:
import multiprocessing
base_test['phase'] = 'test'
gr = base_test.groupby(['collectionName','phoneName'])
processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    dfs = pool.imap_unordered(estimate_position_by_derived, gr)
    dfs = tqdm(dfs, total=len(gr))
    dfs = list(dfs)
all_derived_df = pd.concat(dfs).sort_values(['collectionName', 'phoneName', 'millisSinceGpsEpoch']).reset_index(drop=True)     

In [None]:
df_list = []
for (collection_name, phone_name), base_df in tqdm(base_test.groupby(['collectionName','phoneName'])):

    print(f"\n{collection_name} {phone_name}")
    base_df = base_df.sort_values('millisSinceGpsEpoch')

    derived_df = pd.read_csv(output_dir + f'{collection_name}_{phone_name}_derived.csv')
    derived_df = derived_df[~derived_df["millisSinceGpsEpoch"].duplicated()].sort_values("millisSinceGpsEpoch").reset_index(drop=True)
    derived_df = derived_df.rename(columns={"latDeg":"_latDeg", "lngDeg":"_lngDeg"})
    derived_df = pd.merge_asof(base_df, derived_df[["millisSinceGpsEpoch", "_latDeg", "_lngDeg"]], on=["millisSinceGpsEpoch"], tolerance=10, direction='nearest')

    # replace if data is nan 
    derived_df.loc[derived_df["_latDeg"].isna(), "_latDeg"] = derived_df.loc[derived_df["_latDeg"].isna(), "latDeg"].values
    derived_df.loc[derived_df["_lngDeg"].isna(), "_lngDeg"] = derived_df.loc[derived_df["_lngDeg"].isna(), "lngDeg"].values

    derived_df = derived_df.drop(["latDeg", "lngDeg"], axis=1).rename(columns={"_latDeg":"latDeg","_lngDeg":"lngDeg"})
    df_list.append(derived_df)
corrected_base_df = pd.concat(df_list).reset_index(drop=True)

In [None]:
corrected_base_df.to_csv(f"baseline_locations_test_{EXP_NAME}.csv", index=False)

In [None]:
# mean_df = base_test.copy()
# mean_df["latDeg"] = (base_test["latDeg"]*0.6 + corrected_base_df["latDeg"]*0.4)
# mean_df["lngDeg"] = (base_test["lngDeg"]*0.6 + corrected_base_df["lngDeg"]*0.4)
# # mean_df.to_csv(root / "baseline_locations_test_{EXP_NAME}.csv", index=False)