In [None]:
import math
import pathlib
import random

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm

# 概要
## 目的 
- 緯度・経度速度を推定し、カルマンフィルタの観測に速度を追加することである。
緯度・経度速度の定義は以下の通りである。
$$
\mathrm{LatVelocity}(i) = \frac{\mathrm{Lat}(i)-\mathrm{Lat}(i-1)}{\mathrm{time}(i)-\mathrm{time}(i-1)}, \qquad
\mathrm{LngVelocity}(i) = \frac{\mathrm{Lng}(i)-\mathrm{Lng}(i-1)}{\mathrm{time}(i)-\mathrm{time}(i-1)}
$$

## 結果
### 速度推定
- ぱっと見は良く推定できているように見える
- ノイズ処理するだけでこれぐらいよくなる可能性はあるので、本質的に有効な情報かの判断は現状できない。
- 推定速度の大きさに上限があるっぽいので、やはり速度が速いところで使うと誤差の原因になりそう
- SJCなどパルス的に精度の悪いことがある
### 速度カルマンフィルタ
- 0.05弱通常のカルマンフィルタより向上した（collectionでvalidationもしているのである程度効果はありそう）
 - Vanilla Kalman Filter:4.721716804543573
 - Velocity Kalman Filter:4.678559609622235
- アンサンブルに関しては効果はほぼない

In [None]:
def make_gt():
    """Load every training ``ground_truth.csv`` under ``INPUT`` into one frame.

    Files are discovered with the glob ``train/*/*/ground_truth.csv`` and
    concatenated in discovery order; the per-file row index is kept as-is.
    """
    gt_files = list(pathlib.Path(INPUT).glob('train/*/*/ground_truth.csv'))
    print('ground_truth.csv count : ', len(gt_files))
    return pd.concat([pd.read_csv(path) for path in tqdm(gt_files)])

In [None]:
# Root directory of the competition data; make_gt() reads this module-level name.
INPUT = '../../input/google-smartphone-decimeter-challenge'
# Baseline position estimates for the train and test splits.
base_train, base_test = (
    pd.read_csv(f'{INPUT}/baseline_locations_{split}.csv')
    for split in ('train', 'test')
)
gt = make_gt()

In [None]:
def calc_velocity(_df, col, shift=1, th=1e-1):
    """Finite-difference velocity of ``col`` per second.

    Computes ``(col[i] - col[i-shift]) / (time[i] - time[i-shift])`` using
    ``millisSinceGpsEpoch`` as the time axis, then converts from per-millisecond
    to per-second.

    Parameters
    ----------
    _df : DataFrame with ``col``, ``millisSinceGpsEpoch``, ``phoneName`` and
        ``collectionName``. It is only read, never modified.
    col : name of the column to differentiate.
    shift : row offset of the reference sample; a negative value differences
        against a later row instead of an earlier one.
    th : values whose absolute per-second velocity exceeds this are reset to 0.

    Returns
    -------
    Series named ``'out'`` sharing ``_df``'s index. Rows whose reference sample
    belongs to another phone/collection (or does not exist) are 0.
    """
    time_ms = _df['millisSinceGpsEpoch']
    out = (_df[col] - _df[col].shift(shift)) / (time_ms - time_ms.shift(shift))

    # A difference taken across a phone/collection boundary is meaningless.
    # The first/last |shift| rows compare against NaN and are caught here too.
    boundary = (
        (_df['phoneName'] != _df['phoneName'].shift(shift))
        | (_df['collectionName'] != _df['collectionName'].shift(shift))
    )
    out = out.mask(boundary, 0)

    out = out * 1000  # per-millisecond -> per-second

    # Drop anomalous values: anything above the threshold is treated as noise.
    out = out.mask(out.abs() > th, 0)
    return out.rename('out')

def calc_shift(_df, col, shift=1):
    """Return ``col`` shifted by ``shift`` rows within each phone track.

    Parameters
    ----------
    _df : DataFrame with ``col``, ``phoneName`` and ``collectionName``.
        It is only read, never modified.
    col : name of the column to shift.
    shift : row offset; positive takes earlier rows, negative takes later rows.

    Returns
    -------
    Series named ``'out'`` sharing ``_df``'s index. Rows whose shifted source
    belongs to another phone/collection (or does not exist) are 0.
    """
    boundary = (
        (_df['phoneName'] != _df['phoneName'].shift(shift))
        | (_df['collectionName'] != _df['collectionName'].shift(shift))
    )
    return _df[col].shift(shift).mask(boundary, 0).rename('out')

# 正解データ作り

In [None]:
# Ground-truth velocity per axis, using calc_velocity's default threshold.
for axis in ('lat', 'lng'):
    gt[axis + 'v'] = calc_velocity(gt, axis + 'Deg')

In [None]:
# Distribution of the ground-truth velocities (degrees per second).
fig, axes = plt.subplots(1, 2, figsize=(20, 4))
for ax, col in zip(axes, ['latv', 'lngv']):
    ax.hist(gt[col], bins=50, range=(-5e-4, 5e-4))
    ax.set(title=f'ground truth {col}', xlabel=f'{col} [deg/s]', ylabel='count')
plt.show()

# BaseLineから計算してみる

In [None]:
# Velocity derived directly from the baseline positions, with a tighter threshold.
for axis in ('lat', 'lng'):
    base_train[axis + 'v'] = calc_velocity(base_train, axis + 'Deg', th=5e-4)

In [None]:
# Distribution of the baseline-derived velocities (degrees per second).
fig, axes = plt.subplots(1, 2, figsize=(20, 4))
for ax, col in zip(axes, ['latv', 'lngv']):
    ax.hist(base_train[col], bins=50, range=(-5e-4, 5e-4))
    ax.set(title=f'baseline {col}', xlabel=f'{col} [deg/s]', ylabel='count')
plt.show()

In [None]:
# Attach the ground-truth velocities to the baseline rows as training targets.
merge_keys = ['collectionName', 'phoneName', 'millisSinceGpsEpoch']
gt = gt.rename(columns={'latv': 'latv_truth', 'lngv': 'lngv_truth'})
base_train = base_train.merge(
    gt[merge_keys + ['latv_truth', 'lngv_truth']],
    on=merge_keys,
    how='left',
)
base_train.head()

In [None]:
# Baseline-derived velocity (x) against ground-truth velocity (y).
fig, axes = plt.subplots(1, 2, figsize=(20, 4))
for ax, axis in zip(axes, ['lat', 'lng']):
    ax.scatter(base_train[axis + 'v'], base_train[axis + 'v_truth'], s=1)
    ax.set_xlim(-5e-4, 5e-4)
    ax.set_ylim(-5e-4, 5e-4)
    ax.set(
        title=f'{axis} velocity: baseline vs ground truth',
        xlabel=f'{axis}v (baseline) [deg/s]',
        ylabel=f'{axis}v_truth [deg/s]',
    )
plt.show()

# LightGBMで予想させる

## 特徴量生成（詳細説明略）

In [None]:
# Multi-scale velocity features.
#   dlatp / dlngp : velocity against the sample `i` rows earlier ("past")
#   dlata / dlnga : velocity against the sample `i` rows later ("after")
# Each of those is additionally shifted by `j` rows in both directions
# (`_shiftpNN` = value j rows earlier, `_shiftaNN` = value j rows later).
shift_steps = [1, 2, 4, 8, 16, 32]
# The original `range(1, 2, 4)` produced only [1]; the lags 1, 2 and 4 mirror
# the power-of-two pattern of `shift_steps`.
lag_steps = [1, 2, 4]

for i in shift_steps:
    tag = str(i).zfill(2)
    for prefix, src, signed_shift in [
        ('dlatp', 'latDeg', i),
        ('dlngp', 'lngDeg', i),
        ('dlata', 'latDeg', -i),
        ('dlnga', 'lngDeg', -i),
    ]:
        base_train[prefix + tag] = calc_velocity(base_train, src, shift=signed_shift, th=1e-3)

    for j in lag_steps:
        lag_tag = str(j).zfill(2)
        for pair in (('dlatp', 'dlngp'), ('dlata', 'dlnga')):
            for suffix, signed_lag in (('shiftp', j), ('shifta', -j)):
                for prefix in pair:
                    col = prefix + tag
                    base_train[f'{col}_{suffix}{lag_tag}'] = calc_shift(base_train, col, shift=signed_lag)
print(len(base_train.columns))

# 学習

In [None]:
# Hold out whole collections so that no drive appears in both train and test.
SEED = 42
# Seed the stdlib RNG so this split (and later random.sample calls) is reproducible.
random.seed(SEED)

data = base_train.copy()
# Sort before sampling: set/unique ordering must not influence which
# collections get drawn for a given seed.
all_collections = sorted(base_train['collectionName'].unique())
test_collection = random.sample(all_collections, 14)
train_val_collection = sorted(set(all_collections) - set(test_collection))
print(len(train_val_collection), len(test_collection))

In [None]:
# LightGBM settings, simply carried over from a previous competition.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric={'l1'},
    num_leaves=11,
    learning_rate=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
    early_stopping_rounds=100,
)

## 緯度の学習

In [None]:
epoch = 5
# Identifier, raw-position and target columns that must not be fed to the model.
NON_FEATURE_COLS = ['collectionName', 'phoneName', 'latDeg', 'lngDeg', 'millisSinceGpsEpoch',
                    'heightAboveWgs84EllipsoidM', 'phone', 'latv_truth', 'lngv_truth']

# .copy() gives an independent frame, so adding prediction columns below does
# not write into a filtered view of `data`.
test = data[data['collectionName'].isin(test_collection)].copy()
testX = test.drop(columns=NON_FEATURE_COLS)

# Average the predictions of `epoch` models, each trained on a different
# random train/validation split of the non-test collections.
latv_pred = np.zeros(len(testX))
for e in range(epoch):
    train_collection = random.sample(train_val_collection, 10)
    val_collection = list(set(train_val_collection) - set(train_collection))
    train = data[data['collectionName'].isin(train_collection)]
    val = data[data['collectionName'].isin(val_collection)]
    trainX = train.drop(columns=NON_FEATURE_COLS)
    trainY = train['latv_truth']
    valX = val.drop(columns=NON_FEATURE_COLS)
    valY = val['latv_truth']

    lgb_train = lgb.Dataset(trainX, label=trainY)
    lgb_val = lgb.Dataset(valX, label=valY, reference=lgb_train)
    gbm = lgb.train(params, lgb_train, valid_sets=lgb_val, num_boost_round=100000, verbose_eval=100)
    latv_pred += gbm.predict(testX) / epoch
test['latv_pred'] = latv_pred

In [None]:
# Longitude model. Reuses `epoch`, `test` and `testX` from the latitude cell,
# so that cell must have been run first.
drop_cols = ['collectionName', 'phoneName', 'latDeg', 'lngDeg', 'millisSinceGpsEpoch',
             'heightAboveWgs84EllipsoidM', 'phone', 'latv_truth', 'lngv_truth']

# Average the predictions of `epoch` models, each trained on a different
# random train/validation split of the non-test collections.
lngv_pred = np.zeros(len(testX))
for e in range(epoch):
    train_collection = random.sample(train_val_collection, 10)
    val_collection = list(set(train_val_collection) - set(train_collection))
    train = data[data['collectionName'].isin(train_collection)]
    val = data[data['collectionName'].isin(val_collection)]
    trainX = train.drop(columns=drop_cols)
    trainY = train['lngv_truth']
    valX = val.drop(columns=drop_cols)
    valY = val['lngv_truth']

    lgb_train = lgb.Dataset(trainX, label=trainY)
    lgb_val = lgb.Dataset(valX, label=valY, reference=lgb_train)
    gbm = lgb.train(params, lgb_train, valid_sets=lgb_val, num_boost_round=100000, verbose_eval=100)
    lngv_pred += gbm.predict(testX) / epoch
# assign() builds a new frame instead of writing into a possibly-sliced one.
test = test.assign(lngv_pred=lngv_pred)

In [None]:
# Ground-truth velocity (x) against the LightGBM prediction (y) on held-out collections.
fig, axes = plt.subplots(1, 2, figsize=(20, 4))
for ax, axis in zip(axes, ['lat', 'lng']):
    ax.scatter(test[axis + 'v_truth'], test[axis + 'v_pred'], s=1)
    ax.set_xlim(-5e-4, 5e-4)
    ax.set_ylim(-5e-4, 5e-4)
    ax.set(
        title=f'{axis} velocity: ground truth vs prediction',
        xlabel=f'{axis}v_truth [deg/s]',
        ylabel=f'{axis}v_pred [deg/s]',
    )
plt.show()

In [None]:
# Predicted vs ground-truth latitude velocity, one figure per (collection, phone).
for c, d in test.groupby(['collectionName', 'phoneName']):
    print(c)
    fig, ax = plt.subplots(figsize=(25, 3))
    ax.plot(d['latv_pred'], label='latv_pred')
    ax.plot(d['latv_truth'], label='latv_truth')
    ax.set(title=' / '.join(c), xlabel='row index', ylabel='latitude velocity [deg/s]')
    ax.legend()
    plt.show()
    # Release the figure; one is created per phone track.
    plt.close(fig)

# ここまでの考察
- ぱっと見は良く推定できているように見える
- ノイズ処理するだけでこれぐらいよくなる可能性はあるので、本質的に有効な情報かの判断は現状できない。
- 推定速度の大きさに上限があるっぽいので、やはり速度が速いところで使うと誤差の原因になりそう
- SJCはやや精度が悪い？

# Vanilla Kalman Filter

In [None]:
!pip install simdkalman
import simdkalman

In [None]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points via the haversine formula.

    All four inputs are in decimal degrees and may be scalars or array-likes.
    The result is expressed in the same unit as the sphere radius used below.
    """
    RADIUS = 6_367_000
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    lat_term = np.sin((phi2 - phi1) / 2) ** 2
    lon_term = np.cos(phi1) * np.cos(phi2) * np.sin((lam2 - lam1) / 2) ** 2
    a = lat_term + lon_term
    return 2 * RADIUS * np.arcsin(a ** 0.5)

def make_gt(recal=True):
    """Return the concatenated training ground truth.

    Parameters
    ----------
    recal : if true (default), read every ``train/*/*/ground_truth.csv`` under
        ``INPUT`` and concatenate them. If false, read the cached
        ``./base/gt.csv`` instead. This function does not write that cache
        (the ``to_csv`` call is disabled), so the file must already exist.

    The default keeps the zero-argument call ``make_gt()`` working, as with the
    earlier definition of the same name in this notebook.
    """
    if recal:
        p = pathlib.Path(INPUT)
        gt_files = list(p.glob('train/*/*/ground_truth.csv'))
        print('ground_truth.csv count : ', len(gt_files))

        gts = []
        for gt_file in tqdm(gt_files):
            gts.append(pd.read_csv(gt_file))
        ground_truth = pd.concat(gts)
        #ground_truth.to_csv('./base/gt.csv',index=False)
    else:
        ground_truth = pd.read_csv('./base/gt.csv')
    return ground_truth

def add_distance_diff(df):
    """Add previous/next position columns and the distance to each neighbour.

    The columns ``latDeg_prev/next``, ``lngDeg_prev/next``, ``phone_prev/next``
    and ``dist_prev/next`` are written onto ``df`` itself, which is also
    returned. Where the neighbouring row belongs to a different ``phone``, the
    neighbour position and distance are set to NaN.
    """
    for col in ('latDeg', 'lngDeg', 'phone'):
        df[col + '_prev'] = df[col].shift(1)
        df[col + '_next'] = df[col].shift(-1)

    for side in ('prev', 'next'):
        df['dist_' + side] = calc_haversine(
            df['latDeg'], df['lngDeg'], df['latDeg_' + side], df['lngDeg_' + side]
        )

    # Neighbours from another phone track are not real neighbours.
    for side in ('prev', 'next'):
        other_track = df['phone'] != df['phone_' + side]
        df.loc[other_track, ['latDeg_' + side, 'lngDeg_' + side, 'dist_' + side]] = np.nan

    return df

# Position-only Kalman filter.
# The 6-dim state advances as: x[0:2] += T*x[2:4] + 0.5*T^2*x[4:6], x[2:4] += T*x[4:6].
T = 1.0
state_transition = np.eye(6)
state_transition[[0, 1, 2, 3], [2, 3, 4, 5]] = T
state_transition[[0, 1], [4, 5]] = 0.5 * T ** 2
process_noise = np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6]) + np.full((6, 6), 1e-9)
# Only the first two state components are observed.
observation_model = np.eye(2, 6, dtype=int)
observation_noise = np.diag([5e-5, 5e-5]) + np.full((2, 2), 1e-9)

kf = simdkalman.KalmanFilter(
    state_transition=state_transition,
    process_noise=process_noise,
    observation_model=observation_model,
    observation_noise=observation_noise,
)

def check_time(_df):
    """Return True when the track is not sampled at exactly 1000 ms steps.

    Parameters
    ----------
    _df : DataFrame for a single phone track with a ``millisSinceGpsEpoch``
        column, in row order.

    Returns
    -------
    bool : True if any consecutive timestamp difference differs from 1000 ms.
        Tracks with fewer than two rows have no gaps and return False.
    """
    times = _df['millisSinceGpsEpoch'].to_numpy()
    if len(times) < 2:
        return False
    return bool(np.any(np.diff(times) != 1000))

def adjust_time(df):
    """Kalman-smooth an irregularly sampled track via a regular 1 s grid.

    Steps:
      1. Linearly interpolate ``latDeg``/``lngDeg`` onto a 1000 ms grid that
         starts at the first timestamp.
      2. Smooth the gridded positions with the module-level filter ``kf``.
      3. Linearly interpolate the smoothed means and variances back onto the
         original timestamps.

    Parameters
    ----------
    df : DataFrame for a single phone track with ``millisSinceGpsEpoch``,
        ``latDeg`` and ``lngDeg``. Timestamps are assumed to be in increasing
        order (``np.interp`` requires it).

    Returns
    -------
    A copy of ``df`` with a fresh 0..n-1 index, ``latDeg``/``lngDeg`` replaced
    by the smoothed values and ``latDeg_cov``/``lngDeg_cov`` added.
    """
    t = 'millisSinceGpsEpoch'
    obs_cols = ['latDeg', 'lngDeg']
    data = df.copy().reset_index(drop=True)
    times = data[t].to_numpy(dtype=float)

    # The grid extends to at least the last timestamp (hence the +1), so every
    # original row falls inside it when interpolating back.
    n_grid = math.ceil((times[-1] - times[0]) / 1000) + 1
    grid_t = times[0] + 1000.0 * np.arange(n_grid)

    # A grid point past the last sample has no observation; NaN marks it as
    # missing, the same way outlier-filtered positions reach the filter.
    observations = np.column_stack([
        np.interp(grid_t, times, data[col].to_numpy(dtype=float), right=np.nan)
        for col in obs_cols
    ])

    smoothed = kf.smooth(observations.reshape(1, n_grid, len(obs_cols)))
    mean = smoothed.states.mean[0]
    cov = smoothed.states.cov[0]

    data['latDeg'] = np.interp(times, grid_t, mean[:, 0])
    data['lngDeg'] = np.interp(times, grid_t, mean[:, 1])
    data['latDeg_cov'] = np.interp(times, grid_t, cov[:, 0, 0])
    data['lngDeg_cov'] = np.interp(times, grid_t, cov[:, 1, 1])
    return data

def apply_kf_smoothing(df, kf_=kf):
    """Kalman-smooth ``latDeg``/``lngDeg`` separately for every phone track.

    Tracks are identified by (``collectionName``, ``phoneName``). Tracks for
    which ``check_time`` returns True go through ``adjust_time`` (which does
    not receive ``kf_``); the others are smoothed directly with ``kf_``.
    The smoothed positions and the ``latDeg_cov``/``lngDeg_cov`` columns are
    written onto ``df`` itself, which is also returned.
    """
    track_keys = df[['collectionName', 'phoneName']].drop_duplicates()
    for collection, phone in track_keys.itertuples(index=False, name=None):
        on_track = (df['collectionName'] == collection) & (df['phoneName'] == phone)
        track = df[on_track]

        if check_time(track):
            resampled = adjust_time(track)
            for col in ('latDeg', 'lngDeg', 'latDeg_cov', 'lngDeg_cov'):
                df.loc[on_track, col] = resampled[col].values
            continue

        positions = track[['latDeg', 'lngDeg']].to_numpy()
        result = kf_.smooth(positions.reshape(1, len(positions), 2))
        state_mean = result.states.mean
        state_cov = result.states.cov
        df.loc[on_track, 'latDeg'] = state_mean[0, :, 0]
        df.loc[on_track, 'latDeg_cov'] = state_cov[0, :, 0, 0]
        df.loc[on_track, 'lngDeg'] = state_mean[0, :, 1]
        df.loc[on_track, 'lngDeg_cov'] = state_cov[0, :, 1, 1]
    return df

def weighted_make_lerp_data(df):
    """Interpolate each phone onto the timestamps of the other phones.

    For every collection, the timestamps seen on any phone are crossed with the
    phones of that collection. Rows a phone does not have are filled by linear
    interpolation (positions and their ``*_cov`` columns) between that phone's
    immediately preceding and following rows. Only the newly interpolated rows
    are returned, restricted to the columns of ``df``.
    """
    original_cols = df.columns
    join_keys = ['collectionName', 'millisSinceGpsEpoch', 'phoneName']

    # Every (time, phone) combination inside each collection.
    times = df[['collectionName', 'millisSinceGpsEpoch']].drop_duplicates()
    phones = df[['collectionName', 'phoneName']].drop_duplicates()
    grid = times.merge(phones, on='collectionName', how='outer')

    lerp_df = grid.merge(df, on=join_keys, how='left')
    lerp_df['phone'] = lerp_df['collectionName'] + '_' + lerp_df['phoneName']
    lerp_df = lerp_df.sort_values(['phone', 'millisSinceGpsEpoch'])

    # Neighbouring-row values: (source column, stem of the helper columns).
    neighbour_specs = [
        ('latDeg', 'latDeg'),
        ('lngDeg', 'lngDeg'),
        ('phone', 'phone'),
        ('millisSinceGpsEpoch', 'time'),
        ('latDeg_cov', 'latDeg_cov'),
        ('lngDeg_cov', 'lngDeg_cov'),
    ]
    for source, stem in neighbour_specs:
        lerp_df[stem + '_prev'] = lerp_df[source].shift(1)
        lerp_df[stem + '_next'] = lerp_df[source].shift(-1)

    # Keep only missing rows that sit between two rows of the same phone.
    is_gap = lerp_df['latDeg'].isnull()
    same_before = lerp_df['phone'] == lerp_df['phone_prev']
    same_after = lerp_df['phone'] == lerp_df['phone_next']
    lerp_df = lerp_df[is_gap & same_before & same_after].copy()

    # Fraction of the way from the previous to the next timestamp.
    frac = (lerp_df['millisSinceGpsEpoch'] - lerp_df['time_prev']) / (lerp_df['time_next'] - lerp_df['time_prev'])
    for col in ('latDeg', 'lngDeg', 'latDeg_cov', 'lngDeg_cov'):
        lerp_df[col] = lerp_df[col + '_prev'] + ((lerp_df[col + '_next'] - lerp_df[col + '_prev']) * frac)

    # Rows whose neighbours were themselves missing cannot be interpolated.
    lerp_df = lerp_df[lerp_df['latDeg'].notnull()]

    return lerp_df[original_cols]

def weighted_calc_mean_pred(df, lerp_df):
    """Fuse the positions of all phones sharing a collection and timestamp.

    The rows of ``df`` and ``lerp_df`` are pooled and, for every
    (``collectionName``, ``millisSinceGpsEpoch``), combined into a weighted
    mean with weight ``1 / cov**8`` (``latDeg_cov`` for latitude,
    ``lngDeg_cov`` for longitude).

    Rows with a missing position or covariance are left out of the weighted
    mean. If no usable weight remains for a group, the plain mean of the
    available positions is used instead.

    Parameters
    ----------
    df : DataFrame with ``collectionName``, ``phoneName``,
        ``millisSinceGpsEpoch``, ``latDeg``, ``lngDeg``, ``latDeg_cov`` and
        ``lngDeg_cov``.
    lerp_df : extra rows with the same columns, e.g. interpolated rows.

    Returns
    -------
    DataFrame with one row per row of ``df`` and the columns
    ``collectionName``, ``phoneName``, ``millisSinceGpsEpoch``, ``latDeg``,
    ``lngDeg``, where the positions are the fused values.
    """
    keys = ['collectionName', 'millisSinceGpsEpoch']
    axes = ('latDeg', 'lngDeg')

    # ignore_index: both inputs may carry the same index labels; pooled rows
    # must stay individually addressable.
    pooled = pd.concat([df, lerp_df], ignore_index=True)

    terms = pooled[keys].copy()
    agg_spec = {}
    for axis in axes:
        weight = 1.0 / np.power(pooled[axis + '_cov'], 8)
        # A row without a position must not contribute weight either.
        weight = weight.where(pooled[axis].notna())
        terms[axis + '_w'] = weight
        terms[axis + '_wx'] = pooled[axis] * weight
        terms[axis] = pooled[axis]
        agg_spec.update({axis + '_w': 'sum', axis + '_wx': 'sum', axis: 'mean'})

    fused = terms.groupby(keys, as_index=False).agg(agg_spec)
    for axis in axes:
        weighted = fused[axis + '_wx'] / fused[axis + '_w']
        # 0/0 or inf/inf (no usable weight) falls back to the plain mean.
        fused[axis] = weighted.fillna(fused[axis])

    mean_pred_df = df[['collectionName', 'phoneName', 'millisSinceGpsEpoch']].copy()
    return mean_pred_df.merge(fused[keys + list(axes)], on=keys, how='left')

def percentile50(x):
    """Return the 50th percentile of ``x``."""
    return np.percentile(x, q=50)


def percentile95(x):
    """Return the 95th percentile of ``x``."""
    return np.percentile(x, q=95)

def get_train_score(df, gt):
    """Score predicted positions against ground truth.

    ``df`` and ``gt`` are joined on collection, phone and timestamp. The
    haversine error is computed per row; per phone, the 50th and 95th
    percentile of that error are averaged; the score is the mean of those
    per-phone values.
    """
    truth = gt.rename(columns={'latDeg': 'latDeg_gt', 'lngDeg': 'lngDeg_gt'})
    merged = df.merge(
        truth,
        on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'],
        how='left',
    )
    # Per-row distance error.
    merged['err'] = calc_haversine(
        merged['latDeg_gt'], merged['lngDeg_gt'], merged['latDeg'], merged['lngDeg']
    )
    # Per-phone percentiles; the column keeps its historical 'p50_p90' name
    # although the upper percentile is the 95th.
    merged['phone'] = merged['collectionName'] + '_' + merged['phoneName']
    per_phone = merged.groupby('phone')['err'].agg([percentile50, percentile95])
    per_phone['p50_p90_mean'] = (per_phone['percentile50'] + per_phone['percentile95']) / 2
    return per_phone['p50_p90_mean'].mean()

def get_removedevice(input_df: pd.DataFrame, divece: str) -> pd.DataFrame:
    """Replace one device's positions with values interpolated from other phones.

    In every collection that contains ``divece`` together with at least one
    other phone, the ``latDeg``/``lngDeg`` of ``divece`` are discarded and
    re-filled by time-based linear interpolation across the remaining phones'
    rows. Rows that cannot be interpolated (outside the time range covered by
    the other phones) keep their original position.

    Parameters
    ----------
    input_df : DataFrame with ``collectionName``, ``phoneName``,
        ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg``. Not modified.
    divece : phone name whose positions are replaced.

    Returns
    -------
    A new DataFrame with the rows, index and columns of ``input_df`` (ordered
    by index); only ``latDeg``/``lngDeg`` may differ.
    """
    pos_cols = ['latDeg', 'lngDeg']
    if len(input_df) == 0:
        return input_df.copy()

    # assign() builds a new frame, so the helper 'index' column never appears
    # on the caller's DataFrame.
    work = input_df.assign(index=input_df.index).sort_values('millisSinceGpsEpoch')
    # Time becomes the index so interpolate(method='index') is time-weighted.
    work.index = work['millisSinceGpsEpoch'].values

    parts = []
    for _, subdf in work.groupby('collectionName'):
        phones = subdf['phoneName'].unique()
        if len(phones) == 1 or divece not in phones:
            parts.append(subdf)
            continue

        fixed = subdf.copy()
        fixed.loc[fixed['phoneName'] == divece, pos_cols] = np.nan
        # Only the position columns are interpolated; other columns are untouched.
        fixed[pos_cols] = fixed[pos_cols].interpolate(method='index', limit_area='inside')

        # Leading/trailing rows cannot be interpolated: restore the originals.
        unfilled = fixed['latDeg'].isnull()
        fixed.loc[unfilled, pos_cols] = subdf.loc[unfilled, pos_cols].values
        parts.append(fixed)

    if not parts:
        # Every row had a missing collectionName, so no group was formed.
        return input_df.copy()

    output_df = pd.concat(parts)
    output_df.index = output_df['index'].values
    return output_df.sort_index().drop(columns='index')

def get_remove_data(input_df: pd.DataFrame, collection_device) -> pd.DataFrame:
    """Replace selected (collection, device) tracks with interpolated positions.

    For each collection named in ``collection_device`` that has more than one
    phone, the ``latDeg``/``lngDeg`` of the listed devices are discarded and
    re-filled by time-based linear interpolation across the remaining rows of
    that collection. Rows that cannot be interpolated keep their original
    position.

    Parameters
    ----------
    input_df : DataFrame with ``collectionName``, ``phoneName``,
        ``millisSinceGpsEpoch``, ``latDeg`` and ``lngDeg``. Not modified.
    collection_device : iterable of ``(collectionName, phoneName)`` pairs
        (indexed as ``x[0]``, ``x[1]``) selecting the tracks to replace.

    Returns
    -------
    A new DataFrame with the rows, index and columns of ``input_df`` (ordered
    by index); only ``latDeg``/``lngDeg`` may differ.
    """
    pos_cols = ['latDeg', 'lngDeg']
    if len(input_df) == 0:
        return input_df.copy()

    # assign() builds a new frame, so the helper 'index' column never appears
    # on the caller's DataFrame.
    work = input_df.assign(index=input_df.index).sort_values('millisSinceGpsEpoch')
    # Time becomes the index so interpolate(method='index') is time-weighted.
    work.index = work['millisSinceGpsEpoch'].values

    parts = []
    for c, subdf in work.groupby('collectionName'):
        phones = subdf['phoneName'].unique()
        devices = [x[1] for x in collection_device if x[0] == c]
        if len(phones) == 1 or len(devices) == 0:
            parts.append(subdf)
            continue

        fixed = subdf.copy()
        fixed.loc[fixed['phoneName'].isin(devices), pos_cols] = np.nan
        # Only the position columns are interpolated; other columns are untouched.
        fixed[pos_cols] = fixed[pos_cols].interpolate(method='index', limit_area='inside')

        # Leading/trailing rows cannot be interpolated: restore the originals.
        unfilled = fixed['latDeg'].isnull()
        fixed.loc[unfilled, pos_cols] = subdf.loc[unfilled, pos_cols].values
        parts.append(fixed)

    if not parts:
        # Every row had a missing collectionName, so no group was formed.
        return input_df.copy()

    output_df = pd.concat(parts)
    output_df.index = output_df['index'].values
    return output_df.sort_index().drop(columns='index')

In [None]:
# add_distance_diff() writes helper columns onto the frame it is given, so it
# gets a copy and `test` stays as it was.
test_ro = add_distance_diff(test.copy())
# Distance threshold for outlier rejection, in the unit returned by
# calc_haversine (presumably metres — confirm).
th = 50
print('outlier filtering')
# A point far from BOTH neighbours is an isolated jump: mark it as missing.
is_outlier = (test_ro['dist_prev'] > th) & (test_ro['dist_next'] > th)
test_ro.loc[is_outlier, ['latDeg', 'lngDeg']] = np.nan
cols = ['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']
print('appling kf smoothing')
# apply_kf_smoothing() writes into its argument; give it an independent frame.
test_ro_kf = apply_kf_smoothing(test_ro[cols].copy())
vanilla_kal = test_ro_kf.copy()
print(get_train_score(test, gt))
print(get_train_score(test_ro_kf, gt))

# Velocity Kalman Filter

In [None]:
# Kalman filter that additionally observes velocity.
# The 6-dim state advances as: x[0:2] += T*x[2:4] + 0.5*T^2*x[4:6], x[2:4] += T*x[4:6].
T = 1.0
state_transition = np.eye(6)
state_transition[[0, 1, 2, 3], [2, 3, 4, 5]] = T
state_transition[[0, 1], [4, 5]] = 0.5 * T ** 2
process_noise = np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6]) + np.full((6, 6), 1e-9)
# The first four state components are observed.
observation_model = np.eye(4, 6, dtype=int)
observation_noise = np.diag([5e-5, 5e-5, 5e-6, 5e-6]) + np.full((4, 4), 1e-9)

kfv = simdkalman.KalmanFilter(
    state_transition=state_transition,
    process_noise=process_noise,
    observation_model=observation_model,
    observation_noise=observation_noise,
)

def adjust_time(df):
    """Velocity-aware Kalman smoothing of an irregular track via a 1 s grid.

    Steps:
      1. Linearly interpolate ``latDeg``, ``lngDeg``, ``latv_pred`` and
         ``lngv_pred`` onto a 1000 ms grid that starts at the first timestamp.
      2. Smooth the gridded observations with the module-level filter ``kfv``.
      3. Linearly interpolate the smoothed position means and variances back
         onto the original timestamps.

    Parameters
    ----------
    df : DataFrame for a single phone track with ``millisSinceGpsEpoch``,
        ``latDeg``, ``lngDeg``, ``latv_pred`` and ``lngv_pred``. Timestamps are
        assumed to be in increasing order (``np.interp`` requires it).

    Returns
    -------
    A copy of ``df`` with a fresh 0..n-1 index, ``latDeg``/``lngDeg`` replaced
    by the smoothed values and ``latDeg_cov``/``lngDeg_cov`` added.
    """
    t = 'millisSinceGpsEpoch'
    obs_cols = ['latDeg', 'lngDeg', 'latv_pred', 'lngv_pred']
    data = df.copy().reset_index(drop=True)
    times = data[t].to_numpy(dtype=float)

    # The grid extends to at least the last timestamp (hence the +1), so every
    # original row falls inside it when interpolating back.
    n_grid = math.ceil((times[-1] - times[0]) / 1000) + 1
    grid_t = times[0] + 1000.0 * np.arange(n_grid)

    # A grid point past the last sample has no observation; NaN marks it as
    # missing, the same way outlier-filtered positions reach the filter.
    observations = np.column_stack([
        np.interp(grid_t, times, data[col].to_numpy(dtype=float), right=np.nan)
        for col in obs_cols
    ])

    smoothed = kfv.smooth(observations.reshape(1, n_grid, len(obs_cols)))
    mean = smoothed.states.mean[0]
    cov = smoothed.states.cov[0]

    data['latDeg'] = np.interp(times, grid_t, mean[:, 0])
    data['lngDeg'] = np.interp(times, grid_t, mean[:, 1])
    data['latDeg_cov'] = np.interp(times, grid_t, cov[:, 0, 0])
    data['lngDeg_cov'] = np.interp(times, grid_t, cov[:, 1, 1])
    return data

def apply_kf_smoothing(df, kf_=kfv):
    """Kalman-smooth every phone track using position and predicted velocity.

    Tracks are identified by (``collectionName``, ``phoneName``). Tracks for
    which ``check_time`` returns True go through ``adjust_time`` (which does
    not receive ``kf_``); the others are smoothed directly with ``kf_`` using
    ``latDeg``, ``lngDeg``, ``latv_pred`` and ``lngv_pred`` as observations.
    The smoothed positions and the ``latDeg_cov``/``lngDeg_cov`` columns are
    written onto ``df`` itself, which is also returned.
    """
    track_keys = df[['collectionName', 'phoneName']].drop_duplicates()
    for collection, phone in track_keys.itertuples(index=False, name=None):
        on_track = (df['collectionName'] == collection) & (df['phoneName'] == phone)
        track = df[on_track]

        if check_time(track):
            resampled = adjust_time(track)
            for col in ('latDeg', 'lngDeg', 'latDeg_cov', 'lngDeg_cov'):
                df.loc[on_track, col] = resampled[col].values
            continue

        observations = track[['latDeg', 'lngDeg', 'latv_pred', 'lngv_pred']].to_numpy()
        result = kf_.smooth(observations.reshape(1, len(observations), 4))
        state_mean = result.states.mean
        state_cov = result.states.cov
        df.loc[on_track, 'latDeg'] = state_mean[0, :, 0]
        df.loc[on_track, 'latDeg_cov'] = state_cov[0, :, 0, 0]
        df.loc[on_track, 'lngDeg'] = state_mean[0, :, 1]
        df.loc[on_track, 'lngDeg_cov'] = state_cov[0, :, 1, 1]
    return df

In [None]:
# add_distance_diff() writes helper columns onto the frame it is given, so it
# gets a copy and `test` stays as it was.
test_ro = add_distance_diff(test.copy())
# Distance threshold for outlier rejection, in the unit returned by
# calc_haversine (presumably metres — confirm).
th = 50
print('outlier filtering')
# A point far from BOTH neighbours is an isolated jump: mark it as missing.
is_outlier = (test_ro['dist_prev'] > th) & (test_ro['dist_next'] > th)
test_ro.loc[is_outlier, ['latDeg', 'lngDeg']] = np.nan
cols = ['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg', 'latv_pred', 'lngv_pred']
print('appling kf smoothing')
# apply_kf_smoothing() writes into its argument; give it an independent frame.
test_ro_kf = apply_kf_smoothing(test_ro[cols].copy())
print(get_train_score(test, gt))
print(get_train_score(test_ro_kf, gt))
velocity_kal = test_ro_kf.copy()

In [None]:
# Ensemble: where the predicted speed is high, fall back to the vanilla Kalman
# result; elsewhere keep the velocity-aware Kalman result.
en = velocity_kal.copy()
# Speed proxy is |lat velocity| + |lng velocity|; both axes must be included.
speed = velocity_kal['latv_pred'].abs() + velocity_kal['lngv_pred'].abs()
cond = speed > 2e-4
print(cond.sum()/len(en))
en.loc[cond,'latDeg'] = vanilla_kal.loc[cond,'latDeg']*1.0
en.loc[cond,'lngDeg'] = vanilla_kal.loc[cond,'lngDeg']*1.0
print('通常カルマン:',get_train_score(vanilla_kal,gt))
print('速度カルマン',get_train_score(velocity_kal,gt))
print('早いところ通常カルマン、遅いところは速度カルマン:',get_train_score(en,gt))

# 考察
- 0.05弱通常のカルマンフィルタより向上した（collectionでvalidationもしているのである程度効果はありそう）
 - Vanilla Kalman Filter:4.721716804543573
 - Velocity Kalman Filter:4.678559609622235
- アンサンブルに関しては効果はほぼない


- 