<a href="https://colab.research.google.com/github/keymc021/work/blob/master/demonstration_of_the_kalman_filter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This sample notebook demonstrates how to apply a Kalman filter to slightly improve on the baseline predictions.

The notebook is based on https://www.kaggle.com/jpmiller/baseline-from-host-data

## ensure you have everything you need

In [None]:
!pip install simdkalman

Collecting simdkalman
  Downloading simdkalman-1.0.1-py2.py3-none-any.whl (11 kB)
Installing collected packages: simdkalman
Successfully installed simdkalman-1.0.1


Please read the documentation if you would like to learn more about this implementation of the Kalman filter (KF): https://simdkalman.readthedocs.io/en/latest/

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import simdkalman
from tqdm.notebook import tqdm

## define kf model


In [None]:
# Constant-acceleration motion model. The six state components are
# [lat, lng, d(lat)/dt, d(lng)/dt, d2(lat)/dt2, d2(lng)/dt2]; only the first
# two (the position) are observed.
# NOTE(review): T = 1.0 treats consecutive rows as one time step apart -
# confirm that the samples are evenly spaced.
T = 1.0
state_transition = np.array([
    [1, 0, T, 0, 0.5 * T ** 2, 0],
    [0, 1, 0, T, 0, 0.5 * T ** 2],
    [0, 0, 1, 0, T, 0],
    [0, 0, 0, 1, 0, T],
    [0, 0, 0, 0, 1, 0],
    [0, 0, 0, 0, 0, 1],
])

# Diagonal variances plus a small constant added to every entry of the matrix.
process_noise = np.full((6, 6), 1e-9) + np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6])

# Picks the two position components out of the six-dimensional state.
observation_model = np.eye(2, 6, dtype=int)
observation_noise = np.full((2, 2), 1e-9) + np.diag([5e-5, 5e-5])

# Assemble the filter from the model matrices; this instance is the default
# filter used by apply_kf_smoothing.
kf = simdkalman.KalmanFilter(
    state_transition=state_transition,
    process_noise=process_noise,
    observation_model=observation_model,
    observation_noise=observation_noise,
)

In [None]:
def apply_kf_smoothing(df, kf_=kf):
    """Smooth the latitude/longitude of every track in ``df`` with a Kalman smoother.

    A track is the set of rows sharing one ``(collectionName, phoneName)``
    pair. Each track is smoothed independently, with its rows taken in the
    order in which they appear in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``collectionName``, ``phoneName``,
        ``latDeg`` and ``lngDeg``.
    kf_ : simdkalman.KalmanFilter, optional
        Filter used for smoothing; defaults to the module-level ``kf``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in: its ``latDeg`` and
        ``lngDeg`` columns are overwritten in place with the smoothed values.
    """
    coord_cols = ['latDeg', 'lngDeg']

    # Row positions of every track, found in a single pass over the frame
    # instead of building one full-length boolean mask per track.
    track_positions = df.groupby(['collectionName', 'phoneName'], sort=False).indices

    coords = df[coord_cols].to_numpy(dtype=float)
    # Rows that belong to no group keep their original coordinates.
    smoothed_coords = coords.copy()

    for positions in tqdm(track_positions.values(), total=len(track_positions)):
        # The smoother takes a 3-D array; the leading axis of length 1 holds
        # this single track: (1, n_rows, 2).
        smoothed = kf_.smooth(coords[positions][np.newaxis])
        # The first two state components are the smoothed lat/lng.
        smoothed_coords[positions] = smoothed.states.mean[0, :, :2]

    df[coord_cols] = smoothed_coords
    return df

## evaluate train error

In [None]:
data_path = Path("../input/google-smartphone-decimeter-challenge")
cols = ['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']

# rglob() returns a generator; materialise it so that the progress bar takes
# its total from the files actually found rather than a hard-coded count, and
# sort it so that the load order does not depend on the file system.
truths = sorted((data_path / 'train').rglob('ground_truth.csv'))
if not truths:
    raise FileNotFoundError(f"no ground_truth.csv found under {data_path / 'train'}")

df_list = [pd.read_csv(t, usecols=cols) for t in tqdm(truths)]
df_truth = pd.concat(df_list, ignore_index=True)

# Smooth the baseline predictions, then pair every prediction with its ground
# truth on (collectionName, phoneName, millisSinceGpsEpoch).
df_basepreds_kf = apply_kf_smoothing(
    pd.read_csv(data_path / 'baseline_locations_train.csv', usecols=cols))
df_all = df_truth.merge(df_basepreds_kf, how='inner', on=cols[:3], suffixes=('_truth', '_basepred'))

  0%|          | 0/73 [00:00<?, ?it/s]

  0%|          | 0/73 [00:00<?, ?it/s]

In [None]:
# simplified haversine distance
def calc_haversine(lat1, lon1, lat2, lon2, radius=6_367_000):
    """Great-circle distance between two points on a sphere (haversine formula).

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : float or array-like
        Coordinates in decimal degrees; array-likes are combined element-wise.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6 367 000, an Earth radius in metres.

    Returns
    -------
    float or array-like
        Distance(s) measured along the surface of the sphere.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2

    # Rounding can push `a` marginally above 1 for (near-)antipodal points,
    # which would make arcsin return NaN; keep it inside arcsin's domain.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    return radius * c

In [None]:
# Per-row distance between the ground truth and the smoothed baseline position.
df_all['dist'] = calc_haversine(
    lat1=df_all['latDeg_truth'],
    lon1=df_all['lngDeg_truth'],
    lat2=df_all['latDeg_basepred'],
    lon2=df_all['lngDeg_basepred'],
)

print(f'mean error on train dataset: {df_all.dist.mean():.3f}m - slightly better than the baseline, but still a lot of improvements are needed')


mean error on train dataset: 3.391m - slightly better than the baseline, but still a lot of improvements are needed


## prepare a submission based on the sample submission

In [None]:
test_base = pd.read_csv(
    '../input/google-smartphone-decimeter-challenge/baseline_locations_test.csv')

sub = pd.read_csv('../input/google-smartphone-decimeter-challenge/sample_submission.csv')

# apply_kf_smoothing overwrites latDeg/lngDeg of the frame it is given and
# returns that same frame.
kf_smoothed_baseline = apply_kf_smoothing(test_base)

# assign() aligns on the index, so any row of `sub` without a counterpart in
# the smoothed baseline would silently end up as NaN in the submission.
# NOTE(review): this also relies on both files listing their rows in the same
# order - confirm against the competition data.
assert sub.index.equals(kf_smoothed_baseline.index), (
    'sample submission and baseline test predictions do not have matching rows')

sub = sub.assign(
    latDeg=kf_smoothed_baseline['latDeg'],
    lngDeg=kf_smoothed_baseline['lngDeg'],
)
sub.to_csv('submission.csv', index=False)

  0%|          | 0/48 [00:00<?, ?it/s]