### testの方も完全なwifiベースでデータセットを作成
testのreal timestampは公開notebookのものに変更
kalman filterでwaypointを補完する　

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import gc
import json 
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt
import dask
from dask.distributed import Client, wait, LocalCluster
from pathlib import Path
import warnings
warnings.simplefilter('ignore')

In [2]:
def to_pickle(filename, obj):
    with open(filename, mode='wb') as f:
        pickle.dump(obj, f)
        
def from_pickle(filename):
    with open(filename, mode='rb') as f:
        obj = pickle.load(f)
    return obj

In [4]:
# waypointを補正したdataset
root_dir = Path('../input/')
with open(root_dir/'2kaido_wifi_dataset_v5/train_10000_7.pkl', 'rb') as f:
  train_df = pickle.load(f)

with open(root_dir/'2kaido_wifi_dataset_v5/test_10000_7.pkl', 'rb') as f:
  test_df = pickle.load(f)

In [5]:
train_df = train_df.astype({'timestamp':int,'x': np.float32, 'y': np.float32, 'floor':np.float32, 'ix':np.float32, 'iy':np.float32,  'fx':np.float32, 'fy':np.float32})
train_df['floor'] = train_df['floor'].astype(int)  # str -> float -> intで負の数をintにする
test_df = test_df.astype({'timestamp':int})

In [6]:
train_df['distance'] = np.sqrt((train_df['ix']-train_df['fx'])**2 + (train_df['iy']-train_df['fy'])**2)
itrain_df = train_df.drop(['fx','fy'], axis=1).copy()  # linearのtrain
ftrain_df = train_df.drop(['ix','iy'], axis=1).copy()  # kalmanのtrain 
ftrain_df = ftrain_df[ftrain_df['distance']<5].reset_index(drop=True)   # 5<distanceは信頼性低いので削る
ftrain_df = ftrain_df.rename(columns={'fx':'ix', 'fy':'iy'})
train_df = pd.concat([itrain_df, ftrain_df]).drop_duplicates().reset_index(drop=True)  # 重複削除によって 503421 -> 503119

In [7]:
train_df['site'] + '_' + train_df['path'] + '_' + train_df['timestamp']

Unnamed: 0,ssid_0,ssid_1,ssid_2,ssid_3,ssid_4,ssid_5,ssid_6,ssid_7,ssid_8,ssid_9,...,itimestamp,ix,iy,floor,floor_str,path,site_id,timediff,itimediff,distance
0,356d66c73c423be835bd5d07cb1ebdfa821d8e23,c3513a636d1a813db081a2ffc33f297b4fefe28d,990847ae755de95a44c7b1be54e66f4e56fdc7f4,4e601619b7c7d9df8d61490ad2c134f08ea01d61,c3513a636d1a813db081a2ffc33f297b4fefe28d,b14c56d7cd73a5b026fc118c671e24dc2ba2558b,d839a45ebe64ab48b60a407d837fb01d3c0dfef9,7182afc4e5c212133d5d7d76eb3df6c24618302b,b9f0208be00bd8b337be7f12e02e3a3ce846e22b,3745dc080c9396d2317f4c05d7141c1df83acf9d,...,1578466134278,113.615387,156.634796,-1,B1,5e1580d1f4c3420006d520e4,5a0546857ecc773753327266,-1545,-45,0.587131
1,356d66c73c423be835bd5d07cb1ebdfa821d8e23,f44fa6118fed7198296c8b45b2f2684903d99620,c3513a636d1a813db081a2ffc33f297b4fefe28d,990847ae755de95a44c7b1be54e66f4e56fdc7f4,d839a45ebe64ab48b60a407d837fb01d3c0dfef9,b7e6027447eb1f81327d66cfd3adbe557aabf26c,7182afc4e5c212133d5d7d76eb3df6c24618302b,da39a3ee5e6b4b0d3255bfef95601890afd80709,b9f0208be00bd8b337be7f12e02e3a3ce846e22b,b6ffe5619e02871fcd04f61c9bb4b5c53a3f46b7,...,1578466136278,112.655884,156.358200,-1,B1,5e1580d1f4c3420006d520e4,5a0546857ecc773753327266,-3512,-12,0.772321
2,f44fa6118fed7198296c8b45b2f2684903d99620,356d66c73c423be835bd5d07cb1ebdfa821d8e23,8aed75f7c344e6a3d4916750029dd4ee47c1e7c5,c3513a636d1a813db081a2ffc33f297b4fefe28d,5a368e0bd5050bdb4653dc39c86bee1fd2b8aeb7,4e601619b7c7d9df8d61490ad2c134f08ea01d61,2ce029b3a3dea973a44bf0587be5e2b93a74e5f1,b14c56d7cd73a5b026fc118c671e24dc2ba2558b,3fa90121039c7b6e24ae985d228e0366ae15fba4,3745dc080c9396d2317f4c05d7141c1df83acf9d,...,1578466138278,111.936256,156.150772,-1,B1,5e1580d1f4c3420006d520e4,5a0546857ecc773753327266,-5471,29,1.045990
3,356d66c73c423be835bd5d07cb1ebdfa821d8e23,f44fa6118fed7198296c8b45b2f2684903d99620,5a368e0bd5050bdb4653dc39c86bee1fd2b8aeb7,990847ae755de95a44c7b1be54e66f4e56fdc7f4,da39a3ee5e6b4b0d3255bfef95601890afd80709,b6ffe5619e02871fcd04f61c9bb4b5c53a3f46b7,b7e6027447eb1f81327d66cfd3adbe557aabf26c,3fa90121039c7b6e24ae985d228e0366ae15fba4,d839a45ebe64ab48b60a407d837fb01d3c0dfef9,d839a45ebe64ab48b60a407d837fb01d3c0dfef9,...,1578466140278,110.976761,155.874176,-1,B1,5e1580d1f4c3420006d520e4,5a0546857ecc773753327266,-7428,72,1.889729
4,f44fa6118fed7198296c8b45b2f2684903d99620,356d66c73c423be835bd5d07cb1ebdfa821d8e23,990847ae755de95a44c7b1be54e66f4e56fdc7f4,c3513a636d1a813db081a2ffc33f297b4fefe28d,d839a45ebe64ab48b60a407d837fb01d3c0dfef9,b7e6027447eb1f81327d66cfd3adbe557aabf26c,7182afc4e5c212133d5d7d76eb3df6c24618302b,b9f0208be00bd8b337be7f12e02e3a3ce846e22b,b6ffe5619e02871fcd04f61c9bb4b5c53a3f46b7,8aed75f7c344e6a3d4916750029dd4ee47c1e7c5,...,1578466142278,110.017258,155.597580,-1,B1,5e1580d1f4c3420006d520e4,5a0546857ecc773753327266,7401,105,2.400107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503114,18336f0964fbd470a2ee7116d33b96946c633999,072159287058774aa9b450c9163129b309bc982b,18336f0964fbd470a2ee7116d33b96946c633999,040667e2fdd3a9bbb54e970c430ad6f08416c52e,072159287058774aa9b450c9163129b309bc982b,da39a3ee5e6b4b0d3255bfef95601890afd80709,da39a3ee5e6b4b0d3255bfef95601890afd80709,da39a3ee5e6b4b0d3255bfef95601890afd80709,da39a3ee5e6b4b0d3255bfef95601890afd80709,90ef6526fa145d95dc065edc3b3a3193406f3ee4,...,1573892866000,128.817688,108.838440,6,F7,5dcfb393878f3300066c70a6,5dc8cea7659e181adb076a3f,-2224,38,0.209345
503115,18336f0964fbd470a2ee7116d33b96946c633999,18336f0964fbd470a2ee7116d33b96946c633999,072159287058774aa9b450c9163129b309bc982b,072159287058774aa9b450c9163129b309bc982b,ea9e102c49dbc834b1059c5e8b5dc4a017a82dff,040667e2fdd3a9bbb54e970c430ad6f08416c52e,da39a3ee5e6b4b0d3255bfef95601890afd80709,ea9e102c49dbc834b1059c5e8b5dc4a017a82dff,da39a3ee5e6b4b0d3255bfef95601890afd80709,022a370e1c578e795c972b955120606c5dcc6cc8,...,1573892868000,131.251892,111.029167,6,F7,5dcfb393878f3300066c70a6,5dc8cea7659e181adb076a3f,1098,126,0.789737
503116,18336f0964fbd470a2ee7116d33b96946c633999,18336f0964fbd470a2ee7116d33b96946c633999,072159287058774aa9b450c9163129b309bc982b,ea9e102c49dbc834b1059c5e8b5dc4a017a82dff,072159287058774aa9b450c9163129b309bc982b,040667e2fdd3a9bbb54e970c430ad6f08416c52e,1f09251bbfadafb11c63c87963af25238d6bc886,da39a3ee5e6b4b0d3255bfef95601890afd80709,da39a3ee5e6b4b0d3255bfef95601890afd80709,ea9e102c49dbc834b1059c5e8b5dc4a017a82dff,...,1573892870000,132.174164,111.875511,6,F7,5dcfb393878f3300066c70a6,5dc8cea7659e181adb076a3f,-790,238,1.013994
503117,18336f0964fbd470a2ee7116d33b96946c633999,18336f0964fbd470a2ee7116d33b96946c633999,072159287058774aa9b450c9163129b309bc982b,040667e2fdd3a9bbb54e970c430ad6f08416c52e,da39a3ee5e6b4b0d3255bfef95601890afd80709,072159287058774aa9b450c9163129b309bc982b,90ef6526fa145d95dc065edc3b3a3193406f3ee4,ea9e102c49dbc834b1059c5e8b5dc4a017a82dff,1f09251bbfadafb11c63c87963af25238d6bc886,da39a3ee5e6b4b0d3255bfef95601890afd80709,...,1573892871500,134.495392,114.157982,6,F7,5dcfb393878f3300066c70a6,5dc8cea7659e181adb076a3f,2619,-187,0.713653


In [8]:
def split_col(df):
    """
    Split submission site/path/timestamp into individual columns.
    """
    df = pd.concat(
        [
            df["site_path_timestamp"]
            .str.split("_", expand=True)
            .rename(columns={0: "site", 1: "path", 2: "timestamp"}),
            df,
        ],
        axis=1,
    ).copy()
    return df

In [9]:
def add_predictions_location(args):
    (site, floorNo) , df_submission = args
    df_result = df_submission.copy()
    with open(f"../input/indoor-location-navigation/metadata/{site}/{floorNo}/geojson_map.json") as json_file:
        geofloor_data = json.load(json_file)
    with open(f"../input/indoor-location-navigation/metadata/{site}/{floorNo}/floor_info.json") as json_file:
        floor_info = json.load(json_file)
    type_poly = geofloor_data['features'][0]['geometry']['type']
    if type_poly == 'Polygon':
        polygon = np.array(geofloor_data['features'][0]['geometry']['coordinates'][0])
    else:
        polygon = np.array(geofloor_data['features'][0]['geometry']['coordinates'][0][0])
    floor_polygons = Polygon(polygon)
    store_polygons_l = [Polygon(features['geometry']['coordinates'][0]) for features in geofloor_data['features'][1:]]
    store_polygons = so.unary_union(store_polygons_l)
    safe_area_polygons = floor_polygons.difference(store_polygons)
    x_max, x_min = polygon[:, 0].max(), polygon[:, 0].min()
    y_max, y_min = polygon[:, 1].max(), polygon[:, 1].min()
    df_result['x_scaled'] = x_min + df_result['x'] * (x_max - x_min) / floor_info['map_info']['width']
    df_result['y_scaled'] = y_min + df_result['y'] * (y_max - y_min) / floor_info['map_info']['height']
    df_result['InFloor'] = df_result.apply(lambda row: floor_polygons.contains(Point(row['x_scaled'], row['y_scaled'])), axis=1)
    df_result['InStore'] = df_result.apply(lambda row: store_polygons.contains(Point(row['x_scaled'], row['y_scaled'])), axis=1)
    df_result['InSafe'] = df_result.apply(lambda row: safe_area_polygons.contains(Point(row['x_scaled'], row['y_scaled'])), axis=1)
    return df_result

In [None]:
import multiprocessing
from tqdm.notebook import tqdm

processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    dfs = pool.imap_unordered(add_predictions_location, train_df.groupby(['site_id', 'floor']))
    dfs = tqdm(dfs)
    dfs = list(dfs)
sub = pd.concat(dfs).sort_values('site_path_timestamp')