In [1]:
import os
import sys
import glob
import yaml
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as stats
from pathlib import Path

from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, LabelEncoder

import wandb
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.utilities.seed import seed_everything

## config

In [2]:
# config
with open('config.yaml') as f:
    config = yaml.safe_load(f)

# globals variable
SEED = config['globals']['seed']
MAX_EPOCHS = config['globals']['max_epochs']
N_SPLITS = config['globals']['n_splits']
USE_FOLDS = config['globals']['use_folds']
DEBUG = config['globals']['debug']
EXP_MESSAGE = config['globals']['exp_message']
NOTES = config['globals']['notes']
MODEL_SAVE = config['globals']['model_save']
ONLY_PRED = config['globals']['only_pred']
PRETRAINED = config['globals']['pretrained']
PRETRAINED_PATH = config['globals']['pretrained_path']
EXP_NAME = str(Path().resolve()).split('/')[-1]

# seed
seed_everything(SEED)

Global seed set to 1996


1996

In [3]:
EXP_NAME

'exp013'

In [4]:
!wandb login 1bb2d0449c11d8b987e25c38b9d8dda176310fb6

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/user/.netrc


## read data

In [5]:
# waypointを補正したdataset
root_dir = Path('../../input/')
train_df = pd.read_csv(root_dir/'kuto_wifi_dataset_v4/train/5000_10/train.csv')

test_df = pd.read_csv(root_dir/'kuto_wifi_dataset_v4/test/5000_10/test.csv')

sub_df = pd.read_csv('../../notebook/real_timestamp_sample_submission_v2.csv', index_col=0)

In [6]:
train_df

Unnamed: 0,site,path,timestamp,file_name,floor,floor_str,x,y
0,5a0546857ecc773753327266,5e1580d1f4c3420006d520e4,1578466132778,5a0546857ecc773753327266_5e1580d1f4c3420006d52...,-1,B1,114.335010,156.842240
1,5a0546857ecc773753327266,5e1580d1f4c3420006d520e4,1578466149574,5a0546857ecc773753327266_5e1580d1f4c3420006d52...,-1,B1,106.659010,154.629520
2,5a0546857ecc773753327266,5e1580d1f4c3420006d520e4,1578466158395,5a0546857ecc773753327266_5e1580d1f4c3420006d52...,-1,B1,102.168240,158.429080
3,5a0546857ecc773753327266,5e1580d1f4c3420006d520e4,1578466166621,5a0546857ecc773753327266_5e1580d1f4c3420006d52...,-1,B1,107.850440,161.892620
4,5a0546857ecc773753327266,5e1580bb1506f2000638fc62,1578466886458,5a0546857ecc773753327266_5e1580bb1506f2000638f...,-1,B1,41.316772,180.017100
...,...,...,...,...,...,...,...,...
74636,5dc8cea7659e181adb076a3f,5dcfb393878f3300066c70a6,1573892854685,5dc8cea7659e181adb076a3f_5dcfb393878f3300066c7...,6,F7,117.176710,99.235780
74637,5dc8cea7659e181adb076a3f,5dcfb393878f3300066c70a6,1573892859436,5dc8cea7659e181adb076a3f_5dcfb393878f3300066c7...,6,F7,122.269950,102.664960
74638,5dc8cea7659e181adb076a3f,5dcfb393878f3300066c70a6,1573892863738,5dc8cea7659e181adb076a3f_5dcfb393878f3300066c7...,6,F7,126.631090,107.011640
74639,5dc8cea7659e181adb076a3f,5dcfb393878f3300066c70a6,1573892868972,5dc8cea7659e181adb076a3f_5dcfb393878f3300066c7...,6,F7,131.792860,111.526085


In [7]:
test_df

Unnamed: 0,site,path,timestamp,file_name
0,5a0546857ecc773753327266,5e1580d1f4c3420006d520e4,1578466132778,5a0546857ecc773753327266_5e1580d1f4c3420006d52...
1,5a0546857ecc773753327266,5e1580d1f4c3420006d520e4,1578466132778,5a0546857ecc773753327266_5e1580d1f4c3420006d52...
2,5a0546857ecc773753327266,5e1580d1f4c3420006d520e4,1578466132778,5a0546857ecc773753327266_5e1580d1f4c3420006d52...
3,5a0546857ecc773753327266,5e1580d1f4c3420006d520e4,1578466132778,5a0546857ecc773753327266_5e1580d1f4c3420006d52...
4,5a0546857ecc773753327266,5e1580d1f4c3420006d520e4,1578466132778,5a0546857ecc773753327266_5e1580d1f4c3420006d52...
...,...,...,...,...
9627,5a0546857ecc773753327266,5e1580d1f4c3420006d520e4,1578466132778,5a0546857ecc773753327266_5e1580d1f4c3420006d52...
9628,5a0546857ecc773753327266,5e1580d1f4c3420006d520e4,1578466132778,5a0546857ecc773753327266_5e1580d1f4c3420006d52...
9629,5a0546857ecc773753327266,5e1580d1f4c3420006d520e4,1578466132778,5a0546857ecc773753327266_5e1580d1f4c3420006d52...
9630,5a0546857ecc773753327266,5e1580d1f4c3420006d520e4,1578466132778,5a0546857ecc773753327266_5e1580d1f4c3420006d52...


In [8]:
# 対象のsiteのみで実験
target_site = '5a0546857ecc773753327266'
train_df = train_df[train_df['site'] == target_site]
test_df = test_df[test_df['site'] == target_site]

In [9]:
# sampleを取り出す
EXT = '.npy'
TRAIN_DATA_DIR = '../../input/kuto_wifi_dataset_v4/train/5000_10/' 
file_name = train_df['file_name'].sample(1).values[0]

sample_file_path = TRAIN_DATA_DIR + file_name + EXT
sample = np.load(sample_file_path)
sample.shape

(2838, 19)

nan

In [10]:
sample

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [11]:
import json
with open(f"../../notebook/bssid_50.json") as f:
    bssid = json.load(f)

In [12]:
target_bssids = bssid[target_site]
wifi_bssids_size = len(target_bssids)
wifi_bssids_size

2838

In [15]:
wifi_pos = pd.read_csv(f'../../input/nb013_bssid_position/nb013_bssid_position_{target_site}.csv')
wifi_pos

Unnamed: 0,ssid,bssid,bssid_x,bssid_y,n_samples,n_samples_rssi_over_m50,n_samples_rssi_over_m55,n_samples_rssi_over_m60,n_samples_rssi_over_m65,n_samples_rssi_over_m70,site,floor
0,da39a3ee5e6b4b0d3255bfef95601890afd80709,c08ad78a45798cfe176a42b35c7381ae602711c5,220.011148,157.668275,52,14,14,14,16,18,5a0546857ecc773753327266,B1
1,da39a3ee5e6b4b0d3255bfef95601890afd80709,ffc7c34369257431c7de9129094deb923bb3e3af,208.995536,156.881556,25,1,5,13,14,14,5a0546857ecc773753327266,B1
2,da39a3ee5e6b4b0d3255bfef95601890afd80709,24d178b2fe580b871c853757e2c4668b16bc3ffc,230.002040,153.508576,11,0,0,0,1,1,5a0546857ecc773753327266,B1
3,da39a3ee5e6b4b0d3255bfef95601890afd80709,23bfb8a3a2936f0536b7b8c1ee4a13706b8c18c6,204.576861,162.133758,47,0,1,4,4,17,5a0546857ecc773753327266,B1
4,da39a3ee5e6b4b0d3255bfef95601890afd80709,059ea3d13de011f91587f1d176599605274f8ee8,170.015288,167.586402,65,0,4,10,10,14,5a0546857ecc773753327266,B1
...,...,...,...,...,...,...,...,...,...,...,...,...
6551,ec46466c42df6238fbcae2b890eee2a012e54a02,7ff6dc072ab78c4ece184ec846d1f43f45c5c6c6,175.603702,79.293471,7,0,0,0,0,7,5a0546857ecc773753327266,F4
6552,345fe449374546fd68b02b8e2f2ee4c4b0a5ec81,ad9070a00c25e44b2c0e2e495818c7b43e50828e,180.331317,44.985035,17,0,0,3,12,17,5a0546857ecc773753327266,F4
6553,809c2396be248fe8c23bca3a04a761294db2a95e,317edaf3ea5c9a9f4958556ab14e8cee1a611a56,153.047750,45.166750,5,0,0,0,0,5,5a0546857ecc773753327266,F4
6554,4e3d9bf00149830025e0bcc7090f4f0ac059da8f,367750d787ec9c41106b733273b08d0a5b8046ae,150.634583,45.510945,10,0,0,0,0,3,5a0546857ecc773753327266,F4


In [21]:
wifi_pos.groupby('bssid')[['bssid_x', 'bssid_y']].mean()

Unnamed: 0_level_0,bssid_x,bssid_y
bssid,Unnamed: 1_level_1,Unnamed: 2_level_1
000840e5c600de293cea57f13326f273c86c3988,109.360597,103.353381
005246b6f51feb1a069e8f005d3e6aba2591b65b,31.989152,59.272956
0076ff7a084cb2ac8c146139965ab1be296e72c4,117.134146,111.699868
0089ad1dd75b13e2c3ceda344988c9f89a83a2f9,139.756293,89.002372
009a3ed672be7bd1b9c4437b43a53296771af098,31.747870,54.314438
...,...,...
ff70d33df144bdc870f2fd804d713e3c430f1c9b,115.684780,104.532918
ffa41c79865d7fb336f586e0dec8b080db1027fb,36.318478,121.153091
ffc7c34369257431c7de9129094deb923bb3e3af,208.995536,156.881556
ffe53bd4dcfaa42668baf5ea0d2ddc676538fce0,60.921142,59.771431


In [26]:
target_bssids_df = pd.DataFrame(target_bssids, columns=['bssid'])
target_bssids_df
# target_bssids_df.merge(wifi_pos.loc[:, ['bssid']])

Unnamed: 0,bssid
0,a52823c2ed57e18f81da316e5bcac8bd2754ce96
1,61a14256d195624aadd9dfd55c8643505635edd7
2,c93c29d2173b811a18de34940ccc210a3064230e
3,a09ab3d8a7700fec7b83389c06088c91748be41c
4,4cff1c8cfec27801ddc9a690ade87c57f1142ee0
...,...
2833,8553514711ebda556c2563b1ae2f3a6d4a2ba023
2834,6fbea29d369bf4483eea00f03f8fd4942f327a8d
2835,166f82b0bfe87b860c471e1ff6ec201cddd3f0a9
2836,55c79013845087ab922af62ad08d981b7ead947c


In [24]:
target_bssids_df['bssid'].nunique()

2838

## preprocessing

In [14]:
np.nan_to_num(sample, copy=False, nan=-100)

array([[-100., -100., -100., ..., -100., -100., -100.],
       [-100., -100., -100., ..., -100., -100., -100.],
       [-100., -100., -100., ..., -100., -100., -100.],
       ...,
       [-100., -100., -100., ..., -100., -100., -100.],
       [-100., -100., -100., ..., -100., -100., -100.],
       [-100., -100., -100., ..., -100., -100., -100.]])

In [15]:
sample / -100

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

## PyTorch model
- embedding layerが重要  

In [16]:
# dataset
from torch.utils.data import Dataset, DataLoader

class IndoorDataset(Dataset):
    def __init__(self, df, phase='train'):
        self.df = df
        self.phase = phase
        self.file_name = df['file_name'].values

        if phase in ['train', 'valid']:
            # self.xy = df[['x', 'y']].values.astype(np.float32)
            self.xy = df[['x', 'y']].values.astype(np.float32)  # wifiにより補正したx,yを使用
            self.floor = df['floor'].values.astype(np.float32)
        
    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        file_name = self.file_name[idx]
        feat = np.load(TRAIN_DATA_DIR + file_name + EXT)
        np.nan_to_num(feat, copy=False, nan=-100)
        feat = (feat / -100).astype(np.float32)
        
        if self.phase in ['train', 'valid']:
            target = {
                'xy':self.xy[idx],
                'floor':self.floor[idx]
            }
        else:
            target = {}
        return feat, target

In [17]:
def mean_position_error(xhat, yhat, fhat, x, y, f):
    intermediate = np.sqrt(np.power(xhat-x, 2) + np.power(yhat-y, 2)) + 15 * np.abs(fhat-f)
    return intermediate.sum()/xhat.shape[0]

def to_np(input):
    return input.detach().cpu().numpy()

In [18]:
def get_optimizer(model: nn.Module, config: dict):
    optimizer_config = config["optimizer"]
    optimizer_name = optimizer_config.get("name")
    base_optimizer_name = optimizer_config.get("base_name")
    optimizer_params = optimizer_config['params']

    if hasattr(optim, optimizer_name):
        optimizer = optim.__getattribute__(optimizer_name)(model.parameters(), **optimizer_params)
        return optimizer
    else:
        base_optimizer = optim.__getattribute__(base_optimizer_name)
        optimizer = globals().get(optimizer_name)(
            model.parameters(), 
            base_optimizer,
            **optimizer_config["params"])
        return  optimizer

def get_scheduler(optimizer, config: dict):
    scheduler_config = config["scheduler"]
    scheduler_name = scheduler_config.get("name")

    if scheduler_name is None:
        return
    else:
        return optim.lr_scheduler.__getattribute__(scheduler_name)(
            optimizer, **scheduler_config["params"])


def get_criterion(config: dict):
    loss_config = config["loss"]
    loss_name = loss_config["name"]
    loss_params = {} if loss_config.get("params") is None else loss_config.get("params")
    if hasattr(nn, loss_name):
        criterion = nn.__getattribute__(loss_name)(**loss_params)
    else:
        criterion = globals().get(loss_name)(**loss_params)

    return criterion

def worker_init_fn(worker_id):                                                          
    np.random.seed(np.random.get_state()[1][0] + worker_id)

In [19]:
# Learner class(pytorch-lighting)
class Learner(pl.LightningModule):
    def __init__(self, model, config):
        super().__init__()
        self.model = model
        self.config = config
        self.xy_criterion = get_criterion(config)
        self.f_criterion = get_criterion(config)
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        output = self.model(x)
        loss = self.xy_criterion(output["xy"], y["xy"])
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        output = self.model(x)
        xy_loss = self.xy_criterion(output["xy"], y["xy"])
        f_loss = self.f_criterion(output["floor"], y["floor"])
        loss = xy_loss  # + f_loss
        mpe = mean_position_error(
            to_np(output['xy'][:, 0]), to_np(output['xy'][:, 1]), 0, 
            to_np(y['xy'][:, 0]), to_np(y['xy'][:, 1]), 0)
        
        # floor lossは現状は無視して良い
        self.log(f'Loss/val', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        self.log(f'Loss/xy', xy_loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        self.log(f'Loss/floor', f_loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        self.log(f'MPE/val', mpe, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        return loss

    def configure_optimizers(self):
        optimizer = get_optimizer(self.model, self.config)
        scheduler = get_scheduler(optimizer, self.config)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "Loss/val"}

In [20]:
# oof
def evaluate(model, loaders, phase):
    x_list = []
    y_list = []
    f_list = []
    with torch.no_grad():
        for batch in loaders[phase]:
            x, y = batch
            output = model(x)
            x_list.append(to_np(output['xy'][:, 0]))
            y_list.append(to_np(output['xy'][:, 1]))
            f_list.append(to_np(output['floor']))

    x_list = np.concatenate(x_list)
    y_list = np.concatenate(y_list)
    f_list = np.concatenate(f_list)
    return x_list, y_list, f_list

In [21]:
import torch
from torch import nn

class LSTMModel(nn.Module):
    def __init__(self, wifi_bssids_size=2838, seq_length=19):
        super(LSTMModel, self).__init__()
        
        self.wifi_bssids_size = wifi_bssids_size
        self.seq_length = seq_length
        self.batch_norm1 = nn.BatchNorm1d(seq_length)
        self.lstm1 = nn.LSTM(input_size=wifi_bssids_size,hidden_size=128,dropout=0.3, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=128,hidden_size=16,dropout=0.1, batch_first=True)
        self.pooling = nn.AdaptiveAvgPool1d(1)

        self.fc_xy = nn.Linear(16, 2)
        self.fc_floor = nn.Linear(16, 1)

    
    def forward(self, x):
        # input embedding
        batch_size = x.shape[0]
        # x = self.linear_layer(x)
        
        
        # lstm layer
        x = x.view(batch_size, self.seq_length, -1)  # [batch, 1]->[batch, 1, 1]
        x = self.batch_norm1(x)
        x, _ = self.lstm1(x)
        x = torch.relu(x)
        x, _ = self.lstm2(x)
        x = torch.relu(x)
        x = x.transpose(1, 2)
        x = self.pooling(x)
        x = x.squeeze(2)
        xy = self.fc_xy(x)
        floor = torch.relu(self.fc_floor(x)).view(-1)
        return {"xy": xy, "floor": floor}

## train

In [22]:
oofs = []  # 全てのoofをdfで格納する
predictions = []  # 全ての予測値をdfで格納する
val_scores = []
# skf = model_selection.StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
gkf = model_selection.GroupKFold(n_splits=N_SPLITS)
train_fold = [(trn_idx, val_idx) for trn_idx, val_idx in gkf.split(train_df['path'], groups=train_df['path'])]
# 今回はtargetを均等に分ける必要はなくpathが均等に分かれればいいのでskf.split()にpathを与えている。
for fold in range(5):
    # 指定したfoldのみループを回す
    if fold not in USE_FOLDS:
        continue

    print('=' * 20)
    print(f'Fold {fold}')
    print('=' * 20)

    # train/valid data
    trn_idx, val_idx = train_fold[fold]
    trn_df = train_df.loc[trn_idx, :].reset_index(drop=True)
    val_df = train_df.loc[val_idx, :].reset_index(drop=True)

    # data loader
    loaders = {}
    loader_config = config["loader"]
    loaders["train"] = DataLoader(IndoorDataset(trn_df, phase="train"), **loader_config["train"], worker_init_fn=worker_init_fn) 
    loaders["valid"] = DataLoader(IndoorDataset(val_df, phase="valid"), **loader_config["valid"], worker_init_fn=worker_init_fn)
    loaders["test"] = DataLoader(IndoorDataset(test_df, phase="test"), **loader_config["test"], worker_init_fn=worker_init_fn)

    # model
    model = LSTMModel()  # +1としているのはLEを1スタートで始めているため
    model_name = model.__class__.__name__

    # callbacks
    callbacks = []
    checkpoint_callback = ModelCheckpoint(
        monitor=f'Loss/val',
        mode='min',
        dirpath=f"../../model/{EXP_NAME}",
        verbose=False,
        filename=f'{model_name}-{fold}')
    
    if MODEL_SAVE:
        callbacks.append(checkpoint_callback)

    early_stop_callback = EarlyStopping(
        monitor='Loss/val',
        min_delta=0.00,
        patience=200,
        verbose=False,
        mode='min')
    callbacks.append(early_stop_callback)

    # loggers
    RUN_NAME = EXP_NAME + "_" + EXP_MESSAGE
    wandb.init(project='indoor', notes=NOTES, entity='kuto5046', group=RUN_NAME)
    wandb.run.name = RUN_NAME + f'-fold-{fold}'
    wandb_config = wandb.config
    wandb_config.model_name = model_name
    wandb_config.LB = None
    wandb.watch(model)
    
    
    loggers = []
    loggers.append(WandbLogger())

    learner = Learner(model, config)
    # pretrained flag
    if PRETRAINED:
        ckpt = torch.load(PRETRAINED_PATH + f'{model_name}-{fold}.ckpt')
        learner.load_state_dict(ckpt['state_dict'])

    if not ONLY_PRED:
        trainer = pl.Trainer(
            logger=loggers, 
            callbacks=callbacks,
            max_epochs=MAX_EPOCHS,
            gpus=[0],
            fast_dev_run=DEBUG,
            deterministic=True,
            # precision=16,
            progress_bar_refresh_rate=0  # vscodeの時progress barの動作が遅いので表示しない
            )

        trainer.fit(learner, train_dataloader=loaders['train'], val_dataloaders=loaders['valid'])

    #############
    # validation (to make oof)
    #############
#     model.eval()  
#     oof_df = train.loc[val_idx_for_train, ['timestamp', 'x', 'y', 'site_id','site_id_str', 'wifi_x','wifi_y', 'floor', 'floor_str', 'path', 'time_diff']].reset_index(drop=True)
#     oof_x, oof_y, oof_f = evaluate(model, loaders, phase="valid")
#     oof_df["oof_x"] = oof_x
#     oof_df["oof_y"] = oof_y
#     oof_df["oof_floor"] = oof_f
#     oofs.append(oof_df)
    
#     val_score = mean_position_error(
#         oof_df["oof_x"].values, oof_df["oof_y"].values, 0,
#         oof_df['wifi_x'].values, oof_df['wifi_y'].values, 0)
#     val_scores.append(val_score)
#     print(f"fold {fold}: mean position error {val_score}")

    #############
    # inference
    #############n

#     preds_x, preds_y, preds_f = evaluate(model, loaders, phase="test")
#     test_preds = pd.DataFrame(np.stack((preds_f, preds_x, preds_y))).T
#     test_preds.columns = sub_df.columns
#     test_preds["site_path_timestamp"] = test["site_path_timestamp"]
#     test_preds["floor"] = test_preds["floor"].astype(int)
#     test_preds.to_csv(f'{EXP_NAME}_fold{fold}.csv', index=False)
#     predictions.append(test_preds)
#     wandb.finish()

Fold 0


[34m[1mwandb[0m: Currently logged in as: [33mkuto5046[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.26 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


GPU available: True, used: True
TPU available: None, using: 0 TPU cores

  | Name         | Type      | Params
-------------------------------------------
0 | model        | LSTMModel | 1.5 M 
1 | xy_criterion | MSELoss   | 0     
2 | f_criterion  | MSELoss   | 0     
-------------------------------------------
1.5 M     Trainable params
0         Non-trainable params
1.5 M     Total params
6.116     Total estimated model params size (MB)


## validation

In [23]:
if len(USE_FOLDS) > 1:
    oofs_df = pd.concat(oofs)
else:
    oofs_df = oofs[0]

oofs_df['site_path_timestamp'] = oofs_df['site_id_str'].astype(str) + '_' + oofs_df['path'] + '_' + oofs_df['timestamp'].astype(str)
oofs_df = oofs_df.sort_values('site_path_timestamp').reset_index(drop=True)
oofs_df.to_csv("oof.csv", index=False)
oofs_df

IndexError: list index out of range

In [None]:
# waypoint補正前のx,yでの評価
oof_score = mean_position_error(
    oofs_df['oof_x'], oofs_df['oof_y'], 0, 
    oofs_df['x'], oofs_df['y'], 0
    )
print(f"CV:{oof_score}")

In [None]:
# waypoint補正後のx,yでの評価
oof_score = mean_position_error(
    oofs_df['oof_x'], oofs_df['oof_y'], 0, 
    oofs_df['wifi_x'], oofs_df['wifi_y'], 0
    )
print(f"CV:{oof_score}")

## testのwaypointを予測結果(wifi)から再度線形補完

In [None]:
if len(USE_FOLDS) > 1:
    # foldの結果を平均した後、reindexでsubmission fileにindexを合わせる
    sub = pd.concat(predictions).groupby('site_path_timestamp').mean().reindex(sub_df.index)
else:
    sub = predictions[0].reindex(sub_df.index)
sub

In [None]:
# floorの数値を置換
simple_accurate_99 = pd.read_csv(root_dir / 'simple-99-accurate-floor-model/submission.csv')
sub['floor'] = simple_accurate_99['floor'].values
sub

In [None]:
import sys 
sys.path.append("../../")
from multiprocessing import Pool
from src.io_f import read_data_file
from scipy import interpolate

# timestampとwaypointを元に線形補完しtarget_timestampに対応するtarget_waypointを求める　
def wifi_waypoint_by_linear_interpolation(
    observed_timestamp: np.ndarray, 
    observed_x:np.ndarray, 
    observed_y:np.ndarray, 
    target_timestamp:np.ndarray, 
    delta_time=500
    ):
    """
    observed: すでに知っている情報
    observed-timestamp,x,yは全て同じ要素数となる

    observedの情報からfitting関数を作成
    """
    target_waypoint_list = []
    num_interpolation = len(observed_timestamp) - 1  # 補完回数 
    # 各waypoint間で線形補完
    for i in range(num_interpolation):
        # 潜在的なtimestampを作成(これのどれかにwifiを当てはめるような形)
        n_split = int((max(observed_timestamp[i:i+2]) - min(observed_timestamp[i:i+2])) / delta_time) + 2  # delta_time刻みとなるように分割数を指定(+2は始点と終点分)
        latent_timestamp = np.linspace(min(observed_timestamp[i:i+2]), max(observed_timestamp[i:i+2]), n_split).astype(int) 
        
        # xが昇順の場合はlatentも昇順になるようにする
        if observed_x[i] < observed_x[i+1]:
            latent_x = np.linspace(min(observed_x[i:i+2]), max(observed_x[i:i+2]), n_split)
        # xが降順の場合はlatentも降順になるようにする
        else:
            latent_x = np.linspace(min(observed_x[i:i+2]), max(observed_x[i:i+2]), n_split)[::-1]
        
        # 線形補完関数の適用
        fitting_func = interpolate.interp1d(observed_x[i:i+2], observed_y[i:i+2])

        # wifiのtimestampに最も近いものをsplit_timestampから取得しそれに対応するwaypointをwifiのwaypointとして取得
        target_x = []
        target_y = []

        # 区間内のwifiデータのみ考える
        if i == num_interpolation-1:
            # pathの最後の区間にobserved timestampより未来にあるtimestampに対処
            target_idx = min(observed_timestamp[i:i+2]) <= target_timestamp
        elif i == 0:
            # pathの最初の区間にobserved timestampより過去にあるtimestampに対処
            target_idx = target_timestamp < max(observed_timestamp[i:i+2])
        else:
            target_idx = (min(observed_timestamp[i:i+2]) <= target_timestamp) & (target_timestamp < max(observed_timestamp[i:i+2]))
        target_use_timestamp = target_timestamp[target_idx]

        # timestampが最も近いものをtarget waypointとして取得
        for t in target_use_timestamp:
            idx = np.abs(latent_timestamp - t).argmin()  # targetとtimestampが最も近いものをlatentから取得
            target_x.append(latent_x[idx])
            # 原因はよくわからないがfittingするとnanが発生する場合がある
            # その場合は元の値をそのまま使う
            if np.isnan(fitting_func(latent_x[idx])).sum() > 0:
                # print('yに欠損値あり')
                idx = np.abs(observed_timestamp - t).argmin()  # 0→wifiの最初のtimestampで問題が起きている
                target_y.append(observed_y[idx])
                
            else:
                target_y.append(fitting_func(latent_x[idx]))

        assert len(target_x) == len(target_y)
        target_waypoint = np.stack([target_x, target_y], axis=1)
        target_waypoint_list.append(target_waypoint)

    target_waypoint = np.concatenate(target_waypoint_list)
    return target_waypoint

In [None]:
test_df = test_df.merge(sub, on="site_path_timestamp")
test_df["path"] = test_df["site_path_timestamp"].str.split("_", expand=True)[1]
test_df["timestamp"] = test_df["site_path_timestamp"].str.split("_", expand=True)[2]
test_df['wifi_timestamp'] = test_df['timestamp'].astype(int) - test_df['time_diff']
test_df

In [None]:
# # 線形補完
# path = test_df['path'].unique()[23]  #15
# df = test_df[test_df['path']==path]

# waypoint = wifi_waypoint_by_linear_interpolation(
#     observed_timestamp=df['wifi_timestamp'].astype(int).values, 
#     observed_x=df['x'].values,
#     observed_y=df['y'].values, 
#     target_timestamp=df['timestamp'].astype(int).values
#     )
# print(len(df), len(waypoint))

# import matplotlib.pyplot as plt
# # print(df["timestamp"].astype(int) - df["timestamp"].astype(int).values[0])
# plt.plot(df["x"].values, df["y"].values, ls='--', marker="o", label='wifi')
# plt.plot(waypoint[:,0], waypoint[:,1], ls='--', marker="o", label='waypoint')
# plt.legend()

In [None]:
# 線形補完でwaypointを修正
waypoint_dfs = []
for path,df in test_df.groupby('path'):  #15
    tmp_waypoint = wifi_waypoint_by_linear_interpolation(
        observed_timestamp=df['wifi_timestamp'].astype(int).values, 
        observed_x=df['x'].values,
        observed_y=df['y'].values, 
        target_timestamp=df['timestamp'].astype(int).values
        )
    df['_x'] = tmp_waypoint[:,0]
    df['_y'] = tmp_waypoint[:,1]
    waypoint_dfs.append(df)

waypoint_df = pd.concat(waypoint_dfs).reset_index(drop=True)
waypoint_df

In [None]:
import matplotlib.pyplot as plt
(waypoint_df['x'] - waypoint_df['_x']).hist(label='x')
(waypoint_df['y'] - waypoint_df['_y']).hist(label='y')
plt.legend()

In [None]:
sub = sub.merge(waypoint_df[['site_path_timestamp', '_x', '_y']], on='site_path_timestamp').set_index('site_path_timestamp')
sub = sub.drop(['x','y'], axis=1).rename(columns={'_x':'x', '_y':'y'})
sub

In [None]:
sub.to_csv(EXP_NAME + '_sub.csv')

## 後処理

## cost minimaization

In [None]:
import multiprocessing
import scipy.interpolate
import scipy.sparse
from tqdm import tqdm
import sys
sys.path.append('../../')
from src.io_f import read_data_file
from src import compute_f

In [None]:
def compute_rel_positions(acce_datas, ahrs_datas):
    step_timestamps, step_indexs, step_acce_max_mins = compute_f.compute_steps(acce_datas)
    headings = compute_f.compute_headings(ahrs_datas)
    stride_lengths = compute_f.compute_stride_length(step_acce_max_mins)
    step_headings = compute_f.compute_step_heading(step_timestamps, headings)
    rel_positions = compute_f.compute_rel_positions(stride_lengths, step_headings)
    return rel_positions
    
def correct_path(args):
    path, path_df = args
    
    T_ref  = path_df['timestamp'].values
    xy_hat = path_df[['x', 'y']].values
    
    example = read_data_file(f'{root_dir}/indoor-location-navigation/test/{path}.txt')
    rel_positions = compute_rel_positions(example.acce, example.ahrs)
    if T_ref[-1] > rel_positions[-1, 0]:
        rel_positions = [np.array([[0, 0, 0]]), rel_positions, np.array([[T_ref[-1], 0, 0]])]
    else:
        rel_positions = [np.array([[0, 0, 0]]), rel_positions]
    rel_positions = np.concatenate(rel_positions)
    
    T_rel = rel_positions[:, 0]
    delta_xy_hat = np.diff(scipy.interpolate.interp1d(T_rel, np.cumsum(rel_positions[:, 1:3], axis=0), axis=0)(T_ref), axis=0)

    N = xy_hat.shape[0]
    delta_t = np.diff(T_ref)
    alpha = (8.1)**(-2) * np.ones(N)
    beta  = (0.3 + 0.3 * 1e-3 * delta_t)**(-2)
    A = scipy.sparse.spdiags(alpha, [0], N, N)
    B = scipy.sparse.spdiags( beta, [0], N-1, N-1)
    D = scipy.sparse.spdiags(np.stack([-np.ones(N), np.ones(N)]), [0, 1], N-1, N)

    Q = A + (D.T @ B @ D)
    c = (A @ xy_hat) + (D.T @ (B @ delta_xy_hat))
    xy_star = scipy.sparse.linalg.spsolve(Q, c)

    return pd.DataFrame({
        'site_path_timestamp' : path_df['site_path_timestamp'],
        'floor' : path_df['floor'],
        'x' : xy_star[:, 0],
        'y' : xy_star[:, 1],
    })

def correct_path_train(args):
    #print(args)
    (site_id, path, floor), path_df = args
    
    T_ref  = path_df['timestamp'].values
    xy_hat = path_df[['x', 'y']].values
    
    example = read_data_file(f'{root_dir}/indoor-location-navigation/train/{site_id}/{floor}/{path}.txt')
    rel_positions = compute_rel_positions(example.acce, example.ahrs)
    if T_ref[-1] > rel_positions[-1, 0]:
        rel_positions = [np.array([[0, 0, 0]]), rel_positions, np.array([[T_ref[-1], 0, 0]])]
    else:
        rel_positions = [np.array([[0, 0, 0]]), rel_positions]
    rel_positions = np.concatenate(rel_positions)
    
    T_rel = rel_positions[:, 0]

    try:
        delta_xy_hat = np.diff(scipy.interpolate.interp1d(T_rel, np.cumsum(rel_positions[:, 1:3], axis=0), axis=0)(T_ref), axis=0)
    except:
        return pd.DataFrame({
            'site_path_timestamp' : path_df['site_path_timestamp'],
            'floor' : path_df['floor'],
            'x' : path_df['x'].to_numpy(),
            'y' : path_df['y'].to_numpy()
        })
    

    N = xy_hat.shape[0]
    delta_t = np.diff(T_ref)
    alpha = (8.1)**(-2) * np.ones(N)
    beta  = (0.3 + 0.3 * 1e-3 * delta_t)**(-2)
    A = scipy.sparse.spdiags(alpha, [0], N, N)
    B = scipy.sparse.spdiags( beta, [0], N-1, N-1)
    D = scipy.sparse.spdiags(np.stack([-np.ones(N), np.ones(N)]), [0, 1], N-1, N)

    Q = A + (D.T @ B @ D)
    c = (A @ xy_hat) + (D.T @ (B @ delta_xy_hat))
    xy_star = scipy.sparse.linalg.spsolve(Q, c)


    return pd.DataFrame({
        'site_path_timestamp' : path_df['site_path_timestamp'],
        'floor' : path_df['floor'],
        'x' : xy_star[:, 0],
        'y' : xy_star[:, 1],
    })

In [None]:
%%time

oofs_df = oofs_df.rename(columns={'x':'target_x', 'y':'target_y', 'oof_x':'x', 'oof_y':'y'})
processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    dfs = pool.imap_unordered(correct_path_train, oofs_df.groupby(['site_id_str', 'path', 'floor_str']))
    dfs = tqdm(dfs)
    dfs = list(dfs)

oof_post_process = pd.concat(dfs).sort_index()
oofs_df['oof_min_x'] = oof_post_process['x']
oofs_df['oof_min_y'] = oof_post_process['y']

# 元に戻す
oofs_df = oofs_df.rename(columns={'x':'oof_x', 'y':'oof_y'})
oofs_df

In [None]:
# waypoint補正前のx,yでの評価
oof_score_post_process = mean_position_error(
    oofs_df['oof_min_x'], oofs_df['oof_min_y'], 0, 
    oofs_df['target_x'], oofs_df['target_y'], 0
    )
print(f"(after cost-min) CV:{oof_score_post_process}")

In [None]:
# waypoint補正後のx,yでの評価
oof_score_post_process = mean_position_error(
    oofs_df['oof_min_x'], oofs_df['oof_min_y'], 0, 
    oofs_df['wifi_x'], oofs_df['wifi_y'], 0
    )
print(f"(after cost-min) CV:{oof_score_post_process}")

In [None]:
# x_pred = oofs_df["oof_min_x"]
# y_pred = oofs_df["oof_min_y"]
# f_pred = oofs_df["floor"]  # 正解を与える
# x_true = oofs_df["wifi_x"]
# y_true = oofs_df["wifi_y"]
# f_true = oofs_df["floor"]
# site_arr = oofs_df["site_id_str"]
# df_result_site2 = calc_metrics_site(x_pred, y_pred, f_pred, x_true, y_true, f_true, site_arr)
# df_result_site2.style.bar(subset=['n_sample', 'score'], color=['teal'])

In [None]:
# site_eval_report(df_result_site2)

In [None]:
sub = sub.reset_index()
sub_org = sub.copy()
tmp = sub['site_path_timestamp'].apply(lambda s : pd.Series(s.split('_')))
sub['site'] = tmp[0]
sub['path'] = tmp[1]
sub['timestamp'] = tmp[2].astype(float)
sub

In [None]:
processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    dfs = pool.imap_unordered(correct_path, sub.groupby(['path']))
    dfs = tqdm(dfs)
    dfs = list(dfs)
new_sub = pd.concat(dfs).sort_values('site_path_timestamp')
new_sub

In [None]:
new_sub.to_csv(RUN_NAME + '_cost_sub.csv', index=False)

## snap to grid

In [None]:
from scipy.spatial.distance import cdist
def sub_process(sub, train_waypoints):
    train_waypoints['isTrainWaypoint'] = True
    sub = split_col(sub[['site_path_timestamp','floor','x','y']]).copy()
    sub = sub.merge(train_waypoints[['site','floorNo','floor']].drop_duplicates(), how='left')
    sub = sub.merge(
        train_waypoints[['x','y','site','floor','isTrainWaypoint']].drop_duplicates(),
        how='left',
        on=['site','x','y','floor']
             )
    sub['isTrainWaypoint'] = sub['isTrainWaypoint'].fillna(False)
    return sub.copy()

def split_col(df):
    df = pd.concat([
        df['site_path_timestamp'].str.split('_', expand=True) \
        .rename(columns={0:'site',
                         1:'path',
                         2:'timestamp'}),
        df
    ], axis=1).copy()
    return df

floor_map = {"B2":-2, "B1":-1, "F1":0, "F2": 1, "F3":2,
             "F4":3, "F5":4, "F6":5, "F7":6,"F8":7,"F9":8,
             "1F":0, "2F":1, "3F":2, "4F":3, "5F":4, "6F":5,
             "7F":6, "8F": 7, "9F":8}

def add_xy(df):
    df['xy'] = [(x, y) for x,y in zip(df['x'], df['y'])]
    return df

def closest_point(point, points):
    """ Find closest point from a list of points. """
    return points[cdist([point], points).argmin()]

def snap_to_grid(sub, threshold):
    """
    Snap to grid if within a threshold.
    
    x, y are the predicted points.
    x_, y_ are the closest grid points.
    _x_, _y_ are the new predictions after post processing.
    """
    sub['_x_'] = sub['x']
    sub['_y_'] = sub['y']
    sub.loc[sub['dist'] < threshold, '_x_'] = sub.loc[sub['dist'] < threshold]['x_']
    sub.loc[sub['dist'] < threshold, '_y_'] = sub.loc[sub['dist'] < threshold]['y_']
    return sub.copy()


In [None]:
oofs_df

In [None]:
train_waypoints = pd.read_csv('../../input/indoor-location-train-waypoints/train_waypoints.csv')
snap_df = oofs_df[['site_path_timestamp','floor','oof_min_x','oof_min_y']].copy()
snap_df = snap_df.rename(columns={'oof_min_x':'x', 'oof_min_y':'y'})
# snap_df = oofs_df[['site_path_timestamp','floor','oof_x','oof_y']].copy()
# snap_df = snap_df.rename(columns={'oof_x':'x', 'oof_y':'y'})
snap_df = sub_process(snap_df, train_waypoints)
snap_df = add_xy(snap_df)
train_waypoints = add_xy(train_waypoints)

ds = []
for (site, myfloor), d in tqdm(snap_df.groupby(['site','floor'])):
    true_floor_locs = train_waypoints.loc[(train_waypoints['floor'] == myfloor) &
                                          (train_waypoints['site'] == site)] \
        .reset_index(drop=True)
    if len(true_floor_locs) == 0:
        print(f'Skipping {site} {myfloor}')
        continue
    d['matched_point'] = [closest_point(x, list(true_floor_locs['xy'])) for x in d['xy']]
    d['oof_min_snap_x'] = d['matched_point'].apply(lambda x: x[0])
    d['oof_min_snap_y'] = d['matched_point'].apply(lambda x: x[1])
    ds.append(d)

# 上書き
snap_df = pd.concat(ds).sort_index()
snap_df

In [None]:
oofs_df['oof_min_snap_x'] = snap_df['oof_min_snap_x']
oofs_df['oof_min_snap_y'] = snap_df['oof_min_snap_y']
oofs_df

In [None]:
# waypoint補正前のx,yでの評価
oof_score_post_process = mean_position_error(
    oofs_df['oof_min_snap_x'], oofs_df['oof_min_snap_y'], 0, 
    oofs_df['target_x'], oofs_df['target_y'], 0
    )
print(f"(after cost-min + snap) CV:{oof_score_post_process}")

In [None]:
# waypoint補正前のx,yでの評価
oof_score_post_process = mean_position_error(
    oofs_df['oof_min_snap_x'], oofs_df['oof_min_snap_y'], 0, 
    oofs_df['wifi_x'], oofs_df['wifi_y'], 0
    )
print(f"(after cost-min + snap) CV:{oof_score_post_process}")

In [None]:
# import matplotlib.pyplot as plt
# path = oofs_df["path"].unique()[9] # "5d10a1669c50c70008fe8977"
# tmp = oofs_df[oofs_df["path"]==path].copy()
# plt.plot(tmp["wifi_x"], tmp["wifi_y"], marker="o", label="target")
# plt.plot(tmp["oof_x"], tmp["oof_y"], marker="o", label="pred")
# window=int(len(tmp)/5)
# plt.plot(tmp["oof_x"].rolling(window,min_periods=1).mean(), tmp["oof_y"].rolling(window,min_periods=1).mean(), marker="o", label="smoothing")
# plt.legend()

for path, df in oofs_df.groupby("path"):
    if len(df) >= 5:
        window = int(len(df)/5)
        oofs_df.loc[oofs_df["path"]==path, "oof_smoothing_x"] = df["oof_min_snap_x"].rolling(window, min_periods=1).mean()
        oofs_df.loc[oofs_df["path"]==path, "oof_smoothing_y"] = df["oof_min_snap_y"].rolling(window, min_periods=1).mean()
    else:
        oofs_df.loc[oofs_df["path"]==path, "oof_smoothing_x"] = df["oof_min_snap_x"]
        oofs_df.loc[oofs_df["path"]==path, "oof_smoothing_y"] = df["oof_min_snap_y"]

In [None]:
# waypoint補正後のx,yでの評価
oof_score_post_process = mean_position_error(
    oofs_df['oof_smoothing_x'], oofs_df['oof_smoothing_y'], 0, 
    oofs_df['wifi_x'], oofs_df['wifi_y'], 0
    )
print(f"(after cost-min) CV:{oof_score_post_process}")

In [None]:
# x_pred = oofs_df["oof_min_snap_x"]
# y_pred = oofs_df["oof_min_snap_y"]
# f_pred = oofs_df["floor"]  # 正解を与える
# x_true = oofs_df["wifi_x"]
# y_true = oofs_df["wifi_y"]
# f_true = oofs_df["floor"]
# site_arr = oofs_df["site_id_str"]
# df_result_site3 = calc_metrics_site(x_pred, y_pred, f_pred, x_true, y_true, f_true, site_arr)
# df_result_site3.style.bar(subset=['n_sample', 'score'], color=['teal'])

In [None]:
# site_eval_report(df_result_site3)

In [None]:
train_waypoints = pd.read_csv('../../input/indoor-location-train-waypoints/train_waypoints.csv')

new_sub = sub_process(new_sub, train_waypoints)
new_sub = add_xy(new_sub)
train_waypoints = add_xy(train_waypoints)

ds = []
for (site, myfloor), d in new_sub.groupby(['site','floor']):
    true_floor_locs = train_waypoints.loc[(train_waypoints['floor'] == myfloor) &
                                          (train_waypoints['site'] == site)] \
        .reset_index(drop=True)
    if len(true_floor_locs) == 0:
        print(f'Skipping {site} {myfloor}')
        continue
    d['matched_point'] = [closest_point(x, list(true_floor_locs['xy'])) for x in d['xy']]
    d['x_'] = d['matched_point'].apply(lambda x: x[0])
    d['y_'] = d['matched_point'].apply(lambda x: x[1])
    ds.append(d)

new_sub2 = pd.concat(ds)

In [None]:
new_sub2.rename(columns=)

In [None]:
new_sub2 = new_sub2[['site_path_timestamp','floor','x_','y_']].sort_index()
new_sub2 = new_sub2.rename(columns={'x_':'x', 'y_':'y'})
new_sub2

In [None]:
new_sub2.to_csv(RUN_NAME + '_cost_snap_sub.csv', index=False)

In [None]:
# import matplotlib.pyplot as plt
# df_result_site1['score'].plot(label='oof')
# df_result_site2['score'].plot(label='cost')
# df_result_site3['score'].plot(label='cost+snap')
# plt.legend()
# plt.grid()
# plt.savefig('site_result.png')