In [1]:
import os
import sys
import glob
import yaml
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as stats
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, LabelEncoder

import json
from PIL import Image
import wandb
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.utilities.seed import seed_everything

In [2]:
# config
with open('config.yaml') as f:
    config = yaml.safe_load(f)

# globals variable
SEED = config['globals']['seed']
MAX_EPOCHS = config['globals']['max_epochs']
N_SPLITS = config['globals']['n_splits']
USE_FOLDS = config['globals']['use_folds']
DEBUG = config['globals']['debug']
EXP_MESSAGE = config['globals']['exp_message']
NOTES = config['globals']['notes']
MODEL_SAVE = config['globals']['model_save']
ONLY_PRED = config['globals']['only_pred']
PRETRAINED = config['globals']['pretrained']
PRETRAINED_PATH = config['globals']['pretrained_path']
EXP_NAME = str(Path().resolve()).split('/')[-1]

# seed
seed_everything(SEED)

Global seed set to 1996


1996

In [3]:
EXP_NAME

'exp011'

In [4]:
!wandb login 1bb2d0449c11d8b987e25c38b9d8dda176310fb6

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/user/.netrc


In [5]:
# waypointを補正したdataset
root_dir = Path('../../input/')
data_dir = root_dir/'google-smartphone-decimeter-challenge'
train_base_df = pd.read_csv(data_dir / "baseline_locations_train.csv")
test_base_df = pd.read_csv(data_dir / "baseline_locations_test.csv")
sub_df = pd.read_csv(data_dir / 'sample_submission.csv', index_col=0)


In [6]:
import pickle

def to_pickle(filename, obj):
    with open(filename, mode='wb') as f:
        pickle.dump(obj, f)


def from_pickle(filename):
    with open(filename, mode='rb') as f:
        obj = pickle.load(f)
    return obj

## create dataset

In [7]:
from tqdm.notebook import tqdm
# get derived data
def create_dervied_dataset(phase='train'):
    derived_files = (data_dir / phase).rglob('*_derived.csv')
    cols = ['millisSinceGpsEpoch', 'svid', 'correctedPrM']

    df_list = []
    for t in tqdm(derived_files):
        derived = pd.read_csv(t).drop_duplicates(['millisSinceGpsEpoch', 'svid'])
        derived['correctedPrM'] = (derived['rawPrM'] + derived['satClkBiasM'] - derived['isrbM'] - 
                                derived['ionoDelayM'] - derived['tropoDelayM'])
        df_list.append(derived[['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'svid', 'correctedPrM']])
        
    derived_df = pd.concat(df_list, ignore_index=True)
    derived_df['phone'] = derived_df['collectionName'] + '_' + derived_df['phoneName']
    derived_df.drop(['collectionName', 'phoneName'], axis=1, inplace=True)

    derived_pivot_df = pd.pivot_table(derived_df, 
                                    values='correctedPrM', 
                                    index=['phone', 'millisSinceGpsEpoch'],
                                    columns=['svid'],
                                    aggfunc=np.mean)
    derived_pivot_df.columns = [f'svid_{x}' for x in derived_pivot_df.columns]
    derived_pivot_df.reset_index(inplace=True)
    derived_pivot_df['sSinceGpsEpoch'] = derived_pivot_df['millisSinceGpsEpoch'] // 1000
    return derived_pivot_df

In [8]:
# ground truth data

def merge_dataset(base_df, derived_df, data_dir, phase):

    if phase == 'train':
        # from https://www.kaggle.com/jpmiller/baseline-from-host-data
        label_files = (data_dir / phase).rglob('ground_truth.csv')

        df_list = []
        for t in tqdm(label_files, total=73):
            label = pd.read_csv(t, usecols=['collectionName','phoneName','millisSinceGpsEpoch','latDeg','lngDeg'])
            df_list.append(label)

        label_df = pd.concat(df_list, ignore_index=True)
        label_df['phone'] = label_df['collectionName'] + '_' + label_df['phoneName']

        df = label_df.merge(base_df, how='inner', on=['phone', 'millisSinceGpsEpoch'], 
                            suffixes=('_gt', '')).drop(['collectionName', 'phoneName'], axis=1).rename(columns={'collectionName_gt':'collectionName', 'phoneName_gt':'phoneName'})
        df['latDeg_dt'] = df['latDeg_gt'] - df["latDeg"]
        df['lngDeg_dt'] = df['lngDeg_gt'] - df["lngDeg"]
    else:
        df = base_df.copy()
    
    df['sSinceGpsEpoch'] = df['millisSinceGpsEpoch'] // 1000
    df = df.merge(derived_df, how='left', on=['phone', 'sSinceGpsEpoch'], suffixes=['', '_'])
    df.drop(['sSinceGpsEpoch', 'millisSinceGpsEpoch_'], axis=1, inplace=True)
    return df

In [9]:
train_derived_df = create_dervied_dataset(phase='train')
test_derived_df = create_dervied_dataset(phase='test')

train_df = merge_dataset(train_base_df, train_derived_df, data_dir, phase='train')
test_df = merge_dataset(test_base_df, test_derived_df, data_dir, phase='test')

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/73 [00:00<?, ?it/s]

In [10]:
train_df.shape, test_df.shape

((131347, 48), (91496, 44))

In [11]:
set(train_df.columns)-set(test_df.columns)

{'latDeg_dt', 'latDeg_gt', 'lngDeg_dt', 'lngDeg_gt'}

In [12]:
train_df

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg_gt,lngDeg_gt,phone,latDeg,lngDeg,heightAboveWgs84EllipsoidM,latDeg_dt,...,svid_28,svid_29,svid_30,svid_31,svid_32,svid_33,svid_34,svid_35,svid_36,svid_37
0,2020-06-04-US-MTV-1,Pixel4XLModded,1275339495434,37.416314,-122.080466,2020-06-04-US-MTV-1_Pixel4XLModded,37.416320,-122.080459,-32.33,-0.000007,...,,,,,,,,,,
1,2020-06-04-US-MTV-1,Pixel4XLModded,1275339496434,37.416314,-122.080466,2020-06-04-US-MTV-1_Pixel4XLModded,37.416342,-122.080471,-31.77,-0.000028,...,,2.293725e+07,2.340839e+07,,,,,,2.250868e+07,
2,2020-06-04-US-MTV-1,Pixel4XLModded,1275339497434,37.416314,-122.080466,2020-06-04-US-MTV-1_Pixel4XLModded,37.416353,-122.080493,-33.34,-0.000040,...,,2.293681e+07,2.188019e+07,,,,,,2.250873e+07,
3,2020-06-04-US-MTV-1,Pixel4XLModded,1275339498434,37.416314,-122.080466,2020-06-04-US-MTV-1_Pixel4XLModded,37.416343,-122.080474,-33.47,-0.000029,...,,2.293638e+07,2.188017e+07,,,,,,2.250879e+07,
4,2020-06-04-US-MTV-1,Pixel4XLModded,1275339499434,37.416314,-122.080466,2020-06-04-US-MTV-1_Pixel4XLModded,37.416340,-122.080474,-34.06,-0.000027,...,,2.293596e+07,2.188014e+07,,,,,,2.250884e+07,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131342,2021-01-04-US-RWC-2,Pixel5,1293837130433,37.444463,-122.232459,2021-01-04-US-RWC-2_Pixel5,37.444467,-122.232437,8.80,-0.000004,...,2.525797e+07,,2.078545e+07,,,,,,2.363825e+07,2.455588e+07
131343,2021-01-04-US-RWC-2,Pixel5,1293837131433,37.444463,-122.232459,2021-01-04-US-RWC-2_Pixel5,37.444474,-122.232426,11.04,-0.000011,...,2.525806e+07,,2.078525e+07,,,,,,2.363792e+07,2.455629e+07
131344,2021-01-04-US-RWC-2,Pixel5,1293837132433,37.444463,-122.232459,2021-01-04-US-RWC-2_Pixel5,37.444469,-122.232438,8.14,-0.000007,...,2.525815e+07,,2.078505e+07,,,,,,2.363758e+07,2.455671e+07
131345,2021-01-04-US-RWC-2,Pixel5,1293837133433,37.444463,-122.232459,2021-01-04-US-RWC-2_Pixel5,37.444467,-122.232418,10.60,-0.000004,...,2.525822e+07,,2.078485e+07,,,,,,2.363724e+07,2.455712e+07


In [13]:
test_df

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg,lngDeg,heightAboveWgs84EllipsoidM,phone,svid_1,svid_2,svid_3,...,svid_28,svid_29,svid_30,svid_31,svid_32,svid_33,svid_34,svid_35,svid_36,svid_37
0,2020-05-15-US-MTV-1,Pixel4,1273608785432,37.416628,-122.082053,-30.69,2020-05-15-US-MTV-1_Pixel4,,,,...,,,,,,,,,,
1,2020-05-15-US-MTV-1,Pixel4,1273608786432,37.416646,-122.082040,-31.76,2020-05-15-US-MTV-1_Pixel4,,2.162738e+07,,...,2.387446e+07,,2.549334e+07,,,,,,2.350364e+07,
2,2020-05-15-US-MTV-1,Pixel4,1273608787432,37.416652,-122.082039,-31.65,2020-05-15-US-MTV-1_Pixel4,,2.162693e+07,,...,2.387486e+07,,2.549292e+07,,,,,,2.350371e+07,
3,2020-05-15-US-MTV-1,Pixel4,1273608788432,37.416607,-122.082063,-31.52,2020-05-15-US-MTV-1_Pixel4,,2.162650e+07,,...,2.387527e+07,,2.549251e+07,,,,,,2.350377e+07,
4,2020-05-15-US-MTV-1,Pixel4,1273608789432,37.416609,-122.082073,-28.95,2020-05-15-US-MTV-1_Pixel4,,2.162604e+07,,...,2.387567e+07,,2.549210e+07,,,,,,2.350384e+07,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91491,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763185000,37.334539,-121.899383,-8.39,2021-04-29-US-SJC-3_SamsungS20Ultra,,2.361403e+07,,...,2.188545e+07,,2.597680e+07,,,,,,2.371918e+07,2.228825e+07
91492,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763186000,37.334545,-121.899380,-7.36,2021-04-29-US-SJC-3_SamsungS20Ultra,,2.361408e+07,,...,2.188547e+07,,2.597651e+07,,,,,,2.371931e+07,2.228815e+07
91493,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763187000,37.334551,-121.899371,-4.08,2021-04-29-US-SJC-3_SamsungS20Ultra,,2.361412e+07,,...,2.188549e+07,,2.597621e+07,,,,,,2.371944e+07,2.228805e+07
91494,2021-04-29-US-SJC-3,SamsungS20Ultra,1303763188000,37.334540,-121.899371,-5.70,2021-04-29-US-SJC-3_SamsungS20Ultra,,2.361417e+07,,...,2.188551e+07,,2.597592e+07,,,,,,2.371957e+07,2.228794e+07


In [14]:
# metric
def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    
    # angle = np.degrees(np.arctan2(dlat, dlon))  # radian -> degree
    return dist

In [15]:
# # 水平距離の50パーセンタイルと95パーセンタイルの平均
# def gsdc_metric(dist):
    
#     score_per50 = np.percentile(dist, 50)
#     score_per95 = np.percentile(dist, 95)
#     score = np.mean(score_per50, score_per95)
#     return score

In [16]:
dist = calc_haversine(train_df['latDeg'], train_df['lngDeg'], train_df['latDeg_gt'],train_df['lngDeg_gt'])
print(f"50per:{np.percentile(dist, 50)} \n95per:{np.percentile(dist, 95)} \nfinal score:{np.percentile(dist, [50,90]).mean()}")

50per:2.0657727415512954 
95per:10.464714214322617 
final score:4.446476287034587


In [17]:
train_df

Unnamed: 0,collectionName,phoneName,millisSinceGpsEpoch,latDeg_gt,lngDeg_gt,phone,latDeg,lngDeg,heightAboveWgs84EllipsoidM,latDeg_dt,...,svid_28,svid_29,svid_30,svid_31,svid_32,svid_33,svid_34,svid_35,svid_36,svid_37
0,2020-06-04-US-MTV-1,Pixel4XLModded,1275339495434,37.416314,-122.080466,2020-06-04-US-MTV-1_Pixel4XLModded,37.416320,-122.080459,-32.33,-0.000007,...,,,,,,,,,,
1,2020-06-04-US-MTV-1,Pixel4XLModded,1275339496434,37.416314,-122.080466,2020-06-04-US-MTV-1_Pixel4XLModded,37.416342,-122.080471,-31.77,-0.000028,...,,2.293725e+07,2.340839e+07,,,,,,2.250868e+07,
2,2020-06-04-US-MTV-1,Pixel4XLModded,1275339497434,37.416314,-122.080466,2020-06-04-US-MTV-1_Pixel4XLModded,37.416353,-122.080493,-33.34,-0.000040,...,,2.293681e+07,2.188019e+07,,,,,,2.250873e+07,
3,2020-06-04-US-MTV-1,Pixel4XLModded,1275339498434,37.416314,-122.080466,2020-06-04-US-MTV-1_Pixel4XLModded,37.416343,-122.080474,-33.47,-0.000029,...,,2.293638e+07,2.188017e+07,,,,,,2.250879e+07,
4,2020-06-04-US-MTV-1,Pixel4XLModded,1275339499434,37.416314,-122.080466,2020-06-04-US-MTV-1_Pixel4XLModded,37.416340,-122.080474,-34.06,-0.000027,...,,2.293596e+07,2.188014e+07,,,,,,2.250884e+07,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131342,2021-01-04-US-RWC-2,Pixel5,1293837130433,37.444463,-122.232459,2021-01-04-US-RWC-2_Pixel5,37.444467,-122.232437,8.80,-0.000004,...,2.525797e+07,,2.078545e+07,,,,,,2.363825e+07,2.455588e+07
131343,2021-01-04-US-RWC-2,Pixel5,1293837131433,37.444463,-122.232459,2021-01-04-US-RWC-2_Pixel5,37.444474,-122.232426,11.04,-0.000011,...,2.525806e+07,,2.078525e+07,,,,,,2.363792e+07,2.455629e+07
131344,2021-01-04-US-RWC-2,Pixel5,1293837132433,37.444463,-122.232459,2021-01-04-US-RWC-2_Pixel5,37.444469,-122.232438,8.14,-0.000007,...,2.525815e+07,,2.078505e+07,,,,,,2.363758e+07,2.455671e+07
131345,2021-01-04-US-RWC-2,Pixel5,1293837133433,37.444463,-122.232459,2021-01-04-US-RWC-2_Pixel5,37.444467,-122.232418,10.60,-0.000004,...,2.525822e+07,,2.078485e+07,,,,,,2.363724e+07,2.455712e+07


In [18]:
TARGET = ['latDeg_dt', 'lngDeg_dt']
BASE_FEATS = ['latDeg', 'lngDeg']
SVID_FEATS  = [f'svid_{i}' for i in range(1, 38, 1)]

use_feats = BASE_FEATS + SVID_FEATS + ['phoneName','collectionName']  
# XPOS_FEATS  = [f'xSatPosM_{i}' for i in range(NUM_FEATS)]
# YPOS_FEATS  = [f'ySatPosM_{i}' for i in range(NUM_FEATS)]
# ZPOS_FEATS  = [f'zSatPosM_{i}' for i in range(NUM_FEATS)]
# XVEL_FEATS  = [f'xSatVelMps_{i}' for i in range(NUM_FEATS)]
# YVEL_FEATS  = [f'ySatVelMps_{i}' for i in range(NUM_FEATS)]
# ZVEL_FEATS  = [f'zSatVelMps_{i}' for i in range(NUM_FEATS)]
# BIAS_FEATS  = [f'satClkBiasM_{i}' for i in range(NUM_FEATS)]
# DRIFT_FEATS  = [f'satClkDriftMps_{i}' for i in range(NUM_FEATS)]
# RAWPR_FEATS  = [f'rawPrM_{i}' for i in range(NUM_FEATS)]
# RAWPRUNC_FEATS  = [f'rawPrUncM_{i}' for i in range(NUM_FEATS)]
# ISRBM_FEATS  = [f'isrbM_{i}' for i in range(NUM_FEATS)]
# IONODELAY_FEATS  = [f'ionoDelayM_{i}' for i in range(NUM_FEATS)]
# TROPODELAY_FEATS  = [f'tropoDelayM_{i}' for i in range(NUM_FEATS)]


In [19]:
len(use_feats)

41

## preprocessing

In [20]:
whole_df = pd.concat([train_df, test_df]).reset_index(drop=True)
phone_size = whole_df['phoneName'].nunique()

train = train_df[use_feats + TARGET].copy()
test = test_df[use_feats].copy()


# le = LabelEncoder()
# le.fit(whole_df['phoneName'].to_numpy().flatten())
# train['phoneName'] = le.transform(train['phoneName'])
# test['phoneName'] = le.transform(test['phoneName'])

# SS
ss = StandardScaler()
ss.fit(whole_df[SVID_FEATS])
train[SVID_FEATS] = ss.transform(train[SVID_FEATS])
test[SVID_FEATS] = ss.transform(test[SVID_FEATS])

ss_base = StandardScaler()
ss_base.fit(whole_df[BASE_FEATS])
train[BASE_FEATS] = ss_base.transform(train[BASE_FEATS])
test[BASE_FEATS] = ss_base.transform(test[BASE_FEATS])

ss_gt = StandardScaler()
ss_gt.fit(whole_df[TARGET])
train[TARGET] = ss_gt.transform(train[TARGET])

In [21]:
train.head(2).T

Unnamed: 0,0,1
latDeg,-0.299769,-0.299523
lngDeg,0.507042,0.506960
svid_1,,
svid_2,,0.351314
svid_3,,0.272167
svid_4,,
svid_5,,
svid_6,,-0.337677
svid_7,,
svid_8,,


In [22]:
train.shape, test.shape

((131347, 43), (91496, 41))

## model

In [23]:
def get_optimizer(model: nn.Module, config: dict):
    optimizer_config = config["optimizer"]
    optimizer_name = optimizer_config.get("name")
    base_optimizer_name = optimizer_config.get("base_name")
    optimizer_params = optimizer_config['params']

    if hasattr(optim, optimizer_name):
        optimizer = optim.__getattribute__(optimizer_name)(model.parameters(), **optimizer_params)
        return optimizer
    else:
        base_optimizer = optim.__getattribute__(base_optimizer_name)
        optimizer = globals().get(optimizer_name)(
            model.parameters(), 
            base_optimizer,
            **optimizer_config["params"])
        return  optimizer

def get_scheduler(optimizer, config: dict):
    scheduler_config = config["scheduler"]
    scheduler_name = scheduler_config.get("name")

    if scheduler_name is None:
        return
    else:
        return optim.lr_scheduler.__getattribute__(scheduler_name)(
            optimizer, **scheduler_config["params"])


def get_criterion(config: dict):
    loss_config = config["loss"]
    loss_name = loss_config["name"]
    loss_params = {} if loss_config.get("params") is None else loss_config.get("params")
    
    
    if hasattr(nn, loss_name):
        criterion = nn.__getattribute__(loss_name)(**loss_params)
    else:
        criterion = globals().get(loss_name)(**loss_params)

    return criterion

def worker_init_fn(worker_id):                                                          
    np.random.seed(np.random.get_state()[1][0] + worker_id)

In [None]:

# Learner class(pytorch-lighting)
class Learner(pl.LightningModule):
    def __init__(self, model, config, ss_gt, ss_base):
        super().__init__()
        self.model = model
        self.config = config
        self.criterion = get_criterion(config)
        self.ss_gt = ss_gt
        self.ss_base = ss_base
    
    def training_step(self, batch, batch_idx):
        x, target = batch
        output = self.model(x)
        loss = self.criterion(output, target)

        # target - base = delta
        output = self.ss_gt.inverse_transform(to_np(output))
        target = self.ss_gt.inverse_transform(to_np(target))
        base = self.ss_base.inverse_transform(to_np(x["base"]))
        
        dist = calc_haversine((output[:, 0] + base[:, 0]), (output[:, 1] + base[:, 0]), 
                              (target[:, 0] + base[:, 0]), (target[:, 1] + base[:, 0]))
        
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, target = batch
        output = self.model(x)
        loss = self.criterion(output + base, target + base)

        # target - base = delta
        output = self.ss_gt.inverse_transform(to_np(output))
        target = self.ss_gt.inverse_transform(to_np(target))
        base = self.ss_base.inverse_transform(to_np(x["base"]))
        
        dist = calc_haversine((output[:, 0] + base[:, 0]), (output[:, 1] + base[:, 0]), 
                              (target[:, 0] + base[:, 0]), (target[:, 1] + base[:, 0]))
        all_dist_score = np.mean(dist)
        dist_50per_score = np.percentile(dist, 50)
        dist_95per_score = np.percentile(dist, 95)
        comp_score = np.mean([dist_50per_score, dist_95per_score])
        self.log(f'Loss/val', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        self.log(f'dist_50/val', dist_50per_score, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        self.log(f'dist_95/val', dist_95per_score, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        self.log(f'dist_all/val', all_dist_score, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        self.log(f'comp_score/val', comp_score, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        return loss

    def configure_optimizers(self):
        optimizer = get_optimizer(self.model, self.config)
        scheduler = get_scheduler(optimizer, self.config)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "Loss/val"}

In [25]:
def to_np(input):
    return input.detach().cpu().numpy().astype(np.float32)

# oof
def evaluate(model, loaders, ss_gt, phase):
    lat_list = []
    lng_list = []

    with torch.no_grad():
        for batch in loaders[phase]:
            x, y = batch
            output = model(x)
            output = ss_gt.inverse_transform(output)
            lat_list.append(output[:, 0])
            lng_list.append(output[:, 1])
    lat_list = np.concatenate(lat_list)
    lng_list = np.concatenate(lng_list)
    return lat_list, lng_list

In [56]:
class DistMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.loss = nn.MSELoss(reduction='none')
        self.radius = 6_367_000

    def forward(self, input, target):

        input = torch.deg2rad(input)
        target = torch.deg2rad(target)
        lat1 = input[:,0]
        lon1 = input[:,1]
        lat2 = target[:,0]
        lon2 = target[:,1]

        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = torch.sin(dlat/2)**2 + torch.cos(lat1) * torch.cos(lat2) * torch.sin(dlon/2)**2

        loss = 2 * self.radius * torch.arcsin(a**0.5)
        return loss.mean()

In [57]:
# dataset
from torch.utils.data import Dataset, DataLoader
class OutdoorDataset(Dataset):
    def __init__(self, df, phase='train'):
        self.df = df
        self.phase = phase

        # categorical features
        # self.collname_feats = df['collectionName'].to_numpy()
        self.phonename_feats = df['phoneName'].to_numpy()
        self.svid_feats = df[SVID_FEATS].fillna(0).to_numpy()
        self.base_feats = df[BASE_FEATS].to_numpy()
        # self.signaltype_feats = df[SIGNALTYPE_FEATS].to_numpy()
        # self.gpsnanos_feats = df[GPSNANOS_FEATS].to_numpy()  # as categorical
        # self.categorical_feats = np.concatenate([
        #     self.collname_feats,
        #     self.phonename_feats,
        #     self.collname_feats,
        #     self.svid_feats,
        #     self.signaltype_feats,
        # ])
        
        # numerical features
        # self.xpos_feats = df[XPOS_FEATS].to_numpy()
        # self.ypos_feats = df[YPOS_FEATS].to_numpy()
        # self.zpos_feats = df[ZPOS_FEATS].to_numpy()
        # self.xvel_feats = df[XVEL_FEATS].to_numpy()
        # self.yvel_feats = df[YVEL_FEATS].to_numpy()
        # self.zvel_feats = df[ZVEL_FEATS].to_numpy()
        # self.bias_feats = df[BIAS_FEATS].to_numpy()
        # self.drift_feats = df[DRIFT_FEATS].to_numpy()
        # self.rawpr_feats = df[RAWPR_FEATS].to_numpy()
        # self.rawprunc_feats = df[RAWPRUNC_FEATS].to_numpy()
        # self.isrbm_feats = df[ISRBM_FEATS].to_numpy()
        # self.inondelay_feats = df[IONODELAY_FEATS].to_numpy()
        # self.tropodelay_feats = df[TROPODELAY_FEATS].to_numpy()

        # self.numerical_feats = np.concatenate([
        #     self.xpos_feats,
        #     self.ypos_feats,
        #     self.zpos_feats,
        #     self.xvel_feats,
        #     self.yvel_feats,
        #     self.zvel_feats,
        #     self.bias_feats,
        #     self.drift_feats,
        #     self.rawpr_feats,
        #     self.rawprunc_feats,
        #     self.isrbm_feats,
        #     self.inondelay_feats,
        #     self.tropodelay_feats, 
        #     ], axis=1)
        # self.epoch_feats = df['millisSinceGpsEpoch'].to_numpy()
        if phase in ['train', 'valid']:
            self.target = df[TARGET].values.astype(np.float32)
                    
        
    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):

        feature = {
            'base': self.base_feats[idx].astype(np.float32),
            'phonename': self.phonename_feats[idx],
            'svid': self.svid_feats[idx].astype(np.float32),
        }

        if self.phase in ['train', 'valid']:
            target = self.target[idx]
        
        else:
            target = None
        return feature, target

In [58]:
class MLPModel(nn.Module):
    def __init__(self, phone_size):
        super(MLPModel, self).__init__()

        # phone
        self.embedding_layer1 = nn.Sequential(
            nn.Embedding(phone_size, 1),
            nn.Flatten(start_dim=-1)           
        )

        feature_size = 37 + 2
        self.fc1 = nn.Sequential(
            nn.BatchNorm1d(feature_size),
            nn.Linear(feature_size, 256),
            nn.ReLU()
        )

        self.fc2 = nn.Sequential(
            nn.BatchNorm1d(256),
            nn.Linear(256, 128),
            nn.ReLU()
        )


        self.fc3 = nn.Sequential(
            nn.BatchNorm1d(128),
            nn.Linear(128, 16),
            nn.ReLU()
        )
        self.fc = nn.Linear(16, 2)  # deltaのlat, lng, distance, angle

    
    def forward(self, x):
        # input embedding
        batch_size = x["base"].shape[0]

        # x_phone = self.embedding_layer1(x['phonename'])
        x = torch.cat([x['base'], x['svid']], dim=-1)

        # MLP
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)

        # output [batch, 4] delta_lat, delta_lng, distance, angle
        x = self.fc(x).squeeze(1)
        return x

## train

In [59]:

oofs = []  # 全てのoofをdfで格納する
predictions = []  # 全ての予測値をdfで格納する
val_scores = []

os.makedirs(f'../../model/{EXP_NAME}', exist_ok=True)

# skf = model_selection.StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
gkf = model_selection.GroupKFold(n_splits=N_SPLITS)

train_fold = [(trn_idx, val_idx) for trn_idx, val_idx in gkf.split(train.loc[:, 'collectionName'], groups=train.loc[:, 'collectionName'])]
for fold in range(5):
    # 指定したfoldのみループを回す
    if fold not in USE_FOLDS:
        continue

    print('=' * 20)
    print(f'Fold {fold}')
    print('=' * 20)

    # train/valid data
    trn_idx_for_train, val_idx_for_train = train_fold[fold]
    trn_df = train.loc[trn_idx_for_train, :].reset_index(drop=True)
    val_df = train.loc[val_idx_for_train, :].reset_index(drop=True)

    # data loader
    loaders = {}
    loader_config = config["loader"]
    loaders["train"] = DataLoader(OutdoorDataset(trn_df, phase="train"), **loader_config["train"], worker_init_fn=worker_init_fn) 
    loaders["valid"] = DataLoader(OutdoorDataset(val_df, phase="valid"), **loader_config["valid"], worker_init_fn=worker_init_fn)
    loaders["test"] = DataLoader(OutdoorDataset(test, phase="test"), **loader_config["test"], worker_init_fn=worker_init_fn)

    # model
    model = MLPModel(phone_size) 
    model_name = model.__class__.__name__

    # callbacks
    callbacks = []
    checkpoint_callback = ModelCheckpoint(
        monitor=f'Loss/val',
        mode='min',
        dirpath=f"../../model/{EXP_NAME}",
        verbose=False,
        filename=f'{model_name}-{fold}')
    
    if MODEL_SAVE:
        callbacks.append(checkpoint_callback)

    early_stop_callback = EarlyStopping(
        monitor='Loss/val',
        min_delta=0.00,
        patience=20,
        verbose=False,
        mode='min')
    callbacks.append(early_stop_callback)

    # loggers
    RUN_NAME = EXP_NAME + "_" + EXP_MESSAGE
    wandb.init(project='outdoor', entity='kuto5046', group=RUN_NAME)
    wandb.run.name = RUN_NAME + f'-fold-{fold}'
    wandb_config = wandb.config
    wandb_config['model_name'] = model_name
    wandb_config['comment'] = NOTES
    wandb.watch(model)
    
    
    loggers = []
    loggers.append(WandbLogger())

    learner = Learner(model, config, ss_gt, ss_base)
    # pretrained flag
    if PRETRAINED:
        ckpt = torch.load(PRETRAINED_PATH + f'{model_name}-{fold}.ckpt')
        learner.load_state_dict(ckpt['state_dict'])

    if not ONLY_PRED:
        trainer = pl.Trainer(
            logger=loggers, 
            callbacks=callbacks,
            max_epochs=MAX_EPOCHS,
            gpus=[0],
            fast_dev_run=DEBUG,
            deterministic=True,
            # precision=16,
            progress_bar_refresh_rate=0  # vscodeの時progress barの動作が遅いので表示しない
            )

        trainer.fit(learner, train_dataloader=loaders['train'], val_dataloaders=loaders['valid'])

    #############
    # validation (to make oof)
    #############
    model.eval() 
    oof_df = train.loc[val_idx_for_train, :].reset_index(drop=True)
    oof_lat, oof_lng = evaluate(model, loaders, ss_gt, phase='valid')
    oof_df["oof_latDeg"] = oof_lat
    oof_df["oof_lngDeg"] = oof_lng
    oofs.append(oof_df)
    val_score = calc_haversine(oof_df['oof_latDeg'], oof_df['oof_lngDeg'], oof_df['latDeg'], oof_df['lngDeg']).mean()
    val_scores.append(val_score)
    print(f"fold {fold}: mean position error {val_score}")

    #############
    # inference
    #############
    preds_lat, preds_lng = evaluate(model, loaders, ss_gt, phase="test")
    test_preds = pd.DataFrame(np.stack((preds_lat, preds_lng))).T
    test_preds.columns = sub_df.columns
    # test_preds.to_csv(f'{EXP_NAME}_fold{fold}.csv', index=False)
    predictions.append(test_preds)

    if fold != 4:
        wandb.finish()

Fold 0


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Loss/val,269230.78125
dist_50/val,271.08649
dist_95/val,12743.20801
dist_all/val,2448.12915
comp_score/val,6507.14697
epoch,18.0
trainer/global_step,31140.0
_runtime,169.0
_timestamp,1622982340.0
_step,19.0


0,1
Loss/val,▁▁▁▁▁▁▁▁▁▁▁▁█▁▄▄▄▅▆
dist_50/val,▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁█▁▁▂
dist_95/val,▁▁▁▁▁▁▁▁▁▁▁▁▂▁▂█▁▂▆
dist_all/val,▁▁▁▁▁▁▁▁▁▁▁▁▂▁▂█▁▂▄
comp_score/val,▁▁▁▁▁▁▁▁▁▁▁▁▂▁▂█▁▂▅
epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██
trainer/global_step,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██
_runtime,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇█
_timestamp,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇█
_step,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██


[34m[1mwandb[0m: wandb version 0.10.31 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


GPU available: True, used: True
TPU available: None, using: 0 TPU cores

  | Name      | Type        | Params
------------------------------------------
0 | model     | MLPModel    | 46.1 K
1 | criterion | DistMSELoss | 0     
------------------------------------------
46.1 K    Trainable params
0         Non-trainable params
46.1 K    Total params
0.184     Total estimated model params size (MB)


fold 0: mean position error 138850.41733919643


TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 198, in _worker_loop
    data = fetcher.fetch(index)
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/collate.py", line 83, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/collate.py", line 83, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/collate.py", line 85, in default_collate
    raise TypeError(default_collate_err_msg_format.format(elem_type))
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>


## validation

In [None]:
if len(USE_FOLDS) > 1:
    oofs_df = pd.concat(oofs)
else:
    oofs_df = oofs[0]

oofs_df['site_path_timestamp'] = oofs_df['site_id_str'].astype(str) + '_' + oofs_df['path'] + '_' + oofs_df['timestamp'].astype(str)
oofs_df = oofs_df.sort_values('site_path_timestamp').reset_index(drop=True)
# oofs_df.to_csv("oof.csv", index=False)
oofs_df

In [None]:
# waypoint補正前のx,yでの評価
oof_score = mean_position_error(
    oofs_df['oof_x'], oofs_df['oof_y'], 0, 
    oofs_df['x'], oofs_df['y'], 0
    )
wandb_config['CV'] = oof_score
print(f"CV:{oof_score}")

In [None]:
if len(USE_FOLDS) > 1:
    # foldの結果を平均した後、reindexでsubmission fileにindexを合わせる
    # all_preds = pd.concat(predictions).groupby('site_path_timestamp').mean()
    all_preds = pd.concat(predictions).reset_index().groupby('index').mean()
    all_preds['site_path_timestamp'] = predictions[0]['site_path_timestamp'].values
else:
    all_preds = predictions[0]

In [None]:
sub.to_csv(EXP_NAME + '_sub.csv')