In [1]:
# for local
import os
os.environ["CUDA_VISIBLE_DEVICES"]="2"

In [2]:
# ====================================================
# Directory settings
# ====================================================
import os

EXP_NAME='1023_lstm_6layer-ver6_mod-feat11_R20'

DATA_DIR = "../input/ventilator-pressure-prediction/R_20/"

OUTPUT_DIR = f'./results/{EXP_NAME}/'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Config

In [3]:
# ====================================================
# CFG
# ====================================================
class CFG:
    experiment_name=EXP_NAME
    competition='ventilator'
    apex=True
    print_freq=1000
    num_workers=4
    model_name='lstm'
    scheduler='CosineAnnealingWarmRestarts' # ['linear', 'cosine', 'ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts']
    batch_scheduler=False
    #num_warmup_steps=100 # ['linear', 'cosine']
    #num_cycles=0.5 # 'cosine'
    factor=0.995 # ReduceLROnPlateau
    patience=7 # ReduceLROnPlateau
    eps=1e-6 # ReduceLROnPlateau
    T_max=50 # CosineAnnealingLR
    T_0=20 # CosineAnnealingWarmRestarts
    epochs=300
    max_grad_norm=1000
    gradient_accumulation_steps=1
    hidden_size=1024
    lr=1e-3
    min_lr=1e-5
    weight_decay=1e-6
    batch_size=256
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    cate_seq_cols=[]
    cont_seq_cols=['C', 'time_step', 'u_in', 'u_out'] #'R'
    train=True
    inference=True
    feature_importance=True
    debug=False
    wandb=True

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold=[0]

# import

In [4]:
# ====================================================
# Library
# ====================================================
import os
import gc
import sys
import json
import math
import random
from time import time
from datetime import datetime
from collections import Counter, defaultdict

import scipy as sp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from tqdm.auto import tqdm
import category_encoders as ce

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau
from torch.cuda.amp import GradScaler
from torch.cuda.amp import autocast

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

import warnings
warnings.filterwarnings("ignore")

#if CFG.apex:
#    from apex import amp

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

2021-10-23 15:52:30.598068: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2021-10-23 15:52:30.598090: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# wandb

In [5]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    import wandb

    # try:
    #     from kaggle_secrets import UserSecretsClient
    #     user_secrets = UserSecretsClient()
    #     secret_value_0 = user_secrets.get_secret("wandb_api")
    #     wandb.login(key=secret_value_0)
    #     anony = None
    # except:
    #     anony = "must"
    #     print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

    anony=None # not for kaggle kernel

    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project="Ventilator-Pressure-Public", 
                     # name=CFG.model_name,
                     config=class2dict(CFG),
                     group=CFG.experiment_name,
                     job_type="train",
                     anonymous=anony)

[34m[1mwandb[0m: Currently logged in as: [33mhypknot[0m (use `wandb login --relogin` to force relogin)
2021-10-23 15:52:33.233937: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2021-10-23 15:52:33.233957: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.

CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



# Utils

In [6]:
# ====================================================
# Utils
# ====================================================
def get_score(y_trues, y_preds):
    score = mean_absolute_error(y_trues, y_preds)
    return score


def init_logger(log_file=OUTPUT_DIR+'train.log'):
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = init_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything()

def decorate(s: str, decoration=None):
    if decoration is None:
        decoration = '★' * 20

    return ' '.join([decoration, str(s), decoration])

class Timer:
    def __init__(self, logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None, sep=' ', verbose=0):

        if prefix: format_str = str(prefix) + sep + format_str
        if suffix: format_str = format_str + sep + str(suffix)
        self.format_str = format_str
        self.logger = logger
        self.start = None
        self.end = None
        self.verbose = verbose

    @property
    def duration(self):
        if self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        self.start = time()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time()
        if self.verbose is None:
            return
        out_str = self.format_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)

# data loading

In [7]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv(DATA_DIR + 'train.csv')
if CFG.debug:
    train = train[:80*5000]
test = pd.read_csv(DATA_DIR + 'test.csv')
sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')

display(train.head())
display(test.head())
display(sub.head())

unique_pressures = train["pressure"].unique()
sorted_pressures = np.sort(unique_pressures)
total_pressures_len = len(sorted_pressures)

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.0,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.35585,0,12.234987


Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out
0,321,31,20,50,0.0,100.0,0
1,322,31,20,50,0.034079,58.192753,0
2,323,31,20,50,0.068005,12.08352,0
3,324,31,20,50,0.102059,13.780551,0
4,325,31,20,50,0.135973,48.687221,0


Unnamed: 0,id,pressure
0,321,0
1,322,0
2,323,0
3,324,0
4,325,0


# create features

In [8]:
class AbstractBaseBlock:
    def fit(self, input_df: pd.DataFrame, y=None):
        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame) -> pd.DataFrame:
        raise NotImplementedError()


class AddMultiplyingDividing(AbstractBaseBlock):
    def transform(self, input_df):
        input_df['area'] = input_df['time_step'] * input_df['u_in']
        input_df['area'] = input_df.groupby('breath_id')['area'].cumsum()
        input_df['cross'] = input_df['u_in']*input_df['u_out']
        input_df['cross2'] = input_df['time_step']*input_df['u_out']
        input_df['u_in_cumsum'] = (input_df['u_in']).groupby(input_df['breath_id']).cumsum()
        input_df['one'] = 1
        input_df['count'] = (input_df['one']).groupby(input_df['breath_id']).cumsum()
        input_df['u_in_cummean'] = input_df['u_in_cumsum'] / input_df['count']
        # input_df = input_df.merge(
        #     input_df[input_df["u_out"]==0].groupby('breath_id')['u_in'].agg(["mean", "std", "max"]).add_prefix("u_out0_").reset_index(),
        #     on="breath_id"
        # )
        # input_df = input_df.merge(
        #     input_df[input_df["u_out"]==1].groupby('breath_id')['u_in'].agg(["mean", "std", "max"]).add_prefix("u_out1_").reset_index(),
        #     on="breath_id"
        # )

        # feat-11
        input_df['time_step_cumsum'] = input_df.groupby(['breath_id'])['time_step'].cumsum()
        input_df['breath_id__u_in__max'] = input_df.groupby(['breath_id'])['u_in'].transform('max')
        input_df['breath_id__u_in__mean'] = input_df.groupby(['breath_id'])['u_in'].transform('mean')
        input_df['breath_id__u_in__diffmax'] = input_df.groupby(['breath_id'])['u_in'].transform('max') - input_df['u_in']
        input_df['breath_id__u_in__diffmean'] = input_df.groupby(['breath_id'])['u_in'].transform('mean') - input_df['u_in']

        output_df = pd.DataFrame(
            {
                "area": input_df['area'],
                #"cross": input_df['cross'],
                #"cross2": input_df['cross2'],
                "u_in_cumsum": input_df['u_in_cumsum'],
                "u_in_cummean": input_df['u_in_cummean'],
                'time_step_cumsum': input_df['time_step_cumsum'],
                "breath_id__u_in__max": input_df['breath_id__u_in__max'],
                "breath_id__u_in__mean": input_df['breath_id__u_in__mean'],
                "breath_id__u_in__diffmax": input_df['breath_id__u_in__diffmax'],
                "breath_id__u_in__diffmean": input_df['breath_id__u_in__diffmean'],

            }
        )
        CFG.cont_seq_cols += output_df.add_suffix(f'@{self.__class__.__name__}').columns.tolist()
        return output_df


# class RCDummry(AbstractBaseBlock):
#     def transform(self, input_df):
#         input_df['R_dummy'] = input_df['R'].astype(str)
#         input_df['C_dummy'] = input_df['C'].astype(str)
#         #input_df['RC_dummy'] = input_df['R_dummy'] + input_df['C_dummy']
#         output_df = pd.get_dummies(input_df[["R_dummy", "C_dummy"]])
#         CFG.cont_seq_cols += output_df.add_suffix(f'@{self.__class__.__name__}').columns.tolist()
#         return output_df


class AddBreathTimeAndUInTime(AbstractBaseBlock):
    def transform(self, input_df):
        output_df = pd.DataFrame(
            {
                "breath_time": input_df['time_step'] - input_df['time_step'].shift(1),
                "u_in_time": input_df['u_in'] - input_df['u_in'].shift(1)
            }
        )
        output_df.loc[input_df['time_step'] == 0, 'breath_time'] = output_df['breath_time'].mean()
        output_df.loc[input_df['time_step'] == 0, 'u_in_time'] = output_df['u_in_time'].mean()
        CFG.cont_seq_cols += output_df.add_suffix(f'@{self.__class__.__name__}').columns.tolist()
        return output_df

class LagFeatures(AbstractBaseBlock):
    def transform(self, input_df):
        output_df = pd.DataFrame(
            {
                "u_in_lag1": input_df.groupby("breath_id")["u_in"].shift(1).fillna(0),
                "u_in_lag2": input_df.groupby("breath_id")["u_in"].shift(2).fillna(0),
                "u_in_lag3": input_df.groupby("breath_id")["u_in"].shift(3).fillna(0),
                "u_in_lag4": input_df.groupby("breath_id")["u_in"].shift(4).fillna(0),
                "u_in_lag-1": input_df.groupby("breath_id")["u_in"].shift(-1).fillna(0),
                "u_in_lag-2": input_df.groupby("breath_id")["u_in"].shift(-2).fillna(0),
                "u_in_lag-3": input_df.groupby("breath_id")["u_in"].shift(-3).fillna(0),
                "u_in_lag-4": input_df.groupby("breath_id")["u_in"].shift(-4).fillna(0),
                "u_out_lag1": input_df.groupby("breath_id")["u_out"].shift(1).fillna(0),
                "u_out_lag2": input_df.groupby("breath_id")["u_out"].shift(2).fillna(0),
                "u_out_lag3": input_df.groupby("breath_id")["u_out"].shift(3).fillna(0),
                "u_out_lag4": input_df.groupby("breath_id")["u_out"].shift(4).fillna(0),
                "u_out_lag-1": input_df.groupby("breath_id")["u_out"].shift(-1).fillna(0),
                "u_out_lag-2": input_df.groupby("breath_id")["u_out"].shift(-2).fillna(0),
                "u_out_lag-3": input_df.groupby("breath_id")["u_out"].shift(-3).fillna(0),
                "u_out_lag-4": input_df.groupby("breath_id")["u_out"].shift(-4).fillna(0),
            }
        )
        output_df["u_in_lag1_diff"] = input_df["u_in"] - output_df["u_in_lag1"]
        output_df["u_in_lag2_diff"] = input_df["u_in"] - output_df["u_in_lag2"]
        output_df["u_in_lag3_diff"] = input_df["u_in"] - output_df["u_in_lag3"]
        output_df["u_in_lag4_diff"] = input_df["u_in"] - output_df["u_in_lag4"]
        output_df["u_out_lag1_diff"] = input_df["u_out"] - output_df["u_out_lag1"]
        output_df["u_out_lag2_diff"] = input_df["u_out"] - output_df["u_out_lag2"]
        output_df["u_out_lag3_diff"] = input_df["u_out"] - output_df["u_out_lag3"]
        output_df["u_out_lag4_diff"] = input_df["u_out"] - output_df["u_out_lag4"]
        output_df["u_in_lag-1_diff"] = input_df["u_in"] - output_df["u_in_lag-1"]
        output_df["u_in_lag-2_diff"] = input_df["u_in"] - output_df["u_in_lag-2"]
        output_df["u_out_lag-1_diff"] = input_df["u_out"] - output_df["u_out_lag-1"]
        output_df["u_out_lag-2_diff"] = input_df["u_out"] - output_df["u_out_lag-2"]

        output_df['u_in_ewm9'] = (input_df.groupby('breath_id')['u_in'].ewm(halflife=9).mean().reset_index(level=0,drop=True))
        output_df['u_in_ewm15'] = (input_df.groupby('breath_id')['u_in'].ewm(halflife=15).mean().reset_index(level=0,drop=True))

        output_df["u_in_rolling_mean2"] = input_df[["breath_id", "u_in"]].groupby("breath_id").rolling(2).mean()["u_in"].reset_index(drop=True)
        output_df["u_in_rolling_mean4"] = input_df[["breath_id", "u_in"]].groupby("breath_id").rolling(4).mean()["u_in"].reset_index(drop=True)
        output_df["u_in_rolling_mean15"] = input_df[["breath_id", "u_in"]].groupby("breath_id").rolling(15).mean()["u_in"].reset_index(drop=True)
        if not CFG.debug:
            output_df["u_in_rolling_max2"] = input_df[["breath_id", "u_in"]].groupby("breath_id").rolling(2).max()["u_in"].reset_index(drop=True)
            output_df["u_in_rolling_max4"] = input_df[["breath_id", "u_in"]].groupby("breath_id").rolling(4).max()["u_in"].reset_index(drop=True)
            output_df["u_in_rolling_max15"] = input_df[["breath_id", "u_in"]].groupby("breath_id").rolling(15).max()["u_in"].reset_index(drop=True)
            output_df["u_in_rolling_min2"] = input_df[["breath_id", "u_in"]].groupby("breath_id").rolling(2).min()["u_in"].reset_index(drop=True)
            output_df["u_in_rolling_min4"] = input_df[["breath_id", "u_in"]].groupby("breath_id").rolling(4).min()["u_in"].reset_index(drop=True)
            output_df["u_in_rolling_min15"] = input_df[["breath_id", "u_in"]].groupby("breath_id").rolling(15).min()["u_in"].reset_index(drop=True)
            output_df["u_in_rolling_std2"] = input_df[["breath_id", "u_in"]].groupby("breath_id").rolling(2).std()["u_in"].reset_index(drop=True)
            output_df["u_in_rolling_std4"] = input_df[["breath_id", "u_in"]].groupby("breath_id").rolling(4).std()["u_in"].reset_index(drop=True)
            output_df["u_in_rolling_std15"] = input_df[["breath_id", "u_in"]].groupby("breath_id").rolling(15).std()["u_in"].reset_index(drop=True)
            output_df["u_in_rolling_sum2"] = input_df[["breath_id", "u_in"]].groupby("breath_id").rolling(2).sum()["u_in"].reset_index(drop=True)
            output_df["u_in_rolling_sum4"] = input_df[["breath_id", "u_in"]].groupby("breath_id").rolling(4).sum()["u_in"].reset_index(drop=True)
            output_df["u_in_rolling_sum15"] = input_df[["breath_id", "u_in"]].groupby("breath_id").rolling(15).sum()["u_in"].reset_index(drop=True)
        for col in output_df.columns:
            output_df[col] = output_df[col].fillna(output_df[col].mean())
        CFG.cont_seq_cols += output_df.add_suffix(f'@{self.__class__.__name__}').columns.tolist()
        return output_df

In [9]:
feature_blocks = [
    AddMultiplyingDividing(),
    AddBreathTimeAndUInTime(),
#     RCDummry(),
    LagFeatures()
]

In [10]:
def run_blocks(input_df, blocks, y=None, test=False):
    out_df = pd.DataFrame()

    print(decorate('start run blocks...'))

    with Timer(prefix='run test={}'.format(test)):
        for block in feature_blocks:
            with Timer(prefix='out_df shape: {} \t- {}'.format(out_df.shape, str(block))):
                if not test:
                    out_i = block.fit(input_df.copy(), y=y)
                else:
                    out_i = block.transform(input_df.copy())

            assert len(input_df) == len(out_i), block
            name = block.__class__.__name__
            out_df = pd.concat([out_df, out_i.add_suffix(f'@{name}')], axis=1)
    print(f"out_df shape: {out_df.shape}")

    return pd.concat([input_df, out_df], axis=1)

train = run_blocks(train, blocks=feature_blocks)
test = run_blocks(test, blocks=feature_blocks, test=True)
CFG.cont_seq_cols = list(set(CFG.cont_seq_cols))
display(train.head())
display(test.head())

★★★★★★★★★★★★★★★★★★★★ start run blocks... ★★★★★★★★★★★★★★★★★★★★
out_df shape: (0, 0) 	- <__main__.AddMultiplyingDividing object at 0x7fab26cc10d0> 0.346[s]
out_df shape: (1637120, 8) 	- <__main__.AddBreathTimeAndUInTime object at 0x7fab26cc1350> 0.040[s]
out_df shape: (1637120, 10) 	- <__main__.LagFeatures object at 0x7fab26cc14d0> 22.745[s]
run test=False 23.550[s]
out_df shape: (1637120, 55)
★★★★★★★★★★★★★★★★★★★★ start run blocks... ★★★★★★★★★★★★★★★★★★★★
out_df shape: (0, 0) 	- <__main__.AddMultiplyingDividing object at 0x7fab26cc10d0> 0.170[s]
out_df shape: (1110400, 8) 	- <__main__.AddBreathTimeAndUInTime object at 0x7fab26cc1350> 0.017[s]
out_df shape: (1110400, 10) 	- <__main__.LagFeatures object at 0x7fab26cc14d0> 14.731[s]
run test=True 15.185[s]
out_df shape: (1110400, 55)


Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure,area@AddMultiplyingDividing,u_in_cumsum@AddMultiplyingDividing,u_in_cummean@AddMultiplyingDividing,time_step_cumsum@AddMultiplyingDividing,breath_id__u_in__max@AddMultiplyingDividing,breath_id__u_in__mean@AddMultiplyingDividing,breath_id__u_in__diffmax@AddMultiplyingDividing,breath_id__u_in__diffmean@AddMultiplyingDividing,breath_time@AddBreathTimeAndUInTime,u_in_time@AddBreathTimeAndUInTime,u_in_lag1@LagFeatures,u_in_lag2@LagFeatures,u_in_lag3@LagFeatures,u_in_lag4@LagFeatures,u_in_lag-1@LagFeatures,u_in_lag-2@LagFeatures,u_in_lag-3@LagFeatures,u_in_lag-4@LagFeatures,u_out_lag1@LagFeatures,u_out_lag2@LagFeatures,u_out_lag3@LagFeatures,u_out_lag4@LagFeatures,u_out_lag-1@LagFeatures,u_out_lag-2@LagFeatures,u_out_lag-3@LagFeatures,u_out_lag-4@LagFeatures,u_in_lag1_diff@LagFeatures,u_in_lag2_diff@LagFeatures,u_in_lag3_diff@LagFeatures,u_in_lag4_diff@LagFeatures,u_out_lag1_diff@LagFeatures,u_out_lag2_diff@LagFeatures,u_out_lag3_diff@LagFeatures,u_out_lag4_diff@LagFeatures,u_in_lag-1_diff@LagFeatures,u_in_lag-2_diff@LagFeatures,u_out_lag-1_diff@LagFeatures,u_out_lag-2_diff@LagFeatures,u_in_ewm9@LagFeatures,u_in_ewm15@LagFeatures,u_in_rolling_mean2@LagFeatures,u_in_rolling_mean4@LagFeatures,u_in_rolling_mean15@LagFeatures,u_in_rolling_max2@LagFeatures,u_in_rolling_max4@LagFeatures,u_in_rolling_max15@LagFeatures,u_in_rolling_min2@LagFeatures,u_in_rolling_min4@LagFeatures,u_in_rolling_min15@LagFeatures,u_in_rolling_std2@LagFeatures,u_in_rolling_std4@LagFeatures,u_in_rolling_std15@LagFeatures,u_in_rolling_sum2@LagFeatures,u_in_rolling_sum4@LagFeatures,u_in_rolling_sum15@LagFeatures
0,1,1,20,50,0.0,0.083334,0,5.837492,0.0,0.083334,0.083334,0.0,28.313036,10.146007,28.229702,10.062673,2e-06,3e-06,0.0,0.0,0.0,0.0,18.383041,22.509278,22.808822,25.35585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083334,0.083334,0.083334,0.083334,0.0,0.0,0.0,0.0,-18.299707,-22.425944,0.0,0.0,0.083334,0.083334,7.59225,7.404287,6.515106,8.602252,9.450181,11.543731,6.582249,5.535887,3.00269,1.428358,1.823449,2.94739,15.184501,29.617148,97.726585
1,2,1,20,50,0.033652,18.383041,0,5.907794,0.618632,18.466375,9.233188,0.033652,28.313036,10.146007,9.929994,-8.237035,0.033652,18.299707,0.083334,0.0,0.0,0.0,22.509278,22.808822,25.35585,27.259866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.299707,18.383041,18.383041,18.383041,0.0,0.0,0.0,0.0,-4.126236,-4.425781,0.0,0.0,9.585358,9.444557,9.233188,7.404287,6.515106,18.383041,9.450181,11.543731,0.083334,5.535887,3.00269,12.939847,1.823449,2.94739,18.466375,29.617148,97.726585
2,3,1,20,50,0.067514,22.509278,0,7.876254,2.138333,40.975653,13.658551,0.101167,28.313036,10.146007,5.803758,-12.363271,0.033862,4.126236,18.383041,0.083334,0.0,0.0,22.808822,25.35585,27.259866,27.127486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.126236,22.425944,22.509278,22.509278,0.0,0.0,0.0,0.0,-0.299544,-2.846573,0.0,0.0,14.22904,14.002181,20.44616,7.404287,6.515106,22.509278,9.450181,11.543731,18.383041,5.535887,3.00269,2.91769,1.823449,2.94739,40.892319,29.617148,97.726585
3,4,1,20,50,0.101542,22.808822,0,11.742872,4.454391,63.784476,15.946119,0.202709,28.313036,10.146007,5.504214,-12.662816,0.034028,0.299544,22.509278,18.383041,0.083334,0.0,25.35585,27.259866,27.127486,26.807732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.299544,4.425781,22.725488,22.808822,0.0,0.0,0.0,0.0,-2.547028,-4.451044,0.0,0.0,16.627759,16.358716,22.65905,15.946119,6.515106,22.808822,22.808822,11.543731,22.509278,0.083334,3.00269,0.21181,10.766279,2.94739,45.3181,63.784476,97.726585
4,5,1,20,50,0.135756,25.35585,0,12.234987,7.896588,89.140326,17.828065,0.338464,28.313036,10.146007,2.957185,-15.209844,0.034213,2.547028,22.808822,22.509278,18.383041,0.083334,27.259866,27.127486,26.807732,27.864715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.547028,2.846573,6.972809,25.272516,0.0,0.0,0.0,0.0,-1.904016,-1.771635,0.0,0.0,18.652046,18.328164,24.082336,22.264248,6.515106,25.35585,25.35585,11.543731,22.808822,18.383041,3.00269,1.801021,2.885502,2.94739,48.164673,89.056992,97.726585


Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,area@AddMultiplyingDividing,u_in_cumsum@AddMultiplyingDividing,u_in_cummean@AddMultiplyingDividing,time_step_cumsum@AddMultiplyingDividing,breath_id__u_in__max@AddMultiplyingDividing,breath_id__u_in__mean@AddMultiplyingDividing,breath_id__u_in__diffmax@AddMultiplyingDividing,breath_id__u_in__diffmean@AddMultiplyingDividing,breath_time@AddBreathTimeAndUInTime,u_in_time@AddBreathTimeAndUInTime,u_in_lag1@LagFeatures,u_in_lag2@LagFeatures,u_in_lag3@LagFeatures,u_in_lag4@LagFeatures,u_in_lag-1@LagFeatures,u_in_lag-2@LagFeatures,u_in_lag-3@LagFeatures,u_in_lag-4@LagFeatures,u_out_lag1@LagFeatures,u_out_lag2@LagFeatures,u_out_lag3@LagFeatures,u_out_lag4@LagFeatures,u_out_lag-1@LagFeatures,u_out_lag-2@LagFeatures,u_out_lag-3@LagFeatures,u_out_lag-4@LagFeatures,u_in_lag1_diff@LagFeatures,u_in_lag2_diff@LagFeatures,u_in_lag3_diff@LagFeatures,u_in_lag4_diff@LagFeatures,u_out_lag1_diff@LagFeatures,u_out_lag2_diff@LagFeatures,u_out_lag3_diff@LagFeatures,u_out_lag4_diff@LagFeatures,u_in_lag-1_diff@LagFeatures,u_in_lag-2_diff@LagFeatures,u_out_lag-1_diff@LagFeatures,u_out_lag-2_diff@LagFeatures,u_in_ewm9@LagFeatures,u_in_ewm15@LagFeatures,u_in_rolling_mean2@LagFeatures,u_in_rolling_mean4@LagFeatures,u_in_rolling_mean15@LagFeatures,u_in_rolling_max2@LagFeatures,u_in_rolling_max4@LagFeatures,u_in_rolling_max15@LagFeatures,u_in_rolling_min2@LagFeatures,u_in_rolling_min4@LagFeatures,u_in_rolling_min15@LagFeatures,u_in_rolling_std2@LagFeatures,u_in_rolling_std4@LagFeatures,u_in_rolling_std15@LagFeatures,u_in_rolling_sum2@LagFeatures,u_in_rolling_sum4@LagFeatures,u_in_rolling_sum15@LagFeatures
0,321,31,20,50,0.0,100.0,0,0.0,100.0,100.0,0.0,100.0,11.051577,0.0,-88.948423,2e-06,-8.6e-05,0.0,0.0,0.0,0.0,58.192753,12.08352,13.780551,48.687221,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,100.0,100.0,0.0,0.0,0.0,0.0,41.807247,87.91648,0.0,0.0,100.0,100.0,7.56283,7.375002,6.479142,8.557883,9.397677,11.467631,6.567778,5.526231,2.991942,1.407217,1.804946,2.92997,15.125661,29.500007,97.187134
1,322,31,20,50,0.034079,58.192753,0,1.983169,158.192753,79.096376,0.034079,100.0,11.051577,41.807247,-47.141176,0.034079,-41.807247,100.0,0.0,0.0,0.0,12.08352,13.780551,48.687221,23.392914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-41.807247,58.192753,58.192753,58.192753,0.0,0.0,0.0,0.0,46.109233,44.412202,0.0,0.0,78.291814,78.613486,79.096376,7.375002,6.479142,100.0,9.397677,11.467631,58.192753,5.526231,2.991942,29.562188,1.804946,2.92997,158.192753,29.500007,97.187134
2,323,31,20,50,0.068005,12.08352,0,2.804913,170.276273,56.758758,0.102085,100.0,11.051577,87.91648,-1.031943,0.033926,-46.109233,58.192753,100.0,0.0,0.0,13.780551,48.687221,23.392914,5.957594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-46.109233,-87.91648,12.08352,12.08352,0.0,0.0,0.0,0.0,-1.69703,-36.603701,0.0,0.0,54.502568,55.404528,35.138137,7.375002,6.479142,58.192753,9.397677,11.467631,12.08352,5.526231,2.991942,32.604151,1.804946,2.92997,70.276273,29.500007,97.187134
3,324,31,20,50,0.102059,13.780551,0,4.211347,184.056824,46.014206,0.204144,100.0,11.051577,86.219449,-2.728974,0.034054,1.69703,12.08352,58.192753,100.0,0.0,48.687221,23.392914,5.957594,44.794983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.69703,-44.412202,-86.219449,13.780551,0.0,0.0,0.0,0.0,-34.90667,-9.612363,0.0,0.0,43.117588,44.26653,12.932036,46.014206,6.479142,13.780551,100.0,11.467631,12.08352,12.08352,2.991942,1.199982,41.845282,2.92997,25.864071,184.056824,97.187134
4,325,31,20,50,0.135973,48.687221,0,10.831482,232.744045,46.548809,0.340117,100.0,11.051577,51.312779,-37.635644,0.033913,34.90667,13.780551,12.08352,58.192753,100.0,23.392914,5.957594,44.794983,38.069183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.90667,36.603701,-9.505532,-51.312779,0.0,0.0,0.0,0.0,25.294307,42.729627,0.0,0.0,44.409341,45.234207,31.233886,33.186011,6.479142,48.687221,58.192753,11.467631,13.780551,12.08352,2.991942,24.682743,23.717164,2.92997,62.467772,132.744045,97.187134


# normalization

In [11]:
train_col_order = ["u_out"] + train.columns.drop("u_out").tolist()
test_col_order = ["u_out"] + test.columns.drop("u_out").tolist()
train = train[train_col_order]
test = test[test_col_order]
scaler = RobustScaler()
scaler_targets = [col for col in CFG.cont_seq_cols if col != "u_out"]
print(f"Apply Standerd Scaler these columns: {scaler_targets}")
for scaler_target in tqdm(scaler_targets):
    scaler.fit(train.loc[:,[scaler_target]])
    train.loc[:,[scaler_target]] = scaler.transform(train.loc[:,[scaler_target]])
    test.loc[:,[scaler_target]] = scaler.transform(test.loc[:,[scaler_target]])
display(train.head())
display(test.head())

Apply Standerd Scaler these columns: ['u_in_rolling_min2@LagFeatures', 'u_in_lag2_diff@LagFeatures', 'u_in', 'u_in_rolling_std4@LagFeatures', 'u_in_cummean@AddMultiplyingDividing', 'u_out_lag3@LagFeatures', 'u_in_lag3_diff@LagFeatures', 'u_in_lag-2@LagFeatures', 'u_in_rolling_mean15@LagFeatures', 'u_in_lag3@LagFeatures', 'breath_id__u_in__diffmean@AddMultiplyingDividing', 'u_in_rolling_max15@LagFeatures', 'u_out_lag3_diff@LagFeatures', 'u_in_rolling_mean2@LagFeatures', 'u_in_rolling_min4@LagFeatures', 'u_out_lag1@LagFeatures', 'u_in_lag1_diff@LagFeatures', 'u_out_lag-2@LagFeatures', 'u_in_rolling_std15@LagFeatures', 'u_out_lag2_diff@LagFeatures', 'u_out_lag-2_diff@LagFeatures', 'C', 'u_in_rolling_mean4@LagFeatures', 'u_out_lag-4@LagFeatures', 'area@AddMultiplyingDividing', 'u_in_lag1@LagFeatures', 'u_in_ewm15@LagFeatures', 'breath_id__u_in__diffmax@AddMultiplyingDividing', 'u_out_lag4@LagFeatures', 'u_in_lag-2_diff@LagFeatures', 'u_in_rolling_max4@LagFeatures', 'u_in_lag-1@LagFeatures'

  0%|          | 0/58 [00:00<?, ?it/s]

Unnamed: 0,u_out,id,breath_id,R,C,time_step,u_in,pressure,area@AddMultiplyingDividing,u_in_cumsum@AddMultiplyingDividing,u_in_cummean@AddMultiplyingDividing,time_step_cumsum@AddMultiplyingDividing,breath_id__u_in__max@AddMultiplyingDividing,breath_id__u_in__mean@AddMultiplyingDividing,breath_id__u_in__diffmax@AddMultiplyingDividing,breath_id__u_in__diffmean@AddMultiplyingDividing,breath_time@AddBreathTimeAndUInTime,u_in_time@AddBreathTimeAndUInTime,u_in_lag1@LagFeatures,u_in_lag2@LagFeatures,u_in_lag3@LagFeatures,u_in_lag4@LagFeatures,u_in_lag-1@LagFeatures,u_in_lag-2@LagFeatures,u_in_lag-3@LagFeatures,u_in_lag-4@LagFeatures,u_out_lag1@LagFeatures,u_out_lag2@LagFeatures,u_out_lag3@LagFeatures,u_out_lag4@LagFeatures,u_out_lag-1@LagFeatures,u_out_lag-2@LagFeatures,u_out_lag-3@LagFeatures,u_out_lag-4@LagFeatures,u_in_lag1_diff@LagFeatures,u_in_lag2_diff@LagFeatures,u_in_lag3_diff@LagFeatures,u_in_lag4_diff@LagFeatures,u_out_lag1_diff@LagFeatures,u_out_lag2_diff@LagFeatures,u_out_lag3_diff@LagFeatures,u_out_lag4_diff@LagFeatures,u_in_lag-1_diff@LagFeatures,u_in_lag-2_diff@LagFeatures,u_out_lag-1_diff@LagFeatures,u_out_lag-2_diff@LagFeatures,u_in_ewm9@LagFeatures,u_in_ewm15@LagFeatures,u_in_rolling_mean2@LagFeatures,u_in_rolling_mean4@LagFeatures,u_in_rolling_mean15@LagFeatures,u_in_rolling_max2@LagFeatures,u_in_rolling_max4@LagFeatures,u_in_rolling_max15@LagFeatures,u_in_rolling_min2@LagFeatures,u_in_rolling_min4@LagFeatures,u_in_rolling_min15@LagFeatures,u_in_rolling_std2@LagFeatures,u_in_rolling_std4@LagFeatures,u_in_rolling_std15@LagFeatures,u_in_rolling_sum2@LagFeatures,u_in_rolling_sum4@LagFeatures,u_in_rolling_sum15@LagFeatures
0,0,1,1,20,0.75,-0.980732,-0.989839,5.837492,-0.53058,-0.742594,-0.703117,-0.495562,0.00549,0.637685,0.209202,1.061599,-26.491835,-0.046256,-0.906911,-0.890922,-0.872571,-0.850923,2.802562,3.611786,3.688712,4.214684,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.455149,0.219465,0.128364,0.06774,0.0,0.0,0.0,0.0,-123.513619,-92.752936,0.0,0.0,-0.595377,-0.620365,0.633144,0.45909,0.396114,0.735303,0.626721,0.910399,0.431017,0.273304,0.167721,3.211678,1.995268,0.57925,0.633144,0.45909,0.396114
1,0,2,1,20,0.75,-0.955707,3.028841,5.907794,-0.52825,-0.700602,0.026002,-0.494933,0.00549,0.637685,-0.263759,-1.371939,-0.19877,123.513619,-0.890191,-0.890922,-0.872571,-0.850923,3.639201,3.671901,4.200223,4.597327,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,110.271417,59.047891,39.810492,27.181467,0.0,0.0,0.0,0.0,-27.814132,-18.267041,0.0,0.0,0.673025,0.44172,0.984984,0.45909,0.396114,2.588583,0.626721,0.910399,-0.873935,0.273304,0.167721,30.24215,1.995268,0.57925,0.984984,0.45909,0.396114
2,0,3,1,20,0.75,-0.930527,3.934976,7.876254,-0.522527,-0.649184,0.378643,-0.493672,0.00549,0.637685,-0.370403,-1.920655,-0.034836,27.814132,2.781313,-0.874193,-0.872571,-0.850923,3.699937,4.183062,4.5826,4.570723,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,24.827532,72.044687,48.758058,33.295097,0.0,0.0,0.0,0.0,-1.976253,-11.732171,0.0,0.0,1.292899,0.958809,3.389207,0.45909,0.396114,3.37043,0.626721,0.910399,2.80056,0.273304,0.167721,6.708823,1.995268,0.57925,3.389207,0.45909,0.396114
3,0,4,1,20,0.75,-0.905224,4.000757,11.742872,-0.513804,-0.597082,0.560931,-0.491776,0.00549,0.637685,-0.378144,-1.960489,0.094635,1.976253,3.609167,2.799461,-0.85583,-0.850923,4.216374,4.565178,4.556014,4.506464,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.75856,14.179213,49.226901,33.738917,0.0,0.0,0.0,0.0,-17.151295,-18.371581,0.0,0.0,1.613098,1.226171,3.863682,1.898243,0.396114,3.427188,2.435314,0.910399,3.629089,-0.824205,0.167721,0.355066,12.837915,0.57925,3.863682,1.898243,0.396114
4,0,5,1,20,0.75,-0.879783,4.560094,12.234987,-0.50084,-0.539162,0.710897,-0.48924,0.00549,0.637685,-0.443973,-2.299199,0.239568,17.151295,3.669266,3.6278,2.820367,-0.834169,4.602434,4.538611,4.491799,4.718882,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,15.307374,9.102502,15.067893,37.389245,0.0,0.0,0.0,0.0,-12.809665,-7.284008,0.0,0.0,1.883316,1.449616,4.168855,2.96274,0.396114,3.909803,2.78015,0.910399,3.689236,2.859225,0.167721,4.086739,3.282944,0.57925,4.168855,2.96274,0.396114


Unnamed: 0,u_out,id,breath_id,R,C,time_step,u_in,area@AddMultiplyingDividing,u_in_cumsum@AddMultiplyingDividing,u_in_cummean@AddMultiplyingDividing,time_step_cumsum@AddMultiplyingDividing,breath_id__u_in__max@AddMultiplyingDividing,breath_id__u_in__mean@AddMultiplyingDividing,breath_id__u_in__diffmax@AddMultiplyingDividing,breath_id__u_in__diffmean@AddMultiplyingDividing,breath_time@AddBreathTimeAndUInTime,u_in_time@AddBreathTimeAndUInTime,u_in_lag1@LagFeatures,u_in_lag2@LagFeatures,u_in_lag3@LagFeatures,u_in_lag4@LagFeatures,u_in_lag-1@LagFeatures,u_in_lag-2@LagFeatures,u_in_lag-3@LagFeatures,u_in_lag-4@LagFeatures,u_out_lag1@LagFeatures,u_out_lag2@LagFeatures,u_out_lag3@LagFeatures,u_out_lag4@LagFeatures,u_out_lag-1@LagFeatures,u_out_lag-2@LagFeatures,u_out_lag-3@LagFeatures,u_out_lag-4@LagFeatures,u_in_lag1_diff@LagFeatures,u_in_lag2_diff@LagFeatures,u_in_lag3_diff@LagFeatures,u_in_lag4_diff@LagFeatures,u_out_lag1_diff@LagFeatures,u_out_lag2_diff@LagFeatures,u_out_lag3_diff@LagFeatures,u_out_lag4_diff@LagFeatures,u_in_lag-1_diff@LagFeatures,u_in_lag-2_diff@LagFeatures,u_out_lag-1_diff@LagFeatures,u_out_lag-2_diff@LagFeatures,u_in_ewm9@LagFeatures,u_in_ewm15@LagFeatures,u_in_rolling_mean2@LagFeatures,u_in_rolling_mean4@LagFeatures,u_in_rolling_mean15@LagFeatures,u_in_rolling_max2@LagFeatures,u_in_rolling_max4@LagFeatures,u_in_rolling_max15@LagFeatures,u_in_rolling_min2@LagFeatures,u_in_rolling_min4@LagFeatures,u_in_rolling_min15@LagFeatures,u_in_rolling_std2@LagFeatures,u_in_rolling_std4@LagFeatures,u_in_rolling_std15@LagFeatures,u_in_rolling_sum2@LagFeatures,u_in_rolling_sum4@LagFeatures,u_in_rolling_sum15@LagFeatures
0,0,321,31,20,0.75,-0.980732,20.952208,-0.53058,-0.514355,7.258882,-0.495562,1.720031,0.790869,-0.520402,-12.105126,-26.491232,-0.046854,-0.906911,-0.890922,-0.872571,-0.850923,10.874412,1.519447,1.875594,8.90351,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,602.796525,321.423516,216.793355,148.109079,0.0,0.0,0.0,0.0,282.329425,363.851493,0.0,0.0,12.74226,10.715759,0.626836,0.454156,0.388174,0.726896,0.619612,0.899854,0.428111,0.271361,0.164927,3.162037,1.972835,0.572488,0.626836,0.454156,0.388174
1,0,322,31,20,0.75,-0.95539,11.771191,-0.523111,-0.381425,5.593148,-0.494925,1.720031,0.790869,0.560118,-6.545502,0.134873,-282.329425,19.156275,-0.890922,-0.872571,-0.850923,1.525265,1.860023,8.885784,3.820199,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-252.079601,187.024945,126.136139,86.165451,0.0,0.0,0.0,0.0,311.376496,183.827843,0.0,0.0,9.844486,8.289335,15.964657,0.454156,0.388174,18.053505,0.619612,0.899854,10.794162,0.271361,0.164927,69.273563,1.972835,0.572488,15.964657,0.454156,0.388174
2,0,323,31,20,0.75,-0.930162,1.645444,-0.520016,-0.353823,3.813143,-0.493655,1.720031,0.790869,1.751825,-0.413789,0.015089,-311.376496,10.768409,19.18401,-0.872571,-0.850923,1.869357,8.86542,3.806013,0.316282,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-278.013853,-282.675249,26.150251,17.847794,0.0,0.0,0.0,0.0,-11.412099,-151.421455,0.0,0.0,6.668916,5.656145,6.539378,0.454156,0.388174,10.131796,0.619612,0.899854,1.535646,0.271361,0.164927,76.416487,1.972835,0.572488,6.539378,0.454156,0.388174
3,0,324,31,20,0.75,-0.904839,2.018118,-0.514719,-0.322344,2.956949,-0.491749,1.720031,0.790869,1.707964,-0.639464,0.115127,11.412099,1.517428,10.791234,19.21626,-0.850923,8.947062,3.789122,0.304536,8.1213,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,10.183217,-142.821202,-187.015509,20.362196,0.0,0.0,0.0,0.0,-235.644076,-39.72947,0.0,0.0,5.149163,4.392474,1.77807,6.964203,0.388174,1.716496,12.886039,0.899854,1.535646,1.591235,0.164927,2.675422,50.519347,0.572488,1.77807,6.964203,0.388174
4,0,325,31,20,0.75,-0.879621,9.683743,-0.489786,-0.211128,2.999549,-0.489209,1.720031,0.790869,0.805791,-5.281434,0.005216,235.644076,1.857907,1.534837,10.817673,19.254392,3.818368,0.290039,8.104119,6.769639,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,210.385454,117.622198,-20.66468,-76.083213,0.0,0.0,0.0,0.0,170.833812,176.865233,0.0,0.0,5.321596,4.502263,5.702251,4.802871,0.388174,8.330671,7.225858,0.899854,1.876402,1.591235,0.164927,57.815993,28.540088,0.572488,5.702251,4.802871,0.388174


# reshape

In [None]:
print(set(train.drop(["id", "breath_id", "pressure", "R"], axis=1).columns) - set(CFG.cont_seq_cols))
print(train.drop(["id", "breath_id", "pressure", "R"], axis=1).shape)
print(len(CFG.cont_seq_cols))

X = np.float32(train.drop(["id", "breath_id", "pressure", "R"], axis=1)).reshape(-1, 80, len(CFG.cont_seq_cols))
y = np.float32(train["pressure"]).reshape(-1, 80, 1)
X_test = np.float32(test.drop(["id", "breath_id", "R"], axis=1)).reshape(-1, 80, len(CFG.cont_seq_cols))

# cv split

In [None]:
# ====================================================
# CV split
# ====================================================
# Fold = GroupKFold(n_splits=5)
# groups = train['breath_id'].values
# for n, (train_index, val_index) in enumerate(Fold.split(train, train['pressure'], groups)):
#     train.loc[val_index, 'fold'] = int(n)
# train['fold'] = train['fold'].astype(int)
# print(train.groupby('fold').size())

# Dataset

In [None]:
# ====================================================
# Dataset
# ====================================================
# class TrainDataset(Dataset):
#     def __init__(self, df):
#         self.df = df
#         self.groups = df.groupby('breath_id').groups
#         self.keys = list(self.groups.keys())
#         
#     def __len__(self):
#         return len(self.groups)
# 
#     def __getitem__(self, idx):
#         indexes = self.groups[self.keys[idx]]
#         df = self.df.iloc[indexes]
#         cont_seq_x = torch.FloatTensor(df[CFG.cont_seq_cols].values)
#         u_out = torch.LongTensor(df['u_out'].values)
#         label = torch.FloatTensor(df['pressure'].values)
#         return cont_seq_x, u_out, label
#     
# 
# class TestDataset(Dataset):
#     def __init__(self, df):
#         self.df = df
#         self.groups = df.groupby('breath_id').groups
#         self.keys = list(self.groups.keys())
#         
#     def __len__(self):
#         return len(self.groups)
# 
#     def __getitem__(self, idx):
#         indexes = self.groups[self.keys[idx]]
#         df = self.df.iloc[indexes]
#         cont_seq_x = torch.FloatTensor(df[CFG.cont_seq_cols].values)
#         return cont_seq_x

# Loss

In [None]:
class L1Loss_masked(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, preds, y, u_out):

        mask = 1 - u_out
        mae = torch.abs(mask * (y - preds))
        mae = torch.sum(mae) / torch.sum(mask)

        return mae

# Model

In [None]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.hidden_size = cfg.hidden_size
        self.seq_emb = nn.Sequential(
            nn.Linear(len(cfg.cont_seq_cols), self.hidden_size),
            nn.LayerNorm(self.hidden_size),
            nn.GELU(),
            #nn.Dropout(0.1),
        )
        self.lstm1 = nn.LSTM(self.hidden_size, self.hidden_size, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(self.hidden_size * 2, self.hidden_size//2, batch_first=True, bidirectional=True)
        self.lstm3 = nn.LSTM(self.hidden_size//2 * 2, self.hidden_size//2, batch_first=True, bidirectional=True)
        self.lstm4 = nn.LSTM(self.hidden_size//2 * 2, self.hidden_size//4, batch_first=True, bidirectional=True)
        self.lstm5 = nn.LSTM(self.hidden_size//4 * 2, self.hidden_size//8, batch_first=True, bidirectional=True)
        self.lstm6 = nn.LSTM(self.hidden_size//8 * 2, self.hidden_size//16, batch_first=True, bidirectional=True)
        self.head = nn.Sequential(
            # nn.Linear(self.hidden_size//8 * 2, self.hidden_size//8 * 2),
            nn.LayerNorm(self.hidden_size//16 * 2),
            nn.GELU(),
            #nn.Dropout(0.),
            nn.Linear(self.hidden_size//16 * 2, 1),
        )
        for n, m in self.named_modules():
            if isinstance(m, nn.LSTM):
                print(f'init {m}')
                for param in m.parameters():
                    if len(param.shape) >= 2:
                        nn.init.orthogonal_(param.data)
                    else:
                        nn.init.normal_(param.data)
            elif isinstance(m, nn.GRU):
                print(f"init {m}")
                for param in m.parameters():
                    if len(param.shape) >= 2:
                        init.orthogonal_(param.data)
                    else:
                        init.normal_(param.data)

    def forward(self, cont_seq_x):
        bs = cont_seq_x.size(0)
        seq_emb = self.seq_emb(cont_seq_x)
        seq_emb, _ = self.lstm1(seq_emb)
        seq_emb, _ = self.lstm2(seq_emb)
        seq_emb, _ = self.lstm3(seq_emb)
        seq_emb, _ = self.lstm4(seq_emb)
        seq_emb, _ = self.lstm5(seq_emb)
        seq_emb, _ = self.lstm6(seq_emb)
        output = self.head(seq_emb)#.view(bs, -1)
        return output
print(CustomModel(CFG))

# helper function

In [None]:
# ====================================================
# helper function
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


scaler = GradScaler()

def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    losses = AverageMeter()
    start = end = time()
    for step, (inputs, y) in enumerate(train_loader):
        inputs, y = inputs.to(device), y.to(device)
        batch_size = inputs.size(0)
        with autocast():
            pred = model(inputs)
            loss = criterion(pred, y, inputs[:,:,0].reshape(-1,80,1))
        losses.update(loss.item(), batch_size)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        if CFG.apex:
            scaler.scale(loss).backward()
        else:
            loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            if CFG.apex:
                scaler.step(optimizer)
            else:
                optimizer.step()
            optimizer.zero_grad()
            lr = 0
            if CFG.batch_scheduler:
                scheduler.step()
                lr = scheduler.get_lr()[0]
        if CFG.apex:
            scaler.update()
        end = time()
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val, 
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    model.eval()
    preds = []
    losses = AverageMeter()
    start = end = time()
    for step, (inputs, y) in enumerate(valid_loader):
        inputs, y = inputs.to(device), y.to(device)
        batch_size = inputs.size(0)
        with torch.no_grad():
            pred = model(inputs)
        loss = criterion(pred, y, inputs[:,:,0].reshape(-1,80,1))
        losses.update(loss.item(), batch_size)
        preds.append(pred.view(-1).detach().cpu().numpy())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        end = time()
    preds = np.concatenate(preds)
    return losses.avg, preds


def inference_fn(test_loader, model, device):
    model.eval()
    model.to(device)
    preds = []
    tk0 = tqdm(enumerate(test_loader), total=len(test_loader))
    for step, (cont_seq_x) in tk0:
        cont_seq_x = cont_seq_x.to(device)
        with torch.no_grad():
            pred = model(cont_seq_x)
        preds.append(pred.view(-1).detach().cpu().numpy())
    preds = np.concatenate(preds)
    return preds

def find_nearest(prediction):
    '''
    予測値は離散値であるため、学習データにある最も近い離散値に置き換える
    '''
    insert_idx = np.searchsorted(sorted_pressures, prediction)
    if insert_idx == total_pressures_len:
        # If the predicted value is bigger than the highest pressure in the train dataset,
        # return the max value.
        return sorted_pressures[-1]
    elif insert_idx == 0:
        # Same control but for the lower bound.
        return sorted_pressures[0]
    lower_val = sorted_pressures[insert_idx - 1]
    upper_val = sorted_pressures[insert_idx]
    return lower_val if abs(lower_val - prediction) < abs(upper_val - prediction) else upper_val

def feature_importance_fn(X_valid, y_valid, model, criterion, device):
    valid_dataset = torch.utils.data.TensorDataset(
        torch.from_numpy(X_valid),
        torch.from_numpy(y_valid)
    )
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    model.to(device)
    model.eval()
    losses = AverageMeter()
    for step, (inputs, y) in enumerate(valid_loader):
        inputs, y = inputs.to(device), y.to(device)
        batch_size = inputs.size(0)
        with torch.no_grad():
            pred = model(inputs)
        loss = criterion(pred, y, inputs[:,:,0].reshape(-1,80,1))
        losses.update(loss.item(), batch_size)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
    return losses.avg

# Train Loop

In [None]:
train["breath_id"].unique()

In [None]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold, trn_idx, val_idx):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    #trn_idx = folds[folds['fold'] != fold].index
    #val_idx = folds[folds['fold'] == fold].index
    
    train_folds = X[trn_idx]
    valid_folds = X[val_idx]
    groups = train["breath_id"].unique()[val_idx]
    oof_folds = train[train["breath_id"].isin(groups)].reset_index(drop=True)
    y_train = y[trn_idx]
    y_true = y[val_idx]

    # train_dataset = TrainDataset(train_folds)
    # valid_dataset = TrainDataset(valid_folds)
    train_dataset = torch.utils.data.TensorDataset(
        torch.from_numpy(train_folds),
        torch.from_numpy(y_train)
    )
    valid_dataset = torch.utils.data.TensorDataset(
        torch.from_numpy(valid_folds),
        torch.from_numpy(y_true)
    )

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
    #optimizer = torch.optim.Adam(model.parameters(), lr=0.0008, eps=1e-08)
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    
    def get_scheduler(optimizer):
        if CFG.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=CFG.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif CFG.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=CFG.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=CFG.num_cycles
            )
        elif CFG.scheduler=='ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=CFG.factor, patience=CFG.patience, verbose=True, eps=CFG.eps)
        elif CFG.scheduler=='CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1)
        elif CFG.scheduler=='CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1)
        return scheduler

    scheduler = get_scheduler(optimizer)

    # ====================================================
    # apex
    # ====================================================
    #if CFG.apex:
    #    model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

    # ====================================================
    # loop
    # ====================================================
    criterion = L1Loss_masked()

    best_score = np.inf

    avg_losses = []
    avg_val_losses = []
    for epoch in range(CFG.epochs):

        start_time = time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)
        #avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, None, device)
        avg_losses.append(avg_loss)
        
        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)
        avg_val_losses.append(avg_val_loss)
        
        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score = avg_val_loss #get_score(y_true[non_expiratory_phase_val_idx], preds[non_expiratory_phase_val_idx])

        elapsed = time() - start_time

        best_notice = ""
        if score < best_score:
            best_notice = "Best Score"
            best_score = score
            # LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'preds': preds},
                        OUTPUT_DIR+f"fold{fold}_best.pth")
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score,
                       f"[fold{fold}] best_score": best_score})
    
        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s, lr: {optimizer.param_groups[0]["lr"]:.5f}, MAE Score: {score:.4f}, {best_notice}')

    plt.figure(figsize=(14,6))
    plt.plot(avg_losses, label="Train Loss")
    plt.plot(avg_val_losses, label="Train Loss")
    plt.title(f"Fold {fold + 1} - Best score {best_score:.4f}", size=18)
    plt.show()

    preds = torch.load(OUTPUT_DIR+f"fold{fold}_best.pth", map_location=torch.device('cpu'))['preds']
    oof_folds['preds'] = preds.flatten()

    torch.cuda.empty_cache()
    gc.collect()
    
    return oof_folds

# Main

In [None]:
# ====================================================
# main
# ====================================================
def main():
    
    """
    Prepare: 1.train 2.test
    """
    
    def get_result(result_df):
        preds = result_df['preds'].values
        labels = result_df['pressure'].values
        non_expiratory_phase_val_idx = result_df[result_df['u_out'] == 0].index # The expiratory phase is not scored
        score = get_score(labels[non_expiratory_phase_val_idx], preds[non_expiratory_phase_val_idx])
        LOGGER.info(f'Score (without expiratory phase): {score:<.4f}')
    
    if CFG.train:
        # train 
        oof_df = pd.DataFrame()
        kfold = KFold(n_splits=CFG.n_fold, random_state=42, shuffle=True)
        for fold, (trn_idx, val_idx) in enumerate(kfold.split(X=X, y=y)):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(X, fold, trn_idx, val_idx)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        # CV result
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR+'oof_df.csv', index=False)
        for i, breath_id in enumerate(oof_df["breath_id"].unique()):
            oof_df[oof_df["breath_id"]==breath_id].plot(x="time_step", y=["preds", "pressure", "u_out"], figsize=(16, 5))
            plt.show()
            if i == 10:
                break
    
    if CFG.inference:
        test_loader = torch.utils.data.DataLoader(X_test, batch_size=512, shuffle=False, pin_memory=True)
        #test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size * 2, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)
        for fold in CFG.trn_fold:
            model = CustomModel(CFG)
            path = OUTPUT_DIR+f"fold{fold}_best.pth"
            state = torch.load(path, map_location=torch.device('cpu'))
            model.load_state_dict(state['model'])
            predictions = inference_fn(test_loader, model, device)
            test[f'fold{fold}'] = predictions
            del state, predictions; gc.collect()
            torch.cuda.empty_cache()
            
        # submission
        test['pressure'] = test[[f'fold{fold}' for fold in CFG.trn_fold]].mean(1)
        test['pressure'] = test['pressure'].apply(find_nearest)
        test[['id', 'pressure']+[f'fold{fold}' for fold in CFG.trn_fold]].to_csv(OUTPUT_DIR+'raw_submission_mean.csv', index=False)
        test[['id', 'pressure']].to_csv(OUTPUT_DIR+'submission_mean.csv', index=False)
        
        test['pressure'] = test[[f'fold{fold}' for fold in CFG.trn_fold]].median(1)
        test['pressure'] = test['pressure'].apply(find_nearest)
        test[['id', 'pressure']+[f'fold{fold}' for fold in CFG.trn_fold]].to_csv(OUTPUT_DIR+'raw_submission_median.csv', index=False)
        test[['id', 'pressure']].to_csv(OUTPUT_DIR+'submission_median.csv', index=False)
        
    if CFG.feature_importance:
        fi_results = []
        print('Computing LSTM feature importance...')
        kfold = KFold(n_splits=CFG.n_fold, random_state=42, shuffle=True)
        for fold, (trn_idx, val_idx) in enumerate(kfold.split(X=X, y=y)):
            model = CustomModel(CFG)
            path = OUTPUT_DIR+f"fold{fold}_best.pth"
            state = torch.load(path, map_location=torch.device('cpu'))
            model.load_state_dict(state['model'])
            X_valid = X[val_idx]
            y_valid = y[val_idx]
            if fold in CFG.trn_fold: 
                for k in tqdm(range(len(CFG.cont_seq_cols))):
                    criterion = L1Loss_masked()
                    if k>0: 
                        save_col = X_valid[:,:,k-1].copy()
                        np.random.shuffle(X_valid[:,:,k-1])
                    
                    avg_val_loss = feature_importance_fn(X_valid, y_valid, model, criterion, device)
                    fi_results.append({'feature':CFG.cont_seq_cols[k],'avg_val_loss':avg_val_loss})

                    if k>0: 
                        X_valid[:,:,k-1] = save_col
            # compute feature importance with only one fold
            break
        # DISPLAY LSTM FEATURE IMPORTANCE
        print()
        fi_df = pd.DataFrame(fi_results)
        fi_df = fi_df.sort_values('avg_val_loss')
        fig, ax = plt.subplots(figsize=(10,20))
        ax.barh(np.arange(len(CFG.cont_seq_cols)),fi_df.avg_val_loss)
        plt.yticks(np.arange(len(CFG.cont_seq_cols)),fi_df.feature.values)
        plt.title('LSTM Feature Importance',size=16)
        plt.ylim((-1,len(CFG.cont_seq_cols)))
        plt.show()
        fig.savefig(OUTPUT_DIR+f'{CFG.model_name}_feature_imporance.png')

        # SAVE LSTM FEATURE IMPORTANCE
        fi_df = fi_df.sort_values('avg_val_loss',ascending=False)
        fi_df.to_csv(OUTPUT_DIR+f'{CFG.model_name}_feature_importance_fold_{fold}.csv',index=False)
    if CFG.wandb:
        wandb.finish()

In [None]:
if __name__ == '__main__':
    main()