In [1]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

df = pd.read_csv("./data/train.csv")

In [2]:
"""
pre-processing
"""

# time diff
df.rename({"time_step": "t"}, axis=1, inplace=True)
df["t_lag1"] = df.groupby("breath_id")["t"].shift(-1)
df["dt"] = df["t_lag1"] - df["t"]
df["dt"].fillna(method='ffill', inplace=True)
df["dt2"] = df["dt"] ** 2

# u_in - 1st diff and derivative
df["u_in_lag1"] = df.groupby("breath_id")["u_in"].shift(-1).fillna(method="ffill")
df["d1_u_in"] = df["u_in_lag1"] - df["u_in"]
df["prime1_u_in"] = df["d1_u_in"] / df["dt"]

# u_in - 2nd diff and derivative
df["prime1_u_in_lag1"] = df.groupby("breath_id")["prime1_u_in"].shift(-1).fillna(method="ffill")
df["d2_u_in"] = df["prime1_u_in_lag1"] - df["prime1_u_in"]
df["prime2_u_in"] = df["d2_u_in"] / df["dt"]

# integration
df["int1_u_in"] = (df["u_in"] * df["dt"]).cumsum()
df["int2_u_in"] = (df["int1_u_in"] * df["dt"]).cumsum()

"""
re-scaling
"""
from sklearn.preprocessing import StandardScaler

scaling_keys = ["pressure", "u_in", "d1_u_in", "d2_u_in", "prime1_u_in", "prime2_u_in", "int1_u_in", "int2_u_in"]
scalers = {}
for key in scaling_keys:
    scaler = StandardScaler()
    scaler.fit(df[[key]])
    df[[key]]= scaler.transform(df[[key]])
    scalers[key] = scaler

print(df[df["breath_id"] == 1])

    id  breath_id   R   C         t      u_in  u_out  pressure    t_lag1  \
0    1          1  20  50  0.000000 -0.538775      0 -0.663762  0.033652   
1    2          1  20  50  0.033652  0.823348      0 -0.655094  0.067514   
2    3          1  20  50  0.067514  1.130480      0 -0.412365  0.101542   
3    4          1  20  50  0.101542  1.152777      0  0.064425  0.135756   
4    5          1  20  50  0.135756  1.342362      0  0.125107  0.169698   
..  ..        ...  ..  ..       ...       ...    ...       ...       ...   
75  76          1  20  50  2.553593 -0.174707      1 -0.594411  2.587754   
76  77          1  20  50  2.587754 -0.174409      1 -0.568405  2.621773   
77  78          1  20  50  2.621773 -0.174159      1 -0.603080  2.655746   
78  79          1  20  50  2.655746 -0.173947      1 -0.577074  2.689766   
79  80          1  20  50  2.689766 -0.173769      1 -0.585742       NaN   

          dt       dt2  u_in_lag1   d1_u_in  prime1_u_in  prime1_u_in_lag1  \
0   0.033

In [3]:
"""
separating R and C
"""

from itertools import product

unique_rs = df["R"].unique()
unique_cs = df["C"].unique()
unique_rcs = list(product(unique_rs, unique_cs))

df_rc_dict = {}
for r, c in unique_rcs:
    df_rc_dict[(r, c)] = df[(df["R"] == r) & (df["C"] == c)]
    
df_rc = df_rc_dict[unique_rcs[0]]
breath_ids = df_rc["breath_id"].unique()
n_breath_ids = len(breath_ids)
n_time_steps = len(df_rc) // n_breath_ids

In [8]:
"""
getting x and y
"""

n_in = 5
cols = ["dt", "dt2", "u_in", "d1_u_in", "prime1_u_in", "d2_u_in", "prime2_u_in", "int1_u_in", "int2_u_in"]
n_cols = len(cols)

x = np.zeros((n_breath_ids, n_in, n_cols))
y = np.zeros(n_breath_ids)
for ib, b in enumerate(breath_ids):         
    x[ib, :, :] = df_rc[df_rc["breath_id"] == b][cols].iloc[:n_in]
    y[ib] = df_rc[df_rc["breath_id"] == b]["pressure"].iloc[0]
x = x.reshape(n_breath_ids, n_in * n_cols)

In [9]:
"""
simple models
"""

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import r2_score, mean_absolute_error

def get_score(model, x, y, n_folds=10):
    kfold = KFold(n_splits=n_folds, shuffle=True)
    scores = list()
    for fold, (train_ids, test_ids) in enumerate(kfold.split(x)):
        
        train_x, test_x = x[train_ids], x[test_ids]
        train_y, test_y = y[train_ids], y[test_ids]
        model.fit(train_x, train_y)
        score = r2_score(test_y, model.predict(test_x))
        #score = 1 - mean_absolute_error(test_y, model.predict(test_x))
        scores.append(score)
        print("model", type(model).__name__, "fold", fold, "score", score)
    return np.mean(scores), np.std(scores)


from sklearn.linear_model import Ridge
model_ridge = make_pipeline(RobustScaler(), Ridge(alpha=0.1, random_state=0))
print("ridge", get_score(model_ridge, x, y))

import xgboost as xgb
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, verbosity=0,
                             random_state=7, nthread=-1)
print("xgboost", get_score(model_xgb, x, y))

model Pipeline fold 0 score 0.39271841810129315
model Pipeline fold 1 score 0.3869457938496179
model Pipeline fold 2 score 0.3912478577356219
model Pipeline fold 3 score 0.3909253998165466
model Pipeline fold 4 score 0.35937655590536577
model Pipeline fold 5 score 0.3917197808919787
model Pipeline fold 6 score 0.2965590402509698
model Pipeline fold 7 score 0.43163188747311687
model Pipeline fold 8 score 0.41792417958391836
model Pipeline fold 9 score 0.41960248613899653
ridge (0.38786513997474253, 0.03610374245369726)
model XGBRegressor fold 0 score 0.5997631920282605
model XGBRegressor fold 1 score 0.5847058496012668
model XGBRegressor fold 2 score 0.5537364105201658
model XGBRegressor fold 3 score 0.5465320526506084
model XGBRegressor fold 4 score 0.5753368838256243
model XGBRegressor fold 5 score 0.5570710995790596
model XGBRegressor fold 6 score 0.545316544251985
model XGBRegressor fold 7 score 0.5570973314365393
model XGBRegressor fold 8 score 0.5355815139427227
model XGBRegressor