In [1]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

# Load the training set; later cells read its breath_id, R, C, time_step, u_in and pressure columns.
df = pd.read_csv("./data/train.csv")

In [2]:
"""
pre-processing
"""

# R and C
from sklearn.preprocessing import LabelEncoder

# Encode each (R, C) combination as one integer category.
df["rnc"] = df["R"].astype(str) + '_' + df["C"].astype(str)
le_rnc = LabelEncoder()
# Assign the whole column (instead of writing through .loc into the existing
# string column) so the result is stored with an integer dtype.
df["rnc"] = le_rnc.fit_transform(df["rnc"])

# time diff
df.rename({"time_step": "t"}, axis=1, inplace=True)
# NOTE: despite the "_lag1" suffix, shift(-1) yields the NEXT row's value
# within the same breath.
df["t_lag1"] = df.groupby("breath_id")["t"].shift(-1)
# The last row of every breath has no successor; reuse the preceding step size.
# Assigning the filled series back avoids the chained in-place fillna, which
# does not reliably write through to df.
df["dt"] = (df["t_lag1"] - df["t"]).ffill()
df["dt2"] = df["dt"] ** 2

# u_in - 1st diff and derivative
df["u_in_lag1"] = df.groupby("breath_id")["u_in"].shift(-1).ffill()
df["d1_u_in"] = df["u_in_lag1"] - df["u_in"]
df["prime1_u_in"] = df["d1_u_in"] / df["dt"]

# u_in - 2nd diff and derivative
df["prime1_u_in_lag1"] = df.groupby("breath_id")["prime1_u_in"].shift(-1).ffill()
df["d2_u_in"] = df["prime1_u_in_lag1"] - df["prime1_u_in"]
df["prime2_u_in"] = df["d2_u_in"] / df["dt"]

# integration
# Accumulate per breath: a plain cumsum over the whole frame would carry the
# running total of every earlier breath into the next one.
df["int1_u_in"] = (df["u_in"] * df["dt"]).groupby(df["breath_id"]).cumsum()
df["int2_u_in"] = (df["int1_u_in"] * df["dt"]).groupby(df["breath_id"]).cumsum()

"""
re-scaling
"""
#from sklearn.preprocessing import StandardScaler

#scaling_keys = ["pressure", "u_in", "d1_u_in", "d2_u_in", "prime1_u_in", "prime2_u_in", "int1_u_in", "int2_u_in"]
#scalers = {}
#for key in scaling_keys:
#    scaler = StandardScaler()
#    scaler.fit(df[[key]])
#    df[[key]]= scaler.transform(df[[key]])
#    scalers[key] = scaler

# Sanity check: show the engineered features for a single breath.
print(df[df["breath_id"] == 1])

    id  breath_id   R   C         t       u_in  u_out   pressure  rnc  \
0    1          1  20  50  0.000000   0.083334      0   5.837492    2   
1    2          1  20  50  0.033652  18.383041      0   5.907794    2   
2    3          1  20  50  0.067514  22.509278      0   7.876254    2   
3    4          1  20  50  0.101542  22.808822      0  11.742872    2   
4    5          1  20  50  0.135756  25.355850      0  12.234987    2   
..  ..        ...  ..  ..       ...        ...    ...        ...  ...   
75  76          1  20  50  2.553593   4.974474      1   6.399909    2   
76  77          1  20  50  2.587754   4.978481      1   6.610815    2   
77  78          1  20  50  2.621773   4.981847      1   6.329607    2   
78  79          1  20  50  2.655746   4.984683      1   6.540513    2   
79  80          1  20  50  2.689766   4.987079      1   6.470211    2   

      t_lag1        dt       dt2  u_in_lag1    d1_u_in  prime1_u_in  \
0   0.033652  0.033652  0.001132  18.383041  18.2997

In [3]:
"""
getting x and y
"""

breath_ids = df["breath_id"].unique()
n_breath_ids = len(breath_ids)
n_time_steps = len(df) // n_breath_ids

# The reshapes below are only meaningful when every breath has exactly
# n_time_steps rows and those rows are stored contiguously. Verify that
# instead of silently mixing rows from different breaths.
if n_breath_ids * n_time_steps != len(df):
    raise ValueError("breaths do not all have the same number of time steps")
_ids_by_breath = df["breath_id"].to_numpy().reshape((n_breath_ids, n_time_steps))
if not (_ids_by_breath == _ids_by_breath[:, :1]).all():
    raise ValueError("rows of each breath must be contiguous and of equal length")
del _ids_by_breath

# Number of leading time steps of each breath used as model input.
n_in = 5
cols = ["rnc", "dt", "u_in", "d1_u_in", "prime1_u_in"]
n_cols = len(cols)

# (breath, time step, feature) -> keep the first n_in steps -> flatten to one
# feature vector per breath. dtype=float guarantees a numeric matrix even if
# one of the columns is stored with object dtype.
x = df[cols].to_numpy(dtype=float).reshape((n_breath_ids, n_time_steps, n_cols))[:, :n_in, :]
x = x.reshape((n_breath_ids, n_in * n_cols))
# Target: the pressure at the first time step of each breath.
y = df["pressure"].to_numpy().reshape((n_breath_ids, n_time_steps))[:, 0]

In [4]:
"""
simple models
"""

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import r2_score, mean_absolute_error

def get_score(model, x, y, n_folds=10, random_state=0):
    """Cross-validate ``model`` with shuffled K-fold and report its R^2 scores.

    Parameters
    ----------
    model : estimator exposing ``fit(x, y)`` and ``predict(x)``
        Re-fitted on every fold; after the call it is left fitted on the
        training part of the last fold.
    x : array indexable by integer arrays along axis 0
        Feature matrix, one row per sample.
    y : array indexable by integer arrays
        Targets, aligned with ``x``.
    n_folds : int, default 10
        Number of folds.
    random_state : int or None, default 0
        Seed for the fold shuffling. A fixed seed makes the scores
        reproducible and puts every model evaluated with the same seed on
        identical splits; pass ``None`` for a different shuffle on each call.

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-fold R^2 scores.
    """
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    scores = []
    for fold, (train_ids, test_ids) in enumerate(kfold.split(x)):
        train_x, test_x = x[train_ids], x[test_ids]
        train_y, test_y = y[train_ids], y[test_ids]
        model.fit(train_x, train_y)
        score = r2_score(test_y, model.predict(test_x))
        # Alternative metric that was tried:
        #score = 1 - mean_absolute_error(test_y, model.predict(test_x))
        scores.append(score)
        print("model", type(model).__name__, "fold", fold, "score", score)
    return np.mean(scores), np.std(scores)


from sklearn.linear_model import Ridge

# Linear baseline: robustly scaled inputs fed into a ridge regression.
model_ridge = make_pipeline(
    RobustScaler(),
    Ridge(alpha=0.1, random_state=0),
)
print("ridge", get_score(model_ridge, x, y))

import xgboost as xgb

# Gradient-boosted trees: many shallow trees with row/column subsampling.
model_xgb = xgb.XGBRegressor(
    n_estimators=2200,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    gamma=0.0468,
    subsample=0.5213,
    colsample_bytree=0.4603,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    random_state=7,
    nthread=-1,
    verbosity=0,
)
print("xgboost", get_score(model_xgb, x, y))

model Pipeline fold 0 score 0.09347006093596433
model Pipeline fold 1 score 0.09974338296010166
model Pipeline fold 2 score 0.08447247291205784
model Pipeline fold 3 score 0.09116360067487939
model Pipeline fold 4 score 0.09737686342514384
model Pipeline fold 5 score 0.08722145356763844
model Pipeline fold 6 score -0.024826675562532152
model Pipeline fold 7 score 0.09734142342795671
model Pipeline fold 8 score 0.10079144775187576
model Pipeline fold 9 score 0.06220222814224119
ridge (0.0788956258235327, 0.036188648656635604)
model XGBRegressor fold 0 score 0.7931472157963921
model XGBRegressor fold 1 score 0.7877132510899444
model XGBRegressor fold 2 score 0.7794340390994567
model XGBRegressor fold 3 score 0.7788986762684349
model XGBRegressor fold 4 score 0.7852678161340767
model XGBRegressor fold 5 score 0.7731949580975728
model XGBRegressor fold 6 score 0.7848256132475883
model XGBRegressor fold 7 score 0.7772859033249654
model XGBRegressor fold 8 score 0.7740490405238859
model XGBR