In [10]:
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd

# Directory prefix from which train.csv and test.csv are read below.
folder = "./data/20210921/"

In [4]:
# Read the raw train / test tables from `folder`.
dfp_train = pd.read_csv(f"{folder}train.csv")
dfp_test = pd.read_csv(f"{folder}test.csv")
print(len(dfp_train))

# Keep only the rows whose breath_id is flagged "training" in the mapping file
# (inner merge against the list of training breath ids).
dfp_mapping = pd.read_csv("./data/train_breath_mapping.csv")
is_training = dfp_mapping["type_record"] == "training"
training_breaths = dfp_mapping[is_training]["breath_id"]
dfp_train = pd.merge(dfp_train, training_breaths, on=["breath_id"])
print(len(dfp_train))

# Order both tables by breath, then by time inside each breath.
sort_keys = ["breath_id", "time_step"]
dfp_train.sort_values(sort_keys, inplace=True)
dfp_test.sort_values(sort_keys, inplace=True)

6036000
4828800


## Get Kaggle features

In [6]:
# From https://www.kaggle.com/tenffe/finetune-of-tensorflow-bidirectional-lstm
def add_features(df):
    """Return a feature-engineered copy of a per-breath time-series frame.

    The input frame is left untouched; all new columns are written to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``breath_id``, ``time_step``, ``u_in``,
        ``u_out``, ``R`` and ``C``. Rows are expected to be ordered by time
        within each breath, because the cumulative sums and shifts below
        follow row order inside each ``breath_id`` group.

    Returns
    -------
    pandas.DataFrame
        A new frame with, per ``breath_id``:
        cumulative ``time_step * u_in`` (``area``) and cumulative ``u_in``;
        ``u_in`` / ``u_out`` shifted by 1..4 rows backwards and forwards;
        group max of ``u_in`` / ``u_out``; differences between the current
        value and its lags; distance of ``u_in`` to the group max / mean;
        the products ``u_in * u_out`` and ``time_step * u_out``; and one-hot
        columns for ``R``, ``C`` and their combination ``R__C``.
        Missing values (including the ones created by shifting at the group
        edges) are replaced by 0 before the diff features are computed.

    Notes
    -----
    ``pd.get_dummies`` derives the dummy columns from the values present in
    the given frame, so two frames with different sets of ``R`` / ``C``
    values yield different column sets.
    """
    # Work on a private copy so the caller's frame does not silently gain
    # the intermediate columns.
    df = df.copy()

    # Running integrals of the input signal inside each breath.
    df['area'] = (df['time_step'] * df['u_in']).groupby(df['breath_id']).cumsum()
    df['u_in_cumsum'] = df['u_in'].groupby(df['breath_id']).cumsum()

    # Past ("lag") and future ("lag_back") values, 1 to 4 steps away.
    # The nesting order reproduces the original column order exactly.
    for lag in (1, 2, 3, 4):
        for suffix, periods in (('lag', lag), ('lag_back', -lag)):
            for signal in ('u_in', 'u_out'):
                df[f'{signal}_{suffix}{lag}'] = df.groupby('breath_id')[signal].shift(periods)

    # Shifts leave NaN at the start / end of each breath; treat them as 0.
    df = df.fillna(0)

    df['breath_id__u_in__max'] = df.groupby(['breath_id'])['u_in'].transform('max')
    df['breath_id__u_out__max'] = df.groupby(['breath_id'])['u_out'].transform('max')

    for lag in (1, 2):
        df[f'u_in_diff{lag}'] = df['u_in'] - df[f'u_in_lag{lag}']
        df[f'u_out_diff{lag}'] = df['u_out'] - df[f'u_out_lag{lag}']

    # Distance of the current u_in to the breath-level max / mean
    # (the max was already computed above, so it is reused).
    df['breath_id__u_in__diffmax'] = df['breath_id__u_in__max'] - df['u_in']
    df['breath_id__u_in__diffmean'] = df.groupby(['breath_id'])['u_in'].transform('mean') - df['u_in']

    for lag in (3, 4):
        df[f'u_in_diff{lag}'] = df['u_in'] - df[f'u_in_lag{lag}']
        df[f'u_out_diff{lag}'] = df['u_out'] - df[f'u_out_lag{lag}']

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # R and C become strings so that get_dummies one-hot encodes them,
    # together with their combination.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']
    df = pd.get_dummies(df)
    return df

In [7]:
%%time
# Build the engineered feature tables for the train and the test frame.
dfp_train_augmented = add_features(dfp_train)
dfp_test_augmented = add_features(dfp_test)

CPU times: user 16.3 s, sys: 2.6 s, total: 18.9 s
Wall time: 18.9 s


In [9]:
# Prediction target and the evaluation metric name.
column_target = "pressure"
metric = "mae"

# Every augmented column is a model input except the identifiers and the target.
non_feature_columns = ("id", "breath_id", "pressure")
columns_features = [
    column
    for column in dfp_train_augmented.columns
    if column not in non_feature_columns
]

print("Features:", columns_features)

# Test rows are put in `id` order before the feature columns are selected.
X_train = dfp_train_augmented[columns_features]
y_train = dfp_train_augmented[column_target]
X_test = dfp_test_augmented.sort_values("id")[columns_features]

Features: ['time_step', 'u_in', 'u_out', 'area', 'u_in_cumsum', 'u_in_lag1', 'u_out_lag1', 'u_in_lag_back1', 'u_out_lag_back1', 'u_in_lag2', 'u_out_lag2', 'u_in_lag_back2', 'u_out_lag_back2', 'u_in_lag3', 'u_out_lag3', 'u_in_lag_back3', 'u_out_lag_back3', 'u_in_lag4', 'u_out_lag4', 'u_in_lag_back4', 'u_out_lag_back4', 'breath_id__u_in__max', 'breath_id__u_out__max', 'u_in_diff1', 'u_out_diff1', 'u_in_diff2', 'u_out_diff2', 'breath_id__u_in__diffmax', 'breath_id__u_in__diffmean', 'u_in_diff3', 'u_out_diff3', 'u_in_diff4', 'u_out_diff4', 'cross', 'cross2', 'R_20', 'R_5', 'R_50', 'C_10', 'C_20', 'C_50', 'R__C_20__10', 'R__C_20__20', 'R__C_20__50', 'R__C_50__10', 'R__C_50__20', 'R__C_50__50', 'R__C_5__10', 'R__C_5__20', 'R__C_5__50']


In [12]:
from supervised.automl import AutoML

# Create the output directory *before* the long training run, so that a
# missing folder cannot make the final to_csv call fail after fitting.
submissions_dir = Path("./data/submissions")
submissions_dir.mkdir(parents=True, exist_ok=True)

for mode in ["Compete"]:
    # Same `id` ordering as X_test, so predictions line up row by row.
    dfp_submissions = dfp_test_augmented.sort_values("id").copy()
    automl = AutoML(mode=mode, eval_metric=metric)
    automl.fit(X_train, y_train)

    dfp_submissions["pressure"] = automl.predict(X_test)
    dfp_submissions = dfp_submissions[["id", "pressure"]]

    # UTC hour stamp in the file name; datetime.utcnow() is deprecated, so a
    # timezone-aware "now" is used instead (same formatted value).
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d%H')
    dfp_submissions.to_csv(submissions_dir / f"{timestamp}_mljar_{mode.lower()}.csv", index=False)

Linear algorithm was disabled.
AutoML directory: AutoML_3
The task is regression with evaluation metric mae
AutoML will use algorithms: ['Decision Tree', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble availabe models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree mae 2.346279 trained in 46.85 seconds
Disable stacking for split validation
* Step simple_algorithms will try to check up to 2 models
2_DecisionTree mae 2.147211 trained in 58.49 seconds
3_DecisionTree mae 2.147211 trained in 58.09 seconds
* Step default_algorithms will try to check up to 6 models
4_Default_LightGBM mae 0.411