In [8]:
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd

# Directory containing the train.csv / test.csv files read in the next cell.
folder = "./data/20210921/"

In [9]:
# Load the raw train / test splits.
dfp_train = pd.read_csv(folder + "train.csv")
dfp_test = pd.read_csv(folder + "test.csv")
print(len(dfp_train))

# Keep only the breaths flagged as "training" in the mapping file
# (inner join on breath_id, so every other breath is dropped from dfp_train).
dfp_mapping = pd.read_csv("./data/train_breath_mapping.csv")
is_training = dfp_mapping["type_record"] == "training"
training_breath_ids = dfp_mapping.loc[is_training, "breath_id"]
dfp_train = dfp_train.merge(training_breath_ids, on=["breath_id"])
print(len(dfp_train))

# Order rows chronologically inside each breath.
sort_keys = ["breath_id", "time_step"]
dfp_train = dfp_train.sort_values(sort_keys)
dfp_test = dfp_test.sort_values(sort_keys)

6036000
4828800


In [10]:
# Attach to every row the closest-cluster label of its breath, once per clustering type.
folder_cluster = "./data/clusters/"
clusters_type = ["cluster_ed","cluster_dtw"]


def merge_cluster_assignations(dfp, split, cluster_type):
    """Merge the closest-cluster label of each breath into ``dfp``.

    Reads ``{folder_cluster}{split}_{cluster_type}_assignations.csv`` and keeps
    only its ``breath_id`` and ``closest_cluster`` columns; any other column
    (e.g. a saved index) is ignored, whether or not it is present.

    Parameters
    ----------
    dfp : pandas.DataFrame
        Frame with a ``breath_id`` column.
    split : str
        File-name prefix of the assignation file ("train" or "test").
    cluster_type : str
        Clustering identifier used in the file name and in the new column name.

    Returns
    -------
    pandas.DataFrame
        A new frame with an extra string column
        ``closest_cluster_{cluster_type}``. The merge is an inner join, so rows
        whose ``breath_id`` has no assignation are dropped.
    """
    cluster_column = f"closest_cluster_{cluster_type}"
    raw_assignations = pd.read_csv(folder_cluster + f"{split}_{cluster_type}_assignations.csv")

    # Build a fresh two-column frame instead of assigning into a slice of the
    # CSV frame, which can raise SettingWithCopyWarning.
    dfp_assignations = pd.DataFrame({
        "breath_id": raw_assignations["breath_id"],
        cluster_column: raw_assignations["closest_cluster"].astype(str),
    })

    # Re-running the cell must not produce "_x"/"_y" suffixed duplicates:
    # drop a previously merged column before merging again.
    if cluster_column in dfp.columns:
        dfp = dfp.drop(columns=[cluster_column])

    return pd.merge(dfp, dfp_assignations, on=["breath_id"])


for cluster_type in clusters_type:
    dfp_train = merge_cluster_assignations(dfp_train, "train", cluster_type)
    dfp_test = merge_cluster_assignations(dfp_test, "test", cluster_type)

In [11]:
# Inspect the first row to check that both closest-cluster columns were merged in.
dfp_train.iloc[0]

id                                    1
breath_id                             1
R                                    20
C                                    50
time_step                           0.0
u_in                           0.083334
u_out                                 0
pressure                       5.837492
closest_cluster_cluster_ed            3
closest_cluster_cluster_dtw          31
Name: 0, dtype: object

## Get Kaggle features

In [12]:
# From https://www.kaggle.com/tenffe/finetune-of-tensorflow-bidirectional-lstm
def add_features(df):
    """Return a copy of ``df`` augmented with per-breath engineered features.

    The input frame is left untouched; all work happens on a copy.

    Required columns: ``breath_id``, ``time_step``, ``u_in``, ``u_out``,
    ``R`` and ``C``. Cumulative sums and shifts follow row order inside each
    ``breath_id`` group, so rows are expected to be sorted by ``time_step``
    within a breath.

    Columns added, in this order:

    * ``area``: per-breath cumulative sum of ``time_step * u_in``.
    * ``u_in_cumsum``: per-breath cumulative sum of ``u_in``.
    * ``u_in_lag{k}``, ``u_out_lag{k}``, ``u_in_lag_back{k}``,
      ``u_out_lag_back{k}`` for k = 1..4: values shifted k rows backward /
      forward inside the breath.
    * ``breath_id__u_in__max``, ``breath_id__u_out__max``: per-breath maxima.
    * ``u_in_diff{k}``, ``u_out_diff{k}`` for k = 1..4: current value minus the
      k-th lag.
    * ``breath_id__u_in__diffmax``, ``breath_id__u_in__diffmean``: per-breath
      max / mean of ``u_in`` minus the current ``u_in``.
    * ``cross`` (``u_in * u_out``) and ``cross2`` (``time_step * u_out``).
    * ``R__C``: ``R`` and ``C`` joined with ``"__"``.

    Every missing value in the frame (including the gaps created by shifting
    past a breath boundary) is replaced by 0 before the max / diff features are
    computed. ``R`` and ``C`` are converted to strings; no one-hot encoding is
    applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the required columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns followed by the features above.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()
    breath = df['breath_id']

    df['area'] = (df['time_step'] * df['u_in']).groupby(breath).cumsum()
    df['u_in_cumsum'] = df['u_in'].groupby(breath).cumsum()

    u_in_by_breath = df.groupby('breath_id')['u_in']
    u_out_by_breath = df.groupby('breath_id')['u_out']
    for lag in (1, 2, 3, 4):
        df[f'u_in_lag{lag}'] = u_in_by_breath.shift(lag)
        df[f'u_out_lag{lag}'] = u_out_by_breath.shift(lag)
        df[f'u_in_lag_back{lag}'] = u_in_by_breath.shift(-lag)
        df[f'u_out_lag_back{lag}'] = u_out_by_breath.shift(-lag)

    # Shifts leave NaN at breath boundaries; replace every missing value by 0.
    df = df.fillna(0)

    # Group again: fillna returned a new frame, and statistics must be computed
    # on the filled values.
    u_in_by_breath = df.groupby('breath_id')['u_in']
    u_in_max = u_in_by_breath.transform('max')
    df['breath_id__u_in__max'] = u_in_max
    df['breath_id__u_out__max'] = df.groupby('breath_id')['u_out'].transform('max')

    for lag in (1, 2):
        df[f'u_in_diff{lag}'] = df['u_in'] - df[f'u_in_lag{lag}']
        df[f'u_out_diff{lag}'] = df['u_out'] - df[f'u_out_lag{lag}']

    # Kept between diff2 and diff3 so the output column order is unchanged.
    df['breath_id__u_in__diffmax'] = u_in_max - df['u_in']
    df['breath_id__u_in__diffmean'] = u_in_by_breath.transform('mean') - df['u_in']

    for lag in (3, 4):
        df[f'u_in_diff{lag}'] = df['u_in'] - df[f'u_in_lag{lag}']
        df[f'u_out_diff{lag}'] = df['u_out'] - df[f'u_out_lag{lag}']

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # R and C are treated as categorical labels, not magnitudes.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']
    return df

In [13]:
%%time
# Build the engineered-feature frames for both splits; %%time reports the cost of the whole cell.
dfp_train_augmented = add_features(dfp_train)
dfp_test_augmented = add_features(dfp_test)

CPU times: user 12.7 s, sys: 1.56 s, total: 14.2 s
Wall time: 14.2 s


In [14]:
# Inspect the first augmented row to sanity-check the engineered feature values.
dfp_train_augmented.iloc[0]

id                                     1
breath_id                              1
R                                     20
C                                     50
time_step                            0.0
u_in                            0.083334
u_out                                  0
pressure                        5.837492
closest_cluster_cluster_ed             3
closest_cluster_cluster_dtw           31
area                                 0.0
u_in_cumsum                     0.083334
u_in_lag1                            0.0
u_out_lag1                           0.0
u_in_lag_back1                 18.383041
u_out_lag_back1                      0.0
u_in_lag2                            0.0
u_out_lag2                           0.0
u_in_lag_back2                 22.509278
u_out_lag_back2                      0.0
u_in_lag3                            0.0
u_out_lag3                           0.0
u_in_lag_back3                 22.808822
u_out_lag_back3                      0.0
u_in_lag4       

In [15]:
column_target = "pressure"
metric = "mae"

# Every column except the row / breath identifiers and the target is a model input.
columns_excluded = ("id", "breath_id", "pressure")
columns_features = [
    name for name in dfp_train_augmented.columns if name not in columns_excluded
]

print("Features:", columns_features)

X_train = dfp_train_augmented[columns_features]
y_train = dfp_train_augmented[column_target]
# Test rows are ordered by id before selecting the feature columns.
X_test = dfp_test_augmented.sort_values("id")[columns_features]

Features: ['R', 'C', 'time_step', 'u_in', 'u_out', 'closest_cluster_cluster_ed', 'closest_cluster_cluster_dtw', 'area', 'u_in_cumsum', 'u_in_lag1', 'u_out_lag1', 'u_in_lag_back1', 'u_out_lag_back1', 'u_in_lag2', 'u_out_lag2', 'u_in_lag_back2', 'u_out_lag_back2', 'u_in_lag3', 'u_out_lag3', 'u_in_lag_back3', 'u_out_lag_back3', 'u_in_lag4', 'u_out_lag4', 'u_in_lag_back4', 'u_out_lag_back4', 'breath_id__u_in__max', 'breath_id__u_out__max', 'u_in_diff1', 'u_out_diff1', 'u_in_diff2', 'u_out_diff2', 'breath_id__u_in__diffmax', 'breath_id__u_in__diffmean', 'u_in_diff3', 'u_out_diff3', 'u_in_diff4', 'u_out_diff4', 'cross', 'cross2', 'R__C']


In [16]:
# from supervised.automl import AutoML

# for mode in ["Compete"]:
#     dfp_submissions = dfp_test_augmented.sort_values("id").copy()
#     automl = AutoML(mode=mode, eval_metric=metric)
#     automl.fit(X_train, y_train)

#     dfp_submissions["pressure"] = automl.predict(X_test)
#     dfp_submissions = dfp_submissions[["id", "pressure"]]
#     dfp_submissions.to_csv(f"./data/submissions/{datetime.utcnow().strftime('%Y%m%d%H')}_mljar_{mode.lower()}_with_closest_cluster_wo_dummies.csv", index=None)

In [None]:
from flaml import AutoML

metric = "mae"
time_budget = 3 * 3600  # seconds (3 hours)

automl_settings = {
    "time_budget": time_budget,  # in seconds
    "metric": metric,
    "task": "regression",
    "log_file_name": f"flaml_{time_budget}_nf20211028_{metric}.log",
}

# Create the output directory *before* the multi-hour search so the final
# write cannot fail on a missing folder once training is done.
submissions_dir = Path("./data/submissions")
submissions_dir.mkdir(parents=True, exist_ok=True)

automl = AutoML()
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)

# Only the ids are needed for the submission. They are sorted by "id", the same
# ordering used to build X_test, so predictions line up row for row.
dfp_submissions = dfp_test_augmented.sort_values("id")[["id"]].copy()
dfp_submissions["pressure"] = automl.predict(X_test)

# Timestamp (UTC, hour resolution) is taken at write time, after training.
# datetime.utcnow() is deprecated; use an aware UTC datetime instead.
timestamp = datetime.now(timezone.utc).strftime('%Y%m%d%H')
submission_path = submissions_dir / f"{timestamp}_flaml{time_budget}_with_closest_cluster_wo_dummies.csv"
dfp_submissions.to_csv(submission_path, index=False)

[flaml.automl: 10-28 13:52:10] {1432} INFO - Evaluation method: holdout
[flaml.automl: 10-28 13:52:16] {1478} INFO - Minimizing error metric: mae
[flaml.automl: 10-28 13:52:16] {1515} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree']
[flaml.automl: 10-28 13:52:16] {1748} INFO - iteration 0, current learner lgbm
[flaml.automl: 10-28 13:52:16] {1866} INFO - Estimated sufficient time budget=1123580s. Estimated necessary time budget=2416s.
[flaml.automl: 10-28 13:52:16] {1944} INFO -  at 60.7s,	estimator lgbm's best error=4.5014,	best estimator lgbm's best error=4.5014
[flaml.automl: 10-28 13:52:16] {1748} INFO - iteration 1, current learner lgbm
[flaml.automl: 10-28 13:52:16] {1944} INFO -  at 60.9s,	estimator lgbm's best error=4.5014,	best estimator lgbm's best error=4.5014
[flaml.automl: 10-28 13:52:16] {1748} INFO - iteration 2, current learner lgbm
[flaml.automl: 10-28 13:52:16] {1944} INFO -  at 61.1s,	estimator lgbm's best error=3.0438,	be

In [15]:
# import pickle
# with open("./data/flaml_automl_best.pkl", "wb") as f:
#     pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# # from flaml import AutoML

# # time_budget = 2 * 3600
# # automl = AutoML()
# automl_settings = {
#     "time_budget": time_budget,  # in seconds
#     "metric": metric,
#     "task": "regression",
#     "log_file_name": f"flaml_{time_budget}_nf20211007_{metric}.log",
# }
