In [1]:
import pandas as pd

# Directory prefix (trailing slash included) onto which "train.csv" / "test.csv" are concatenated when loading.
folder = "./data/20210921/"

In [2]:
# Load both splits and order the rows by breath, then by time within each breath.
# The feature builders use grouped cumsum/shift, which depend on this row order.
dfp_train = (
    pd.read_csv(folder + "train.csv")
    .sort_values(["breath_id", "time_step"])
)
dfp_test = (
    pd.read_csv(folder + "test.csv")
    .sort_values(["breath_id", "time_step"])
)

## Get Kaggle features

In [16]:
# From https://www.kaggle.com/tenffe/finetune-of-tensorflow-bidirectional-lstm
def add_features(df):
    """Return a new frame with per-breath engineered features added.

    The input must contain the columns ``breath_id``, ``time_step``,
    ``u_in``, ``u_out``, ``R`` and ``C``. Rows are expected to already be
    ordered by time within each breath, because the cumulative sums and
    shifts below follow row order.

    Features added (in this column order):
      * ``area`` / ``u_in_cumsum``: running sums per breath of
        ``time_step * u_in`` and of ``u_in``.
      * ``u_{in,out}_lag{k}`` and ``u_{in,out}_lag_back{k}`` for k = 1..4:
        values k rows earlier / later inside the same breath.
      * per-breath max of ``u_in`` / ``u_out``, differences to the lags,
        differences to the per-breath max and mean of ``u_in``.
      * ``cross`` (``u_in * u_out``) and ``cross2`` (``time_step * u_out``).
      * ``R``, ``C`` and ``R__C`` cast to strings and one-hot encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows. It is NOT modified; a copy is taken first.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with every NaN replaced by 0, the features above,
        and all string columns expanded by ``pd.get_dummies``.
    """
    # Copy first: assigning columns directly would otherwise leak half of the
    # features (with NaNs, pre-fillna) into the caller's DataFrame.
    df = df.copy()

    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()

    u_in_by_breath = df.groupby('breath_id')['u_in']
    u_out_by_breath = df.groupby('breath_id')['u_out']
    df['u_in_cumsum'] = u_in_by_breath.cumsum()

    # Backward ("lag") and forward ("lag_back") neighbours within a breath.
    for lag in range(1, 5):
        df[f'u_in_lag{lag}'] = u_in_by_breath.shift(lag)
        df[f'u_out_lag{lag}'] = u_out_by_breath.shift(lag)
        df[f'u_in_lag_back{lag}'] = u_in_by_breath.shift(-lag)
        df[f'u_out_lag_back{lag}'] = u_out_by_breath.shift(-lag)

    # Shifts leave NaN at breath boundaries; this fills every NaN in the frame.
    df = df.fillna(0)

    # Regroup on the filled frame so the aggregates see the post-fillna values.
    u_in_by_breath = df.groupby('breath_id')['u_in']
    u_out_by_breath = df.groupby('breath_id')['u_out']
    df['breath_id__u_in__max'] = u_in_by_breath.transform('max')
    df['breath_id__u_out__max'] = u_out_by_breath.transform('max')

    for lag in (1, 2):
        df[f'u_in_diff{lag}'] = df['u_in'] - df[f'u_in_lag{lag}']
        df[f'u_out_diff{lag}'] = df['u_out'] - df[f'u_out_lag{lag}']

    df['breath_id__u_in__diffmax'] = df['breath_id__u_in__max'] - df['u_in']
    df['breath_id__u_in__diffmean'] = u_in_by_breath.transform('mean') - df['u_in']

    for lag in (3, 4):
        df[f'u_in_diff{lag}'] = df['u_in'] - df[f'u_in_lag{lag}']
        df[f'u_out_diff{lag}'] = df['u_out'] - df[f'u_out_lag{lag}']

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # Cast to str so get_dummies one-hot encodes R, C and their combination.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']
    return pd.get_dummies(df)

In [18]:
%%time
# Build the baseline feature set for both splits; %%time reports the cost of the two calls.
dfp_train_augmented = add_features(dfp_train)
dfp_test_augmented = add_features(dfp_test)

CPU times: user 18.2 s, sys: 3.2 s, total: 21.4 s
Wall time: 21.4 s


In [19]:
# Persist the baseline feature sets. `index` is documented as a bool, so pass
# False explicitly rather than relying on None being falsy; the row index is
# not written either way.
dfp_train_augmented.to_csv("./data/updated_datasets/train_20211007_kaggle.csv", index=False)
dfp_test_augmented.to_csv("./data/updated_datasets/test_20211007_kaggle.csv", index=False)

## Kaggle features + JM (adds `u_out_threshold_reached`)

In [5]:
def add_features_v2(df):
    """Return a new frame with the baseline features plus ``u_out_threshold_reached``.

    The input must contain the columns ``breath_id``, ``time_step``,
    ``u_in``, ``u_out``, ``R`` and ``C``. Rows are expected to already be
    ordered by time within each breath, because the cumulative sums and
    shifts below follow row order.

    Features added (in this column order):
      * ``area`` / ``u_in_cumsum``: running sums per breath of
        ``time_step * u_in`` and of ``u_in``.
      * ``u_{in,out}_lag{k}`` and ``u_{in,out}_lag_back{k}`` for k = 1..4:
        values k rows earlier / later inside the same breath.
      * per-breath max of ``u_in`` / ``u_out``, differences to the lags,
        differences to the per-breath max and mean of ``u_in``.
      * ``cross`` (``u_in * u_out``) and ``cross2`` (``time_step * u_out``).
      * ``u_out_threshold_reached``: ``time_step`` minus the smallest
        ``time_step`` of the breath at which ``u_out == 1``, floored at 0.
        Breaths that never have ``u_out == 1`` get 0 on every row.
      * ``R``, ``C`` and ``R__C`` cast to strings and one-hot encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows. It is NOT modified; a copy is taken first.

    Returns
    -------
    pandas.DataFrame
        One output row per input row (same order, fresh 0..n-1 index), with
        every NaN replaced by 0, the features above, and all string columns
        expanded by ``pd.get_dummies``.
    """
    # Copy first: assigning columns directly would otherwise leak half of the
    # features (with NaNs, pre-fillna) into the caller's DataFrame.
    df = df.copy()

    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()

    u_in_by_breath = df.groupby('breath_id')['u_in']
    u_out_by_breath = df.groupby('breath_id')['u_out']
    df['u_in_cumsum'] = u_in_by_breath.cumsum()

    # Backward ("lag") and forward ("lag_back") neighbours within a breath.
    for lag in range(1, 5):
        df[f'u_in_lag{lag}'] = u_in_by_breath.shift(lag)
        df[f'u_out_lag{lag}'] = u_out_by_breath.shift(lag)
        df[f'u_in_lag_back{lag}'] = u_in_by_breath.shift(-lag)
        df[f'u_out_lag_back{lag}'] = u_out_by_breath.shift(-lag)

    # Shifts leave NaN at breath boundaries; this fills every NaN in the frame.
    df = df.fillna(0)

    # Regroup on the filled frame so the aggregates see the post-fillna values.
    u_in_by_breath = df.groupby('breath_id')['u_in']
    u_out_by_breath = df.groupby('breath_id')['u_out']
    df['breath_id__u_in__max'] = u_in_by_breath.transform('max')
    df['breath_id__u_out__max'] = u_out_by_breath.transform('max')

    for lag in (1, 2):
        df[f'u_in_diff{lag}'] = df['u_in'] - df[f'u_in_lag{lag}']
        df[f'u_out_diff{lag}'] = df['u_out'] - df[f'u_out_lag{lag}']

    df['breath_id__u_in__diffmax'] = df['breath_id__u_in__max'] - df['u_in']
    df['breath_id__u_in__diffmean'] = u_in_by_breath.transform('mean') - df['u_in']

    for lag in (3, 4):
        df[f'u_in_diff{lag}'] = df['u_in'] - df[f'u_in_lag{lag}']
        df[f'u_out_diff{lag}'] = df['u_out'] - df[f'u_out_lag{lag}']

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # Cast to str so get_dummies one-hot encodes R, C and their combination.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']

    # First time_step per breath at which u_out == 1.
    first_u_out = (
        df[df["u_out"] == 1]
        .groupby("breath_id", as_index=False)["time_step"]
        .min()
        .rename(columns={"time_step": "time_step_threshold"})
    )
    # Left merge: an inner merge would silently drop every breath that never
    # has u_out == 1. Those breaths get a NaN threshold here instead.
    df = pd.merge(df, first_u_out, on=["breath_id"], how="left")
    elapsed = df["time_step"] - df["time_step_threshold"]
    # Negative (threshold not yet reached) -> 0; NaN (never reached) -> 0.
    df["u_out_threshold_reached"] = elapsed.clip(lower=0).fillna(0)
    df = df.drop(columns=["time_step_threshold"])

    return pd.get_dummies(df)

In [6]:
%%time
# Build the v2 feature set for both splits; this rebinds the *_augmented names.
dfp_train_augmented = add_features_v2(dfp_train)
dfp_test_augmented = add_features_v2(dfp_test)

CPU times: user 25.2 s, sys: 5.51 s, total: 30.7 s
Wall time: 30.7 s


In [7]:
# Persist the v2 feature sets. `index` is documented as a bool, so pass False
# explicitly rather than relying on None being falsy; the row index is not
# written either way.
dfp_train_augmented.to_csv("./data/updated_datasets/train_20211007_kaggle_v2.csv", index=False)
dfp_test_augmented.to_csv("./data/updated_datasets/test_20211007_kaggle_v2.csv", index=False)

In [15]:
# Keep every column of the augmented training frame except "id", "breath_id" and "pressure".
columns_features = [
    column_name
    for column_name in dfp_train_augmented.columns
    if column_name not in ("id", "breath_id", "pressure")
]

print("Features:", columns_features)

Features: ['time_step', 'u_in', 'u_out', 'area', 'u_in_cumsum', 'u_in_lag1', 'u_out_lag1', 'u_in_lag_back1', 'u_out_lag_back1', 'u_in_lag2', 'u_out_lag2', 'u_in_lag_back2', 'u_out_lag_back2', 'u_in_lag3', 'u_out_lag3', 'u_in_lag_back3', 'u_out_lag_back3', 'u_in_lag4', 'u_out_lag4', 'u_in_lag_back4', 'u_out_lag_back4', 'breath_id__u_in__max', 'breath_id__u_out__max', 'u_in_diff1', 'u_out_diff1', 'u_in_diff2', 'u_out_diff2', 'breath_id__u_in__diffmax', 'breath_id__u_in__diffmean', 'u_in_diff3', 'u_out_diff3', 'u_in_diff4', 'u_out_diff4', 'cross', 'cross2', 'u_out_threshold_reached', 'R_20', 'R_5', 'R_50', 'C_10', 'C_20', 'C_50', 'R__C_20__10', 'R__C_20__20', 'R__C_20__50', 'R__C_50__10', 'R__C_50__20', 'R__C_50__50', 'R__C_5__10', 'R__C_5__20', 'R__C_5__50']


SyntaxError: invalid syntax (<ipython-input-17-7f2766e1a10a>, line 1)