In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'whitegrid' style for any plots drawn in this notebook
sns.set_style('whitegrid')

### Load Files

In [2]:
%%time
# Load files
train = pd.read_csv('/kaggle/input/ventilator-pressure-prediction/train.csv')
test = pd.read_csv('/kaggle/input/ventilator-pressure-prediction/test.csv')
y = train['pressure']

### Reduce data size

In [None]:
# Total memory footprint of the training frame, in bytes, before the dtypes are reduced.
# (The original line stopped at `train.memory_`, which raises AttributeError.)
mem_train_init = train.memory_usage(deep=True).sum()

In [3]:
# down convert columns to save memory...

# One mapping shared by train and test, so the two frames cannot drift apart.
# Columns not listed here (e.g. 'pressure' in train) keep their dtype.
DOWNCAST_DTYPES = {
    'id': np.int32,
    'breath_id': np.int32,
    'R': np.int8,  # or OHC?
    'C': np.int8,
    'u_out': np.int8,
    'u_in': np.float32,
    'time_step': np.float32,
}

train = train.astype(DOWNCAST_DTYPES)
test = test.astype(DOWNCAST_DTYPES)

# check that they converted
for col in test.columns:
    print(test[col].dtype)

In [4]:
# Preview the first rows of the training frame
train.head()

### Split into inhale and exhale groups (only the inhale phase is scored)

In [5]:
# Rows with u_out == 0 (the "in" group). Later cells add and drop columns on
# these frames, so take explicit copies instead of working on slices of
# `train` / `test`; this avoids pandas' SettingWithCopyWarning.
train_in = train[train.u_out == 0].copy()
test_in = test[test.u_out == 0].copy()
y_in = train_in['pressure']

# Rows with u_out == 1 (the "out" group); these are only read, never modified.
train_out = train[train.u_out == 1]
test_out = test[test.u_out == 1]

In [6]:
# Preview the first rows of the u_out == 1 subset of the training frame
train_out.head()

# Add Features

In [7]:
def add_u_in_lag(df, periods=1):
    """Return a new frame where `u_in` is replaced by its lagged value `u_in_lag`.

    The shift is done separately inside each `breath_id`, so the first row(s)
    of a breath are filled with 0 instead of inheriting the last `u_in` of
    whichever breath happens to precede it in the frame.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'breath_id' and 'u_in'.
    periods : int, default 1
        Number of rows to lag by.

    Returns
    -------
    pd.DataFrame
        `df` without 'u_in' and with 'u_in_lag' appended as the last column.
        The input frame is not modified.
    """
    lagged = df.groupby('breath_id')['u_in'].shift(periods, fill_value=0)
    return df.drop(columns=['u_in']).assign(u_in_lag=lagged)


# apply shift in training set
train_in = add_u_in_lag(train_in)

# and for test set
test_in = add_u_in_lag(test_in)

In [8]:
def add_forward_diffs(df):
    """Return a new frame with the forward differences `dt` and `d_u_in` added.

    For every row, `dt` is the next row's `time_step` minus this row's, and
    `d_u_in` is the same for `u_in_lag`. The differences are taken inside each
    `breath_id`, so no value is computed across two different breaths.
    The last row of a breath has no successor and repeats the previous row's
    difference; a breath with a single row gets 0.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'breath_id', 'time_step' and 'u_in_lag'.

    Returns
    -------
    pd.DataFrame
        `df` with the columns 'dt' and 'd_u_in' added (or overwritten).
        The input frame is not modified.
    """
    breath = df['breath_id']

    def forward_diff(col):
        # next value minus current value, within the breath
        step = df.groupby('breath_id')[col].shift(-1) - df[col]
        # last row of each breath is NaN here: carry the previous step forward
        return step.groupby(breath).ffill().fillna(0)

    return df.assign(dt=forward_diff('time_step'), d_u_in=forward_diff('u_in_lag'))


# add dt and du_in to train_in
train_in = add_forward_diffs(train_in)

# Add dt and du_in to test_in
test_in = add_forward_diffs(test_in)

In [9]:
# Preview the first rows of test_in
test_in.head()

10/17/21 New in this run:
* Shifted by time lag of 2
* Added two new features:
    * dt
    * du_in


# Model

In [10]:
from sklearn.metrics import mean_absolute_error # ,confusion_matrix, classification_report

In [11]:
# Split data - after all analysis is done
from sklearn.model_selection import train_test_split

# Remove the target and the identifier columns from the feature frame.
# Rebinding (instead of inplace=True) together with errors='ignore' makes this
# cell safe to re-run: a second run finds the columns already gone and is a no-op.
train_in = train_in.drop(columns=['pressure', 'id', 'breath_id'], errors='ignore')

# NOTE(review): this is a row-wise random split, so rows of one breath can land
# in both the training and the validation part - confirm this is intended.
X_train, X_valid, y_train, y_valid = train_test_split(train_in, y_in, train_size=0.8, test_size=0.2,
                                                      random_state=12)

# drop() already returns a new frame, so test_in (and its 'id' column) stays intact
X_test_in = test_in.drop(columns=['id', 'breath_id'])

In [12]:
# from sklearn.tree import DecisionTreeRegressor

# for i in range(100,200,20): 
#     model_dt = DecisionTreeRegressor(n_estimators = i, max_depth=16, random_state=12)
#     model_dt.fit(X_train, y_train)
#     pred_dt = model_dt.predict(X_valid)

#     #dt = roc_auc_score(y_valid,pred_dt)
#     dt_mae_score = mean_absolute_error(pred_dt, y_valid)
#     print(f'Decision Tree MAE Score for max_depth={i} is : {dt_mae_score}')
# with default params, score = 2.413, but lb = 4.791.  Why the large difference?
# 11/17/21 changed to inhale only and my scores are matching better
    
# Decision Tree MAE Score for max_depth=2 is : 6.3443955300614405
# Decision Tree MAE Score for max_depth=3 is : 6.028395026290743
# Decision Tree MAE Score for max_depth=4 is : 5.678540615661189
# Decision Tree MAE Score for max_depth=5 is : 5.3517588341023155
# Decision Tree MAE Score for max_depth=6 is : 4.952135011724083
# Decision Tree MAE Score for max_depth=7 is : 4.669474094251829
# Decision Tree MAE Score for max_depth=8 is : 4.430497791001828
# Decision Tree MAE Score for max_depth=9 is : 4.235200007248183
# Decision Tree MAE Score for max_depth=10 is : 4.111949020503175
# Decision Tree MAE Score for max_depth=11 is : 4.002433270178995
# Decision Tree MAE Score for max_depth=12 is : 3.9195341438467897
# Decision Tree MAE Score for max_depth=13 is : 3.84778054263597
# Decision Tree MAE Score for max_depth=14 is : 3.79583210088494
# Decision Tree MAE Score for max_depth=15 is : 3.75439201392607
# Decision Tree MAE Score for max_depth=16 is : 3.7410021329185414
# Decision Tree MAE Score for max_depth=17 is : 3.744247688109701
# Decision Tree MAE Score for max_depth=18 is : 3.7645064417036505
# Decision Tree MAE Score for max_depth=19 is : 3.800083196701423
# Decision Tree MAE Score for max_depth=20 is : 3.852334638461889
# Decision Tree MAE Score for max_depth=21 is : 3.911532812448746
# Decision Tree MAE Score for max_depth=22 is : 3.9779119228181203
# Decision Tree MAE Score for max_depth=23 is : 4.044951907548471
# Decision Tree MAE Score for max_depth=24 is : 4.112202068631238
# min at max_depth=16 (3.741)


In [13]:
%%time
# random forest
from sklearn.ensemble import RandomForestRegressor
model_rf = RandomForestRegressor()
model_rf.fit(X_train, y_train)
pred_rf = model_rf.predict(X_valid)
rf_mae = mean_absolute_error(pred_rf,y_valid)
print(f'Random Forest MAE Score: {rf_mae}')

# (n_estimators=100, max_depth=7,min_samples_leaf=0.06, random_state=12), mae=3.12775, lb score = 6.431(?)
# Why is random forest worse?  
# 10/17/21: Still worse after using only inhales
# defaults: runs out of memory!!!???
# (n_estimators=100, max_depth=7,min_samples_leaf=0.06, random_state=12) = 5.867
# reduced dtype sizes on ints: 
# 3.627 default, lb = 3.710
# changed criterion to mae


In [14]:
# List the parameters of the fitted random-forest estimator
model_rf.get_params()

### Final Model

In [15]:
# Placeholder predictions for the u_out == 1 test rows: a constant 1.0 per row,
# stored in a Series keyed by the row's id.
out_ids = test_out['id'].tolist()
out_preds = np.ones(len(out_ids))
out_preds_s = pd.Series(data=out_preds, index=out_ids)
out_preds_s

In [16]:
# Predict the u_out == 0 test rows and key every prediction by its row id,
# so the result can be recombined with the out preds.
in_ids = test_in['id'].tolist()
pred_final = model_rf.predict(X_test_in)
pred_final_s = pd.Series(data=pred_final, index=in_ids)
pred_final_s.head()

In [17]:
# Series.append was removed in pandas 2.0; pd.concat is the supported way to
# stack the two id-indexed Series. Sorting by index puts the rows in id order.
both = pd.concat([pred_final_s, out_preds_s]).sort_index()
both.values

In [18]:
# `both` is indexed by id: look each prediction up by the id it belongs to
# instead of relying on `both` and `test` happening to be in the same row order.
assert both.index.is_unique and len(both) == len(test), 'expected exactly one prediction per test row'
pressure_by_row = both.reindex(test['id'].to_numpy())
assert pressure_by_row.notna().all(), 'some test ids have no prediction'

output = pd.DataFrame({'id': test.id, 'pressure': pressure_by_row.to_numpy()})
output.to_csv('submission.csv', index=False)
print("Submission saved!")