In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

### Load Files

In [24]:
%%time
# Load files
train = pd.read_csv(r'C:\Sync\Work\Kaggle Competitions\Ventilator Pressure Prediction\Data\train.csv')
test = pd.read_csv(r'C:\Sync\Work\Kaggle Competitions\Ventilator Pressure Prediction\Data\test.csv')

y = train['pressure']


Wall time: 4.73 s


In [25]:
print(f'Train memory usage: {train.memory_usage().sum()}')
print(f'Test memory usage: {test.memory_usage().sum()}')

Train memory usage: 386304128
Test memory usage: 225344128


In [26]:
# down convert columns to save memory...
# probably do float64's too

# train
train['id'] = train['id'].astype(np.int32)
train['breath_id'] = train['breath_id'].astype(np.int32)
train['R'] = train['R'].astype(np.int8)  #or OHC?
train['C'] = train['C'].astype(np.int8)
train['u_out'] = train['u_out'].astype(np.int8)
train['u_in'] = train['u_in'].astype(np.float32)
train['time_step'] = train['time_step'].astype(np.float32)


#test
test['id'] = test['id'].astype(np.int32)
test['breath_id'] = test['breath_id'].astype(np.int32)
test['R'] = test['R'].astype(np.int8)  #or OHC?
test['C'] = test['C'].astype(np.int8)
test['u_out'] = test['u_out'].astype(np.int8)
test['u_in'] = test['u_in'].astype(np.float32)
test['time_step'] = test['time_step'].astype(np.float32)

for col in test.columns:
    print(test[col].dtype)

int32
int32
int8
int8
float32
float32
int8


In [27]:
print(f'Train memory usage: {train.memory_usage().sum()}')
print(f'Test memory usage: {test.memory_usage().sum()}')


Train memory usage: 162972128
Test memory usage: 76456128


### Split data into inhalitory and exhalitory phase (only scored on inhale)

In [3]:
train_in = train[train.u_out == 0]
test_in = test[test.u_out == 0]
y_in = train_in['pressure']

train_out = train[train.u_out == 1]
test_out = test[test.u_out == 1]

In [4]:
test_out.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out
31,32,0,5,20,0.989089,0.0,1
32,33,0,5,20,1.021021,0.0,1
33,34,0,5,20,1.052835,0.0,1
34,35,0,5,20,1.084613,0.0,1
35,36,0,5,20,1.116543,0.0,1


# Model

In [5]:
from sklearn.metrics import mean_absolute_error  #confusion_matrix, classification_report

In [6]:
# Split data - after all analysis is done
from sklearn.model_selection import train_test_split

train_in.drop(columns = ['pressure','id'], inplace = True)
#test = test.drop(columns = 'id', inplace = True)

X_train, X_valid, y_train, y_valid = train_test_split(train_in, y_in, train_size=0.8, test_size=0.2,
                                                      random_state=12)
X_test_in = test_in.drop(columns=['id'],inplace=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [12]:
# Logistic Regression - not working...yet.
X_test_in

Unnamed: 0,breath_id,R,C,time_step,u_in,u_out
0,0,5,20,0.000000,0.000000,0
1,0,5,20,0.031904,7.515046,0
2,0,5,20,0.063827,14.651675,0
3,0,5,20,0.095751,21.230610,0
4,0,5,20,0.127644,26.320956,0
...,...,...,...,...,...,...
4023945,125748,20,10,0.842145,0.000000,0
4023946,125748,20,10,0.875648,0.000000,0
4023947,125748,20,10,0.909185,0.121375,0
4023948,125748,20,10,0.943148,0.000000,0


In [8]:
%%time
# ok, decision tree instead:
# from sklearn.tree import DecisionTreeRegressor

# for i in range(100,200,20): 
#     model_dt = DecisionTreeRegressor(n_iterations = i, max_depth=16, random_state=12)
#     model_dt.fit(X_train, y_train)
#     pred_dt = model_dt.predict(X_valid)

#     #dt = roc_auc_score(y_valid,pred_dt)
#     dt_mae_score = mean_absolute_error(pred_dt, y_valid)
#     print(f'Decision Tree MAE Score for max_depth={i} is : {dt_mae_score}')
# with default params, score = 2.413, but lb = 4.791.  Why the large difference?
# 11/17/21 changed to inhale only and my scores are matching better
    
# Decision Tree MAE Score for max_depth=2 is : 6.3443955300614405
# Decision Tree MAE Score for max_depth=3 is : 6.028395026290743
# Decision Tree MAE Score for max_depth=4 is : 5.678540615661189
# Decision Tree MAE Score for max_depth=5 is : 5.3517588341023155
# Decision Tree MAE Score for max_depth=6 is : 4.952135011724083
# Decision Tree MAE Score for max_depth=7 is : 4.669474094251829
# Decision Tree MAE Score for max_depth=8 is : 4.430497791001828
# Decision Tree MAE Score for max_depth=9 is : 4.235200007248183
# Decision Tree MAE Score for max_depth=10 is : 4.111949020503175
# Decision Tree MAE Score for max_depth=11 is : 4.002433270178995
# Decision Tree MAE Score for max_depth=12 is : 3.9195341438467897
# Decision Tree MAE Score for max_depth=13 is : 3.84778054263597
# Decision Tree MAE Score for max_depth=14 is : 3.79583210088494
# Decision Tree MAE Score for max_depth=15 is : 3.75439201392607
# Decision Tree MAE Score for max_depth=16 is : 3.7410021329185414
# Decision Tree MAE Score for max_depth=17 is : 3.744247688109701
# Decision Tree MAE Score for max_depth=18 is : 3.7645064417036505
# Decision Tree MAE Score for max_depth=19 is : 3.800083196701423
# Decision Tree MAE Score for max_depth=20 is : 3.852334638461889
# Decision Tree MAE Score for max_depth=21 is : 3.911532812448746
# Decision Tree MAE Score for max_depth=22 is : 3.9779119228181203
# Decision Tree MAE Score for max_depth=23 is : 4.044951907548471
# Decision Tree MAE Score for max_depth=24 is : 4.112202068631238
# min at max_depth=16 (3.741)


Wall time: 0 ns


In [9]:
%%time
# random forest
# from sklearn.ensemble import RandomForestRegressor

# model_rf = RandomForestRegressor()
# model_rf.fit(X_train, y_train)
# pred_rf = model_rf.predict(X_valid)
# rf_mae = mean_absolute_error(pred_rf,y_valid)
# print(f'Random Forsest MAE Score: {rf_mae}')
# (n_estimators=100, max_depth=7,min_samples_leaf=0.06, random_state=12), mae=3.12775, lb score = 6.431(?)
# Why is random forest worse?  
# 10/17/21: Still worse after using only inhales
# (n_estimators=100, max_depth=7,min_samples_leaf=0.06, random_state=12) = 5.867
# defaults: runs out of memory!!!???



Wall time: 0 ns


In [19]:
%%time
from catboost import CatBoostRegressor
# loop for manual type cv
#preds = []
for i in np.arange(1,2):
#     X_train, X_valid, y_train, y_valid = train_test_split(train, y, train_size=0.8, test_size=0.2,
#                                                       random_state=i)
    model_cat = CatBoostRegressor(loss_function="MAE",
                               eval_metric="MAE",
                               task_type="GPU",
                               learning_rate=.6,
                               iterations=400,
                               l2_leaf_reg=50,
                               random_seed=12,
                               od_type="Iter",
                               depth=5,
                               #early_stopping_rounds=6500,
                               border_count=64,
                               verbose=False
                              )
    model_cat.fit(X_train,y_train)
    pred_cat = model_cat.predict(X_valid)
    score_cat = mean_absolute_error(y_valid,pred_cat)
    #print(f'iters={i}, lr={j}, CatBoost MAE Score: {score_cat}')
    print(f'CatBoost MAE Score: {score_cat}')
    #preds.append(model_cat.predict_proba(X_test)[:,1])
    # 400, .6 = 3.976


CatBoost MAE Score: 3.9759579031274104
Wall time: 7.96 s


In [None]:
# Results
# below was with 400 iterations
# LR=0.01, CatBoost MAE Score: 4.612070402194106
# LR=0.02, CatBoost MAE Score: 4.355098889990542
# LR=0.03, CatBoost MAE Score: 4.256470209779093
# LR=0.04, CatBoost MAE Score: 4.206820030265317
# LR=0.05, CatBoost MAE Score: 4.17202774418495
# LR=0.06, CatBoost MAE Score: 4.154663398741475
# LR=0.07, CatBoost MAE Score: 4.139378210785428
# LR=0.08, CatBoost MAE Score: 4.123714277138671
# LR=0.09, CatBoost MAE Score: 4.118331120102717
# LR=0.10, CatBoost MAE Score: 4.100228688249339

#below is actually varying iterations with lr=.01
# LR=500, CatBoost MAE Score: 4.51716759258337
# LR=750, CatBoost MAE Score: 4.375308068777867
# LR=1000, CatBoost MAE Score: 4.296309314073668
# LR=1250, CatBoost MAE Score: 4.253560494871421
# LR=1500, CatBoost MAE Score: 4.215301334682263
# LR=1750, CatBoost MAE Score: 4.192200549938411
# LR=2000, CatBoost MAE Score: 4.175215683257243

# iters=2500, CatBoost MAE Score: 4.152780577777932
# iters=3000, CatBoost MAE Score: 4.1334934487070445
# iters=3500, CatBoost MAE Score: 4.116628441891895
# iters=4000, CatBoost MAE Score: 4.1022695941252145
# iters=4500, CatBoost MAE Score: 4.090963847495107
# iters=5000, CatBoost MAE Score: 4.081444606900306
# iters=5500, CatBoost MAE Score: 4.073453635612619
# iters=6000, CatBoost MAE Score: 4.065357346625841
# iters=6500, CatBoost MAE Score: 4.0580621865794475
# iters=7000, CatBoost MAE Score: 4.051370365282632
# iters=7500, CatBoost MAE Score: 4.044719753925026

# iters=500, lr=0.02, CatBoost MAE Score: 4.29701850764207
# iters=500, lr=0.05, CatBoost MAE Score: 4.147902585346878
# iters=500, lr=0.07, CatBoost MAE Score: 4.114997921114166
# iters=500, lr=0.1, CatBoost MAE Score: 4.080464756927912
# iters=1500, lr=0.02, CatBoost MAE Score: 4.132905138782809
# iters=1500, lr=0.05, CatBoost MAE Score: 4.047442916427927
# iters=1500, lr=0.07, CatBoost MAE Score: 4.015655697064174
# iters=1500, lr=0.1, CatBoost MAE Score: 3.9860950279206313
# iters=2500, lr=0.02, CatBoost MAE Score: 4.080349172017659
# iters=2500, lr=0.05, CatBoost MAE Score: 4.00344905190381
# iters=2500, lr=0.07, CatBoost MAE Score: 3.97255891177489
# iters=2500, lr=0.1, CatBoost MAE Score: 3.945357239716338
# iters=3500, lr=0.02, CatBoost MAE Score: 4.049843291723955
# iters=3500, lr=0.05, CatBoost MAE Score: 3.9769153953776546
# iters=3500, lr=0.07, CatBoost MAE Score: 3.947205269139265
# iters=3500, lr=0.1, CatBoost MAE Score: 3.9192260371734102
# iters=4500, lr=0.02, CatBoost MAE Score: 4.026783032935908
# iters=4500, lr=0.05, CatBoost MAE Score: 3.9564148356726743
# iters=4500, lr=0.07, CatBoost MAE Score: 3.9280201377022457
# iters=4500, lr=0.1, CatBoost MAE Score: 3.9000372543624953
# iters=5500, lr=0.02, CatBoost MAE Score: 4.009688968194791
# iters=5500, lr=0.05, CatBoost MAE Score: 3.942193598109401
# iters=5500, lr=0.07, CatBoost MAE Score: 3.913565297332533
# iters=5500, lr=0.1, CatBoost MAE Score: 3.8867024760785163
# iters=6500, lr=0.02, CatBoost MAE Score: 3.995706885359131
# iters=6500, lr=0.05, CatBoost MAE Score: 3.930118804001449
# iters=6500, lr=0.07, CatBoost MAE Score: 3.9008648431146096
# iters=6500, lr=0.1, CatBoost MAE Score: 3.8760939727521952

# iters=7000, lr=0.1, CatBoost MAE Score: 3.8701849364646037
# iters=7000, lr=0.2, CatBoost MAE Score: 3.832271018713966
# iters=7000, lr=0.3, CatBoost MAE Score: 3.813999075250384
# iters=8000, lr=0.1, CatBoost MAE Score: 3.8615182905510332
# iters=8000, lr=0.2, CatBoost MAE Score: 3.8262070859664354
# iters=8000, lr=0.3, CatBoost MAE Score: 3.807241460040403
# iters=9000, lr=0.1, CatBoost MAE Score: 3.854176590436379
# iters=9000, lr=0.2, CatBoost MAE Score: 3.819835918374386
# iters=9000, lr=0.3, CatBoost MAE Score: 3.8026171830372437
# iters=10000, lr=0.1, CatBoost MAE Score: 3.8471917412388086
# iters=10000, lr=0.2, CatBoost MAE Score: 3.8155878816788293
# iters=10000, lr=0.3, CatBoost MAE Score: 3.798745010106013

# iters=400, lr=0.4, CatBoost MAE Score: 4.0003747902127005
# iters=400, lr=0.5, CatBoost MAE Score: 3.978775740223764
# iters=400, lr=0.6, CatBoost MAE Score: 3.9759579031274104
# iters=2400, lr=0.4, CatBoost MAE Score: 3.863015566934512
# iters=2400, lr=0.5, CatBoost MAE Score: 3.843572625507345
# iters=2400, lr=0.6, CatBoost MAE Score: 3.832996544409944
# iters=4400, lr=0.4, CatBoost MAE Score: 3.8271209282976675
# iters=4400, lr=0.5, CatBoost MAE Score: 3.8173973166841293
# iters=4400, lr=0.6, CatBoost MAE Score: 3.8070874148309573
# iters=6400, lr=0.4, CatBoost MAE Score: 3.8084562255562036
# iters=6400, lr=0.5, CatBoost MAE Score: 3.8010546875296916
# iters=6400, lr=0.6, CatBoost MAE Score: 3.7936695403426497
# iters=8400, lr=0.4, CatBoost MAE Score: 3.7975296940509082
# iters=8400, lr=0.5, CatBoost MAE Score: 3.7908920298460234
# iters=8400, lr=0.6, CatBoost MAE Score: 3.7870319032951114
# iters=10400, lr=0.4, CatBoost MAE Score: 3.7900308113706846
# iters=10400, lr=0.5, CatBoost MAE Score: 3.784820853620411
# iters=10400, lr=0.6, CatBoost MAE Score: 3.781172960402657
# iters=12000, lr=0.5, CatBoost MAE Score: 3.7815228260181173
# iters=12000, lr=0.6, CatBoost MAE Score: 3.777218472858538
# iters=12000, lr=0.7, CatBoost MAE Score: 3.78113783759607
# iters=14000, lr=0.5, CatBoost MAE Score: 3.7779973919271335
# iters=14000, lr=0.6, CatBoost MAE Score: 3.774235019577053
# iters=14000, lr=0.7, CatBoost MAE Score: 3.778554786393208
# iters=16000, lr=0.5, CatBoost MAE Score: 3.7751062068666092
# iters=16000, lr=0.6, CatBoost MAE Score: 3.7718265432847127
# iters=16000, lr=0.7, CatBoost MAE Score: 3.775893605520106
# iters=18000, lr=0.5, CatBoost MAE Score: 3.7720468761497528
# iters=18000, lr=0.6, CatBoost MAE Score: 3.7696176049907453
# iters=18000, lr=0.7, CatBoost MAE Score: 3.772692310777104
# iters=20000, lr=0.5, CatBoost MAE Score: 3.770537531969578
# iters=20000, lr=0.6, CatBoost MAE Score: 3.7683620306766623

### Final Model

In [12]:
# create outpreds = average out value
out_preds = np.ones(len(test_out))*6
i = list(test_out.id)
out_preds_s = pd.Series(out_preds,index = i)
out_preds_s

32         6.0
33         6.0
34         6.0
35         6.0
36         6.0
          ... 
4023996    6.0
4023997    6.0
4023998    6.0
4023999    6.0
4024000    6.0
Length: 2496435, dtype: float64

In [13]:
pred_final = model_dt.predict(X_test_in)
# add indexs to recombine with out preds
pred_final_s = pd.Series(pred_final,index=list(test_in.id))
pred_final_s.head()

NameError: name 'model_dt' is not defined

In [None]:
both = pred_final_s.append(out_preds_s).sort_index()
both.values

In [None]:
output = pd.DataFrame({'id': test.id, 'pressure': both.values})
output.to_csv('submission.csv', index=False)
print("Submission saved!")