## Random Forest feature importance 

Author : Ines Krissaane 

In [12]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

## Import data and pre-processing (cf. 1.Pre-processing.ipynb)

In [13]:
# Read the CSV 'sepsis_data_all.csv' into a DataFrame.
X_tr = pd.read_csv(filepath_or_buffer='sepsis_data_all.csv')
# Display (n_rows, n_columns) as the cell output.
X_tr.shape

(1552210, 44)

In [14]:
# Drop, in place, every column that is not kept as a model input. Per the
# original note these are the variables with more than 82% missing values.
# NOTE(review): the list also contains 'Unnamed: 0', 'Unit1', 'Unit2',
# 'HospAdmTime', 'X' and 'ID' -- presumably index/identifier/admin columns
# rather than sparse measurements; confirm.
X_tr.drop(
    columns=[
        'EtCO2', 'BaseExcess', 'HCO3', 'FiO2', 'pH',
        'PaCO2', 'SaO2', 'AST', 'BUN', 'Alkalinephos',
        'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct', 'Lactate',
        'Magnesium', 'Phosphate', 'Potassium', 'Bilirubin_total', 'TroponinI',
        'Hct', 'Hgb', 'PTT', 'WBC', 'Unnamed: 0',
        'Fibrinogen', 'Platelets', 'Glucose', 'Unit1', 'Unit2',
        'HospAdmTime', 'X', 'ID',
    ],
    inplace=True,
)

In [15]:
# Impute missing values: first back-fill (each gap takes the next valid value
# further down the column), then forward-fill whatever is still missing at the
# end of a column.
# `fillna(method=...)` is deprecated in recent pandas releases; the dedicated
# `bfill()` / `ffill()` methods perform the same fills.
# NOTE(review): 'ID' was dropped in the previous cell, so these fills run over
# the whole frame and may copy values across rows belonging to different IDs,
# and the back-fill uses later rows to fill earlier ones -- confirm this is
# the intended imputation.
X_tr = X_tr.bfill().ffill()

In [16]:
# Preview the first five rows of the frame after column removal and imputation.
X_tr.head(n=5)

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,Age,Gender,ICULOS,SepsisLabel
0,97.0,95.0,36.11,98.0,75.33,43.0,19.0,83.14,0,1,0
1,97.0,95.0,36.11,98.0,75.33,43.0,19.0,83.14,0,2,0
2,89.0,99.0,36.11,122.0,86.0,43.0,22.0,83.14,0,3,0
3,90.0,95.0,36.11,122.0,91.33,43.0,30.0,83.14,0,4,0
4,103.0,88.5,36.11,122.0,91.33,43.0,24.5,83.14,0,5,0


## Random Forest classifier

In [18]:
# Separate the target from the predictors: pop() returns the 'SepsisLabel'
# column and removes it from X_tr in the same step.
Y_tr = X_tr.pop('SepsisLabel')
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): rows are split individually; if several rows share an ID
# (that column was dropped earlier), the same ID can end up in both the
# training and the test set -- confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X_tr,
    Y_tr,
    test_size=0.3,
    random_state=40,
)

In [None]:
# Build a 100-tree random forest with a fixed seed and fit it on the training
# split; fit() returns the fitted estimator, which is bound to `rf`.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

In [None]:
# Rank the input features by the importances reported by the fitted forest.
feature_list = list(X_train.columns)
importances = list(rf.feature_importances_)
# Sort on the raw importances, most important first. Sorting after rounding
# would leave features whose scores only differ beyond the second decimal in
# column order instead of importance order.
ranked_pairs = sorted(zip(feature_list, importances), key=lambda pair: pair[1], reverse=True)
# List of (variable, importance) tuples, rounded to two decimals for display only.
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked_pairs]
# Print one line per feature; a plain loop is used because print() is called
# only for its side effect.
for feature, importance in feature_importances:
    print('Variable: {:20} Importance: {}'.format(feature, importance))