## Random Forest Model
**Predicting Stage 4 Liver Cirrhosis**

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
%matplotlib inline

In [5]:
cirr = pd.read_csv('../datasets/cirr_clean.csv')

In [6]:
cirr.head(2)

Unnamed: 0,target,id,n_days,age,bilirubin,cholesterol,albumin,copper,alk_phos,sgot,tryglicerides,platelets,prothrombin,status_C,status_CL,status_D,drug_D-penicillamine,drug_Placebo,sex_F,sex_M,ascites_N,ascites_Y,hepatomegaly_N,hepatomegaly_Y,spiders_N,spiders_Y,edema_N,edema_S,edema_Y
0,1,1,400,21464,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,0,0,1,1,0,1,0,0,1,0,1,0,1,0,0,1
1,0,2,4500,20617,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,1,0,0,1,0,1,0,1,0,0,1,0,1,1,0,0


In [7]:
features = ['hepatomegaly_Y', 'hepatomegaly_N', 'ascites_Y', 'ascites_N', 'albumin', 'prothrombin', 
            'spiders_Y', 'spiders_N', 'edema_Y', 'edema_N', 'copper', 'platelets', 'bilirubin', 'age']
X = cirr[features]
y = cirr['target']

In [8]:
y.value_counts(normalize=True) # normalize == percentage

0    0.65942
1    0.34058
Name: target, dtype: float64

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)

In [10]:
X_train.head()

Unnamed: 0,hepatomegaly_Y,hepatomegaly_N,ascites_Y,ascites_N,albumin,prothrombin,spiders_Y,spiders_N,edema_Y,edema_N,copper,platelets,bilirubin,age
101,0,1,0,1,4.19,10.3,1,0,0,1,81.0,307.0,0.7,15574
90,1,0,1,0,3.67,11.1,1,0,1,0,57.0,110.0,2.5,17884
195,0,1,0,1,3.35,9.6,0,1,0,1,41.0,165.0,0.7,13486
271,1,0,0,1,3.58,10.4,0,1,0,1,24.0,288.0,0.4,22347
77,0,1,0,1,4.06,12.0,0,1,0,0,37.0,442.0,0.6,15119


In [11]:
y_train.head()

101    0
90     1
195    0
271    0
77     0
Name: target, dtype: int64

In [12]:
len(features)

14

In [13]:
list(np.random.choice(features, size=3, replace=False))

['prothrombin', 'albumin', 'spiders_N']

In [14]:
rf = RandomForestClassifier()

In [15]:
et = ExtraTreesClassifier()

In [16]:
cross_val_score(rf, X_train, y_train, cv=5).mean()



0.7539421440726034

In [17]:
cross_val_score(et, X_train, y_train, cv=5).mean()



0.7293250141803742

In [18]:
rf = RandomForestClassifier(random_state=42)
rf_params = {
    'n_estimators': [100, 125],
    'max_depth': [None, 4, 5, 6],
    'max_features': [None, # bagging
                    'auto'] # random forest
}
gs = GridSearchCV(rf, param_grid=rf_params, cv=5)
gs.fit(X_train, y_train)
print(gs.best_score_) # cross val score
gs.best_params_

0.7729468599033816




{'max_depth': 4, 'max_features': 'auto', 'n_estimators': 125}