---
# Implementing a Random Forest Classifier
---

### 1. Import libraries and data

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

In [9]:
df = pd.read_csv('kyphosis.csv')

In [35]:
df.head(3) #Age in months, number of vertabrae involved, start - number of topmost vertabrae that was operated on

Unnamed: 0,Kyphosis,Age,Number,Start
0,absent,71,3,5
1,absent,158,3,14
2,present,128,4,5


### 3. Prepare training and testing data

In [11]:
#Features dataframe (X) 
X = df.drop('Kyphosis',axis=1) 

#labels dataframe (y)
y = df['Kyphosis']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### 4. Create and fit Random Forest Classifier

In [18]:
rfc = RandomForestClassifier()

In [19]:
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

### 5. Make Predictions using Random Forest Classifier

In [20]:
rfc_pred = rfc.predict(X_test)

In [16]:
rfc_pred

array(['absent', 'absent', 'absent', 'absent', 'present', 'absent',
       'absent', 'absent', 'absent', 'absent', 'absent', 'absent',
       'absent', 'absent', 'absent', 'absent', 'present'], dtype=object)

___
### ** Note on parameters **

Some useful parameters to tune:
- min_samples_leaf (the minimum samles which can be put into each lef)
- n_estimators (the number of decision trains)
- max_features (the size of the subset of features to be examined at each split)

An optional feature to take advantage of:
- oob_score (a way of seeing how well the estimator did by cross-validiting on the "out of bag" data, i.e. the data
    for each tree that was not used in the sample)

#### oob_score example

In [30]:
rfc = RandomForestClassifier(oob_score=True)

In [34]:
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [32]:
rfc.oob_score_

0.796875