In [9]:
% matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import tree

In [4]:
iris = datasets.load_iris()

irisData = pd.DataFrame(iris.data, columns=iris.feature_names)
irisData['class'] = pd.Categorical.from_codes(iris.target, iris.target_names)
irisData.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
titanicData = pd.read_csv("data/titanic.csv")
titanicData = titanicData.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis = 1)
titanicData = titanicData.dropna()
titanicData['Sex'] = pd.Categorical(titanicData['Sex']).codes
titanicData['Embarked'] = pd.Categorical(titanicData['Embarked']).codes
titanicData['class'] = titanicData['Survived']
titanicData = titanicData.drop(['Survived'], axis = 1)

titanicData.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,class
0,3,1,22.0,1,0,7.25,2,0
1,1,0,38.0,1,0,71.2833,0,1
2,3,0,26.0,0,0,7.925,2,1
3,1,0,35.0,1,0,53.1,2,1
4,3,1,35.0,0,0,8.05,2,0


In [22]:
def treeCrossValidate(data, model, name=''):
    data['train'] = (np.random.uniform(0,1, len(data)) * 10).astype(int)
    err = 0
    print(f"\n*** {name} ***")
    for i in range(10):
        data_train = data[data['train'] != i]
        data_test = data[data['train'] == i]
        
        y_train = data_train['class']
        X_train = data_train.drop('class', axis = 1)
        model.fit(X_train, y_train)
        y_test = data_test['class']
        X_test = data_test.drop('class', axis = 1)
        
        err += model.score(X_test, y_test)
#         print(f'\t{model.score(X_test, y_test)}')
    print(err/10)
    return err

In [7]:
help(RandomForestClassifier)

Help on class RandomForestClassifier in module sklearn.ensemble.forest:

class RandomForestClassifier(ForestClassifier)
 |  RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)
 |  
 |  A random forest classifier.
 |  
 |  A random forest is a meta estimator that fits a number of decision tree
 |  classifiers on various sub-samples of the dataset and use averaging to
 |  improve the predictive accuracy and control over-fitting.
 |  The sub-sample size is always the same as the original
 |  input sample size but the samples are drawn with replacement if
 |  `bootstrap=True` (default).
 |  
 |  Read more in the :ref:`User Guide <forest>`.
 |  
 |  Parameters
 |  ----------
 |  n_estimators : integ

In [24]:
help(ExtraTreesClassifier)

Help on class ExtraTreesClassifier in module sklearn.ensemble.forest:

class ExtraTreesClassifier(ForestClassifier)
 |  ExtraTreesClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=False, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)
 |  
 |  An extra-trees classifier.
 |  
 |  This class implements a meta estimator that fits a number of
 |  randomized decision trees (a.k.a. extra-trees) on various sub-samples
 |  of the dataset and use averaging to improve the predictive accuracy
 |  and control over-fitting.
 |  
 |  Read more in the :ref:`User Guide <forest>`.
 |  
 |  Parameters
 |  ----------
 |  n_estimators : integer, optional (default=10)
 |      The number of trees in the forest.
 |  
 |  criterion : string, optional (default="gini")
 |      The f

In [8]:
models = [
    (
        RandomForestClassifier(n_estimators=20, max_depth=13, min_samples_split=10),
        'RandomForestClassifier'
    ),
    (
        ExtraTreesClassifier(n_estimators=30, min_samples_split=20, max_leaf_nodes=15),
        'ExtraTreesClassifier'
    )
]

In [23]:
for data, data_name in zip([irisData, titanicData], ['Iris', 'Titanic']):
    for model, model_name in models:
        treeCrossValidate(data, model, data_name+': ' + model_name)


*** Iris: RandomForestClassifier ***
0.9516666666666668

*** Iris: ExtraTreesClassifier ***
0.9609596530920061

*** Titanic: RandomForestClassifier ***
0.8156327432299356

*** Titanic: ExtraTreesClassifier ***
0.7934202833575882
