In [2]:
% matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import tree

  from numpy.core.umath_tests import inner1d


In [3]:
def treeCrossValidate(data, model, name='', n_folds=10):
    """Score a classifier with randomised k-fold cross-validation.

    Each row is assigned independently and uniformly at random to one of
    ``n_folds`` folds, so fold sizes are only approximately equal. For every
    fold the model is refitted on the remaining rows and scored on the
    held-out rows with ``model.score``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'class'`` column holding the target; every other
        column is used as a feature. The frame is not modified.
    model : object
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``.
    name : str, optional
        Label printed in the banner above the result.
    n_folds : int, optional
        Number of folds (default 10).

    Returns
    -------
    float
        The SUM of the per-fold scores (not the mean). The mean over the
        folds that were actually evaluated is what gets printed.
    """
    # Fold labels live in their own array: writing them into `data` would
    # both mutate the caller's frame and hand the fold id to the model as a
    # feature whose held-out value never occurs in the training rows.
    fold_ids = (np.random.uniform(0, 1, len(data)) * n_folds).astype(int)
    features = data.drop('class', axis=1)
    target = data['class']

    err = 0
    evaluated = 0
    print(f"\n*** {name} ***")
    for i in range(n_folds):
        held_out = fold_ids == i
        # Random assignment can leave a fold empty (or, for tiny inputs,
        # swallow every row); there is nothing to fit or score in that case.
        if held_out.all() or not held_out.any():
            continue
        model.fit(features.loc[~held_out], target.loc[~held_out])
        err += model.score(features.loc[held_out], target.loc[held_out])
        evaluated += 1
    print('Score: ', err / evaluated if evaluated else float('nan'))
    return err

In [17]:
# Load the car data set. Column names are supplied explicitly through
# `names`, so pandas reads the first line of the file as a data row.
data = pd.read_csv(
    'data/car.data',
    names=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class'],
)
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [20]:
# Overwrite every column (the 'class' target included) with the integer
# codes of its categorical encoding, so the tree models get numeric input.
for column in data.columns:
    data[column] = data[column].astype('category').cat.codes
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,3,3,0,0,2,1,2
1,3,3,0,0,2,2,2
2,3,3,0,0,2,0,2
3,3,3,0,0,1,1,2
4,3,3,0,0,1,2,2


In [25]:
# (estimator, display label) pairs to compare, in evaluation order:
# two hand-tuned ensembles followed by two library-default baselines.
models = [
    (RandomForestClassifier(n_estimators=60, max_depth=40, min_samples_split=10),
     'RandomForestClassifier'),
    (ExtraTreesClassifier(n_estimators=30, min_samples_split=20, max_leaf_nodes=15),
     'ExtraTreesClassifier'),
    (tree.DecisionTreeClassifier(),
     'Default DecisionTreeClassifier'),
    (RandomForestClassifier(),
     'Default RandomForestClassifier'),
]

In [26]:
# Cross-validate each candidate on the encoded data; the helper prints a
# banner and the mean fold score for every entry.
for estimator, label in models:
    treeCrossValidate(data=data, model=estimator, name=label)


*** RandomForestClassifier ***
Score:  0.9377893324448145

*** ExtraTreesClassifier ***
Score:  0.7479749403370896

*** Default DecisionTreeClassifier ***
Score:  0.9733054999876289

*** Default RandomForestClassifier ***
Score:  0.9331278457296197
