In [1]:
#
# Inspired by https://arxiv.org/abs/1702.08835 and https://github.com/STO-OTZ/my_gcForest/
#
import numpy as np
import random
import uuid

from sklearn.datasets import fetch_mldata
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

from deep_forest import MGCForest

# The MNIST dataset

In [2]:
# Fetch the 'MNIST original' dataset, using ~/scikit-learn-datasets as data_home.
# NOTE(review): fetch_mldata was deprecated in scikit-learn 0.20 and removed in
# 0.22; on newer versions this cell has to be ported to fetch_openml - confirm
# against the installed scikit-learn version.
mnist = fetch_mldata('MNIST original', data_home='~/scikit-learn-datasets')

# Report the dimensions of the feature matrix and of the label vector.
print('Data: {}, target: {}'.format(mnist.data.shape, mnist.target.shape))

Data: (70000, 784), target: (70000,)


In [3]:
# Split the dataset in half (test_size=0.5); random_state is fixed so the same
# split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    mnist.data, mnist.target, test_size=0.5, random_state=42
)

#
# Limit the size of the dataset
#
# Only the first N_SAMPLES rows of each half are kept (presumably to keep the
# example quick to run - adjust this single constant to change the subset size).
N_SAMPLES = 1000
X_train, y_train = X_train[:N_SAMPLES], y_train[:N_SAMPLES]
X_test, y_test = X_test[:N_SAMPLES], y_test[:N_SAMPLES]

# Show the resulting shapes so the truncation can be checked at a glance.
for label, array in (
    ('X_train:', X_train),
    ('y_train:', y_train),
    ('X_test:', X_test),
    ('y_test:', y_test),
):
    print(label, array.shape)

X_train: (1000, 784)
y_train: (1000,)
X_test: (1000, 784)
y_test: (1000,)


## Using the MGCForest

Creates a simple *MGCForest* with 2 random forests for the *Multi-Grained-Scanning* process and 2 other random forests for the *Cascade* process.

In [4]:
# Within each stage the two forests differ only in `max_features`
# (1 versus 'sqrt'), so each stage's parameter list is generated from that pair.
# NOTE(review): no random_state is passed to the forests, so results may vary
# between runs unless MGCForest seeds them itself - confirm.

# Parameters of the two forests used by the Multi-Grained-Scanning stage.
mgs_params = [
    dict(n_estimators=30, min_samples_split=21, max_features=features, n_jobs=-1)
    for features in (1, 'sqrt')
]

# Parameters of the two forests used by the Cascade stage.
cascade_params = [
    dict(n_estimators=1000, min_samples_split=11, max_features=features, n_jobs=-1)
    for features in (1, 'sqrt')
]

# Three stride ratios are supplied; the fit log reports one scanner per ratio.
mgc_forest = MGCForest(
    estimator_class=RandomForestClassifier,
    estimator_params={'mgs': mgs_params, 'cascade': cascade_params},
    stride_ratios=[1.0 / 4, 1.0 / 9, 1.0 / 16],
)

# Fit on the reduced training subset.
mgc_forest.fit(X_train, y_train)

<MultiGrainedScanner 0.25> - Scanning and fitting for X ((1000, 784)) and y ((1000,)) started
<MultiGrainedScanner 0.25> - Scanning turned X ((1000, 784)) into newX ((589000, 196)). 589 new instances were added per sample
<MultiGrainedScanner 0.1111111111111111> - Scanning and fitting for X ((1000, 784)) and y ((1000,)) started
<MultiGrainedScanner 0.1111111111111111> - Scanning turned X ((1000, 784)) into newX ((698000, 87)). 698 new instances were added per sample
<MultiGrainedScanner 0.0625> - Scanning and fitting for X ((1000, 784)) and y ((1000,)) started
<MultiGrainedScanner 0.0625> - Scanning turned X ((1000, 784)) into newX ((736000, 49)). 736 new instances were added per sample
<CascadeForest 2> - Cascade fitting for X ((1000, 40460)) and y ((1000,)) started
<CascadeForest 2> - Level 1:: X with shape: (1000, 40460)
<CascadeForest 2> - Level 1:: got all predictions
<CascadeForest 2> - Level 1:: got accuracy 0.929
<CascadeForest 2> - Level 2:: X with shape: (1000, 40480)
<Cascad

In [5]:
# Predict a label for every sample of the test subset.
y_pred = mgc_forest.predict(X_test)
print('Prediction shape:', y_pred.shape)

# Compare the predictions with the true labels: plain accuracy plus the
# F1 score computed with average='weighted'.
accuracy = accuracy_score(y_test, y_pred)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print('Accuracy:', accuracy, 'F1 score:', weighted_f1)

<MultiGrainedScanner 0.25> - Predicting X ((1000, 784))
<MultiGrainedScanner 0.1111111111111111> - Predicting X ((1000, 784))
<MultiGrainedScanner 0.0625> - Predicting X ((1000, 784))
<CascadeForest 2> - Shape of predictions: (2, 1000, 10) shape of X: (1000, 40460)


Prediction shape: (1000,)
Accuracy: 0.938 F1 score: 0.938046118093
