In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, \
                             AdaBoostClassifier, GradientBoostingRegressor, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.svm import SVC
from sklearn.datasets import make_moons
from scipy import stats

## Ensemble models
### Hard Voting

In [2]:
# Noisy two-moons data with a held-out test split; the global NumPy seed is
# fixed first so the generated points are the same on every run.
np.random.seed(42)
X, y = make_moons(n_samples=500, noise=0.30)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Three different kinds of base learner.
log_cl = LogisticRegression()
rf_cl = RandomForestClassifier()
svm_cl = SVC()

# Hard voting: the ensemble combines the class labels predicted by its members.
base_learners = [('lg', log_cl), ('rf', rf_cl), ('svc', svm_cl)]
voting_cl = VotingClassifier(base_learners, voting='hard')

# Fit every individual learner and the ensemble, reporting test accuracy for each.
for clf in (log_cl, rf_cl, svm_cl, voting_cl):
    clf.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_true=y_test, y_pred=clf.predict(X_test))
    print(type(clf).__name__, test_accuracy)
    
    

LogisticRegression 0.864
RandomForestClassifier 0.88
SVC 0.888
VotingClassifier 0.904


### Soft Voting (Using Probabilities instead of hard decisions)

In [3]:
# Same dataset and split as the hard-voting experiment (same seed, same parameters).
np.random.seed(42)
X, y = make_moons(n_samples=500, noise=0.30)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

log_cl = LogisticRegression()
rf_cl = RandomForestClassifier()
# probability=True so the SVC can supply the class probabilities that soft
# voting combines.
svm_cl = SVC(probability=True)

# Soft voting: the ensemble combines predicted probabilities rather than labels.
base_learners = [('lg', log_cl), ('rf', rf_cl), ('svc', svm_cl)]
voting_cl = VotingClassifier(base_learners, voting='soft')

# Fit every individual learner and the ensemble, reporting test accuracy for each.
for clf in (log_cl, rf_cl, svm_cl, voting_cl):
    clf.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_true=y_test, y_pred=clf.predict(X_test))
    print(type(clf).__name__, test_accuracy)

LogisticRegression 0.864
RandomForestClassifier 0.88
SVC 0.888
VotingClassifier 0.92


## Bagging (sampling with replacement) and Pasting (sampling without replacement)
### Single Decision Tree trained with all data

In [4]:
# Baseline for the ensembles below: one decision tree with default settings,
# fitted on the complete training split and scored on the test split.
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)
tree_accuracy = accuracy_score(y_true=y_test, y_pred=dt_clf.predict(X_test))
print("Accuracy Score: {:.3f}".format(tree_accuracy))

Accuracy Score: 0.872


### Bagging:

#### Simulating Bagging

In [45]:
# Empirically estimate how much of a dataset a bootstrap sample leaves out.
# Each of the N_BOOTSTRAPS rounds draws N_ITEMS values with replacement from
# `data`; afterwards count[i] is the fraction of rounds whose draw contained
# value i, so 1 - mean(count) is the average left-out ratio.
N_ITEMS = 10000
N_BOOTSTRAPS = 500

data = np.arange(N_ITEMS)
hits = np.zeros(N_ITEMS)
for _ in range(N_BOOTSTRAPS):
    drawn = np.random.choice(data, N_ITEMS, replace=True)
    # np.unique collapses repeated draws, so a value is counted at most once
    # per round. This replaces a per-value Python scan over every round.
    hits[np.unique(drawn)] += 1

count = hits / N_BOOTSTRAPS
print("Ratio of samples which does not contain a number {:.2f}".format(1 - np.mean(count)))

Ratio of samples which does not contain a number 0.37


In [5]:
# Bagging: 500 decision trees, each fitted on 100 training instances drawn
# with replacement (bootstrap=True). oob_score=True additionally requests the
# out-of-bag score, exposed as `oob_score_` after fitting.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
)
bag_clf.fit(X_train, y_train)

print("OOB Score estimation {}".format(bag_clf.oob_score_))
bagging_accuracy = accuracy_score(y_true=y_test, y_pred=bag_clf.predict(X_test))
print("Accuracy Score: {:.3f}".format(bagging_accuracy))

OOB Score estimation 0.9253333333333333
Accuracy Score: 0.912


### Pasting:

In [6]:
# Pasting: same ensemble shape as the bagging cell (500 trees, 100 instances
# each) but with bootstrap=False, i.e. instances are drawn without replacement.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=False,
    n_jobs=-1,
)
bag_clf.fit(X_train, y_train)

pasting_accuracy = accuracy_score(y_true=y_test, y_pred=bag_clf.predict(X_test))
print("Accuracy Score: {:.3f}".format(pasting_accuracy))

Accuracy Score: 0.904


## Random Forest

In [42]:
# Variant 1: a bagging ensemble of randomly-split trees (splitter='random'),
# each capped at 16 leaves and trained on a full-size bootstrap sample.
random_tree = DecisionTreeClassifier(max_leaf_nodes=16, splitter='random')
bag_clf = BaggingClassifier(
    random_tree,
    n_estimators=500,
    max_samples=1.0,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
)
bag_clf.fit(X_train, y_train)
print("OOB Score estimation {:.3f}".format(bag_clf.oob_score_))
bag_accuracy = accuracy_score(y_true=y_test, y_pred=bag_clf.predict(X_test))
print("Accuracy Score: {:.3f}".format(bag_accuracy))

# Variant 2: the dedicated RandomForestClassifier with the same number of
# trees and the same leaf cap, reported in the same way for comparison.
rf_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    oob_score=True,
)
rf_clf.fit(X_train, y_train)
print("OOB Score estimation {:.3f}".format(rf_clf.oob_score_))
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_clf.predict(X_test))
print("Accuracy Score: {:.3f}".format(rf_accuracy))

OOB Score estimation 0.745
Accuracy Score: 0.713
OOB Score estimation 0.814
Accuracy Score: 0.807


## AdaBoost

In [8]:
# AdaBoost over 200 decision stumps (trees of depth 1) with a learning rate of 0.5.
stump = DecisionTreeClassifier(max_depth=1)
ada_clf = AdaBoostClassifier(
    stump,
    n_estimators=200,
    algorithm='SAMME.R',
    learning_rate=0.5,
)
ada_clf.fit(X_train, y_train)

ada_accuracy = accuracy_score(y_true=y_test, y_pred=ada_clf.predict(X_test))
print('Accuracy Score {:.3f}'.format(ada_accuracy))

Accuracy Score 0.896


## Gradient Boosting

In [9]:
# Synthetic 1-D regression problem: 100 inputs uniform in [-0.5, 0.5) and a
# quadratic target y = 3*x^2 plus Gaussian noise scaled by 0.05.
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
noise = 0.05 * np.random.randn(100)  # drawn after X, preserving the RNG call order
y = 3 * X[:, 0] ** 2 + noise

In [10]:
from sklearn.tree import DecisionTreeRegressor

# Hand-rolled gradient boosting with three depth-2 regression trees: each new
# tree is fitted to the residuals left by the trees before it.

# Stage 1: fit the raw targets.
tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg1.fit(X, y)

# Stage 2: fit what stage 1 failed to explain.
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg2.fit(X, y2)

# Stage 3: fit what stages 1 and 2 together failed to explain.
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg3.fit(X, y3)

X_new = np.array([[0.8]])

# Training-set error of the first tree alone versus the sum of all three trees.
single_tree_mse = mean_squared_error(y_true=y, y_pred=tree_reg1.predict(X))
print('DT Predicted MSE:{}'.format(single_tree_mse))

boosted_trees = (tree_reg1, tree_reg2, tree_reg3)
y_pred = sum([stage.predict(X) for stage in boosted_trees])
boosted_mse = mean_squared_error(y_true=y, y_pred=y_pred)
print('GTB Predicted MSE:{}'.format(boosted_mse))

DT Predicted MSE:0.013303033484734628
GTB Predicted MSE:0.005038058938993283


# Exercise



In [11]:
from sklearn.datasets import fetch_mldata
from sklearn.utils import shuffle
from sklearn import pipeline
from sklearn.preprocessing import scale, FunctionTransformer, MaxAbsScaler
from sklearn.dummy import DummyClassifier
import math

In [12]:
# Download (or load from the local cache) the MNIST digits and describe its shape.
# NOTE(review): fetch_mldata appears to be deprecated/removed in recent
# scikit-learn releases (fetch_openml is the documented replacement) — confirm
# against the installed version before re-running this cell.
mnist = fetch_mldata('MNIST original')

n_samples = mnist.data.shape[0]
n_features = mnist.data.shape[1]
print('MNIST contains {} samples of {} features each'.format(n_samples, n_features))

# The feature vector is treated as a flattened square image, so its side
# length is the square root of the feature count.
image_side = int(math.sqrt(n_features))
print('Each feature corresponds to a b/w intensity pixel in a picture of {0}x{0} pixels'.
      format(image_side))

MNIST contains 70000 samples of 784 features each
Each feature corresponds to a b/w intensity pixel in a picture of 28x28 pixels


In [13]:
# Shuffle images and labels together, then cut the rows into
# train / validation / test partitions of 50000 / 10000 / 10000.
X, y = shuffle(mnist.data, mnist.target)

train_rows = slice(0, 50000)
validation_rows = slice(50000, 60000)
test_rows = slice(60000, 70000)

X_train, X_validation, X_test = X[train_rows], X[validation_rows], X[test_rows]
y_train, y_validation, y_test = y[train_rows], y[validation_rows], y[test_rows]

In [14]:
# Re-split using only the first 7000 shuffled rows (5000 / 1000 / 1000),
# overriding any larger split made earlier — presumably to keep training
# times manageable; confirm the intent.
train_rows = slice(0, 5000)
validation_rows = slice(5000, 6000)
test_rows = slice(6000, 7000)

X_train, X_validation, X_test = X[train_rows], X[validation_rows], X[test_rows]
y_train, y_validation, y_test = y[train_rows], y[validation_rows], y[test_rows]

In [15]:
# Chance-level baseline: predictions are drawn at random following the class
# frequencies seen in the training labels. The strategy is named explicitly
# because the library default is not the same across scikit-learn releases,
# and the "Random accuracy" label below only makes sense for random guessing.
dummy_cl = DummyClassifier(strategy='stratified')
dummy_cl.fit(X_train, y_train)
y_pred = dummy_cl.predict(X_validation)

# Keyword arguments keep ground truth and predictions in the documented order.
print('Random accuracy {}'.format(accuracy_score(y_true=y_validation, y_pred=y_pred)))

Random accuracy 0.089


In [None]:
for i in [1,50,100,150,200, 255]:
    svm_cl = SVC()
    %time svm_cl.fit(X_train/i, y_train)
    y_pred = svm_cl.predict(X_validation/i)
    print('SVM accuracy standarised by {}: {}'.format(i, accuracy_score(y_pred, y_validation)))

In [16]:
# SVC with default settings on the raw (unscaled) pixel values, scored on the
# validation split.
svm_cl = SVC()
svm_cl.fit(X_train, y_train)
y_pred = svm_cl.predict(X_validation)
svm_accuracy = accuracy_score(y_true=y_validation, y_pred=y_pred)
print('SVM accuracy {}'.format(svm_accuracy))

SVM accuracy 0.11


In [17]:
def _divide_by_255(x):
    """Return the input divided by 255 (the scaling step of the SVM pipeline)."""
    return x/255


# Pipeline: divide the pixels by 255, then fit an SVC. probability=True is set
# so that the pipeline can later supply class probabilities.
transformer = FunctionTransformer(func=_divide_by_255)
svm_pipe = pipeline.Pipeline([
    ('scale', transformer),
    ('svm', SVC(probability=True)),
])
svm_pipe.fit(X_train, y_train)

y_pred = svm_pipe.predict(X_validation)
scaled_svm_accuracy = accuracy_score(y_true=y_validation, y_pred=y_pred)
print('Scaled SVM accuracy {}'.format(scaled_svm_accuracy))

Scaled SVM accuracy 0.92


In [18]:
# Random forest with default settings on the raw pixels, scored on the
# validation split.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_validation)
rf_accuracy = accuracy_score(y_true=y_validation, y_pred=y_pred)
print('RF accuracy {}'.format(rf_accuracy))

RF accuracy 0.902


In [19]:
# Extra-trees ensemble with default settings on the raw pixels, scored on the
# validation split.
et = ExtraTreesClassifier()
et.fit(X_train, y_train)
y_pred = et.predict(X_validation)
et_accuracy = accuracy_score(y_true=y_validation, y_pred=y_pred)
print('ET accuracy {}'.format(et_accuracy))

ET accuracy 0.895


In [21]:
# Combine the scaled-SVM pipeline, the random forest and the extra-trees model
# into voting ensembles, and compare label voting with probability voting on
# the validation split.
estimators = [('svm', svm_pipe), ('rf', rf), ('et', et)]

# Hard voting: members vote with their predicted labels.
hard_ensemble = VotingClassifier(estimators, voting='hard')
hard_ensemble.fit(X_train, y_train)
y_pred = hard_ensemble.predict(X_validation)
hard_accuracy = accuracy_score(y_true=y_validation, y_pred=y_pred)
print('Hard Ensemble accuracy {}'.format(hard_accuracy))

# Soft voting: members contribute their predicted class probabilities.
soft_ensemble = VotingClassifier(estimators, voting='soft')
soft_ensemble.fit(X_train, y_train)
y_pred = soft_ensemble.predict(X_validation)
soft_accuracy = accuracy_score(y_true=y_validation, y_pred=y_pred)
print('Soft Ensemble accuracy {}'.format(soft_accuracy))


Hard Ensemble accuracy 0.924
Soft Ensemble accuracy 0.951


### Evaluate in Test

In [22]:
# Score every individual model and both voting ensembles on the test split.
estimators = [
    ('svm', svm_pipe),
    ('rf', rf),
    ('et', et),
    ('hard', hard_ensemble),
    ('soft', soft_ensemble),
]
for label, fitted_model in estimators:
    y_pred = fitted_model.predict(X_test)
    test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print('{} accuracy {}'.format(label.upper(), test_accuracy))

SVM accuracy 0.899
RF accuracy 0.882
ET accuracy 0.889
HARD accuracy 0.908
SOFT accuracy 0.917


## Create a stack

 - First, use the base models (already trained on the training data) to predict on the validation and test sets:


In [23]:
# Build the inputs for a stacked model: one column per base model, holding
# that model's predicted label for every validation / test instance.
estimators = [('svm', svm_pipe), ('rf', rf), ('et', et)]

validation_columns = [base_model.predict(X_validation) for _, base_model in estimators]
test_columns = [base_model.predict(X_test) for _, base_model in estimators]

# Transpose so that rows are instances and columns are base models.
pred_validation = np.array(validation_columns).T
pred_test = np.array(test_columns).T

In [24]:
# Meta-learner: a random forest whose inputs are the base models' validation
# predictions and whose targets are the true validation labels.
meta = RandomForestClassifier()
meta.fit(X=pred_validation, y=y_validation)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [25]:
# Score the stacked model on the test split, using the base models' test
# predictions as its input features.
y_pred = meta.predict(pred_test)
meta_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Meta accuracy {}'.format(meta_accuracy))

Meta accuracy 0.88
