In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
spam = pd.read_csv("spambase.data", header=None)

In [3]:
from sklearn.model_selection import train_test_split
X = spam.loc[:,0:56]
y = spam.loc[:,57]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Voting

In [4]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GradientBoostingClassifier(random_state=1)

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'Boosting', 'Ensemble']):
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.92 (+/- 0.01) [Logistic Regression]
Accuracy: 0.94 (+/- 0.01) [Random Forest]
Accuracy: 0.95 (+/- 0.01) [Boosting]
Accuracy: 0.95 (+/- 0.01) [Ensemble]


In [7]:
from sklearn.metrics import accuracy_score
eclf.fit(X_train, y_train)
accuracy_score(y_test, eclf.predict(X_test))

0.94424330195510497

## Soft Voting

In [8]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from itertools import product
from sklearn.ensemble import VotingClassifier

# Training classifiers
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GradientBoostingClassifier(random_state=1)
eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)], voting='soft', weights=[1,3,3])

for clf, label in zip([clf1, clf2, clf3, eclf], ['Decision Tree', 'RF', 'SVM', 'Ensemble']):
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.90 (+/- 0.01) [Decision Tree]
Accuracy: 0.94 (+/- 0.01) [RF]
Accuracy: 0.95 (+/- 0.01) [SVM]
Accuracy: 0.94 (+/- 0.01) [Ensemble]


In [9]:
from sklearn.metrics import accuracy_score
eclf.fit(X_train, y_train)
accuracy_score(y_test, eclf.predict(X_test))

0.94207096307023896

## Soft Voting with Grid Search

In [11]:
from sklearn.model_selection import GridSearchCV
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GradientBoostingClassifier(random_state=1)
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')

params = {'lr__C': [1.0, 100.0], 'rf__n_estimators': [20, 200],'gnb': [0.01, 0.1]}

grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5, scoring='accuracy')
grid = grid.fit(X_train, y_train)

In [12]:
print("Accuracy: %0.3f" % (grid.best_score_)) 

Accuracy: 0.947


In [13]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, grid.predict(X_test))

0.95365677045619113

## Stacking

In [14]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import numpy as np

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GradientBoostingClassifier(random_state=1)
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], 
                          meta_classifier=lr)

print('3-fold cross validation:\n')

for clf, label in zip([clf1, clf2, clf3, sclf], 
                      ['Logistic', 
                       'Random Forest', 
                       'Boosting',
                       'StackingClassifier']):

    scores = model_selection.cross_val_score(clf, X, y, 
                                              cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))

3-fold cross validation:

Accuracy: 0.90 (+/- 0.02) [Logistic]
Accuracy: 0.93 (+/- 0.03) [Random Forest]
Accuracy: 0.93 (+/- 0.02) [Boosting]
Accuracy: 0.93 (+/- 0.03) [StackingClassifier]


In [15]:
from sklearn.metrics import accuracy_score
sclf.fit(X_train, y_train)
accuracy_score(y_test, sclf.predict(X_test))

0.93482983345401882

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from mlxtend.classifier import StackingClassifier

# Initializing models

clf1 = RandomForestClassifier(random_state=1)
clf2 = GradientBoostingClassifier(random_state=1, )
clf3 = AdaBoostClassifier(random_state=1)
clf4 = ExtraTreesClassifier(random_state=1)
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4], 
                          meta_classifier=lr)

params = {'randomforestclassifier__n_estimators': [50, 100],
          'extratreesclassifier__n_estimators': [50, 100],
          'gradientboostingclassifier__learning_rate': [0.01, 0.1],
          'meta-logisticregression__C': [0.1, 10.0]}

grid = GridSearchCV(estimator=sclf, 
                    param_grid=params, 
                    cv=5,
                    refit=True)
grid.fit(X_train, y_train)

cv_keys = ('mean_test_score', 'std_test_score', 'params')

for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r"
          % (grid.cv_results_[cv_keys[0]][r],
             grid.cv_results_[cv_keys[1]][r] / 2.0,
             grid.cv_results_[cv_keys[2]][r]))

print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)

0.950 +/- 0.00 {'gradientboostingclassifier__learning_rate': 0.01, 'meta-logisticregression__C': 0.1, 'extratreesclassifier__n_estimators': 50, 'randomforestclassifier__n_estimators': 50}
0.952 +/- 0.00 {'gradientboostingclassifier__learning_rate': 0.01, 'meta-logisticregression__C': 0.1, 'extratreesclassifier__n_estimators': 50, 'randomforestclassifier__n_estimators': 100}
0.952 +/- 0.00 {'gradientboostingclassifier__learning_rate': 0.01, 'meta-logisticregression__C': 10.0, 'extratreesclassifier__n_estimators': 50, 'randomforestclassifier__n_estimators': 50}
0.951 +/- 0.00 {'gradientboostingclassifier__learning_rate': 0.01, 'meta-logisticregression__C': 10.0, 'extratreesclassifier__n_estimators': 50, 'randomforestclassifier__n_estimators': 100}
0.950 +/- 0.00 {'gradientboostingclassifier__learning_rate': 0.1, 'meta-logisticregression__C': 0.1, 'extratreesclassifier__n_estimators': 50, 'randomforestclassifier__n_estimators': 50}
0.952 +/- 0.00 {'gradientboostingclassifier__learning_rat

In [17]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, grid.predict(X_test))

0.95800144822592326