# Chapter 7

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import operator
import pandas as pd
from sklearn import datasets
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import _name_estimators, Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

plt.style.use({'figure.facecolor': 'white'})

## 7.2

In [2]:
print(np.argmax(np.bincount([0, 0, 1], weights=[0.2, 0.2, 0.6])))

ex = np.array([[0.9, 0.1],
               [0.8, 0.2],
               [0.4, 0.6]])
p = np.average(ex, axis=0, weights=[0.2, 0.2, 0.6])
print(p)
print(np.argmax(p))

1
[0.58 0.42]
0


In [3]:
class MajorityVoteClassfier(BaseEstimator, ClassifierMixin):
    def __init__(self, classifiers, vote="classlabel", weights=None):
        self.classifiers = classifiers
        self.named_classfiers = {key: value for key, value in _name_estimators((classifiers))}
        self.vote = vote
        self.weights = weights

    def fit(self, X, y):
        if self.vote not in {"probability", "classlabel"}:
            raise ValueError("vote must bi 'probability' or 'classlabel': got (vote=%r)" % self.vote)
        if self.weights and len(self.weights) != len(self.classifiers):
            raise ValueError("Number of classifiers and weights must bi equal; got %d weights, %d classifiers" % (len(self.weights), len(self.classifiers)))

        self.labelnc_ = LabelEncoder()
        self.labelnc_.fit(y)
        self.classes_ = self.labelnc_.classes_
        self.classifiers_ = []
        for clf in self.classifiers:
            fitted_clf = clone(clf).fit(X, self.labelnc_.transform(y))
            self.classifiers_.append(fitted_clf)
        return self

    def predict(self, X):
        if self.vote == "probability":
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:
            predictions = np.asarray([clf.predict(X) for clf in self.classifiers_]).T
            maj_vote = np.apply_along_axis(lambda x: np.argmax(np.bincount(x, weights=self.weights)), axis=1, arr=predictions)

        maj_vote = self.labelnc_.inverse_transform(maj_vote)
        return maj_vote

    def predict_proba(self, X):
        probas = np.asarray([clf.predict_proba(X) for clf in self.classifiers_])
        avg_proba = np.average(probas, axis=0, weights=self.weights)
        return avg_proba

    def get_params(self, deep=True):
        if not deep:
            return super(MajorityVoteClassfier, self).get_params(deep=False)
        else:
            out = self.named_classfiers.copy()
            for name, step in self.named_classfiers.item():
                for key, value in step.get_params(deep=True).items():
                    out["%s__%s" % (name, key)] == value
            return out

In [14]:
iris = datasets.load_iris()
X, y = iris.data[50:, [1, 2]], iris.target[50:]
le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1, stratify=y)

clf1 = LogisticRegression(penalty="l2", C=0.001, solver="lbfgs")
clf2 = DecisionTreeClassifier(max_depth=1, criterion="entropy", random_state=0)
clf3 = KNeighborsClassifier(n_neighbors=1, p=2, metric="minkowski")
pipe1 = Pipeline([["sc", StandardScaler()], ["clf", clf1]])
pipe3 = Pipeline([["sc", StandardScaler()], ["clf", clf3]])
clf_labels = ["Logistic regression", "Decision tree", "KNN"]
print("10-fold cross validation\n")
for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring="roc_auc")
    print("ROC AUC: %0.2f (+/- $0.2f) [%s]" % (scores.mean(), scores.std(), label))

print("="*10, "MajorityVoteClassfier", "="*10)

mv_clf = MajorityVoteClassfier(classifiers=[pipe1, clf2, pipe3])
clf_labels += ["Majority voting"]
all_clf = [pipe1, clf2, pipe3, mv_clf]
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring="roc_auc")
    print("ROC AUC: %0.2f (+/- $0.2f) [%s]".format(scores.mean(), scores.std(), label))

10-fold cross validation



TypeError: not all arguments converted during string formatting