In [1]:
import numpy as np
import utils
from sklearn.metrics import roc_auc_score
from sklearn.base import BaseEstimator, ClassifierMixin
import utils
import classifiers
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from classifiers import spec_repr
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import KFold
from sklearn.svm import SVC

In [2]:
class SVNSupermodel(BaseEstimator, ClassifierMixin):
    """
    Stacked ensemble: several SVMs whose margins feed a superclassifier.

    The base classifiers are built by ``classifiers.get`` from ``specs`` and
    ``C``. ``fit`` trains each of them, computes their margins with
    ``utils.get_margins`` and trains ``superclassifier`` on those margins.

    Parameters
    ----------
    specs : sequence, optional
        Classifier specs. Defaults to ``utils.get_classifier_specs()``.
    C : sequence, optional
        One regularization constant per spec. Defaults to 1 for every spec.
    superclassifier : estimator, optional
        Model fitted on the margins; it must provide ``fit``, ``predict`` and
        ``predict_proba``. Defaults to a new ``LogisticRegression()``.
    """

    def __init__(self, specs=None, C=None, superclassifier=None):
        # Defaults are resolved here, per instance, rather than in the
        # signature: signature defaults are evaluated once, so every model
        # would otherwise share (and refit) one LogisticRegression object.
        if specs is None:
            specs = utils.get_classifier_specs()
        if C is None:
            C = [1] * len(specs)
        if superclassifier is None:
            superclassifier = LogisticRegression()
        assert len(specs) == len(C), "Specs and regularization constants arrays lengths don't match"
        self.C = C
        self.specs = specs
        self.clfs = classifiers.get(self.specs, self.C)
        # Margins of the training data; filled in by fit().
        self.margins = None
        self.superclassifier = superclassifier

    def fit(self, X, y=None, verbose=True):
        """
        Fit each SVM, calculate margins, then fit the superclassifier.

        Prints progress messages when ``verbose`` is true.
        Returns ``self`` so that calls can be chained.
        """
        for clf, spec in zip(self.clfs, self.specs):
            clf.fit(X, y)
            if verbose:
                print("Fitted %s" % spec_repr(spec))
        self.margins = utils.get_margins(self.clfs, X)
        if verbose:
            print("Margins calculated")
        self.superclassifier.fit(self.margins, y)
        return self

    def predict(self, X, y=None):
        """Return the superclassifier's predictions on the margins of ``X``."""
        margins = utils.get_margins(self.clfs, X)
        return self.superclassifier.predict(margins)

    def predict_proba(self, X, y=None):
        """Return the superclassifier's ``predict_proba`` on the margins of ``X``."""
        margins = utils.get_margins(self.clfs, X)
        return self.superclassifier.predict_proba(margins)

    def score(self, X, y=None):
        """
        Return the ROC AUC of the supermodel on ``(X, y)``.

        Only column 1 of ``predict_proba`` is scored, exactly as in
        ``score_stats``; passing the whole 2-D probability array to
        ``roc_auc_score`` is not valid for binary targets.
        """
        return roc_auc_score(y, self.predict_proba(X)[:, 1])

    def score_stats(self, X, y=None, verbose=True):
        """
        Calculate ROC AUC for each SVM and for the supermodel.

        Returns ``(scores, super_score)``: the list of per-classifier scores
        (computed from ``decision_function``) and the supermodel score.
        """
        scores = []
        for clf, spec in zip(self.clfs, self.specs):
            prediction = clf.decision_function(X)
            scores.append(roc_auc_score(y, prediction))
            if verbose:
                print("%s ROC: %f" % (spec_repr(spec), scores[-1]))
        super_score = roc_auc_score(y, self.predict_proba(X)[:, 1])
        if verbose:
            print("Superclassifier ROC: %f" % super_score)
        return scores, super_score

    def fit_score_stats(self, X_train, X_test, y_train, y_test, verbose=True):
        """
        Fit on the training split, then return ``score_stats`` of the test split.
        """
        self.fit(X_train, y_train, verbose=verbose)
        if verbose:
            print("Fitted")
        return self.score_stats(X_test, y_test, verbose=verbose)

    def cross_val_stats(self, X, y, cv_type="stratified", n_folds=5, verbose=True):
        """
        Split the data into folds, then fit and score the model on each fold.

        ``cv_type`` is "stratified" (StratifiedKFold) or "simple" (KFold);
        any other value raises ValueError. ``X`` and ``y`` are indexed with
        the index arrays produced by the splitter. ``verbose`` is forwarded
        to ``fit_score_stats``.

        Returns a list with one ``(clfs_scores, super_score)`` tuple per fold.
        """
        # Uses the sklearn.cross_validation splitter API imported by this file.
        if cv_type == "stratified":
            cv = StratifiedKFold(y, n_folds=n_folds)
        elif cv_type == "simple":
            cv = KFold(np.size(y), n_folds=n_folds)
        else:
            raise ValueError("Invalid cv_type")
        scores = []
        for train, test in cv:
            clfs_scores, super_score = \
                self.fit_score_stats(X[train], X[test], y[train], y[test], verbose=verbose)
            scores.append((clfs_scores, super_score))
        return scores

In [3]:
# Build the ensemble with its default specs, regularization constants and
# superclassifier; this instance is reused for every dataset below.
cls = SVNSupermodel()

In [4]:
# Run 5-fold stratified cross-validation of the ensemble on every dataset
# listed in utils.datasets. Per-dataset results are collected in `cv_results`;
# `x` alone is overwritten on each iteration and only keeps the last dataset.
cv_results = {}
for name in utils.datasets:
    # The third value returned by utils.get_dataset is not needed here.
    X, y, _ = utils.get_dataset(name)
    print("TESTING %s" % name.upper())
    x = cls.cross_val_stats(X, y, cv_type="stratified", n_folds=5)
    cv_results[name] = x
    

TESTING HOUSING
Fitted Kernel type: linear
Fitted Kernel type: poly, degree = 2
Fitted Kernel type: poly, degree = 3
Fitted Kernel type: poly, degree = 4
Fitted Kernel type: ink, degree = 1, downer limit (a) = -3.000000
Fitted Kernel type: ink, degree = 2, downer limit (a) = -3.000000
Fitted Kernel type: rbf, gamma = 0.000100
Fitted Kernel type: rbf, gamma = 0.001000
Fitted Kernel type: rbf, gamma = 0.001000
Fitted Kernel type: rbf, gamma = 0.010000
Fitted Kernel type: rbf, gamma = 0.100000
Fitted Kernel type: rbf, gamma = 1.000000
Margins calculated
Fitted
Kernel type: linear ROC: 0.930390
Kernel type: poly, degree = 2 ROC: 0.733506
Kernel type: poly, degree = 3 ROC: 0.938182
Kernel type: poly, degree = 4 ROC: 0.841558
Kernel type: ink, degree = 1, downer limit (a) = -3.000000 ROC: 0.943377
Kernel type: ink, degree = 2, downer limit (a) = -3.000000 ROC: 0.942338
Kernel type: rbf, gamma = 0.000100 ROC: 0.925714
Kernel type: rbf, gamma = 0.001000 ROC: 0.925714
Kernel type: rbf, gamma = 



TESTING GERMAN
Fitted Kernel type: linear
Fitted Kernel type: poly, degree = 2
Fitted Kernel type: poly, degree = 3
Fitted Kernel type: poly, degree = 4
Fitted Kernel type: ink, degree = 1, downer limit (a) = -3.000000
Fitted Kernel type: ink, degree = 2, downer limit (a) = -3.000000
Fitted Kernel type: rbf, gamma = 0.000100
Fitted Kernel type: rbf, gamma = 0.001000
Fitted Kernel type: rbf, gamma = 0.001000
Fitted Kernel type: rbf, gamma = 0.010000
Fitted Kernel type: rbf, gamma = 0.100000
Fitted Kernel type: rbf, gamma = 1.000000
Margins calculated
Fitted
Kernel type: linear ROC: 0.765119
Kernel type: poly, degree = 2 ROC: 0.565357
Kernel type: poly, degree = 3 ROC: 0.764048
Kernel type: poly, degree = 4 ROC: 0.686667
Kernel type: ink, degree = 1, downer limit (a) = -3.000000 ROC: 0.780714
Kernel type: ink, degree = 2, downer limit (a) = -3.000000 ROC: 0.785000
Kernel type: rbf, gamma = 0.000100 ROC: 0.769048
Kernel type: rbf, gamma = 0.001000 ROC: 0.769286
Kernel type: rbf, gamma = 0