In [4]:
import numpy as np
import utils
from sklearn.metrics import roc_auc_score
from sklearn.base import BaseEstimator, ClassifierMixin
import utils
import classifiers
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from classifiers import spec_repr
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import KFold
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [10]:
class SVNSupermodel(BaseEstimator, ClassifierMixin):
    """
    Stacked ensemble: several SVM-style classifiers feed a superclassifier.

    The base classifiers are built by ``classifiers.get(specs, C)``. Their
    margins (as returned by ``utils.get_margins``) are rescaled column-wise
    by the mean absolute training margin and used as the feature matrix of
    ``superclassifier``.

    Parameters
    ----------
    specs : sequence or None
        Classifier specifications; ``None`` means
        ``utils.get_classifier_specs()``.
    C : sequence or None
        One regularization constant per spec; ``None`` means ``100`` for
        every spec.
    superclassifier : estimator or None
        Model fitted on the scaled margins; ``None`` means a fresh
        ``LogisticRegression()``.

    Attributes
    ----------
    clfs : base classifiers returned by ``classifiers.get``.
    margins : scaled training margins from the last ``fit`` (``None`` before).
    margin_mean_abs : per-column scale learned in ``fit`` (``None`` before).
    """

    def __init__(self, specs=None, C=None, superclassifier=None):
        # Defaults are resolved per instance. Putting them in the signature
        # would evaluate them once at class-definition time and make every
        # model share the same LogisticRegression object and the same C list.
        if specs is None:
            specs = utils.get_classifier_specs()
        if C is None:
            C = [100] * len(specs)
        if superclassifier is None:
            superclassifier = LogisticRegression()
        assert len(specs) == len(C), "Specs and regularization constants arrays lengths don't match"
        self.C = C
        self.specs = specs
        self.clfs = classifiers.get(self.specs, self.C)
        self.margins = None
        self.superclassifier = superclassifier
        self.margin_mean_abs = None

    def scale_train_margins(self, margins):
        """
        Learn the per-column scale (mean absolute margin) and apply it.

        The scale is stored in ``self.margin_mean_abs`` so the same factors
        can be reused on unseen data by ``scale_test_margins``.
        """
        margin_mean_abs = np.mean(np.abs(margins), axis=0)
        # A column of all-zero margins has scale 0; dividing by it would
        # produce NaN/inf, so such columns are left unscaled instead.
        margin_mean_abs = np.where(margin_mean_abs == 0, 1.0, margin_mean_abs)
        self.margin_mean_abs = margin_mean_abs
        return margins / margin_mean_abs

    def scale_test_margins(self, margins):
        """Scale margins with the factors learned by ``scale_train_margins``."""
        return margins / self.margin_mean_abs

    def _scaled_margins(self, X):
        """Margins of the base classifiers on X, scaled with the fitted factors."""
        # presumably get_margins returns one column per classifier — TODO confirm
        return self.scale_test_margins(utils.get_margins(self.clfs, X))

    def fit(self, X, y=None, verbose=True):
        """
        Fitting each SVM, calculating margins then fitting superclassifier

        Progress messages (and the superclassifier coefficients, when it
        exposes ``coef_``) are printed only if ``verbose`` is true.
        Returns ``self`` so calls can be chained.
        """
        for clf, spec in zip(self.clfs, self.specs):
            clf.fit(X, y)
            if verbose:
                print("Fitted %s" % spec_repr(spec))
        self.margins = self.scale_train_margins(utils.get_margins(self.clfs, X))
        if verbose:
            print("Margins calculated")
        self.superclassifier.fit(self.margins, y)
        # Not every superclassifier is a linear model, so coef_ may be absent.
        if verbose and hasattr(self.superclassifier, "coef_"):
            print("Logregr coefs", self.superclassifier.coef_)
        return self

    def predict(self, X, y=None):
        """Predict labels with the superclassifier on the scaled margins of X."""
        return self.superclassifier.predict(self._scaled_margins(X))

    def predict_proba(self, X, y=None, verbose=True):
        """
        Superclassifier class probabilities for X.

        When ``verbose`` is true (the default), per-column summary statistics
        of the scaled margins are printed first.
        """
        margins = self._scaled_margins(X)
        if verbose:
            print("margin mins", np.min(margins, axis=0))
            print("margin maxs", np.max(margins, axis=0))
            print("margin means", np.mean(margins, axis=0))
            print("margin abs means", np.mean(np.abs(margins), axis=0))
            print("margin stds", np.std(margins, axis=0))
        return self.superclassifier.predict_proba(margins)

    def score(self, X, y=None):
        """ROC AUC of the supermodel on (X, y); prints nothing."""
        # roc_auc_score needs one score per sample: take the second
        # probability column, exactly as score_stats does.
        return roc_auc_score(y, self.predict_proba(X, verbose=False)[:, 1])

    def score_stats(self, X, y=None, verbose=True):
        """
        Calculating ROC for each SVM and for supermodel

        Returns ``(scores, super_score)``: the list of per-classifier ROC AUC
        values (from ``decision_function``) and the supermodel's ROC AUC.
        """
        scores = []
        for clf, spec in zip(self.clfs, self.specs):
            scores.append(roc_auc_score(y, clf.decision_function(X)))
            if verbose:
                print("%s ROC: %f" % (spec_repr(spec), scores[-1]))
        super_score = roc_auc_score(y, self.predict_proba(X, verbose=verbose)[:, 1])
        if verbose:
            print("Superclassifier ROC: %f" % super_score)
        return scores, super_score

    def fit_score_stats(self, X_train, X_test, y_train, y_test, verbose=True):
        """
        Fitting and scoring combined

        Fits on the training split and returns ``score_stats`` of the test split.
        """
        self.fit(X_train, y_train, verbose=verbose)
        if verbose:
            print("Fitted")
        return self.score_stats(X_test, y_test, verbose=verbose)

    def cross_val_stats(self, X, y, cv_type="stratified", n_folds=5, verbose=True):
        """
        The function includes data splitting and scoring

        ``cv_type`` is ``"stratified"`` (StratifiedKFold) or ``"simple"``
        (KFold); anything else raises ``ValueError``. X and y are indexed
        with the fold index arrays. Returns one ``(scores, super_score)``
        tuple per fold, as produced by ``fit_score_stats``.
        """
        # NOTE(review): this is the legacy sklearn.cross_validation API
        # (splitter built from the labels / sample count and iterated
        # directly). That module was removed in scikit-learn 0.20.
        if cv_type == "stratified":
            cv = StratifiedKFold(y, n_folds=n_folds)
        elif cv_type == "simple":
            cv = KFold(np.size(y), n_folds=n_folds)
        else:
            raise ValueError("Invalid cv_type")
        return [
            self.fit_score_stats(X[train], X[test], y[train], y[test], verbose=verbose)
            for train, test in cv
        ]

In [11]:
# Supermodel built with the default constructor arguments; the "german"
# experiment cell further down calls cross_val_stats on this instance.
cls = SVNSupermodel()

## Logistic regression (superclassifier) coefficients are printed in the output below

In [12]:
# Cross-validate the existing `cls` supermodel on the "german" dataset only.
for name in ("german",):
    X, y, _ = utils.get_dataset(name)
    # Class balance: number of samples labelled 1, then labelled 0.
    print(np.count_nonzero(y == 1))
    print(np.count_nonzero(y == 0))
    print("TESTING %s" % name.upper())
    # Per-fold (scores, super_score) tuples from 5-fold stratified CV.
    x = cls.cross_val_stats(X, y, n_folds=5, cv_type="stratified")



300
700
TESTING GERMAN




Fitted Kernel type: linear
Fitted Kernel type: poly, degree = 2
Fitted Kernel type: poly, degree = 3
Fitted Kernel type: poly, degree = 4
Fitted Kernel type: ink, degree = 1, downer limit (a) = -3.000000
Fitted Kernel type: ink, degree = 2, downer limit (a) = -3.000000
Fitted Kernel type: rbf, gamma = 0.000100
Fitted Kernel type: rbf, gamma = 0.001000
Fitted Kernel type: rbf, gamma = 0.010000
Fitted Kernel type: rbf, gamma = 0.100000
Fitted Kernel type: rbf, gamma = 1.000000
Margins calculated
Logregr coefs [[ 0.00788161  0.22025555  0.86253403  1.02886066  0.7957329   0.60019978
  -0.01286558  0.00565579  0.29010771  1.6699392   1.81754941]]
Fitted
Kernel type: linear ROC: 0.758929
Kernel type: poly, degree = 2 ROC: 0.605595
Kernel type: poly, degree = 3 ROC: 0.698690
Kernel type: poly, degree = 4 ROC: 0.642024
Kernel type: ink, degree = 1, downer limit (a) = -3.000000 ROC: 0.685119
Kernel type: ink, degree = 2, downer limit (a) = -3.000000 ROC: 0.686429
Kernel type: rbf, gamma = 0.00

In [6]:
# Cross-validate a freshly constructed supermodel on every dataset in
# utils.datasets and keep the per-fold scores of each one.
cv_scores_by_dataset = {}
for name in utils.datasets:
    cls = SVNSupermodel()
    X, y, _ = utils.get_dataset(name)
    print("TESTING %s" % name.upper())
    x = cls.cross_val_stats(X, y, cv_type="stratified", n_folds=5)
    # `x` is rebound on every iteration (only the last dataset would
    # survive), so each dataset's result is also stored under its name.
    cv_scores_by_dataset[name] = x
    



TESTING GERMAN
Fitted Kernel type: linear
Fitted Kernel type: poly, degree = 2
Fitted Kernel type: poly, degree = 3
Fitted Kernel type: poly, degree = 4
Fitted Kernel type: ink, degree = 1, downer limit (a) = -3.000000
Fitted Kernel type: ink, degree = 2, downer limit (a) = -3.000000
Fitted Kernel type: rbf, gamma = 0.000100
Fitted Kernel type: rbf, gamma = 0.001000
Fitted Kernel type: rbf, gamma = 0.010000
Fitted Kernel type: rbf, gamma = 0.100000
Fitted Kernel type: rbf, gamma = 1.000000
Margins calculated
Logregr coefs [[-0.0111976   0.13809596  0.58744742  0.70042898  0.34585317  0.27016642
  -0.19996763 -0.06135211 -0.05824642  1.76757965  2.88933883]]
Fitted
Kernel type: linear ROC: 0.766429
Kernel type: poly, degree = 2 ROC: 0.597857
Kernel type: poly, degree = 3 ROC: 0.718095
Kernel type: poly, degree = 4 ROC: 0.658452
Kernel type: ink, degree = 1, downer limit (a) = -3.000000 ROC: 0.765595
Kernel type: ink, degree = 2, downer limit (a) = -3.000000 ROC: 0.762857
Kernel type: rb



Fitted Kernel type: linear
Fitted Kernel type: poly, degree = 2
Fitted Kernel type: poly, degree = 3
Fitted Kernel type: poly, degree = 4
Fitted Kernel type: ink, degree = 1, downer limit (a) = -3.000000
Fitted Kernel type: ink, degree = 2, downer limit (a) = -3.000000
Fitted Kernel type: rbf, gamma = 0.000100
Fitted Kernel type: rbf, gamma = 0.001000
Fitted Kernel type: rbf, gamma = 0.010000
Fitted Kernel type: rbf, gamma = 0.100000
Fitted Kernel type: rbf, gamma = 1.000000
Margins calculated
Logregr coefs [[  9.58353381e-01   2.74581240e-01   3.84503799e-01  -3.55302721e-03
    5.50467263e-01   3.23192205e-01  -7.77035248e-01   2.76369946e-01
    4.97221162e-02   1.26204314e+00   4.03227335e+00]]
Fitted
Kernel type: linear ROC: 0.970085
Kernel type: poly, degree = 2 ROC: 0.948224
Kernel type: poly, degree = 3 ROC: 0.950164
Kernel type: poly, degree = 4 ROC: 0.925079
Kernel type: ink, degree = 1, downer limit (a) = -3.000000 ROC: 0.978235
Kernel type: ink, degree = 2, downer limit (a)