In [6]:
import numpy as np
import utils
from sklearn.metrics import roc_auc_score
from sklearn.base import BaseEstimator, ClassifierMixin
import utils
import classifiers
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from classifiers import spec_repr
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import KFold
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [7]:
def to_csv(superm, X, y, margins=None, ending=""):
    """Write labels, features and classifier margins to ``<dataset_name>_<ending>.csv``.

    Parameters
    ----------
    superm : object
        Supplies ``dataset_name`` (file-name prefix), ``specs`` (one margin
        column per spec, named via ``spec_repr``) and ``margins`` (used when
        the ``margins`` argument is not given).
    X : array-like, 2-D
        Feature matrix; its columns are written as ``X1`` ... ``Xk``.
    y : array-like or None
        Labels, written as a leading integer ``y`` column. When ``None``
        (e.g. probabilities requested for unlabelled data) the ``y`` column
        is omitted instead of raising.
    margins : array-like, 2-D, optional
        Margin matrix with one column per entry of ``superm.specs``.
        Defaults to ``superm.margins``.
    ending : str
        Suffix placed between ``"<dataset_name>_"`` and ``".csv"``.

    Returns
    -------
    pandas.DataFrame
        The frame that was written to disk.
    """
    name = superm.dataset_name + "_" + ending + ".csv"
    if margins is None:
        margins = superm.margins
    # np.asarray lets lists / pandas objects through; ndarrays are untouched.
    X = np.asarray(X)
    margins = np.asarray(margins)

    col_names = ["X" + str(i) for i in range(1, X.shape[1] + 1)]
    col_names += [spec_repr(val) for val in superm.specs]
    parts = [X, margins]

    has_labels = y is not None
    if has_labels:
        y = np.asarray(y)
        print(y.shape)
        # Labels become a single leading column so hstack can join them.
        parts.insert(0, y.reshape((y.shape[0], 1)))
        col_names.insert(0, "y")
    print(X.shape)
    print(margins.shape)

    df = pd.DataFrame(data=np.hstack(parts), index=None, columns=col_names)
    if has_labels:
        # hstack promotes everything to a common dtype; restore integer labels.
        df["y"] = df["y"].astype("int")
    df.to_csv(name, sep=",", header=True, index=False)
    return df

In [8]:
class SVNSupermodel(BaseEstimator, ClassifierMixin):
    """Stacked model: base classifiers whose scaled margins feed a superclassifier.

    The base classifiers are built by ``classifiers.get(specs, C)``. Their
    margins (from ``utils.get_margins``) are divided column-wise by the mean
    absolute training margin and used as features for ``superclassifier``.

    Parameters
    ----------
    specs : sequence, optional
        Classifier specifications. Defaults to ``utils.get_classifier_specs()``.
    C : sequence, optional
        One regularization constant per spec. Defaults to ``10`` for each spec.
    superclassifier : estimator, optional
        Model fitted on the scaled margins. Defaults to a fresh
        ``LogisticRegression()`` for every instance.
    dataset_name : str
        Prefix of the CSV files written by ``to_csv``.
    """

    def __init__(self,
                 specs=None,
                 C=None,
                 superclassifier=None,
                 dataset_name=""):
        # Defaults are resolved per instance: objects created in the signature
        # would be evaluated once and shared by every SVNSupermodel.
        if specs is None:
            specs = utils.get_classifier_specs()
        if C is None:
            C = [10] * len(specs)
        if superclassifier is None:
            superclassifier = LogisticRegression()
        assert len(specs) == len(C), "Specs and regularization constants arrays lengths don't match"
        self.C = C
        self.specs = specs
        self.clfs = classifiers.get(self.specs, self.C)
        self.margins = None
        self.superclassifier = superclassifier
        self.margin_mean_abs = None
        self.dataset_name = dataset_name

    def scale_train_margins(self, margins):
        """Divide each margin column by its mean absolute value and remember the scale."""
        margin_mean_abs = np.mean(np.abs(margins), axis=0)
        # An all-zero column would otherwise produce NaNs (0 / 0); dividing by 1
        # leaves such a column as zeros.
        margin_mean_abs = np.where(margin_mean_abs == 0, 1.0, margin_mean_abs)
        self.margin_mean_abs = margin_mean_abs
        return margins / margin_mean_abs

    def scale_test_margins(self, margins):
        """Apply the scale learned by ``scale_train_margins`` (requires a prior fit)."""
        return margins / self.margin_mean_abs

    def _test_margins(self, X):
        """Return the base classifiers' margins for ``X``, scaled like the training margins."""
        return self.scale_test_margins(utils.get_margins(self.clfs, X))

    def fit(self, X, y=None, verbose=True):
        """
        Fitting each SVM, calculating margins then fitting superclassifier

        The training features, labels and scaled margins are also written to
        ``<dataset_name>__train.csv`` through ``to_csv``. Returns ``self``.
        """
        for clf, spec in zip(self.clfs, self.specs):
            clf.fit(X, y)
            if verbose:
                print("Fitted %s" % spec_repr(spec))
        self.margins = self.scale_train_margins(utils.get_margins(self.clfs, X))
        if verbose:
            print("Margins calculated")
        self.superclassifier.fit(self.margins, y)
        to_csv(self, X, y, ending="_train")
        # Not every superclassifier exposes coef_, and a quiet fit should not print.
        if verbose and hasattr(self.superclassifier, "coef_"):
            print("Logregr coefs", self.superclassifier.coef_)
        return self

    def predict(self, X, y=None):
        """Predict labels with the superclassifier from the scaled margins of ``X``."""
        return self.superclassifier.predict(self._test_margins(X))

    def predict_proba(self, X, y=None, verbose=True):
        """Return the superclassifier's class probabilities for ``X``.

        With ``verbose`` the per-column statistics of the scaled margins are
        printed. When ``y`` is given, features, labels and margins are also
        written to ``<dataset_name>__test.csv``; without labels nothing is
        written.
        """
        margins = self._test_margins(X)
        if verbose:
            print("margin mins", np.min(margins, axis=0))
            print("margin maxs", np.max(margins, axis=0))
            print("margin means", np.mean(margins, axis=0))
            print("margin abs means", np.mean(np.abs(margins), axis=0))
            print("margin stds", np.std(margins, axis=0))
        if y is not None:
            to_csv(self, X, y, margins=margins, ending="_test")
        return self.superclassifier.predict_proba(margins)

    def score(self, X, y=None):
        """Return the ROC AUC of the superclassifier on ``(X, y)``.

        Uses the second probability column, as ``score_stats`` does, and
        neither prints diagnostics nor writes a CSV.
        """
        proba = self.superclassifier.predict_proba(self._test_margins(X))
        return roc_auc_score(y, proba[:, 1])

    def score_stats(self, X, y=None, verbose=True):
        """
        Calculating ROC for each SVM and for supermodel

        Returns ``(scores, super_score)``: the list of per-classifier ROC AUCs
        (computed from ``decision_function``) and the superclassifier's ROC AUC.
        """
        scores = []
        for clf, spec in zip(self.clfs, self.specs):
            scores.append(roc_auc_score(y, clf.decision_function(X)))
            if verbose:
                print("%s ROC: %f" % (spec_repr(spec), scores[-1]))
        super_score = roc_auc_score(y, self.predict_proba(X, y, verbose=verbose)[:, 1])
        if verbose:
            print("Superclassifier ROC: %f" % super_score)
        return scores, super_score

    def fit_score_stats(self, X_train, X_test, y_train, y_test, verbose=True):
        """
        Fitting and scoring combined
        """
        self.fit(X_train, y_train, verbose=verbose)
        if verbose:
            print("Fitted")
        return self.score_stats(X_test, y_test, verbose=verbose)

    def cross_val_stats(self, X, y, cv_type="stratified", n_folds=5, only=None,
                        random_state=None, verbose=True):
        """
        The function includes data splitting and scoring

        Parameters
        ----------
        cv_type : {"stratified", "simple"}
            Selects ``StratifiedKFold`` or ``KFold``; both shuffle the data.
        n_folds : int
            Number of folds.
        only : int, optional
            When given, only the fold with this zero-based index is evaluated.
        random_state : optional
            Forwarded to the splitter so the shuffled split can be repeated.
        verbose : bool
            Forwarded to ``fit_score_stats``.

        Returns
        -------
        list of ``(clfs_scores, super_score)`` tuples, one per evaluated fold.

        Raises
        ------
        ValueError
            If ``cv_type`` is not one of the two supported values.
        """
        if cv_type == "stratified":
            cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=random_state)
        elif cv_type == "simple":
            cv = KFold(np.size(y), n_folds=n_folds, shuffle=True, random_state=random_state)
        else:
            raise ValueError("Invalid cv_type")
        scores = []
        for num, (train, test) in enumerate(cv):
            if only is not None and num != only:
                continue
            clfs_scores, super_score = self.fit_score_stats(
                X[train], X[test], y[train], y[test], verbose=verbose)
            scores.append((clfs_scores, super_score))
        return scores

## Logistic regression coefficients (`Logregr coefs`) are printed here

In [None]:
# Full stratified 5-fold evaluation on the "german" dataset only.
for name in ("german",):
    cls = SVNSupermodel(dataset_name=name)
    X, y, _ = utils.get_dataset(name)
    # Class balance: number of positives, then number of negatives.
    print(np.count_nonzero(y == 1))
    print(np.count_nonzero(y == 0))
    print("TESTING %s" % name.upper())
    x = cls.cross_val_stats(X, y, n_folds=5, cv_type="stratified")

In [9]:
# Evaluate fold 3 of a shuffled stratified 5-fold split on every dataset.
# The shuffled split inside cross_val_stats is otherwise unseeded; seeding
# NumPy's global RNG is intended to make a re-run reproduce the same folds
# (NOTE(review): relies on scikit-learn falling back to the global RNG).
SEED = 0
np.random.seed(SEED)
# Scores per dataset; previously each iteration overwrote the last result.
cv_results = {}
for name in utils.datasets:
    cls = SVNSupermodel(dataset_name=name)
    X, y, _ = utils.get_dataset(name)
    print(np.std(X, axis=0))
    print("TESTING %s" % name.upper())
    # `x` still ends up holding the last dataset's scores, as before.
    x = cls.cross_val_stats(X, y, cv_type="stratified", n_folds=5, only=3)
    cv_results[name] = x
    

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
TESTING HOUSING
Fitted Kernel type: linear
Fitted Kernel type: poly, degree = 2
Fitted Kernel type: poly, degree = 3
Fitted Kernel type: poly, degree = 4
Fitted Kernel type: ink, degree = 1, downer limit (a) = -3.000000
Fitted Kernel type: ink, degree = 2, downer limit (a) = -3.000000
Fitted Kernel type: rbf, gamma = 0.000100
Fitted Kernel type: rbf, gamma = 0.001000
Fitted Kernel type: rbf, gamma = 0.010000
Fitted Kernel type: rbf, gamma = 0.100000
Fitted Kernel type: rbf, gamma = 1.000000
Margins calculated
(405,)
(405, 13)
(405, 11)
Logregr coefs [[ 0.19135577  0.3200176   0.42284158  0.39852269  0.39729712  0.30409658
  -0.14536909  0.10420262  0.14488711  0.86337574  3.16997977]]
Fitted
Kernel type: linear ROC: 0.930000
Kernel type: poly, degree = 2 ROC: 0.893684
Kernel type: poly, degree = 3 ROC: 0.966316
Kernel type: poly, degree = 4 ROC: 0.910000
Kernel type: ink, degree = 1, downer limit (a) = -3.000000 ROC: 0.966842
Kernel



Fitted Kernel type: linear
Fitted Kernel type: poly, degree = 2
Fitted Kernel type: poly, degree = 3
Fitted Kernel type: poly, degree = 4
Fitted Kernel type: ink, degree = 1, downer limit (a) = -3.000000
Fitted Kernel type: ink, degree = 2, downer limit (a) = -3.000000
Fitted Kernel type: rbf, gamma = 0.000100
Fitted Kernel type: rbf, gamma = 0.001000
Fitted Kernel type: rbf, gamma = 0.010000
Fitted Kernel type: rbf, gamma = 0.100000
Fitted Kernel type: rbf, gamma = 1.000000
Margins calculated
(3682,)
(3682, 57)
(3682, 11)
Logregr coefs [[ 0.70428986  0.27027114  0.22407011 -0.03707646  0.53947413  0.38367616
  -0.45821805  0.21063494  0.19928838  1.15300301  4.22540903]]
Fitted
Kernel type: linear ROC: 0.969102
Kernel type: poly, degree = 2 ROC: 0.952349
Kernel type: poly, degree = 3 ROC: 0.952771
Kernel type: poly, degree = 4 ROC: 0.940868
Kernel type: ink, degree = 1, downer limit (a) = -3.000000 ROC: 0.985975
Kernel type: ink, degree = 2, downer limit (a) = -3.000000 ROC: 0.984487




Fitted Kernel type: linear
Fitted Kernel type: poly, degree = 2
Fitted Kernel type: poly, degree = 3
Fitted Kernel type: poly, degree = 4
Fitted Kernel type: ink, degree = 1, downer limit (a) = -3.000000
Fitted Kernel type: ink, degree = 2, downer limit (a) = -3.000000
Fitted Kernel type: rbf, gamma = 0.000100
Fitted Kernel type: rbf, gamma = 0.001000
Fitted Kernel type: rbf, gamma = 0.010000
Fitted Kernel type: rbf, gamma = 0.100000
Fitted Kernel type: rbf, gamma = 1.000000
Margins calculated
(800,)
(800, 24)
(800, 11)
Logregr coefs [[ -1.78200608e-03   8.77337002e-02   7.40502017e-01   7.76033919e-01
    3.64327445e-01   2.45654582e-01  -1.97979796e-01  -2.19677396e-02
   -3.64505648e-02   1.87050068e+00   2.65857153e+00]]
Fitted
Kernel type: linear ROC: 0.750833
Kernel type: poly, degree = 2 ROC: 0.705357
Kernel type: poly, degree = 3 ROC: 0.726071
Kernel type: poly, degree = 4 ROC: 0.713214
Kernel type: ink, degree = 1, downer limit (a) = -3.000000 ROC: 0.754524
Kernel type: ink, d