# The Staker......

In [None]:
import numpy as np
import pandas as pd
import os, time, gc

from sklearn.externals import joblib
from warnings import warn

from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, \
    f1_score
from sklearn.metrics import roc_curve, cohen_kappa_score, log_loss, \
    adjusted_mutual_info_score
from sklearn.metrics.regression import r2_score, mean_squared_error, \
    mean_absolute_error, explained_variance_score
from scipy.stats import entropy

import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve
import seaborn as sns


In [None]:
from sklearn.datasets import load_boston, load_breast_cancer, load_iris

def get_data(dataname):
    if dataname == 'iris':
        data = load_iris()
    elif dataname == 'boston':
        data = load_boston()
    elif dataname == 'cancer':
        data = load_breast_cancer()
    df = pd.concat([pd.DataFrame(data.data), pd.DataFrame(data.target)], axis=1)
    names = [i for i in data.feature_names]
    names.append('target')
    df.columns = names
    print(df.head())
    print(df.describe())
    return df


def mape_1(y_true, y_pred):
    abs_true = np.absolute(y_true)
    abs_pred = np.absolute(y_true - y_pred)
    n = y_true.shape[0]

    return np.mean((abs_pred / abs_true)) * 100


# todo:
# 1) percentage concordant discordant
# 2) kendal's tau
# 3) gamma
# 4) k
objectives = {
    'f1_score': f1_score,
    'accuracy': accuracy_score,
    'loss': log_loss,
    'cohen_kappa': cohen_kappa_score,
    'f1_score_multi': f1_score,
    'accuracy_multi': accuracy_score,
    'loss_multi': log_loss,
    'cohen_kappa_multi': cohen_kappa_score,
    '1_mape': mape_1,
    'mse': mean_squared_error,
    'mae': mean_absolute_error,
    'mi': adjusted_mutual_info_score,
    'kld': entropy
}


In [None]:
class ModelBlender:

    def __init__(self, problem='cls', training_ratio=0.7, blend_ratio=0.1,
                 holdout_ratio=0.2, stack_levels=2,
                 na_treatment='omit', sample_generation='random', shuffle=True,
                 n_cross=1, seed=None, n_jobs=1):
        """

        :param problem: cls(classification)(default), reg(regression) and clus(clustering),
                        Type of problem to generate statistics.

        :param training_ratio: 0 to 1, 0.7 (default), Ratio of training and
                                blending data
        :param blend_ratio: 0 to 1, 0.2 (default), Ratio to blend training data
        :param holdout_ratio: 0 to 1, 0.2 (default), Ratio of blending  data

        :param blended_samples: 2 (default), No of blending samples to make

        :param stack_levels: 2 (default), Levels of stacking

        :param na_treatment: 'omit'(default), 'impute', 'keep', 'keep_sep',
                            omit: drop the nan rows in data
                            impute: have to supply imputer/or imputtion function
                                    with the data
                            keep: will keep data as is
                            keep_sep: will keep data separately and not process
                                    in models

        :param sample_generation:'random' (default), 'sequential'
                            random: randomly sampling
                            seq:  sequential splitting, need to pass column for splitting

        :param shuffle: True(default), Boolean to indicate shuffling before
                        modeling individual models

        :param n_cross: Perform cross validation while model training

        :param seed: Random Seed

        :param n_jobs: 1 (default) Train and Score score in parallel fashion. Note individual
                        model should be fed with linear processing, else training
                        will not happen in sequential fashion only.

        Note: Sum of training_ration,blend_ratio and hold_out ratio should ideally add
              upto 1. If not training_ratio will be 1 - hold_out + blend_ration.
        """

        self.problem = problem
        self.training_ratio = training_ratio
        self.blend_ratio = blend_ratio
        self.holdout_ratio = holdout_ratio
        self.stack_levels = stack_levels
        self.na_treatment = na_treatment
        self.sample_generation = sample_generation
        self.shuffle = shuffle
        self.n_cross = n_cross
        self.seed = seed
        self.n_jobs = n_jobs

        if self.training_ratio + self.blend_ratio + self.holdout_ratio > 1:
            self.training_ratio = 1 - self.blend_ratio + self.holdout_ratio

        # data variables
        self._train = None
        self._blend = None
        self._blend_out = None
        self._hold_out = None
        self._na_data = None
        self._y_col = None

        # model variables
        self.modelstore = {'stack_{}'.format(i + 1): {} for i in
                           range(self.stack_levels)}
        self.stack_info = None
        self.model_scores = None
        # some basic stuff to do
        # todo: set random seed
        # todo: ensure n_jobs equal to 1 for sklearn model if n_jobs is -1
        # todo: write more fuctions as metrics

    def _get_metrics(self, y_true, y_pred, domain='Train', threshold=None):
        if self.problem == 'cls':
            y_cat_true = y_true
            y_pred_cat = y_pred
            fpr_test, tpr_test, thresholds_test = roc_curve(y_cat_true,
                                                            y_pred_cat,
                                                            pos_label=True)
            sum_sensitivity_specificity_test = tpr_test + (1 - fpr_test)
            best_threshold_id_test = np.argmax(sum_sensitivity_specificity_test)
            best_threshold = thresholds_test[best_threshold_id_test]

            if threshold is None:
                y_test = np.array(y_pred_cat >= best_threshold, 'uint8')
                threshold = best_threshold
            else:
                y_test = np.array(y_pred_cat >= threshold, 'uint8')

            cm_test = confusion_matrix(y_cat_true, y_test)
            acc_test = accuracy_score(y_cat_true, y_test)
            auc_test = roc_auc_score(y_cat_true, y_test)
            f1_score_test = f1_score(y_cat_true, y_test)

            print('{} Threshold       : {}'.format(domain, threshold))
            print('{} Cat. KLD        : {}'.format(domain, objectives['kld'](
                y_cat_true.reshape(-1, 1), y_test.reshape(-1, 1))))
            print('{} Accuracy        : {}'.format(domain,acc_test))
            print('{} AUC             : {}'.format(domain, auc_test))
            print('{} Cohe Kappa      : {}'.format(domain,
                                                   objectives['cohen_kappa'](
                                                       y_cat_true, y_test)))
            print('{} Log Loss        : {}'.format(domain, objectives['loss'](
                y_cat_true, y_test)))
            print('{} F1 Score        : {}'.format(domain,
                                                   objectives['f1_score'](
                                                       y_cat_true, y_test)))
            print('{} Confusion Matrix:'.format(domain))
            print(cm_test)
        elif self.problem == 'reg':
            print('{} MAPE            : {}'.format(domain,
                                                   objectives['1_mape'](y_true,
                                                                        y_pred)))
            print('{} MSE             : {}'.format(domain,
                                                   objectives['mse'](y_true,
                                                                     y_pred)))
            print('{} MAE             : {}'.format(domain,
                                                   objectives['mae'](y_true,
                                                                     y_pred)))
            print('{} Cont. KLD       : {}'.format(domain, objectives['kld'](
                y_true.reshape(-1, 1), y_pred.reshape(-1, 1))))
        elif self.problem == 'clus':
            pass
        else:
            pass

    def _get_prediction(self, model, X):

        if hasattr(model, "predict_proba"):
            prob_pos = model.predict_proba(X)[:, 1]
        else:  # use decision function
            prob_pos = model.decision_function(X)
            prob_pos = \
                (prob_pos - prob_pos.min()) / (
                        prob_pos.max() - prob_pos.min())
        return prob_pos

    def _get_data(self, domain='Train', stack_level=None, namescope='title'):

        if stack_level is None or stack_level == 0:
            if domain == 'Train' or domain == 'train':
                X = self._train.drop(self._y_col, axis=1)
                y = self._train[self._y_col].values
                name = domain.title() if namescope == 'title' else domain.lower()
                return X, y, name

            elif domain == 'blend' or domain == 'Blend':
                X = self._blend.drop(self._y_col, axis=1)
                y = self._blend[self._y_col].values
                name = domain.title() if namescope == 'title' else domain.lower()
                return X, y, name

            elif domain == 'holdout' or domain == 'hold_out' or domain == 'Holdout':
                X = self._hold_out.drop(self._y_col, axis=1)
                y = self._hold_out[self._y_col].values
                name = domain.title() if namescope == 'title' else domain.lower()
                return X, y, name
        else:
            X, y, _ = self._get_data(domain=domain, stack_level=None)

            models = self.modelstore['stack_{}'.format(stack_level)].items()
            pred_df = pd.DataFrame(index=X.index)
            for m_name, m_dict in models:
                model = m_dict['model']
                col_name = '{}_{}'.format(m_name, stack_level)
                pred_df[col_name] = self._get_prediction(model, X)

            X = pd.concat((X, pred_df), axis=1)
            name = domain.title() if namescope == 'title' else domain.lower()
            print('Stacking Done at level: {}'.format(stack_level))
            return X, y, name

    def _util_remove_dup(self, duplicate):
        final_list = []
        dups = []
        for num in duplicate:
            if num not in final_list:
                final_list.append(num)
            else:
                dups.append(num)
        # print(dups)
        return final_list

    def _plot_calibration(self, stack_level=1, domain='train'):

        plt.figure(figsize=(10, 10))
        ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
        ax2 = plt.subplot2grid((3, 1), (2, 0))

        X, y, _data_name = self._get_data(domain, stack_level=stack_level)
        ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
        for name, m_dict in self.modelstore[
            'stack_{}'.format(stack_level)].items():
            clf = m_dict['model']
            cols = m_dict['columns']
            prob_pos = self._get_prediction(clf, X[cols])
            fraction_of_positives, mean_predicted_value = \
                calibration_curve(y, prob_pos, n_bins=10)

            ax1.plot(mean_predicted_value, fraction_of_positives, "s-",
                     label="%s" % (name,))

            ax2.hist(prob_pos, range=(0, 1), bins=10, label=name,
                     histtype="step", lw=2)

        ax1.set_ylabel("Fraction of positives")
        ax1.set_ylim([-0.05, 1.05])
        ax1.legend(loc="lower right")
        ax1.set_title(
            'Calibration plots (reliability curve) for {} Data'.format(
                _data_name))

        ax2.set_xlabel("Mean predicted value")
        ax2.set_ylabel("Count")
        ax2.legend(loc="upper center", ncol=2)

        plt.tight_layout()
        plt.show()

    def fit_data(self, df, na_imputation=None, seq_column=None):

        """

        :param df: A Pandas Dataframe
        :param na_imputation: Imputer function
        :param seq_column: Sequential column
        :return:
        """
        X = df.copy(deep=True)
        shp = X.shape
        if na_imputation is None and self.na_treatment == 'impute':
            warn('Imputer not supplied! Will keep the data separately!')
            self._na_data = X[X.isnull().any(axis=1)]
            X.dropna(inplace=True)
        elif self.na_treatment == 'impute':
            X = na_imputation(X)
        elif self.na_treatment == 'omit':
            X.dropna(inplace=True)
        elif self.na_treatment == 'keep_sep':
            self._na_data = X[X.isnull().any(axis=1)]
            X.dropna(inplace=True)
        elif self.na_treatment == 'keep':
            pass

        if self.sample_generation == 'seq' and seq_column is None:
            warn('Column for sequential not supplied! Will do random sampling!')
            pass
        elif self.sample_generation == 'seq':
            pass
        else:
            self._hold_out = X.sample(frac=self.holdout_ratio,
                                      random_state=self.seed)
            X.drop(self._hold_out.index, inplace=True)
            self._train = X.sample(frac=self.training_ratio,
                                   random_state=self.seed)
            X.drop(self._train.index, inplace=True)

            indexes = self._train.index
            n_blend = int(np.ceil(self.blend_ratio * shp[0]))

            part = np.random.choice(indexes, size=n_blend,
                                    replace=False)
            blend = self._train.drop(part)
            blend = pd.concat((blend, X), axis=0)
            # blend = self._train.copy(deep=True)
            # blend.loc[part] = X
            self._blend = blend.copy(deep=True)
            del blend
            gc.collect()
            self._blend_out = X

        print('Data Splitting Summary:')
        print('Input Shape    : {} {}'.format(df.shape,
                                              np.sum(np.isnan(df.values))))
        print('Training Shape : {} {}'.format(self._train.shape,
                                              np.sum(np.isnan(
                                                  self._train.values))))
        print('Blend Shape    : {} {}'.format(self._blend.shape,
                                              np.sum(np.isnan(
                                                  self._blend.values))))
        print('Hold Out Shape : {} {}'.format(self._hold_out.shape,
                                              np.sum(np.isnan(
                                                  self._hold_out.values))))

    def fit_model(self, model_list, y_col, stack_level=None, threshold=None):

        """

        :param model_list: list to models to train on [(name,est)]
        :param y_col: target variable
        :param stack_level: stack_level at which model is to be trained
        :param threshold: threshold for classifier
        :return:
        """

        if y_col not in self._train.columns:
            raise ValueError('{} is not in fitted data!'.format(y_col))
        else:
            if self._y_col is None:
                self._y_col = y_col
            else:
                print(
                    'Depedent Variable changed from \'{}\' to \'{}\' !'.format(
                        self._y_col, y_col))
                self._y_col = y_col

        if stack_level is None and self.stack_info is None:
            self.stack_info = 1
            stack_level = 1

        if isinstance(model_list, list):  # or isinstance(model_list, tuple):
            n_models = len(model_list)
            print('{} models to _train on training dataset'.format(n_models))
            for m_name, m_est in model_list:
                print('Training for model no.: {}'.format(m_name))
                X, y, _ = self._get_data('train')
                cols = X.columns
                m_est.fit(X, y)
                pred = m_est.predict(X)
                self._get_metrics(y, pred,
                                  domain='Model {} Training'.format(m_name),
                                  threshold=threshold)

                X, y, _ = self._get_data('blend')
                pred = m_est.predict(X)
                self._get_metrics(y, pred,
                                  domain='Model {} on Blended'.format(m_name),
                                  threshold=threshold)

                self.modelstore['stack_{}'.format(stack_level)][m_name] \
                    = {'model': m_est, 'columns': cols, 'level': 1}

        self._plot_calibration(stack_level=1, domain='train')
        self._plot_calibration(stack_level=1, domain='blend')

    def stack_train(self, model_list, stack_level, threshold=None):

        """

        :param model_list: a list of tuples of [(name, estimator, columns)]
        :param model_config:
        :param stack_level: Level at which stacking need to be done.
        :return:
         Note before any training all previous stacking steps will be done.
         The nomenclature of columns to be used for stacking should be of
         consumed for either get_variable_name() or '{}_{}_{}'.format(
         name of estimator,stack_level,type of data)
        """

        if isinstance(model_list, list):
            X, y, _ = self._get_data('blend', stack_level)
            X_hold, y_hold, _ = self._get_data('hold_out', stack_level)
            for m_name, m_est, m_col in model_list:
                if len(m_col) == 0:
                    print('All columns will be used to fit {} model'.format(
                        m_name))
                    m_col = X.columns
                elif np.any([True if i in X.columns else False for i in m_col]):
                    raise ValueError('Few columns of {} are not in {}'.format(
                        m_col, X.columns))

                m_est.fit(X[m_col], y)
                pred = m_est.predict(X[m_col])
                self._get_metrics(y, pred,
                                  domain='Model {} Blended'.format(m_name),
                                  threshold=threshold)

                pred = m_est.predict(X_hold[m_col])
                self._get_metrics(y_hold, pred,
                                  domain='Model {} on Hold_out'.format(m_name),
                                  threshold=threshold)

                self.modelstore['stack_{}'.format(stack_level + 1)][m_name] \
                    = {'model': m_est, 'columns': m_col, 'level': 2}

        # self._plot_calibration(stack_level=1, domain='blend')
        # self._plot_calibration(stack_level=2, domain='holdout')

    def reblend(self):
        indexes = self._train.index
        shp = self._blend_out
        part = np.random.choice(indexes, size=int(shp[0]),
                                replace=False)
        blend = self._train.drop(part)
        blend = pd.concat((blend, self._blend_out), axis=0)
        # blend = self._train.copy(deep=True)
        # blend.loc[part] = X
        self._blend = blend.copy(deep=True)
        del blend
        gc.collect()

    def get_variable_name(self, stack_level):
        cols = []
        for i in self.modelstore['stack_{}'.format(stack_level)].keys():
            col = self.modelstore['stack_{}'.format(stack_level)][i]['columns']
            cols.extend(col)

        return self._util_remove_dup(cols)

    def plot_model(self, stack_level=None):
        pass

    def plot_scores(self, stack_level=None):
        pass

    def generate_stack(self, domain, stack_level=None):
        X, y, _ = self._get_data(domain, stack_level)
        return X, y

    def save(self):
        pass

    def load(self):
        pass

    def predict(self, stack_level=None):
        pass

    def score_on_holdout(self, stack_level=None, threshold=None):
        if stack_level is None:
            for stack in self.modelstore.keys():
                for m_name, m_dict in self.modelstore[stack].items():
                    # print(m_name)
                    # print(m_dict)
                    lvl = int(m_dict['level'])
                    m_est = m_dict['model']
                    m_col = m_dict['columns']
                    X, y, _ = self._get_data('holdout', lvl - 1)

                    pred = m_est.predict(X[m_col])
                    self._get_metrics(y, pred,
                                      domain='Model {} on Hold Out'.format(
                                          m_name),
                                      threshold=threshold)



### Lets use it...


In [None]:
# X = np.random.random((1000,10))
# X = get_data('boston')
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC


In [5]:
X, y = make_classification(n_samples=50000, n_features=50,
                           n_informative=10, n_redundant=10,
                           n_clusters_per_class=2,n_repeated=5,shift=1.0,
                           scale=2, random_state=5454, shuffle=True)
# clf = RandomForestClassifier(n_estimators=100, max_depth=2,
#                              random_state=0)
# clf.fit(X, y)

df = pd.DataFrame(X)
df['target'] = y

lr = LogisticRegression(solver='lbfgs')
lr2 = LogisticRegression(solver='lbfgs')
gnb = GaussianNB()
svc = LinearSVC(C=1.0)
rfc = RandomForestClassifier(n_estimators=50)
rfc2 = RandomForestClassifier(n_estimators=50)

stacker = ModelBlender()
stacker.fit_data(df)
stacker.fit_model([('lr', lr), ('gnb', gnb), ('svc', svc),('rfc', rfc)],
                  y_col='target')
# print(stacker.get_variable_name(1))
# print(stacker.modelstore)
stacker.stack_train([('lr2', lr2, []), ('rfc', rfc2, [])], stack_level=1)
# print(stacker.modelstore)
stacker.score_on_holdout()

Data Splitting Summary:
Input Shape    : (50000, 51) 0
Training Shape : (28000, 51) 0
Blend Shape    : (35000, 51) 0
Hold Out Shape : (10000, 51) 0
4 models to _train on training dataset
Training for model no.: lr
Model lr Training Threshold       : 1
Model lr Training Cat. KLD        : [inf]
Model lr Training Accuracy        : 0.6786428571428571
Model lr Training AUC             : 0.6786025037870071
Model lr Training Cohe Kappa      : 0.3572327843731622
Model lr Training Log Loss        : 11.099403724732054
Model lr Training F1 Score        : 0.6719889180519102
Model lr Training Confusion Matrix:
[[9785 4245]
 [4753 9217]]
Model lr on Blended Threshold       : 1
Model lr on Blended Cat. KLD        : [inf]
Model lr on Blended Accuracy        : 0.6809142857142857
Model lr on Blended AUC             : 0.6808376985603124
Model lr on Blended Cohe Kappa      : 0.36172678743222153
Model lr on Blended Log Loss        : 11.020950898582244
Model lr on Blended F1 Score        : 0.674003152548309