In [1]:
from Model_Eval_helper import DataLoader, mean_encode
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler

from lightgbm import LGBMClassifier, LGBMRegressor
import gc

import matplotlib.pyplot as plt
import seaborn as sns

import os
from copy import deepcopy


In [3]:
# Instantiate the loader only while the name still refers to the imported class.
# The original unconditional rebinding made this cell fail on a second run,
# because by then `DataLoader` is the instance and `DataLoader()` calls it.
# The name is kept so the cells below that use `DataLoader.<method>()` still work.
if isinstance(DataLoader, type):
    DataLoader = DataLoader()

In [4]:
# Ask the loader for two feature lists; both are handed to train_classifier
# further down (as its meanenc_feats and cat_feats arguments).
mean_encoding_list, cat_feats = DataLoader.generate_special_features()

In [None]:
# Load the train (`data`) and test (`test`) sets; the %time magic reports how long the call takes.
%time data, test = DataLoader.import_train_test()

In [None]:
# Preview the first rows of the training data.
data.head()

In [None]:
def train_classifier(data,test, meanenc_feats, cat_feats):
    """
    Train a bagged LightGBM classifier with 3-fold stratified cross-validation.

    For every fold the validation rows and the test rows are mean-encoded
    together from the fold's training rows, a BaggingClassifier wrapping an
    LGBMClassifier is fitted, and out-of-fold / test predictions are collected.

    NOTE: this function relies on the module-level names `mean_encode` and
    `BaggingClassifier`; in this notebook BaggingClassifier is defined in a
    later cell, which must be executed before this function is called.

    Parameters
    ----------
    data : DataFrame with a 'TARGET' column (training set).
    test : DataFrame to predict on.
    meanenc_feats : list of column names to replace by their mean encoding.
    cat_feats : forwarded to the classifier as `categorical_feature`.

    Returns
    -------
    tuple
        (oof_preds, feature_importance_df, sub_preds, clf_list, val_x_df, val_y_df)
        where `oof_preds` are out-of-fold probabilities for `data`,
        `feature_importance_df` stacks the per-fold importances,
        `sub_preds` is the fold-averaged probability for `test`,
        `clf_list` holds the fitted classifier of every fold, and
        `val_x_df` / `val_y_df` are copies of the last fold's validation set.
    """
    excluded_feats = ['SK_ID_CURR','TARGET'] + ['prev_sum_CODE_REJECT_REASON_CLIENT','bureau_sum_CREDIT_ACTIVE_Active']

    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=90210)
    oof_preds = np.zeros(data.shape[0])
    sub_preds = np.zeros(test.shape[0])
    fold_importances = []  # one frame per fold, concatenated once after the loop

    scores = [] #fold scores
    clf_list = [] #get a list of the models we generate
    val_x_df, val_y_df = None, None
    last_fold = folds.n_splits - 1

    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(data, data['TARGET'])):
        trn, val = data.iloc[trn_idx], data.iloc[val_idx]

        # Encode validation and test rows in one pass, then split them apart again.
        val_test = pd.concat([val, test], axis=0, sort=False)
        val_size = val.shape[0]

        print ('doing mean_encoding')
        # `mean_encode` is imported directly; there is no `helper` module object in scope.
        trn, val_test = mean_encode(trn, val_test, meanenc_feats, 'TARGET', drop=True)
        features = [f_ for f_ in trn.columns if f_ not in excluded_feats]

        val = val_test.iloc[:val_size, :].copy(deep=True)
        # Slice from val_size onward (not -test_size:) so an empty test set stays empty.
        test_x = val_test[features].iloc[val_size:, :].copy(deep=True)

        trn_x, trn_y = trn[features], trn['TARGET']
        val_x, val_y = val[features], val['TARGET']

        model = LGBMClassifier(
            n_estimators=5000,
            learning_rate=0.03,
            num_leaves=26,
            metric='auc',
            colsample_bytree=0.3,
            subsample=0.9320,
            max_depth=4,
            reg_alpha=4.8299,
            reg_lambda=3.6335,
            min_split_gain=0.0068,
            min_child_weight=9.8138,
            silent=True,
            verbose=-1,
            n_jobs = 16,
            random_state = n_fold * 619,
            class_weight = {0:1,1:1.0122}
        )

        clf = BaggingClassifier(model, 3)

        clf.fit(trn_x, trn_y,
                eval_set= [(val_x, val_y)],
                eval_metric='auc', verbose=200, early_stopping_rounds=100,
                categorical_feature = cat_feats,
               )

        oof_preds[val_idx] = clf.predict_proba(val_x)[:, 1]
        sub_preds += clf.predict_proba(test_x)[:, 1] / folds.n_splits

        fold_score = roc_auc_score(val_y, oof_preds[val_idx])
        scores.append(fold_score)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, fold_score))

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = features
        fold_importance_df["importance_gain"] = clf.feature_importances_gain_
        fold_importance_df["importance_split"] = clf.feature_importances_split_
        fold_importance_df["fold"] = n_fold + 1
        fold_importances.append(fold_importance_df)

        clf_list.append(clf)
        #store the last fold's validation set so we can see what happened on some specific cases
        # (fold indices run 0..n_splits-1, so the last one is n_splits-1)
        if n_fold == last_fold:
            val_x_df, val_y_df = deepcopy(val_x), deepcopy(val_y)

        del clf, trn_x, trn_y, val_x, val_y
        del trn, val
        gc.collect()

    feature_importance_df = pd.concat(fold_importances, axis=0)
    print('Mean fold AUC : %.6f' % np.mean(scores))

    # Return only after every fold has run (the return used to sit inside the loop).
    return(oof_preds, feature_importance_df, sub_preds, clf_list, val_x_df, val_y_df)

In [None]:
# Run the cross-validated training and time it.
# NOTE(review): train_classifier uses BaggingClassifier, which is defined in a later cell
# of this notebook; that cell has to be executed first or this call raises NameError.
%time oof_preds, feature_importance_df, sub_preds, clf_list, val_x, val_y = train_classifier(data,test,mean_encoding_list, cat_feats)

In [None]:
# NOTE(review): scratch/debug call. No import visible in this notebook binds the name `helper`
# (only DataLoader and mean_encode are imported from Model_Eval_helper), and the mean_encode
# defined below needs four positional arguments, so this cell is expected to raise as written.
helper.mean_encode(0,1,2)

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
def mean_encode(train, val, features_to_encode, target, drop=False):
    """
    Add target-mean encodings of the given features to copies of `train` and `val`.

    For each feature a new column ``<feature>_mean_encode`` is created:

    * in the training copy it is computed out-of-fold (5 contiguous folds, no
      shuffling): every row gets the target mean of its category measured on
      the other four folds, or that fold's overall target mean when the
      category does not occur there;
    * in the validation copy every category gets the average of the five
      per-fold means, and categories never seen in `train` get the global
      target mean of `train`.

    NOTE: this definition shadows the `mean_encode` imported from
    Model_Eval_helper at the top of the notebook.

    Parameters
    ----------
    train, val : DataFrames; neither is modified.
    features_to_encode : list of column names present in both frames.
    target : name of the target column in `train`.
    drop : if True, the original (unencoded) feature columns are removed.

    Returns
    -------
    (train_encode, val_encode) : the encoded copies.
    """
    train_encode = train.copy(deep=True)
    val_encode = val.copy(deep=True)
    # The global target mean does not depend on the feature, so compute it once.
    train_global_mean = train[target].mean()

    for feature in features_to_encode:
        encoded_name = feature + '_mean_encode'
        train_encode_map = pd.DataFrame(index = train[feature].unique())
        # Collect the out-of-fold values in a plain array and assign the column
        # once; chained `df[col].iloc[...] = ...` / `df[col].fillna(inplace=True)`
        # does not write through under pandas Copy-on-Write.
        oof_encoding = np.full(train.shape[0], np.nan)

        kf = KFold(n_splits=5, shuffle=False)
        for rest, this in kf.split(train):
            train_rest_global_mean = train[target].iloc[rest].mean()
            encode_map = train.iloc[rest].groupby(feature)[target].mean()

            fold_encoding = np.asarray(train[feature].iloc[this].map(encode_map), dtype=float)
            # Categories absent from the other folds fall back to those folds' mean.
            oof_encoding[this] = np.where(np.isnan(fold_encoding), train_rest_global_mean, fold_encoding)

            # One column per fold; only the newly added column can contain NaN here.
            train_encode_map = pd.concat((train_encode_map, encode_map), axis=1, sort=False)
            train_encode_map = train_encode_map.fillna(train_rest_global_mean)

        train_encode[encoded_name] = oof_encoding

        avg_map = train_encode_map.mean(axis=1)
        val_encoding = np.asarray(val[feature].map(avg_map), dtype=float)
        val_encode[encoded_name] = np.where(np.isnan(val_encoding), train_global_mean, val_encoding)

    if drop: #drop unencoded features
        train_encode = train_encode.drop(features_to_encode, axis=1)
        val_encode = val_encode.drop(features_to_encode, axis=1)
    return train_encode, val_encode

In [None]:
class BaggingClassifier(object):
    """
    Average several copies of a base estimator trained on down-sampled data.

    code copied and pasted from the lgbm1 notebook

    With ``n_estimators == 1`` a single copy is fitted on all rows. Otherwise
    the majority class is split into ``n_estimators`` shuffled folds and each
    copy is fitted on all minority rows plus one fold of majority rows.
    Feature importances and predicted probabilities are averaged over the
    fitted copies.

    The base estimator is expected to offer the LightGBM scikit-learn
    interface used below: ``fit(..., eval_set, eval_metric, verbose,
    early_stopping_rounds, categorical_feature)``, ``booster_``,
    ``best_iteration_`` and ``predict_proba(X, num_iteration=...)``.
    """
    def __init__(self, base_estimator, n_estimators):

        self.base_estimator_ = base_estimator
        self.n_estimators_ = n_estimators

    def _fit_one(self, X, y, eval_set, fit_kwargs):
        """Fit a fresh copy of the base estimator on (X, y) and add its share of the importances."""
        estimator = deepcopy(self.base_estimator_)
        # The training rows are always the first evaluation set, followed by the caller's sets.
        estimator.fit(X, y, eval_set = [(X, y)] + eval_set, **fit_kwargs)

        self.estimators_.append(estimator)
        booster = estimator.booster_
        self.feature_importances_gain_ += booster.feature_importance(importance_type='gain')/self.n_estimators_
        self.feature_importances_split_ += booster.feature_importance(importance_type='split')/self.n_estimators_

    def fit(self, X, y, eval_set = None, eval_metric = None, verbose = None, early_stopping_rounds = None, categorical_feature = None):
        """
        Fit ``n_estimators`` copies of the base estimator.

        X is indexed with ``.loc``/``.iloc`` and y with ``.nunique()`` /
        ``.value_counts()``, so pandas objects are expected. ``eval_set`` may
        be None or a list of ``(X, y)`` pairs; the remaining keyword arguments
        are forwarded to the base estimator's ``fit``.

        Returns self.
        """
        self.estimators_ = []
        self.feature_importances_gain_ = np.zeros(X.shape[1])
        self.feature_importances_split_ = np.zeros(X.shape[1])
        self.n_classes_ = y.nunique()

        # Accept the default eval_set=None instead of failing on list concatenation.
        eval_set = [] if eval_set is None else list(eval_set)
        fit_kwargs = dict(eval_metric = eval_metric, verbose = verbose,
                          early_stopping_rounds = early_stopping_rounds)

        if self.n_estimators_ == 1:
            print ('n_estimators=1, no downsampling')
            # Forward categorical_feature only when given, so the estimator's
            # own default still applies otherwise (as it did before).
            if categorical_feature is not None:
                fit_kwargs['categorical_feature'] = categorical_feature
            self._fit_one(X, y, eval_set, fit_kwargs)
            return self

        fit_kwargs['categorical_feature'] = categorical_feature

        #average down sampling results
        # Classes ordered by ascending frequency: rarest first.
        class_order = y.value_counts().sort_values().index.values
        minority, majority = class_order[0], class_order[1]
        print('majority class:', majority)
        print('minority class:', minority)

        X_min = X.loc[y==minority]
        y_min = y.loc[y==minority]
        X_maj = X.loc[y==majority]
        y_maj = y.loc[y==majority]

        kf = KFold(self.n_estimators_, shuffle=True, random_state=42)

        # Only the held-out part of each split is used: it is one slice of the majority class.
        for rest, this in kf.split(y_maj):

            print('training on a subset')
            X_sub = pd.concat([X_min, X_maj.iloc[this]])
            y_sub = pd.concat([y_min, y_maj.iloc[this]])

            self._fit_one(X_sub, y_sub, eval_set, fit_kwargs)

        return self

    def predict_proba(self, X):
        """Return class probabilities averaged over all fitted estimators (shape: rows x classes)."""
        n_samples = X.shape[0]
        proba = np.zeros([n_samples, self.n_classes_])

        for estimator in self.estimators_:

            proba += estimator.predict_proba(X, num_iteration=estimator.best_iteration_)/self.n_estimators_

        return proba