In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from scipy.stats import norm
from copy import deepcopy
from tqdm import tqdm_notebook
import lightgbm as lgb

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
from lightgbm.sklearn import LGBMClassifier

In [10]:
LGBMClassifier?

In [5]:
# train_df = pd.read_csv('../input/train_df.csv', nrows=10000)
# feats = [f for f in train_df.columns if f not in [
#         'TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']]
X, y = train_df[feats], train_df['TARGET']

In [13]:
rf = LGBMClassifier('rf', subsample=0.75, subsample_freq=1, colsample_bytree=0.75)
rf.fit(X, y)

LGBMClassifier(boosting_type='rf', class_weight=None, colsample_bytree=0.75,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=0.75,
        subsample_for_bin=200000, subsample_freq=1)

In [2]:
from scipy.stats import norm
from tqdm import tqdm
import lightgbm as lgb
import numpy as np
import pandas as pd
import sys


class PIMP:
    """Permutaion Importance for correcting the biased RF feature importance.
    """

    def __init__(self, shuffle_times=50, alpha=0.05):
        self.__shuffle_times = shuffle_times
        self.__alpha = alpha

    # def fit(self, X, y):
    #     self.__clf.fit(X, y)
    #     actImp = self.__clf.feature_importances_
    #     # n_rows - #features, n_cols - #shuffle
    #     n_rows, n_cols = X.shape[1], self.__shuffle_times
    #     nullImp = np.zeros((n_rows, n_cols))
    #     for i in tqdm_notebook(range(n_cols)):
    #         clf = deepcopy(self.__clf)
    #         np.random.shuffle(y)
    #         clf.fit(X, y)
    #         nullImp[:, i] = clf.feature_importances_

    #     miu = np.mean(nullImp, axis=1)
    #     sigma = np.std(nullImp, axis=1)
    #     mean_sigma = np.mean(sigma)
    #     sigma[sigma < mean_sigma] = mean_sigma
    #     correctImp = np.zeros((n_rows,))

    #     for j in range(n_rows):
    #         normDist = norm(miu[j], sigma[j])
    #         correctImp[j] = 1 - normDist.cdf(actImp[j])

    #     self.__feature_importances = correctImp
    #     self.__significant_feats = np.argwhere(
    #         correctImp < self.__alpha).ravel()
    #     self.__actImp = actImp
    #     self.__nullImp = nullImp

    def run(self, X, y):
        data = lgb.Dataset(X, y, free_raw_data=False, silent=True)
        lgb_params = {
            'objective': 'binary',
            'boosting_type': 'rf',
            'subsample': 0.85,
            'colsample_bytree': 0.5,
            'num_leaves': 127,
            'max_depth': -1,
            'bagging_freq': 1,
            'num_threads': 16,
        }
#         print('Obtaining ACTUAL IMP ...')
#         clf = lgb.train(params=lgb_params, train_set=data, num_boost_round=250)
#         actImp = clf.feature_importance('gain')
        # n_rows - #features, n_cols - #shuffle
        n_rows, n_cols = X.shape[1], self.__shuffle_times

        nullImp = np.zeros((n_rows, n_cols))
        print('SHUFFLE STAGE ...')
        print('=' * 65)
        for i in tqdm(range(n_cols)):
            np.random.shuffle(y)
            print(y[:100])
            data.set_label(y)
            clf = lgb.train(lgb_params, data, num_boost_round=250)
            nullImp[:, i] = clf.feature_importance('gain')

        miu = np.mean(nullImp, axis=1)
        sigma = np.std(nullImp, axis=1)
        mean_sigma = np.mean(sigma)
        sigma[sigma < mean_sigma] = mean_sigma
        correctImp = np.zeros((n_rows,))

        for j in range(n_rows):
            normDist = norm(miu[j], sigma[j])
            correctImp[j] = 1 - normDist.cdf(actImp[j])

        self.__feature_importances = correctImp
        self.__significant_feats = np.argwhere(
            correctImp < self.__alpha).ravel()
        self.__actImp = actImp
        self.__nullImp = nullImp

    @property
    def feature_importances_(self):
        return self.__feature_importances

    @property
    def significant_features_(self):
        return self.__significant_feats

    @property
    def original_feature_importances_(self):
        return self.__actImp

    @property
    def null_feature_importances_(self):
        return self.__nullImp

In [6]:
def is_skip_i(i):
    if np.random.uniform() > 0.45:
        return True

In [7]:
data_file_path = '../workdir/withAggFeatsIn/train_df.csv'
train_df = pd.read_csv(data_file_path, skiprows=is_skip_i)
# train_df = train_df.sample(frac=0.5, replace=False)
feats = [f for f in train_df.columns if f not in [
    'TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']]
pImpCorrector = PIMP(2)
pImpCorrector.run(train_df[feats], train_df['TARGET'].values)
# pImpDf = pd.DataFrame(
#     {'feats': feats,
#      'pImp': pImpCorrector.feature_importances_,
#      'actImp': pImpCorrector.original_feature_importances_, }
# )
# pImpDf['is_significant'] = False
# pImpDf.loc[pImpCorrector.significant_features_, 'is_significant'] = True
# nullImpDf = pd.DataFrame(pImpCorrector.null_feature_importances_)
# nullImpDf.columns = pd.Index(['shuffle_%d' % (i + 1)
#                               for i in range(nullImpDf.shape[1])])
# df = pImpDf.join(nullImpDf)
# df.set_index('feats', inplace=True)
# df.to_csv('./PIMP_result.csv', index=False)

  0%|          | 0/2 [00:00<?, ?it/s]

SHUFFLE STAGE ...
[0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0.]


 50%|█████     | 1/2 [03:50<03:50, 230.02s/it]

[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1.
 0. 0. 0. 1.]


100%|██████████| 2/2 [07:05<00:00, 212.58s/it]


NameError: name 'actImp' is not defined

In [None]:
def plotDist(actImp, nullImp):
    n = len(actImp)
    plt.figure(figsize=(12, 36))
    for i in range(n):
        plt.subplot(int(np.ceil(n / 2)), 2, i + 1)
        height, _, _ = plt.hist(nullImp[i], label='nullImp')
        plt.vlines(x=actImp[i], ymin=0, ymax=height.max(), label='actImp')
        plt.legend()

# plotDist(pImp.original_feature_importances_, pImp.null_feature_importances_)