In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
import time
from lightgbm import LGBMClassifier
import lightgbm as lgb

import warnings
warnings.simplefilter('ignore', UserWarning)

import gc
gc.enable()

In [None]:
# Directory prefixes for the raw competition files, the written submission and
# the pre-computed feature files. They are concatenated directly with file
# names, so a non-empty value must end with a path separator.
input_path = ""
submission_path = ""
feature_path = ""

print("Load numerical features")
# Column-wise stack of the numerical feature files. The "+" and "-" arithmetic
# combination files are intentionally left disabled.
df_num = pd.concat(
    [
        pd.read_feather(feature_path + "feature_num_features.ftr"),
        # pd.read_feather(feature_path + "feature_arithmetic+_combi2.ftr"),
        # pd.read_feather(feature_path + "feature_arithmetic-_combi2.ftr"),
        pd.read_feather(feature_path + "feature_arithmetic*_combi2.ftr"),
        pd.read_feather(feature_path + "feature_arithmeticdiv_combi2.ftr"),
        pd.read_feather(feature_path + "train_test_7th_df.ftr"),
        # Fixed: the "+" between feature_path and the file name was missing,
        # which made the whole cell a SyntaxError.
        pd.read_feather(feature_path + "train_test_oliverFE_df.ftr"),
    ],
    axis=1,
)
# Rows with a non-null TARGET; its length is used later to split train/test.
# NOTE(review): this assumes the labelled rows come first in the stacked
# frame and that only one input file carries a TARGET column -- confirm.
y_train = df_num["TARGET"].dropna()
df_num.drop(["TARGET"], axis=1, inplace=True)

print("Load categorical features")
# Column-wise stack of the label-encoded categorical feature files; the 3-way
# CODE_GENDER file is intentionally left disabled.
df = pd.concat(
    [
        pd.read_feather(feature_path + "feature_1way_label_encoding_with_te.ftr"),
        pd.read_feather(feature_path + "feature_2way_label_encoding_with_te.ftr"),
        # pd.read_feather(feature_path + "feature_3way_including_CODE_GENDER_label_encoding_with_te.ftr"),
        pd.read_feather(feature_path + "feature_round_num_label_encoding.ftr"),
    ],
    axis=1,
)
# Final feature matrix: categorical columns followed by numerical columns.
df = pd.concat([df, df_num], axis=1)

In [None]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of a dataframe in place to reduce memory usage.

    Object columns are converted to ``category``. Plain NumPy integer columns
    (signed or unsigned) are cast to the smallest signed integer type that
    holds their observed range, and float columns to the smallest float type
    whose range covers their observed min/max. A column is only ever cast to a
    strictly smaller type; columns that cannot be shrunk, and columns of any
    other dtype (bool, datetime, categorical, pandas extension dtypes), are
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    allow_float16 : bool, default True
        When True (the historical behaviour) floats whose range fits float16
        are stored as float16. The check is on range only, so precision may be
        lost; pass False to use float32 as the smallest float type.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    if allow_float16:
        float_candidates = (np.float16, np.float32)
    else:
        float_candidates = (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy int/uint/float columns have a meaningful numeric
        # range to downcast; everything else is skipped instead of crashing
        # on min()/max() or being silently turned into floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # Empty or all-NaN columns have no observed range to fit.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        target = None
        if col_type.kind in 'iu':
            # Python ints avoid mixed signed/unsigned NumPy comparisons.
            lo, hi = int(c_min), int(c_max)
            for candidate in int_candidates:
                info = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type limit still fits.
                if info.min <= lo and hi <= info.max:
                    target = candidate
                    break
        else:
            for candidate in float_candidates:
                info = np.finfo(candidate)
                if info.min <= c_min and c_max <= info.max:
                    target = candidate
                    break

        # Never upcast: only replace the column when the target is smaller.
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Shrink the combined feature matrix before it is split and fed to the model.
df = reduce_mem_usage(df)

In [None]:
# The first len(y_train) rows of the stacked frame go to the training set and
# the remaining rows to the test set.
X_train = df.iloc[:len(y_train)]
X_test = df.iloc[len(y_train):]
# Reload the labels from the raw training file so they carry a fresh index.
y_train = pd.read_csv(input_path + "train.csv")["TARGET"]

In [None]:
# Null-importance feature selection: keep a feature when its actual importance
# beats at least THRESHOLD percent of its importances in null_imp_df.
actual_imp_df = pd.read_csv(feature_path + "actual_imp_df.csv")
null_imp_df = pd.read_csv(feature_path + "null_imp_df.csv")

# Minimum percentage of null importances that the actual importance must exceed.
THRESHOLD = 97

# Group each frame once instead of running a string-built DataFrame.query per
# feature: the query form rescans both frames for every feature and breaks on
# feature names that contain quote characters.
actual_by_feature = {
    name: group.values
    for name, group in actual_imp_df.groupby("feature", sort=False)["importance"]
}
null_by_feature = {
    name: group.values
    for name, group in null_imp_df.groupby("feature", sort=False)["importance"]
}

# Collect the features that exceed the threshold.
imp_features = []
for feature in actual_imp_df["feature"]:
    actual_value = actual_by_feature.get(feature)
    null_value = null_by_feature.get(feature)
    # A feature with no null importances cannot be scored; it is not selected
    # (previously this produced a 0/0 NaN percentage, which also failed the
    # threshold comparison).
    if actual_value is None or null_value is None or null_value.size == 0:
        continue
    percentage = (null_value < actual_value).sum() / null_value.size * 100
    if percentage >= THRESHOLD:
        imp_features.append(feature)
len(imp_features)

In [None]:
# Extra feature file, stacked over train + test rows in the same row order as
# the main feature matrix (it is split at the same position, len(y_train)).
add_feature2 = pd.read_feather(feature_path + "github_feature2.fhr")

# Final model inputs: the selected important features plus the extra features.
train_df = pd.concat(
    [X_train[imp_features], add_feature2.iloc[:len(y_train)]],
    axis=1,
)
test_df = pd.concat(
    [X_test[imp_features], add_feature2.iloc[len(y_train):]],
    axis=1,
)

In [None]:
# Stratified K-fold training of LightGBM.
# oof_preds holds the out-of-fold prediction for every training row;
# sub_preds accumulates the fold-averaged prediction for every test row.
# Recorded run: 226 features at THRESHOLD=97 gave CV AUC 0.7639.
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
num_folds = 5
folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1001)

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, y_train)):
    train_x, train_y = train_df.iloc[train_idx], y_train.iloc[train_idx]
    valid_x, valid_y = train_df.iloc[valid_idx], y_train.iloc[valid_idx]

    # LightGBM parameters found by Bayesian optimization.
    # NOTE(review): per the LightGBM parameter docs, bagging_fraction only
    # takes effect when bagging_freq is non-zero; it is left as-is here so the
    # recorded CV score stays reproducible -- confirm the intent.
    clf = LGBMClassifier(
        #nthread=4,
        n_estimators=100000,  # upper bound; early stopping picks the real count
        learning_rate=0.005134,
        num_leaves=int(28.7),
        feature_fraction=0.2373,
        bagging_fraction=0.946,
        #colsample_bytree=0.508716,
        #subsample=0.8715623,
        max_depth=int(5.798),
        reg_alpha=2.605,
        reg_lambda=0.163,
        min_split_gain=0.07958,
        min_child_weight=5.833,
        silent=-1,
        verbose=-1,
    )

    # Evaluate AUC on both the training and validation folds every 200 rounds
    # and stop once the score has not improved for 200 rounds.
    clf.fit(
        train_x, train_y,
        eval_set=[(train_x, train_y), (valid_x, valid_y)],
        eval_metric='auc', verbose=200, early_stopping_rounds=200,
    )

    # Predict with the best iteration found by early stopping.
    oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
    sub_preds += clf.predict_proba(test_df, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
    # Release the per-fold model and slices before the next fold.
    del clf, train_x, train_y, valid_x, valid_y
    gc.collect()

# Overall out-of-fold score across all folds (the CV figure quoted above).
print('Full AUC score %.6f' % roc_auc_score(y_train, oof_preds))

# Write the submission from the sample file, replacing TARGET with the
# fold-averaged predictions. index=False keeps the pandas row index out of the
# file; without it an extra unnamed column is written.
result = pd.read_csv(input_path + "sample_submission.csv")
result["TARGET"] = sub_preds
result.to_csv(submission_path + "beyascv7639_226feat_5fold_LGBM.csv", index=False)