In [1]:
%matplotlib inline
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from keras import backend as K
from keras.models import Model, load_model, Sequential
from keras.utils import np_utils, multi_gpu_model, plot_model
from keras.callbacks import EarlyStopping
from keras.layers import Input, concatenate, Reshape, Flatten, Dense, Conv2D, MaxPool2D, Dropout, BatchNormalization
from keras.optimizers import Adam
from sklearn.externals import joblib

import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [2]:
# Mount Google Drive inside the Colab VM at /content/gdrive.
# Later cells read and write their pickles / .npy files under
# '/content/gdrive/My Drive/DL/FRD/'.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
#Imputation : median
def imputation_median(df):
  """Fill missing values of ``df`` in place.

  Columns whose dtype is ``uint8`` or ``bool`` are treated as categorical
  and filled with their most frequent value. Every other column is treated
  as numeric and filled with its median.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to impute. It is modified in place.

  Returns
  -------
  None
  """
  # cat/num split
  cat_features = df.select_dtypes(['uint8', 'bool']).columns.tolist()
  cat_lookup = set(cat_features)
  num_features = [col for col in df.columns if col not in cat_lookup]

  for col in num_features:
    df[col] = df[col].fillna(df[col].median())

  for col in cat_features:
    # Series.mode() returns a Series (possibly several tied values, or empty
    # when the column holds no values). Passing that Series to fillna would
    # align on index labels 0..k instead of filling with the modal value,
    # so pick the first mode as a scalar.
    modes = df[col].mode()
    if not modes.empty:
      df[col] = df[col].fillna(modes.iloc[0])

In [0]:
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from sklearn.datasets import make_classification
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from keras.callbacks import Callback, EarlyStopping

def auc_roc(y_true, y_pred):
    """Keras-compatible metric that reports a streaming ROC AUC.

    Wraps ``tf.contrib.metrics.streaming_auc`` (TF 1.x graph mode). The
    metric's local variables are registered as global variables, and the
    returned tensor is tied to the metric's update op.

    Parameters
    ----------
    y_true : tensor of ground-truth labels
    y_pred : tensor of predictions

    Returns
    -------
    Tensor holding the AUC value; evaluating it also runs the update op.
    """
    # streaming_auc takes (predictions, labels), hence the swapped order.
    auc_value, auc_update = tf.contrib.metrics.streaming_auc(y_pred, y_true)

    # Collect the local variables whose second name-scope component mentions
    # this metric's name.
    tracked_vars = list(
        filter(lambda var: 'auc_roc' in var.name.split('/')[1],
               tf.local_variables())
    )

    # Register them as global variables so they are initialized
    # for a new session.
    for var in tracked_vars:
        tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, var)

    # Reading the result forces the update op to run first.
    with tf.control_dependencies([auc_update]):
        refreshed = tf.identity(auc_value)
    return refreshed

In [5]:
#Call Datasets
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split

# SECURITY NOTE: pickle.load executes arbitrary code embedded in the file.
# Only load pickles that you produced yourself / fully trust.
with open('/content/gdrive/My Drive/DL/FRD/X_train_w_null_3.00.pkl', 'rb') as f:
    X = pickle.load(f)
with open('/content/gdrive/My Drive/DL/FRD/X_test_w_null_3.00.pkl', 'rb') as f:
    X_test = pickle.load(f)
with open('/content/gdrive/My Drive/DL/FRD/y.pkl', 'rb') as f:
    y = pickle.load(f)

# Fill missing values in place (median for numeric, mode for uint8/bool columns).
# NOTE(review): X is imputed before the train/validation split and X_test is
# imputed with its own statistics -- confirm this is intended.
imputation_median(X)
imputation_median(X_test)

# Split features and labels in ONE call so both are cut with the same row
# indices by construction, instead of relying on two separate calls happening
# to share test_size / random_state.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((413378, 1145), (177162, 1145), (413378,), (177162,))

In [0]:
#GPU Environment readiness
# Clone the LightGBM sources (including submodules) into ./LightGBM and
# install the Boost development packages. The following cells build
# LightGBM from this checkout with -DUSE_GPU=1.
!git clone --recursive -qq https://github.com/Microsoft/LightGBM
!apt-get install -y -qq libboost-all-dev

In [0]:
%%bash
# Build LightGBM with GPU (OpenCL) support inside LightGBM/build.
# Stop at the first failing command: without this, a missing checkout would
# let mkdir/cmake run in the wrong directory.
set -e
cd LightGBM
# -p keeps the cell re-runnable when the build directory already exists.
mkdir -p build
cd build
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
make -j$(nproc)

-- The C compiler identification is GNU 7.4.0
-- The CXX compiler identification is GNU 7.4.0
-- Check for working C compiler: /usr/bin/cc
-- Check for working C compiler: /usr/bin/cc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Found OpenMP_C: -fopenmp (found version "4.5") 
-- Found OpenMP_CXX: -fopenmp (found version "4.5") 
-- Found OpenMP: TRUE (found version "4.5")  
-- Looking for CL_VERSION_2_2
-- Looking for CL_VERSION_2_2 - not found
-- Looking for CL_VERSION_2_1
-- Looking for CL_VERSION_2_1 - not found
-- Looking for CL_VERSION_2_0
-- Looking for CL_VERSION_2_0 - not found
-- Looking for CL_VERSION_1_2
-- Looking

In [0]:
# Install the Python package from the LightGBM checkout.
# NOTE(review): --precompile presumably reuses the library built in the
# previous cell instead of compiling again -- confirm for the checked-out version.
!cd LightGBM/python-package/;python3 setup.py install --precompile
# Register the NVIDIA OpenCL driver by writing its library name into an ICD file.
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
# Remove the source checkout once the package is installed.
!rm -r LightGBM

running install
running build
running build_py
INFO:root:Generating grammar tables from /usr/lib/python3.6/lib2to3/Grammar.txt
INFO:root:Generating grammar tables from /usr/lib/python3.6/lib2to3/PatternGrammar.txt
creating build
creating build/lib
creating build/lib/lightgbm
copying lightgbm/engine.py -> build/lib/lightgbm
copying lightgbm/__init__.py -> build/lib/lightgbm
copying lightgbm/sklearn.py -> build/lib/lightgbm
copying lightgbm/basic.py -> build/lib/lightgbm
copying lightgbm/libpath.py -> build/lib/lightgbm
copying lightgbm/compat.py -> build/lib/lightgbm
copying lightgbm/plotting.py -> build/lib/lightgbm
copying lightgbm/callback.py -> build/lib/lightgbm
running egg_info
creating lightgbm.egg-info
writing lightgbm.egg-info/PKG-INFO
writing dependency_links to lightgbm.egg-info/dependency_links.txt
writing requirements to lightgbm.egg-info/requires.txt
writing top-level names to lightgbm.egg-info/top_level.txt
writing manifest file 'lightgbm.egg-info/SOURCES.txt'
reading man

In [0]:
#Model 6-1 Training : train with less features
%%time
import lightgbm as lgb
params={'learning_rate': 0.01,
        'objective': 'binary',
        'metric': 'auc',
        'num_threads': -1,
        'num_leaves': 256,
        'verbose': 2,
        'random_state': 42,
        'bagging_fraction': 1,
        'feature_fraction': 0.70,
        'n_estimators' : 10000,
        'device_type' : 'gpu',
        'lambda_l1' : 7e-02,
        'lambda_l2' : 7e-02
       }

model1 = lgb.LGBMClassifier(**params)
model1.fit(X = X_train, y = y_train, eval_set=[(X_val,y_val)],eval_metric  = 'auc', early_stopping_rounds = 500, verbose = True)
joblib.dump(model1, '/content/gdrive/My Drive/DL/FRD/model_1.pkl')

In [0]:
#Model 6-2 Training : train with strong constraints

%%time
import lightgbm as lgb
params={'learning_rate': 0.01,
        'objective': 'binary',
        'metric': 'auc',
        'num_threads': -1,
        'num_leaves': 256,
        'verbose': 2,
        'random_state': 42,
        'bagging_fraction': 1,
        'feature_fraction': 0.80,
        'n_estimators' : 10000,
        'device_type' : 'gpu',
        'lambda_l1' : 70e-02,
        'lambda_l2' : 70e-02
       }

model2 = lgb.LGBMClassifier(**params)
model2.fit(X = X_train, y = y_train, eval_set=[(X_val,y_val)],eval_metric  = 'auc', early_stopping_rounds = 500, verbose = True)
joblib.dump(model2, '/content/gdrive/My Drive/DL/FRD/model2.pkl')

In [0]:
#Model 6-3 Training : train with small num_leaves : 256 -> 128

%%time
import lightgbm as lgb
params={'learning_rate': 0.01,
        'objective': 'binary',
        'metric': 'auc',
        'num_threads': -1,
        'num_leaves': 128,
        'verbose': 2,
        'random_state': 42,
        'bagging_fraction': 1,
        'feature_fraction': 0.80,
        'n_estimators' : 10000,
        'device_type' : 'gpu',
        'lambda_l1' : 7e-02,
        'lambda_l2' : 7e-02
       }

model3 = lgb.LGBMClassifier(**params)
model3.fit(X = X_train, y = y_train, eval_set=[(X_val,y_val)],eval_metric  = 'auc', early_stopping_rounds = 500, verbose = True)
joblib.dump(model3, '/content/gdrive/My Drive/DL/FRD/model3.pkl')

In [0]:
#Model 6-4 Training : train with large num_leaves : 256 -> 512

%%time
import lightgbm as lgb
params={'learning_rate': 0.01,
        'objective': 'binary',
        'metric': 'auc',
        'num_threads': -1,
        'num_leaves': 512,
        'verbose': 2,
        'random_state': 42,
        'bagging_fraction': 1,
        'feature_fraction': 0.80,
        'n_estimators' : 10000,
        'device_type' : 'gpu',
        'lambda_l1' : 7e-02,
        'lambda_l2' : 7e-02
       }

model4 = lgb.LGBMClassifier(**params)
model4.fit(X = X_train, y = y_train, eval_set=[(X_val,y_val)],eval_metric  = 'auc', early_stopping_rounds = 500, verbose = True)
joblib.dump(model4, '/content/gdrive/My Drive/DL/FRD/model4.pkl')

In [0]:
#Model 6-5 Training : train with faster learning rate : 0.01 -> 0.05

%%time
import lightgbm as lgb
params={'learning_rate': 0.05,
        'objective': 'binary',
        'metric': 'auc',
        'num_threads': -1,
        'num_leaves': 256,
        'verbose': 2,
        'random_state': 42,
        'bagging_fraction': 1,
        'feature_fraction': 0.80,
        'n_estimators' : 10000,
        'device_type' : 'gpu',
        'lambda_l1' : 7e-02,
        'lambda_l2' : 7e-02
       }

model5 = lgb.LGBMClassifier(**params)
model5.fit(X = X_train, y = y_train, eval_set=[(X_val,y_val)],eval_metric  = 'auc', early_stopping_rounds = 500, verbose = True)
joblib.dump(model5, '/content/gdrive/My Drive/DL/FRD/model5.pkl')

[1]	valid_0's auc: 0.851126
Training until validation scores don't improve for 500 rounds
[2]	valid_0's auc: 0.871508
[3]	valid_0's auc: 0.878033
[4]	valid_0's auc: 0.882975
[5]	valid_0's auc: 0.885651
[6]	valid_0's auc: 0.891497
[7]	valid_0's auc: 0.893353
[8]	valid_0's auc: 0.896082
[9]	valid_0's auc: 0.896982
[10]	valid_0's auc: 0.89783
[11]	valid_0's auc: 0.899399
[12]	valid_0's auc: 0.900053
[13]	valid_0's auc: 0.902548
[14]	valid_0's auc: 0.90396
[15]	valid_0's auc: 0.904991
[16]	valid_0's auc: 0.905926
[17]	valid_0's auc: 0.907136
[18]	valid_0's auc: 0.908443
[19]	valid_0's auc: 0.909531
[20]	valid_0's auc: 0.910805
[21]	valid_0's auc: 0.912202
[22]	valid_0's auc: 0.913238
[23]	valid_0's auc: 0.914613
[24]	valid_0's auc: 0.915432
[25]	valid_0's auc: 0.916245
[26]	valid_0's auc: 0.916944
[27]	valid_0's auc: 0.918106
[28]	valid_0's auc: 0.918962
[29]	valid_0's auc: 0.919612
[30]	valid_0's auc: 0.920831
[31]	valid_0's auc: 0.922156
[32]	valid_0's auc: 0.923198
[33]	valid_0's auc: 0

In [0]:
#Generate base-model predictions (used as inputs for the stacking model)

In [0]:
#Learn from predictions with DL
#Call models
from sklearn.externals import joblib

# Read the five fitted LightGBM classifiers back from Drive, in order.
# (joblib.load unpickles the file -- only load files you trust.)
loaded_models = []
for filename in (
    '/content/gdrive/My Drive/DL/FRD/model_1.pkl',
    '/content/gdrive/My Drive/DL/FRD/model2.pkl',
    '/content/gdrive/My Drive/DL/FRD/model3.pkl',
    '/content/gdrive/My Drive/DL/FRD/model4.pkl',
    '/content/gdrive/My Drive/DL/FRD/model5.pkl',
):
    loaded_models.append(joblib.load(filename))

model1, model2, model3, model4, model5 = loaded_models

In [0]:
%%time
pred1 = model1.predict_proba(X_train)
pred2 = model2.predict_proba(X_train)
pred3 = model3.predict_proba(X_train)
pred4 = model4.predict_proba(X_train)
pred5 = model5.predict_proba(X_train)

X_train_stack = np.hstack((pred1,pred2,pred3,pred4,pred5))
np.save('/content/gdrive/My Drive/DL/FRD/X_train_stack',X_train_stack)

del pred1
del pred2
del pred3
del pred4
del pred5


pred1_val = model1.predict_proba(X_val)
pred2_val = model2.predict_proba(X_val)
pred3_val = model3.predict_proba(X_val)
pred4_val = model4.predict_proba(X_val)
pred5_val = model5.predict_proba(X_val)

X_val_stack = np.hstack((pred1_val,pred2_val,pred3_val,pred4_val,pred5_val))
np.save('/content/gdrive/My Drive/DL/FRD/X_val_stack',X_val_stack)

del pred1_val
del pred2_val
del pred3_val
del pred4_val
del pred5_val

CPU times: user 59min 23s, sys: 40.9 s, total: 1h 4s
Wall time: 17min 58s


In [0]:
# Score the test set with model1 and persist the probabilities directly;
# no intermediate array is kept in the notebook namespace.
np.save('/content/gdrive/My Drive/DL/FRD/pred1_test', model1.predict_proba(X_test))

In [0]:
# Score the test set with model2 and persist the probabilities directly;
# no intermediate array is kept in the notebook namespace.
np.save('/content/gdrive/My Drive/DL/FRD/pred2_test', model2.predict_proba(X_test))

In [0]:
# Score the test set with model3 and persist the probabilities directly;
# no intermediate array is kept in the notebook namespace.
np.save('/content/gdrive/My Drive/DL/FRD/pred3_test', model3.predict_proba(X_test))

In [0]:
# Score the test set with model4 and persist the probabilities directly;
# no intermediate array is kept in the notebook namespace.
np.save('/content/gdrive/My Drive/DL/FRD/pred4_test', model4.predict_proba(X_test))

In [0]:
# Score the test set with model5 and persist the probabilities directly;
# no intermediate array is kept in the notebook namespace.
np.save('/content/gdrive/My Drive/DL/FRD/pred5_test', model5.predict_proba(X_test))

In [0]:
# Reload the per-model test predictions saved by the previous cells and
# concatenate them column-wise into the stacking input for the test set.
test_pred_files = (
    '/content/gdrive/My Drive/DL/FRD/pred1_test.npy',
    '/content/gdrive/My Drive/DL/FRD/pred2_test.npy',
    '/content/gdrive/My Drive/DL/FRD/pred3_test.npy',
    '/content/gdrive/My Drive/DL/FRD/pred4_test.npy',
    '/content/gdrive/My Drive/DL/FRD/pred5_test.npy',
)
pred1_test, pred2_test, pred3_test, pred4_test, pred5_test = map(np.load, test_pred_files)

X_test_stack = np.hstack((pred1_test, pred2_test, pred3_test, pred4_test, pred5_test))
np.save('/content/gdrive/My Drive/DL/FRD/X_test_stack', X_test_stack)

In [0]:
# Restore the stacked feature matrices from Drive so the stacking model can
# be trained without recomputing the base-model predictions.
X_train_stack, X_val_stack, X_test_stack = (
    np.load(stack_path)
    for stack_path in (
        '/content/gdrive/My Drive/DL/FRD/X_train_stack.npy',
        '/content/gdrive/My Drive/DL/FRD/X_val_stack.npy',
        '/content/gdrive/My Drive/DL/FRD/X_test_stack.npy',
    )
)

In [0]:
def model_stacking(X_train, X_val, y_train, y_val,
                   learning_rate=1e-04, epochs=500, batch_size=512, patience=10):
  """Train a small dense network on stacked base-model predictions.

  Architecture: Dense(10, tanh) -> Dense(5, tanh) -> Dense(3, tanh)
  -> Dense(2, softmax). It is optimized with Adam on binary cross-entropy
  and reports the ``auc_roc`` metric. Training stops early once ``val_loss``
  has not improved for ``patience`` epochs, and the best weights are restored.

  Parameters
  ----------
  X_train, X_val : 2-D arrays of stacked predictions (samples x features).
      The input layer width is taken from ``X_train.shape[1]``.
  y_train, y_val : one-hot encoded labels with two columns, matching the
      two-unit softmax output.
  learning_rate : float, Adam learning rate (default 1e-04).
  epochs : int, maximum number of epochs (default 500).
  batch_size : int, mini-batch size (default 512).
  patience : int, early-stopping patience in epochs (default 10).

  Returns
  -------
  The fitted Keras ``Model``.
  """
  #Define stacked model
  # Derive the input width from the data instead of hard-coding 10, so the
  # function keeps working if the number of stacked columns changes.
  n_features = X_train.shape[1]
  inputs = Input(shape=(n_features,))
  H = Dense(10, activation='tanh')(inputs)
  H = Dense(5, activation='tanh')(H)
  H = Dense(3, activation='tanh')(H)
  outputs = Dense(2, activation='softmax')(H)

  #model params
  stacked_model = Model(inputs, outputs)
  adam = Adam(lr=learning_rate)
  es = EarlyStopping(monitor='val_loss', mode='auto', patience=patience,
                     restore_best_weights=True)

  #model train
  stacked_model.compile(loss='binary_crossentropy',
                        optimizer=adam, metrics=[auc_roc])

  # validation_data is documented as a tuple (x_val, y_val).
  stacked_model.fit(X_train, y_train, validation_data=(X_val, y_val),
                    epochs=epochs, batch_size=batch_size, shuffle=True, verbose=2,
                    callbacks=[es])
  return stacked_model


In [16]:
from keras.utils import to_categorical

# One-hot encode the binary labels into two columns, then fit the stacking
# network on the stacked base-model predictions.
y_train_encoded, y_val_encoded = (
    to_categorical(labels, 2) for labels in (y_train, y_val)
)
model_stacked = model_stacking(X_train_stack, X_val_stack, y_train_encoded, y_val_encoded)






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please switch to tf.metrics.auc. Note that the order of the labels and predictions arguments has been switched.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.

Train on 413378 samples, validate on 177162 samples
Epoch 1/500
 - 10s - loss: 0.0499 - auc_roc: 0.9988 - val_loss: 0.0632 - val_auc_roc: 0.9976
Epoch 2/500
 - 5s - loss: 0.0156 - auc_roc: 0.9975 - val_loss: 0.0604 - val_auc_roc: 0.9975
Epoch 3/500
 - 5s - loss: 0.0091 - auc_roc: 0.9974 - val_loss: 0.0616 - val_auc_ro