In [1]:
%%capture
!pip3 install -U lightautoml

!pip3 install -U pandas

!pip install adabelief-tf==0.2.0 --no-cache-dir 

In [2]:
import pandas as pd
import numpy as np
import dill as pickle   
from matplotlib import pyplot as plt
import random
import datetime
import math
import gc
import os
import warnings
import seaborn as sns
import itertools
import multiprocessing
import joblib
warnings.simplefilter(action='ignore', category=FutureWarning)
import pickle
from pickle import load,dump

from tqdm import tqdm
import h5py
from matplotlib.ticker import MaxNLocator
from colorama import Fore, Back, Style

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

from sklearn.utils import class_weight 
from sklearn.utils.class_weight import compute_class_weight
import tensorflow_addons as tfa
from adabelief_tf import AdaBeliefOptimizer
import tensorflow as tf
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
tf.config.threading.set_inter_op_parallelism_threads(4)
from tensorflow import keras
from tensorflow.keras.models import Model, load_model,model_from_json
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, EarlyStopping
from tensorflow.keras.layers import Dense, Input, InputLayer, Add, Concatenate, Dropout, BatchNormalization, Conv1D, Reshape, Flatten
from tensorflow.keras.losses import binary_crossentropy
import tensorflow.keras.backend as K

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR
import torch.nn.functional as F

import warnings
warnings.filterwarnings('ignore')

In [3]:
GPU = True
try:
    import cupy, cudf
except ImportError:
    GPU = False
    
cudf = pd    

In [4]:
DEVICE = 'cpu'
EPOCHS = 15
BATCH_SIZE = 1024
LEARNING_RATE = 1e-3
NFOLDS = 5           #<-- Update
EARLY_STOPPING_STEPS = 10
EARLY_STOP = False

In [5]:
# VERSION NAME FOR SAVED MODEL FILES
VER = 1

# TRAIN RANDOM SEED
SEED = 62

# FILL NAN VALUE
NAN_VALUE = -127 # will fit in int8

# FOLDS PER MODEL
FOLDS = 5

In [6]:
def read_file(path="",usecols=None):
    if usecols is not None: 
        df = pd.read_parquet(path,columns = usecols)
    else: 
        df = pd.read_parquet(path)
        
    # REDUCE DTYPE FOR CUSTOMER AND DATE
    if GPU:
            df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    else:
            df['customer_ID'] = df['customer_ID'].str[-16:].apply(int, base=16).astype('int64')

    print('shape of data:', df.shape)
        
    return df

In [7]:
def process_and_feature_engineer(train):
    
    year = pd.to_numeric(train['S_2'].str[:4])
    month = pd.to_numeric(train['S_2'].str[5:7])
    
    train.S_2 = pd.to_datetime( train.S_2 )

    train["SDist"]=train.groupby("customer_ID")["S_2"].diff() / np.timedelta64(1, 'D')
    # Impute with average distance 30.53 days
    train['SDist'].fillna(30.53, inplace=True)
    
    train['S_2'] = year.mul(12).add(month).sub(24207).astype('int8')
    
    ###############################################################################
    
    train.drop(["B_29","S_9"], axis=1, inplace = True)
    
    ## For each customer, count all NaN in any row, and count all NaN in the last row. Later we will add it as two columns
    df_nan = (train.mul(0) + 1).fillna(0)
    df_nan['customer_ID'] = train['customer_ID']
    nan_sum = df_nan.groupby("customer_ID").sum().sum(axis=1)
    nan_last = df_nan.groupby("customer_ID").last().sum(axis=1)
    del df_nan
    gc.collect()
    
    ########################################################################
    
    #https://www.kaggle.com/competitions/amex-default-prediction/discussion/328514
    train.loc[(train.R_13==0) & (train.R_17==0) & (train.R_20==0) & (train.R_8==0), 'R_6'] = 0
    train.loc[train.B_39==-1, 'B_36'] = 0
    
    all_cols = [c for c in list(train.columns) if c not in  ['customer_ID','S_2']]
    cat_features = ['B_30','B_38','D_114','D_117','D_120', 'D_66','D_64','D_63', 'D_126']
    
    ######################################################################
    
    delta_cols = ['S_27', 'S_7', 'S_5', 'R_5', 'S_11', 'D_142', 'D_43', 'D_136', 'S_22', 'D_110', 'R_28', 'B_6', 
                  'D_102', 'D_130', 'D_46', 'D_137', 'B_1', 'D_96', 'R_14', 'D_72', 'S_12', 'S_13', 'D_88', 'B_24',
                  'S_25', 'D_44', 'B_9', 'B_17', 'B_42', 'D_53', 'D_58', 'R_11', 'D_138', 'B_4', 'B_3', 'R_16', 
                  'SDist', 'D_48', 'D_84', 'R_7', 'B_2', 'D_39', 'D_41', 'D_82', 'D_49', 'B_16']
    
    ### Add delta
    test_num_agg2 = train.groupby("customer_ID")[delta_cols].nth(-1) - train.groupby("customer_ID")[delta_cols].nth(-2)
    test_num_agg2 = test_num_agg2.fillna(0)
    test_num_agg2.columns = [x + '_delta' for x in test_num_agg2.columns]
    
    print('Delta features Done')
    
    #####################################################################
    
    test_s2_agg = train.groupby("customer_ID")[["S_2"]].agg(['min', 'max'])
    test_s2_agg.columns = ['_'.join(x) for x in test_s2_agg.columns]
    test_s2_agg['S_2_min'] = test_s2_agg['S_2_min'] + 12 - test_s2_agg['S_2_max']
    test_s2_agg.drop(['S_2_max'],inplace=True,axis=1)
    
    #########################################################################
    
    ####### NUM AGG FEATURES ######
    
    std_cols = ['S_16', 'D_108', 'D_39', 'D_141', 'S_22', 'R_8', 'B_40', 'P_2', 'D_55', 'R_10', 'D_65', 'D_106', 
                'D_81', 'B_3', 'D_138', 'S_8', 'D_75', 'D_94', 'B_15', 'B_10', 'B_27', 'D_82', 'D_78', 'B_5', 'B_6',
                'B_9', 'D_110', 'D_88', 'D_83', 'B_21', 'D_130', 'B_42', 'R_28', 'D_124', 'R_23', 'D_96', 'D_144', 
                'B_37', 'R_15', 'B_7', 'D_49', 'D_69', 'D_115', 'R_12', 'D_112', 'D_92', 'D_72', 'S_24', 'S_12', 
                'D_125', 'S_5', 'D_51', 'D_123', 'B_4', 'R_14', 'B_19', 'S_13', 'B_2', 'B_17', 'D_41', 'B_14', 'B_41',
                'B_39', 'D_47', 'S_20', 'R_3', 'D_87', 'D_84', 'S_15', 'B_12', 'D_137', 'B_23', 'SDist', 'D_45', 
                'S_18', 'R_20', 'P_3', 'R_7', 'D_61', 'B_16', 'S_23']

    test_num_agg_std = train.groupby(by=['customer_ID'])[std_cols].std()
    test_num_agg_std.columns = [x+'_std' for x in test_num_agg_std.columns]
    
    max_cols = ['D_58', 'R_12', 'B_28', 'B_40', 'S_23', 'B_2', 'D_47', 'D_134', 'D_94', 'R_5', 'D_62', 'S_26', 
                'D_42', 'D_79', 'S_11', 'R_14', 'B_41', 'D_50', 'D_60', 'S_22', 'B_11', 'D_75', 'D_51', 'R_28', 
                'D_61', 'D_132', 'P_3', 'SDist', 'D_76', 'S_20', 'D_92', 'D_121', 'D_43', 'D_127', 'R_21', 'B_21', 
                'B_5', 'D_48', 'S_24', 'D_84', 'D_55', 'S_15', 'D_110', 'D_56', 'D_133', 'S_12', 'D_65', 'D_54', 
                'R_9', 'B_42', 'S_18', 'D_93', 'D_111', 'D_53', 'D_82', 'D_59', 'B_4', 'R_23', 'D_108', 'D_70', 
                'D_96', 'D_141', 'B_15', 'D_77', 'B_1', 'D_125', 'D_102', 'D_41', 'D_49', 'D_80', 'B_13', 'D_105', 
                'D_86', 'D_142', 'D_52', 'D_39', 'B_22', 'D_45', 'S_8', 'B_3', 'D_122', 'D_69', 'S_6', 'D_123', 
                'B_12', 'B_14', 'D_124','P_2']
    
    test_num_agg_max = train.groupby(by=['customer_ID'])[max_cols].max()
    test_num_agg_max.columns = [x+'_max' for x in test_num_agg_max.columns]
    
    min_cols = ['R_27', 'D_46', 'S_7', 'D_118', 'R_12', 'B_11', 'S_5', 'B_5', 'D_76', 'D_47', 'D_119', 'R_2', 
                'D_70', 'D_48', 'B_16', 'R_28', 'D_53', 'D_111', 'D_51', 'B_13', 'P_4', 'D_56', 'S_19', 'B_6',
                'D_135', 'R_4', 'D_52', 'S_23', 'D_141', 'R_9', 'D_61', 'S_15', 'D_109', 'S_12', 'B_39', 'D_62',
                'D_132', 'D_112', 'B_20', 'R_6', 'D_110', 'D_104', 'B_32', 'D_91', 'D_140', 'B_40', 'B_9', 'R_8',
                'D_92', 'S_3', 'B_18', 'D_42', 'D_93', 'B_31', 'P_2', 'D_125', 'SDist', 'B_42', 'S_8', 'R_5', 
                'R_11', 'B_33', 'D_83', 'R_23', 'D_127', 'S_25', 'D_121', 'D_94', 'D_65', 'S_2', 'B_8', 'S_11', 
                'D_134', 'B_17', 'D_39', 'S_6', 'D_45', 'D_122', 'D_59', 'D_84']

    test_num_agg_min = train.groupby(by=['customer_ID'])[min_cols].min()
    test_num_agg_min.columns = [x+'_min' for x in test_num_agg_min.columns]
    
    quantile_cols = ['D_127', 'B_22', 'D_76', 'R_14', 'D_121', 'S_12', 'B_2', 'D_56', 'D_50', 'D_47', 'SDist', 
                     'R_5', 'R_4', 'D_87', 'D_128', 'D_139', 'D_45', 'D_48', 'D_59', 'S_18', 'D_65', 'S_15', 
                     'D_141', 'S_13', 'D_143', 'B_13', 'D_119', 'S_26', 'D_43', 'D_134', 'D_51', 'R_7', 'B_17', 
                     'D_94', 'B_3', 'D_122', 'S_3', 'D_91', 'D_72', 'P_4', 'B_40', 'R_8', 'S_24', 'B_10', 'D_60', 
                     'D_71', 'B_42', 'D_111', 'D_110', 'B_9', 'D_61', 'P_2', 'R_26', 'D_96', 'D_70', 'B_14', 'D_75',
                     'D_92', 'D_74', 'R_25', 'D_107', 'D_42', 'R_9', 'D_62', 'R_27', 'D_93', 'D_125', 'B_32', 'B_5',
                     'D_105', 'R_24', 'D_44']
    
    test_num_agg_quantile = train.groupby(by=['customer_ID'])[quantile_cols].quantile()
    test_num_agg_quantile.columns = [x+'_quantile' for x in test_num_agg_quantile.columns]
    
    last_cols = ['D_83', 'B_9', 'R_5', 'R_11', 'R_16', 'D_50', 'S_25', 'D_111', 'B_4', 'D_76', 'R_2', 
                 'S_24', 'S_3', 'P_2', 'D_119', 'B_15', 'R_9', 'B_24', 'D_140', 'D_52', 'SDist', 'D_79', 
                 'B_23', 'D_127', 'D_49', 'B_1', 'D_59', 'S_26', 'R_20', 'B_18', 'S_23', 'D_91', 'D_139', 'S_13',
                 'R_24', 'D_138', 'D_45', 'B_17', 'D_94', 'B_41', 'D_56', 'D_47', 'D_78', 'D_135', 'R_8', 'D_43', 
                 'R_3', 'D_86', 'B_40', 'S_8', 'B_3', 'D_41', 'S_27', 'S_12', 'D_124', 'D_54', 'R_18', 'B_20', 
                 'D_110', 'D_121', 'D_62', 'D_82', 'B_25', 'D_92', 'B_5', 'R_12', 'B_39', 'D_108',
                 'S_7', 'B_11', 'D_75', 'B_42', 'B_22', 'B_14', 'R_23', 'D_42', 'R_27', 'D_112', 'B_2', 
                 'D_93', 'D_122', 'D_133', 'D_106', 'R_25', 'B_33', 'D_141', 'B_6', 'D_51', 'D_96',
                 'S_6', 'D_46', 'B_31', 'S_20', 'R_7', 'B_7', 'B_10', 'R_1', 'D_60', 'D_61', 'B_13', 'R_10', 'D_129',
                 'D_53', 'R_4', 'D_39','R_14', 'S_16', 'B_28', 'D_58', 'D_48', 'D_55', 'D_88', 'D_69', 'S_5','D_44',
                 'P_3','D_131','B_37','D_144','D_102','B_8','D_73','S_17','D_130','S_22']
    
    test_num_agg_last = train.groupby(by=['customer_ID'])[last_cols].last()
    test_num_agg_last.columns = [x+'_last' for x in test_num_agg_last.columns]
    
    mean_cols = ['B_20', 'R_23', 'D_59', 'D_140', 'D_75', 'D_106', 'R_8', 'D_130', 'R_26', 'D_74', 'R_15', 'D_52', 
                 'D_54', 'R_9', 'D_121', 'D_113', 'S_26', 'B_14', 'D_50', 'S_12', 'S_15', 'D_93', 'B_32', 'D_73',
                 'D_111', 'D_108', 'D_77', 'R_21', 'SDist', 'B_9', 'B_17', 'D_141', 'S_5', 'D_96', 'B_21', 'R_1', 
                 'D_48', 'D_61', 'R_5', 'S_16', 'D_92', 'D_91', 'D_45', 'B_40', 'D_110', 'B_18', 'D_82', 'R_27', 
                 'S_3', 'D_71', 'B_42', 'R_16', 'P_2', 'D_44', 'D_145', 'D_122', 'B_4', 'D_55', 'B_15', 'D_104', 
                 'D_62', 'S_6', 'B_5', 'D_94', 'D_72', 'S_11', 'D_127', 'D_142', 'D_143', 'D_76', 'B_41', 'S_18',
                 'D_60', 'R_3', 'R_11', 'B_7', 'D_118', 'D_51', 'R_25', 'B_2', 'D_42', 'B_22', 'D_47', 'D_43', 
                 'B_12','D_41', 'D_144', 'R_14', 'S_17', 'B_23', 'D_56', 'S_22', 'B_10', 'B_3', 'B_1', 'B_28', 
                 'B_6', 'D_131', 'S_24', 'D_46', 'D_58', 'D_53', 'S_7']
    
    test_num_agg_mean = train.groupby(by=['customer_ID'])[mean_cols].mean()
    test_num_agg_mean.columns = [x+'_mean' for x in test_num_agg_mean.columns]
    
    first_cols = ['D_124', 'D_54', 'R_3', 'D_135', 'D_60', 'B_6', 'D_74', 'S_25', 'D_102', 'D_145', 'R_28', 'B_42',
                  'R_25', 'S_22', 'S_13', 'D_93', 'D_65', 'D_141', 'D_55', 'D_119', 'P_2', 'B_20', 'R_14', 'D_75',
                  'B_9', 'D_94', 'S_26', 'D_127', 'D_42', 'D_123', 'D_52', 'B_39', 'R_8', 'D_88', 'D_45', 'P_3', 
                  'B_19', 'D_76', 'D_105', 'D_92', 'D_78', 'B_17', 'R_9', 'D_62', 'D_134', 'D_56', 'B_16', 'D_118',
                  'D_82', 'D_110', 'D_142', 'D_121', 'D_47', 'D_91', 'D_61', 'D_53', 'D_46', 'D_51', 'D_132', 'S_8',
                  'D_96','B_5', 'D_50', 'SDist', 'D_41', 'D_144', 'D_48', 'R_12', 'B_18', 'B_8', 'B_3', 'S_5', 
                  'D_112', 'D_58', 'D_69', 'B_15']
    
    test_num_agg_first = train.groupby(by=['customer_ID'])[first_cols].first()
    test_num_agg_first.columns = [x+'_first' for x in test_num_agg_first.columns]
    
    test_num_agg = pd.concat([test_num_agg_quantile,test_num_agg_mean,test_num_agg_last,
                            test_num_agg_first,test_num_agg_max,test_num_agg_std,test_num_agg_min],axis=1)
    
    del test_num_agg_quantile,test_num_agg_mean,test_num_agg_last,test_num_agg_first,test_num_agg_max,test_num_agg_std,test_num_agg_min
    
    cols = list(test_num_agg.dtypes[test_num_agg.dtypes == 'float64'].index)
    test_num_agg.loc[:,cols] = test_num_agg.loc[:,cols].apply(lambda x: x.astype(np.float32))
    
    test_cat_agg = train.groupby("customer_ID")[cat_features].agg(['last', 'nunique'])
    test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]
    
    # Transform int64 columns to int32
    cols = list(test_cat_agg.dtypes[test_cat_agg.dtypes == 'int64'].index)
    test_cat_agg.loc[:,cols] = test_cat_agg.loc[:,cols].apply(lambda x: x.astype(np.int32))
    
    #add last statement date, statements count and "new customer" category (LT=0.5)
    test_date_agg = train.groupby("customer_ID")[["S_2","B_3","D_104"]].agg(['last','count'])
    test_date_agg.columns = ['_'.join(x) for x in test_date_agg.columns]
    test_date_agg.rename(columns = {'S_2_count':'LT'}, inplace = True)
    test_date_agg.loc[(test_date_agg.B_3_last.isnull()) & (test_date_agg.LT==1),'LT'] = 0.5
    test_date_agg.loc[(test_date_agg.D_104_last.isnull()) & (test_date_agg.LT==1),'LT'] = 0.5
    test_date_agg.drop(["B_3_last","D_104_last","B_3_count","D_104_count",'S_2_last'], axis=1, inplace = True)
    
    train= pd.concat([test_cat_agg,test_s2_agg,test_num_agg2,test_date_agg,test_num_agg],axis=1)
    del test_cat_agg,test_s2_agg,test_num_agg2,test_date_agg,test_num_agg
    
    print('shape after engineering1', train.shape )
    
    print('Aggregation Done')
                  
    ####################################################################################              
    
    cat_features = [f"{cf}_last" for cf in cat_features]
    
    for cat_col in cat_features:
        encoder = LabelEncoder()
        train[cat_col] = encoder.fit_transform(train[cat_col]) 
        
    print('Encoding done')
                  
    #######################################################################################              
        
    last_round_cols = ['D_42_last', 'B_17_last', 'B_3_last', 'D_62_last', 'B_18_last', 'R_12_last', 
                       'D_133_last', 'D_46_last', 'B_2_last', 'S_12_last', 'B_11_last', 'D_58_last',
                       'D_43_last', 'B_15_last', 'SDist_last', 'B_10_last', 'B_1_last', 'S_25_last', 
                       'D_112_last', 'S_3_last', 'R_1_last', 'D_47_last', 'D_50_last', 'D_55_last', 
                       'D_61_last', 'B_24_last', 'R_14_last', 'D_76_last', 'B_5_last', 'S_23_last', 
                       'D_48_last', 'D_45_last', 'B_42_last', 'B_6_last', 'D_110_last', 'S_5_last', 
                       'S_16_last', 'R_7_last', 'R_27_last', 'B_23_last', 'D_60_last', 'B_9_last', 
                       'B_28_last', 'D_88_last', 'D_69_last', 'D_121_last', 'S_24_last', 'D_56_last',
                       'D_119_last', 'D_41_last', 'B_39_last', 'D_52_last']
    
    for col in last_round_cols:
        train[col + '_round2'] = train[col].round(2)
        
    print('Rounding done') 
                  
    #####################################################################################              
    
    train['P_2_max-P_2_min'] = train['P_2_max']-train['P_2_min']
    train['P_2_max-P_2_min'] = train['P_2_max-P_2_min'].fillna(0) 
    
    train['D_44_last-B_37_last'] = train['D_44_last']-train['B_37_last']
    train['D_44_last-B_37_last'] = train['D_44_last-B_37_last'].fillna(0)
    
    train['D_44_last-P_2_last'] = train['D_44_last']-train['P_2_last']
    train['D_44_last-P_2_last'] = train['D_44_last-P_2_last'].fillna(0)
    
    train['D_44_last-P_3_last'] = train['D_44_last']-train['P_3_last']
    train['D_44_last-P_3_last'] = train['D_44_last-P_3_last'].fillna(0)
    
    train['B_23_last-P_2_last'] = train['B_23_last']-train['P_2_last']
    train['B_23_last-P_2_last'] = train['B_23_last-P_2_last'].fillna(0)
    
    train['B_23_last-P_3_last'] = train['B_23_last']-train['P_3_last']
    train['B_23_last-P_3_last'] = train['B_23_last-P_3_last'].fillna(0)
    
    train['P_3_last-D_131_last'] = train['P_3_last']-train['D_131_last']
    train['P_3_last-D_131_last'] = train['P_3_last-D_131_last'].fillna(0)
    
    train['P_2_last-D_131_last'] = train['P_2_last']-train['D_131_last']
    train['P_2_last-D_131_last'] = train['P_2_last-D_131_last'].fillna(0)
    
    train['B_17_last-P_2_last'] = train['B_17_last']-train['P_2_last']
    train['B_17_last-P_2_last'] = train['B_17_last-P_2_last'].fillna(0)
    
    train['B_17_last-P_3_last'] = train['B_17_last']-train['P_3_last']
    train['B_17_last-P_3_last'] = train['B_17_last-P_3_last'].fillna(0)
    
    train['B_14_last-P_2_last'] = train['B_14_last']-train['P_2_last']
    train['B_14_last-P_2_last'] = train['B_14_last-P_2_last'].fillna(0)
    
    train['B_11_last-P_2_last'] = train['B_11_last']-train['P_2_last']
    train['B_11_last-P_2_last'] = train['B_11_last-P_2_last'].fillna(0)
    
    train['B_14_last-P_3_last'] = train['B_14_last']-train['P_3_last']
    train['B_14_last-P_3_last'] = train['B_14_last-P_3_last'].fillna(0)
    
    train['B_11_last-P_3_last'] = train['B_11_last']-train['P_3_last']
    train['B_11_last-P_3_last'] = train['B_11_last-P_3_last'].fillna(0)
    
    train['B_2_last-P_2_last'] = train['B_2_last']-train['P_2_last']
    train['B_2_last-P_2_last'] = train['B_2_last-P_2_last'].fillna(0)
    
    train['B_2_last-P_3_last'] = train['B_2_last']-train['P_3_last']
    train['B_2_last-P_3_last'] = train['B_2_last-P_3_last'].fillna(0)
    
    train['D_48_last-B_2_last'] = train['D_48_last']-train['B_2_last']
    train['D_48_last-B_2_last'] = train['D_48_last-B_2_last'].fillna(0)
    
    train['D_48_last-B_1_last'] = train['D_48_last']-train['B_1_last']
    train['D_48_last-B_1_last'] = train['D_48_last-B_1_last'].fillna(0)
    
    train['D_42_last-P_2_last'] = train['D_42_last']-train['P_2_last']
    train['D_42_last-P_2_last'] = train['D_42_last-P_2_last'].fillna(0)
    
    train['D_42_last-P_3_last'] = train['D_42_last']-train['P_3_last']
    train['D_42_last-P_3_last'] = train['D_42_last-P_3_last'].fillna(0)
    
    train['D_39_last-P_2_last'] = train['D_39_last']-train['P_2_last']
    train['D_39_last-P_2_last'] = train['D_39_last-P_2_last'].fillna(0)
    
    train['D_39_last-P_3_last'] = train['D_39_last']-train['P_3_last']
    train['D_39_last-P_3_last'] = train['D_39_last-P_3_last'].fillna(0)
    
    train['B_9_last-P_2_last'] = train['B_9_last']-train['P_2_last']
    train['B_9_last-P_2_last'] = train['B_9_last-P_2_last'].fillna(0)
    
    train['B_9_last-P_3_last'] = train['B_9_last']-train['P_3_last']
    train['B_9_last-P_3_last'] = train['B_9_last-P_3_last'].fillna(0)
    
    train['B_4_last-D_62_last'] = train['B_4_last']-train['D_62_last']
    train['B_4_last-D_62_last'] = train['B_4_last-D_62_last'].fillna(0)
    
    train['P_3_last-S_23_last'] = train['P_3_last']-train['S_23_last']
    train['P_3_last-S_23_last'] = train['P_3_last-S_23_last'].fillna(0)
    
    train['P_2_last-S_23_last'] = train['P_2_last']-train['S_23_last']
    train['P_2_last-S_23_last'] = train['P_2_last-S_23_last'].fillna(0)
    
    train['P_3_last-S_16_last'] = train['P_3_last']-train['S_16_last']
    train['P_3_last-S_16_last'] = train['P_3_last-S_16_last'].fillna(0)
    
    train['P_2_last-S_16_last'] = train['P_2_last']-train['S_16_last']
    train['P_2_last-S_16_last'] = train['P_2_last-S_16_last'].fillna(0)
    
    print('Miscellaneous features Done')
                  
    ########################################################################              
    
    ####### Last-First ########
    
    train['S_5_last_first_diff'] = train['S_5_last'] - train['S_5_first']
    train['S_5_last_first_diff'] = train['S_5_last_first_diff'].fillna(0)
    
    train['R_14_last_first_diff'] = train['R_14_last'] - train['R_14_first']
    train['R_14_last_first_diff'] = train['R_14_last_first_diff'].fillna(0)
        
    train['SDist_last_first_diff'] = train['SDist_last'] - train['SDist_first']
    train['SDist_last_first_diff'] = train['SDist_last_first_diff'].fillna(0)
        
    train['D_112_last_first_diff'] = train['D_112_last'] - train['D_112_first']
    train['D_112_last_first_diff'] = train['D_112_last_first_diff'].fillna(0)
        
    train['B_3_last_first_diff'] = train['B_3_last'] - train['B_3_first']
    train['B_3_last_first_diff'] = train['B_3_last_first_diff'].fillna(0)
        
    train['B_9_last_first_diff'] = train['B_9_last'] - train['B_9_first']
    train['B_9_last_first_diff'] = train['B_9_last_first_diff'].fillna(0)  
                  
    train['D_144_last_first_diff'] = train['D_144_last'] - train['D_144_first']
    train['D_144_last_first_diff'] = train['D_144_last_first_diff'].fillna(0)  
                  
    train['B_15_last_first_diff'] = train['B_15_last'] - train['B_15_first']
    train['B_15_last_first_diff'] = train['B_15_last_first_diff'].fillna(0)       
      
    train['B_8_last_first_diff'] = train['B_8_last'] - train['B_8_first']
    train['B_8_last_first_diff'] = train['B_8_last_first_diff'].fillna(0)
                  
    train['D_102_last_first_diff'] = train['D_102_last'] - train['D_102_first']
    train['D_102_last_first_diff'] = train['D_102_last_first_diff'].fillna(0)  
                  
    train['D_41_last_first_diff'] = train['D_41_last'] - train['D_41_first']
    train['D_41_last_first_diff'] = train['D_41_last_first_diff'].fillna(0)  
                  
    train['D_48_last_first_diff'] = train['D_48_last'] - train['D_48_first']
    train['D_48_last_first_diff'] = train['D_48_last_first_diff'].fillna(0)  
                  
    train['D_69_last_first_diff'] = train['D_69_last'] - train['D_69_first']
    train['D_69_last_first_diff'] = train['D_69_last_first_diff'].fillna(0)  
                  
    train['R_12_last_first_diff'] = train['R_12_last'] - train['R_12_first']
    train['R_12_last_first_diff'] = train['R_12_last_first_diff'].fillna(0)  
                  
    train['D_50_last_first_diff'] = train['D_50_last'] - train['D_50_first']
    train['D_50_last_first_diff'] = train['D_50_last_first_diff'].fillna(0)  
                  
    train['B_17_last_first_diff'] = train['B_17_last'] - train['B_17_first']
    train['B_17_last_first_diff'] = train['B_17_last_first_diff'].fillna(0)  
                  
    train['D_141_last_first_diff'] = train['D_141_last'] - train['D_141_first']       
    train['D_141_last_first_diff'] = train['D_141_last_first_diff'].fillna(0)  
                  
    train['D_88_last_first_diff'] = train['D_88_last'] - train['D_88_first']
    train['D_88_last_first_diff'] = train['D_88_last_first_diff'].fillna(0)  
                  
    train['P_3_last_first_diff'] = train['P_3_last'] - train['P_3_first']
    train['P_3_last_first_diff'] = train['P_3_last_first_diff'].fillna(0)  
                  
    train['B_18_last_first_diff'] = train['B_18_last'] - train['B_18_first']
    train['B_18_last_first_diff'] = train['B_18_last_first_diff'].fillna(0)  
                  
    train['D_55_last_first_diff'] = train['D_55_last'] - train['D_55_first']
    train['D_55_last_first_diff'] = train['D_55_last_first_diff'].fillna(0)  
                  
    train['D_58_last_first_diff'] = train['D_58_last'] - train['D_58_first']
    train['D_58_last_first_diff'] = train['D_58_last_first_diff'].fillna(0)  
                  
    train['B_5_last_first_diff'] = train['B_5_last'] - train['B_5_first']
    train['B_5_last_first_diff'] = train['B_5_last_first_diff'].fillna(0)  
    
    print('Last-First Features Done')
                  
    ##################################################################################              
                  
    ####### last-mean #######        
                  
    train['D_58_last_mean_diff'] = train['D_58_last'] - train['D_58_mean']
    train['D_58_last_mean_diff'] = train['D_58_last_mean_diff'].fillna(0)
                  
    train['B_7_last_mean_diff'] = train['B_7_last'] - train['B_7_mean']
    train['B_7_last_mean_diff'] = train['B_7_last_mean_diff'].fillna(0)
                  
    train['B_14_last_mean_diff'] = train['B_14_last'] - train['B_14_mean']
    train['B_14_last_mean_diff'] = train['B_14_last_mean_diff'].fillna(0)
                  
    train['D_41_last_mean_diff'] = train['D_41_last'] - train['D_41_mean']
    train['D_41_last_mean_diff'] = train['D_41_last_mean_diff'].fillna(0)
                  
    train['D_73_last_mean_diff'] = train['D_73_last'] - train['D_73_mean']
    train['D_73_last_mean_diff'] = train['D_73_last_mean_diff'].fillna(0)
                  
    train['B_1_last_mean_diff'] = train['B_1_last'] - train['B_1_mean']
    train['B_1_last_mean_diff'] = train['B_1_last_mean_diff'].fillna(0)
                  
    train['D_55_last_mean_diff'] = train['D_55_last'] - train['D_55_mean']
    train['D_55_last_mean_diff'] = train['D_55_last_mean_diff'].fillna(0)
      
    train['D_48_last_mean_diff'] = train['D_48_last'] - train['D_48_mean']
    train['D_48_last_mean_diff'] = train['D_48_last_mean_diff'].fillna(0)
                  
    train['S_3_last_mean_diff'] = train['S_3_last'] - train['S_3_mean']
    train['S_3_last_mean_diff'] = train['S_3_last_mean_diff'].fillna(0)
                  
    train['R_14_last_mean_diff'] = train['R_14_last'] - train['R_14_mean']
    train['R_14_last_mean_diff'] = train['R_14_last_mean_diff'].fillna(0)
                  
    train['B_15_last_mean_diff'] = train['B_15_last'] - train['B_15_mean']
    train['B_15_last_mean_diff'] = train['B_15_last_mean_diff'].fillna(0)
                  
    train['B_23_last_mean_diff'] = train['B_23_last'] - train['B_23_mean']
    train['B_23_last_mean_diff'] = train['B_23_last_mean_diff'].fillna(0)
                  
    train['D_46_last_mean_diff'] = train['D_46_last'] - train['D_46_mean']
    train['D_46_last_mean_diff'] = train['D_46_last_mean_diff'].fillna(0)
                  
    train['B_9_last_mean_diff'] = train['B_9_last'] - train['B_9_mean']
    train['B_9_last_mean_diff'] = train['B_9_last_mean_diff'].fillna(0)
                  
    train['B_5_last_mean_diff'] = train['B_5_last'] - train['B_5_mean']
    train['B_5_last_mean_diff'] = train['B_5_last_mean_diff'].fillna(0)
                  
    train['D_131_last_mean_diff'] = train['D_131_last'] - train['D_131_mean']
    train['D_131_last_mean_diff'] = train['D_131_last_mean_diff'].fillna(0)
                  
    train['SDist_last_mean_diff'] = train['SDist_last'] - train['SDist_mean']
    train['SDist_last_mean_diff'] = train['SDist_last_mean_diff'].fillna(0)
                  
    train['S_12_last_mean_diff'] = train['S_12_last'] - train['S_12_mean']
    train['S_12_last_mean_diff'] = train['S_12_last_mean_diff'].fillna(0)
                  
    train['S_17_last_mean_diff'] = train['S_17_last'] - train['S_17_mean']
    train['S_17_last_mean_diff'] = train['S_17_last_mean_diff'].fillna(0)
                  
    train['D_121_last_mean_diff'] = train['D_121_last'] - train['D_121_mean']
    train['D_121_last_mean_diff'] = train['D_121_last_mean_diff'].fillna(0)
                  
    train['D_110_last_mean_diff'] = train['D_110_last'] - train['D_110_mean']
    train['D_110_last_mean_diff'] = train['D_110_last_mean_diff'].fillna(0)
                  
    train['D_141_last_mean_diff'] = train['D_141_last'] - train['D_141_mean']
    train['D_141_last_mean_diff'] = train['D_141_last_mean_diff'].fillna(0)
                  
    train['D_144_last_mean_diff'] = train['D_144_last'] - train['D_144_mean']
    train['D_144_last_mean_diff'] = train['D_144_last_mean_diff'].fillna(0)
                  
    train['S_24_last_mean_diff'] = train['S_24_last'] - train['S_24_mean']
    train['S_24_last_mean_diff'] = train['S_24_last_mean_diff'].fillna(0)
                  
    train['D_50_last_mean_diff'] = train['D_50_last'] - train['D_50_mean']
    train['D_50_last_mean_diff'] = train['D_50_last_mean_diff'].fillna(0)
                  
    train['B_6_last_mean_diff'] = train['B_6_last'] - train['B_6_mean']
    train['B_6_last_mean_diff'] = train['B_6_last_mean_diff'].fillna(0)
                  
    train['B_10_last_mean_diff'] = train['B_10_last'] - train['B_10_mean']
    train['B_10_last_mean_diff'] = train['B_10_last_mean_diff'].fillna(0)
                  
    train['D_53_last_mean_diff'] = train['D_53_last'] - train['D_53_mean']
    train['D_53_last_mean_diff'] = train['D_53_last_mean_diff'].fillna(0)
                  
    train['B_28_last_mean_diff'] = train['B_28_last'] - train['B_28_mean']
    train['B_28_last_mean_diff'] = train['B_28_last_mean_diff'].fillna(0)
                  
    train['S_22_last_mean_diff'] = train['S_22_last'] - train['S_22_mean']
    train['S_22_last_mean_diff'] = train['S_22_last_mean_diff'].fillna(0)
                  
    train['B_3_last_mean_diff'] = train['B_3_last'] - train['B_3_mean']
    train['B_3_last_mean_diff'] = train['B_3_last_mean_diff'].fillna(0)
                  
    train['D_56_last_mean_diff'] = train['D_56_last'] - train['D_56_mean']
    train['D_56_last_mean_diff'] = train['D_56_last_mean_diff'].fillna(0)
                  
    train['D_130_last_mean_diff'] = train['D_130_last'] - train['D_130_mean']
    train['D_130_last_mean_diff'] = train['D_130_last_mean_diff'].fillna(0)
                  
    train['S_7_last_mean_diff'] = train['S_7_last'] - train['S_7_mean']
    train['S_7_last_mean_diff'] = train['S_7_last_mean_diff'].fillna(0)    
    
    print('Last-Mean Features Done')
                       
    ###################################################################################
                  
    Final_drop = ['R_14_last', 'S_16_last', 'B_28_last', 'D_58_last', 'D_48_last', 'D_55_last', 'D_88_last', 
             'D_69_last', 'S_5_last','D_44_last','P_3_last','D_131_last','B_37_last','D_144_last',
             'D_102_last','B_8_last','S_17_last','S_22_last','D_130_last','D_73_last','B_5_first', 'D_50_first', 
             'SDist_first', 'D_41_first', 'D_144_first','P_2_min','P_2_max',
             'D_48_first', 'R_12_first','B_18_first', 'B_8_first', 'B_3_first', 'S_5_first', 'D_112_first', 
             'D_58_first', 'D_69_first', 'B_15_first','D_41_mean', 'D_144_mean', 'R_14_mean', 'S_17_mean', 
            'B_23_mean', 'D_56_mean', 'S_22_mean', 'B_10_mean', 'B_3_mean', 'B_1_mean', 'B_28_mean', 'B_6_mean',
            'D_131_mean', 'S_24_mean', 'D_46_mean', 'D_58_mean', 'D_53_mean', 'S_7_mean',
           "B_30_last","D_114_nunique","D_117_nunique","D_126_last","D_64_nunique"]    
    
    train.drop(Final_drop,axis=1,inplace=True) 
    
    print("Dropping Features Done")
                  
    ###################################################################################  
    
    num_cols = list(train.dtypes[(train.dtypes == 'float32') | (train.dtypes == 'float64')].index)
    train.loc[:,num_cols] = train.loc[:,num_cols].apply(lambda x: x.astype(np.float16))
    
    nan_col = ['D_50_mean', 'D_50_last', 'S_23_std', 'S_23_last']
    train[nan_col] = train[nan_col].fillna(-32783)
    train = train.fillna(0)
    
    ## Finally add NaN counts from earlier
    train["total_data_count"] = nan_sum
    train["total_data_last"] = nan_last

    train = train.loc[:,~train.columns.duplicated()]
    
    print('featuring2 done')
    
    print(f'Final shape after engineering{train.shape}')
    
    return train 

In [8]:
def kerasmodel(n_inputs):
    """Sequential neural network with a skip connection.
    Returns a compiled instance of tensorflow.keras.models.Model.
    """
    activation = 'elu'
    inputs = Input(shape=(n_inputs, ))
    x = Reshape((n_inputs, 1))(inputs)
    x = keras.layers.Conv1D(32,1,strides=1, activation=activation)(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Conv1D(24,1, activation=activation)(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Conv1D(16,1, activation=activation)(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Conv1D(4,1, activation=activation)(x)
    x = Flatten()(x)
    x = Dropout(0.3)(x)
    x = Dense(16, activation = activation)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.1)(x)
    x = Dense(8, activation = activation)(x)
    outputs = Dense(1, activation='sigmoid')(x)
    gc.collect()
    return Model(inputs, outputs)

ALPHA= 5
GAMMA = 2
def FocalLoss(targets, inputs, alpha=ALPHA, gamma=GAMMA):    
    BCE = K.binary_crossentropy(targets, inputs)
    BCE_EXP = K.exp(-BCE)
    focal_loss = K.mean(alpha * K.pow((1-BCE_EXP), gamma) * BCE)
    
    return focal_loss
def run_trainingKeras(fold, seed , test):
    
    seed_everything(seed)
    
    ss = load(open('../input/standardscaler/standardscaler.pkl','rb'))
    
    test = ss.transform(test)
    test = np.array(test)                     
                   
    new_model = keras.models.load_model(f'../input/amex-keras-1d-cnn/model_fold{fold}_seed{seed}.h5',custom_objects={"AdaBeliefOptimizer":AdaBeliefOptimizer(learning_rate=0.02,weight_decay = 1e-5,epsilon = 1e-7,print_change_log = False),'FocalLoss': FocalLoss})
        
    y_pred = new_model.predict(test, batch_size=2048).reshape( (len(test), ))     
    
    return y_pred    

def run_k_foldKeras(NFOLDS,seed, test):
    predictions = np.zeros((len(test),))
    for fold in range(0,NFOLDS):
        pred_ = run_trainingKeras(fold, seed , test)
        predictions += pred_ / NFOLDS
    return predictions

In [9]:
import os
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [10]:
class TestDataset:
    def __init__(self, features):
        self.features = features
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'x' : torch.tensor(self.features[idx, :], dtype=torch.float)
        }
        return dct
 
class resnetModel(nn.Module):
    def __init__(self, num_features,hidden_size,ispretrain=False):
        super(resnetModel, self).__init__()
        self.ispretrain=ispretrain
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))
        
        
        self.batch_norm2 = nn.BatchNorm1d(num_features+hidden_size)
        self.dropout2 = nn.Dropout(0.5)
        self.dense2 = nn.utils.weight_norm(nn.Linear(num_features+hidden_size, hidden_size))
        self.batch_norm20 = nn.BatchNorm1d(hidden_size)
        self.dropout20 = nn.Dropout(0.5)
        self.dense20 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))
        

        self.batch_norm3 = nn.BatchNorm1d(2*hidden_size)
        self.dropout3 = nn.Dropout(0.5)
        self.dense3 = nn.utils.weight_norm(nn.Linear(2*hidden_size, hidden_size))
        self.batch_norm30 = nn.BatchNorm1d(hidden_size)
        self.dropout30 = nn.Dropout(0.5)
        self.dense30 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))
        

        self.batch_norm4 = nn.BatchNorm1d(2*hidden_size)
        self.dropout4 = nn.Dropout(0.5)
        if self.ispretrain:
          self.dense4 = nn.utils.weight_norm(nn.Linear(2*hidden_size, 1))
        else:
          self.dense5 = nn.utils.weight_norm(nn.Linear(2*hidden_size, 1))
    
    def forward(self, x):
        x1 = self.batch_norm1(x)
        x1 = F.elu(self.dense1(x1))
        x = torch.cat([x,x1],1)
        
        x2 = self.batch_norm2(x)
        x2 = self.dropout2(x2)
        x2 = F.elu(self.dense2(x2))
        x2 = self.batch_norm20(x2)
        x2 = self.dropout20(x2)
        x2 = F.elu(self.dense20(x2))
        x = torch.cat([x1,x2],1)

        x3 = self.batch_norm3(x)
        x3 = self.dropout3(x3)
        x3 = F.elu(self.dense3(x3))
        x3 = self.batch_norm30(x3)
        x3 = self.dropout30(x3)
        x3 = F.elu(self.dense30(x3))
        x3 = torch.cat([x2,x3],1)
        
        x3 = self.batch_norm4(x3)
        x3 = self.dropout4(x3)
        if self.ispretrain:
          x3 = self.dense4(x3)
        else:
          x3 = self.dense5(x3)
        return x3
    
class my_model(nn.Module):
    def __init__(self, in_feat, hid_dim=512, activation=nn.ReLU(), dropout=0.5):
        super(my_model, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(in_feat)
        self.encode = nn.utils.weight_norm(nn.Linear(in_feat, hid_dim))
        self.activation = activation
        self.batch_norm2 = nn.BatchNorm1d(hid_dim)
        self.hidden1 = nn.utils.weight_norm(nn.Linear(hid_dim, 256))
        self.hidden2 = nn.utils.weight_norm(nn.Linear(256, 64))
        self.drop = nn.Dropout(dropout)
        self.batch_norm3 = nn.BatchNorm1d(64+hid_dim)
        self.hidden3 = nn.utils.weight_norm(nn.Linear(64+hid_dim, 128))
        self.hidden4 = nn.utils.weight_norm(nn.Linear(128, 128))
        self.batch_norm4 = nn.BatchNorm1d(128)
        self.hidden5 = nn.utils.weight_norm(nn.Linear(128,16))
        self.pred = nn.Linear(16, 1)
    
    def forward(self, x):
        h0 = self.drop(self.activation(self.encode(self.batch_norm1(x))))
        h1 = self.activation(self.hidden2(self.drop(self.activation(self.hidden1(self.batch_norm2(h0))))))
        h = self.drop(torch.cat([h0, h1], dim=-1))
        h = self.activation(self.hidden3(self.batch_norm3(h)))
        h = self.activation(self.hidden4(h))
        h = self.activation(self.hidden5(self.drop(self.batch_norm4(h))))
        return self.pred(h)
    
def run_training(fold, seed , test, modelname):
    
    seed_everything(seed)
    
    if (modelname=='simple' or modelname=='resnet'):
        x_test = test.values
    else: x_test = test    
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=512, shuffle=False)
    
    if(modelname=='simple'):
        model = my_model(in_feat=test.shape[1])      
        model.load_state_dict(torch.load(f'../input/amextransferlearning/SEED{seed}_FOLD{fold}_.pth',map_location=torch.device(DEVICE)))
        model.to(DEVICE)
    elif(modelname=='resnet'):
        model = resnetModel(test.shape[1],1024)   
        model.load_state_dict(torch.load(f'../input/amextransferlearningresnet/SEED{seed}_FOLD{fold}_.pth',map_location=torch.device(DEVICE)))
        model.to(DEVICE)
    else:
        model = resnetModel(test.shape[1],1024)   
        model.load_state_dict(torch.load(f'../input/stacking-ensemble2/MLP_stack_SEED42_FOLD{fold}_.pth',map_location=torch.device(DEVICE)))
        model.to(DEVICE)
        predictions = np.zeros((len(test),1))
    
    model.eval()
    preds = []
    
    for data in testloader:
        inputs = data['x'].to(DEVICE)

        with torch.no_grad():
            outputs = model(inputs)
        
        preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    predictions = np.concatenate(preds)
    
    return predictions    

def run_k_fold(NFOLDS,seed, test , model_name):
    predictions = np.zeros((len(test),1))
    for fold in range(0,NFOLDS):
        pred_ = run_training(fold, seed ,test,model_name)
        predictions += pred_ / NFOLDS
    return predictions


In [11]:
%%time
# CALCULATE SIZE OF EACH SEPARATE TEST PART
def get_rows(customers, test, NUM_PARTS = 4, verbose = ''):
    chunk = len(customers)//NUM_PARTS
    if verbose != '':
        print(f'We will process {verbose} data as {NUM_PARTS} separate parts.')
        print(f'There will be {chunk} customers in each part (except the last part).')
        print('Below are number of rows in each part:')
    rows = []

    for k in range(NUM_PARTS):
        if k==NUM_PARTS-1: cc = customers[k*chunk:]
        else: cc = customers[k*chunk:(k+1)*chunk]
        s = test.loc[test.customer_ID.isin(cc)].shape[0]
        rows.append(s)
    if verbose != '': print( rows )
    return rows,chunk

# COMPUTE SIZE OF 15 PARTS FOR TEST DATA
NUM_PARTS = 25

TEST_PATH = '../input/amex-data-integer-dtypes-parquet-format/test.parquet'

print(f'Reading test data...')
test = read_file(path = TEST_PATH, usecols = ['customer_ID','S_2'])
customers = test[['customer_ID']].drop_duplicates().sort_index().values.flatten()
rows,num_cust = get_rows(customers, test[['customer_ID']], NUM_PARTS = NUM_PARTS, verbose = 'test')
del test

Reading test data...
shape of data: (11363762, 2)
We will process test data as 25 separate parts.
There will be 36984 customers in each part (except the last part).
Below are number of rows in each part:
[454316, 454828, 454790, 455914, 453770, 454201, 454325, 454248, 454528, 453960, 455276, 453856, 454333, 454861, 454880, 454789, 454752, 453866, 455217, 455155, 454825, 454683, 453515, 454030, 454844]
CPU times: user 18.2 s, sys: 4.5 s, total: 22.7 s
Wall time: 23 s


In [12]:
%%time

import xgboost as xgb
from catboost import CatBoostClassifier
import lightgbm as lgb

# INFER TEST DATA IN PARTS
skip_rows = 0
skip_cust = 0
PYRAMID_W = [0.5, 2/3, 0.75, 0.875, 1, 0]
# XGB MODEL PARAMETERS
BASE_LEARNING_RATE = 0.015

test_preds_cat = []
test_preds_lgbm = []
test_preds_xgb = []
test_preds_xgbpy = []
test_preds_automl = []
test_preds_lgbmquick = []
test_preds_transfer = []
test_preds_resnet = []
test_preds_cnn = []

for k in range(NUM_PARTS):
    
    # READ PART OF TEST DATA
    print(f'\nReading test data...')
    test = read_file(path = TEST_PATH)
    test = test.iloc[skip_rows:skip_rows+rows[k]]
    skip_rows += rows[k]
    print(f'=> Test part {k+1} has shape', test.shape )
    
    # PROCESS AND FEATURE ENGINEER PART OF TEST DATA
    print(f'\nFeature Engineering...')
    
    test = process_and_feature_engineer(test)
    
    if k==NUM_PARTS-1: test = test.loc[customers[skip_cust:]]
    else: test = test.loc[customers[skip_cust:skip_cust+num_cust]]
    skip_cust += num_cust
   
    # TEST DATA FOR XGB  
    X_test = test
    dtest = xgb.DMatrix(data=X_test)
    del X_test
    gc.collect()
    #####################################################################################################
    print(f'\nPredicting xgb...')       
    
    # INFER XGB MODELS ON TEST DATA     
    model = xgb.Booster()  
    model.load_model(f'../input/fork-of-amex-finalxgb/XGB_seed41_fold0.xgb')
    preds = model.predict(dtest)
    model.load_model(f'../input/fork-of-amex-finalxgb/XGB_seed42_fold0.xgb')
    preds += model.predict(dtest)
    model.load_model(f'../input/fork-of-amex-finalxgb/XGB_seed43_fold0.xgb')
    preds += model.predict(dtest)
    for f in range(1,FOLDS):
        model.load_model(f'../input/fork-of-amex-finalxgb/XGB_seed41_fold{f}.xgb')
        preds += model.predict(dtest)
        model.load_model(f'../input/fork-of-amex-finalxgb/XGB_seed42_fold{f}.xgb')
        preds += model.predict(dtest)
        model.load_model(f'../input/fork-of-amex-finalxgb/XGB_seed43_fold{f}.xgb')
        preds += model.predict(dtest)
    preds /= FOLDS*3
    test_preds_xgb.append(preds)
    
    del model
    _ = gc.collect()
    
    #####################################################################################################
    print(f'\nPredicting XGB-Pyramid...')  
    
    reset_margin = dtest.get_base_margin()

    # INFER XGB MODELS ON TEST DATA
    print(".")
    pred_folds = []
    for f in range(FOLDS):
        if (f > 0):
            dtest.set_base_margin(reset_margin)
        for (layer, w) in enumerate(PYRAMID_W[:-1]):    
                    model = xgb.Booster()
                    model.load_model(f'../input/amex-fe-pyramid/XGB_v1_fold{f}_layer{layer}.xgb')
                    print(f'Loaded fold{f}, layer{layer}')
                    ptest = model.predict(dtest, output_margin=True)

                    ## reduce the impact of all model layers so far by w. This should be another way to reduce over-specialization, without the computational cost of DART
                    if (w < 1.0):
                        ptest = ptest * w

                    ## This set_base_margin is what informs the next layer of the prior training.
                    ## See code example from official demos: https://github.com/dmlc/xgboost/blob/master/demo/guide-python/boost_from_prediction.py
                    dtest.set_base_margin(ptest)

        layer = len(PYRAMID_W) - 1
        model = xgb.Booster()
        model.load_model(f'../input/amex-fe-pyramid/XGB_v1_fold{f}_layer{layer}.xgb')
        print("Best_ntree_limit", model.best_ntree_limit)
        preds = model.predict(dtest, iteration_range=(0,model.best_ntree_limit))
                
        ## Create nested array to combine all predictions of a single fold together to rank them before averaging the predictions across folds.
        if f == len(test_preds_xgbpy):
            test_preds_xgbpy.append([])
        test_preds_xgbpy[f].append(preds)

    # CLEAN MEMORY
    del dtest, model, reset_margin
    _ = gc.collect()
    
    ###########################################################################################################
    print(f'\nPredicting LGBM Quick....')
    
    model = joblib.load('../input/amexlightgbmquickstart2seed/LGBM_Simple_fold0_SEED42.pkl')
    preds = model.predict_proba(test.astype('float32'))[:,1]
    model = joblib.load('../input/amexlightgbmquickstart2seed/LGBM_Simple_fold0_SEED43.pkl')
    preds += model.predict_proba(test.astype('float32'))[:,1]
    
    for f in range(1,FOLDS):        
            model = joblib.load(f'../input/amexlightgbmquickstart2seed/LGBM_Simple_fold{f}_SEED43.pkl')
            preds += model.predict_proba(test.astype('float32'))[:,1]
            model = joblib.load(f'../input/amexlightgbmquickstart2seed/LGBM_Simple_fold{f}_SEED42.pkl')
            preds += model.predict_proba(test.astype('float32'))[:,1]
    preds /= FOLDS*2
    test_preds_lgbmquick.append(preds)

    # CLEAN MEMORY                  
    del model
    _ = gc.collect()
    ##############################################################################################################
    print(f'\nPredicting cat...')              
     
    model = CatBoostClassifier()
    model.load_model(f'../input/amex-catboost770/cat_v1_fold0_seed42')
    preds = model.predict_proba(test)[:,1]
    model.load_model(f'../input/amex-catboost770-seed41/cat_v1_fold0_seed41')
    preds += model.predict_proba(test)[:,1]
    for f in range(1,FOLDS):
        model.load_model(f'../input/amex-catboost770/cat_v1_fold{f}_seed42')
        preds = preds + model.predict_proba(test)[:,1]
        model.load_model(f'../input/amex-catboost770-seed41/cat_v1_fold{f}_seed41')
        preds = preds + model.predict_proba(test)[:,1]
    preds = preds/FOLDS*2
    test_preds_cat.append(preds)

    # CLEAN MEMORY                  
    del model
    _ = gc.collect()
    ############################################################################################################
    print('Predicting TransferLearning simple')
    
    SEED = [41,42] 
    modelname='simple'
    predictions = np.zeros((len(test),1))
    for seed in SEED:
        predictions_ = run_k_fold(NFOLDS, seed,test,modelname)
        predictions += predictions_ / len(SEED)  
   
    test_preds_transfer.append(predictions[:,0])
    
    ############################################################################################################
    print('Predicting TransferLearning resnet')
    
    SEED = [41,42] 
    modelname='resnet'
    predictions = np.zeros((len(test),1))
    for seed in SEED:
        predictions_ = run_k_fold(NFOLDS, seed,test,modelname)
        predictions += predictions_ / len(SEED)  
   
    test_preds_resnet.append(predictions[:,0])
    
    ############################################################################################################
    print('Predicting cnn')
    
    SEED = [41,42] 
    predictions = np.zeros((len(test),))
    for seed in SEED:
        predictions_ = run_k_foldKeras(NFOLDS, seed,test)
        predictions += predictions_ / len(SEED)  
   
    test_preds_cnn.append(predictions)
    
    #############################################################################################################
    print(f'\nPredicting automl...')     
    
    model = joblib.load('../input/amex-lightautoml/automl.pkl')
    preds1 = model.predict(test)
    preds1 = preds1.data[:,0]
    model = joblib.load('../input/amex-lightautoml43/automl.pkl')
    preds2 = model.predict(test)
    preds2 = preds2.data[:,0]
    preds = (preds1+preds2)/2
    test_preds_automl.append(preds)

    # CLEAN MEMORY
    del model
    _ = gc.collect()
    ############################################################################################################
    print(f'\nPredicting LGBM DART...')              
     
    loaded_model=lgb.Booster(model_file='../input/lgbm770/lgbm_fold0(2).txt')
    preds = loaded_model.predict(test)

    for f in range(1,FOLDS):        
            loaded_model=lgb.Booster(model_file=f'../input/lgbm770/lgbm770_0.7982/lgbm_fold{f}(2).txt')
            preds += loaded_model.predict(test)
    preds /= FOLDS
    test_preds_lgbm.append(preds)
    
    # CLEAN MEMORY                  
    del loaded_model
    _ = gc.collect()
    ###############################################################################################################


Reading test data...
shape of data: (11363762, 190)
=> Test part 1 has shape (454316, 190)

Feature Engineering...
Delta features Done
shape after engineering1 (36984, 691)
Aggregation Done
Encoding done
Rounding done
Miscellaneous features Done
Last-First Features Done
Last-Mean Features Done
Dropping Features Done
featuring2 done
Final shape after engineering(36984, 770)

Predicting xgb...

Predicting XGB-Pyramid...
.
Loaded fold0, layer0
Loaded fold0, layer1
Loaded fold0, layer2
Loaded fold0, layer3
Loaded fold0, layer4
Best_ntree_limit 2777
Loaded fold1, layer0
Loaded fold1, layer1
Loaded fold1, layer2
Loaded fold1, layer3
Loaded fold1, layer4
Best_ntree_limit 4040
Loaded fold2, layer0
Loaded fold2, layer1
Loaded fold2, layer2
Loaded fold2, layer3
Loaded fold2, layer4
Best_ntree_limit 1415
Loaded fold3, layer0
Loaded fold3, layer1
Loaded fold3, layer2
Loaded fold3, layer3
Loaded fold3, layer4
Best_ntree_limit 5148
Loaded fold4, layer0
Loaded fold4, layer1
Loaded fold4, layer2
Load

In [13]:
def values_to_rank(p):
    u, v = np.unique(p, return_inverse=True)
    result = (np.cumsum(np.bincount(v)) - 1)[v]
    result = result.astype(np.float64) + p
    return result


def values_and_rank(p):
    u, v = np.unique(p, return_inverse=True)
    result = (np.cumsum(np.bincount(v)) - 1)[v]
    result = result / len(p)
    result = (result + p) / 2
    return result


final_preds = []
for preds in test_preds_xgbpy:
    preds = np.concatenate(preds)
    print(np.unique(preds).shape)
    preds = values_and_rank(preds)
    final_preds.append(preds)

test_preds_xgbpy = final_preds[0]
for i in range(1, len(final_preds)):
    test_preds_xgbpy += final_preds[i]
print(np.unique(test_preds_xgbpy).shape[0])
print(test_preds_xgbpy)
print(test_preds_xgbpy.shape[0])

(911898,)
(911491,)
(911475,)
(911923,)
(912070,)
924620
[ -8.11468421 -17.90639336  -7.85100667 ...   0.99649418  -0.89310003
  -5.20152819]
924621


In [14]:
test_preds_cat = pd.DataFrame(np.concatenate(test_preds_cat),columns=['test_preds_cat'])
test_preds_lgbm = pd.DataFrame(np.concatenate(test_preds_lgbm),columns=['test_preds_lgbm'])
test_preds_lgbmquick = pd.DataFrame(np.concatenate(test_preds_lgbmquick),columns=['test_preds_lgbmquick'])
test_preds_xgb = pd.DataFrame(np.concatenate(test_preds_xgb),columns=['test_preds_xgb'])
test_preds_automl = pd.DataFrame(np.concatenate(test_preds_automl),columns=['test_preds_automl'])
test_preds_transfer1 = pd.DataFrame(np.concatenate(test_preds_transfer),columns=['test_preds_transfer1'])
test_preds_transfer2 = pd.DataFrame(np.concatenate(test_preds_resnet),columns=['test_preds_transfer2'])
test_preds_cnn = pd.DataFrame(np.concatenate(test_preds_cnn),columns=['test_preds_cnn'])
test_preds_xgbpy = pd.DataFrame(test_preds_xgbpy,columns=['test_preds_xgbpy'])

In [15]:
test_preds = pd.concat([test_preds_xgb,test_preds_automl,test_preds_lgbm,test_preds_xgbpy,test_preds_cat,test_preds_lgbmquick,test_preds_cnn,test_preds_transfer1,test_preds_transfer2],axis=1)

In [16]:
test_preds

Unnamed: 0,test_preds_xgb,test_preds_automl,test_preds_lgbm,test_preds_xgbpy,test_preds_cat,test_preds_lgbmquick,test_preds_cnn,test_preds_transfer1,test_preds_transfer2
0,-3.695300,0.018649,0.024687,-8.114684,0.059118,0.019146,0.220555,0.024626,0.018660
1,-7.466874,0.001230,0.000822,-17.906393,0.005467,0.000367,0.064324,0.001204,0.001508
2,-3.481268,0.028394,0.033803,-7.851007,0.134772,0.027598,0.179471,0.022194,0.031720
3,-1.400114,0.188305,0.243977,-1.692307,0.735541,0.194262,0.395525,0.171328,0.223853
4,2.141793,0.888477,1.962394,7.350561,3.444126,0.889979,0.644833,0.870721,0.873383
...,...,...,...,...,...,...,...,...,...
924616,-3.994361,0.016921,0.020446,-9.014251,0.058928,0.013696,0.154466,0.012176,0.011445
924617,1.676528,0.855882,1.829377,6.435118,3.360863,0.853054,0.637281,0.817020,0.801949
924618,-0.380278,0.395851,0.551630,0.996494,1.390179,0.378728,0.522351,0.405553,0.436863
924619,-1.068116,0.298125,0.272010,-0.893100,1.207285,0.242232,0.408335,0.310263,0.341604


In [17]:
print('Predicting LR stack.......')

test_preds_lr = np.zeros((len(test_preds),))

final_estimator = LogisticRegression(penalty='l2',solver='liblinear',C=0.001,class_weight="balanced",max_iter=5000)
    
model = pickle.load(open('../input/amexlrmodel/lr_model0.sav', 'rb'))  
    
test_preds_lrstack = model.predict_proba(test_preds)[:,1]
    
for f in range(1,FOLDS):        
        model = pickle.load(open(f'../input/amexlrmodel/lr_model{f}.sav', 'rb'))  
        test_preds_lrstack += model.predict_proba(test_preds)[:,1]
test_preds_lrstack /= FOLDS
    
del model
_ = gc.collect()   

Predicting LR stack.......


In [18]:
print(f'\nPredicting LGBM stack....')
model = joblib.load('../input/stacking-ensemble2/LGBM_stack_fold0.pkl')
test_preds_lgbmstack = model.predict_proba(test_preds.astype('float32'))[:,1]
    
for f in range(1,FOLDS):        
        model = joblib.load(f'../input/stacking-ensemble2/LGBM_stack_fold{f}.pkl')
        test_preds_lgbmstack += model.predict_proba(test_preds.astype('float32'))[:,1]
test_preds_lgbmstack /= FOLDS
    
# CLEAN MEMORY                  
del model
_ = gc.collect()


Predicting LGBM stack....


In [19]:
# TEST DATA FOR XGB  
X_test = test_preds
dtest = xgb.DMatrix(data=X_test)
del X_test
gc.collect()
#####################################################################################################
print(f'\nPredicting xgbStack...')       
    
# INFER XGB MODELS ON TEST DATA     
model = xgb.Booster()  
model.load_model(f'../input/stacking-ensemble2/XGBstacked_fold0.xgb')
test_preds_xgbstack = model.predict(dtest)
for f in range(1,FOLDS):
        model.load_model(f'../input/stacking-ensemble2/XGBstacked_fold{f}.xgb')
        test_preds_xgbstack += model.predict(dtest)
test_preds_xgbstack /= FOLDS
del model
_ = gc.collect()


Predicting xgbStack...


In [20]:
print(f'\nPredicting catstack...')              
     
model = CatBoostClassifier()
model.load_model(f'../input/stacking-ensemblecat/cat_stacked_fold0_seed42')
test_preds_catstack = model.predict_proba(test_preds)[:,1]
for f in range(1,FOLDS):
    model.load_model(f'../input/stacking-ensemblecat/cat_stacked_fold{f}_seed42')
    test_preds_catstack += model.predict_proba(test_preds)[:,1]
test_preds_catstack /= FOLDS

# CLEAN MEMORY                  
del model
_ = gc.collect()


Predicting catstack...


In [21]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler = load(open('../input/amex9predscaler/predscaler','rb'))
test_preds =  scaler.transform(test_preds)

In [22]:
print('Predicting MLP Stack........')
    
SEED = [42] 
modelname='stack'
predictions = np.zeros((len(test_preds),1))
for seed in SEED:
    predictions_ = run_k_fold(NFOLDS, seed,test_preds,modelname)
    predictions += predictions_ / len(SEED)  

test_preds_mlpstack = predictions[:,0]

Predicting MLP Stack........


In [23]:
test_preds_ensemble = 0.5*test_preds_xgbstack + 0.1*test_preds_lgbmstack + 0.4*test_preds_catstack + 0.6*test_preds_lr + 14*test_preds_mlpstack

In [24]:
# WRITE SUBMISSION FILE
test = pd.DataFrame(data={'prediction':test_preds_ensemble})
sub = pd.read_csv('../input/amex-default-prediction/sample_submission.csv')[['customer_ID']]
sub = pd.concat([sub,test[['prediction']]], axis=1)
# DISPLAY PREDICTIONS
sub.to_csv(f'submission_Amex.csv',index=False)
print('Submission file shape is', sub.shape )
sub.head()

Submission file shape is (924621, 2)


Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,-1.519718
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,-2.602098
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,-1.246468
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,5.602667
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,15.907276
