In [1]:
import pandas as pd
import numpy as np
import gc
# import cupy
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.inspection import permutation_importance
from sklearn.metrics import plot_confusion_matrix

from DataPipeline import agglomeration_function
from DataPipeline import null_filtering
from DataPipeline import pre_process_df

import optuna

from EvaluationMetrics import amex_metric_mod
from EvaluationMetrics import top_four_percent
from EvaluationMetrics import gini_metric

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
class Train_Parameters:
    compute_train_df = True # if True, we will compute a new train_df, otherwise we read from drive
    compute_test_df = True # if True, we will compute a new test_df, otherwise we read from drive
    fast_training = False # if True, we train on only 10% of the rows
    train_hard_samples = False

    if fast_training:
        training_rounds = 999
    else:
        training_rounds = 30000

train_df = pd.read_parquet('Processed_Data/cleaned_train.parquet')
train_df = pre_process_df(train_df)

if Train_Parameters.fast_training:
    train_df = train_df.sample(frac=0.01, ignore_index=True)
    
null_threshold = 0.99
null_columns = null_filtering(train_df.iloc[:, 1:-1], null_threshold=null_threshold)
    

In [5]:
# Key parameters
categorical_columns = ["D_63", 
                       "D_64", "D_66", 
                       "D_68", "B_30", 
                       "B_38", "D_114", 
                       "D_116", "D_117", 
                       "D_120", "D_126"]  

class Model_Parameters:
    irrelevant_columns = ["customer_ID", "target"]
    other_columns = ['S_2'] #['S_2']
    categorical_columns = ["D_63", 
                           "D_64", "D_66", 
                           "D_68", "B_30", 
                           "B_38", "D_114", 
                           "D_116", "D_117", 
                           "D_120", "D_126"]  
    train_test_delta_columns = ['R_1', 'D_59', 'S_11', 'B_29', 'S_9', 'D_45']#, 
                                # 'D_121', 'S_27', 'S_24', 'S_22', 'D_115', 'D_118', 
                                # 'D_119', 'S_13', 'D_47', 'D_55', 'S_3', 'B_6', 'D_61', 
                                # 'P_3', 'D_52', 'B_9', 'P_4', 'S_7', 'B_12', 'D_62', 
                                # 'D_42', 'B_13', 'D_43', 'P_2', 'B_28', 'B_38', 'D_46', 
                                # 'B_25', 'B_14', 'B_3', 'S_5', 'B_2', 'B_40', 'D_48', 'D_60'] 
                                # 'B_10', 'S_12', 'B_37', 'B_1', 'D_69', 'B_5', 'B_7', 'B_11', 'B_18', 
                                # 'D_71', 'D_39', 'B_23', 'S_8', 'B_17', 'S_17', 'D_77', 'D_142', 'S_25', 'D_141', 'B_15', 'D_58', 
                                # 'R_27', 'S_23', 'S_26', 'D_105', 'D_50', 'D_124', 'B_4', 'D_102', 'D_133', 'D_56',
                                # 'D_104', 'B_21', 'S_16', 'D_144', 'R_6', 'B_36', 'S_19', 'B_26', 'B_24', 'B_27', 'D_128', 'B_16', 'B_8', 'D_53', 
                                # 'D_112', 'B_19', 'D_41', 'D_132', 'D_130', 'D_74', 'D_68', 'D_131', 'D_75', 'B_20', 'D_76', 'D_44', 'D_122', 'D_49', 'D_113', 
                                # 'D_117', 'D_145', 'S_15', 'D_134', 'D_106', 'D_65', 'D_63', 'R_3', 'R_7', 'D_54', 'D_70', 'R_12', 'B_42', 'R_14', 'D_73', 'D_80', 
                                # 'R_26', 'D_78', 'D_72', 'B_39', 'D_84', 'D_107', 'D_110', 'D_64', 'R_16', 'D_82', 'R_5', 'R_9', 'D_81', 'R_8', 'B_22', 'R_20']#, 
                                # # 'D_103', 'R_10', 'D_79', 'D_136', 'D_139', 'R_11', 'B_41', 'D_51', 'D_91', 'D_143', 'D_83', 'D_89', 'R_17', 'B_30',
                                # # 'R_13', 'D_125', 'D_138', 'D_123', 'D_135', 'D_140', 'D_129', 'D_66', 'D_88', 'R_4', 'B_33', 'R_2', 'D_92', 'D_111' 
                                # # 'S_18', 'D_108', 'R_21', 'D_114', 'R_24', 'D_86', 'D_137', 'D_116', 'D_126', 'D_109', 'D_120', 'D_96', 'B_31']
    
    train_test_delta_columns = [i for i in train_test_delta_columns if i not in categorical_columns]
    
    #['R_1', 'D_59', 'S_11', 'B_29', 'S_9'] #['R_1', 'D_59', 'S_11', 'B_29', 'S_9'] # columns with different distributions between train and test
    
    ignored_columns = train_test_delta_columns + list(null_columns.values) + other_columns #+ categorical_columns 
    valid_size = 0.2
    SEED = 10
    FOLDS = 5 #5
    VER = 5 #"HARD_Samples_1" 'hard_indicator_1'
    
num_features = [col for col in train_df.columns if ((col not in Model_Parameters.ignored_columns) 
                                                    and (col not in Model_Parameters.categorical_columns) 
                                                    and (col not in Model_Parameters.irrelevant_columns))]
cat_features = [col for col in train_df.columns if col in Model_Parameters.categorical_columns]#

In [6]:
xgb_parms = { 
    'max_depth':6, #5, 
    'learning_rate':0.013, #0.05, 
    'subsample':0.8,
    'colsample_bytree':0.6, 
    'eval_metric':'logloss',
    'objective':'binary:logistic',
    'tree_method':'gpu_hist',
    'predictor':'gpu_predictor',
    'random_state':Model_Parameters.SEED,
    'scale_pos_weight':3 #3 #use 10 when doing hard samples mining
}

In [7]:
# Create Train_df
if Train_Parameters.compute_train_df:
    train = train_df.loc[:, ~train_df.columns.isin(Model_Parameters.ignored_columns)]

    train = agglomeration_function(train, num_features=num_features, cat_features=cat_features, apply_pca=False)
    customer_ID_cols = train_df.groupby('customer_ID')['customer_ID'].tail(1).reset_index(drop=True)

    train = pd.concat([customer_ID_cols, train], axis=1)
    train.to_csv('Processed_Data/train_df_cleaned.csv', index=False)
    

else:
    if Train_Parameters.fast_training:
        train = pd.read_csv('Processed_Data/train_df_cleaned.csv', nrows=10000)
        train = train.sample(frac=0.1, ignore_index=True)
    else:
        train = pd.read_csv('Processed_Data/train_df_cleaned.csv')
        
train_dtypes = train.iloc[:,1:-1].dtypes

input shape: (5531451, 195)



DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente

output shape: (458913, 2764)


In [8]:
%who

KFold	 Model_Parameters	 StratifiedKFold	 Train_Parameters	 agglomeration_function	 amex_metric_mod	 cat_features	 categorical_columns	 customer_ID_cols	 
gc	 gini_metric	 lgb	 np	 null_columns	 null_filtering	 null_threshold	 num_features	 optuna	 
pd	 permutation_importance	 plot_confusion_matrix	 plt	 pre_process_df	 top_four_percent	 train	 train_df	 train_dtypes	 
train_test_split	 xgb	 xgb_parms	 


In [12]:
%prun 20+20
5+5

 

10

         3 function calls in 0.000 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 {built-in method builtins.exec}
        1    0.000    0.000    0.000    0.000 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}

In [7]:
# # OOF Analysis

# oof_xgb = pd.read_csv(f'oof_predictions/oof_xgb_v{Model_Parameters.VER}.csv')
# # oof_xgb[oof_xgb.target==1].hist()
# # oof_xgb[oof_xgb.target==0].hist()
# print('top 4% ', top_four_percent(oof_xgb.target.values, oof_xgb.oof_pred.values), '/n',
# 'gini metric ', gini_metric(oof_xgb.target.values, oof_xgb.oof_pred.values))

In [8]:
# # Create hard_df indicator
# ##############################
# if Train_Parameters.train_hard_samples:

#     hard_df = oof_xgb[
#         ((oof_xgb.target==1).values & (oof_xgb.oof_pred<=0.5).values) | 
#         ((oof_xgb.target==0).values & (oof_xgb.oof_pred>=0.5).values)  
#     ] 
#     hard_df.head()
#     hard_df['hard_indicator'] = 1

#     train = train.merge(hard_df[["customer_ID","hard_indicator"]], how='left', left_on='customer_ID', right_on='customer_ID')
#     train.hard_indicator.fillna(0, inplace=True)

#     train.drop(columns='target', inplace=True)
#     train.head()

In [18]:
#####
# Main training routine
#####

def train_xgb_model(X_valid, Y_valid, X_train, Y_train, params=None, num_rounds=1000, existing_model=None):
    dtrain = xgb.DMatrix(data=X_train, label=Y_train, enable_categorical=True)
    dvalid = xgb.DMatrix(data=X_valid, label=Y_valid, enable_categorical=True)

    if existing_model == None:
        model = xgb.train(params, 
                dtrain=dtrain,
                evals=[(dtrain,'train'),(dvalid,'valid')],
                num_boost_round=num_rounds,
                early_stopping_rounds=1000,
                verbose_eval=100)
    else:
        model = xgb.train(params, 
                dtrain=dtrain,
                evals=[(dtrain,'train'),(dvalid,'valid')],
                num_boost_round=num_rounds,
                early_stopping_rounds=1000,
                verbose_eval=100,
                xgb_model=existing_model) 

    oof_pred = model.predict(dvalid)
        
    return model, oof_pred

def train_lgb_model(X_valid, Y_valid, X_train, Y_train, params=None, num_rounds=1000):
    dtrain = lgb.Dataset(X_train, Y_train)
    dvalid = lgb.Dataset(X_valid, Y_valid, reference=dtrain)

    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting':'gbdt',
        'seed': 42,
        'num_leaves': 150, #100,
        'learning_rate': 0.01,
        'feature_fraction': 0.20,
        'bagging_freq': 10,
        'bagging_fraction': 0.50,
        'n_jobs': -1,
        'lambda_l2': 10, #2
        'min_data_in_leaf': 2000,#40
        'scale_pos_weight' : 0.1
       
    }
    
    model = lgb.train(params, 
            dtrain,
            valid_sets=[dtrain, dvalid],
            num_boost_round=num_rounds,
            early_stopping_rounds=2000,
            verbose_eval=100)

    oof_pred = model.predict(X_valid)
        
    return model, oof_pred

In [21]:
############################## LightGBM #####################

importances = []
permutation_importances = []
oof = []

TRAIN_SUBSAMPLE = 1.0
FEATURES = train.columns[1:-1]
gc.collect()


skf = StratifiedKFold(n_splits=Model_Parameters.FOLDS, shuffle=True, random_state=Model_Parameters.SEED)

if Train_Parameters.train_hard_samples:
    target = 'hard_indicator'
else:        
    target = 'target'


for fold,(train_idx, valid_idx) in enumerate(skf.split(
            train, train[f'{target}'])):

    # TRAIN WITH SUBSAMPLE OF TRAIN FOLD DATA
    if TRAIN_SUBSAMPLE<1.0:
        np.random.seed(SEED)
        train_idx = np.random.choice(train_idx, 
                       int(len(train_idx)*TRAIN_SUBSAMPLE), replace=False)
        np.random.seed(None)

    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print('### Train shape',train.loc[:, FEATURES].shape)
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)

    X_valid = train.loc[valid_idx, FEATURES]
    X_train = train.loc[train_idx, FEATURES]
    Y_valid = train.loc[valid_idx, f'{target}']
    Y_train = train.loc[train_idx, f'{target}']

    ###### LGBM Model #######
    # lgb_model, oof_preds = train_lgb_model(X_valid, Y_valid, X_train, Y_train, params=xgb_parms, num_rounds=100)#Train_Parameters.training_rounds)    
    lgb_model, oof_preds = train_lgb_model(X_valid, Y_valid, X_train, Y_train, num_rounds = 5000) #Train_Parameters.training_rounds)    
    lgb_model.save_model(f'Model_Weights/LGBM_v{Model_Parameters.VER}_fold{fold}.json')

    # GET FEATURE IMPORTANCE FOR FOLD K
    # dd = model.get_score(importance_type='weight')
    # df = pd.DataFrame({'feature':dd.keys(),f'importance_{fold}':dd.values()})
    # importances.append(df)

    # INFER OOF FOLD K
    acc = amex_metric_mod(Y_valid.values, oof_preds)
    print('Kaggle Metric LGBM validation =',acc,'\n')

    acc = top_four_percent(Y_valid.values, lgb_model.predict(X_valid))
    acc2 = gini_metric(Y_valid.values, lgb_model.predict(X_valid))
    
    print('Kaggle Metric LGBM top 4% =',acc,'\n', 'Gini Metric =', acc2)

    # SAVE OOF
    df = train.loc[valid_idx, ['customer_ID',f'{target}'] ].copy()
    df['oof_pred'] = oof_preds
    oof.append( df )

    del X_valid, Y_valid, X_train, Y_train, lgb_model
    _ = gc.collect()

print('#'*25)
oof = pd.concat(oof,axis=0,ignore_index=True).set_index('customer_ID')
acc = amex_metric_mod(oof[f'{target}'], oof.oof_pred.values)
print('OVERALL CV Kaggle Metric LightGBm=',acc)

oof_lgb = train_df.drop_duplicates(subset=['customer_ID'])

oof_lgb = oof_lgb.loc[:,'customer_ID']
oof_lgb = oof_lgb.to_frame()

oof_lgb = oof_lgb.merge(oof, left_on='customer_ID', right_on='customer_ID')

oof_lgb.to_csv(f'oof_lgb_v{Model_Parameters.VER}.csv',index=False)
oof_lgb.head()
del oof_lgb

#########################
### Fold 1
### Train size 367130 Valid size 91783
### Train shape (458913, 2253)
### Training with 100% fold data...
#########################
Training until validation scores don't improve for 2000 rounds
[100]	training's binary_logloss: 0.449621	valid_1's binary_logloss: 0.450597
[200]	training's binary_logloss: 0.422049	valid_1's binary_logloss: 0.424206
[300]	training's binary_logloss: 0.414199	valid_1's binary_logloss: 0.417323
[400]	training's binary_logloss: 0.409661	valid_1's binary_logloss: 0.413661
[500]	training's binary_logloss: 0.40533	valid_1's binary_logloss: 0.410108
[600]	training's binary_logloss: 0.40056	valid_1's binary_logloss: 0.406229
[700]	training's binary_logloss: 0.396132	valid_1's binary_logloss: 0.402637
[800]	training's binary_logloss: 0.391834	valid_1's binary_logloss: 0.399319
[900]	training's binary_logloss: 0.387791	valid_1's binary_logloss: 0.396318
[1000]	training's binary_logloss: 0.383842	valid_1's binary_logloss: 0.393368

In [None]:
Kaggle Metric LGBM top 4% = 0.7188571669016011 
 Gini Metric = 0.9392231327180331

In [11]:
############################## XGBoost ########

importances = []
permutation_importances = []
oof = []

TRAIN_SUBSAMPLE = 1.0
FEATURES = train.columns[1:-1]
gc.collect()


skf = StratifiedKFold(n_splits=Model_Parameters.FOLDS, shuffle=True, random_state=Model_Parameters.SEED)

if Train_Parameters.train_hard_samples:
    target = 'hard_indicator'
else:        
    target = 'target'


for fold,(train_idx, valid_idx) in enumerate(skf.split(
            train, train[f'{target}'])):

    # TRAIN WITH SUBSAMPLE OF TRAIN FOLD DATA
    if TRAIN_SUBSAMPLE<1.0:
        np.random.seed(SEED)
        train_idx = np.random.choice(train_idx, 
                       int(len(train_idx)*TRAIN_SUBSAMPLE), replace=False)
        np.random.seed(None)

    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print('### Train shape',train.loc[:, FEATURES].shape)
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)

    X_valid = train.loc[valid_idx, FEATURES]
    X_train = train.loc[train_idx, FEATURES]
    Y_valid = train.loc[valid_idx, f'{target}']
    Y_train = train.loc[train_idx, f'{target}']

    xgb_model, oof_preds = train_xgb_model(X_valid, Y_valid, X_train, Y_train, params=xgb_parms, num_rounds = Train_Parameters.training_rounds)    
    xgb_model.save_model(f'Model_Weights/XGB_v{Model_Parameters.VER}_fold{fold}.json')

    # GET FEATURE IMPORTANCE FOR FOLD K
    # dd = model.get_score(importance_type='weight')
    # df = pd.DataFrame({'feature':dd.keys(),f'importance_{fold}':dd.values()})
    # importances.append(df)

    # INFER OOF FOLD K
    acc = amex_metric_mod(Y_valid.values, oof_preds)
    print('Kaggle Metric XGBoost =',acc,'\n')
    
    acc = amex_metric_mod(Y_train.values, lgb_model.predict(X_train))
    print('Kaggle Metric LGBM validation =',acc,'\n')

    # SAVE OOF
    df = train.loc[valid_idx, ['customer_ID',f'{target}'] ].copy()
    df['oof_pred'] = oof_preds
    oof.append( df )

    del X_valid, Y_valid, X_train, Y_train, xgb_model
    _ = gc.collect()

print('#'*25)
oof = pd.concat(oof,axis=0,ignore_index=True).set_index('customer_ID')
acc = amex_metric_mod(oof[f'{target}'], oof.oof_pred.values)
print('OVERALL CV Kaggle Metric XGBoost =',acc)

oof_xgb = train_df.drop_duplicates(subset=['customer_ID'])

oof_xgb = oof_xgb.loc[:,'customer_ID']
oof_xgb = oof_xgb.to_frame()

oof_xgb = oof_xgb.merge(oof, left_on='customer_ID', right_on='customer_ID')

oof_xgb.to_csv(f'oof_xgb_v{Model_Parameters.VER}.csv',index=False)
oof_xgb.head()
del oof_xgb

#########################
### Fold 1
### Train size 367130 Valid size 91783
### Train shape (458913, 2253)
### Training with 100% fold data...
#########################



DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`



XGBoostError: [12:46:07] ../src/data/data.cc:1163: Check failed: valid: Input data contains `inf` or `nan`
Stack trace:
  [bt] (0) /home/julian/anaconda3/envs/tf-gpu/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x154e19) [0x7f59bcdb3e19]
  [bt] (1) /home/julian/anaconda3/envs/tf-gpu/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x179fbd) [0x7f59bcdd8fbd]
  [bt] (2) /home/julian/anaconda3/envs/tf-gpu/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x1ab08a) [0x7f59bce0a08a]
  [bt] (3) /home/julian/anaconda3/envs/tf-gpu/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x16a475) [0x7f59bcdc9475]
  [bt] (4) /home/julian/anaconda3/envs/tf-gpu/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(XGDMatrixCreateFromDense+0x453) [0x7f59bcd13ae3]
  [bt] (5) /home/julian/anaconda3/envs/tf-gpu/lib/python3.7/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f5a3e468ec0]
  [bt] (6) /home/julian/anaconda3/envs/tf-gpu/lib/python3.7/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7f5a3e46887d]
  [bt] (7) /home/julian/anaconda3/envs/tf-gpu/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7f5a3f435ede]
  [bt] (8) /home/julian/anaconda3/envs/tf-gpu/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(+0x12914) [0x7f5a3f436914]



In [None]:
del train, train_df
_ = gc.collect()

In [None]:
# # Optuna section
# ##################

# def objective(trial):
#     param = {
#         # 'max_depth':5, 
#         'max_depth': trial.suggest_int(
#             'max_depth',3, 10), 
#         'learning_rate': trial.suggest_float(
#             'learning_rate', 0.001, 0.05, step=0.001), 
#         'subsample':0.8,
#         'colsample_bytree':0.6, 
#         'eval_metric':'logloss',
#         'objective':'binary:logistic',
#         'tree_method':'gpu_hist',
#         'predictor':'gpu_predictor',
#         'random_state':Model_Parameters.SEED,
#         'scaled_pos_weight': trial.suggest_int(
#             'scaled_pos_weight', 1, 5, 1),
#     }
    
#     FEATURES = train.columns[1:-1]

#     X_valid = train.loc[len(train)*0.8:, FEATURES]
#     y_valid = train.loc[len(train)*0.8:, 'target']

#     dtrain = xgb.DMatrix(data=train.loc[:len(train)*0.8, FEATURES], label=train.loc[:len(train)*0.8, 'target'])
#     dvalid = xgb.DMatrix(data=X_valid, label=y_valid)
    
#     model = xgb.train(param, 
#                 dtrain=dtrain,
#                 evals=[(dtrain,'train'),(dvalid,'valid')],
#                 num_boost_round=9999,
#                 early_stopping_rounds=100,
#                 verbose_eval=100) 

#     oof_preds = model.predict(dvalid)
#     acc = amex_metric_mod(y_valid.values, oof_preds)
#     del model
#     return acc

# # %%time
# # study = optuna.create_study(direction='maximize')
# # study.optimize(objective, n_trials= 5)

In [None]:
# # PLOT OOF PREDICTIONS
# plt.hist(oof_xgb.oof_pred.values, bins=100)
# plt.title('OOF Predictions')
# plt.show()

In [None]:
# del oof_xgb, oof
# _ = gc.collect()

In [None]:
#####
# Test
test_df = pd.read_parquet('Processed_Data/test.parquet')
test_df = pre_process_df(test_df)


In [None]:
test = test_df.loc[:, ~test_df.columns.isin(Model_Parameters.ignored_columns)]
test = agglomeration_function(test, num_features=num_features, cat_features=cat_features, ignore=None, apply_pca=False)
customer_ID_cols = test_df.groupby('customer_ID')['customer_ID'].tail(1).reset_index(drop=True)
test = pd.concat([customer_ID_cols, test], axis=1)
test.to_csv('Processed_Data/test_FE', index=False)


In [None]:
test = test.astype(train_dtypes.to_dict())
del test_df

In [None]:
# if Train_Parameters.compute_test_df:
#     test = test_df.loc[:, ~test_df.columns.isin(Model_Parameters.ignored_columns)]
#     test = agglomeration_function(test, num_features=num_features, cat_features=cat_features, ignore=None, apply_pca=False)
#     customer_ID_cols = test_df.groupby('customer_ID')['customer_ID'].tail(1).reset_index(drop=True)
#     test = pd.concat([customer_ID_cols, test], axis=1)
#     test.to_csv('Processed_Data/test_FE', index=False)
# else:
#     test = pd.read_csv('Processed_Data/test_FE')

# # test = test.drop(list(less_important_features.values), axis=1)

In [None]:
# Main prediction routine for LGB
####################################
model = lgb.Booster(model_file=f'Model_Weights/LGBM_v{Model_Parameters.VER}_fold0.json')
# y_test = xgb.DMatrix(data=test.iloc[:, 1:], enable_categorical=True)
y_pred = model.predict(test.iloc[:, 1:])
# del model, test

for i in range(4):
    model = lgb.Booster(model_file=f'Model_Weights/LGBM_v{Model_Parameters.VER}_fold{i+1}.json')
    y_pred += model.predict(test.iloc[:, 1:])
    del model
    
y_pred = y_pred/5

# create submission
sample_df = pd.read_csv('amex-default-prediction/sample_submission.csv')
submission = pd.DataFrame({'customer_ID': test['customer_ID'], 'target': y_pred})
print(sample_df.shape, submission.shape)
sample_df['prediction'] = submission.target.values
sample_df.to_csv(f'output_LGBoost_ver{Model_Parameters.VER}.csv', index=False)
del sample_df

In [None]:
# Main prediction routine for XGB
####################################
test = pd.read_csv('Processed_Data/test_FE')
print(test.shape)
test = test.iloc[:500000, :]
model = xgb.Booster()
model.load_model(f'Model_Weights/XGB_v{Model_Parameters.VER}_fold0.json')
y_test = xgb.DMatrix(data=test.iloc[:, 1:], enable_categorical=True)
y_pred = model.predict(y_test)
customer_ID = test['customer_ID']
del model, test

for i in range(4):
    model = xgb.Booster()
    model.load_model(f'Model_Weights/XGB_v{Model_Parameters.VER}_fold{i+1}.json')
    y_pred += model.predict(y_test)
    del model
    
y_pred = y_pred/5

# create submission
sample_df = pd.read_csv('amex-default-prediction/sample_submission.csv')
sample_df = sample_df.iloc[:500000, :]
submission = pd.DataFrame({'customer_ID': customer_ID, 'target': y_pred})
print(sample_df.shape, submission.shape)
sample_df['prediction'] = submission.target.values
sample_df.to_csv(f'output_XGBoost_ver{Model_Parameters.VER}.csv', index=False)
del sample_df

In [None]:
# Main prediction routine for XGB
####################################
test = pd.read_csv('Processed_Data/test_FE')
print(test.shape)
test = test.iloc[500000:, :]
model = xgb.Booster()
model.load_model(f'Model_Weights/XGB_v{Model_Parameters.VER}_fold0.json')
y_test = xgb.DMatrix(data=test.iloc[:, 1:], enable_categorical=True)
y_pred = model.predict(y_test)
customer_ID = test['customer_ID']
del model, test

for i in range(4):
    model = xgb.Booster()
    model.load_model(f'Model_Weights/XGB_v{Model_Parameters.VER}_fold{i+1}.json')
    y_pred += model.predict(y_test)
    del model
    
y_pred = y_pred/5

# create submission
sample_df = pd.read_csv('amex-default-prediction/sample_submission.csv')
sample_df = sample_df.iloc[500000:, :]
submission = pd.DataFrame({'customer_ID': customer_ID, 'target': y_pred})
print(sample_df.shape, submission.shape)
sample_df['prediction'] = submission.target.values
sample_df.to_csv(f'output_XGBoost_ver{Model_Parameters.VER}2nd_half.csv', index=False)
del sample_df

In [None]:
# # Predictions for hard sample
# ####################################

# model = xgb.Booster()
# model.load_model(f'Model_Weights/XGB_hard_indicator_v{Model_Parameters.VER}_fold0.json')
# y_test = xgb.DMatrix(data=test.iloc[:, 1:], enable_categorical=True)
# y_pred = model.predict(y_test)
# del model, test

# for i in range(1):
#     model = xgb.Booster()
#     model.load_model(f'Model_Weights/XGB_v{Model_Parameters.VER}_fold{i+1}.json')
#     y_pred += model.predict(y_test)
#     del model
    
# y_pred = y_pred/2
# # sample_df = pd.read_csv('amex-default-prediction/sample_submission.csv')
# # submission = pd.DataFrame({'customer_ID': test['customer_ID'], 'target': y_pred})


In [None]:
# # Secondary Model prediction routine
# ####################################

# model = xgb.Booster()
# model.load_model(f'XGB_v{Model_Parameters.VER}_fold0.json')



In [None]:
# y_test = xgb.DMatrix(data=test.iloc[:, 1:], enable_categorical=True)
# y_pred = model.predict(y_test)
# del model, test

# for i in range(1):
#     model = xgb.Booster()
#     model.load_model(f'XGB_v{Model_Parameters.VER}_fold{i+1}.json')
#     y_pred += model.predict(y_test)
#     del model
    
# y_pred = y_pred/2

In [None]:
# save_y_pred = pd.DataFrame(y_pred)

In [None]:
# save_y_pred.to_csv('hard_model_y_pred.csv')

In [None]:
# # Create hard sample list
# #############

# sample_df = pd.read_csv('amex-default-prediction/sample_submission.csv')
# submission = pd.DataFrame({'customer_ID': test['customer_ID'], 'target': y_pred})
# # submission = submission.groupby(by='customer_ID').mean()
# print(sample_df.shape, submission.shape)
# sample_df['hard_prediction'] = submission.target.values
# sample_df.to_csv(f'output_XGBoost_hard_indicator_ver{Model_Parameters.VER}.csv', index=False)

In [None]:
# # find permutation importance based on test data
# model_sklearn_api = xgb.XGBClassifier()
# model_sklearn_api.load_model(f'XGB_v{Model_Parameters.VER}_fold{fold}.json')

# perm_importance = permutation_importance(model_sklearn_api, test.iloc[:10000,1:], test.iloc[:10000,-1], n_repeats=1, n_jobs=-1)
# perm_importance = pd.DataFrame.from_dict(perm_importance.importances_mean)
# perm_importance2 = pd.DataFrame({'importances':perm_importance.values.flatten(),'feature name': test.iloc[:, 1:].columns.values})
# perm_importance2.to_csv(f'perm_importances_{Model_Parameters.VER}')

In [None]:
# # Plot a tree for fun

# from xgboost import plot_tree
# import matplotlib.pyplot as plt

# plot_tree(model, num_trees=1)
# fig = plt.gcf()
# fig.set_size_inches(300, 150)
# plt.savefig('pic.jpg', dpi='figure')

# oof_xgb = pd.read_csv(f'oof_xgb_v{Model_Parameters.VER}.csv')

In [None]:
# # Play around with multiple CSVs output
# based_pred = pd.read_csv('output_XGBoost_ver2.csv')
# hard_indicator = pd.read_csv('output_XGBoost_hard_indicator_ver2.csv')
# hard_pred = pd.read_csv('output_XGBoost_hard_indicator_verHARD_Samples_1.csv')