In [32]:
import pandas as pd
import numpy as np
import gc
# import cupy
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.inspection import permutation_importance

from DataPipeline import agglomeration_function
from DataPipeline import null_filtering

from EvaluationMetrics import amex_metric_mod

In [33]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
# Read the training data from the local parquet file.
train_df = pd.read_parquet('Processed_Data/cleaned_train.parquet')

In [35]:
# train_df = train_df.iloc[:10000,:]

In [36]:
# Threshold handed to the project helper `null_filtering`.
null_threshold = 0.95

# The helper receives every column except the first and the last one.
# Its result is later read through `.values` to extend the ignored-column list.
# NOTE(review): presumably it returns the names of columns whose null share
# exceeds the threshold — confirm against DataPipeline.null_filtering.
null_columns = null_filtering(train_df.iloc[:, 1:-1], null_threshold=null_threshold)

class Train_Parameters:
    """Switches that decide whether the aggregated frames are rebuilt or read from disk."""

    # True -> compute a new train_df; False -> read the cached one from drive
    compute_train_df = False
    # True -> compute a new test_df; False -> read the cached one from drive
    compute_test_df = False

# Key parameters
class Model_Parameters:
    """Column groupings and cross-validation settings shared across the notebook."""

    # Identifier and label columns; excluded from the numeric feature list.
    irrelevant_columns = ["customer_ID", "target"]
    other_columns = ['S_2']
    categorical_columns = [
        "D_63", "D_64", "D_66", "D_68",
        "B_30", "B_38",
        "D_114", "D_116", "D_117", "D_120", "D_126",
    ]
    # columns with different distributions between train and test
    # (candidates kept for reference: ['R_1', 'D_59', 'S_11', 'B_29', 'S_9'])
    train_test_delta_columns = []

    # Columns dropped before aggregation; categorical columns are not part of this list.
    ignored_columns = [*train_test_delta_columns, *null_columns.values, *other_columns]

    valid_size = 0.2
    SEED = 10
    FOLDS = 2
    VER = 2

In [38]:
# Booster settings passed to xgb.train for every fold.
xgb_parms = dict(
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.6,
    eval_metric='logloss',
    objective='binary:logistic',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    random_state=Model_Parameters.SEED,
    scale_pos_weight=3,
)

In [None]:
def objective(trial):
    """Score one sampled ``scale_pos_weight`` on a stratified hold-out split.

    The original cell did not parse (a ``:`` inside the ``suggest_int`` call and
    a dangling ``model =``) and used the key ``scaled_pos_weight``, which is not
    the XGBoost parameter name used in ``xgb_parms``. This version completes it
    with pieces the notebook already provides: ``xgb_parms``,
    ``Model_Parameters.valid_size``/``SEED``, ``train_test_split`` and
    ``amex_metric_mod``.

    Parameters
    ----------
    trial
        Optuna-style trial object exposing ``suggest_int``.

    Returns
    -------
    The value of ``amex_metric_mod`` on the hold-out rows.
    NOTE(review): presumably higher is better, so the study should maximise it — confirm.

    Reads the module-level ``train`` frame at call time, so it must be called
    after ``train`` has been built or loaded.
    """
    # NOTE(review): the original passed a stray third positional argument (10).
    # In Optuna that slot is `step`, which would restrict the search to {5, 15};
    # it is dropped here so every integer in [5, 15] can be sampled — confirm intent.
    param = {
        **xgb_parms,
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 5, 15),
    }

    # Same feature convention as the CV cell: all columns between the first and the last.
    features = train.columns[1:-1]
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        train.loc[:, features],
        train['target'],
        test_size=Model_Parameters.valid_size,
        stratify=train['target'],
        random_state=Model_Parameters.SEED,
    )
    dfit = xgb.DMatrix(data=X_fit, label=y_fit)
    dholdout = xgb.DMatrix(data=X_holdout, label=y_holdout)

    model = xgb.train(param,
                      dtrain=dfit,
                      evals=[(dholdout, 'valid')],
                      num_boost_round=9999,
                      early_stopping_rounds=100,
                      verbose_eval=False)

    return amex_metric_mod(y_holdout.values, model.predict(dholdout))

In [39]:
# Numeric features: every column that is neither ignored, categorical nor irrelevant.
num_features = [
    col for col in train_df.columns
    if not (col in Model_Parameters.ignored_columns
            or col in Model_Parameters.categorical_columns
            or col in Model_Parameters.irrelevant_columns)
]
# Categorical features, kept in the order in which they appear in train_df.
cat_features = train_df.columns[train_df.columns.isin(Model_Parameters.categorical_columns)].tolist()

In [40]:
if not Train_Parameters.compute_train_df:
    # Re-use the aggregated features cached on disk.
    train = pd.read_csv('Processed_Data/train_df_cleaned.csv')
else:
    # Aggregate the non-ignored columns with the project helper.
    train = agglomeration_function(
        train_df.loc[:, ~train_df.columns.isin(Model_Parameters.ignored_columns)],
        num_features=num_features,
        cat_features=cat_features,
        apply_pca=False,
    )
    # Last customer_ID entry of every customer, re-indexed from 0 so it can be
    # concatenated column-wise in front of the aggregated features.
    customer_ID_cols = (
        train_df.groupby('customer_ID')['customer_ID']
        .tail(1)
        .reset_index(drop=True)
    )
    train = pd.concat([customer_ID_cols, train], axis=1)
    train.to_csv('Processed_Data/train_df_cleaned.csv', index=False)

In [41]:
# Preview the first rows of the aggregated training frame.
train.head()

Unnamed: 0,customer_ID,P_2_mean,P_2_median,P_2_std,P_2_min,P_2_max,P_2_last,D_39_mean,D_39_median,D_39_std,...,D_117_count,D_117_last,D_117_nunique,D_120_count,D_120_last,D_120_nunique,D_126_count,D_126_last,D_126_nunique,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.933824,0.938469,0.024194,0.86858,0.960384,0.934745,0.230769,0.0,0.83205,...,13,5,1,13,0,1,13,2,1,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.89982,0.904814,0.022119,0.861109,0.929122,0.880519,7.153846,7.0,6.743468,...,13,0,1,13,0,2,13,2,1,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0.878454,0.884522,0.028911,0.79767,0.904482,0.880875,0.0,0.0,0.0,...,13,0,1,13,0,1,13,2,1,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0.598969,0.598278,0.020107,0.567442,0.623392,0.621776,1.538462,0.0,3.017046,...,13,7,2,13,0,1,13,2,1,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0.891679,0.879238,0.042325,0.805045,0.940382,0.8719,0.0,0.0,0.0,...,13,5,1,13,0,1,13,2,1,0


In [None]:
# Cross-validated XGBoost training.
# For every stratified fold: fit a booster with early stopping, save it to disk,
# record its feature importances and collect the out-of-fold (OOF) predictions.
importances = []
permutation_importances = []
oof = []

# Fraction of each training fold actually used for fitting (1.0 = all rows).
TRAIN_SUBSAMPLE = 1.0

# Every column except the first and the last one
# (customer_ID and target in the preview of `train`).
FEATURES = train.columns[1:-1]
gc.collect()

skf = StratifiedKFold(n_splits=Model_Parameters.FOLDS, shuffle=True, random_state=Model_Parameters.SEED)

for fold,(train_idx, valid_idx) in enumerate(skf.split(
            train, train.target )):

    # TRAIN WITH SUBSAMPLE OF TRAIN FOLD DATA
    if TRAIN_SUBSAMPLE<1.0:
        # The original called np.random.seed(SEED) with an undefined name, which
        # raised NameError as soon as subsampling was enabled. A seeded local
        # generator gives the same reproducible draw without touching NumPy's
        # global random state.
        rng = np.random.RandomState(Model_Parameters.SEED)
        train_idx = rng.choice(train_idx,
                       int(len(train_idx)*TRAIN_SUBSAMPLE), replace=False)

    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    # Same tuple as train.loc[:, FEATURES].shape, without copying the frame.
    print('### Train shape',(len(train), len(FEATURES)))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)

    # TRAIN, VALID, TEST FOR FOLD K
    # NOTE(review): skf.split yields positional indices; using them with .loc
    # assumes `train` has a default 0..n-1 index — confirm.
    X_valid = train.loc[valid_idx, FEATURES]
    y_valid = train.loc[valid_idx, 'target']

    dtrain = xgb.DMatrix(data=train.loc[train_idx, FEATURES], label=train.loc[train_idx, 'target'])
    dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

    # TRAIN MODEL FOLD K
    model = xgb.train(xgb_parms,
                dtrain=dtrain,
                evals=[(dtrain,'train'),(dvalid,'valid')],
                num_boost_round=9999,
                early_stopping_rounds=100,
                verbose_eval=100)
    model.save_model(f'XGB_v{Model_Parameters.VER}_fold{fold}.xgb')

    # GET FEATURE IMPORTANCE FOR FOLD K
    dd = model.get_score(importance_type='weight')
    df = pd.DataFrame({'feature':dd.keys(),f'importance_{fold}':dd.values()})
    importances.append(df)

    # INFER OOF FOLD K
    # NOTE(review): whether Booster.predict uses only the best early-stopped
    # iteration depends on the installed xgboost version — confirm.
    oof_preds = model.predict(dvalid)
    acc = amex_metric_mod(y_valid.values, oof_preds)
    print('Kaggle Metric =',acc,'\n')

    # SAVE OOF
    df = train.loc[valid_idx, ['customer_ID','target'] ].copy()
    df['oof_pred'] = oof_preds
    oof.append( df )

    # Free per-fold objects before the next fold is built.
    del dtrain,dd, df
    del X_valid, y_valid, dvalid, model
    _ = gc.collect()

print('#'*25)
oof = pd.concat(oof,axis=0,ignore_index=True).set_index('customer_ID')
acc = amex_metric_mod(oof.target.values, oof.oof_pred.values)
print('OVERALL CV Kaggle Metric =',acc)

#########################
### Fold 1
### Train size 229456 Valid size 229457
### Train shape (458913, 1059)
### Training with 100% fold data...
#########################
[0]	train-logloss:0.66450	valid-logloss:0.66455
[100]	train-logloss:0.26984	valid-logloss:0.27700
[200]	train-logloss:0.25054	valid-logloss:0.26461
[300]	train-logloss:0.23968	valid-logloss:0.26001
[400]	train-logloss:0.23107	valid-logloss:0.25735
[500]	train-logloss:0.22350	valid-logloss:0.25546
[600]	train-logloss:0.21655	valid-logloss:0.25397
[700]	train-logloss:0.21028	valid-logloss:0.25279
[800]	train-logloss:0.20399	valid-logloss:0.25153
[900]	train-logloss:0.19821	valid-logloss:0.25058
[1000]	train-logloss:0.19248	valid-logloss:0.24976
[1100]	train-logloss:0.18695	valid-logloss:0.24897
[1200]	train-logloss:0.18163	valid-logloss:0.24818
[1300]	train-logloss:0.17639	valid-logloss:0.24735
[1400]	train-logloss:0.17123	valid-logloss:0.24673
[1500]	train-logloss:0.16646	valid-logloss:0.24610
[1600]	train-logloss:0.161

In [None]:
# Release the aggregated training frame; any later cell that needs `train`
# has to load it again.
del train
_ = gc.collect()

In [None]:
# One row per customer (first occurrence in train_df), joined with the
# out-of-fold predictions; `oof` carries customer_ID as its index name.
oof_xgb = (
    train_df[['customer_ID']]
    .drop_duplicates(subset=['customer_ID'])
    .merge(oof, left_on='customer_ID', right_on='customer_ID')
)

In [None]:
# Persist the out-of-fold predictions (file name carries the model version) and preview them.
oof_xgb.to_csv(f'oof_xgb_v{Model_Parameters.VER}.csv',index=False)
oof_xgb.head()

In [None]:
# PLOT OOF PREDICTIONS
# Histogram (100 bins) of the out-of-fold predictions stored in oof_xgb.
plt.hist(oof_xgb.oof_pred.values, bins=100)
plt.title('OOF Predictions')
plt.show()

In [None]:
# Drop both out-of-fold frames and trigger garbage collection.
del oof_xgb, oof
_ = gc.collect()

In [None]:
# Normalise every fold's importance table to a long frame with the columns
# ['feature', 'importance'] (one row per feature).
#
# Depending on how pandas handled the dict views passed to DataFrame() in the
# training cell, a fold table either already has one row per feature or is a
# single row whose cells hold the whole key/value collections. The original
# code only handled the second layout: with the first one, list(<feature name>)
# splits the name into characters and list(<number>) raises TypeError.
importances_2 = []
for fold_importance in importances:
    if fold_importance.empty:
        # No feature was used by this fold's model.
        df = pd.DataFrame({'feature': [], 'importance': []})
    elif isinstance(fold_importance.iloc[0, 0], str):
        # Already one row per feature: only standardise the column names.
        df = pd.DataFrame({'feature': fold_importance.iloc[:, 0].to_numpy(),
                           'importance': fold_importance.iloc[:, 1].to_numpy()})
    else:
        # Single-row layout: unpack the collections stored in the first row.
        df = pd.DataFrame({'feature': list(fold_importance.iloc[0, 0]),
                           'importance': list(fold_importance.iloc[0, 1])})
    importances_2.append(df)

In [None]:
# Combine the per-fold importances into one table and rank by their mean.
#
# Every frame in importances_2 names its value column 'importance'. Merging
# them unchanged only works cleanly for two folds: with three folds the last
# fold's column is overwritten by the mean, and with four or more the automatic
# _x/_y suffixes produce duplicate column names. Giving each fold its own
# column name before the merge works for any number of folds.
df = importances_2[0].rename(columns={'importance': 'importance_0'})
for k in range(1, Model_Parameters.FOLDS):
    df = df.merge(importances_2[k].rename(columns={'importance': f'importance_{k}'}),
                  on='feature', how='left')
# Mean over the per-fold columns; folds in which a feature is missing (NaN after
# the left merge) are skipped by pandas' default skipna behaviour.
df['importance'] = df.iloc[:,1:].mean(axis=1)
df = df.sort_values('importance',ascending=False)
df.to_csv(f'xgb_feature_importance_v{Model_Parameters.VER}.csv',index=False)

In [None]:
# Horizontal bar chart of the highest-ranked features in `df`
# (sorted by mean importance, highest first).
NUM_FEATURES = 20
# Never request more bars than there are ranked features; otherwise barh
# receives arrays of different lengths and raises.
n_shown = min(NUM_FEATURES, len(df))
positions = np.arange(n_shown,0,-1)
plt.figure(figsize=(10,5*NUM_FEATURES//10))
plt.barh(positions, df.importance.values[:n_shown])
plt.yticks(positions, df.feature.values[:n_shown])
plt.title(f'XGB Feature Importance - Top {n_shown}')
plt.show()

In [None]:
# Read the test data from the local parquet file.
test_df = pd.read_parquet('Processed_Data/test.parquet')

In [None]:
if not Train_Parameters.compute_test_df:
    # Re-use the aggregated test features cached on disk.
    test = pd.read_csv('Processed_Data/test_FE')
else:
    # Aggregate the non-ignored columns with the project helper.
    test = agglomeration_function(
        test_df.loc[:, ~test_df.columns.isin(Model_Parameters.ignored_columns)],
        num_features=num_features,
        cat_features=cat_features,
        ignore=None,
        apply_pca=False,
    )
    # Last customer_ID entry of every customer, re-indexed from 0 so it can be
    # concatenated column-wise in front of the aggregated features.
    customer_ID_cols = (
        test_df.groupby('customer_ID')['customer_ID']
        .tail(1)
        .reset_index(drop=True)
    )
    test = pd.concat([customer_ID_cols, test], axis=1)
    test.to_csv('Processed_Data/test_FE', index=False)

In [None]:
# test_df = test_df.groupby('customer_ID').tail(1).reset_index(drop=True)

In [None]:
# Load the booster file written for fold index 0; this single model is the one
# used for the test predictions further down.
model = xgb.Booster()
model.load_model(f'XGB_v{Model_Parameters.VER}_fold0.xgb')

In [None]:
# Display the test frame without its first column (the slice fed to the model below).
test.iloc[:, 1:]

In [None]:
# Wrap every test column except the first one as XGBoost input.
# NOTE(review): despite its name, `y_test` holds features, not labels; the
# column order is assumed to match the training FEATURES — confirm.
y_test = xgb.DMatrix(data=test.iloc[:, 1:])

# y_test = xgb.DMatrix(data=test.loc[:, FEATURES])


In [None]:
# Score the test DMatrix with the loaded booster.
y_pred = model.predict(y_test)

In [None]:
# Number of predictions produced (one per row of `test`).
len(y_pred)

In [None]:
# Build the submission by filling the sample file's `prediction` column.
sample_df = pd.read_csv('amex-default-prediction/sample_submission.csv')
submission = pd.DataFrame({'customer_ID': test['customer_ID'], 'target': y_pred})
print(sample_df.shape, submission.shape)

# Align by customer_ID instead of by row position: the original positional
# assignment silently attaches predictions to the wrong customers whenever the
# row order of `test` differs from sample_submission.csv.
# Assumes sample_submission.csv has a `customer_ID` column — confirm.
pred_by_id = submission.set_index('customer_ID')['target']
if not pred_by_id.index.is_unique:
    raise ValueError('test predictions contain duplicated customer_ID values')
sample_df['prediction'] = sample_df['customer_ID'].map(pred_by_id)
if sample_df['prediction'].isna().any():
    raise ValueError('some customer_IDs in sample_submission.csv have no prediction')

In [None]:
# Write the submission file; the name carries the model version.
sample_df.to_csv(f'output_XGBoost_ver{Model_Parameters.VER}.csv', index=False)

In [None]:
# Empty scikit-learn style wrapper; a saved model is loaded into it in the next cell.
model_sklearn_api = xgb.XGBClassifier()

In [None]:
# Load the model saved for the last CV fold. The fold index is derived from the
# configuration rather than from the training loop's leftover `fold` variable,
# so this cell also works when the training cell was not run in this session.
model_sklearn_api.load_model(f'XGB_v{Model_Parameters.VER}_fold{Model_Parameters.FOLDS - 1}.xgb')

In [None]:
# `train` was released with `del train` earlier in the notebook, so referencing
# it here raised NameError on a top-to-bottom run. Reload only the first 10000
# rows from the cached aggregated-feature file instead.
perm_sample = pd.read_csv('Processed_Data/train_df_cleaned.csv', nrows=10000)
# First column is skipped and the last column is used as the label, exactly as
# the original slices did; random_state makes the shuffling reproducible.
perm_importance = permutation_importance(model_sklearn_api, perm_sample.iloc[:, 1:-1], perm_sample.iloc[:, -1],
                                         n_repeats=1, n_jobs=-1, random_state=Model_Parameters.SEED)

In [None]:
# Show the permutation-importance result object.
perm_importance

In [None]:
# Draw the tree with index 1 of the loaded booster and save it as a very large
# image (300 x 150 inches at the figure's own dpi).
# Uses the `xgb` and `plt` names imported at the top of the notebook instead of
# re-importing inside this cell.
xgb.plot_tree(model, num_trees=1)
fig = plt.gcf()
fig.set_size_inches(300, 150)
plt.savefig('pic.jpg', dpi='figure')

In [None]:
# Reload the out-of-fold predictions from the versioned CSV file.
oof_xgb = pd.read_csv(f'oof_xgb_v{Model_Parameters.VER}.csv')

In [None]:
# `train` no longer exists at this point (it was deleted to free memory), so a
# bare `train` raised NameError on a top-to-bottom run. Preview a few rows of
# the cached aggregated features instead of the in-memory frame.
pd.read_csv('Processed_Data/train_df_cleaned.csv', nrows=5)

In [None]:
# Preview the reloaded out-of-fold predictions.
oof_xgb.head()

In [None]:
# Every 1000th row: label against out-of-fold prediction. Axis labels and a
# title are added so the figure can be read on its own.
plt.scatter(oof_xgb.target[::1000], oof_xgb.oof_pred[::1000])
plt.xlabel('target')
plt.ylabel('oof_pred')
plt.title('OOF prediction vs. target (every 1000th row)')
plt.show()

In [None]:
# from sklearn import metrics
# metrics.plot_confusion_matrix(model_sklearn_api, train.iloc[:10000, 1:-1], train.iloc[:10000, -1]) 
# plt.show()