In [1]:
%run import_libs.py

### Get data

In [2]:
# Load the raw training frame. The helpers used in this cell are not defined in this
# notebook; presumably they come from import_libs.py via %run — TODO confirm.
df_train = get_train_data(TRAIN_PATH='./data/train.parquet')
# Feature names read from column "0" of the CSV (presumably the numeric base features).
num_features = pd.read_csv("num_feats_after_filtering.csv")["0"].to_list()

# Pass the frame and the feature list to the aggregation helper, load the labels,
# then merge both. NOTE: `df_train` is rebound to the merged frame here, so the raw
# frame loaded at the top of this cell is no longer reachable under that name.
df_train_agg = get_df_w_aggrs(df=df_train, feats=num_features)
df_train_target = get_target(TARGET_PATH='./data/train_labels.csv')
df_train = get_train_data_with_target_merged(df_train=df_train_agg, df_train_target=df_train_target)

(458913, 151)
(458913, 151)
(458913, 151)
(458913, 162)
(458913, 616)


In [3]:
# Load the test frame and run it through the same aggregation helper as the training data.
# NOTE: `num_features` must still be the list read from num_feats_after_filtering.csv
# when this cell runs; a later cell rebinds that name, so re-running this cell out of
# order would pass a different `feats` list to the helper.
df_test = get_test_data(TEST_PATH='./data/test.parquet')
df_test = get_df_w_aggrs(df=df_test, feats=num_features)

(924621, 151)
(924621, 151)
(924621, 151)
(924621, 162)
(924621, 616)


In [4]:
# Categorical columns are the "<base>_last" variants of these base names.
cat_features = [
    base_name + "_last"
    for base_name in (
        'B_30', 'B_38',
        'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
        'D_63', 'D_64', 'D_66', 'D_68',
    )
]
cat_features

['B_30_last',
 'B_38_last',
 'D_114_last',
 'D_116_last',
 'D_117_last',
 'D_120_last',
 'D_126_last',
 'D_63_last',
 'D_64_last',
 'D_66_last',
 'D_68_last']

In [5]:
# Split the non-categorical columns of df_train into groups keyed by the first
# character of the column name. Columns whose name starts with any other
# character are left out of every group.
payment_feats, delinq_feats, spend_feats, balance_feats, risk_feats = [], [], [], [], []

# First character of the column name -> list that collects matching columns.
prefix_to_bucket = {
    'P': payment_feats,
    'D': delinq_feats,
    'S': spend_feats,
    'B': balance_feats,
    'R': risk_feats,
}

for column in df_train:
    # Categorical columns are tracked separately in `cat_features`.
    if column in cat_features:
        continue

    bucket = prefix_to_bucket.get(column[0])
    if bucket is not None:
        bucket.append(column)

In [6]:
len(payment_feats) + len(delinq_feats) + len(spend_feats) + len(balance_feats) + len(risk_feats)

604

In [7]:
# Rebind `num_features` to the grouped column names (payment, delinquency, spend,
# balance, risk — in that order). This replaces the list read from the CSV earlier.
num_features = [
    *payment_feats,
    *delinq_feats,
    *spend_feats,
    *balance_feats,
    *risk_feats,
]
len(num_features)

604

### Define config

In [8]:
class CFG:
    """Run-wide settings read by the cells below."""

    # Label printed in the run banner.
    model = 'tabnet'

    # Reproducibility / cross-validation.
    seed = 42
    N_folds = 5

    # Training loop.
    max_epochs = 60
    batch_size = 512

    # When True, the cross-validation loop stops after the first fold.
    DEBUG = False

In [9]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU plus all CUDA devices when present), and exports
    ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. subprocesses);
    # the hash seed of the already-running kernel cannot be changed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # The model is trained with torch, so its generators must be seeded too.
    torch.manual_seed(seed)
    # Safe to call without a GPU: it is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
seed_everything(seed = CFG.seed)

In [10]:
psutil.virtual_memory().percent

31.0

In [11]:
# Custom evaluation metric so TabNet can be scored with the Amex metric.
class Amex_tabnet(Metric):
    """Amex metric wrapper, registered under the name ``amex_tabnet``.

    The score is computed by ``get_amex_metric_calculated`` on column 1 of the
    prediction matrix (presumably the positive-class probability — confirm
    against the model's output layout) and is clamped so it never goes below 0.
    """

    def __init__(self):
        # Higher values are treated as better.
        self._maximize = True
        self._name = 'amex_tabnet'

    def __call__(self, y_true, y_pred):
        column_one_scores = y_pred[:, 1]
        score = get_amex_metric_calculated(y_true, column_one_scores)
        return max(score, 0.)

### Fit model

In [12]:
# Print a short banner describing the run configuration.
separator = '-' * 50

banner_rows = (
    ('\n ', separator),
    ('\nTraining: ', CFG.model),
    ('\n ', separator),
    ('\nSeed: ', CFG.seed),
    ('N folds: ', CFG.N_folds),
    ('\nN features: ', len(num_features)),
)
for label, value in banner_rows:
    print(label, value)

print('\n')


  --------------------------------------------------

Training:  tabnet

  --------------------------------------------------

Seed:  42
N folds:  5

N features:  604




In [13]:
# Replace every missing value with 0 in both frames, modifying them in place.
# NOTE(review): this applies to all columns, not only those in `num_features` —
# confirm that 0 is an acceptable fill value for the remaining columns.
df_train.fillna(0, inplace=True)
df_test.fillna(0, inplace=True)

print('Shapes:', df_train.shape, df_test.shape)

Shapes: (458913, 617) (924621, 616)


In [15]:
type(df_test)

pandas.core.frame.DataFrame

In [17]:
df_test.shape

(924621, 616)

In [18]:
X_tr.shape

(367130, 604)

In [20]:
X_va.shape

(91783, 604)

In [19]:
len(X_va.values)

91783

In [23]:
type(X_va.values)

numpy.ndarray

In [24]:
X_va.values.shape

(91783, 604)

In [21]:
df_test.shape

(924621, 616)

In [22]:
df_test[num_features].shape

(924621, 604)

In [None]:
df_test.values

In [27]:
test_predictions.shape

(924621,)

In [None]:
test_predictions

In [28]:
len(model.feature_importances_)

604

In [30]:
len(df_train.columns.tolist())

617

In [29]:
len(df_train[num_features].columns.tolist())

604

In [14]:
# Cross-validated TabNet training.
#
# Produces, for use by later cells:
#   oof_predictions     - out-of-fold column-1 probability for every training row
#   test_predictions    - test probabilities averaged over the folds actually trained
#   feature_importances - one "importance_fold<k>" column per fold (k starts at 1)
#   stats               - per-epoch training loss / validation metric per fold
#   explain_matrices, masks_ - outputs of model.explain on each validation fold

# Out-of-fold buffer plus a mask of the rows that actually received a prediction
# (in DEBUG mode only the first fold's validation rows are filled).
oof_predictions = np.zeros(df_train.shape[0])
oof_filled = np.zeros(df_train.shape[0], dtype=bool)

# Running SUM of per-fold test probabilities; converted to a mean after the loop.
test_predictions = np.zeros(df_test.shape[0])
n_folds_trained = 0

# Column names only; avoids materialising a copy of the feature frame.
feature_importances = pd.DataFrame({"feature": list(num_features)})

# Histories are collected as Series so folds that stop at different epochs can be
# combined (shorter folds are padded with NaN) instead of raising a length mismatch.
fold_history = {}
stats = pd.DataFrame()
explain_matrices = []
masks_ = []

target_col = 'target'
group_col = 'customer_ID'

target, groups = df_train[target_col].values, df_train[group_col].values

sgkf = StratifiedGroupKFold(CFG.N_folds, shuffle=True, random_state=CFG.seed)

for fold, (tr_idx, va_idx) in enumerate(
                sgkf.split(df_train[[group_col, target_col]], target, groups)):
    print(f"Fold {fold}")

    ## DEBUG MODE: train a single fold only
    if CFG.DEBUG and fold > 0:
        print('\nDEBUG mode activated: Will train only one fold...\n')
        break

    start = time.time()

    X_tr, X_va = df_train.iloc[tr_idx][num_features], df_train.iloc[va_idx][num_features]
    y_tr, y_va = target[tr_idx], target[va_idx]

    # Convert the validation fold once; it is reused for fit, explain and predict.
    X_va_np = X_va.to_numpy()

    model = TabNetClassifier(n_d = 32,
                             n_a = 32,
                             n_steps = 3,
                             gamma = 1.3,
                             n_independent = 2,
                             n_shared = 2,
                             momentum = 0.02,
                             clip_value = None,
                             lambda_sparse = 1e-3,
                             optimizer_fn = torch.optim.Adam,
                             optimizer_params = dict(lr = 1e-3, weight_decay=1e-3),
                             scheduler_fn = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts,
                             scheduler_params = {'T_0':5,
                                                 'eta_min':1e-4,
                                                 'T_mult':1,
                                                 'last_epoch':-1},
                             mask_type = 'entmax',
                             seed = CFG.seed)

    ## train
    model.fit(X_tr.to_numpy(),
              y_tr,
              eval_set = [(X_va_np, y_va)],
              max_epochs = CFG.max_epochs,
              patience = 50,
              batch_size = CFG.batch_size,
              eval_metric = ['auc', 'accuracy', Amex_tabnet]) # Last metric is used for early stopping

    # model explainability (only the first two step masks are kept)
    explain_matrix, masks = model.explain(X_va_np)
    explain_matrices.append(explain_matrix)
    masks_.append(masks[0])
    masks_.append(masks[1])

    # Inference
    oof_predictions[va_idx] = model.predict_proba(X_va_np)[:, 1]
    oof_filled[va_idx] = True

    test_predictions += model.predict_proba(df_test[num_features].values)[:, 1]
    n_folds_trained += 1

    # Fold number is 1-based, matching the `stats` column names below.
    feature_importances[f"importance_fold{fold+1}"] = model.feature_importances_

    # Loss , metric tracking
    fold_history[f'fold{fold+1}_train_loss'] = pd.Series(model.history['loss'])
    fold_history[f'fold{fold+1}_val_metric'] = pd.Series(model.history['val_0_amex_tabnet'])
    stats = pd.DataFrame(fold_history)

    end = time.time()
    time_delta = np.round((end - start)/60, 2)

    print(f'\nFold {fold+1}/{CFG.N_folds} | {time_delta:.2f} min')

    ### free memory
    del X_tr, y_tr
    del X_va, y_va, X_va_np
    gc.collect()

# Turn the accumulated sum into a mean over the folds that were really trained,
# so the scale is right for any N_folds and in DEBUG mode.
# NOTE: if the loop is interrupted, test_predictions still holds the raw sum.
if n_folds_trained > 0:
    test_predictions /= n_folds_trained

# Score only the rows that received an out-of-fold prediction.
print(f'OOF score across folds: {get_amex_metric_calculated(target[oof_filled], oof_predictions[oof_filled])}')

Fold 0




epoch 0  | loss: 0.51859 | val_0_auc: 0.91092 | val_0_accuracy: 0.83555 | val_0_amex_tabnet: 0.5967  |  0:00:59s
epoch 1  | loss: 0.35551 | val_0_auc: 0.9277  | val_0_accuracy: 0.86246 | val_0_amex_tabnet: 0.6642  |  0:01:58s
epoch 2  | loss: 0.31299 | val_0_auc: 0.93591 | val_0_accuracy: 0.87087 | val_0_amex_tabnet: 0.68442 |  0:02:58s
epoch 3  | loss: 0.29255 | val_0_auc: 0.93984 | val_0_accuracy: 0.87697 | val_0_amex_tabnet: 0.7011  |  0:03:59s
epoch 4  | loss: 0.28269 | val_0_auc: 0.9412  | val_0_accuracy: 0.87815 | val_0_amex_tabnet: 0.70547 |  0:04:59s
epoch 5  | loss: 0.27169 | val_0_auc: 0.94764 | val_0_accuracy: 0.88494 | val_0_amex_tabnet: 0.73266 |  0:05:59s
epoch 6  | loss: 0.25838 | val_0_auc: 0.95045 | val_0_accuracy: 0.88843 | val_0_amex_tabnet: 0.7434  |  0:07:00s
epoch 7  | loss: 0.25037 | val_0_auc: 0.95238 | val_0_accuracy: 0.89001 | val_0_amex_tabnet: 0.75335 |  0:08:01s
epoch 8  | loss: 0.24568 | val_0_auc: 0.95314 | val_0_accuracy: 0.89127 | val_0_amex_tabnet: 0.7




Fold 1/5 | 136.43 min
Fold 1




epoch 0  | loss: 0.51468 | val_0_auc: 0.91434 | val_0_accuracy: 0.83986 | val_0_amex_tabnet: 0.61337 |  0:00:58s
epoch 1  | loss: 0.35197 | val_0_auc: 0.93174 | val_0_accuracy: 0.86868 | val_0_amex_tabnet: 0.67345 |  0:01:55s
epoch 2  | loss: 0.30433 | val_0_auc: 0.93849 | val_0_accuracy: 0.87566 | val_0_amex_tabnet: 0.69687 |  0:02:54s
epoch 3  | loss: 0.28564 | val_0_auc: 0.94177 | val_0_accuracy: 0.87843 | val_0_amex_tabnet: 0.7107  |  0:03:54s
epoch 4  | loss: 0.27844 | val_0_auc: 0.94289 | val_0_accuracy: 0.88072 | val_0_amex_tabnet: 0.71519 |  0:04:54s
epoch 5  | loss: 0.27055 | val_0_auc: 0.94771 | val_0_accuracy: 0.88674 | val_0_amex_tabnet: 0.73279 |  0:05:54s
epoch 6  | loss: 0.25776 | val_0_auc: 0.95076 | val_0_accuracy: 0.89094 | val_0_amex_tabnet: 0.74791 |  0:06:54s
epoch 7  | loss: 0.24945 | val_0_auc: 0.95318 | val_0_accuracy: 0.8925  | val_0_amex_tabnet: 0.75781 |  0:07:53s
epoch 8  | loss: 0.24422 | val_0_auc: 0.95484 | val_0_accuracy: 0.89596 | val_0_amex_tabnet: 0.7




Fold 2/5 | 137.02 min
Fold 2




epoch 0  | loss: 0.52439 | val_0_auc: 0.90668 | val_0_accuracy: 0.83744 | val_0_amex_tabnet: 0.59568 |  0:00:59s
epoch 1  | loss: 0.37264 | val_0_auc: 0.92558 | val_0_accuracy: 0.85822 | val_0_amex_tabnet: 0.64914 |  0:01:58s
epoch 2  | loss: 0.32428 | val_0_auc: 0.93421 | val_0_accuracy: 0.86786 | val_0_amex_tabnet: 0.68245 |  0:02:58s
epoch 3  | loss: 0.29638 | val_0_auc: 0.93786 | val_0_accuracy: 0.87405 | val_0_amex_tabnet: 0.69538 |  0:03:59s
epoch 4  | loss: 0.2869  | val_0_auc: 0.93995 | val_0_accuracy: 0.87652 | val_0_amex_tabnet: 0.70357 |  0:04:59s
epoch 5  | loss: 0.27785 | val_0_auc: 0.94383 | val_0_accuracy: 0.88035 | val_0_amex_tabnet: 0.72507 |  0:05:59s
epoch 6  | loss: 0.26409 | val_0_auc: 0.94931 | val_0_accuracy: 0.88776 | val_0_amex_tabnet: 0.74506 |  0:06:59s
epoch 7  | loss: 0.25578 | val_0_auc: 0.95198 | val_0_accuracy: 0.89083 | val_0_amex_tabnet: 0.75392 |  0:08:00s
epoch 8  | loss: 0.24912 | val_0_auc: 0.95349 | val_0_accuracy: 0.89281 | val_0_amex_tabnet: 0.7




Fold 3/5 | 136.81 min
Fold 3




epoch 0  | loss: 0.52531 | val_0_auc: 0.90527 | val_0_accuracy: 0.82896 | val_0_amex_tabnet: 0.59258 |  0:00:59s
epoch 1  | loss: 0.36712 | val_0_auc: 0.92433 | val_0_accuracy: 0.85669 | val_0_amex_tabnet: 0.64673 |  0:01:58s
epoch 2  | loss: 0.32723 | val_0_auc: 0.93244 | val_0_accuracy: 0.86539 | val_0_amex_tabnet: 0.67148 |  0:02:58s
epoch 3  | loss: 0.29798 | val_0_auc: 0.93962 | val_0_accuracy: 0.87579 | val_0_amex_tabnet: 0.70256 |  0:03:58s
epoch 4  | loss: 0.28263 | val_0_auc: 0.94099 | val_0_accuracy: 0.878   | val_0_amex_tabnet: 0.70871 |  0:04:59s
epoch 5  | loss: 0.27045 | val_0_auc: 0.94763 | val_0_accuracy: 0.88654 | val_0_amex_tabnet: 0.73839 |  0:06:00s
epoch 6  | loss: 0.25632 | val_0_auc: 0.9505  | val_0_accuracy: 0.89026 | val_0_amex_tabnet: 0.74739 |  0:07:00s
epoch 7  | loss: 0.24939 | val_0_auc: 0.95265 | val_0_accuracy: 0.89299 | val_0_amex_tabnet: 0.75847 |  0:08:01s
epoch 8  | loss: 0.24489 | val_0_auc: 0.9535  | val_0_accuracy: 0.89391 | val_0_amex_tabnet: 0.7




Fold 4/5 | 139.70 min
Fold 4




epoch 0  | loss: 0.52208 | val_0_auc: 0.91422 | val_0_accuracy: 0.84087 | val_0_amex_tabnet: 0.60947 |  0:00:58s
epoch 1  | loss: 0.35341 | val_0_auc: 0.93089 | val_0_accuracy: 0.86691 | val_0_amex_tabnet: 0.67317 |  0:01:58s
epoch 2  | loss: 0.31382 | val_0_auc: 0.93686 | val_0_accuracy: 0.87103 | val_0_amex_tabnet: 0.6936  |  0:02:57s
epoch 3  | loss: 0.29245 | val_0_auc: 0.93939 | val_0_accuracy: 0.87553 | val_0_amex_tabnet: 0.70164 |  0:03:57s
epoch 4  | loss: 0.28356 | val_0_auc: 0.941   | val_0_accuracy: 0.87643 | val_0_amex_tabnet: 0.70746 |  0:04:57s
epoch 5  | loss: 0.27425 | val_0_auc: 0.94713 | val_0_accuracy: 0.88363 | val_0_amex_tabnet: 0.73496 |  0:05:56s
epoch 6  | loss: 0.25854 | val_0_auc: 0.95021 | val_0_accuracy: 0.88827 | val_0_amex_tabnet: 0.74826 |  0:06:57s
epoch 7  | loss: 0.25163 | val_0_auc: 0.95257 | val_0_accuracy: 0.89185 | val_0_amex_tabnet: 0.75742 |  0:07:57s
epoch 8  | loss: 0.24604 | val_0_auc: 0.95387 | val_0_accuracy: 0.89286 | val_0_amex_tabnet: 0.7




Fold 5/5 | 136.30 min
OOF score across folds: 0.7885104454659271


In [33]:
test_predictions

array([0.12710182, 0.01828349, 0.03937742, ..., 0.51745488, 0.32151679,
       0.03944453], shape=(924621,))

In [16]:
test_predictions

array([0.0378257 , 0.00226313, 0.04991527, ..., 0.58437125, 0.23511358,
       0.08121067], shape=(924621,))

In [17]:
np.mean(test_predictions, axis=0)

np.float64(0.24355196589327263)

In [18]:
test_predictions.shape

(924621,)

In [19]:
# Switch for writing the submission file from the averaged test predictions.
INFERENCE = True

if INFERENCE:
    submission_columns = {
        'customer_ID': df_test['customer_ID'],
        'prediction': test_predictions,
    }
    sub = pd.DataFrame(submission_columns)
    sub.to_csv('submission_tabnet_all_features.csv', index=False)
    display(sub)

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.037826
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.002263
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.049915
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0.307008
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0.865024
...,...,...
924616,ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c...,0.013722
924617,ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3...,0.789601
924618,ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475...,0.584371
924619,ffffddef1fc3643ea179c93245b68dca0f36941cd83977...,0.235114
