In [1]:
import sklearn
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import *
from scipy.linalg import sqrtm
from sklearn.model_selection import *
from sklearn.linear_model import *
from sklearn.manifold import *
from sklearn.preprocessing import *
from sklearn.metrics import *
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn.neighbors import KNeighborsClassifier
import pickle

from catboost import *
import gc
import catboost
from sklearn.base import clone
from collections import *

In [2]:
data = pd.read_parquet("../input/train-all-fill/train_final.parquet")

In [3]:
data = data.astype(np.float16)

In [4]:
labels = pd.read_csv("../input/amex-default-prediction/train_labels.csv")
labels = labels.set_index("customer_ID").astype(np.int16)

In [5]:
cat_features  = ['B_30', 'B_38', 'D_114', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64',  'D_68']

In [6]:
data = data.drop(cat_features, axis=1)

In [7]:
data_group = data.groupby(data.index)

data_max  = data_group.max()
data_min  = data_group.min()
data_mean = data_group.mean()

data_stat_features = pd.concat([data_max, data_min, data_mean], axis = 1).to_numpy(np.float16)

del data_max
del data_min
del data_mean

In [8]:
data = data.to_numpy(np.float16)
labels = labels.to_numpy(np.int16)

In [9]:
data = data.reshape(-1, 13, data.shape[1])
labels = labels.reshape(-1)

In [10]:
# train_data, test_data, train_label, test_label = train_test_split(data, labels, test_size = 0.3, stratify = labels, random_state = 0, shuffle = True)
train_data = data
train_label = labels
train_stat_features, test_stat_features = data_stat_features[:train_data.shape[0]], data_stat_features[train_data.shape[0]:]

In [11]:
kfolds = StratifiedKFold(3)

In [12]:
class Model():
    
    def __init__(self, main_model, row_model, column_model,
                 count_row_models = 13, count_column_models = 156,
                 main_model_path = None, row_model_path = None, column_model_path = None):
        
        self.main_model    = main_model
        self.row_model     = row_model
        self.column_model  = column_model
        
        self.count_row_models    = count_row_models
        self.count_column_models = count_column_models 

        self.main_model_path     = main_model_path
        self.row_model_path      = row_model_path
        self.column_model_path   = column_model_path
        
        self.init_first_level_models()
        
        if self.main_model_path != None:
            self.main_model  = pickle.load(open(self.main_model_path, "rb"))
        else:
            self.main_model = self.main_model.copy()
            

    def init_first_level_models(self):
        
        if self.row_model_path != None:
            self.row_models = pickle.load(open(self.row_model_path, "rb"))
        else:
            self.row_models = [self.row_model.copy() for i in range(self.count_row_models)]
        
        if self.column_model_path != None:
            self.column_models = pickle.load(open(self.column_model_path, "rb"))
        else:
            self.column_models = [clone(self.column_model) for i in range(self.count_column_models)]
    
    
    def train_row_models(self, data, labels):

        for i in tqdm(range(data.shape[1])):

            train_data = data[:, i, :]

            self.row_models[i].fit(train_data, labels)

        
    def train_column_models(self, data, labels):

        for i in tqdm(range(data.shape[2])):

            train_data = data[:, :, i]

            self.column_models[i].fit(train_data, labels)

    
    def train_main_model(self, data, labels, additional_features = None):
            
        main_preds = self.predict_first_level_models(data)

        if type(additional_features) != type(None):
            main_preds = np.concatenate([main_preds, additional_features], axis = 1)

        self.main_model.fit(main_preds, labels)
        
        return main_preds
    
    
    def predict(self, data, additional_features = None):
        
        main_preds = self.predict_first_level_models(data)
        
        if type(additional_features) != type(None):
            main_preds = np.concatenate([main_preds, additional_features], axis= 1)
        
        preds = self.main_model.predict(main_preds)
        
        return preds
    
    
    def predict_by_column_models(self, data):

        results = np.zeros((data.shape[0], data.shape[2]))

        for i in tqdm(range(data.shape[2])):

            test_data = data[:, :, i]

            preds = self.column_models[i].predict_proba(test_data)[:, 1]

            results[:, i] = preds

        return results

    
    def predict_by_row_models(self, data):

        results = np.zeros((data.shape[0], data.shape[1]))

        for i in tqdm(range(data.shape[1])):

            test_data = data[:, i, :]

            preds = self.row_models[i].predict_proba(test_data)[:, 1]

            results[:, i] = preds

        return results


    def predict_first_level_models(self, data):

        row_preds    = self.predict_by_row_models(data)
        column_preds = self.predict_by_column_models(data)
        
        main_preds = np.concatenate([row_preds, column_preds], axis = 1)

        return main_preds

In [13]:
main_model   = CatBoostClassifier(depth = 9, iterations = 600, l2_leaf_reg = 3,  learning_rate = 0.01, task_type="GPU", logging_level = "Silent")
row_model    = CatBoostClassifier(early_stopping_rounds=10, task_type="GPU", logging_level = "Silent") 
column_model = LogisticRegression(max_iter = 500)

model = Model(main_model,
              row_model, 
              column_model)

In [14]:
for train1_index, train2_index in kfolds.split(train_data, train_label):

    train1, train1_targets = train_data[train1_index], train_label[train1_index]
    train2,  train2_targets  = train_data[train2_index],  train_label[train2_index]
    train_stat   = train_stat_features [train2_index]
    
    model.train_row_models(   train1, train1_targets)
    model.train_column_models(train1, train1_targets)
        
    model.train_main_model(train2, train2_targets)
    
    model.init_first_level_models()

model.train_column_models(train_data, train_label)
model.train_row_models(train_data, train_label)

100%|██████████| 13/13 [07:39<00:00, 35.38s/it]
100%|██████████| 156/156 [03:15<00:00,  1.25s/it]
100%|██████████| 13/13 [01:26<00:00,  6.65s/it]
100%|██████████| 156/156 [00:12<00:00, 12.41it/s]
100%|██████████| 13/13 [06:17<00:00, 29.07s/it]
100%|██████████| 156/156 [03:12<00:00,  1.23s/it]
100%|██████████| 13/13 [01:25<00:00,  6.57s/it]
100%|██████████| 156/156 [00:12<00:00, 12.01it/s]
100%|██████████| 13/13 [06:18<00:00, 29.11s/it]
100%|██████████| 156/156 [03:15<00:00,  1.26s/it]
100%|██████████| 13/13 [01:27<00:00,  6.71s/it]
100%|██████████| 156/156 [00:12<00:00, 12.64it/s]
100%|██████████| 156/156 [04:55<00:00,  1.89s/it]
100%|██████████| 13/13 [08:43<00:00, 40.27s/it]


In [15]:
model.train_column_models(train_data, train_label)
model.train_row_models(train_data, train_label)

100%|██████████| 156/156 [04:59<00:00,  1.92s/it]
100%|██████████| 13/13 [08:46<00:00, 40.51s/it]


In [15]:
preds = model.predict(test_data)

100%|██████████| 13/13 [01:27<00:00,  6.76s/it]
100%|██████████| 156/156 [00:11<00:00, 13.41it/s]


In [16]:
with open("abs_model", 'wb') as file:
    pickle.dump(model, file)

In [17]:
accuracy_score(test_label, preds)

0.8942792393625522

In [20]:
f1_score(test_label, preds)

0.7954414997129937