In [5]:
import sklearn
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import *
from scipy.linalg import sqrtm
from sklearn.model_selection import *
from sklearn.linear_model import *
from sklearn.manifold import *
from sklearn.preprocessing import *
from sklearn.metrics import *
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn.neighbors import KNeighborsClassifier
import pickle
from catboost import *
import gc
import catboost
from sklearn.base import clone
from collections import *

In [6]:
class Model():
    """Two-level stacking ensemble over 3-D input ``data[sample, row, column]``.

    First level: one "row model" per index along axis 1 (fitted on
    ``data[:, i, :]``) and one "column model" per index along axis 2 (fitted
    on ``data[:, :, i]``).  Column 1 of each first-level ``predict_proba``
    output becomes one meta-feature.

    Second level: ``main_model`` is fitted on the concatenated row/column
    meta-features, optionally extended with caller-supplied extra features.
    """

    def __init__(self, main_model, row_model, column_model,
                 count_row_models = 13, count_column_models = 156,
                 main_model_path = None, row_model_path = None, column_model_path = None):
        """Create the ensemble from prototypes or from pickled models.

        Parameters
        ----------
        main_model : object
            Second-level prototype.  Must expose ``.copy()`` unless
            ``main_model_path`` is given (presumably a CatBoost model -- confirm).
        row_model : object
            First-level prototype duplicated with ``.copy()``; unused when
            ``row_model_path`` is given.
        column_model : object
            First-level prototype duplicated with ``sklearn.base.clone``;
            unused when ``column_model_path`` is given.
        count_row_models, count_column_models : int
            Number of first-level models to create from the prototypes.
        main_model_path, row_model_path, column_model_path : str or None
            Optional pickle files.  When given, the unpickled object replaces
            the corresponding prototype-based model(s).
        """
        self.main_model    = main_model
        self.row_model     = row_model
        self.column_model  = column_model

        self.count_row_models    = count_row_models
        self.count_column_models = count_column_models

        self.main_model_path     = main_model_path
        self.row_model_path      = row_model_path
        self.column_model_path   = column_model_path

        self.init_first_level_models()

        if self.main_model_path is not None:
            self.main_model = self._load_pickle(self.main_model_path)
        else:
            # Work on a private copy so the caller's prototype is not fitted.
            self.main_model = self.main_model.copy()


    @staticmethod
    def _load_pickle(path):
        """Unpickle ``path`` and close the file handle afterwards."""
        # SECURITY: pickle can execute arbitrary code while loading; only
        # point the *_path arguments at files from a trusted source.
        with open(path, "rb") as handle:
            return pickle.load(handle)


    def init_first_level_models(self):
        """Populate ``self.row_models`` and ``self.column_models``.

        Each list is unpickled from its ``*_path`` when one was supplied,
        otherwise built by duplicating the corresponding prototype.
        """
        if self.row_model_path is not None:
            self.row_models = self._load_pickle(self.row_model_path)
        else:
            self.row_models = [self.row_model.copy()
                               for _ in range(self.count_row_models)]

        if self.column_model_path is not None:
            self.column_models = self._load_pickle(self.column_model_path)
        else:
            self.column_models = [clone(self.column_model)
                                  for _ in range(self.count_column_models)]


    def train_row_models(self, data, labels):
        """Fit row model ``i`` on ``data[:, i, :]`` for every index of axis 1."""
        for i in tqdm(range(data.shape[1])):
            self.row_models[i].fit(data[:, i, :], labels)


    def train_column_models(self, data, labels):
        """Fit column model ``i`` on ``data[:, :, i]`` for every index of axis 2."""
        for i in tqdm(range(data.shape[2])):
            self.column_models[i].fit(data[:, :, i], labels)


    def _build_main_features(self, data, additional_features = None):
        """Return first-level predictions, with optional extra columns appended."""
        main_preds = self.predict_first_level_models(data)

        # ``is not None`` rather than ``!= None``: comparing a numpy array
        # with ``!=`` is element-wise and cannot be used as a condition.
        if additional_features is not None:
            main_preds = np.concatenate([main_preds, additional_features], axis = 1)

        return main_preds


    def train_main_model(self, data, labels, additional_features = None):
        """Fit the second-level model on first-level predictions.

        The first-level models must already be fitted.  Returns the feature
        matrix that ``main_model`` was fitted on.
        """
        main_preds = self._build_main_features(data, additional_features)

        self.main_model.fit(main_preds, labels)

        return main_preds


    def predict(self, data, additional_features = None):
        """Return ``main_model.predict`` evaluated on the meta-features of ``data``."""
        main_preds = self._build_main_features(data, additional_features)

        return self.main_model.predict(main_preds)


    def predict_by_column_models(self, data):
        """Return an array of shape ``(data.shape[0], data.shape[2])``.

        Column ``i`` holds column 1 of
        ``column_models[i].predict_proba(data[:, :, i])``.
        """
        results = np.zeros((data.shape[0], data.shape[2]))

        for i in tqdm(range(data.shape[2])):
            results[:, i] = self.column_models[i].predict_proba(data[:, :, i])[:, 1]

        return results


    def predict_by_row_models(self, data):
        """Return an array of shape ``(data.shape[0], data.shape[1])``.

        Column ``i`` holds column 1 of
        ``row_models[i].predict_proba(data[:, i, :])``.
        """
        results = np.zeros((data.shape[0], data.shape[1]))

        for i in tqdm(range(data.shape[1])):
            results[:, i] = self.row_models[i].predict_proba(data[:, i, :])[:, 1]

        return results


    def predict_first_level_models(self, data):
        """Concatenate row-model and column-model predictions along axis 1.

        Row-model columns come first, followed by column-model columns.
        """
        row_preds    = self.predict_by_row_models(data)
        column_preds = self.predict_by_column_models(data)

        return np.concatenate([row_preds, column_preds], axis = 1)

In [7]:
def predict(model, data, additional_features = None):
    """Return column 1 of the second-level ``predict_proba`` for ``data``.

    Parameters
    ----------
    model : object
        Must provide ``predict_first_level_models(data)`` and a
        ``main_model`` attribute exposing ``predict_proba``.
    data : array
        Passed unchanged to ``model.predict_first_level_models``.
    additional_features : array or None
        Optional extra columns appended (axis 1) to the first-level
        predictions before they reach ``main_model``.

    Returns
    -------
    numpy.ndarray
        ``main_model.predict_proba(features)[:, 1]``.
    """
    main_preds = model.predict_first_level_models(data)

    # Identity test: ``!=`` on a numpy array is element-wise, and comparing
    # types was only a workaround for that.
    if additional_features is not None:
        main_preds = np.concatenate([main_preds, additional_features], axis= 1)

    return model.main_model.predict_proba(main_preds)[:, 1]

In [8]:
# Load the previously saved model object used for inference below.
# NOTE(review): presumably an instance of the Model class defined above;
# unpickling needs that class to be defined in this kernel first -- confirm.
# SECURITY: pickle.load can run arbitrary code; only load trusted files.
with open("../input/models/abs_model", "rb") as file:
    model = pickle.load(file)

In [9]:
def get_stats(data):
    """Per-index-label max, min and mean of every column, as float16.

    Rows of ``data`` are grouped by index label.  The result has one row per
    distinct label, in order of first appearance, and its columns are laid
    out as ``[all maxima | all minima | all means]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame whose index identifies the group of each row.

    Returns
    -------
    numpy.ndarray
        Array of dtype float16 with shape
        ``(n_distinct_labels, 3 * n_columns)``.
    """
    # sort=False keeps groups in order of first appearance, so the rows line
    # up with ``data.index.drop_duplicates()`` and with a row-major reshape
    # of ``data`` even when the index is not sorted.  For a sorted index the
    # result is the same as with the default sort=True.
    grouped = data.groupby(data.index, sort=False)

    stats = [grouped.max(), grouped.min(), grouped.mean()]

    return pd.concat(stats, axis = 1).to_numpy(np.float16)

In [10]:
# Columns removed before inference; defined once because the list does not
# change between input files.
cat_features  = ['B_30', 'B_38', 'D_114', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64',  'D_68']

# Number of consecutive rows that make up one customer in the reshaped array.
ROWS_PER_CUSTOMER = 13

# True until the first chunk is written: that chunk creates the file and
# writes the header, later chunks are appended without one.
first = True

for i in tqdm(range(1, 4)):
    data = pd.read_parquet("../input/amex-test-repeat/test{}.parquet".format(i))

    # Drop the unused columns before casting so they are never converted.
    data = data.drop(cat_features, axis=1).astype(np.float16)

    indexes = data.index.drop_duplicates()

    # Fail before the slow prediction pass if the rows cannot be folded into
    # (customers, ROWS_PER_CUSTOMER, features) with one row per customer ID.
    if len(data) != ROWS_PER_CUSTOMER * len(indexes):
        raise ValueError(
            "test{}.parquet: expected {} rows per customer_ID, got {} rows for {} IDs".format(
                i, ROWS_PER_CUSTOMER, len(data), len(indexes)))

    data_stat_features = get_stats(data)

    data = data.to_numpy(np.float16)
    data = data.reshape(-1, ROWS_PER_CUSTOMER, data.shape[1])

    preds = predict(model, data, data_stat_features)

    preds = pd.DataFrame({"customer_ID" : indexes, "prediction" : preds}).set_index("customer_ID")

    preds.to_csv("submit.csv", mode = "w" if first else "a", header = first)
    first = False

    # Release the large per-file objects before loading the next file.
    del data
    del preds
    del data_stat_features

    gc.collect()

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:13<02:39, 13.33s/it][A
 15%|█▌        | 2/13 [00:26<02:27, 13.37s/it][A
 23%|██▎       | 3/13 [00:39<02:12, 13.25s/it][A
 31%|███       | 4/13 [00:52<01:58, 13.21s/it][A
 38%|███▊      | 5/13 [01:06<01:46, 13.31s/it][A
 46%|████▌     | 6/13 [01:19<01:33, 13.33s/it][A
 54%|█████▍    | 7/13 [01:32<01:19, 13.25s/it][A
 62%|██████▏   | 8/13 [01:45<01:05, 13.15s/it][A
 69%|██████▉   | 9/13 [01:58<00:52, 13.07s/it][A
 77%|███████▋  | 10/13 [02:12<00:39, 13.14s/it][A
 85%|████████▍ | 11/13 [02:25<00:26, 13.22s/it][A
 92%|█████████▏| 12/13 [02:38<00:13, 13.22s/it][A
100%|██████████| 13/13 [02:51<00:00, 13.20s/it][A

  0%|          | 0/156 [00:00<?, ?it/s][A
  1%|          | 1/156 [00:00<00:39,  3.88it/s][A
  2%|▏         | 3/156 [00:00<00:16,  9.28it/s][A
  3%|▎         | 5/156 [00:00<00:12, 12.48it/s][A
  4%|▍         | 7/156 [00:00<00:10, 14.46it/s][A
  6%|▌         | 9

In [11]:
# Read the submission written above back in and discard the row whose
# (default integer) index label is 488255.
# NOTE(review): the reason for removing exactly this row is not visible in
# the notebook -- confirm and replace the hard-coded label with a rule.
preds = pd.read_csv("./submit.csv").drop(index=488255)

In [19]:
# Move customer_ID from a column into the index.
# NOTE(review): not idempotent -- running this cell a second time raises
# KeyError because the column is gone. The execution counter also jumps
# from 11 to 19 here, so verify the notebook with Restart & Run All.
preds = preds.set_index("customer_ID")

In [20]:
# Write the cleaned predictions to a separate file; to_csv emits the index
# as the first column.
preds.to_csv("submit3.csv")

In [21]:
# Display the frame for a visual check of the result.
preds

Unnamed: 0_level_0,prediction
customer_ID,Unnamed: 1_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.026836
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.008526
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.025309
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,0.440981
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,0.805947
...,...
ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c5d60460dba6dedc41e,0.031464
ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3a4f0ca3de613b0b2ad,0.858967
ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475cb095d2443a68030f1,0.651606
ffffddef1fc3643ea179c93245b68dca0f36941cd83977822e8b356988ca4d07,0.376310
