In [31]:
import numpy as np
import pandas as pd

from tqdm import tqdm
import os
import gc
import torch
import torch.nn.functional as F
import random
import torch.nn as nn
import pickle

from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

from time import time
from sklearn.model_selection import StratifiedKFold

import json
from numpy import inf

In [32]:
def preprocess_numeric(df, num_cols):
    """Log-scale the numeric columns of `df` in place and return `df`.

    For every column in `num_cols`:
      1. if the column minimum is negative, shift the column so the minimum is 0;
      2. apply ``log2(x + 1) + 1`` — every non-missing value ends up at least 1;
      3. replace NaN with -1.

    Value 0 is therefore never produced here and stays free for sequence padding.
    """
    for col in num_cols:
        column = df[col]
        lowest = column.min()
        if lowest < 0:
            column = column + abs(lowest)
        # NaNs survive the arithmetic and are marked with -1 only at the very end.
        df[col] = (np.log2(column + 1) + 1).fillna(-1)
    return df

In [33]:
def pad_sequence(array, max_len):
    """Zero-pad `array` at the end of its first dimension up to `max_len` entries.

    All other dimensions are left untouched.
    """
    missing = max_len - array.shape[0]
    # F.pad expects (before, after) pairs listed from the LAST dimension backwards,
    # so the pair for dimension 0 goes at the very end of the flat list.
    pad_spec = [0, 0] * (len(array.shape) - 1) + [0, missing]
    return F.pad(array, pad_spec, "constant", 0)

def preprocess_binary(df):
    """Shift the two binary columns by one, in place, and return `df`.

    Both columns end up as integers offset by +1 (1 or 2 for 0/1 inputs) so that
    0 stays reserved for padding. Missing `D_87` values are treated as 0 first;
    `B_31` is cast directly.
    """
    b_31 = df['B_31'].astype(int)
    df['B_31'] = b_31 + 1

    d_87 = df['D_87'].fillna(0).astype(int)
    df['D_87'] = d_87 + 1
    return df

def preprocess_cat(df):
    """Re-encode the categorical columns of `df` as consecutive integer codes, in place.

    `D_63` and `D_64` hold letter codes; each letter is first replaced by a digit
    (after `astype('str')` a missing value is the text 'nan', whose letters are
    translated too). Every categorical column is then converted to numbers, NaN
    becomes the sentinel 128, and the distinct values are renumbered 1, 2, 3, ...
    in order of first appearance, so that 0 stays free for padding.

    Returns the same `df` object.
    """
    cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]

    # letter -> digit tables for the two string-valued columns
    letters_to_digits = {
        'D_63': str.maketrans({'C': "1", 'L':'2', 'O':'3','R':'4','X':'5','M':'6','Z':'7', 'n':'8', 'a': '9'}),
        'D_64': str.maketrans({'O': "1", 'R':'2', 'U':'3', 'n':'4', 'a': '5'}),
    }
    for text_col, table in letters_to_digits.items():
        df[text_col] = df[text_col].astype('str').str.translate(table)

    for col_name in tqdm(cat_features):
        codes = pd.to_numeric(df[col_name]).fillna(128)
        # codes start at 1 so that 0 stays free for padding
        renumbering = {}
        for position, value in enumerate(codes.unique(), start=1):
            renumbering[value.item()] = position
        df[col_name] = codes.map(renumbering)

    return df

def preprocess_df(path):
    # NOTE(review): this looks like an unfinished stub. `path` is never used, and
    # `df` / `all_cols` are not defined inside the function, so a call only works
    # if both names happen to exist as globals at call time (NameError otherwise).
    # Nothing in this notebook calls it — confirm whether it should be implemented
    # or removed.
    return df, all_cols

def get_emb_size(n_cat):
    """Embedding width for a feature with `n_cat` categories.

    Computes ``round(1.6 * n_cat ** 0.56)`` and caps the result at 600.
    """
    suggested = round(1.6 * n_cat ** .56)
    return suggested if suggested < 600 else 600

def create_nunique(df,cat_features):
    """Map every column in `cat_features` to ``(cardinality, embedding_size)``.

    `cardinality` is the number of distinct non-null values in the column and
    `embedding_size` is what `get_emb_size` returns for that cardinality.
    """
    nunique = {}
    for col in cat_features:
        cardinality = df[col].nunique()
        nunique[col] = (cardinality, get_emb_size(cardinality))
    return nunique

def create_folds(train_df, all_cols, batch_size):
    """Split the labelled customers into 5 stratified folds and dump them to disk as batches.

    Labels are read from 'amex-default-prediction/train_labels.csv'. For every fold
    of a 5-way `StratifiedKFold` (stratified on `target`), the train and test
    customers are written under 'amex-default-prediction/folds/<fold>/train' and
    '.../test' as numbered file pairs:

      * ``<n>.X.pt`` - float tensor of shape (customers, max_len, len(all_cols));
        each customer's rows are zero-padded at the end up to `max_len`, the
        largest number of rows any customer has in `train_df`;
      * ``<n>.y.pt`` - ShortTensor with the matching targets.

    Args:
        train_df: frame with a 'customer_ID' column and every column in `all_cols`.
        all_cols: feature columns, in the order they should appear in the tensor.
        batch_size: approximate number of customers per file. A split is cut into
            ``len(split) // batch_size`` near-equal parts, so a file holds at
            least `batch_size` customers; a split smaller than `batch_size` is
            written as a single file.

    Returns:
        None. The result is the set of files written to disk.
    """
    targets = pd.read_csv('amex-default-prediction/train_labels.csv')
    targets = targets.set_index('customer_ID')

    grouped = train_df.groupby("customer_ID")

    # Position -> customer id, in label-file order; `ys` is aligned with it.
    # Assumes customer_ID is unique in the labels file - TODO confirm.
    customer_ids = targets.index.values
    ys = targets['target'].values

    # size() counts rows per customer; count() would count non-NaN cells per
    # column and could under-report the sequence length.
    group_sizes = grouped.size()
    max_len = int(group_sizes.max()) if len(group_sizes) else 0

    def get_point(index):
        # Padded feature matrix and target of the customer at position `index`.
        group = grouped.get_group(customer_ids[index])
        X = torch.Tensor(group[all_cols].values.astype(np.double))
        return pad_sequence(X, max_len), ys[index]

    def get_batch(indices):
        # Stack the customers at `indices` into one (X, y) tensor pair.
        points = [get_point(idx) for idx in indices]
        features = [x for x, _ in points]
        labels = [y for _, y in points]
        return torch.stack(features), torch.ShortTensor(labels)

    def process_fold_index(index, batch_size, folder):
        # np.array_split needs at least one section; the plain floor division is 0
        # (and raises) when the split is smaller than a single batch.
        n_batches = max(1, len(index) // batch_size)
        for batch_number, batch_idx in enumerate(np.array_split(index, n_batches)):
            X, y = get_batch(batch_idx)
            torch.save(X, f'{folder}/{batch_number}.X.pt')
            torch.save(y, f'{folder}/{batch_number}.y.pt')

    skf = StratifiedKFold(n_splits=5)
    # Only the labels matter for stratification, hence the dummy feature array.
    for fold_number, (train_index, test_index) in enumerate(skf.split(np.zeros(len(ys)), ys)):
        base = f'amex-default-prediction/folds/{fold_number}'
        train = f'{base}/train'
        test = f'{base}/test'

        # makedirs also creates the missing parent (`base`)
        os.makedirs(train, exist_ok=True)
        os.makedirs(test, exist_ok=True)

        process_fold_index(train_index, batch_size, train)
        process_fold_index(test_index, batch_size, test)
            
def create_test(test_df, all_cols, batch_size):
    """Dump the test customers to disk in files of `batch_size` customers each.

    Customers are visited in `groupby("customer_ID")` order. Each file pair under
    'amex-default-prediction/test' holds:

      * ``<n>.X.pt`` - float tensor of shape (customers, max_len, len(all_cols));
        each customer's rows are zero-padded at the end up to `max_len`, the
        largest number of rows any customer has in `test_df`;
      * ``<n>.ids.gz`` - the matching customer ids, one per line, same order.

    The last file holds the remaining customers and may be smaller.

    Args:
        test_df: frame with a 'customer_ID' column and every column in `all_cols`.
        all_cols: feature columns, in the order they should appear in the tensor.
        batch_size: number of customers per file.

    Returns:
        None. The result is the set of files written to disk.
    """
    grouped = test_df.groupby("customer_ID")
    # size() counts rows per customer; count() would count non-NaN cells per
    # column and could under-report the sequence length.
    group_sizes = grouped.size()
    max_len = int(group_sizes.max()) if len(group_sizes) else 0

    base = 'amex-default-prediction/test'
    os.makedirs(base, exist_ok=True)

    def save_batch(batch_number, tensors, customer_ids):
        # Write one (features, ids) file pair.
        torch.save(torch.stack(tensors), f'{base}/{batch_number}.X.pt')
        np.savetxt(f'{base}/{batch_number}.ids.gz', np.asarray(customer_ids), fmt='%s')

    Xs = []
    ids = []
    batch_number = 0

    for name, group in grouped:
        X = torch.Tensor(group[all_cols].values.astype(np.double))
        Xs.append(pad_sequence(X, max_len))
        ids.append(name)
        if len(Xs) == batch_size:
            save_batch(batch_number, Xs, ids)
            Xs = []
            ids = []
            batch_number += 1

    # flush the final, possibly smaller, batch
    if Xs:
        save_batch(batch_number, Xs, ids)

In [34]:
# Load both splits; they are stacked into one frame (`general`) that the later
# preprocessing cells work on.
train_df = pd.read_csv('amex-default-prediction/train_data.csv')
test_df = pd.read_csv('amex-default-prediction/test_data.csv')

# Remember which customers belong to the test split; `general` no longer carries
# that information once the frames are concatenated.
test_customers = test_df['customer_ID'].values
general = pd.concat([train_df, test_df], ignore_index=True)

# The per-split frames are not needed any more - release their memory.
# (`gc` is already imported in the first cell.)
del train_df, test_df
gc.collect()

0

In [35]:
# Column groups. Final column order is categorical, then binary, then numeric.
cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
bin_features = ['B_31', "D_87"]

# Everything that is neither an identifier/date column nor categorical/binary is numeric.
num_features = [
    col for col in general.columns
    if col not in ('customer_ID', 'S_2')
    and col not in cat_features
    and col not in bin_features
]
all_cols = cat_features + bin_features + num_features

In [36]:
import warnings
warnings.filterwarnings("ignore")

# NOTE: the preprocess_* helpers modify the frame they receive, so re-running this
# cell applies the transforms a second time - reload `general` before re-running.
general = (
    general
    .pipe(preprocess_numeric, num_features)
    .pipe(preprocess_cat)
    .pipe(preprocess_binary)
    # fix the column order: identifiers first, then the features in `all_cols` order
    .reindex(columns=['customer_ID', 'S_2'] + all_cols)
)

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:19<00:00,  1.81s/it]


In [37]:
# Cardinality and embedding width of every categorical/binary feature, persisted
# so the model cells can rebuild the embedding tables.
nunique = create_nunique(general, cat_features + bin_features)

with open('nunique.json', 'w+') as fp:
    json.dump(nunique, fp)

# customers per file written to disk (the same value is used for test and folds)
customers_per_file = 4196

unique_test_customers = np.unique(test_customers)
# One pass over the full frame; the mask serves both the test and the train selection.
is_test_row = general['customer_ID'].isin(unique_test_customers)

test_df = general[is_test_row]
create_test(test_df, all_cols, customers_per_file)
del test_df

train_df = general[~is_test_row]
create_folds(train_df, all_cols, customers_per_file)
del train_df
del general, is_test_row
gc.collect()

0

In [38]:
class FoldStorageDataset(Dataset):
    """Dataset over the pre-batched ``<n>.X.pt`` / ``<n>.y.pt`` files of one fold split.

    Each item is a whole batch (X, y) loaded from disk, so the surrounding
    DataLoader is expected to be created with ``batch_size=None``.

    Args:
        folder: directory holding the numbered file pairs.
        device: where tensors are mapped on load. Defaults to CUDA when it is
            available and to CPU otherwise.
    """

    def __init__(self, folder, device=None):
        self.folder = folder
        # Count the feature files only, so stray entries in the folder
        # cannot inflate the length.
        self.len = sum(1 for name in os.listdir(folder) if name.endswith('.X.pt'))
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        X = torch.load(os.path.join(self.folder, f'{idx}.X.pt'), map_location=self.device)
        y = torch.load(os.path.join(self.folder, f'{idx}.y.pt'), map_location=self.device)
        return X, y

In [39]:
class TestDataset(Dataset):
    """Dataset over the pre-batched test files ``<n>.X.pt`` / ``<n>.ids.gz``.

    Each item is a whole batch: the feature tensor and a 1-D object array with
    the matching customer ids (plain `str` elements). The surrounding DataLoader
    is expected to be created with ``batch_size=None``.

    Args:
        folder: directory holding the numbered file pairs.
        device: where tensors are mapped on load. Defaults to CUDA when it is
            available and to CPU otherwise.
    """

    def __init__(self, folder='amex-default-prediction/test', device=None):
        self.folder = folder
        # Count the feature files only, so stray entries in the folder
        # cannot inflate the length.
        self.size = sum(1 for name in os.listdir(self.folder) if name.endswith('.X.pt'))
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        X_path = f'{idx}.X.pt'
        ids_path = f'{idx}.ids.gz'
        X = torch.load(os.path.join(self.folder, X_path), map_location=self.device)
        # `np.object` was removed in NumPy 1.24. Reading as str and casting to
        # object keeps plain Python strings; ndmin=1 keeps a batch with a single
        # id iterable (it would otherwise come back as a 0-d array).
        ids = np.loadtxt(os.path.join(self.folder, ids_path), dtype=str, ndmin=1).astype(object)
        return X, ids

In [40]:
def create_fold_dataloaders():
    """Yield a ``(train_loader, test_loader)`` pair for each of the 5 folds on disk.

    The stored files are already batches, so both loaders use ``batch_size=None``
    and hand every loaded item through unchanged.
    """
    for fold in range(5):
        fold_dir = f'amex-default-prediction/folds/{fold}'
        train_set = FoldStorageDataset(f'{fold_dir}/train')
        test_set = FoldStorageDataset(f'{fold_dir}/test')
        yield DataLoader(train_set, batch_size=None), DataLoader(test_set, batch_size=None)

In [41]:
# Reload the per-feature (cardinality, embedding size) table written by the export
# cell. After the JSON round trip each value is a 2-element list, not a tuple.
with open('nunique.json', 'r') as fp:
    nunique = json.loads(fp.read())

In [60]:
class Model(torch.nn.Module):
    """Bidirectional GRU over a customer's statement sequence, followed by an MLP head.

    Input is a float tensor of shape (batch, seq_len, n_categorical + n_numeric):
    the leading columns hold integer category codes (0 = padding), the rest are
    numeric features. Every categorical column is embedded, the embeddings are
    concatenated with the numeric features, and the sequence is fed to the GRU.
    The final hidden states of both directions are concatenated and passed
    through the fully connected layers.

    Note that padded time steps are fed to the GRU like any other step; no
    packing or masking is applied.

    Args:
        enc_hid_dim: GRU hidden size per direction.
        deep_fcs: output sizes of the fully connected layers, in order; the last
            entry is the model's output width.
        num_numeric_features: number of numeric input columns. The default of
            175 reproduces the previously hard-coded GRU input width of 220 when
            the embedding widths sum to 45 (presumably the case for the data
            this notebook was run on - verify against `nunique.json`).
        cardinalities: mapping ``feature -> (cardinality, embedding_size)`` in
            input-column order. Defaults to the module-level `nunique`.
    """

    def __init__(self, enc_hid_dim, deep_fcs, num_numeric_features=175, cardinalities=None):
        super().__init__()
        if cardinalities is None:
            cardinalities = nunique
        embedding_projection, embedding_general_output_size = self.get_embedding_projection(cardinalities)

        self.embeddings = torch.nn.ModuleList(embedding_projection)
        # One input column per embedding table (categorical + binary features).
        self.num_cat_size = len(embedding_projection)
        # GRU input width = total embedding width + number of numeric features.
        self.inp_size = embedding_general_output_size + num_numeric_features

        self.credits_rnn = torch.nn.GRU(self.inp_size, enc_hid_dim, batch_first=True, bidirectional=True)
        self.relu = torch.nn.LeakyReLU()
        self.tanh = torch.nn.Tanh()

        self.fcs = torch.nn.ModuleList()

        inp = enc_hid_dim * 2  # forward + backward final hidden state
        for out in deep_fcs:
            self.fcs.append(torch.nn.Linear(in_features=inp, out_features=out))
            inp = out

    def forward(self, X):
        """Return the output of the last fully connected layer, shape (batch, deep_fcs[-1])."""
        cat_features = X[:, :, :self.num_cat_size].long()
        num_features = X[:, :, self.num_cat_size:]

        # One (batch, seq_len) index tensor per categorical column. Embedding each
        # gives (batch, seq_len, emb) directly, so no squeeze is needed and
        # batches or sequences of length 1 keep their dimensions.
        columns = torch.unbind(cat_features, dim=-1)
        embedded = [emb(column) for emb, column in zip(self.embeddings, columns)]

        concatted_input = torch.cat(embedded + [num_features], dim=-1)

        _, hidden_credits_rnn = self.credits_rnn(concatted_input)
        # hidden[0] / hidden[1]: final states of the forward / backward direction
        rnn_stack = torch.cat([hidden_credits_rnn[0], hidden_credits_rnn[1]], dim=-1)

        x = self.tanh(rnn_stack)
        for fc in self.fcs[:-1]:
            x = self.relu(fc(x))
        # no activation after the last layer
        return self.fcs[-1](x)

    def create_embedding_projection(self, cardinality, embed_size, add_missing=True):
        """Build an embedding table; index 0 is the padding index.

        With `add_missing` one extra row is allocated, so codes 0..cardinality
        are all valid indices.
        """
        add_missing = 1 if add_missing else 0
        return torch.nn.Embedding(num_embeddings=cardinality+add_missing, embedding_dim=embed_size, padding_idx=0)

    def get_embedding_projection(self, nunique):
        """Return (list of embedding tables, summed embedding width) for `nunique`."""
        embedding_projection = [self.create_embedding_projection(*e) for e in nunique.values()]
        embedding_general_output_size = sum([e[1] for e in nunique.values()])
        return embedding_projection, embedding_general_output_size

In [61]:
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_np(preds: np.ndarray, target: np.ndarray) -> float:
    """AMEX competition metric: mean of the normalized weighted Gini and the top-4% capture rate.

    Rows are ranked by `preds`, highest first. Negatives (target 0) carry a
    weight of 20 and positives (target 1) a weight of 1.

    Args:
        preds: 1-D array of predicted scores.
        target: 1-D array of 0/1 labels aligned with `preds`.

    Returns:
        ``0.5 * (g + d)`` where `d` is the share of positives found in the top 4%
        of cumulative weight and `g` is the weighted Gini divided by its maximum.
    """
    order = np.argsort(preds)[::-1]
    preds, target = preds[order], target[order]

    n_pos = np.sum(target)
    n_neg = target.shape[0] - n_pos

    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()

    # d: positives captured within the first 4% of cumulative weight
    top_four_pct = cum_norm_weight <= 0.04
    d = np.sum(target[top_four_pct]) / n_pos

    # g: weighted Gini of the ranking, normalized by the best achievable value
    weighted_target = target * weight
    lorentz = (weighted_target / weighted_target.sum()).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()
    gini_max = 10 * n_neg * (n_pos + 20 * n_neg - 19) / (n_pos + 20 * n_neg)
    g = gini / gini_max

    return 0.5 * (g + d)

In [74]:
from pytorch_lightning.loggers import TensorBoardLogger

class LightningModule(pl.LightningModule):
    """Lightning wrapper around `Model`: BCE-with-logits training, AMEX metric for evaluation.

    The value logged as "val_loss" / "test_loss" is the AMEX metric, not a loss:
    higher is better, which is why the scheduler below runs in 'max' mode.

    Uses the `validation_epoch_end` / `test_epoch_end` hooks of the
    PyTorch Lightning 1.x API (these hooks were removed in Lightning 2.0).

    Args:
        hidden_size: GRU hidden size passed to `Model`.
        fcs: fully connected layer sizes passed to `Model`.
        weight: `pos_weight` of the BCE loss (weight of the positive class).
        reduction: `reduction` argument of the BCE loss.
    """

    def __init__(self,  hidden_size, fcs, weight, reduction):
        super().__init__()
        self.model = Model(hidden_size, fcs)
        self.save_hyperparameters('hidden_size', 'fcs', 'weight', 'reduction')
        self.metric = amex_metric_np
        # BCEWithLogitsLoss registers pos_weight as a buffer, so it follows the
        # module to whatever device Lightning moves it to. Pinning it to 'cuda'
        # here would break construction (and checkpoint loading) on CPU-only hosts.
        pos_weight = torch.tensor([weight], dtype=torch.float)
        self.loss = torch.nn.BCEWithLogitsLoss(reduction=reduction, pos_weight=pos_weight)

    def _probabilities(self, x):
        # (batch, 1) logits -> (batch,) probabilities; squeezing only the last
        # dimension keeps a batch of one as a 1-element vector.
        return torch.sigmoid(self.model(x).squeeze(-1))

    def _scores_and_targets(self, batch):
        # 2 x batch array: row 0 = predicted probabilities, row 1 = targets.
        x, y = batch
        scores = self._probabilities(x).detach().cpu().numpy()
        targets = y.reshape(-1).detach().cpu().numpy()
        return np.vstack([scores, targets])

    def _epoch_metric(self, step_outputs):
        # AMEX metric over all batches collected during one evaluation epoch.
        catted = np.concatenate(step_outputs, axis=1)
        return self.metric(catted[0], catted[1])

    def training_step(self, batch, batch_idx):
        x, y = batch
        output = self.model(x).squeeze(-1)
        # .mean() is a no-op for 'mean'/'sum' reductions and averages for 'none'
        loss = self.loss(output, y.reshape(-1).float()).mean()
        return loss

    def validation_step(self, batch, batch_idx):
        return self._scores_and_targets(batch)

    def validation_epoch_end(self, test_step_outputs):
        metric = self._epoch_metric(test_step_outputs)
        self.log("val_loss", metric, prog_bar=True)
        return {"val_loss":  metric}

    def test_step(self, batch, batch_idx):
        return self._scores_and_targets(batch)

    def test_epoch_end(self, test_step_outputs):
        metric = self._epoch_metric(test_step_outputs)
        self.log("test_loss", metric, prog_bar=True)
        return {"test_loss":  metric}

    def predict_step(self, batch, batch_idx):
        """Return ``{customer_id: probability}`` for one test batch."""
        x, ids = batch
        output = self._probabilities(x).detach().cpu()
        return {customer_id: prob.item() for customer_id, prob in zip(ids, output)}

    def configure_optimizers(self):
        self.optimizer = torch.optim.Adam(self.parameters(),lr=0.001)
        # "val_loss" carries the AMEX metric, so the LR is reduced when it stops increasing.
        self.reduce_lr_on_plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            mode='max',
            factor=0.1,
            patience=1,
            min_lr=1e-6,
            verbose=True
        )
        return {"optimizer": self.optimizer,
                "lr_scheduler":{
                    "scheduler": self.reduce_lr_on_plateau,
                    "monitor": "val_loss",
                    "interval": "epoch"}
               }

    def forward(self, x):
        return self.model(x)

In [75]:
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, RichProgressBar, LearningRateMonitor

def fit_model(fold_number, hidden_size, fcs, weight, reduction, train_dataloader, val_dataloader, test_dataloader):
    """Train one model on a fold, reload its best checkpoint and evaluate it.

    Args:
        fold_number: fold index; only used in the checkpoint directory name.
        hidden_size, fcs, weight, reduction: hyper-parameters forwarded to
            `LightningModule`.
        train_dataloader: batches used for fitting.
        val_dataloader: batches used for early stopping, LR scheduling and
            checkpoint selection.
        test_dataloader: batches the reloaded best checkpoint is evaluated on.

    Returns:
        ``(trainer, module)`` where `module` is the model reloaded from the best
        checkpoint.
    """
    run_name = f"Fcs_{len(fcs)}_hidden_size_{hidden_size}_weight_{weight}_reduction_{reduction}"

    logger = TensorBoardLogger(save_dir='tb_logs', name=run_name)

    module = LightningModule(hidden_size, fcs, weight, reduction)
    # Without `monitor`, ModelCheckpoint cannot rank checkpoints and
    # `best_model_path` simply points at the most recently saved one - i.e. the
    # epoch on which early stopping gave up. "val_loss" holds the AMEX metric,
    # hence mode='max'. save_top_k=-1 still keeps every epoch's checkpoint.
    checkpoint_callback = ModelCheckpoint(dirpath=f'{run_name}_fold_number_{fold_number}',
                                          filename='{epoch}-{val_loss:.4f}',
                                          monitor='val_loss',
                                          mode='max',
                                          save_top_k=-1)

    lr_monitor = LearningRateMonitor()

    callbacks = [EarlyStopping(monitor="val_loss", mode='max'), checkpoint_callback, lr_monitor, RichProgressBar(leave=True)]
    trainer = pl.Trainer(callbacks=callbacks,
                         deterministic=True,
                         enable_progress_bar=True,
                         accelerator="gpu",
                         log_every_n_steps=1,
                         logger=logger,
                         check_val_every_n_epoch=1)
    trainer.fit(model=module, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

    module = LightningModule.load_from_checkpoint(checkpoint_path=checkpoint_callback.best_model_path)
    trainer.test(dataloaders=test_dataloader, model=module)
    return trainer, module

In [76]:
# Train one model per fold and keep the (trainer, best module) pairs.
# Each fold's held-out split serves both as validation set and as final test set.
outputs = []
for i, (train_dataloader, test_dataloader) in enumerate(create_fold_dataloaders()):
    trainer, module = fit_model(
        fold_number=i,
        hidden_size=256,
        fcs=[256,128,64,32,16,8,1],
        weight=1,
        reduction='mean',
        train_dataloader=train_dataloader,
        val_dataloader=test_dataloader,
        test_dataloader=test_dataloader,
    )
    outputs.append((trainer, module))

In [None]:
# Sanity check: number of (trainer, module) pairs collected by the training loop,
# one per fold yielded by create_fold_dataloaders().
len(outputs)

In [80]:
# Ensemble: average the test predictions of selected fold checkpoints.
checkpoint_paths = [
    'Fcs_7_hidden_size_256_weight_1_reduction_mean_fold_number_3/epoch=19-val_loss=0.7790.ckpt',
    'Fcs_7_hidden_size_256_weight_1_reduction_mean_fold_number_4/epoch=17-val_loss=0.7797.ckpt',
]

# NOTE(review): the original cell called create_trainer(), which is not defined
# anywhere in this notebook (presumably lost with a deleted cell). A plain
# inference-only trainer is built here instead - confirm it matches the intent.
predict_trainer = pl.Trainer(accelerator="gpu", logger=False, enable_checkpointing=False)

values = []
for path in checkpoint_paths:
    module = LightningModule.load_from_checkpoint(checkpoint_path=path)
    predictions = predict_trainer.predict(dataloaders=DataLoader(TestDataset(), batch_size=None), model=module)
    # merge the per-batch {customer_id: probability} dicts into one
    predictions = {k: v for d in predictions for k, v in d.items()}
    df = pd.DataFrame(predictions.items(), columns=['customer_ID', 'prediction'])
    values.append(df['prediction'].values)
# Every run iterates the same test files in the same order, so rows line up across models.
df['prediction'] = sum(values) / len(values)

In [None]:
# Ensemble: average the test predictions of the models trained in this session.
values = []
for trainer, module in outputs:
    # The original called create_trainer(), which is not defined anywhere in this
    # notebook; the trainer that fitted the module is already at hand, so use it.
    predictions = trainer.predict(dataloaders=DataLoader(TestDataset(), batch_size=None), model=module)
    # merge the per-batch {customer_id: probability} dicts into one
    predictions = {k: v for d in predictions for k, v in d.items()}
    df = pd.DataFrame(predictions.items(), columns=['customer_ID', 'prediction'])
    values.append(df['prediction'].values)
# Every run iterates the same test files in the same order, so rows line up across models.
df['prediction'] = sum(values) / len(values)

In [81]:
# Preview the averaged predictions before they are written to the submission file.
df['prediction']

0         0.065661
1         0.000372
2         0.029844
3         0.304707
4         0.876459
            ...   
924616    0.012988
924617    0.806117
924618    0.566064
924619    0.352699
924620    0.086094
Name: prediction, Length: 924621, dtype: float64

In [82]:
# Write the submission file with the customer_ID and prediction columns; the row index is omitted.
df.to_csv('submission.csv', index=False)