In [31]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import gc
import torch
import random
import pytorch_lightning as pl
import torch.nn as nn

In [32]:
# Display the CPU count reported by the OS (recorded output: 12).
os.cpu_count()

12

In [33]:
# Report how many CUDA devices are visible and pick the device string ('cuda' or 'cpu')
# that later cells read through the module-level name `device`.
print(torch.cuda.device_count())
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

1
cuda


In [34]:
# Single source of truth for every random number generator used in the notebook.
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
# cuDNN autotuning may select different kernels between runs; disable it for reproducibility.
torch.backends.cudnn.benchmark = False
# Use SEED (not a second literal) so the Lightning seed can never drift from the others.
pl.seed_everything(SEED, workers=True)

Global seed set to 42


42

In [35]:
# Per-feature embedding configuration: feature name -> (cardinality, embed_size).
# get_embedding_projection() unpacks each tuple into
# create_embedding_projection(cardinality, embed_size), which by default allocates
# cardinality + 1 embedding rows of width embed_size.
mapping = {'pre_since_opened': (20, 9),
 'pre_since_confirmed': (18, 8),
 'pre_pterm': (18, 8),
 'pre_fterm': (17, 8),
 'pre_till_pclose': (17, 8),
 'pre_till_fclose': (16, 8),
 'pre_loans_credit_limit': (20, 9),
 'pre_loans_next_pay_summ': (8, 5),
 'pre_loans_outstanding': (6, 4),
 'pre_loans_total_overdue': (2, 2),
 'pre_loans_max_overdue_sum': (4, 3),
 'pre_loans_credit_cost_rate': (14, 7),
 'pre_loans5': (18, 8),
 'pre_loans530': (20, 9),
 'pre_loans3060': (10, 6),
 'pre_loans6090': (6, 4),
 'pre_loans90': (20, 9),
 'is_zero_loans5': (2, 2),
 'is_zero_loans530': (2, 2),
 'is_zero_loans3060': (2, 2),
 'is_zero_loans6090': (2, 2),
 'is_zero_loans90': (2, 2),
 'pre_util': (20, 9),
 'pre_over2limit': (20, 9),
 'pre_maxover2limit': (20, 9),
 'is_zero_util': (2, 2),
 'is_zero_over2limit': (2, 2),
 'is_zero_maxover2limit': (2, 2),
 'enc_paym_0': (4, 3),
 'enc_paym_1': (4, 3),
 'enc_paym_2': (4, 3),
 'enc_paym_3': (4, 3),
 'enc_paym_4': (4, 3),
 'enc_paym_5': (4, 3),
 'enc_paym_6': (4, 3),
 'enc_paym_7': (4, 3),
 'enc_paym_8': (4, 3),
 'enc_paym_9': (4, 3),
 'enc_paym_10': (4, 3),
 'enc_paym_11': (5, 4),
 'enc_paym_12': (4, 3),
 'enc_paym_13': (4, 3),
 'enc_paym_14': (4, 3),
 'enc_paym_15': (4, 3),
 'enc_paym_16': (4, 3),
 'enc_paym_17': (4, 3),
 'enc_paym_18': (4, 3),
 'enc_paym_19': (4, 3),
 'enc_paym_20': (5, 4),
 'enc_paym_21': (4, 3),
 'enc_paym_22': (4, 3),
 'enc_paym_23': (4, 3),
 'enc_paym_24': (5, 4),
 'enc_loans_account_holder_type': (7, 5),
 'enc_loans_credit_status': (7, 5),
 'enc_loans_credit_type': (8, 5),
 'enc_loans_account_cur': (4, 3),
 'pclose_flag': (2, 2),
 'fclose_flag': (2, 2)}

In [36]:
def pad_sequence(array: np.ndarray, max_len) -> np.ndarray:
    """Zero-pad a 2-D sequence along its first axis up to ``max_len`` rows.

    Args:
        array: 2-D array of shape ``(n_rows, n_features)``.
        max_len: number of rows in the returned array; must be >= ``n_rows``.

    Returns:
        An ``int32`` array of shape ``(max_len, n_features)`` whose first
        ``n_rows`` rows are ``array`` and whose remaining rows are zeros.

    Raises:
        ValueError: if ``array`` is not 2-D or has more than ``max_len`` rows.
    """
    array = np.asarray(array)
    if array.ndim != 2:
        raise ValueError(f"pad_sequence expects a 2-D array, got {array.ndim} dimension(s)")
    n_rows, n_features = array.shape
    if n_rows > max_len:
        raise ValueError(f"sequence of {n_rows} rows does not fit into max_len={max_len}")
    # The width is taken from the input instead of being hard-coded to 59 features.
    output = np.zeros((max_len, n_features), dtype=np.int32)
    output[:n_rows, :] = array
    return output

In [78]:
import pickle
def read_file(file_path, is_train=True):
    """Load one parquet file and collapse it to a single row per ``id``.

    The ``rn`` column is dropped, the remaining columns are gathered per
    ``id`` into lists, and the lists are turned into one ``int32`` matrix per
    id, stored in column ``0``. Every value is shifted by +1 and the matrix is
    transposed, so rows follow the per-id records and columns follow the
    feature columns of the file.

    Args:
        file_path: path handed to ``pd.read_parquet``.
        is_train: when true, the labels from ``train_target.csv`` are merged
            in on ``id``.

    Returns:
        A DataFrame with columns ``id`` and ``0`` (plus the target columns
        when ``is_train`` is true).
    """
    def _to_shifted_matrix(feature_lists):
        # (features, records) -> shift by one -> (records, features)
        return (np.array(feature_lists, dtype=np.int32) + 1).T

    raw = pd.read_parquet(file_path)
    raw = raw.drop(columns=['rn'])

    per_id_lists = raw.groupby(['id']).agg(list)
    data = per_id_lists.agg(list, axis="columns").reset_index()
    data[0] = data[0].apply(_to_shifted_matrix)

    if not is_train:
        return data

    target = pd.read_csv('train_target.csv')
    return data.merge(target, on="id")


def get_bucket(length, buckets=((0, 10), (10, 20), (20, 30), (30, 60))):
    """Return the padded length (upper bucket border) for a sequence length.

    Args:
        length: number of rows in the sequence.
        buckets: ``(lower, upper)`` borders; a length belongs to a bucket when
            ``lower < length <= upper``.

    Returns:
        The ``upper`` border of the first bucket that contains ``length``.

    Raises:
        ValueError: if ``length`` falls outside every bucket. Previously the
            function fell through and returned ``None``, which only surfaced
            later as an obscure failure when allocating the padded array.
    """
    for lower, upper in buckets:
        if lower < length <= upper:
            return upper
    raise ValueError(
        f"sequence length {length} is outside the supported buckets {tuple(buckets)}"
    )

def create_tensors(batches_bucket, is_train):
    """Convert a list of ``(sequence, label_or_id)`` pairs into two tensors.

    Args:
        batches_bucket: list of pairs; every sequence must have the same shape.
        is_train: when true the second item of each pair is a target and is
            returned as a ``FloatTensor``; otherwise it is an id and is
            returned as a ``LongTensor``.

    Returns:
        ``(X, y)`` when ``is_train`` is true, else ``(X, ids)``; ``X`` is a
        ``LongTensor`` holding the stacked sequences.
    """
    sequences = [item[0] for item in batches_bucket]
    if sequences:
        # Stack in numpy first: building a tensor straight from a Python list of
        # ndarrays converts element by element and is dramatically slower.
        X = torch.from_numpy(np.stack(sequences)).long()
    else:
        X = torch.LongTensor([])

    second = [item[1] for item in batches_bucket]
    if is_train:
        return X, torch.FloatTensor(second)
    return X, torch.LongTensor(second)

def split_to_batches(df, batch_size, is_train=True):
    """Yield tensor batches in which every sequence shares the same bucket.

    Rows are consumed in frame order and collected per ``bucket`` value. A
    batch is emitted as soon as a bucket holds ``batch_size`` rows; the
    partially filled buckets are emitted once all rows have been seen.

    Args:
        df: DataFrame with the sequence in column ``0``, a ``bucket`` column,
            and a ``flag`` column (training) or ``id`` column (inference).
        batch_size: maximum number of rows per yielded batch.
        is_train: selects ``flag`` (true) or ``id`` (false) as the second
            item. Defaults to True so the two-argument calls made by
            ``write_folder`` for the training folds are valid.

    Yields:
        The result of ``create_tensors`` for each batch.
    """
    second_column = 'flag' if is_train else 'id'
    pending = {}

    # Zipping the three columns avoids building a Series per row (iterrows).
    for sequence, second, bucket in zip(df[0], df[second_column], df['bucket']):
        bucket_rows = pending.setdefault(bucket, [])
        bucket_rows.append((sequence, second))

        if len(bucket_rows) == batch_size:
            yield create_tensors(bucket_rows, is_train)
            pending[bucket] = []

    for bucket_rows in pending.values():
        if bucket_rows:
            yield create_tensors(bucket_rows, is_train)

    
def write_folder(batch_size, is_train=True):
    """Read every parquet file of a split, bucket/pad the sequences and save tensor batches.

    Training data (``train_data`` -> ``train_tensors``) is divided into three
    stratified folds; each fold gets a ``train`` and a ``test`` directory with
    ``{i}_X.pt`` / ``{i}_y.pt`` files. Test data (``test_data`` ->
    ``test_tensors``) is written as ``{i}_X.pt`` / ``{i}_ids.pt``.

    Args:
        batch_size: maximum number of rows per saved batch.
        is_train: selects the training (true) or test (false) layout.
    """
    if is_train:
        folder_path, folder_out_path = 'train_data', 'train_tensors'
    else:
        folder_path, folder_out_path = 'test_data', 'test_tensors'

    # os.listdir returns entries in arbitrary order; sort so batch numbering is reproducible.
    dataset_paths = [os.path.join(folder_path, filename) for filename in sorted(os.listdir(folder_path))]
    dfs = [read_file(path, is_train) for path in tqdm(dataset_paths, total=len(dataset_paths))]
    df = pd.concat(dfs, ignore_index=True)

    df['lengths'] = df[0].apply(lambda x: x.shape[0])
    df['bucket'] = df['lengths'].apply(get_bucket)
    df[0] = df.apply(lambda row: pad_sequence(row[0], row['bucket']), axis=1)

    if is_train:
        # Imported here because the notebook's import cell does not bring it in.
        from sklearn.model_selection import StratifiedKFold

        n_splits = 3
        skf = StratifiedKFold(n_splits=n_splits)
        folds = enumerate(skf.split(df['id'], df['flag']))
        for fold_number, (train_index, test_index) in tqdm(folds, total=n_splits):
            for part_name, part_index in (('train', train_index), ('test', test_index)):
                part_dir = f'{folder_out_path}/fold_{fold_number}/{part_name}'
                # torch.save does not create missing directories.
                os.makedirs(part_dir, exist_ok=True)
                part_df = df.iloc[part_index]
                for i, (X, y) in enumerate(split_to_batches(part_df, batch_size, True)):
                    torch.save(X, f'{part_dir}/{i}_X.pt')
                    torch.save(y, f'{part_dir}/{i}_y.pt')
    else:
        os.makedirs(folder_out_path, exist_ok=True)
        for i, (X, ids) in enumerate(split_to_batches(df, batch_size, False)):
            torch.save(X, f'{folder_out_path}/{i}_X.pt')
            torch.save(ids, f'{folder_out_path}/{i}_ids.pt')
        

In [79]:
# Build the test-set tensors (is_train=False) with at most 2048 rows per batch.
write_folder(2048, False)

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [08:28<00:00, 254.25s/it]


In [130]:
from torch.utils.data import Dataset 
class FoldDataset(Dataset):
    """Map-style dataset in which every item is one pre-saved tensor batch.

    Training batches are read from ``train_tensors/fold_{fold_number}/{fold_type}``;
    when ``is_train`` is false the batches come from ``test_tensors`` and
    ``fold_number`` / ``fold_type`` are not used. The directory is expected to
    hold two files per batch, so the dataset length is half the file count.
    """

    def __init__(self, fold_number, fold_type, is_train):
        if is_train:
            self.folder_path = f'train_tensors/fold_{fold_number}/{fold_type}'
        else:
            self.folder_path = 'test_tensors'
        n_files = len(os.listdir(self.folder_path))
        self.len = n_files // 2
        self.is_train = is_train

    def __len__(self):
        return self.len

    def load(self, idx, tensor_type):
        """Load ``{idx}_{tensor_type}.pt`` from the folder onto the global ``device``."""
        tensor_file = f'{self.folder_path}/{idx}_{tensor_type}.pt'
        return torch.load(tensor_file, map_location=device)

    def __getitem__(self, idx):
        # Training batches are paired with targets ('y'), test batches with ids.
        companion = 'y' if self.is_train else 'ids'
        return self.load(idx, 'X'), self.load(idx, companion)

In [39]:
from torch.utils.data import Dataset, IterableDataset
class FoldIterDataset(IterableDataset):
    """Iterable dataset that streams saved training batches in 16 slices each.

    Batches are read from ``train_tensors/fold_{fold_number}/{fold_type}``;
    the number of batches is half the number of files in that directory.
    """

    def __init__(self, fold_number, fold_type):
        self.folder_path = f'train_tensors/fold_{fold_number}/{fold_type}'
        n_files = len(os.listdir(self.folder_path))
        self.len = n_files // 2

    def load(self, idx, tensor_type):
        """Load ``{idx}_{tensor_type}.pt`` from the folder onto the global ``device``."""
        tensor_file = f'{self.folder_path}/{idx}_{tensor_type}.pt'
        return torch.load(tensor_file, map_location=device)

    def __iter__(self):
        for batch_idx in range(self.len):
            features = self.load(batch_idx, 'X')
            targets = self.load(batch_idx, 'y')
            # Each saved batch is cut into 16 aligned (features, targets) slices.
            yield from zip(torch.tensor_split(features, 16), torch.tensor_split(targets, 16))
        
        

In [140]:
from torch.utils.data import DataLoader
def get_fold_dataloaders(is_train=True):
    """Yield a ``(train_loader, test_loader)`` pair for each of the three folds.

    Every loader wraps a ``FoldDataset`` with ``batch_size=None`` because each
    dataset item is already a complete batch.

    Args:
        is_train: folds exist only for training data; when false nothing is
            yielded. Defaults to True so that ``get_fold_dataloaders()`` (as
            called by ``objective``) is a valid call.

    Yields:
        Tuples ``(train_loader, test_loader)`` for folds 0, 1 and 2.
    """
    if not is_train:
        return
    for fold_number in range(3):
        train_loader = DataLoader(FoldDataset(fold_number, 'train', is_train), batch_size=None)
        test_loader = DataLoader(FoldDataset(fold_number, 'test', is_train), batch_size=None)
        yield train_loader, test_loader

In [227]:
class TransformerModel(nn.Module):
    """GRU + Transformer encoder over the embedded credit sequence.

    Pipeline: per-feature embeddings are concatenated, passed through a GRU,
    projected to ``d_model``, summed with a learned positional embedding,
    encoded by a Transformer encoder, mean-pooled over time and mapped to one
    logit per sequence.

    NOTE(review): the previous ``forward`` was an unfinished debugging state: it
    printed shapes, referenced an undefined ``positions`` and returned ``None``
    before the dead code that followed. This version completes it with the
    smallest coherent design; the pooling and positional scheme are assumptions
    to confirm. Padded time steps are not masked here.

    Args:
        n_head: number of attention heads (must divide ``d_model`` = 258).
        num_layers: number of Transformer encoder layers (previously ignored
            in favour of a hard-coded 6).
        dropout: dropout rate inside the encoder layers.
    """

    def __init__(self, n_head=6, num_layers=6, dropout=0.5):
        super().__init__()
        d_model = 258
        enc_hid_dim = 205
        max_len = 60  # largest padded length produced by get_bucket

        embedding_projection, embedding_general_output_size = get_embedding_projection()
        self.embeddings = torch.nn.ModuleList(embedding_projection)

        self.credits_rnn = torch.nn.GRU(embedding_general_output_size, enc_hid_dim, batch_first=True)
        self.input_projection = nn.Linear(enc_hid_dim, d_model)
        # One learned vector per time step (not per hidden unit).
        self.pos_encoder = nn.Embedding(max_len, d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=n_head, dropout=dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Linear(d_model, 1)

    def forward(self, x):
        """Return logits of shape ``(batch, 1)`` for integer input ``(batch, seq, n_features)``."""
        if x.dim() == 2:
            # A single unbatched sequence: add the batch axis explicitly.
            x = x.unsqueeze(0)

        seq_len = x.shape[1]
        if seq_len > self.pos_encoder.num_embeddings:
            raise ValueError(
                f"sequence length {seq_len} exceeds the positional table "
                f"({self.pos_encoder.num_embeddings})"
            )

        # Index each feature column directly; a bare squeeze() would also drop
        # a batch or time axis of size one.
        embedded = torch.cat([emb(x[..., i]) for i, emb in enumerate(self.embeddings)], dim=-1)

        rnn_output, _ = self.credits_rnn(embedded)  # (batch, seq, enc_hid_dim)
        positions = torch.arange(seq_len, device=x.device)
        hidden = self.input_projection(rnn_output) + self.pos_encoder(positions)

        hidden = self.transformer_encoder(hidden)
        pooled = hidden.mean(dim=1)
        return self.classifier(pooled)

In [228]:
# NOTE(review): `x` must already exist in the kernel (it is not assigned in any cell shown
# here); the recorded output of this cell is a CUDA "device-side assert triggered" error.
TransformerModel().to(device)(x)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [201]:
# Shape of the scratch tensor `x` (recorded output: torch.Size([2048, 10, 59])).
x.shape

torch.Size([2048, 10, 59])

In [41]:
def create_embedding_projection(cardinality, embed_size, add_missing=True):
    """Build the embedding table for one categorical feature.

    Args:
        cardinality: number of distinct categories of the feature.
        embed_size: width of each embedding vector.
        add_missing: when truthy, one additional row is allocated on top of
            ``cardinality``.

    Returns:
        ``torch.nn.Embedding`` with ``cardinality`` (+1 if ``add_missing``) rows.
    """
    extra_rows = int(bool(add_missing))
    return torch.nn.Embedding(
        num_embeddings=cardinality + extra_rows,
        embedding_dim=embed_size,
    )

def get_embedding_projection():
    """Create one embedding layer per feature column of the training data.

    The feature order is taken from the columns of
    ``train_data/train_data_00.pq`` with ``id`` and ``rn`` removed; the
    embedding sizes come from the module-level ``mapping``.

    Returns:
        ``(embedding_projection, embedding_general_output_size)``: the list of
        embedding layers in column order and the sum of their embedding widths.
    """
    sample = pd.read_parquet('train_data/train_data_00.pq')[:10]

    features = sample.columns.tolist()
    for service_column in ('id', 'rn'):
        features.remove(service_column)

    embedding_projection = []
    for feature in features:
        embedding_projection.append(create_embedding_projection(*mapping[feature]))

    embedding_general_output_size = sum(mapping[feature][1] for feature in features)
    return embedding_projection, embedding_general_output_size

In [42]:
class Model(torch.nn.Module):
    """Bidirectional GRU classifier over concatenated per-feature embeddings.

    Args:
        enc_hid_dim: hidden size of each GRU direction.
    """

    def __init__(self, enc_hid_dim):
        super().__init__()
        embedding_projection, embedding_general_output_size = get_embedding_projection()
        self.embeddings = torch.nn.ModuleList(embedding_projection)
        self.credits_rnn = torch.nn.GRU(embedding_general_output_size, enc_hid_dim, batch_first=True, bidirectional=True)
        self.relu = torch.nn.ReLU()
        self.fc = torch.nn.Linear(in_features=enc_hid_dim * 2, out_features=1)

    def forward(self, X):
        """Compute one logit per sequence.

        Args:
            X: integer tensor ``(batch, seq, n_features)``, or a single
                unbatched sequence ``(seq, n_features)``. The last axis must
                line up with ``self.embeddings``.

        Returns:
            ``(batch, 1)`` logits, or ``(1,)`` for an unbatched input.
        """
        unbatched = X.dim() == 2
        if unbatched:
            X = X.unsqueeze(0)

        # Select each feature column by index. The previous split + bare
        # torch.squeeze() also removed a batch or time axis of size one, which
        # made the GRU read a whole batch of length-1 sequences as one sequence.
        embeddings = [emb(X[..., i]) for i, emb in enumerate(self.embeddings)]
        concatted_emb = torch.cat(embeddings, dim=-1)

        _, hidden_credits_rnn = self.credits_rnn(concatted_emb)

        # Final hidden states of the forward and backward directions.
        rnn_stack = torch.cat([hidden_credits_rnn[0], hidden_credits_rnn[1]], dim=-1)
        output = self.fc(self.relu(rnn_stack))
        return output.squeeze(0) if unbatched else output

In [164]:
from torchmetrics import AUROC
import logging
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

logging.getLogger("lightning").setLevel(logging.ERROR)
class LightningModule(pl.LightningModule):
    """Lightning wrapper that trains a sequence model with class-weighted BCE.

    Args:
        model: module mapping an integer batch ``(batch, seq, n_features)`` to
            one logit per row.
        pos_weight: weight of the positive class in the BCE loss (default 27,
            the value previously hard-coded).
    """

    def __init__(self, model, pos_weight=27.0):
        super().__init__()
        self.model = model
        self.metric = AUROC(average='weighted')
        # BCEWithLogitsLoss registers pos_weight as a buffer, so it follows the
        # module across devices; no reliance on the notebook-global `device`.
        self.loss = torch.nn.BCEWithLogitsLoss(
            reduction="none", pos_weight=torch.tensor([float(pos_weight)])
        )

    @property
    def weights(self):
        """Positive-class weight tensor (kept for code that read ``self.weights``)."""
        return self.loss.pos_weight

    def _logits(self, x):
        """Run the model and return a flat ``(batch,)`` logit vector."""
        # A DataLoader that adds its own batch axis of size one around an
        # already-built batch yields a 4-D tensor; drop only that axis.
        if x.dim() == 4 and x.shape[0] == 1:
            x = x.squeeze(0)
        # reshape(-1) keeps a 1-D result even for a single-row batch, where a
        # bare squeeze() would produce a 0-d tensor.
        return self.model(x).reshape(-1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        output = self._logits(x)
        loss = self.loss(output, y.reshape(-1)).mean()
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        output = self._logits(x)
        # Row 0: logits, row 1: targets; concatenated over batches at epoch end.
        return torch.row_stack([output, y.reshape(-1)])

    def test_epoch_end(self, test_step_outputs):
        catted = torch.cat(test_step_outputs, dim=1)
        # The value logged under "test_loss" is the AUROC; the key is kept
        # because objective_on_fold reads it from callback_metrics.
        self.log("test_loss", self.metric(catted[0], catted[1].long()).item())

    def predict_step(self, batch, batch_idx):
        x, ids = batch
        scores = torch.sigmoid(self._logits(x))
        ids = ids.reshape(-1)
        # {id: probability} with plain Python numbers.
        return dict(zip(ids.detach().cpu().tolist(), scores.detach().cpu().tolist()))

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters())
        return optimizer

    def forward(self, x):
        return self.model(x)
    

In [60]:
from pytorch_lightning.callbacks import RichProgressBar

In [172]:
def fit_model(hidden_size, epoch, train_dataloader):
    """Train a fresh ``Model`` on one dataloader and return the fitted trainer.

    Args:
        hidden_size: GRU hidden size passed to ``Model``.
        epoch: maximum number of training epochs.
        train_dataloader: dataloader yielding ``(X, y)`` batches.

    Returns:
        The ``pl.Trainer`` after ``fit`` has completed.
    """
    model = Model(hidden_size)
    module = LightningModule(model)
    callbacks = [RichProgressBar(leave=True)]
    # Fall back to CPU when CUDA is unavailable instead of failing at Trainer construction.
    accelerator = "gpu" if torch.cuda.is_available() else "cpu"
    trainer = pl.Trainer(callbacks=callbacks,
                         max_epochs=epoch,
                         deterministic=True,
                         enable_progress_bar=True,
                         accelerator=accelerator)
    trainer.fit(model=module, train_dataloaders=train_dataloader)
    return trainer

In [55]:
def objective_on_fold(hidden_size, epoch, train_dataloader, test_dataloader):
    """Train on one fold and return the metric logged by the test loop.

    Args:
        hidden_size: GRU hidden size.
        epoch: maximum number of training epochs.
        train_dataloader: dataloader used for fitting.
        test_dataloader: dataloader passed to ``trainer.test``.

    Returns:
        The value stored under ``"test_loss"`` in ``trainer.callback_metrics``
        as a Python float.
    """
    # fit_model takes only the training dataloader; passing the test loader
    # as a fourth argument raised a TypeError.
    trainer = fit_model(hidden_size, epoch, train_dataloader)

    trainer.test(dataloaders=test_dataloader)
    return trainer.callback_metrics["test_loss"].item()

In [45]:
def objective(trial):
    """Optuna objective: mean per-fold test metric for the sampled hyperparameters.

    Args:
        trial: Optuna trial used to sample ``hidden_size`` and ``epoch``.

    Returns:
        The average of ``objective_on_fold`` over all folds.
    """
    # `step` is passed by keyword so the call is valid whether or not the
    # installed Optuna accepts it positionally.
    hidden_size = trial.suggest_int("hidden_size", 5, 400, step=20)
    epoch = trial.suggest_int("epoch", 1, 2)
    losses = []

    # The folds exist only for training data, so request them explicitly;
    # calling get_fold_dataloaders() with no argument raised a TypeError.
    for train_dataloader, test_dataloader in get_fold_dataloaders(True):
        loss = objective_on_fold(hidden_size, epoch, train_dataloader, test_dataloader)
        losses.append(loss)
    return sum(losses) / len(losses)


In [46]:
import logging
import sys
import math
import optuna

# Add stream handler of stdout to show the messages
#optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
# Study name and SQLite storage URL that train() passes to optuna.create_study.
study_name = "transformer"
storage_name = "sqlite:///{}.db".format(study_name)

In [50]:
# NOTE(review): this loads the study "example-study" from example-study.db, not the
# study_name / storage_name ("transformer") that train() uses — confirm which is intended.
loaded_study = optuna.load_study(study_name="example-study", storage=f"sqlite:///example-study.db")

In [52]:
# Best hyperparameters of the loaded study (recorded output: epoch=5, hidden_size=205).
loaded_study.best_params

{'epoch': 5, 'hidden_size': 205}

In [173]:
# Train one model per fold with hidden_size=205 and 5 epochs (the best_params shown for
# the loaded study). Only each fold's train loader is used; test_dataloader is ignored here.
trainers = []
for train_dataloader, test_dataloader in get_fold_dataloaders(True):
    trainers.append(fit_model(205, 5, train_dataloader))

In [174]:
# Test-set loader: with is_train=False FoldDataset reads from 'test_tensors' and does not
# use fold_number / fold_type, so (0, '') are placeholders.
dl = DataLoader(FoldDataset(0,'', False), batch_size=None)

# One list of per-batch prediction outputs for each fold's trainer.
predictions = []
for trainer in trainers:
    predictions.append(trainer.predict(dataloaders=[dl]))

In [181]:
# Number of prediction lists collected, one per trainer (recorded output: 3).
len(predictions)

3

In [184]:
# Average the per-id scores over all prediction lists. The divisor is derived from
# `predictions` instead of a hard-coded 3.0, so the mean stays correct if the number
# of trained models changes.
results = {}
n_models = len(predictions)
for _list in predictions:
    for _dict in _list:
        for key, value in _dict.items():
            results[key] = results.get(key, 0) + value / n_models
        

In [None]:
import warnings
warnings.filterwarnings('ignore')
def train():
    """Create (or resume) the Optuna study and run 30 trials of ``objective``.

    The study is stored under the module-level ``study_name`` /
    ``storage_name`` and maximises the objective value.
    """
    study_options = dict(
        direction="maximize",
        study_name=study_name,
        storage=storage_name,
        load_if_exists=True,
    )
    hpo_study = optuna.create_study(**study_options)
    hpo_study.optimize(objective, n_trials=30)

In [198]:
# Write the averaged scores from `results`, sorted by id, to final.csv.
pd.DataFrame(results.items(), columns=['id', 'score']).sort_values(by=['id']).to_csv('final.csv',index=False)

In [None]:
# NOTE(review): `ids` and `prediction` are not assigned at notebook level in any cell shown
# here; this cell only works with leftover kernel state.
pd.DataFrame(data={'id':[int(e) for e in ids], 'score':prediction}).sort_values(by=['id']).to_csv('rnn_weight_27.csv',index=False)

In [None]:
# NOTE(review): scratch inspection; `x` is not assigned in any cell shown here.
x.shape

In [None]:
# NOTE(review): scratch inspection; `Id` is not assigned in any cell shown here.
Id.shape

In [None]:
def predict(row):
    """Score one row with the global encoder ``enc`` and return a probability.

    NOTE(review): ``enc`` is not defined in any cell shown in this notebook; it
    is assumed to expose ``forward(X, lengths)`` and return a single logit.

    Args:
        row: mapping / Series with the sequence under key ``0`` and its length
            under ``'lengths'``.

    Returns:
        ``sigmoid(score)`` as a Python float.
    """
    X = torch.LongTensor(row[0]).to(device)
    lengths = torch.LongTensor([row['lengths']])
    # Inference only: skip autograd bookkeeping so no graph is kept per row.
    with torch.no_grad():
        score = enc.forward(X, lengths)
    return torch.sigmoid(score).item()

In [None]:
# NOTE(review): `df` is not assigned at notebook level in any cell shown here (it is only a
# local inside write_folder); this cell relies on leftover kernel state.
df['score'] = df.apply(lambda row: predict(row), axis=1)

In [None]:
# Summary statistics of the per-row scores computed by predict().
df['score'].describe()

In [None]:
# Write the id / score columns of `df` to 1_epoch.csv.
df[['id', 'score']].to_csv('1_epoch.csv',index=False)