In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import gc
import torch
import random
import pytorch_lightning as pl

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.cpu_count()

12

In [3]:
print(torch.cuda.device_count())
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

1
cuda


In [4]:
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
pl.seed_everything(42, workers=True)

Global seed set to 42


42

In [5]:
mapping = {'pre_since_opened': (20, 9),
 'pre_since_confirmed': (18, 8),
 'pre_pterm': (18, 8),
 'pre_fterm': (17, 8),
 'pre_till_pclose': (17, 8),
 'pre_till_fclose': (16, 8),
 'pre_loans_credit_limit': (20, 9),
 'pre_loans_next_pay_summ': (8, 5),
 'pre_loans_outstanding': (6, 4),
 'pre_loans_total_overdue': (2, 2),
 'pre_loans_max_overdue_sum': (4, 3),
 'pre_loans_credit_cost_rate': (14, 7),
 'pre_loans5': (18, 8),
 'pre_loans530': (20, 9),
 'pre_loans3060': (10, 6),
 'pre_loans6090': (6, 4),
 'pre_loans90': (20, 9),
 'is_zero_loans5': (2, 2),
 'is_zero_loans530': (2, 2),
 'is_zero_loans3060': (2, 2),
 'is_zero_loans6090': (2, 2),
 'is_zero_loans90': (2, 2),
 'pre_util': (20, 9),
 'pre_over2limit': (20, 9),
 'pre_maxover2limit': (20, 9),
 'is_zero_util': (2, 2),
 'is_zero_over2limit': (2, 2),
 'is_zero_maxover2limit': (2, 2),
 'enc_paym_0': (4, 3),
 'enc_paym_1': (4, 3),
 'enc_paym_2': (4, 3),
 'enc_paym_3': (4, 3),
 'enc_paym_4': (4, 3),
 'enc_paym_5': (4, 3),
 'enc_paym_6': (4, 3),
 'enc_paym_7': (4, 3),
 'enc_paym_8': (4, 3),
 'enc_paym_9': (4, 3),
 'enc_paym_10': (4, 3),
 'enc_paym_11': (5, 4),
 'enc_paym_12': (4, 3),
 'enc_paym_13': (4, 3),
 'enc_paym_14': (4, 3),
 'enc_paym_15': (4, 3),
 'enc_paym_16': (4, 3),
 'enc_paym_17': (4, 3),
 'enc_paym_18': (4, 3),
 'enc_paym_19': (4, 3),
 'enc_paym_20': (5, 4),
 'enc_paym_21': (4, 3),
 'enc_paym_22': (4, 3),
 'enc_paym_23': (4, 3),
 'enc_paym_24': (5, 4),
 'enc_loans_account_holder_type': (7, 5),
 'enc_loans_credit_status': (7, 5),
 'enc_loans_credit_type': (8, 5),
 'enc_loans_account_cur': (4, 3),
 'pclose_flag': (2, 2),
 'fclose_flag': (2, 2)}

In [6]:
def pad_sequence(array: np.ndarray, max_len) -> np.ndarray:
    output = np.zeros((max_len, 59))
    output[:array.shape[0], :] = array
    return output

In [7]:
def read_file(file_path, is_train=True):
    data = pd.read_parquet(file_path)
    
    data.drop(columns=['rn'], inplace=True)
    data = data.groupby(['id']).agg(list).agg(list, axis="columns").reset_index()
    data[0]=data[0].apply(lambda x:(np.array(x) + 1).T) ##becaouse of padding value = 0
    
    if is_train:
        target = pd.read_csv('train_target.csv')    
        data_target = data.merge(target, on="id")
        return data_target
    else:
        return data


def read_folder(is_train=True):
    
    if is_train:
        folder_path = 'train_data'
    else:
        folder_path = 'test_data'    
    
    dfs = []
    dataset_paths = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path)]
    for f in tqdm(dataset_paths, total=len(dataset_paths)):
        dfs.append(read_file(f, is_train))
    df = pd.concat(dfs)
            
    df['lengths'] = df[0].apply(lambda x: x.shape[0])
    buckets = [(0, 10), (10, 20), (20, 30), (30, 60)]
    for start, to in buckets:
        mini_df = df[(df['lengths'] > start) & (df['lengths'] <= to)]
        mini_df.loc[:,0] = mini_df[0].apply(lambda x: pad_sequence(x, to))
        mini_df = mini_df.sample(frac=1, random_state=SEED)
        
        if is_train:        
            train_diveder = int(len(mini_df) * 0.7)
            val_diveder = int(len(mini_df) * 0.9)
            
            train = mini_df[:train_diveder]
            val = mini_df[train_diveder:val_diveder]
            test = mini_df[val_diveder:]
            
            def get_X(df):
                return torch.LongTensor(np.stack(df[0].values))  
            
            def get_y(df):
                return torch.Tensor(df['flag'].values)
        
            torch.save(get_X(train), f'train_tensors/X_train_{to}.pt')
            torch.save(get_y(train), f'train_tensors/y_train_{to}.pt')
            
            torch.save(get_X(val), f'train_tensors/X_val_{to}.pt')
            torch.save(get_y(val), f'train_tensors/y_val_{to}.pt')
            
            torch.save(get_X(test), f'train_tensors/X_test_{to}.pt')
            torch.save(get_y(test), f'train_tensors/y_test_{to}.pt')
            
        else:
              
            X = torch.LongTensor(np.stack(mini_df[0].values))
            Id = torch.Tensor(mini_df['id'].values)
            
            torch.save(X, f'test_tensors/X_{to}.pt')
            torch.save(Ids, f'test_tensors/ids_{to}.pt')


            
class IterableDataset(torch.utils.data.IterableDataset):
    def __init__(self, batch_size, mode):
        super(IterableDataset).__init__()
        self.mode = mode
        self.batch_size = batch_size
        self.sets = [10, 20, 30, 60]        
        self.folder_path = 'train_tensors'
        
        
    def __iter__(self):
        for padding_set in self.sets:
            X = torch.load(f'{self.folder_path}/X_{self.mode}_{padding_set}.pt')
            y = torch.load(f'{self.folder_path}/y_{self.mode}_{padding_set}.pt')
            
            batches_count = X.shape[0] // self.batch_size
            
            for _X, _y in zip(torch.tensor_split(X, batches_count, dim=0), torch.tensor_split(y, batches_count, dim=0)):
                yield _X, _y

In [8]:
def create_embedding_projection(cardinality, embed_size, add_missing=True):
    add_missing = 1 if add_missing else 0
    return torch.nn.Embedding(num_embeddings=cardinality+add_missing, embedding_dim=embed_size)

def get_embedding_projection():    
    data = pd.read_parquet('train_data/train_data_00.pq')[:10]
    features = data.columns.tolist()
    features.remove('id')
    features.remove('rn')
    embedding_projection = [create_embedding_projection(*mapping[e]) for e in features]
    embedding_general_output_size = sum([mapping[e][1] for e in features])
    return embedding_projection, embedding_general_output_size

In [9]:
class Model(torch.nn.Module):
    def __init__(self, enc_hid_dim):
        super().__init__()        
        embedding_projection, embedding_general_output_size = get_embedding_projection()
        self.embeddings = torch.nn.ModuleList(embedding_projection)
        self.credits_rnn = torch.nn.GRU(embedding_general_output_size, enc_hid_dim, batch_first=True, bidirectional =True)
        self.relu = torch.nn.ReLU()
        self.fc = torch.nn.Linear(in_features=enc_hid_dim * 2, out_features=1)
        
        
    def forward(self, X):
        embeddings =  [emb(tensor) for emb, tensor in zip(self.embeddings, torch.tensor_split(X, 59, dim=-1))]
        concatted_emb = torch.squeeze(torch.cat(embeddings, dim=-1))

        rnn_output, hidden_credits_rnn = self.credits_rnn(concatted_emb)

        rnn_stack = torch.cat([hidden_credits_rnn[0],hidden_credits_rnn[1]], dim=-1) #torch.Size([931, 400])
        output = self.fc(self.relu(rnn_stack))
        return output

In [54]:
from torchmetrics import AUROC
import logging
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

logging.getLogger("lightning").setLevel(logging.ERROR)
class LightningModule(pl.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def training_step(self, batch, batch_idx):
        
        x, y = batch
        output = torch.squeeze(self.model.forward(x))
        loss = torch.nn.BCEWithLogitsLoss(reduction="none")(output, torch.squeeze(y)).mean()
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        output = torch.squeeze(self.model.forward(x))
        y = torch.squeeze(y)
        
        return output, y
        self.validation_outputs.append((output, y))

    def validation_epoch_end(self, validation_step_outputs):
        y_true = []
        y_pred = []
        
        for output, y in validation_step_outputs:
            y_true.append(y)
            y_pred.append(torch.sigmoid(output))
        self.log("val_loss", AUROC()(torch.cat(y_pred), torch.cat(y_true).long()).item(), on_epoch=True)        
        
        
    def test_step(self, batch, batch_idx):
        x, y = batch
        output = torch.squeeze(self.model.forward(x))
        y = torch.squeeze(y)
        
        return output, y
        self.validation_outputs.append((output, y))

    def test_epoch_end(self, test_step_outputs):
        y_true = []
        y_pred = []
        
        for output, y in test_step_outputs:
            y_true.append(y)
            y_pred.append(torch.sigmoid(output))
        self.log("test_loss", AUROC()(torch.cat(y_pred), torch.cat(y_true).long()).item(), on_epoch=True)
        
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters())
        return optimizer
    

In [56]:
model = Model(200)
module = LightningModule(model)

checkpoint_callback = ModelCheckpoint(save_top_k=1, monitor="val_loss", mode="min")
early_stopping_callback = EarlyStopping(monitor="val_loss",mode="min", verbose=True, patience = 2)

trainer = pl.Trainer(callbacks=[early_stopping_callback, checkpoint_callback],
                     max_epochs=10,
                     deterministic=True,
                     enable_progress_bar = False,
                     accelerator="gpu",
                     limit_train_batches=2,
                     limit_val_batches=2)
trainer.fit(model=module, train_dataloaders=torch.utils.data.DataLoader(IterableDataset(1024, 'train')),
            val_dataloaders=torch.utils.data.DataLoader(IterableDataset(1024, 'val')))

trainer.test(dataloaders=torch.utils.data.DataLoader(IterableDataset(128, 'test')), ckpt_path=checkpoint_callback.best_model_path)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type  | Params
--------------------------------
0 | model | Model | 555 K 
--------------------------------
555 K     Trainable params
0         Non-trainable params
555 K     Total params
2.222     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
Metric val_loss improved. New best score: 0.483
Monitored metric val_loss did not improve in the last 2 records. Best score: 0.483. Signaling Trainer to stop.
Restoring states from the checkpoint path at C:\github\credit_scoring\lightning_logs\version_36\checkpoints\epoch=0-step=2.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at C:\github\credit_scoring\lightning_logs\version_36\checkpoints\epoch=0-step=2.ckpt
  rank_zero_warn(


────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           0.5133928060531616
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.5133928060531616}]

In [None]:
preds = []
indexes=[]
for x, Id in get_uploaded_tensors(False):
    x = x.to(device)
    score=enc.forward(x)
    sigm = torch.sigmoid(score)
    for pred, _id in zip (np.squeeze(sigm.detach().cpu().numpy()), Id.detach().cpu().numpy()):
        preds.append(pred)
        indexes.append(_id)

In [None]:
pd.DataFrame(data={'id':[int(e) for e in indexes], 'score':preds}).sort_values(by=['id']).to_csv('1_epoch.csv',index=False)

In [None]:
x.shape

In [None]:
Id.shape

In [None]:
def predict(row):
    X = torch.LongTensor(row[0])
    X = X.to(device)
    lengths = torch.LongTensor([row['lengths']])
    score = enc.forward(X, lengths)
    sigm = torch.sigmoid(score).item()
    del X
    return sigm

In [None]:
df['score'] = df.apply(lambda row: predict(row), axis=1)

In [None]:
df['score'].describe()

In [None]:
df[['id', 'score']].to_csv('1_epoch.csv',index=False)