In [1]:
import pandas as pd
import numpy as np
import yaml
import csv
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader
from models_def import MultiLayerPerceptron, Bagging, TabularCNN, EarlyStopper, train_model, eval_model

# Loading Data and Weights

In [2]:
# Only these two columns are read from the parquet files: the per-row feature
# list and the target column.
read_variables = ["features", "default_ind"]

# Use the GPU when one is available, otherwise fall back to the CPU so that the
# later `.to(device)` calls do not fail on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# are repeatable between runs.
SEED = 42
torch.manual_seed(SEED)

# config.yaml supplies the root directory of the processed data.
with open("config.yaml") as f:
    config = yaml.safe_load(f)
data_path = config["processed_path"]

train_df = pd.read_parquet(f"{data_path}/processed_train_test_data/train_df.parquet", columns=read_variables)
test_df = pd.read_parquet(f"{data_path}/processed_train_test_data/test_df.parquet", columns=read_variables)

# Each CSV row is turned into one dict entry (first field -> second field); both
# are strings as returned by csv.reader. The csv module expects the file to be
# opened with newline="".
with open(f'{data_path}/class_weights_dict.csv', newline="") as csv_file:
    reader = csv.reader(csv_file)
    weights_dict = dict(reader)

# Preparing Tensors for Models

In [3]:
# Feature matrices: every row of the "features" column holds a list, so the
# column is converted to a list of lists and stacked into one float32 array.
X_train_full, X_test = (
    np.array(frame["features"].tolist(), dtype=np.float32)
    for frame in (train_df, test_df)
)

# Label vectors: the "default_ind" column cast to float32.
y_train_full, y_test = (
    frame["default_ind"].values.astype(np.float32)
    for frame in (train_df, test_df)
)

In [4]:
# Hold out 20% of the training rows as a validation set. The split is
# stratified on the labels and uses a fixed random_state so it is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    stratify=y_train_full,
    random_state=42,
)

In [5]:
# Quick sanity check of the training feature matrix produced by the split.
# NOTE(review): the values printed below range from 0 up to ~3.5e4 across
# columns, so the features look unscaled — confirm whether standardisation
# happens elsewhere (e.g. inside the models) before training on them.
print(X_train)

[[3.7000e+03 1.2290e+01 1.2341e+02 ... 0.0000e+00 2.0000e+00 0.0000e+00]
 [8.0000e+03 1.1220e+01 2.6275e+02 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.0000e+04 1.3330e+01 3.3854e+02 ... 9.0000e+00 6.0000e+00 0.0000e+00]
 ...
 [1.5000e+04 1.2290e+01 5.0030e+02 ... 1.0000e+00 7.0000e+00 0.0000e+00]
 [2.9050e+04 2.0990e+01 7.8574e+02 ... 0.0000e+00 2.4000e+01 0.0000e+00]
 [3.5000e+04 1.6590e+01 8.6215e+02 ... 0.0000e+00 1.2000e+01 0.0000e+00]]


In [6]:
def _to_tensor_dataset(features, labels):
    """Wrap a feature array and its label array as float32 tensors in a TensorDataset."""
    return TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.float32),
    )


# One dataset per split; each item is a (features, label) pair.
train_tensor_ds = _to_tensor_dataset(X_train, y_train)
val_tensor_ds = _to_tensor_dataset(X_val, y_val)
test_tensor_ds = _to_tensor_dataset(X_test, y_test)

In [7]:
# Mini-batch size shared by all three loaders.
BATCH_SIZE = 32

# Pinned host memory is only requested when batches will be copied to a CUDA device.
use_pinned_memory = device.type == 'cuda'

# Training batches are reshuffled on every pass over the data.
train_loader = DataLoader(train_tensor_ds,
                          batch_size=BATCH_SIZE,
                          shuffle=True,
                          pin_memory=use_pinned_memory)

# Validation and test data are iterated in a fixed order so that evaluation
# runs see the samples in the same sequence every time.
val_loader = DataLoader(val_tensor_ds,
                        batch_size=BATCH_SIZE,
                        shuffle=False,
                        pin_memory=use_pinned_memory)

test_loader = DataLoader(test_tensor_ds,
                         batch_size=BATCH_SIZE,
                         shuffle=False,
                         pin_memory=use_pinned_memory)

In [8]:
# Ratio of the class-'1' weight to the class-'0' weight. The CSV values are
# strings, hence the float() conversions. Stored as a one-element tensor on
# the training device.
positive_to_negative = float(weights_dict['1']) / float(weights_dict['0'])
weights = torch.tensor([positive_to_negative], dtype=torch.float).to(device)

# Parameters and Model Training

In [9]:
# Number of feature columns, handed to the model constructor.
num_features = X_train.shape[1]

# Choose a model
# model = MultiLayerPerceptron(num_features)
# model = Bagging(num_features)
# The selected model is built and moved to the training device in one step.
model = TabularCNN(num_features).to(device)

In [10]:
# Optimiser: Adam over all model parameters with a learning rate of 1e-3.
optimiser = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Loss function: binary cross-entropy on raw logits, with `weights` passed as
# the positive-class weight.
bce = torch.nn.BCEWithLogitsLoss(pos_weight=weights)

In [11]:
# Train for up to 20 epochs, handing train_model an EarlyStopper configured
# with patience=5 and min_delta=0.001.
early_stopper = EarlyStopper(min_delta=0.001, patience=5)
train_model(
    model,
    train_loader,
    val_loader,
    optimiser,
    bce,
    epochs=20,
    early_stop=early_stopper,
)

Epoch 1: Train Loss = 19706.7165, Val Loss = 1.0276, Val AUC = 0.8549
Epoch 2: Train Loss = 18260.2678, Val Loss = 2.1816, Val AUC = 0.9057
Epoch 3: Train Loss = 17857.1889, Val Loss = 8.2156, Val AUC = 0.4255
Epoch 4: Train Loss = 17765.7053, Val Loss = 1.1693, Val AUC = 0.9155
Epoch 5: Train Loss = 17614.4623, Val Loss = 2.1985, Val AUC = 0.9276
Epoch 6: Train Loss = 17540.1293, Val Loss = 3.7620, Val AUC = 0.9012
Stopped early at epoch 6


In [12]:
# Evaluate the trained model on the test loader.
# NOTE(review): the recorded output reports Precision and Recall of 0.0 next to
# a ROC AUC of ~0.90, which presumably means no positive class was predicted at
# the threshold eval_model applies — verify the threshold / logit handling in
# models_def.
eval_model(model, test_loader)

Test Accuracy: 0.9458884550381724
Precision: 0.0
Recall: 0.0
ROC AUC: 0.9009592831143862


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
