In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import torch
import torch.nn as nn

In [2]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to imported project modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
!wget -P ../data/raw https://code.datasciencedojo.com/datasciencedojo/datasets/raw/master/Default%20of%20Credit%20Card%20Clients/default%20of%20credit%20card%20clients.csv

--2022-03-11 11:52:08--  https://code.datasciencedojo.com/datasciencedojo/datasets/raw/master/Default%20of%20Credit%20Card%20Clients/default%20of%20credit%20card%20clients.csv
Resolving code.datasciencedojo.com (code.datasciencedojo.com)... 167.99.111.153
Connecting to code.datasciencedojo.com (code.datasciencedojo.com)|167.99.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2867294 (2.7M) [text/plain]
Saving to: ‘../data/raw/default of credit card clients.csv’


2022-03-11 11:52:10 (2.11 MB/s) - ‘../data/raw/default of credit card clients.csv’ saved [2867294/2867294]



In [9]:
# Load the raw CSV. skiprows=1 discards the first line of the file so that the
# second line is used as the header row.
df = pd.read_csv('../data/raw/default of credit card clients.csv', skiprows=1)

# Work on a separate frame without the 'ID' column; `df` stays exactly as loaded.
df_cleaned = df.drop(columns='ID')

target_col = 'default payment next month'
cat_cols = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

# Every column that is neither categorical nor the target is treated as numeric.
# Filtering the columns in their original order (rather than taking a set
# difference) yields the same list on every run, regardless of hash ordering.
num_cols = [col for col in df_cleaned.columns if col not in cat_cols and col != target_col]

In [10]:
from src.data.sets import split_sets_random, save_sets

# NOTE(review): the scaler and the encoder are both fitted on the full dataset
# before it is split below, so validation/test rows influence the fitted
# statistics. Fitting on training rows only would avoid this, but it depends on
# how split_sets_random partitions the data -- confirm before restructuring.

# Build the feature frame from a fresh copy so df_cleaned is never modified.
# This keeps the cell re-runnable: the original in-place drop raised a KeyError
# on a second execution because cat_cols had already been removed.
df_scaled = df_cleaned.drop(columns=cat_cols)

# Standardise the numeric columns.
sc = StandardScaler()
df_scaled[num_cols] = sc.fit_transform(df_scaled[num_cols])

# One-hot encode the categorical columns.
# The dense-output switch was renamed (sparse -> sparse_output) and
# get_feature_names was removed in newer scikit-learn releases. Using the
# default output with .toarray() and deriving the names from `categories_`
# works regardless of the installed version and produces the same
# "<column>_<category>" names.
ohe = OneHotEncoder()
X_cat = pd.DataFrame(
    ohe.fit_transform(df_cleaned[cat_cols]).toarray(),
    index=df_cleaned.index,  # explicit index so concat aligns row-for-row
)
X_cat.columns = [f'{col}_{cat}' for col, cats in zip(cat_cols, ohe.categories_) for cat in cats]

# Scaled numeric columns + target first, encoded categorical columns after.
X = pd.concat([df_scaled, X_cat], axis=1)

X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(X, target_col=target_col, test_ratio=0.2)

In [11]:
!mkdir ../data/processed/credit_card_default
save_sets(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, X_test=X_test, y_test=y_test, path='../data/processed/credit_card_default/')

In [12]:
from src.models.pytorch import PytorchDataset
from src.models.performance import print_class_perf

# Wrap each (features, labels) split in the project's PytorchDataset, in the
# order train -> validation -> test.
train_dataset, val_dataset, test_dataset = (
    PytorchDataset(X=features, y=labels)
    for features, labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

In [14]:
from src.models.null import NullModel
from src.models.performance import print_class_perf

# Baseline reference: fit the project's NullModel on the training labels only
# and score its predictions against those same labels.
# NOTE(review): presumably NullModel predicts a constant class -- confirm in
# src.models.null.
baseline_model = NullModel(target_type='classification')
y_base = baseline_model.fit_predict(y_train)

print_class_perf(
    y_base,
    y_train,
    set_name='Training',
    average='weighted',
)

Accuracy Training: 0.7787222222222222
F1 Training: 0.6818471055307425


In [16]:
from src.models.pytorch import PytorchBinary
from src.models.pytorch import get_device

# Seed torch's global RNG before the network is built so that weight
# initialisation (and any later torch-driven randomness in this session) is
# repeatable from one run of the notebook to the next.
SEED = 42
torch.manual_seed(SEED)

# The network's input width is the number of columns in the training features.
model = PytorchBinary(X_train.shape[1])
device = get_device()

# Last expression of the cell: moves the model to the device and displays it.
model.to(device)

PytorchBinary(
  (layer_1): Linear(in_features=91, out_features=256, bias=True)
  (layer_out): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [19]:
from src.models.pytorch import train_binary, test_binary

# Run configuration.
N_EPOCHS = 10
BATCH_SIZE = 32

# Binary cross-entropy on probabilities (the model ends in a Sigmoid layer).
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# Multiplies the learning rate by 0.9 on every scheduler step; when the step is
# taken is decided inside train_binary.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

for epoch in range(N_EPOCHS):
    # One pass over the training data, then an evaluation pass on validation.
    train_loss, train_acc = train_binary(
        train_dataset,
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        batch_size=BATCH_SIZE,
        device=device,
        scheduler=scheduler,
    )
    valid_loss, valid_acc = test_binary(
        val_dataset,
        model=model,
        criterion=criterion,
        batch_size=BATCH_SIZE,
        device=device,
    )

    # Per-epoch report: accuracies are shown as percentages.
    print(
        f'Epoch: {epoch}',
        f'\t(train)\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%',
        f'\t(valid)\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%',
        sep='\n',
    )

Epoch: 0
	(train)	Loss: 0.0165	|	Acc: 77.9%
	(valid)	Loss: 0.0148	|	Acc: 77.9%
Epoch: 1
	(train)	Loss: 0.0145	|	Acc: 77.9%
	(valid)	Loss: 0.0141	|	Acc: 77.9%
Epoch: 2
	(train)	Loss: 0.0141	|	Acc: 77.9%
	(valid)	Loss: 0.0140	|	Acc: 77.9%
Epoch: 3
	(train)	Loss: 0.0139	|	Acc: 77.9%
	(valid)	Loss: 0.0138	|	Acc: 77.9%
Epoch: 4
	(train)	Loss: 0.0138	|	Acc: 77.9%
	(valid)	Loss: 0.0139	|	Acc: 77.9%
Epoch: 5
	(train)	Loss: 0.0138	|	Acc: 77.9%
	(valid)	Loss: 0.0138	|	Acc: 77.9%
Epoch: 6
	(train)	Loss: 0.0137	|	Acc: 77.9%
	(valid)	Loss: 0.0138	|	Acc: 77.9%
Epoch: 7
	(train)	Loss: 0.0138	|	Acc: 77.9%
	(valid)	Loss: 0.0138	|	Acc: 77.9%
Epoch: 8
	(train)	Loss: 0.0137	|	Acc: 77.9%
	(valid)	Loss: 0.0137	|	Acc: 77.9%
Epoch: 9
	(train)	Loss: 0.0137	|	Acc: 77.9%
	(valid)	Loss: 0.0137	|	Acc: 77.9%


In [20]:
# Persist the entire model object (not just its state_dict). Because the whole
# module is pickled, loading it later requires the model's class to be importable.
torch.save(obj=model, f="../models/pytorch_clf_default.pt")

In [21]:
# Final evaluation on the held-out test set, using the same criterion and batch
# size as during training.
test_loss, test_acc = test_binary(test_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)

# test_binary returns accuracy as a fraction; report it as a percentage, in the
# same format the per-epoch training/validation lines use, instead of rounding
# the raw fraction to a single decimal.
print(f'\tLoss: {test_loss:.4f}\t|\tAccuracy: {test_acc * 100:.1f}%')

	Loss: 0.0136	|	Accuracy: 0.8
