<a href="https://colab.research.google.com/github/hiroci/mlp/blob/main/mlp_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing libraries

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Reading dataset

In [None]:
# Read the CSV from disk once. `data` keeps the untouched raw frame and `df`
# is an independent working copy that the preprocessing cells below modify.
data = pd.read_csv('Loan_Default.csv')
df = data.copy()

## EDA

In [None]:
# Preview the first five rows to get a feel for the columns and their values.
df.head()

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0


In [None]:
# Summarise dtype and non-null count per column; a non-null count below the
# total number of rows means the column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148670 entries, 0 to 148669
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   ID                         148670 non-null  int64  
 1   year                       148670 non-null  int64  
 2   loan_limit                 145326 non-null  object 
 3   Gender                     148670 non-null  object 
 4   approv_in_adv              147762 non-null  object 
 5   loan_type                  148670 non-null  object 
 6   loan_purpose               148536 non-null  object 
 7   Credit_Worthiness          148670 non-null  object 
 8   open_credit                148670 non-null  object 
 9   business_or_commercial     148670 non-null  object 
 10  loan_amount                148670 non-null  int64  
 11  rate_of_interest           112231 non-null  float64
 12  Interest_rate_spread       112031 non-null  float64
 13  Upfront_charges            10

In [None]:

# List every column name in the frame.
df.columns

Index(['ID', 'year', 'loan_limit', 'Gender', 'approv_in_adv', 'loan_type',
       'loan_purpose', 'Credit_Worthiness', 'open_credit',
       'business_or_commercial', 'loan_amount', 'rate_of_interest',
       'Interest_rate_spread', 'Upfront_charges', 'term', 'Neg_ammortization',
       'interest_only', 'lump_sum_payment', 'property_value',
       'construction_type', 'occupancy_type', 'Secured_by', 'total_units',
       'income', 'credit_type', 'Credit_Score', 'co-applicant_credit_type',
       'age', 'submission_of_application', 'LTV', 'Region', 'Security_Type',
       'Status', 'dtir1'],
      dtype='object')

In [None]:
# Count the distinct values in each column.
df.nunique()

ID                           148670
year                              1
loan_limit                        2
Gender                            4
approv_in_adv                     2
loan_type                         3
loan_purpose                      4
Credit_Worthiness                 2
open_credit                       2
business_or_commercial            2
loan_amount                     211
rate_of_interest                131
Interest_rate_spread          22516
Upfront_charges               58271
term                             26
Neg_ammortization                 2
interest_only                     2
lump_sum_payment                  2
property_value                  385
construction_type                 2
occupancy_type                    3
Secured_by                        2
total_units                       4
income                         1001
credit_type                       4
Credit_Score                    401
co-applicant_credit_type          2
age                         

In [None]:
# Class balance of the target: row counts per Status value, and the mean of
# Status (with 0/1 labels this is the fraction of rows labelled 1).
df['Status'].value_counts(), df['Status'].mean()

(0    112031
 1     36639
 Name: Status, dtype: int64,
 0.24644514696979888)

## Preprocessing

In [None]:
# Inspect how many values are missing in each column. Columns must NOT be
# dropped here: removing every column that contains a NaN would delete
# features (e.g. 'loan_limit', 'approv_in_adv') that the encoding cell below
# selects by name, and would leave the imputation cell with nothing to fill.
df.isna().sum()

ID                           0
year                         0
loan_limit                   0
Gender                       0
approv_in_adv                0
loan_type                    0
loan_purpose                 0
Credit_Worthiness            0
open_credit                  0
business_or_commercial       0
loan_amount                  0
rate_of_interest             0
Interest_rate_spread         0
Upfront_charges              0
term                         0
Neg_ammortization            0
interest_only                0
lump_sum_payment             0
property_value               0
construction_type            0
occupancy_type               0
Secured_by                   0
total_units                  0
income                       0
credit_type                  0
Credit_Score                 0
co-applicant_credit_type     0
age                          0
submission_of_application    0
LTV                          0
Region                       0
Security_Type                0
Status  

In [None]:
# Impute missing values column by column: float64/int64 columns get their
# mean, every other column gets its most frequent value (the mode).
# The result is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on `df[col]`: that form operates on a temporary
# Series and is not guaranteed to update `df` (it is a no-op under pandas
# Copy-on-Write).
# NOTE(review): 'Interest_rate_spread' has 112031 non-null rows, the same as
# the number of Status == 0 rows, so missingness may be tied to the target
# and mean imputation could leak the label into the features -- confirm.
for col in df.columns:
    if df[col].dtype == 'float64' or df[col].dtype == 'int64':
        fill_value = df[col].mean()
    else:
        fill_value = df[col].mode()[0]
    df[col] = df[col].fillna(fill_value)

In [None]:
# Nominal (unordered) categorical features; these are one-hot encoded below.
categorical_non_ordinal = [
    'loan_limit',
    'Gender',
    'approv_in_adv',
    'loan_type',
    'loan_purpose',
    'Credit_Worthiness',
    'open_credit',
    'business_or_commercial',
    'Neg_ammortization',
    'interest_only',
    'lump_sum_payment',
    'construction_type',
    'occupancy_type',
    'Secured_by',
    'total_units',
    'credit_type',
    'co-applicant_credit_type',
    'submission_of_application',
    'Region',
    'Security_Type',
]


In [None]:
# One-hot encode the nominal features and wrap the encoder's sparse output in
# a sparse-backed DataFrame with one named column per category.
encoder = OneHotEncoder()
encoded = encoder.fit_transform(df[categorical_non_ordinal])
# Reuse df's index so the later column-wise concat with the numeric features
# aligns row-for-row even if df's index is not the default 0..n-1 range.
categorical = pd.DataFrame.sparse.from_spmatrix(
    encoded,
    index=df.index,
    columns=encoder.get_feature_names_out(),
)

In [None]:
# Inspect the age brackets present so each can be mapped to a number below.
df['age'].unique()

array(['25-34', '55-64', '35-44', '45-54', '65-74', '>74', '<25'],
      dtype=object)

In [None]:
# Replace each age bracket by a representative number: the midpoint of the
# closed brackets, and the boundary value for the open-ended '<25' and '>74'.
age_midpoints = {
    '25-34': (25 + 34) / 2,
    '55-64': (55 + 64) / 2,
    '35-44': (35 + 44) / 2,
    '45-54': (45 + 54) / 2,
    '65-74': (65 + 74) / 2,
    '<25': 25,
    '>74': 74,
}
# Look each value up explicitly and cast to float rather than relying on
# `replace` to silently down-cast the object column to a numeric dtype
# (deprecated in recent pandas). Values missing from the table pass through
# unchanged, so re-running the cell is harmless, while an unexpected bracket
# string fails loudly in the cast instead of reaching the scaler.
df['age'] = df['age'].map(lambda bracket: age_midpoints.get(bracket, bracket)).astype(float)

In [None]:
# Remove the nominal columns (their one-hot form lives in `categorical`),
# then 'ID' (unique per row) and 'year' (a single distinct value).
df = (
    df
    .drop(columns=categorical_non_ordinal)
    .drop(columns=['ID', 'year'])
)


### Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler instance used by the next cell to standardise the feature columns.
scaler = StandardScaler()

In [None]:
# Separate the target from the features, then standardise the features
# (the columns left in df at this point are expected to be numeric).
X = df.drop(columns='Status')
y = df['Status']
# Fit the scaler once on all columns. StandardScaler works column-wise, so the
# values match scaling each column separately, but the fitted scaler now holds
# the statistics of every feature (refitting it per column left it with only
# the last column's) and can be reused to transform new data.
# NOTE(review): the statistics are computed on the full dataset, before the
# train/validation/test split below -- consider fitting on training rows only.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# Append the one-hot encoded columns to the scaled numeric features
# (rows are matched on their index labels).
X = pd.concat([X, categorical], axis=1)

In [None]:
class Data(Dataset):
    """Dataset pairing feature rows with their targets.

    Args:
        data: 2-D array-like of features, one row per sample (NumPy array or
            pandas DataFrame).
        targets: 1-D array-like of targets, one entry per sample (NumPy array
            or pandas Series).

    Indexing returns a ``(features, target)`` pair of float32 tensors.
    """

    def __init__(self, data, targets):
        # Store plain NumPy arrays so that indexing is always positional.
        # pandas objects index by label (a DataFrame by column), which would
        # make ``self.data[idx]`` fail or pick the wrong sample.
        self.data = np.asarray(data)
        self.targets = np.asarray(targets)

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``(features, target)`` float32 tensors."""
        features = torch.from_numpy(self.data[idx]).float()
        # np.array wraps a scalar target in a 0-d array so from_numpy accepts it.
        target = torch.from_numpy(np.array(self.targets[idx])).float()
        return features, target

# Pass NumPy arrays: Data indexes with `self.data[idx]`, which on a DataFrame
# is a column lookup rather than a row lookup.
dataset = Data(X.to_numpy(), y.to_numpy())

### Splitting the data

In [None]:
from sklearn.model_selection import train_test_split
batch_size = 256

# Fractions of the WHOLE dataset assigned to each subset.
train_pct = 0.8
val_pct = 0.1
test_pct = 0.1
assert abs(train_pct + val_pct + test_pct - 1.0) < 1e-9, 'split fractions must sum to 1'

# Fixed seed so the split (and therefore the reported metrics) is reproducible.
split_seed = 42

# First carve out the test set. The split is stratified on the target so each
# subset keeps the class ratio of the imbalanced Status column.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_pct, stratify=y, random_state=split_seed)
# val_pct is a fraction of the whole dataset, so rescale it to a fraction of
# the rows that remain; passing val_pct directly would give a 9% validation
# set and an 81% training set instead of the declared 10% / 80%.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=val_pct / (1 - test_pct), stratify=y_train,
    random_state=split_seed)

# Wrap each subset as NumPy-backed datasets.
train_data = Data(X_train.to_numpy(), y_train.to_numpy())
val_data = Data(X_val.to_numpy(), y_val.to_numpy())
test_data = Data(X_test.to_numpy(), y_test.to_numpy())

# Only the training loader shuffles; evaluation order stays fixed.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

## PyTorch NN

In [None]:
class MLP(nn.Module):
  """Fully connected feed-forward network bundled with its loss and optimizer.

  Args:
    input_dim: number of input features.
    output_dim: number of output units.
    layers: sizes of the hidden layers, in order.
    hidden_activations: one activation module per hidden layer.
    output_activation: activation module applied after the output layer.
    lr: learning rate of the SGD optimizer.

  Raises:
    ValueError: if `layers` and `hidden_activations` differ in length.
  """

  def __init__(self, input_dim, output_dim, layers, hidden_activations, output_activation, lr=0.001):
    super(MLP, self).__init__()

    if len(layers) != len(hidden_activations):
      # zip() below would silently truncate to the shorter list and build a
      # network different from the one requested, so fail fast instead.
      raise ValueError(
          'Wrong setup: '
          f'layers: {len(layers)}, hidden_activations: {len(hidden_activations)}')

    all_layers = []
    prev_dim = input_dim

    # Hidden stack: Linear -> activation for every requested layer size.
    for layer_dim, activation in zip(layers, hidden_activations):
      all_layers.append(nn.Linear(prev_dim, layer_dim, device=device))
      all_layers.append(activation)
      prev_dim = layer_dim

    # Output head.
    all_layers.append(nn.Linear(prev_dim, output_dim, device=device))
    all_layers.append(output_activation)

    self.layers = nn.Sequential(*all_layers)

    # NOTE(review): MSE on a sigmoid output trains, but nn.BCELoss is the usual
    # choice for binary classification -- left unchanged to keep behaviour.
    self.loss = nn.MSELoss()
    self.optimizer = torch.optim.SGD(self.parameters(), lr=lr)

  def forward(self, X):
    """Run the network on `X` and return the activated output."""
    # Move the input to wherever the weights live instead of hard-coding
    # .cuda(), which raises on machines without a GPU.
    return self.layers(X.to(next(self.parameters()).device))

  def fit(self, dataloader: DataLoader):
    """Train for one pass over `dataloader`; return the mean per-batch loss."""
    self.train()
    total_loss = 0
    model_device = next(self.parameters()).device

    for batch_x, batch_y in dataloader:
      batch_x, batch_y = batch_x.to(model_device), batch_y.to(model_device)
      self.optimizer.zero_grad()
      yhat = self.forward(batch_x)
      # Flatten the per-sample output to 1-D so it lines up with the targets
      # (this reshape only works when each sample has a single output value).
      loss = self.loss(yhat.view(yhat.shape[0]), batch_y)
      loss.backward()
      self.optimizer.step()
      total_loss += loss.item()

    return total_loss/len(dataloader)


In [None]:
activation_list = [nn.ReLU()]
output_activation = nn.Sigmoid()

# One hidden layer of 100 ReLU units and a single sigmoid output.
model = MLP(
    # Take the feature count from the training matrix instead of hard-coding
    # it, so the model stays in sync with the encoded feature set.
    input_dim=X_train.shape[1],
    output_dim=1,
    layers=[100],
    hidden_activations=activation_list,
    output_activation=output_activation,
    lr=.1
)

In [None]:
# Train for a fixed number of epochs, printing each epoch's mean batch loss.
epochs = 20
for _ in range(epochs):
  loss = model.fit(train_loader)
  print(loss)

0.1495362420876821
0.09329273371369975
0.0642190257644957
0.0423896534214164
0.029467436225962235
0.02241119977102601
0.018219510755736988
0.015440971121951273
0.013500241553826127
0.012035698804909446
0.010902974034329351
0.009991525533963035
0.009267602047651627
0.00864306908553652
0.00812084920113143
0.007656619222162414
0.007256868273126566
0.006912041365044417
0.006590178985526338
0.006316260523172669


In [None]:
# Score a single test row. The module is called directly (not `.forward`) so
# nn.Module hooks run, and gradient tracking is disabled because this is
# inference only.
model.eval()
with torch.no_grad():
  first_pred = model(torch.tensor(X_test.iloc[0]).float())
first_pred

tensor([0.0004], device='cuda:0', grad_fn=<SigmoidBackward0>)

In [None]:
# Predict the whole test set in one forward pass instead of one call per row:
# far fewer Python-level calls, and no autograd graph is built.
model.eval()
with torch.no_grad():
  test_features = torch.from_numpy(X_test.to_numpy()).float()
  # Flatten the per-row outputs into a plain list of floats, one score per
  # test row, in the same order as y_test.
  preds = model(test_features).reshape(-1).cpu().tolist()

In [None]:
from sklearn.metrics import classification_report

# Round the scores to the nearest integer to get hard labels, then print the
# per-class precision, recall and F1 on the test set.
preds = np.rint(preds)
report = classification_report(y_true=y_test, y_pred=preds)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     11239
           1       0.98      1.00      0.99      3628

    accuracy                           0.99     14867
   macro avg       0.99      1.00      0.99     14867
weighted avg       0.99      0.99      0.99     14867

