In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("data/UCI_Credit_Card.csv")

### About Dataset

This dataset contains information on **default payments** of credit card clients in Taiwan covering a six month period from April 2005 to September 2005. It includes columns with insights about the clients' demographic characteristics, payment histories, and credit behavior

##### Demographic Variables
- `USER_ID`: Unique identifier for each client.

- `CREDIT_LIMIT`: Credit limit in NT dollars (includes individual and family/supplementary credit).

- `SEX`: Gender of the client (1 = Male, 2 = Female).

- `EDUCATION`: Educational background of the client:
  - 1 = Graduate School
  - 2 = University
  - 3 = High School
  - 4 = Others

- `MARITALSTATUS`: Marital status of the client (1 = Married, 2 = Single, 3 = Others).

- `AGE`: Age of the client in years.


##### Billing and Payment Variables
- `BILL_AMTx`: Amount of bill statement (NT dollar)
  - `BILL_AMT1` - `BILL_AMT6`: Amount of bill statement in September - April, 2005


- `PAYMENTDELAY_1` to `PAYMENTDELAY_6`: Monthly repayment status from August 2005 to April 2005 (reverse chronological order).
  - `PAYMENTDELAY_x`: Repayment status where
    - -1 = Payment made on time
    - 1 = Payment delayed for 1 month
    - 2 = Payment delayed for 2 months
    - …
    - 8 = Delay of 8 months
    - 9 = Delay of 9 months or more

- `PAID_AMTx`: Amount of previous payment (NT dollar)
  - `PAID_AMT1` - `PAID_AMT6`: Amount of previous payment in September - April, 2005

##### Target Variable
- `DID_DEFAULT_PAYMENT`: Whether the client defaulted on payment in the next month (1 = Yes, 0 = No)

In [3]:
df.head()

Unnamed: 0,USER_ID,CREDIT_LIMIT,SEX,EDUCATION,MARITALSTATUS,AGE,PAYMENTDELAY_1,PAYMENTDELAY_2,PAYMENTDELAY_3,PAYMENTDELAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAID_AMT1,PAID_AMT2,PAID_AMT3,PAID_AMT4,PAID_AMT5,PAID_AMT6,DID_DEFAULT_PAYMENT
0,1,20000.0,2,2,1,24,2,2,0,0,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,0,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,0,0,0,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   USER_ID              30000 non-null  int64  
 1   CREDIT_LIMIT         30000 non-null  float64
 2   SEX                  30000 non-null  int64  
 3   EDUCATION            30000 non-null  int64  
 4   MARITALSTATUS        30000 non-null  int64  
 5   AGE                  30000 non-null  int64  
 6   PAYMENTDELAY_1       30000 non-null  int64  
 7   PAYMENTDELAY_2       30000 non-null  int64  
 8   PAYMENTDELAY_3       30000 non-null  int64  
 9   PAYMENTDELAY_4       30000 non-null  int64  
 10  PAYMENTDELAY_5       30000 non-null  int64  
 11  PAYMENTDELAY_6       30000 non-null  int64  
 12  BILL_AMT1            30000 non-null  float64
 13  BILL_AMT2            30000 non-null  float64
 14  BILL_AMT3            30000 non-null  float64
 15  BILL_AMT4            30000 non-null 

### Training
Since the dataset is about credit defaults, we will train a classifier that can predict whether a client will default on their payment in the next month. The performance of the model will be evaluated using the Accuracy and F1 Score of the model

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [6]:
# Segregate the data into features and target variables

target_col = ["DID_DEFAULT_PAYMENT"]
feature_cols = [
    "CREDIT_LIMIT",
    "SEX",
    "EDUCATION",
    "MARITALSTATUS",
    "AGE",
    "PAYMENTDELAY_1",
    "PAYMENTDELAY_2",
    "PAYMENTDELAY_3",
    "PAYMENTDELAY_4",
    "PAYMENTDELAY_5",
    "PAYMENTDELAY_6",
    "BILL_AMT1",
    "BILL_AMT2",
    "BILL_AMT3",
    "BILL_AMT4",
    "BILL_AMT5",
    "BILL_AMT6",
    "PAID_AMT1",
    "PAID_AMT2",
    "PAID_AMT3",
    "PAID_AMT4",
    "PAID_AMT5",
    "PAID_AMT6",
]


X = df[feature_cols]
y = df[target_col]

In [7]:
# Split the data into training and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [10]:
param_grid = {
    'max_depth': np.arange(3, 5),
    'criterion' : ['gini','entropy'],
    'max_leaf_nodes': [5,10],
    'min_samples_split': [2, 5]
}

# Create the grid
grid_tree = GridSearchCV(DecisionTreeClassifier(), param_grid, cv = 5, scoring= 'accuracy')

# Training
grid_tree.fit(X_train, y_train)


print(f"Best Estimator: {grid_tree.best_estimator_}")
print(f"Best Accuracy: {np.abs(grid_tree.best_score_)}")

Best Estimator: DecisionTreeClassifier(max_depth=np.int64(4), max_leaf_nodes=10)
Best Accuracy: 0.8203750000000001


In [11]:
single_entry = X_test.sample(n=1)

In [19]:
grid_tree.best_estimator_.predict(single_entry)
probs = grid_tree.best_estimator_.predict_proba(single_entry)
print(probs)

[[0.89242036 0.10757964]]


### Model Export
Now that the classifier is trained, the next step is to deploy it within a Python service. To do this, we will export the trained model as a joblib file. By saving it in pickle format, we will simplify our inference by enabling quick model loading and reuse without having to do any complex setup process.

In [15]:
import joblib

In [16]:
joblib.dump(grid_tree.best_estimator_, "risk_radar_model")

['risk_radar_model']

In [17]:
m = joblib.load("risk_radar_model")

In [18]:
prediction = m.predict(single_entry).item()
probs = m.predict_proba(single_entry)
confidence = probs.max(axis=1).item()

print(f"Prediction: {prediction}, Confidence: {confidence}")

Prediction: 0, Confidence: 0.8924203588429147


In [20]:
single_entry.to_dict(orient='records')

[{'CREDIT_LIMIT': 500000.0,
  'SEX': 1,
  'EDUCATION': 3,
  'MARITALSTATUS': 1,
  'AGE': 47,
  'PAYMENTDELAY_1': 0,
  'PAYMENTDELAY_2': 0,
  'PAYMENTDELAY_3': 0,
  'PAYMENTDELAY_4': 0,
  'PAYMENTDELAY_5': 0,
  'PAYMENTDELAY_6': 0,
  'BILL_AMT1': 18033.0,
  'BILL_AMT2': 8783.0,
  'BILL_AMT3': 13202.0,
  'BILL_AMT4': 16546.0,
  'BILL_AMT5': 12585.0,
  'BILL_AMT6': 14287.0,
  'PAID_AMT1': 8783.0,
  'PAID_AMT2': 13357.0,
  'PAID_AMT3': 16600.0,
  'PAID_AMT4': 12585.0,
  'PAID_AMT5': 14287.0,
  'PAID_AMT6': 25793.0}]