In [8]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder

# model imports
from sklearn.linear_model import LogisticRegression #linear model
from sklearn.svm import SVC # support vector machine
from sklearn.ensemble import RandomForestClassifier #ensemble model
from sklearn.tree import DecisionTreeClassifier # decision tree model


from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score

#automate EDA
from ydata_profiling import ProfileReport

# Let wide DataFrames render up to 100 columns before pandas elides them.
pd.set_option('display.max_columns', 100)

# Import Dataset

Dataset Details

The research behind this dataset examined customers' default payments in Taiwan and compared the predictive accuracy of the estimated probability of default.

Link to dataset: https://archive.ics.uci.edu/dataset/350/default+of+credit+card+clients

Variable Names



In [9]:
# Load the semicolon-delimited credit-default CSV. Every column is read as a
# string at this point; numeric columns are converted explicitly in a later cell.
df = pd.read_csv(
    '../data/default_credit_dataset.csv',
    sep=';',
    dtype=str,
)

## Data Understanding and Preprocessing

In [10]:
# Peek at five random rows. random_state pins the draw so that a
# Restart & Run All shows the same rows every time.
df.sample(5, random_state=42)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
25028,25029,460000,1,3,1,61,0,-1,-1,-1,-1,-1,10643,1547,1670,6425,4897,8401,1600,1800,6500,5000,5000,4500,0
23220,23221,150000,2,3,2,35,-1,2,-1,2,-1,-1,1877,709,2264,1143,163,2036,0,2264,0,163,2036,0,0
18916,18917,50000,2,1,2,29,0,-1,0,-1,-1,0,17250,8560,4725,8933,11784,9927,8560,1600,8933,11784,199,8287,0
25154,25155,70000,2,2,2,26,2,2,2,0,0,0,71862,73594,71851,68539,49027,48969,3500,0,3100,1900,2000,1865,1
6559,6560,230000,2,2,1,32,0,0,0,0,0,0,57259,46226,36089,34158,22872,22592,2500,3000,3000,1500,1500,2000,1


In [11]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   ID                          30000 non-null  object
 1   LIMIT_BAL                   30000 non-null  object
 2   SEX                         30000 non-null  object
 3   EDUCATION                   30000 non-null  object
 4   MARRIAGE                    30000 non-null  object
 5   AGE                         30000 non-null  object
 6   PAY_0                       30000 non-null  object
 7   PAY_2                       30000 non-null  object
 8   PAY_3                       30000 non-null  object
 9   PAY_4                       30000 non-null  object
 10  PAY_5                       30000 non-null  object
 11  PAY_6                       30000 non-null  object
 12  BILL_AMT1                   30000 non-null  object
 13  BILL_AMT2                   30000 non-null  ob

In [12]:
# Shorten the verbose target column name to 'default'.
# The result is reassigned instead of using inplace=True, so the frame that
# earlier cells displayed is not mutated behind the reader's back.
df = df.rename(columns={'default payment next month': 'default'})

Since `df.info()` reports no missing values in the dataset, we can proceed to the next step.

In [13]:
# Class balance of the target: number of rows for each value of 'default'.
df['default'].value_counts()

default
0    23364
1     6636
Name: count, dtype: int64

## Define data type for features and target

In [14]:
# Name of the label column to predict.
target = 'default'

# Columns treated as numeric. The repetitive names are generated rather than
# typed out; note the status columns are PAY_0 and PAY_2..PAY_6 (no PAY_1).
numeric_features = (
    ['LIMIT_BAL', 'AGE']
    + [f'PAY_{month}' for month in (0, 2, 3, 4, 5, 6)]
    + [f'BILL_AMT{month}' for month in range(1, 7)]
    + [f'PAY_AMT{month}' for month in range(1, 7)]
)

# Columns treated as categorical.
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']

In [23]:
# The CSV was loaded with every column as a string; parse the numeric feature
# columns one by one so each gets a proper numeric dtype.
df[numeric_features] = df[numeric_features].apply(
    lambda column: pd.to_numeric(column)
)

In [30]:
# Build the automated EDA report over the feature columns only (the target is
# not included); a copy is passed so the report works on its own frame.
profile = ProfileReport(
    df[[*numeric_features, *categorical_features]].copy(),
    title="Credit Card Default Dataset Profiling Report",
)

In [32]:
# Write the profiling report as an HTML file in the current working directory.
profile.to_file(output_file="./EDA_report.html")

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]