In [2]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
cr_db = pd.read_csv('credit_risk.csv')

In [4]:
cr_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
over_draft                1000 non-null object
credit_usage              1000 non-null int64
credit_history            1000 non-null object
purpose                   1000 non-null object
current_balance           1000 non-null int64
Average_Credit_Balance    1000 non-null object
employment                1000 non-null object
location                  1000 non-null int64
personal_status           1000 non-null object
other_parties             1000 non-null object
residence_since           1000 non-null int64
property_magnitude        1000 non-null object
cc_age                    1000 non-null int64
other_payment_plans       1000 non-null object
housing                   1000 non-null object
existing_credits          1000 non-null int64
job                       1000 non-null object
num_dependents            1000 non-null int64
own_telephone             1000 non-null object
foreign_

# Selecting the feature and target variables

In [5]:
# Predictor attributes
X = cr_db.columns.drop('class')

In [6]:
# Target variables
y = cr_db['class']

# Encoding the features

In [7]:
encoded_X = pd.get_dummies(cr_db[X])

In [8]:
encoded_X.columns

Index(['credit_usage', 'current_balance', 'location', 'residence_since',
       'cc_age', 'existing_credits', 'num_dependents', 'over_draft_0<=X<200',
       'over_draft_<0', 'over_draft_>=200', 'over_draft_no checking',
       'credit_history_all paid',
       'credit_history_critical/other existing credit',
       'credit_history_delayed previously', 'credit_history_existing paid',
       'credit_history_no credits/all paid', 'purpose_business',
       'purpose_domestic appliance', 'purpose_education',
       'purpose_furniture/equipment', 'purpose_new car', 'purpose_other',
       'purpose_radio/tv', 'purpose_repairs', 'purpose_retraining',
       'purpose_used car', 'Average_Credit_Balance_100<=X<500',
       'Average_Credit_Balance_500<=X<1000', 'Average_Credit_Balance_<100',
       'Average_Credit_Balance_>=1000',
       'Average_Credit_Balance_no known savings', 'employment_1<=X<4',
       'employment_4<=X<7', 'employment_<1', 'employment_>=7',
       'employment_unemployed', 'p

# Building model

In [10]:
# Spliting the data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(encoded_X, y,test_size=0.20,random_state=100)

In [11]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier().fit(X_train, y_train)

In [12]:
# Train predictions
train_pred = model.predict(X_train)

In [13]:
# Test predictions
test_pred = model.predict(X_test)

In [14]:
# Train accuracy 
model.score(X_train, y_train)

1.0

In [15]:
# Test accuracy 
model.score(X_test, y_test)

0.675

# Visualize the decision tree

In [18]:
from sklearn.tree import export_graphviz

In [17]:
model.classes_

array(['bad', 'good'], dtype=object)

In [20]:
dot_data = export_graphviz(model, out_file=None,
                          feature_names=encoded_X.columns,  
                          class_names=model.classes_,  
                          ) 

In [22]:
print(dot_data)

digraph Tree {
node [shape=box] ;
0 [label="over_draft_no checking <= 0.5\ngini = 0.424\nsamples = 800\nvalue = [244, 556]\nclass = good"] ;
1 [label="credit_usage <= 22.5\ngini = 0.489\nsamples = 482\nvalue = [206, 276]\nclass = good"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="credit_history_all paid <= 0.5\ngini = 0.449\nsamples = 279\nvalue = [95, 184]\nclass = good"] ;
1 -> 2 ;
3 [label="credit_history_no credits/all paid <= 0.5\ngini = 0.428\nsamples = 261\nvalue = [81, 180]\nclass = good"] ;
2 -> 3 ;
4 [label="current_balance <= 7442.5\ngini = 0.413\nsamples = 250\nvalue = [73, 177]\nclass = good"] ;
3 -> 4 ;
5 [label="credit_usage <= 8.5\ngini = 0.404\nsamples = 246\nvalue = [69, 177]\nclass = good"] ;
4 -> 5 ;
6 [label="job_high qualif/self emp/mgmt <= 0.5\ngini = 0.149\nsamples = 37\nvalue = [3, 34]\nclass = good"] ;
5 -> 6 ;
7 [label="purpose_education <= 0.5\ngini = 0.059\nsamples = 33\nvalue = [1, 32]\nclass = good"] ;
6 -> 7 ;
8 [label="gini

In [23]:
import graphiz

ModuleNotFoundError: No module named 'graphiz'

In [24]:
print(graphiz.Source(dot_data))

NameError: name 'graphiz' is not defined

In [25]:
# Since the train accuracy is 1 that means the model is overfit to the data
# To remove this we tune the hyperparameters for the model

# min_samples_split: The minimum number of instances that should be at a node 
# before the algorithm does a further split on it.

# min_impurity_decrease: The minimum percentage reduction in the impurity of 
# a set (as measured by gini index) required to make a split.

model1 = DecisionTreeClassifier(min_samples_split=10,min_impurity_decrease=0.005).fit(X_train, y_train)

In [26]:
# Train accuracy
model1.score(X_train, y_train)

0.7675

In [27]:
# Test accuracy 
model1.score(X_test, y_test)

0.73