In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [4]:
data = pd.read_csv('data/loan_default.csv')

In [5]:
data.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [6]:
data.columns

Index(['LoanID', 'Age', 'Income', 'LoanAmount', 'CreditScore',
       'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm',
       'DTIRatio', 'Education', 'EmploymentType', 'MaritalStatus',
       'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner',
       'Default'],
      dtype='object')

# Data Preprocessing

In [7]:
data.isnull().sum()

LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64

No null values, all good

In [8]:
data = data.drop("LoanID", axis=1)

In [9]:
data.head()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [10]:
x = data.copy().drop('Default', axis=1)
y = data['Default']

In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [12]:
categorical_variables_names = (x_train.dtypes == 'object') # selects the other data type
categorical_variables_names = list(categorical_variables_names[categorical_variables_names].index)

In [13]:
# Convert the categorical variables to a usable form:

from sklearn.preprocessing import OrdinalEncoder

OE_x_train = x_train.copy()
OE_x_test = x_test.copy()
OE = OrdinalEncoder()

OE_x_train[categorical_variables_names] = OE.fit_transform(OE_x_train[categorical_variables_names])
OE_x_test[categorical_variables_names] = OE.transform(OE_x_test[categorical_variables_names])


In [32]:
from sklearn.tree import DecisionTreeClassifier

data_model = DecisionTreeClassifier(max_leaf_nodes=25)
data_model.fit(OE_x_train, y_train)

In [37]:
import pydotplus  # pip install pydotplus
from sklearn.tree import export_graphviz


def tree_graph_to_png(tree, feature_names, png_file_to_save):
    # needs graphviz to be installed
    tree_str = export_graphviz(
        tree, feature_names=feature_names, filled=True, out_file=None
    )
    graph = pydotplus.graph_from_dot_data(tree_str)
    graph.write_png(png_file_to_save)

In [42]:
tree_graph_to_png(
    tree=data_model,
    feature_names= ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
       'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'Education',
       'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents',
       'LoanPurpose', 'HasCoSigner'],
    png_file_to_save="dtree.png",
)

InvocationException: GraphViz's executables not found

In [40]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

predictions = data_model.predict(OE_x_test)
root_mean_error = np.sqrt(mean_squared_error(y_test, predictions))
r2_score = r2_score(y_test, predictions)

print(f"The root mean squared error is {root_mean_error}")
print(f"The R2 score is {r2_score}")

The root mean squared error is 0.34193267721441
The R2 score is -0.12634786922660357


In [20]:
for x,y in zip(predictions, y_test):
    print(f"Predicted {x} when actual is {y}")

Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 1
Predicted 0 when actual is 1
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 1 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 1 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 0
Predicted 0 when actual is 1
Predicted 1 when actual is 0
Predicted 0 when actual is 0
Predicted 0 wh