In [1]:
# Initial imports
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image


## Loading and Preprocessing Crowdfunding Data

In [2]:
# Load the data
file_path = "Data/cleaned_cc_data.csv"
cc_data = pd.read_csv(file_path)
cc_data.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,20000.0,2,2,1,24,2,2,-1,-1,-2,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,2,2,2,26,-1,2,0,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,90000.0,2,2,2,34,0,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,2,2,1,37,0,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,50000.0,1,2,1,57,-1,0,-1,0,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [3]:
# Define the features set.
X = cc_data.copy()
X = X.drop("default.payment.next.month", axis=1,)
X.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,20000.0,2,2,1,24,2,2,-1,-1,-2,...,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0
1,120000.0,2,2,2,26,-1,2,0,0,0,...,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0
2,90000.0,2,2,2,34,0,0,0,0,0,...,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0
3,50000.0,2,2,1,37,0,0,0,0,0,...,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0
4,50000.0,1,2,1,57,-1,0,-1,0,0,...,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0


In [4]:
# Define the target set.
y = cc_data["default.payment.next.month"].values.reshape(-1, 1)
y[:5]

array([[1],
       [1],
       [0],
       [0],
       [0]], dtype=int64)

In [5]:
# Split into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [6]:
# Create a StandardScaler instance

scaler = StandardScaler()

In [7]:
# Fit the Standard Scaler with the training data

X_scaler = scaler.fit(X_train)

In [8]:
# Scale the training data

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create and Fit the Decision Tree Model

In [9]:
# Create the decision tree classifier instance

dt_model = tree.DecisionTreeClassifier()

In [10]:
# Fit the model

dt_model = dt_model.fit(X_train_scaled, y_train)

## Make Prediction and Test the Accuracy of Model

In [11]:
# Make predictions using the testing data

dt_predictions = dt_model.predict(X_test_scaled)

In [12]:
# Calculate the Accuracy Score

acc_score = accuracy_score(y_test, dt_predictions)

# Print the accuracy score
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.7240913390082421


---

## Create new model based off of credt_data_no_payment_hist

In [14]:
# Load the data
file_path_2 = "Data/credt_data_no_payment_hist.csv"
cc_data_2 = pd.read_csv(file_path_2)
cc_data_2.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,default.payment.next.month
0,20000.0,2,2,1,24,1
1,120000.0,2,2,2,26,1
2,90000.0,2,2,2,34,0
3,50000.0,2,2,1,37,0
4,50000.0,1,2,1,57,0


In [15]:
# Define the features set.
X2 = cc_data_2.copy()
X2 = X2.drop("default.payment.next.month", axis=1,)
X2.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE
0,20000.0,2,2,1,24
1,120000.0,2,2,2,26
2,90000.0,2,2,2,34
3,50000.0,2,2,1,37
4,50000.0,1,2,1,57


In [16]:
# Define the target set.
y2 = cc_data_2["default.payment.next.month"].values.reshape(-1, 1)
y2[:5]

array([[1],
       [1],
       [0],
       [0],
       [0]], dtype=int64)

In [17]:
# Split into Train and Test sets
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=78)

In [18]:
# Create a StandardScaler instance

scaler2 = StandardScaler()

In [19]:
# Fit the Standard Scaler with the training data

X2_scaler = scaler2.fit(X_train2)

In [20]:
# Scale the training data

X_train_scaled2 = X2_scaler.transform(X_train2)
X_test_scaled2 = X2_scaler.transform(X_test2)

In [21]:
# Create the decision tree classifier instance

dt_model2 = tree.DecisionTreeClassifier()

In [22]:
# Fit the model

dt_model2 = dt_model2.fit(X_train_scaled2, y_train2)

In [23]:
# Make predictions using the testing data

dt_predictions2 = dt_model2.predict(X_test_scaled2)

In [24]:
# Calculate the Accuracy Score

acc_score2 = accuracy_score(y_test2, dt_predictions2)

# Print the accuracy score
print(f"Accuracy Score 2 : {acc_score2}")

Accuracy Score 2 : 0.7245333333333334


In [25]:
# Create DOT data
#dot_data2 = tree.export_graphviz(
#    dt_model2, out_file=None, feature_names=X.columns, class_names=["0", "1"], filled=True
#)

# Draw graph
#graph2 = pydotplus.graph_from_dot_data(dot_data2)

# Show graph
#Image(graph2.create_png())

## Compare Accuracy Score of Both Models

In [26]:
# Print the accuracy score of both models
print(f"Accuracy Score 1 : {acc_score}")
print(f"Accuracy Score 2 : {acc_score2}")

Accuracy Score 1 : 0.7240913390082421
Accuracy Score 2 : 0.7245333333333334
