In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Credit Card Default II (balance).csv")

In [3]:
print (df)

            income        age          loan  default
0     66155.925100  59.017015   8106.532131        0
1     34415.153970  48.117153   6564.745018        0
2     57317.170060  63.108049   8020.953296        0
3     42709.534200  45.751972   6103.642260        0
4     66952.688850  18.584336   8770.099235        1
...            ...        ...           ...      ...
3423  60903.191726  21.933924  10367.081892        1
3424  62235.644695  25.033016   8083.900063        1
3425  25597.850590  26.350344   3810.110335        1
3426  48164.194974  23.141976   6529.652892        1
3427  33055.808635  26.289612   5351.851138        1

[3428 rows x 4 columns]


## Data Wrangling

In [4]:
# removing na
df = df.dropna()
print(df)

            income        age          loan  default
0     66155.925100  59.017015   8106.532131        0
1     34415.153970  48.117153   6564.745018        0
2     57317.170060  63.108049   8020.953296        0
3     42709.534200  45.751972   6103.642260        0
4     66952.688850  18.584336   8770.099235        1
...            ...        ...           ...      ...
3423  60903.191726  21.933924  10367.081892        1
3424  62235.644695  25.033016   8083.900063        1
3425  25597.850590  26.350344   3810.110335        1
3426  48164.194974  23.141976   6529.652892        1
3427  33055.808635  26.289612   5351.851138        1

[3428 rows x 4 columns]


In [5]:
#removing non numbers
for i in df.columns:
    df1 = pd.to_numeric(df[i], errors='coerce')
    f = df1.notnull()
    df=df[f]
print(df)

            income        age          loan  default
0     66155.925100  59.017015   8106.532131        0
1     34415.153970  48.117153   6564.745018        0
2     57317.170060  63.108049   8020.953296        0
3     42709.534200  45.751972   6103.642260        0
4     66952.688850  18.584336   8770.099235        1
...            ...        ...           ...      ...
3423  60903.191726  21.933924  10367.081892        1
3424  62235.644695  25.033016   8083.900063        1
3425  25597.850590  26.350344   3810.110335        1
3426  48164.194974  23.141976   6529.652892        1
3427  33055.808635  26.289612   5351.851138        1

[3428 rows x 4 columns]


In [6]:
# removing outliers
import numpy as np
from scipy import stats

z = stats.zscore(df.astype(float))
z = np.abs(z)
#assumming values with standard deviation of more than absolute 2.5 as outliers
f = (z < 2.5).all(axis=1)
df = df[f]
print(df)

            income        age          loan  default
0     66155.925100  59.017015   8106.532131        0
1     34415.153970  48.117153   6564.745018        0
2     57317.170060  63.108049   8020.953296        0
3     42709.534200  45.751972   6103.642260        0
4     66952.688850  18.584336   8770.099235        1
...            ...        ...           ...      ...
3423  60903.191726  21.933924  10367.081892        1
3424  62235.644695  25.033016   8083.900063        1
3425  25597.850590  26.350344   3810.110335        1
3426  48164.194974  23.141976   6529.652892        1
3427  33055.808635  26.289612   5351.851138        1

[3424 rows x 4 columns]


In [7]:
pd.unique(df[["default"]].values.ravel())
# shows that default is binary, no additional cleaning required

array([0, 1])

In [8]:
# removing columns with negative values
df = df[(df['income']>0) & (df['age']>0) & (df['loan']>0)]
print (df)

            income        age          loan  default
0     66155.925100  59.017015   8106.532131        0
1     34415.153970  48.117153   6564.745018        0
2     57317.170060  63.108049   8020.953296        0
3     42709.534200  45.751972   6103.642260        0
4     66952.688850  18.584336   8770.099235        1
...            ...        ...           ...      ...
3423  60903.191726  21.933924  10367.081892        1
3424  62235.644695  25.033016   8083.900063        1
3425  25597.850590  26.350344   3810.110335        1
3426  48164.194974  23.141976   6529.652892        1
3427  33055.808635  26.289612   5351.851138        1

[3424 rows x 4 columns]


In [9]:
X = df.loc[:,["income", "age", "loan"]]
Y = df.loc[:, "default"]

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, random_state = 1234)

## Logistic Regression

In [11]:
from sklearn import linear_model
reg_model = linear_model.LogisticRegression()
reg_model.fit(X_train,Y_train)

LogisticRegression()

In [12]:
# train set accuracy
from sklearn.metrics import confusion_matrix
pred = reg_model.predict(X_train)
cm = confusion_matrix(Y_train, pred)
print(cm)
accuracy = (cm[0,0]+cm[1,1])/sum(sum(cm))
print(accuracy)

[[1139  147]
 [ 232 1050]]
0.8524143302180686


In [13]:
# test set accuracy
pred = reg_model.predict(X_test)
cm = confusion_matrix(Y_test, pred)
print(cm)
accuracy = (cm[0,0]+cm[1,1])/sum(sum(cm))
print(accuracy)

[[371  53]
 [ 62 370]]
0.8656542056074766


## Decision Tree

In [14]:
from sklearn import tree
from sklearn.model_selection import GridSearchCV
tree_model = tree.DecisionTreeClassifier(random_state = 0)
g = GridSearchCV(estimator = tree_model, param_grid=dict(max_depth = [i for i in range (1,30)]))
g = g.fit(X_train, Y_train)
g.best_params_

{'max_depth': 10}

In [15]:
tree_model = tree.DecisionTreeClassifier(max_depth = 10, random_state = 0)
tree_model.fit(X_train, Y_train)

DecisionTreeClassifier(max_depth=10, random_state=0)

In [16]:
# Train set accuracy
pred = tree_model.predict(X_train)
cm = confusion_matrix(Y_train, pred)
print((cm[0,0]+cm[1,1])/(sum(sum(cm))))

1.0


In [17]:
# Test set accuracy
pred = tree_model.predict(X_test)
cm = confusion_matrix(Y_test, pred)
print((cm[0,0]+cm[1,1])/(sum(sum(cm))))

0.991822429906542


## Random Forest

In [18]:
from sklearn import ensemble
random_model = ensemble.RandomForestClassifier(random_state = 0)
random_model.fit(X_train, Y_train)

RandomForestClassifier(random_state=0)

In [19]:
g = GridSearchCV(estimator = random_model, param_grid=dict(max_depth = [i for i in range (1,30)]))
g = g.fit(X_train, Y_train)
g.best_params_

{'max_depth': 15}

In [20]:
random_model = ensemble.RandomForestClassifier(random_state = 0, max_depth = 15)
random_model.fit(X_train, Y_train)

RandomForestClassifier(max_depth=15, random_state=0)

In [21]:
# Train set accuracy
pred = random_model.predict(X_train)
cm = confusion_matrix(Y_train, pred)
print((cm[0,0]+cm[1,1])/(sum(sum(cm))))

1.0


In [22]:
# Test set accuracy
pred = random_model.predict(X_test)
cm = confusion_matrix(Y_test, pred)
print((cm[0,0]+cm[1,1])/(sum(sum(cm))))

0.9906542056074766


## XGBoost

In [23]:
from sklearn.ensemble import GradientBoostingClassifier
XG_model = GradientBoostingClassifier(random_state = 0)
g = GridSearchCV(estimator = random_model, param_grid=dict(max_depth = [i for i in range (1,30)]))
g = g.fit(X_train, Y_train)
g.best_params_

{'max_depth': 15}

In [24]:
XG_model = GradientBoostingClassifier(random_state = 0, max_depth = 15)
XG_model.fit(X_train, Y_train)

GradientBoostingClassifier(max_depth=15, random_state=0)

In [25]:
# Train set accuracy
pred = XG_model.predict(X_train)
cm = confusion_matrix(Y_train, pred)
print((cm[0,0]+cm[1,1])/(sum(sum(cm))))

1.0


In [26]:
# Test set accuracy
pred = random_model.predict(X_test)
cm = confusion_matrix(Y_test, pred)
print((cm[0,0]+cm[1,1])/(sum(sum(cm))))

0.9906542056074766


## Neural Network

In [28]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
names = df.columns
x_scaled = scaler.fit_transform(df)
df1 = pd.DataFrame(x_scaled, columns = names)
print (df1)

        income       age      loan  default
0     0.923176  0.892092  0.602968      0.0
1     0.288122  0.654708  0.488270      0.0
2     0.746334  0.981189  0.596602      0.0
3     0.454072  0.603198  0.453967      0.0
4     0.939117  0.011524  0.652333      1.0
...        ...       ...       ...      ...
3419  0.818082  0.084473  0.771138      1.0
3420  0.844741  0.151967  0.601284      1.0
3421  0.111709  0.180657  0.283344      1.0
3422  0.563206  0.110783  0.485659      1.0
3423  0.260925  0.179334  0.398039      1.0

[3424 rows x 4 columns]


In [29]:
X1 = df1.loc[:,["income", "age", "loan"]]
Y1 = df1.loc[:, "default"]
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X1,Y1, random_state = 0)

In [30]:
from sklearn.neural_network import MLPClassifier
MLP_model = MLPClassifier(solver="lbfgs", hidden_layer_sizes = (50,2), random_state = 0 ).fit(X_train1,Y_train1)

In [31]:
# Train set accuracy
pred = MLP_model.predict(X_train1)
cm = confusion_matrix(Y_train1, pred)
print (cm)
print((cm[0,0]+cm[1,1])/(sum(sum(cm))))

[[1294    0]
 [   0 1274]]
1.0


In [32]:
# Test set accuracy
pred = MLP_model.predict(X_test1)
cm = confusion_matrix(Y_test1, pred)
print (cm)
print((cm[0,0]+cm[1,1])/(sum(sum(cm))))

[[415   1]
 [  0 440]]
0.9988317757009346


## Upload to heroku

In [33]:
import joblib

In [34]:
joblib.dump(reg_model, "Regression")

['Regression']

In [35]:
joblib.dump(tree_model, "Tree")

['Tree']

In [36]:
joblib.dump(random_model, "Random")

['Random']

In [37]:
joblib.dump(XG_model, "XG")

['XG']

In [38]:
joblib.dump(MLP_model, "MLP")

['MLP']