In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

# adjust 한글 font
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname='c:/Windows/Fonts/malgun.ttf').get_name()
rc('font', family=font_name)

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_excel('default_of_credit_card_clients.xlsx')

In [4]:
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [14]:
21000 ** .5

144.9137674618944

In [15]:
# tensor graph 초기화
import tensorflow as tf
tf.reset_default_graph()

# import models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from keras.models import Sequential
from keras.layers import Dense

def DNNClassifier():
    model = Sequential()
    model.add(Dense(32, input_shape=(23,), activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

log_clf = LogisticRegression()
gnb_clf = GaussianNB()
knn_clf = KNeighborsClassifier(n_neighbors=144)
rf_clf = RandomForestClassifier(n_estimators=100)
xgb_clf = XGBClassifier(n_estimators=100)
lgb_clf = LGBMClassifier(n_estimators=100)
dnn_clf = DNNClassifier()

In [16]:
from sklearn.model_selection import train_test_split

X = df[df.columns[1:-1]]
y = df[df.columns[-1]]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20190730, stratify=y)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((22500, 23), (7500, 23), (22500,), (7500,))

In [17]:
y_train.value_counts() / len(y_train)

0    0.7788
1    0.2212
Name: default payment next month, dtype: float64

In [18]:
y_test.value_counts() / len(y_test)

0    0.7788
1    0.2212
Name: default payment next month, dtype: float64

In [None]:
np.random.seed(20190730)

from sklearn.metrics import accuracy_score
model_clf = [log_clf, gnb_clf, knn_clf, svm_clf, rf_clf, xgb_clf, lgb_clf, dnn_clf]
model_name = ['Logistic', 'gaussianNB', 'KNN', 'SVC', 'RF', 'XGB', 'LightGBM', 'DNN']

for clf, name in zip(model_clf, model_name):
    if name == 'DNN':
        clf.fit(X_train, y_train, epochs=15, batch_size=32, verbose=False) # 시간상 epoch 50만
        y_pred = np.where(
            clf.predict(X_test) > 0.5, 
            1, 0)
    else:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
    y_true = y_test.copy()
    print('{} : {:.4f}'.format(name, accuracy_score(y_true, y_pred)).rjust(20))

   Logistic : 0.7788
 gaussianNB : 0.3955
        KNN : 0.7829


# Data Scaling

In [11]:
train_mean = X_train.mean(axis=0)
train_std = X_train.std(axis=0)

X_train = (X_train - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

In [12]:
np.random.seed(20190730)

from sklearn.metrics import accuracy_score
model_clf = [log_clf, gnb_clf, knn_clf, svm_clf, rf_clf, xgb_clf, lgb_clf, dnn_clf]
model_name = ['Logistic', 'gaussianNB', 'KNN', 'SVC', 'RF', 'XGB', 'LightGBM', 'DNN']

for clf, name in zip(model_clf, model_name):
    if name == 'DNN':
        clf.fit(X_train, y_train, epochs=15, batch_size=32, verbose=False) # 시간상 epoch 50만
        y_pred = np.where(
            clf.predict(X_test) > 0.5, 
            1, 0)
    else:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
    y_true = y_test.copy()
    print('{} : {:.4f}'.format(name, accuracy_score(y_true, y_pred)).rjust(20))

   Logistic : 0.8100
 gaussianNB : 0.7317
         RF : 0.8148
        XGB : 0.8212
   LightGBM : 0.8165
        DNN : 0.8173
