# Loan 데이터 분류나무 예측성능 비교

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the loan data set from loan.csv (path is relative to the working
# directory). Later cells use its 'job' and 'y' columns.
df = pd.read_csv(filepath_or_buffer='loan.csv')

In [None]:
# Swap each job-category label for an integer code (its position in the tuple
# below). Values that are not listed are left untouched by `replace`.
df['job'] = df['job'].replace(
    {job: code
     for code, job in enumerate(('Office', 'ProfExe', 'Other', 'Mgr', 'Self', 'Sales'))}
)

In [None]:
# The 'y' column is the target; every remaining column is used as a feature.
y = df['y']
X = df.drop(columns=['y'])
xname = X.columns
# Display labels for the two classes -- presumably 0 = Normal, 1 = Bad; confirm
# against the data dictionary.
yname = ['Normal', 'Bad']

## Train/Test 데이터 분할

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows as the test set. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
    test_size=0.4,
)

## 나무 모형

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

### Pruning

In [None]:
from sklearn.model_selection import GridSearchCV

# Seed NumPy's global RNG; the tree itself is already seeded via random_state=0.
np.random.seed(0)

# Cost-complexity pruning: 10-fold cross-validated search over ccp_alpha values
# starting at 0 in steps of 0.0001, with the upper bound 0.005 excluded.
g_cv = GridSearchCV(
    DecisionTreeClassifier(random_state=0),
    param_grid={'ccp_alpha': np.arange(0.000, 0.005, 0.0001)},
    cv=10,
    # Fit candidates in parallel on all cores, as the other grid searches in
    # this notebook do; this does not change the selected parameters.
    n_jobs=-1,
)
g_cv.fit(X_train, y_train)

In [None]:
# Report the best ccp_alpha found by the search and its mean cross-validated
# score.
for caption, result in (('selected:', g_cv.best_params_),
                        ('score   :', g_cv.best_score_)):
    print(caption, result)

### Alpha=0.0008 트리

In [None]:
# Pruned tree with a hard-coded ccp_alpha=0.0008 (presumably the value picked
# by the pruning grid search -- confirm if the data changes).
c1_tree = DecisionTreeClassifier(
    random_state=0,
    ccp_alpha=0.0008,
)
c1_tree.fit(X_train, y_train)

### 하이퍼파라미터 튜닝

In [None]:
np.random.seed(0)

# Joint 10-fold cross-validated search over three tree-complexity controls,
# run on all CPU cores. Rebinds g_cv, replacing any earlier search result.
g_cv = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid={
        'ccp_alpha': np.arange(0.000, 0.002, 0.0001),
        'min_impurity_decrease': np.arange(0, 0.003, 0.0005),
        'min_samples_split': np.arange(2, 402, 100),
    },
    n_jobs=-1,
    cv=10,
)
g_cv.fit(X_train, y_train)

In [None]:
# Report the best parameter combination from the joint search and its mean
# cross-validated score.
for caption, result in (('selected:', g_cv.best_params_),
                        ('score   :', g_cv.best_score_)):
    print(caption, result)

### Alpha=0.0, min_impurity_decrease=0.0005, min_samples_split=2 트리

In [None]:
# Tree built with hard-coded values for the three jointly tuned parameters
# (presumably copied from the joint grid search output -- confirm if the data
# changes).
c2_tree = DecisionTreeClassifier(
    random_state=0,
    min_samples_split=2,
    min_impurity_decrease=0.0005,
    ccp_alpha=0.0,
)
c2_tree.fit(X_train, y_train)

### min_impurity_decrease 튜닝

In [None]:
np.random.seed(0)
g_cv = GridSearchCV(DecisionTreeClassifier(random_state=0),
              param_grid={'min_impurity_decrease': np.arange(0,0.002,0.0001)},
                    cv=10, n_jobs=-1)
g_cv.fit(X_train, y_train)

In [None]:
# Report the best min_impurity_decrease found and its mean cross-validated
# score.
for caption, result in (('selected:', g_cv.best_params_),
                        ('score   :', g_cv.best_score_)):
    print(caption, result)

In [None]:
# Tree limited only by a hard-coded min_impurity_decrease=0.0003 (presumably
# the value picked by the single-parameter search -- confirm if the data
# changes).
c3_tree = DecisionTreeClassifier(
    random_state=0,
    min_impurity_decrease=0.0003,
)
c3_tree.fit(X_train, y_train)

### Default 트리

In [None]:
# Baseline tree: every hyper-parameter left at its scikit-learn default, with
# only the seed fixed.
c4_tree = DecisionTreeClassifier(
    random_state=0,
)
c4_tree.fit(X_train, y_train)

### ROC 곡선 및 AUC

In [None]:
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2. Prefer
# its replacement, RocCurveDisplay.from_estimator, and fall back to the legacy
# function on older releases. The name plot_roc_curve stays bound because
# other cells in this notebook call it.
try:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_roc_curve

# All four models are DecisionTreeClassifier instances, so the default legend
# label (the class name) would be identical for every curve; name them
# explicitly. The first call creates the axes, the others draw onto them.
roc_tree = plot_roc_curve(c1_tree, X_test, y_test,
                          name='c1: ccp_alpha=0.0008')
plot_roc_curve(c2_tree, X_test, y_test, ax=roc_tree.ax_,
               name='c2: min_impurity_decrease=0.0005')
plot_roc_curve(c3_tree, X_test, y_test, ax=roc_tree.ax_,
               name='c3: min_impurity_decrease=0.0003')
plot_roc_curve(c4_tree, X_test, y_test, ax=roc_tree.ax_,
               name='c4: default')
plt.title("ROC curve comparison")
plt.show()

## 다른 분류방법과의 비교

### 로지스틱 회귀분석

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted on the unscaled training features, with the
# solver's iteration cap set to 1000.
c_logit = LogisticRegression(
    max_iter=1000,
    random_state=0,
)
c_logit.fit(X_train, y_train)

### 신경망분석

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Standardise the features. The scaler is fitted on the training split only and
# then applied to the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One hidden layer with 5 units. `(5)` is just the int 5; the one-element tuple
# `(5,)` is the documented form for hidden_layer_sizes.
c_nn = MLPClassifier(hidden_layer_sizes=(5,), random_state=0, max_iter=1000)
c_nn.fit(X_train_scaled, y_train)

### SVM

In [None]:
from sklearn import svm

# RBF-kernel support vector classifier, trained on the standardised features.
c_svm = svm.SVC(
    random_state=0,
    kernel='rbf',
)
c_svm.fit(X_train_scaled, y_train)

### 랜덤포레스트

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with scikit-learn's default settings and a fixed seed, trained
# on the unscaled features.
c_rf = RandomForestClassifier(
    random_state=0,
)
c_rf.fit(X_train, y_train)

### ROC 곡선 및 AUC

In [None]:
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2. Resolve
# the plotting helper here so this cell works on releases with or without the
# legacy function: prefer RocCurveDisplay.from_estimator, fall back otherwise.
try:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_roc_curve

# The first call creates the axes; the remaining curves are drawn onto them.
# The network and the SVM were trained on standardised features, so they are
# scored on the standardised test set; the other models use the raw test set.
roc_tree = plot_roc_curve(c2_tree, X_test, y_test)
plot_roc_curve(c_logit, X_test, y_test, ax=roc_tree.ax_)
plot_roc_curve(c_nn, X_test_scaled, y_test, ax=roc_tree.ax_)
plot_roc_curve(c_svm, X_test_scaled, y_test, ax=roc_tree.ax_)
plot_roc_curve(c_rf, X_test, y_test, ax=roc_tree.ax_)
plt.title("ROC curve comparison")
plt.show()