## 평가함수 정의

In [1]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.metrics import geometric_mean_score

def get_clf_eval(y_test, y_pred, y_score=None):
    """Print a confusion matrix and summary metrics for a binary classifier.

    Args:
        y_test: True class labels (0 = negative, 1 = positive).
        y_pred: Hard class predictions, same length as ``y_test``.
        y_score: Optional continuous scores for the positive class, e.g.
            ``model.predict_proba(X)[:, 1]`` or ``model.decision_function(X)``.
            When given, AUC is computed from these scores. When omitted, AUC
            is computed from ``y_pred`` exactly as before.

    Returns:
        None. The confusion matrix and the metrics are printed.
    """
    # Pin the label order so the matrix always lines up with the hard-coded
    # 'True[0]/True[1]' and 'Predict[0]/Predict[1]' captions.
    confmat = pd.DataFrame(
        confusion_matrix(y_test, y_pred, labels=[0, 1]),
        index=['True[0]', 'True[1]'],
        columns=['Predict[0]', 'Predict[1]'],
    )
    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 still reports 0.0 when class 1 is never predicted, but
    # without the undefined-metric warning cluttering the output.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    # With hard labels the ROC curve has a single operating point, so the
    # value equals (TPR + TNR) / 2. Pass y_score for a threshold-free AUC.
    AUC = roc_auc_score(y_test, y_pred if y_score is None else y_score)
    g_means = geometric_mean_score(y_test, y_pred)
    print(confmat)
    print("\n정확도 : {:.3f} \n정밀도 : {:.3f} \n재현율 : {:.3f} \nf1-score : {:.3f} \nAUC : {:.3f} \n기하평균 : {:.3f} \n".format(
        accuracy, precision, recall, f1, AUC, g_means))

# Modeling

## 1. 불균형 데이터 처리 하지 않음

### 1-1. 데이터 가공

In [2]:
import pandas as pd
# Load the loan training data from CSV into a DataFrame.
df = pd.read_csv("Loan_Train.csv")

In [3]:
df

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
251995,251996,8154883,43,13,single,rented,no,Surgeon,Kolkata,West_Bengal,6,11,0
251996,251997,2843572,26,10,single,rented,no,Army_officer,Rewa,Madhya_Pradesh,6,11,0
251997,251998,4522448,46,7,single,rented,no,Design_Engineer,Kalyan-Dombivli,Maharashtra,7,12,0
251998,251999,6507128,45,0,single,rented,no,Graphic_Designer,Pondicherry,Puducherry,0,10,0


In [4]:
# Separate features and target: the feature frame drops the 'Id' column and
# the 'Risk_Flag' label; the label column alone becomes the target.
X = df.drop(columns=['Id', 'Risk_Flag'])
y = df['Risk_Flag']

In [5]:
# 범주형 변수 Labeling하기

from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column in place.
# A separate encoder is fitted per column and kept in `encoders`, so the
# string <-> code mapping of each column stays available (for
# inverse_transform, or to encode other data with the same codes) instead of
# being overwritten by the next column's fit.
category_cols = ['Married/Single','House_Ownership','Car_Ownership', 'Profession', 'CITY', 'STATE']
encoders = {}
for cols in category_cols:
    en = LabelEncoder()
    X[cols] = en.fit_transform(X[cols])
    encoders[cols] = en

In [6]:
# Split into train and test sets: 30% held out, fixed seed.
# NOTE(review): no `stratify=y` is passed, so the class ratio of each part is
# left to the random draw - confirm this is intended for imbalanced labels.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

### 1-2. 모델 적합

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
import time

# Candidate classifiers as (label, estimator) pairs.
# NOTE(review): features go in unscaled (Income is in the millions) and the
# recorded output shows LR/LDA/QDA/NN predicting a single class - consider
# standardizing the inputs; confirm before changing the experiment.
# NOTE(review): no random_state is passed to the estimators, so reruns of the
# stochastic models may not reproduce the recorded scores - confirm.
models_X = [
    ('LR', LogisticRegression(max_iter=5000)),   # logistic regression
    ('LDA', LinearDiscriminantAnalysis()),       # linear discriminant analysis
    ('QDA', QuadraticDiscriminantAnalysis()),    # quadratic discriminant analysis
    ('KNN', KNeighborsClassifier()),             # k-nearest neighbours
    ('SVM', SVC(gamma='auto')),                  # support vector machine
    ('NN', MLPClassifier()),                     # neural network
    ('RF', RandomForestClassifier()),            # random forest
    ('XGB', XGBClassifier()),                    # XGBoost
    ('Light_GBM', LGBMClassifier()),             # LightGBM
]

# Fit each model, timing only fit(), then report train/test score.
for name, model in models_X:
    start = time.time()
    model.fit(X_train, y_train)
    end = time.time() - start  # elapsed seconds, despite the name
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    msg = "%s - train_score : %.3f, test score : %.3f, time : %.5f 초" % (name, train_acc, test_acc, end)
    print(msg)

LR - train_score : 0.877, test score : 0.877, time : 0.73503 초
LDA - train_score : 0.877, test score : 0.877, time : 0.52559 초
QDA - train_score : 0.877, test score : 0.877, time : 0.22240 초
KNN - train_score : 0.901, test score : 0.889, time : 1.50102 초
SVM - train_score : 0.937, test score : 0.899, time : 3620.38361 초
NN - train_score : 0.877, test score : 0.877, time : 18.07014 초
RF - train_score : 0.937, test score : 0.898, time : 35.03606 초




XGB - train_score : 0.894, test score : 0.887, time : 10.79753 초
Light_GBM - train_score : 0.881, test score : 0.879, time : 1.24552 초


### 1-3. 성능 평가

In [8]:
# Print the evaluation report of every model on the held-out test split.
# Iterates the (label, estimator) pairs directly instead of indexing through
# an intermediate list of positions.
for name, model in models_X:
    print("----------OverSampling 하지 않음 + %s 모델 적용----------" % name)
    get_clf_eval(y_test, model.predict(X_test))

----------OverSampling 하지 않음 + LR 모델 적용----------


  _warn_prf(average, modifier, msg_start, len(result))


         Predict[0]  Predict[1]
True[0]       66292           0
True[1]        9308           0

정확도 : 0.877 
정밀도 : 0.000 
재현율 : 0.000 
f1-score : 0.000 
AUC : 0.500 
기하평균 : 0.000 

----------OverSampling 하지 않음 + LDA 모델 적용----------
         Predict[0]  Predict[1]
True[0]       66292           0
True[1]        9308           0

정확도 : 0.877 
정밀도 : 0.000 
재현율 : 0.000 
f1-score : 0.000 
AUC : 0.500 
기하평균 : 0.000 

----------OverSampling 하지 않음 + QDA 모델 적용----------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


         Predict[0]  Predict[1]
True[0]       66292           0
True[1]        9308           0

정확도 : 0.877 
정밀도 : 0.000 
재현율 : 0.000 
f1-score : 0.000 
AUC : 0.500 
기하평균 : 0.000 

----------OverSampling 하지 않음 + KNN 모델 적용----------
         Predict[0]  Predict[1]
True[0]       62474        3818
True[1]        4573        4735

정확도 : 0.889 
정밀도 : 0.554 
재현율 : 0.509 
f1-score : 0.530 
AUC : 0.726 
기하평균 : 0.692 

----------OverSampling 하지 않음 + SVM 모델 적용----------
         Predict[0]  Predict[1]
True[0]       63168        3124
True[1]        4535        4773

정확도 : 0.899 
정밀도 : 0.604 
재현율 : 0.513 
f1-score : 0.555 
AUC : 0.733 
기하평균 : 0.699 

----------OverSampling 하지 않음 + NN 모델 적용----------


  _warn_prf(average, modifier, msg_start, len(result))


         Predict[0]  Predict[1]
True[0]       66292           0
True[1]        9308           0

정확도 : 0.877 
정밀도 : 0.000 
재현율 : 0.000 
f1-score : 0.000 
AUC : 0.500 
기하평균 : 0.000 

----------OverSampling 하지 않음 + RF 모델 적용----------
         Predict[0]  Predict[1]
True[0]       62991        3301
True[1]        4373        4935

정확도 : 0.898 
정밀도 : 0.599 
재현율 : 0.530 
f1-score : 0.563 
AUC : 0.740 
기하평균 : 0.710 

----------OverSampling 하지 않음 + XGB 모델 적용----------
         Predict[0]  Predict[1]
True[0]       65272        1020
True[1]        7516        1792

정확도 : 0.887 
정밀도 : 0.637 
재현율 : 0.193 
f1-score : 0.296 
AUC : 0.589 
기하평균 : 0.435 

----------OverSampling 하지 않음 + Light_GBM 모델 적용----------
         Predict[0]  Predict[1]
True[0]       66176         116
True[1]        9008         300

정확도 : 0.879 
정밀도 : 0.721 
재현율 : 0.032 
f1-score : 0.062 
AUC : 0.515 
기하평균 : 0.179 



## 2. SMOTE

### 2-1. 데이터 가공

In [9]:
# Load features and labels from the SMOTE CSV pair.
# NOTE(review): presumably written by an earlier resampling step that is not
# part of this notebook - confirm where these files come from.
X_s, y_s = (pd.read_csv(path) for path in ("x_smote.csv", "y_smote.csv"))

In [10]:
# Remove the 'Unnamed: 0' column in place, then display the frame.
# NOTE(review): looks like a row index that was saved into the CSV - confirm.
X_s.drop(columns=['Unnamed: 0'], inplace=True)
X_s

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,8196231,37,14,1,2,1,20,63,1,7,13
1,2869872,56,19,1,2,0,19,86,1,11,12
2,7361429,58,2,1,2,0,50,292,12,2,12
3,5921974,25,3,0,2,0,50,194,6,3,11
4,1583605,45,13,1,2,0,5,249,13,13,14
...,...,...,...,...,...,...,...,...,...,...,...
309419,8928667,38,15,1,2,0,9,256,2,3,10
309420,7761009,38,3,1,2,0,33,164,4,3,12
309421,4002405,68,6,1,2,0,31,65,13,3,13
309422,8517978,58,7,1,2,0,8,160,12,4,10


In [11]:
# Reduce the label frame to its column named '0' (a Series), then display it.
y_s = y_s.loc[:, '0']
y_s

0         0
1         0
2         1
3         0
4         0
         ..
309419    1
309420    1
309421    1
309422    1
309423    1
Name: 0, Length: 309424, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split

# 70/30 split of the loaded frame with a fixed seed.
# NOTE(review): the split is taken from the already-resampled data, so the
# test part comes from the same resampled set (the recorded confusion
# matrices are roughly balanced). If the CSV holds synthetic rows, these test
# scores are not comparable with the untouched test split - confirm intent.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_s,
    y_s,
    test_size=0.3,
    random_state=101,
)

### 2-2. 모델 적합

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier


# Candidate classifiers as (label, estimator) pairs.
# NOTE(review): the displayed features are unscaled and the recorded LR output
# predicts a single class - consider standardizing; confirm.
models_s = [
    ('LR', LogisticRegression(max_iter=5000)),                 # logistic regression
    ('LDA', LinearDiscriminantAnalysis()),                     # linear discriminant analysis
    ('QDA', QuadraticDiscriminantAnalysis()),                  # quadratic discriminant analysis
    ('KNN', KNeighborsClassifier()),                           # k-nearest neighbours
    ('SVM', SVC(gamma='auto')),                                # support vector machine
    ('NN', MLPClassifier()),                                   # neural network
    ('RF', RandomForestClassifier()),                          # random forest
    ('XGB', XGBClassifier()),                                  # XGBoost
    ('Light_GBM', LGBMClassifier(boost_from_average=False)),   # LightGBM
]

# Fit each model, timing only fit(), then report train/test score.
for name, model in models_s:
    start = time.time()
    model.fit(X_train_s, y_train_s)
    end = time.time() - start  # elapsed seconds, despite the name
    train_acc = model.score(X_train_s, y_train_s)
    test_acc = model.score(X_test_s, y_test_s)
    msg = "%s - train_score : %.3f, test score : %.3f, time : %.5f 초" % (name, train_acc, test_acc, end)
    print(msg)

LR - train_score : 0.501, test score : 0.499, time : 0.82167 초
LDA - train_score : 0.550, test score : 0.550, time : 0.73726 초
QDA - train_score : 0.557, test score : 0.557, time : 0.32053 초
KNN - train_score : 0.908, test score : 0.891, time : 2.20133 초
SVM - train_score : 0.957, test score : 0.893, time : 6504.69710 초
NN - train_score : 0.501, test score : 0.499, time : 274.95846 초
RF - train_score : 0.957, test score : 0.923, time : 44.63076 초




XGB - train_score : 0.889, test score : 0.881, time : 15.45404 초
Light_GBM - train_score : 0.803, test score : 0.798, time : 1.91792 초


### 2-3. 성능 평가

In [14]:
# Print the evaluation report of every model on the SMOTE test split.
# Iterates the (label, estimator) pairs directly instead of indexing through
# an intermediate list of positions.
for name, model in models_s:
    print("----------SMOTE + %s 모델 적용----------" % name)
    get_clf_eval(y_test_s, model.predict(X_test_s))

----------SMOTE + LR 모델 적용----------


  _warn_prf(average, modifier, msg_start, len(result))


         Predict[0]  Predict[1]
True[0]       46299           0
True[1]       46529           0

정확도 : 0.499 
정밀도 : 0.000 
재현율 : 0.000 
f1-score : 0.000 
AUC : 0.500 
기하평균 : 0.000 

----------SMOTE + LDA 모델 적용----------
         Predict[0]  Predict[1]
True[0]       24111       22188
True[1]       19567       26962

정확도 : 0.550 
정밀도 : 0.549 
재현율 : 0.579 
f1-score : 0.564 
AUC : 0.550 
기하평균 : 0.549 

----------SMOTE + QDA 모델 적용----------
         Predict[0]  Predict[1]
True[0]       20867       25432
True[1]       15688       30841

정확도 : 0.557 
정밀도 : 0.548 
재현율 : 0.663 
f1-score : 0.600 
AUC : 0.557 
기하평균 : 0.547 

----------SMOTE + KNN 모델 적용----------
         Predict[0]  Predict[1]
True[0]       40056        6243
True[1]        3847       42682

정확도 : 0.891 
정밀도 : 0.872 
재현율 : 0.917 
f1-score : 0.894 
AUC : 0.891 
기하평균 : 0.891 

----------SMOTE + SVM 모델 적용----------
         Predict[0]  Predict[1]
True[0]       36382        9917
True[1]          28       46501

정확도 : 0.893 
정밀도 : 0.82

## 3. ADASYN

### 3-1. 데이터 가공

In [15]:
# Load features and labels from the ADASYN CSV pair.
# NOTE(review): presumably written by an earlier resampling step that is not
# part of this notebook - confirm where these files come from.
X_a, y_a = (pd.read_csv(path) for path in ("x_adasyn.csv", "y_adasyn.csv"))

In [16]:
X_a

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0,8196231,37,14,1,2,1,20,63,1,7,13
1,1,2869872,56,19,1,2,0,19,86,1,11,12
2,2,7361429,58,2,1,2,0,50,292,12,2,12
3,3,5921974,25,3,0,2,0,50,194,6,3,11
4,4,1583605,45,13,1,2,0,5,249,13,13,14
...,...,...,...,...,...,...,...,...,...,...,...,...
307024,307024,9768782,61,10,1,2,1,46,113,2,5,11
307025,307025,9768782,61,10,1,2,1,46,113,2,5,11
307026,307026,9768782,61,10,1,2,1,46,113,2,5,11
307027,307027,9768782,61,10,1,2,1,46,113,2,5,11


In [17]:
# Remove the 'Unnamed: 0' column in place, then display the frame.
# NOTE(review): looks like a row index that was saved into the CSV - confirm.
X_a.drop(columns=['Unnamed: 0'], inplace=True)
X_a

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,8196231,37,14,1,2,1,20,63,1,7,13
1,2869872,56,19,1,2,0,19,86,1,11,12
2,7361429,58,2,1,2,0,50,292,12,2,12
3,5921974,25,3,0,2,0,50,194,6,3,11
4,1583605,45,13,1,2,0,5,249,13,13,14
...,...,...,...,...,...,...,...,...,...,...,...
307024,9768782,61,10,1,2,1,46,113,2,5,11
307025,9768782,61,10,1,2,1,46,113,2,5,11
307026,9768782,61,10,1,2,1,46,113,2,5,11
307027,9768782,61,10,1,2,1,46,113,2,5,11


In [18]:
# Reduce the label frame to its column named '0' (a Series), then display it.
y_a = y_a.loc[:, '0']
y_a

0         0
1         0
2         1
3         0
4         0
         ..
307024    1
307025    1
307026    1
307027    1
307028    1
Name: 0, Length: 307029, dtype: int64

In [19]:
from sklearn.model_selection import train_test_split

# 70/30 split of the loaded frame with a fixed seed.
# NOTE(review): the split is taken from the already-resampled data, so the
# test part comes from the same resampled set (the recorded confusion
# matrices are roughly balanced). If the CSV holds synthetic rows, these test
# scores are not comparable with the untouched test split - confirm intent.
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(
    X_a,
    y_a,
    test_size=0.3,
    random_state=101,
)

### 3-2. 모델 적합

In [20]:
# Candidate classifiers as (label, estimator) pairs.
# NOTE(review): the displayed features are unscaled and the recorded LR output
# predicts a single class - consider standardizing; confirm.
models_a = [
    ('LR', LogisticRegression(max_iter=5000)),   # logistic regression
    ('LDA', LinearDiscriminantAnalysis()),       # linear discriminant analysis
    ('QDA', QuadraticDiscriminantAnalysis()),    # quadratic discriminant analysis
    ('KNN', KNeighborsClassifier()),             # k-nearest neighbours
    ('SVM', SVC(gamma='auto')),                  # support vector machine
    ('NN', MLPClassifier()),                     # neural network
    ('RF', RandomForestClassifier()),            # random forest
    ('XGB', XGBClassifier()),                    # XGBoost
    ('Light_GBM', LGBMClassifier()),             # LightGBM
]

# Fit each model, timing only fit(), then report train/test score.
for name, model in models_a:
    start = time.time()
    model.fit(X_train_a, y_train_a)
    end = time.time() - start  # elapsed seconds, despite the name
    train_acc = model.score(X_train_a, y_train_a)
    test_acc = model.score(X_test_a, y_test_a)
    msg = "%s - train_score : %.3f, test score : %.3f, time : %.5f 초" % (name, train_acc, test_acc, end)
    print(msg)

LR - train_score : 0.504, test score : 0.503, time : 0.36182 초
LDA - train_score : 0.543, test score : 0.544, time : 0.62401 초
QDA - train_score : 0.571, test score : 0.571, time : 0.20266 초
KNN - train_score : 0.899, test score : 0.880, time : 2.16484 초
SVM - train_score : 0.957, test score : 0.892, time : 9498.31274 초
NN - train_score : 0.506, test score : 0.505, time : 28.16156 초
RF - train_score : 0.957, test score : 0.922, time : 46.13577 초




XGB - train_score : 0.877, test score : 0.871, time : 14.25811 초
Light_GBM - train_score : 0.803, test score : 0.799, time : 1.68549 초


### 3-3. 성능 평가

In [21]:
# Print the evaluation report of every model on the ADASYN test split.
# Iterates the (label, estimator) pairs directly instead of indexing through
# an intermediate list of positions.
for name, model in models_a:
    print("----------ADASYN + %s 모델 적용----------" % name)
    get_clf_eval(y_test_a, model.predict(X_test_a))

----------ADASYN + LR 모델 적용----------


  _warn_prf(average, modifier, msg_start, len(result))


         Predict[0]  Predict[1]
True[0]       46314           0
True[1]       45795           0

정확도 : 0.503 
정밀도 : 0.000 
재현율 : 0.000 
f1-score : 0.000 
AUC : 0.500 
기하평균 : 0.000 

----------ADASYN + LDA 모델 적용----------
         Predict[0]  Predict[1]
True[0]       26120       20194
True[1]       21770       24025

정확도 : 0.544 
정밀도 : 0.543 
재현율 : 0.525 
f1-score : 0.534 
AUC : 0.544 
기하평균 : 0.544 

----------ADASYN + QDA 모델 적용----------
         Predict[0]  Predict[1]
True[0]       29358       16956
True[1]       22588       23207

정확도 : 0.571 
정밀도 : 0.578 
재현율 : 0.507 
f1-score : 0.540 
AUC : 0.570 
기하평균 : 0.567 

----------ADASYN + KNN 모델 적용----------
         Predict[0]  Predict[1]
True[0]       39399        6915
True[1]        4094       41701

정확도 : 0.880 
정밀도 : 0.858 
재현율 : 0.911 
f1-score : 0.883 
AUC : 0.881 
기하평균 : 0.880 

----------ADASYN + SVM 모델 적용----------
         Predict[0]  Predict[1]
True[0]       36433        9881
True[1]          34       45761

정확도 : 0.892 
정밀도 : 

## 4. RandomOverSampler

### 4-1. 데이터 가공

In [45]:
# Load features and labels from the random-oversampling CSV pair.
# NOTE(review): presumably written by an earlier resampling step that is not
# part of this notebook - confirm where these files come from.
X_r, y_r = (pd.read_csv(path) for path in ("x_random.csv", "y_random.csv"))

In [46]:
X_r

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4
0,0,-1.283145,-1.579604,-1.180232,-0.914131,0.716356
1,1,0.895457,-0.583344,-0.014067,0.731036,0.716356
2,2,-0.349269,0.940348,-1.013637,-0.639936,-1.427981
3,3,0.437526,-0.524740,-1.346827,-1.188325,0.001577
4,4,0.268128,-0.173119,0.152528,-0.914131,1.431135
...,...,...,...,...,...,...
442003,442003,-0.295148,0.726162,-0.565437,-0.091547,0.716356
442004,442004,-0.646561,0.546732,-1.513422,-1.462520,-0.713202
442005,442005,-0.932097,-0.493935,-0.680447,-0.091547,1.431135
442006,442006,0.970591,-0.583344,-0.014067,0.731036,0.001577


In [47]:
# Remove the 'Unnamed: 0' column in place, then display the frame.
# NOTE(review): looks like a row index that was saved into the CSV - confirm.
X_r.drop(columns=['Unnamed: 0'], inplace=True)
X_r

Unnamed: 0,0,1,2,3,4
0,-1.283145,-1.579604,-1.180232,-0.914131,0.716356
1,0.895457,-0.583344,-0.014067,0.731036,0.716356
2,-0.349269,0.940348,-1.013637,-0.639936,-1.427981
3,0.437526,-0.524740,-1.346827,-1.188325,0.001577
4,0.268128,-0.173119,0.152528,-0.914131,1.431135
...,...,...,...,...,...
442003,-0.295148,0.726162,-0.565437,-0.091547,0.716356
442004,-0.646561,0.546732,-1.513422,-1.462520,-0.713202
442005,-0.932097,-0.493935,-0.680447,-0.091547,1.431135
442006,0.970591,-0.583344,-0.014067,0.731036,0.001577


In [48]:
# Reduce the label frame to its column named '0' (a Series), then display it.
y_r = y_r.loc[:, '0']
y_r

0         0
1         0
2         0
3         1
4         1
         ..
442003    1
442004    1
442005    1
442006    1
442007    1
Name: 0, Length: 442008, dtype: int64

In [49]:
from sklearn.model_selection import train_test_split

# 70/30 split of the loaded frame with a fixed seed.
# NOTE(review): the split is taken from the already-resampled data, so the
# test part comes from the same resampled set (the recorded confusion
# matrices are roughly balanced). If the CSV holds duplicated rows, copies of
# one original row can land in both parts - confirm intent.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_r,
    y_r,
    test_size=0.3,
    random_state=101,
)

### 4-2. 모델 적합

In [50]:
# Candidate classifiers as (label, estimator) pairs.
models_r = [
    ('LR', LogisticRegression(max_iter=5000)),   # logistic regression
    ('LDA', LinearDiscriminantAnalysis()),       # linear discriminant analysis
    ('QDA', QuadraticDiscriminantAnalysis()),    # quadratic discriminant analysis
    ('KNN', KNeighborsClassifier()),             # k-nearest neighbours
    ('SVM', SVC(gamma='auto')),                  # support vector machine
    ('NN', MLPClassifier()),                     # neural network
    ('RF', RandomForestClassifier()),            # random forest
    ('XGB', XGBClassifier()),                    # XGBoost
    ('Light_GBM', LGBMClassifier()),             # LightGBM
]

# Fit each model, timing only fit(), then report train/test score.
for name, model in models_r:
    start = time.time()
    model.fit(X_train_r, y_train_r)
    end = time.time() - start  # elapsed seconds, despite the name
    train_acc = model.score(X_train_r, y_train_r)
    test_acc = model.score(X_test_r, y_test_r)
    msg = "%s - train_score : %.3f, test score : %.3f, time : %.5f 초" % (name, train_acc, test_acc, end)
    print(msg)

LR - train_score : 0.533, test score : 0.534, time : 0.42885 초
LDA - train_score : 0.533, test score : 0.534, time : 0.43683 초
QDA - train_score : 0.531, test score : 0.529, time : 0.13165 초
KNN - train_score : 0.929, test score : 0.919, time : 3.86565 초


### 4-3. 성능 평가

In [51]:
# Print the evaluation report of every model on the RandomOverSampler test
# split. The banner previously said "Borderline-SMOTE", which did not match
# this section's data (x_random.csv / y_random.csv); it now names the
# resampler actually being evaluated.
# Iterates the (label, estimator) pairs directly instead of indexing through
# an intermediate list of positions.
for name, model in models_r:
    print("----------RandomOverSampler + %s 모델 적용----------" % name)
    get_clf_eval(y_test_r, model.predict(X_test_r))

----------Borderline-SMOTE + LR 모델 적용----------
         Predict[0]  Predict[1]
True[0]       35232       31022
True[1]       30810       35539

정확도 : 0.534 
정밀도 : 0.534 
재현율 : 0.536 
f1-score : 0.535 
AUC : 0.534 
기하평균 : 0.534 

----------Borderline-SMOTE + LDA 모델 적용----------
         Predict[0]  Predict[1]
True[0]       35233       31021
True[1]       30810       35539

정확도 : 0.534 
정밀도 : 0.534 
재현율 : 0.536 
f1-score : 0.535 
AUC : 0.534 
기하평균 : 0.534 

----------Borderline-SMOTE + QDA 모델 적용----------
         Predict[0]  Predict[1]
True[0]       34128       32126
True[1]       30371       35978

정확도 : 0.529 
정밀도 : 0.528 
재현율 : 0.542 
f1-score : 0.535 
AUC : 0.529 
기하평균 : 0.529 

----------Borderline-SMOTE + KNN 모델 적용----------
         Predict[0]  Predict[1]
True[0]       57992        8262
True[1]        2501       63848

정확도 : 0.919 
정밀도 : 0.885 
재현율 : 0.962 
f1-score : 0.922 
AUC : 0.919 
기하평균 : 0.918 



## 5. Distribution-SMOTE

### 5-1. 데이터 가공

In [52]:
# Load features and labels from the "smova" CSV pair used for this section.
# NOTE(review): presumably written by an earlier resampling step that is not
# part of this notebook - confirm where these files come from.
X_d, y_d = (pd.read_csv(path) for path in ("X_smova.csv", "y_smova.csv"))

In [53]:
X_d

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4
0,0,-1.283145,-1.579604,-1.180232,-0.914131,0.716356
1,1,0.895457,-0.583344,-0.014067,0.731036,0.716356
2,2,-0.349269,0.940348,-1.013637,-0.639936,-1.427981
3,3,0.437526,-0.524740,-1.346827,-1.188325,0.001577
4,4,0.268128,-0.173119,0.152528,-0.914131,1.431135
...,...,...,...,...,...,...
442003,442003,0.492554,-0.583344,-1.180232,-0.914131,1.431135
442004,442004,0.844739,0.530123,-1.513422,-1.462520,0.001577
442005,442005,-1.344268,1.409176,0.818909,-0.914131,0.001577
442006,442006,-1.732423,-1.579604,-1.513422,-1.462520,1.431135


In [54]:
# Remove the 'Unnamed: 0' column in place, then display the frame.
# NOTE(review): looks like a row index that was saved into the CSV - confirm.
X_d.drop(columns=['Unnamed: 0'], inplace=True)
X_d

Unnamed: 0,0,1,2,3,4
0,-1.283145,-1.579604,-1.180232,-0.914131,0.716356
1,0.895457,-0.583344,-0.014067,0.731036,0.716356
2,-0.349269,0.940348,-1.013637,-0.639936,-1.427981
3,0.437526,-0.524740,-1.346827,-1.188325,0.001577
4,0.268128,-0.173119,0.152528,-0.914131,1.431135
...,...,...,...,...,...
442003,0.492554,-0.583344,-1.180232,-0.914131,1.431135
442004,0.844739,0.530123,-1.513422,-1.462520,0.001577
442005,-1.344268,1.409176,0.818909,-0.914131,0.001577
442006,-1.732423,-1.579604,-1.513422,-1.462520,1.431135


In [55]:
# Reduce the label frame to its column named '0' (a Series), then display it.
y_d = y_d.loc[:, '0']
y_d

0         0
1         0
2         0
3         1
4         1
         ..
442003    1
442004    1
442005    1
442006    1
442007    1
Name: 0, Length: 442008, dtype: int64

In [56]:
from sklearn.model_selection import train_test_split

# 70/30 split of the loaded frame with a fixed seed.
# NOTE(review): the split is taken from the already-resampled data, so the
# test part comes from the same resampled set (the recorded confusion
# matrices are roughly balanced). If the CSV holds synthetic rows, these test
# scores are not comparable with the untouched test split - confirm intent.
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_d,
    y_d,
    test_size=0.3,
    random_state=101,
)

### 5-2. 모델 적합

In [57]:
# Candidate classifiers as (label, estimator) pairs.
models_d = [
    ('LR', LogisticRegression(max_iter=5000)),   # logistic regression
    ('LDA', LinearDiscriminantAnalysis()),       # linear discriminant analysis
    ('QDA', QuadraticDiscriminantAnalysis()),    # quadratic discriminant analysis
    ('KNN', KNeighborsClassifier()),             # k-nearest neighbours
    ('SVM', SVC(gamma='auto')),                  # support vector machine
    ('NN', MLPClassifier()),                     # neural network
    ('RF', RandomForestClassifier()),            # random forest
    ('XGB', XGBClassifier()),                    # XGBoost
    ('Light_GBM', LGBMClassifier()),             # LightGBM
]

# Fit each model, timing only fit(), then report train/test score.
for name, model in models_d:
    start = time.time()
    model.fit(X_train_d, y_train_d)
    end = time.time() - start  # elapsed seconds, despite the name
    train_acc = model.score(X_train_d, y_train_d)
    test_acc = model.score(X_test_d, y_test_d)
    msg = "%s - train_score : %.3f, test score : %.3f, time : %.5f 초" % (name, train_acc, test_acc, end)
    print(msg)

LR - train_score : 0.523, test score : 0.524, time : 0.37065 초
LDA - train_score : 0.523, test score : 0.524, time : 0.35409 초
QDA - train_score : 0.532, test score : 0.530, time : 0.12164 초
KNN - train_score : 0.926, test score : 0.916, time : 3.32410 초


### 5-3. 성능 평가

In [58]:
# Print the evaluation report of every model on the Distribution-SMOTE test
# split.
# Iterates the (label, estimator) pairs directly instead of indexing through
# an intermediate list of positions.
for name, model in models_d:
    print("----------Distribution SMOTE + %s 모델 적용----------" % name)
    get_clf_eval(y_test_d, model.predict(X_test_d))

----------Distribution SMOTE + LR 모델 적용----------
         Predict[0]  Predict[1]
True[0]       32849       33405
True[1]       29775       36574

정확도 : 0.524 
정밀도 : 0.523 
재현율 : 0.551 
f1-score : 0.537 
AUC : 0.524 
기하평균 : 0.523 

----------Distribution SMOTE + LDA 모델 적용----------
         Predict[0]  Predict[1]
True[0]       32847       33407
True[1]       29771       36578

정확도 : 0.524 
정밀도 : 0.523 
재현율 : 0.551 
f1-score : 0.537 
AUC : 0.524 
기하평균 : 0.523 

----------Distribution SMOTE + QDA 모델 적용----------
         Predict[0]  Predict[1]
True[0]       26862       39392
True[1]       22960       43389

정확도 : 0.530 
정밀도 : 0.524 
재현율 : 0.654 
f1-score : 0.582 
AUC : 0.530 
기하평균 : 0.515 

----------Distribution SMOTE + KNN 모델 적용----------
         Predict[0]  Predict[1]
True[0]       57209        9045
True[1]        2098       64251

정확도 : 0.916 
정밀도 : 0.877 
재현율 : 0.968 
f1-score : 0.920 
AUC : 0.916 
기하평균 : 0.914 



## 6. SVMSMOTE

### 6-1. 데이터 가공

In [52]:
# Load features and labels from the "over_svm" CSV pair used for this section.
# NOTE(review): presumably written by an earlier resampling step that is not
# part of this notebook - confirm where these files come from.
X_ss, y_ss = (pd.read_csv(path) for path in ("X_over_svm.csv", "y_over_svm.csv"))

In [53]:
X_ss

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4
0,0,-1.283145,-1.579604,-1.180232,-0.914131,0.716356
1,1,0.895457,-0.583344,-0.014067,0.731036,0.716356
2,2,-0.349269,0.940348,-1.013637,-0.639936,-1.427981
3,3,0.437526,-0.524740,-1.346827,-1.188325,0.001577
4,4,0.268128,-0.173119,0.152528,-0.914131,1.431135
...,...,...,...,...,...,...
442003,442003,0.492554,-0.583344,-1.180232,-0.914131,1.431135
442004,442004,0.844739,0.530123,-1.513422,-1.462520,0.001577
442005,442005,-1.344268,1.409176,0.818909,-0.914131,0.001577
442006,442006,-1.732423,-1.579604,-1.513422,-1.462520,1.431135


In [54]:
# Remove the 'Unnamed: 0' column in place, then display the frame.
# NOTE(review): looks like a row index that was saved into the CSV - confirm.
X_ss.drop(columns=['Unnamed: 0'], inplace=True)
X_ss

Unnamed: 0,0,1,2,3,4
0,-1.283145,-1.579604,-1.180232,-0.914131,0.716356
1,0.895457,-0.583344,-0.014067,0.731036,0.716356
2,-0.349269,0.940348,-1.013637,-0.639936,-1.427981
3,0.437526,-0.524740,-1.346827,-1.188325,0.001577
4,0.268128,-0.173119,0.152528,-0.914131,1.431135
...,...,...,...,...,...
442003,0.492554,-0.583344,-1.180232,-0.914131,1.431135
442004,0.844739,0.530123,-1.513422,-1.462520,0.001577
442005,-1.344268,1.409176,0.818909,-0.914131,0.001577
442006,-1.732423,-1.579604,-1.513422,-1.462520,1.431135


In [55]:
# Reduce the label frame to its column named '0' (a Series), then display it.
y_ss = y_ss.loc[:, '0']
y_ss

0         0
1         0
2         0
3         1
4         1
         ..
442003    1
442004    1
442005    1
442006    1
442007    1
Name: 0, Length: 442008, dtype: int64

In [56]:
from sklearn.model_selection import train_test_split

# 70/30 split of the loaded frame with a fixed seed.
# NOTE(review): the split is taken from the already-resampled data, so the
# test part comes from the same resampled set (the recorded confusion
# matrices are roughly balanced). If the CSV holds synthetic rows, these test
# scores are not comparable with the untouched test split - confirm intent.
X_train_ss, X_test_ss, y_train_ss, y_test_ss = train_test_split(
    X_ss,
    y_ss,
    test_size=0.3,
    random_state=101,
)

### 6-2. 모델 적합

In [57]:
# Candidate classifiers as (label, estimator) pairs.
models_ss = [
    ('LR', LogisticRegression(max_iter=5000)),   # logistic regression
    ('LDA', LinearDiscriminantAnalysis()),       # linear discriminant analysis
    ('QDA', QuadraticDiscriminantAnalysis()),    # quadratic discriminant analysis
    ('KNN', KNeighborsClassifier()),             # k-nearest neighbours
    ('SVM', SVC(gamma='auto')),                  # support vector machine
    ('NN', MLPClassifier()),                     # neural network
    ('RF', RandomForestClassifier()),            # random forest
    ('XGB', XGBClassifier()),                    # XGBoost
    ('Light_GBM', LGBMClassifier()),             # LightGBM
]

# Fit each model, timing only fit(), then report train/test score.
for name, model in models_ss:
    start = time.time()
    model.fit(X_train_ss, y_train_ss)
    end = time.time() - start  # elapsed seconds, despite the name
    train_acc = model.score(X_train_ss, y_train_ss)
    test_acc = model.score(X_test_ss, y_test_ss)
    msg = "%s - train_score : %.3f, test score : %.3f, time : %.5f 초" % (name, train_acc, test_acc, end)
    print(msg)

LR - train_score : 0.523, test score : 0.524, time : 0.37065 초
LDA - train_score : 0.523, test score : 0.524, time : 0.35409 초
QDA - train_score : 0.532, test score : 0.530, time : 0.12164 초
KNN - train_score : 0.926, test score : 0.916, time : 3.32410 초


### 6-3. 성능 평가

In [58]:
# Print the evaluation report of every model on the SVMSMOTE test split.
# The banner previously said "Distribution SMOTE", copied from the previous
# section; it now names the resampler this section evaluates (SVMSMOTE).
# Iterates the (label, estimator) pairs directly instead of indexing through
# an intermediate list of positions.
for name, model in models_ss:
    print("----------SVMSMOTE + %s 모델 적용----------" % name)
    get_clf_eval(y_test_ss, model.predict(X_test_ss))

----------Distribution SMOTE + LR 모델 적용----------
         Predict[0]  Predict[1]
True[0]       32849       33405
True[1]       29775       36574

정확도 : 0.524 
정밀도 : 0.523 
재현율 : 0.551 
f1-score : 0.537 
AUC : 0.524 
기하평균 : 0.523 

----------Distribution SMOTE + LDA 모델 적용----------
         Predict[0]  Predict[1]
True[0]       32847       33407
True[1]       29771       36578

정확도 : 0.524 
정밀도 : 0.523 
재현율 : 0.551 
f1-score : 0.537 
AUC : 0.524 
기하평균 : 0.523 

----------Distribution SMOTE + QDA 모델 적용----------
         Predict[0]  Predict[1]
True[0]       26862       39392
True[1]       22960       43389

정확도 : 0.530 
정밀도 : 0.524 
재현율 : 0.654 
f1-score : 0.582 
AUC : 0.530 
기하평균 : 0.515 

----------Distribution SMOTE + KNN 모델 적용----------
         Predict[0]  Predict[1]
True[0]       57209        9045
True[1]        2098       64251

정확도 : 0.916 
정밀도 : 0.877 
재현율 : 0.968 
f1-score : 0.920 
AUC : 0.916 
기하평균 : 0.914 

