# Tunnning

In [2]:
# 데이터 가공
import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# 평가
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import classification_report

# 튜닝
from sklearn.model_selection import GridSearchCV 
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, make_scorer

# Oversampling
from imblearn.over_sampling import *

# 모델
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# 평가함수

def get_clf_eval(y_test, y_pred):
    confmat=pd.DataFrame(confusion_matrix(y_test, y_pred),
                    index=['True[0]', 'True[1]'],
                    columns=['Predict[0]', 'Predict[1]'])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    AUC = roc_auc_score(y_test, y_pred)
    g_means = geometric_mean_score(y_test, y_pred)
    print(confmat)
    print("\n정확도 : {:.3f} \n정밀도 : {:.3f} \n재현율 : {:.3f} \nf1-score : {:.3f} \nAUC : {:.3f} \n기하평균 : {:.3f} \n".format(accuracy,
                                        precision, recall, f1, AUC, g_means))

In [4]:
df = pd.read_csv('Loan_Train.csv')
df

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
251995,251996,8154883,43,13,single,rented,no,Surgeon,Kolkata,West_Bengal,6,11,0
251996,251997,2843572,26,10,single,rented,no,Army_officer,Rewa,Madhya_Pradesh,6,11,0
251997,251998,4522448,46,7,single,rented,no,Design_Engineer,Kalyan-Dombivli,Maharashtra,7,12,0
251998,251999,6507128,45,0,single,rented,no,Graphic_Designer,Pondicherry,Puducherry,0,10,0


In [5]:
# Feature, Target 나누기
X = df.drop(['Id','Risk_Flag'], axis=1)
y = df.Risk_Flag

In [6]:
# 범주형 변수 Labeling
labelEncoder = LabelEncoder()
for e in X.columns:
    if X[e].dtype == 'object':
        labelEncoder.fit(list(X[e].values))
        X[e] = labelEncoder.transform(X[e].values)

In [7]:
X

Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS
0,1303834,23,3,1,2,0,33,251,13,3,13
1,7574516,40,10,1,2,0,43,227,14,9,13
2,3991815,66,4,0,2,0,47,8,12,4,10
3,6256451,41,2,1,2,1,43,54,17,2,12
4,5768871,47,11,1,2,0,11,296,22,3,14
...,...,...,...,...,...,...,...,...,...,...,...
251995,8154883,43,13,1,2,0,45,162,28,6,11
251996,2843572,26,10,1,2,0,3,251,13,6,11
251997,4522448,46,7,1,2,0,17,144,14,7,12
251998,6507128,45,0,1,2,0,27,233,18,0,10


In [8]:
# Train & Test 데이터셋 나누기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 101)

# Tunning

In [8]:
pipeline = Pipeline(steps= [("SMOTE", SMOTE()),
                            ("RF", RandomForestClassifier())
                            ])

param_grid = {
    "SMOTE__k_neighbors": [5,10,15,20],
    "RF__n_estimators": [100, 200, 300]
              }

# define grid
grid = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, scoring=make_scorer(roc_auc_score), cv = 3)

# execute grid
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END .........RF__n_estimators=100, SMOTE__k_neighbors=5; total time=  48.6s
[CV] END .........RF__n_estimators=100, SMOTE__k_neighbors=5; total time=  46.5s
[CV] END .........RF__n_estimators=100, SMOTE__k_neighbors=5; total time=  46.4s
[CV] END ........RF__n_estimators=100, SMOTE__k_neighbors=10; total time=  47.8s
[CV] END ........RF__n_estimators=100, SMOTE__k_neighbors=10; total time=  48.0s
[CV] END ........RF__n_estimators=100, SMOTE__k_neighbors=10; total time=  51.4s
[CV] END ........RF__n_estimators=100, SMOTE__k_neighbors=15; total time=  51.2s
[CV] END ........RF__n_estimators=100, SMOTE__k_neighbors=15; total time=  49.0s
[CV] END ........RF__n_estimators=100, SMOTE__k_neighbors=15; total time=  47.9s
[CV] END ........RF__n_estimators=100, SMOTE__k_neighbors=20; total time= 1.1min
[CV] END ........RF__n_estimators=100, SMOTE__k_neighbors=20; total time=  57.3s
[CV] END ........RF__n_estimators=100, SMOTE__k_

In [9]:
# get the best performing model fit on the whole training set
best_model = grid_result.best_estimator_
best_model

Pipeline(steps=[('SMOTE', SMOTE()),
                ('RF', RandomForestClassifier(n_estimators=200))])

In [10]:
# evaluate model on the hold out dataset

y_test_pred = best_model.predict(X_test)
print(classification_report(y_test, y_test_pred)) # precision, recall, f1-score, support
print('\nAUC : {:.5f}'.format(roc_auc_score(y_test, y_test_pred))) # auc

# confusion matrix
confmat=pd.DataFrame(confusion_matrix(y_test, y_test_pred),
                    index=['True[0]', 'True[1]'],
                    columns=['Predict[0]', 'Predict[1]'])

confmat

              precision    recall  f1-score   support

           0       0.97      0.88      0.92     66292
           1       0.49      0.83      0.62      9308

    accuracy                           0.87     75600
   macro avg       0.73      0.85      0.77     75600
weighted avg       0.91      0.87      0.89     75600


AUC : 0.85471


Unnamed: 0,Predict[0],Predict[1]
True[0],58310,7982
True[1],1584,7724


In [None]:
pipeline = Pipeline(steps= [("SMOTE", SMOTE()),
                            ("RF", RandomForestClassifier())
                            ])

param_grid = {
    "SMOTE__sampling_strategy": [0.5, 1],
    "RF__n_estimators": [100,200,300,400,500]
              }

# define grid
grid = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, scoring=make_scorer(roc_auc_score), cv = 3, n_jobs=-1)

# execute grid
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
# get the best performing model fit on the whole training set
best_model = grid_result.best_estimator_
best_model

In [None]:
# evaluate model on the hold out dataset

y_test_pred = best_model.predict(X_test)
print(classification_report(y_test, y_test_pred)) # precision, recall, f1-score, support
print('\nAUC : {:.5f}'.format(roc_auc_score(y_test, y_test_pred))) # auc

# confusion matrix
confmat=pd.DataFrame(confusion_matrix(y_test, y_test_pred),
                    index=['True[0]', 'True[1]'],
                    columns=['Predict[0]', 'Predict[1]'])

confmat

In [11]:
pipeline = Pipeline(steps= [("ADASYN", ADASYN()),
                            ("RF", RandomForestClassifier())
                            ])

param_grid = {
    "ADASYN__sampling_strategy": [0.5, 1],
    "RF__n_estimators": [100,200,300,400,500]
              }

# define grid
grid = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, scoring=make_scorer(roc_auc_score), cv = 3, n_jobs=-1)

# execute grid
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [12]:
# get the best performing model fit on the whole training set
best_model = grid_result.best_estimator_
best_model

Pipeline(steps=[('ADASYN', ADASYN(sampling_strategy=1)),
                ('RF', RandomForestClassifier())])

In [13]:
# evaluate model on the hold out dataset

y_test_pred = best_model.predict(X_test)
print(classification_report(y_test, y_test_pred)) # precision, recall, f1-score, support
print('\nAUC : {:.5f}'.format(roc_auc_score(y_test, y_test_pred))) # auc

# confusion matrix
confmat=pd.DataFrame(confusion_matrix(y_test, y_test_pred),
                    index=['True[0]', 'True[1]'],
                    columns=['Predict[0]', 'Predict[1]'])

confmat

              precision    recall  f1-score   support

           0       0.98      0.87      0.92     66292
           1       0.48      0.85      0.61      9308

    accuracy                           0.87     75600
   macro avg       0.73      0.86      0.77     75600
weighted avg       0.91      0.87      0.88     75600


AUC : 0.85915


Unnamed: 0,Predict[0],Predict[1]
True[0],57617,8675
True[1],1404,7904


In [32]:
pipeline = Pipeline(steps= [("SMOTE", SMOTE()),
                            ("DT", DecisionTreeClassifier())
                            ])

param_grid = {
    "SMOTE__sampling_strategy": [0.5, 1],
    "DT__max_depth": [30,35,40,45,50]
              }

# define grid
grid = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, scoring=make_scorer(roc_auc_score), cv = 3, n_jobs=-1)

# execute grid
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [33]:
# get the best performing model fit on the whole training set
best_model = grid_result.best_estimator_
best_model

Pipeline(steps=[('SMOTE', SMOTE(sampling_strategy=1)),
                ('DT', DecisionTreeClassifier(max_depth=30))])

In [34]:
# evaluate model on the hold out dataset

y_test_pred = best_model.predict(X_test)
print(classification_report(y_test, y_test_pred)) # precision, recall, f1-score, support
print('\nAUC : {:.5f}'.format(roc_auc_score(y_test, y_test_pred))) # auc

# confusion matrix
confmat=pd.DataFrame(confusion_matrix(y_test, y_test_pred),
                    index=['True[0]', 'True[1]'],
                    columns=['Predict[0]', 'Predict[1]'])

confmat

              precision    recall  f1-score   support

           0       0.98      0.87      0.92     66292
           1       0.47      0.86      0.61      9308

    accuracy                           0.86     75600
   macro avg       0.72      0.86      0.76     75600
weighted avg       0.92      0.86      0.88     75600


AUC : 0.86146


Unnamed: 0,Predict[0],Predict[1]
True[0],57346,8946
True[1],1323,7985


In [17]:
pipeline = Pipeline(steps= [("ADASYN", ADASYN()),
                            ("DT", DecisionTreeClassifier())
                            ])

param_grid = {
    "ADASYN__sampling_strategy": [0.5, 1],
    "DT__max_depth": [30,35,40,45,50], "DT__criterion" : ['gini', 'entropy']
              }

# define grid
grid = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, scoring=make_scorer(roc_auc_score), cv = 3, n_jobs=-1)

# execute grid
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END ADASYN__sampling_strategy=0.5, DT__criterion=gini, DT__max_depth=30; total time=   3.4s
[CV] END ADASYN__sampling_strategy=0.5, DT__criterion=gini, DT__max_depth=30; total time=   3.5s
[CV] END ADASYN__sampling_strategy=0.5, DT__criterion=gini, DT__max_depth=30; total time=   3.5s
[CV] END ADASYN__sampling_strategy=0.5, DT__criterion=gini, DT__max_depth=35; total time=   3.2s
[CV] END ADASYN__sampling_strategy=0.5, DT__criterion=gini, DT__max_depth=35; total time=   4.0s
[CV] END ADASYN__sampling_strategy=0.5, DT__criterion=gini, DT__max_depth=35; total time=   3.3s
[CV] END ADASYN__sampling_strategy=0.5, DT__criterion=gini, DT__max_depth=40; total time=   3.9s
[CV] END ADASYN__sampling_strategy=0.5, DT__criterion=gini, DT__max_depth=40; total time=   3.3s
[CV] END ADASYN__sampling_strategy=0.5, DT__criterion=gini, DT__max_depth=40; total time=   3.6s
[CV] END ADASYN__sampling_strategy=0.5, DT__criterion=gini, DT__ma

In [18]:
# get the best performing model fit on the whole training set
best_model = grid_result.best_estimator_
best_model

Pipeline(steps=[('ADASYN', ADASYN(sampling_strategy=1)),
                ('DT', DecisionTreeClassifier(max_depth=35))])

In [19]:
# evaluate model on the hold out dataset

y_test_pred = best_model.predict(X_test)
print(classification_report(y_test, y_test_pred)) # precision, recall, f1-score, support
print('\nAUC : {:.5f}'.format(roc_auc_score(y_test, y_test_pred))) # auc

# confusion matrix
confmat=pd.DataFrame(confusion_matrix(y_test, y_test_pred),
                    index=['True[0]', 'True[1]'],
                    columns=['Predict[0]', 'Predict[1]'])

confmat

              precision    recall  f1-score   support

           0       0.98      0.86      0.92     66292
           1       0.47      0.86      0.61      9308

    accuracy                           0.86     75600
   macro avg       0.72      0.86      0.76     75600
weighted avg       0.92      0.86      0.88     75600


AUC : 0.86231


Unnamed: 0,Predict[0],Predict[1]
True[0],57160,9132
True[1],1281,8027


In [17]:
pipeline = Pipeline(steps= [("SMOTE", SMOTE()),
                            ("KNN", KNeighborsClassifier())
                            ])

param_grid = {
    "SMOTE__sampling_strategy": [0.5, 1],
    "KNN__n_neighbors": [10,20,30,40,50],
              }

# define grid
grid = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, scoring=make_scorer(roc_auc_score), cv = 3, n_jobs=-1)

# execute grid
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [18]:
# get the best performing model fit on the whole training set
best_model = grid_result.best_estimator_
best_model

Pipeline(steps=[('SMOTE', SMOTE(sampling_strategy=1)),
                ('KNN', KNeighborsClassifier(n_neighbors=20))])

In [19]:
# evaluate model on the hold out dataset

y_test_pred = best_model.predict(X_test)
print(classification_report(y_test, y_test_pred)) # precision, recall, f1-score, support
print('\nAUC : {:.5f}'.format(roc_auc_score(y_test, y_test_pred))) # auc

# confusion matrix
confmat=pd.DataFrame(confusion_matrix(y_test, y_test_pred),
                    index=['True[0]', 'True[1]'],
                    columns=['Predict[0]', 'Predict[1]'])

confmat

              precision    recall  f1-score   support

           0       0.97      0.87      0.92     66292
           1       0.48      0.84      0.61      9308

    accuracy                           0.87     75600
   macro avg       0.73      0.86      0.77     75600
weighted avg       0.91      0.87      0.88     75600


AUC : 0.85557


Unnamed: 0,Predict[0],Predict[1]
True[0],57997,8295
True[1],1524,7784


In [8]:
pipeline = Pipeline(steps= [("ADASYN", ADASYN()),
                            ("KNN", KNeighborsClassifier())
                            ])

param_grid = {
    "ADASYN__sampling_strategy": [0.5, 1],
    "KNN__n_neighbors": [10,20,30,40,50],
              }

# define grid
grid = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, scoring=make_scorer(roc_auc_score), cv = 3, n_jobs=-1)

# execute grid
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [9]:
# get the best performing model fit on the whole training set
best_model = grid_result.best_estimator_
best_model

Pipeline(steps=[('ADASYN', ADASYN(sampling_strategy=1)),
                ('KNN', KNeighborsClassifier(n_neighbors=20))])

In [10]:
# evaluate model on the hold out dataset

y_test_pred = best_model.predict(X_test)
print(classification_report(y_test, y_test_pred)) # precision, recall, f1-score, support
print('\nAUC : {:.5f}'.format(roc_auc_score(y_test, y_test_pred))) # auc

# confusion matrix
confmat=pd.DataFrame(confusion_matrix(y_test, y_test_pred),
                    index=['True[0]', 'True[1]'],
                    columns=['Predict[0]', 'Predict[1]'])

confmat

              precision    recall  f1-score   support

           0       0.98      0.86      0.92     66292
           1       0.46      0.85      0.60      9308

    accuracy                           0.86     75600
   macro avg       0.72      0.86      0.76     75600
weighted avg       0.91      0.86      0.88     75600


AUC : 0.85807


Unnamed: 0,Predict[0],Predict[1]
True[0],57104,9188
True[1],1352,7956


In [8]:
pipeline = Pipeline(steps= [("SMOTE", SMOTE()),
                            ("XGB", XGBClassifier())
                            ])

param_grid = {
    "SMOTE__sampling_strategy": [0.5, 1],
    "XGB__learning_rate": [0.1, 0.01, 0.05]
              }

# define grid
grid = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, scoring=make_scorer(roc_auc_score), cv = 3, n_jobs=-1)

# execute grid
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [9]:
# get the best performing model fit on the whole training set
best_model = grid_result.best_estimator_
best_model

Pipeline(steps=[('SMOTE', SMOTE(sampling_strategy=1)),
                ('XGB',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, enable_categorical=False,
                               gamma=0, gpu_id=-1, importance_type=None,
                               interaction_constraints='', learning_rate=0.1,
                               max_delta_step=0, max_depth=6,
                               min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=4, num_parallel_tree=1, predictor='auto',
                               random_state=0, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, subsample=1,
                               tree_method='exact', validate_parameters=1,
                               verbosity=None))])

In [10]:
# evaluate model on the hold out dataset

y_test_pred = best_model.predict(X_test)
print(classification_report(y_test, y_test_pred)) # precision, recall, f1-score, support
print('\nAUC : {:.5f}'.format(roc_auc_score(y_test, y_test_pred))) # auc

# confusion matrix
confmat=pd.DataFrame(confusion_matrix(y_test, y_test_pred),
                    index=['True[0]', 'True[1]'],
                    columns=['Predict[0]', 'Predict[1]'])

confmat

              precision    recall  f1-score   support

           0       0.95      0.75      0.84     66292
           1       0.29      0.74      0.42      9308

    accuracy                           0.75     75600
   macro avg       0.62      0.74      0.63     75600
weighted avg       0.87      0.75      0.79     75600


AUC : 0.74435


Unnamed: 0,Predict[0],Predict[1]
True[0],49832,16460
True[1],2448,6860


In [11]:
pipeline = Pipeline(steps= [("ADASYN", ADASYN()),
                            ("XGB", XGBClassifier())
                            ])

param_grid = {
    "ADASYN__sampling_strategy": [0.5, 1],
    "XGB__learning_rate": [0.1, 0.01, 0.05]
              }

# define grid
grid = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, scoring=make_scorer(roc_auc_score), cv = 3, n_jobs=-1)

# execute grid
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [12]:
# get the best performing model fit on the whole training set
best_model = grid_result.best_estimator_
best_model

Pipeline(steps=[('ADASYN', ADASYN(sampling_strategy=1)),
                ('XGB',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, enable_categorical=False,
                               gamma=0, gpu_id=-1, importance_type=None,
                               interaction_constraints='', learning_rate=0.1,
                               max_delta_step=0, max_depth=6,
                               min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=4, num_parallel_tree=1, predictor='auto',
                               random_state=0, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, subsample=1,
                               tree_method='exact', validate_parameters=1,
                               verbosity=None))])

In [13]:
# evaluate model on the hold out dataset

y_test_pred = best_model.predict(X_test)
print(classification_report(y_test, y_test_pred)) # precision, recall, f1-score, support
print('\nAUC : {:.5f}'.format(roc_auc_score(y_test, y_test_pred))) # auc

# confusion matrix
confmat=pd.DataFrame(confusion_matrix(y_test, y_test_pred),
                    index=['True[0]', 'True[1]'],
                    columns=['Predict[0]', 'Predict[1]'])

confmat

              precision    recall  f1-score   support

           0       0.94      0.75      0.84     66292
           1       0.28      0.67      0.39      9308

    accuracy                           0.74     75600
   macro avg       0.61      0.71      0.61     75600
weighted avg       0.86      0.74      0.78     75600


AUC : 0.71135


Unnamed: 0,Predict[0],Predict[1]
True[0],49844,16448
True[1],3064,6244


In [20]:
pipeline = Pipeline(steps= [("SMOTE", SMOTE()),
                            ("L_GBM", LGBMClassifier())
                            ])

param_grid = {
    "SMOTE__sampling_strategy": [0.5, 1],
    "L_GBM__learning_rate": [0.1, 0.01, 0.05]
              }

# define grid
grid = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, scoring=make_scorer(roc_auc_score), cv = 3, n_jobs=-1)

# execute grid
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [21]:
# get the best performing model fit on the whole training set
best_model = grid_result.best_estimator_
best_model

Pipeline(steps=[('SMOTE', SMOTE(sampling_strategy=1)),
                ('L_GBM', LGBMClassifier())])

In [22]:
# evaluate model on the hold out dataset

y_test_pred = best_model.predict(X_test)
print(classification_report(y_test, y_test_pred)) # precision, recall, f1-score, support
print('\nAUC : {:.5f}'.format(roc_auc_score(y_test, y_test_pred))) # auc

# confusion matrix
confmat=pd.DataFrame(confusion_matrix(y_test, y_test_pred),
                    index=['True[0]', 'True[1]'],
                    columns=['Predict[0]', 'Predict[1]'])

confmat

              precision    recall  f1-score   support

           0       0.96      0.76      0.85     66292
           1       0.30      0.75      0.43      9308

    accuracy                           0.76     75600
   macro avg       0.63      0.75      0.64     75600
weighted avg       0.88      0.76      0.79     75600


AUC : 0.75383


Unnamed: 0,Predict[0],Predict[1]
True[0],50205,16087
True[1],2324,6984


In [23]:
pipeline = Pipeline(steps= [("ADASYN", ADASYN()),
                            ("L_GBM", LGBMClassifier())
                            ])

param_grid = {
    "ADASYN__sampling_strategy": [0.5, 1],
    "L_GBM__learning_rate": [0.1, 0.01, 0.05]
              }

# define grid
grid = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, scoring=make_scorer(roc_auc_score), cv = 3, n_jobs=-1)

# execute grid
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [24]:
# get the best performing model fit on the whole training set
best_model = grid_result.best_estimator_
best_model

Pipeline(steps=[('ADASYN', ADASYN(sampling_strategy=1)),
                ('L_GBM', LGBMClassifier())])

In [25]:
# evaluate model on the hold out dataset

y_test_pred = best_model.predict(X_test)
print(classification_report(y_test, y_test_pred)) # precision, recall, f1-score, support
print('\nAUC : {:.5f}'.format(roc_auc_score(y_test, y_test_pred))) # auc

# confusion matrix
confmat=pd.DataFrame(confusion_matrix(y_test, y_test_pred),
                    index=['True[0]', 'True[1]'],
                    columns=['Predict[0]', 'Predict[1]'])

confmat

              precision    recall  f1-score   support

           0       0.94      0.77      0.85     66292
           1       0.29      0.65      0.40      9308

    accuracy                           0.76     75600
   macro avg       0.61      0.71      0.62     75600
weighted avg       0.86      0.76      0.79     75600


AUC : 0.71339


Unnamed: 0,Predict[0],Predict[1]
True[0],51296,14996
True[1],3230,6078


---

In [20]:
k_range = list(range(1,11)) # # of k

pipeline2 = Pipeline(steps= [("SMOTE", SMOTE()),
                            ("KNN", KNeighborsClassifier())
                            ])

param_grid2 = {
    "SMOTE__sampling_strategy": [0.5, 1],
    "KNN__n_neighbors": k_range
              }

smore_knn_pipe = GridSearchCV(pipeline2, param_grid=param_grid2, verbose=2, scoring=make_scorer(roc_auc_score), cv = 3)

smore_knn_pipe.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END ...KNN__n_neighbors=1, SMOTE__sampling_strategy=0.5; total time=   4.6s
[CV] END ...KNN__n_neighbors=1, SMOTE__sampling_strategy=0.5; total time=   4.5s
[CV] END ...KNN__n_neighbors=1, SMOTE__sampling_strategy=0.5; total time=   3.9s
[CV] END .....KNN__n_neighbors=1, SMOTE__sampling_strategy=1; total time=   4.5s
[CV] END .....KNN__n_neighbors=1, SMOTE__sampling_strategy=1; total time=   4.8s
[CV] END .....KNN__n_neighbors=1, SMOTE__sampling_strategy=1; total time=   4.9s
[CV] END ...KNN__n_neighbors=2, SMOTE__sampling_strategy=0.5; total time=   4.1s
[CV] END ...KNN__n_neighbors=2, SMOTE__sampling_strategy=0.5; total time=   7.1s
[CV] END ...KNN__n_neighbors=2, SMOTE__sampling_strategy=0.5; total time=   4.6s
[CV] END .....KNN__n_neighbors=2, SMOTE__sampling_strategy=1; total time=   4.9s
[CV] END .....KNN__n_neighbors=2, SMOTE__sampling_strategy=1; total time=   5.1s
[CV] END .....KNN__n_neighbors=2, SMOTE__samplin

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('SMOTE', SMOTE()),
                                       ('KNN', KNeighborsClassifier())]),
             param_grid={'KNN__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'SMOTE__sampling_strategy': [0.5, 1]},
             scoring=make_scorer(roc_auc_score), verbose=2)

In [21]:
# get the best performing model fit on the whole training set
best_model = smore_knn_pipe.best_estimator_

# evaluate model on the hold out dataset
y_pred = best_model.predict(X_test)

In [26]:
best_model

Pipeline(steps=[('SMOTE', SMOTE(sampling_strategy=1)),
                ('KNN', KNeighborsClassifier(n_neighbors=9))])

In [25]:
# evaluate model on the hold out dataset
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

print(classification_report(y_train, y_train_pred)) # 알고리즘이 얼마나 학습을 잘 시켰는가
print(classification_report(y_test, y_test_pred)) # 새로운 데이터가 들어올 때 얼마나 좋은 성능을 나타내는가
print('\nAUC : {:.3f}'.format(roc_auc_score(y_test, y_test_pred)))

              precision    recall  f1-score   support

           0       0.99      0.87      0.92    154712
           1       0.50      0.92      0.65     21688

    accuracy                           0.88    176400
   macro avg       0.74      0.90      0.79    176400
weighted avg       0.93      0.88      0.89    176400

              precision    recall  f1-score   support

           0       0.98      0.86      0.92     66292
           1       0.47      0.86      0.60      9308

    accuracy                           0.86     75600
   macro avg       0.72      0.86      0.76     75600
weighted avg       0.91      0.86      0.88     75600


AUC : 0.859


In [30]:
confmat=pd.DataFrame(confusion_matrix(y_test, y_test_pred),
                    index=['True[0]', 'True[1]'],
                    columns=['Predict[0]', 'Predict[1]'])

confmat

Unnamed: 0,Predict[0],Predict[1]
True[0],57132,9160
True[1],1333,7975


In [8]:
k_range = list(range(1,11)) # # of k

pipeline = Pipeline(steps= [("ADASYN", ADASYN()),
                            ("KNN", KNeighborsClassifier())
                            ])

param_grid = {
    "ADASYN__sampling_strategy": [0.5, 1],
    "KNN__n_neighbors": k_range
              }

# define grid
grid = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, scoring=make_scorer(roc_auc_score), cv = 3, n_jobs=-1)

# execute grid
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [9]:
# get the best performing model fit on the whole training set
best_model = grid_result.best_estimator_
best_model

Pipeline(steps=[('ADASYN', ADASYN(sampling_strategy=1)),
                ('KNN', KNeighborsClassifier(n_neighbors=9))])

In [10]:
# evaluate model on the hold out dataset

y_test_pred = best_model.predict(X_test)
print(classification_report(y_test, y_test_pred)) # precision, recall, f1-score, support
print('\nAUC : {:.5f}'.format(roc_auc_score(y_test, y_test_pred))) # auc

# confusion matrix
confmat=pd.DataFrame(confusion_matrix(y_test, y_test_pred),
                    index=['True[0]', 'True[1]'],
                    columns=['Predict[0]', 'Predict[1]'])

confmat

              precision    recall  f1-score   support

           0       0.98      0.85      0.91     66292
           1       0.45      0.87      0.59      9308

    accuracy                           0.85     75600
   macro avg       0.71      0.86      0.75     75600
weighted avg       0.91      0.85      0.87     75600


AUC : 0.85992


Unnamed: 0,Predict[0],Predict[1]
True[0],56180,10112
True[1],1188,8120


In [None]:
pipeline = Pipeline(steps= [("RandomOS", RandomOversampler()),
                            ("RF", RandomForestClassifier())
                            ])

param_grid = {
    "RandomOS__k_neighbors": [5,10,15,20],
    "RF__n_estimators": [100, 200, 300]
              }

# define grid
grid = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, scoring=make_scorer(roc_auc_score), cv = 3)

# execute grid
grid_result = grid.fit(X_train, y_train)