In [2]:
import pandas as pd
import numpy as np
df = pd.read_csv('Lasso_select_ERP.csv')   
df.shape

(12417, 73)

In [2]:
# SimpleInputer의 add_indicator 옵션을 제거한 버전.

data = df.drop(['EBizSystem2'], axis=1)   # 타겟변수를 제외한 입력변수를 data에 저장.
target = df['EBizSystem2']                # 타겟변수만 target에 저장.

# 50:50 data partition.
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split( 
    data, target, test_size=0.5, random_state=42)   # test_size=0.5임에 주의. 

# interval 변수의 null value를 평균(mean)으로 impute. 
from sklearn.impute import SimpleImputer
imp= SimpleImputer(strategy = 'mean')  
X_train2= imp.fit_transform(X_train) 
X_test2= imp.fit_transform(X_test)   # X_train과 and X_test 둘 다 imputation 적용해야 함에 유의.

print("X_train2 shape:", X_train2.shape) 
print("X_test2 shape:", X_test2.shape) 

X_train2 shape: (6208, 132)
X_test2 shape: (6209, 132)


In [3]:
# Neural Network 모델 (Default 모델).
from sklearn.neural_network import MLPClassifier 
from sklearn.metrics import accuracy_score
clf_mlp = MLPClassifier(max_iter = 500, random_state = 0)
clf_mlp.fit(X_train2, y_train)
pred = clf_mlp.predict(X_test2)  # 학습된 Classifier로 테스트 데이터셋 자료이용해서 타겟변수 예측값 생성.
accuracy = accuracy_score(y_test, pred)

In [4]:
from sklearn.model_selection import GridSearchCV
params = {'solver':['lbfgs'], 'alpha':[1],\
         'activation':['tanh','relu', 'logistic']}

grid_mlp = GridSearchCV(clf_mlp, param_grid=params, scoring='accuracy', cv=3, n_jobs=-1)
grid_mlp.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_mlp.best_score_))
print("GridSearchCV best parameter:", (grid_mlp.best_params_)) 

GridSearchCV max accuracy:0.66640
GridSearchCV best parameter: {'activation': 'relu', 'alpha': 1, 'solver': 'lbfgs'}


In [5]:
from sklearn.model_selection import GridSearchCV
params = {'solver':['sgd'], 'alpha':[1], 'max_iter':[1000],\
         'activation':['tanh','relu', 'logistic']}

grid_mlp = GridSearchCV(clf_mlp, param_grid=params, scoring='accuracy', cv=3, n_jobs=-1)
grid_mlp.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_mlp.best_score_))
print("GridSearchCV best parameter:", (grid_mlp.best_params_)) 

GridSearchCV max accuracy:0.71746
GridSearchCV best parameter: {'activation': 'logistic', 'alpha': 1, 'max_iter': 1000, 'solver': 'sgd'}


In [6]:
from sklearn.model_selection import GridSearchCV
params = {'solver':['adam'], 'alpha':[1],\
         'activation':['tanh','relu', 'logistic']}

grid_mlp = GridSearchCV(clf_mlp, param_grid=params, scoring='accuracy', cv=3, n_jobs=-1)
grid_mlp.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_mlp.best_score_))
print("GridSearchCV best parameter:", (grid_mlp.best_params_)) 

GridSearchCV max accuracy:0.71939
GridSearchCV best parameter: {'activation': 'logistic', 'alpha': 1, 'solver': 'adam'}


In [4]:
import time
start = time.time()

from sklearn.model_selection import GridSearchCV
params = {'solver':['adam','lbfgs', 'sgd'], 'alpha':[0.0001, 0.001, 0.01, 0.1, 1],\
          'max_iter':[1000],'activation':['logistic','relu']}

grid_mlp = GridSearchCV(clf_mlp, param_grid=params, scoring='accuracy', cv=3, n_jobs=-1)
grid_mlp.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_mlp.best_score_))
print("GridSearchCV best parameter:", (grid_mlp.best_params_)) 

end = time.time()
print(f"Runtime of the program is {end - start}")

GridSearchCV max accuracy:0.72245
GridSearchCV best parameter: {'activation': 'logistic', 'alpha': 0.0001, 'max_iter': 1000, 'solver': 'sgd'}
Runtime of the program is 294.8421320915222


In [5]:
best_clf = grid_mlp.best_estimator_
pred = best_clf.predict(X_test2)
print("Accuracy on test set:{:.5f}".format(accuracy_score(y_test, pred)))

from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test,best_clf.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

Accuracy on test set:0.72524
ROC AUC on test set:0.78719
