# C.7.10 Neural Network 모델

In [17]:
import pandas as pd
import numpy as np
df = pd.read_csv('2014DC2_dummy_indicator_friendly.csv')   
df.shape

(12417, 193)

In [2]:
# Imputation indicator가 생성됨에 주의

data = df.drop(['EBizSystem2'], axis=1)   # 타겟변수를 제외한 입력변수를 data에 저장
target = df['EBizSystem2']                # 타겟변수만 target에 저장

# 50:50 data partition.
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split( 
    data, target, test_size=0.5, random_state=42)   # test_size=0.5임에 주의 

# interval 변수의 null value를 평균(mean)으로 impute 및 add_indicator 포함 
from sklearn.impute import SimpleImputer
imp= SimpleImputer(strategy = 'mean', add_indicator=True)  
X_train2= imp.fit_transform(X_train) 
X_test2= imp.fit_transform(X_test)   # X_train과 and X_test 둘 다 imputation 적용해야 함에 유의

print("X_train2 shape:", X_train2.shape) 
print("X_test2 shape:", X_test2.shape) 


X_train2 shape: (6208, 214)
X_test2 shape: (6209, 214)


In [18]:
# Neural Network model 

from sklearn.neural_network import MLPClassifier 
from sklearn.metrics import accuracy_score
mlp = MLPClassifier(max_iter = 500, random_state = 0)
model = mlp.fit(X_train2, y_train) 
pred = model.predict(X_test2)  # 학습된 Classifier로 테스트 데이터셋 자료를 투입해서 타겟변수 예측값 생성
accuracy = accuracy_score(y_test, pred)

print("Neural Network1 Accuracy on training set:{:.5f}".format(model.score(X_train2, y_train)))
print("Neural Network1 Accuracy on test set:{:.5f}".format(accuracy_score(y_test, pred)))

Neural Network1 Accuracy on training set:1.00000
Neural Network1 Accuracy on test set:0.66645


In [19]:
# 참조 코딩
# Neural Network model 

from sklearn.neural_network import MLPClassifier 
mlp = MLPClassifier(solver='sgd', max_iter = 500, random_state = 0)
model = mlp.fit(X_train2, y_train) 
pred = model.predict(X_test2)  # 학습된 Classifier로 테스트 데이터셋 자료를 투입해서 타겟변수 예측값 생성.
accuracy = accuracy_score(y_test, pred)

print("Neural Network1 Accuracy on training set:{:.5f}".format(model.score(X_train2, y_train)))
print("Neural Network1 Accuracy on test set:{:.5f}".format(accuracy_score(y_test, pred)))

Neural Network1 Accuracy on training set:0.67494
Neural Network1 Accuracy on test set:0.66098


In [20]:
# 참조 코딩
# Neural Network model 

from sklearn.neural_network import MLPClassifier 
mlp = MLPClassifier(solver='lbfgs', max_iter = 500, random_state = 0)
model = mlp.fit(X_train2, y_train) 
pred = model.predict(X_test2)  # 학습된 Classifier로 테스트 데이터셋 자료를 투입해서 타겟변수 예측값 생성.
accuracy = accuracy_score(y_test, pred)

print("Neural Network1 Accuracy on training set:{:.5f}".format(model.score(X_train2, y_train)))
print("Neural Network1 Accuracy on test set:{:.5f}".format(accuracy_score(y_test, pred)))

Neural Network1 Accuracy on training set:0.89803
Neural Network1 Accuracy on test set:0.66677


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [4]:
# Neural Network 모델 (Default 모델)
from sklearn.neural_network import MLPClassifier 
mlp = MLPClassifier(max_iter = 500, random_state = 0)

In [13]:
# 참조 코딩
# 시간이 15분 이상 소요되며 수렴 경고문까지 나와서 Params 설정 단순화 필요

from sklearn.model_selection import GridSearchCV
params = {'solver':['lbfgs','sgd','adam'], 'alpha':[0.0001, 0.001, 0.01, 0.1, 1],
          'activation':['logistic','tanh','relu']}

grid_mlp = GridSearchCV(mlp, param_grid=params, scoring='accuracy', cv=5, n_jobs=-1)
grid_mlp.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_mlp.best_score_))
print("GridSearchCV best parameter:", (grid_mlp.best_params_)) 

GridSearchCV max accuracy:0.71956
GridSearchCV best parameter: {'activation': 'relu', 'alpha': 1, 'solver': 'sgd'}




In [None]:
# alpha=0.0001이 default값임.

In [21]:
# alpha=0.01 옵션 추가. 이는 중간 정도의 regularization on weights를 의미함

mlp = MLPClassifier(max_iter = 500, alpha = 0.1, random_state = 0)
model = mlp.fit(X_train2, y_train) 
pred = model.predict(X_test2)  # 학습된 Classifier로 테스트 데이터셋 자료를 투입해서 타겟변수 예측값 생성
accuracy = accuracy_score(y_test, pred)

print("Neural Network1 Accuracy on training set:{:.5f}".format(model.score(X_train2, y_train)))
print("Neural Network1 Accuracy on test set:{:.5f}".format(accuracy_score(y_test, pred)))

Neural Network1 Accuracy on training set:0.95683
Neural Network1 Accuracy on test set:0.67982


In [22]:
# alpha=1 옵션 추가. 이는 stronger regularization on weights를 의미함

mlp = MLPClassifier(max_iter = 500, alpha = 1, random_state = 0)
model = mlp.fit(X_train2, y_train) 
pred = model.predict(X_test2)  # 학습된 Classifier로 테스트 데이터셋 자료를 투입해서 타겟변수 예측값 생성
accuracy = accuracy_score(y_test, pred)

print("Neural Network1 Accuracy on training set:{:.5f}".format(model.score(X_train2, y_train)))
print("Neural Network1 Accuracy on test set:{:.5f}".format(accuracy_score(y_test, pred)))

Neural Network1 Accuracy on training set:0.75515
Neural Network1 Accuracy on test set:0.72717


In [23]:
# Neural Network 모델 (Default 모델)
mlp = MLPClassifier(max_iter = 500, random_state = 0)

In [24]:
# Neural Network 모델 (Default 모델)
mlp = MLPClassifier(max_iter = 500, random_state = 0)

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold  

# StratifiedKFold의 random_state 옵션값을 특정 숫자(예: 0)로 고정
cross_validation = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
params = {'solver':['lbfgs'], 'alpha':[1],
          'activation':['tanh','relu', 'logistic']}

grid_mlp = GridSearchCV(mlp, param_grid=params, scoring='accuracy', cv=cross_validation, n_jobs=-1)
grid_mlp.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_mlp.best_score_))
print("GridSearchCV best parameter:", (grid_mlp.best_params_)) 

GridSearchCV max accuracy:0.66640
GridSearchCV best parameter: {'activation': 'relu', 'alpha': 1, 'solver': 'lbfgs'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [25]:
# Neural Network 모델 (Default 모델)
mlp = MLPClassifier(max_iter = 500, random_state = 0)

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold  

# StratifiedKFold의 random_state 옵션값을 특정 숫자(예: 0)로 고정
cross_validation = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
params = {'solver':['sgd'], 'alpha':[1], 'max_iter':[1000],\
         'activation':['tanh','relu', 'logistic']}

grid_mlp = GridSearchCV(mlp, param_grid=params, scoring='accuracy', cv=cross_validation, n_jobs=-1)
grid_mlp.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_mlp.best_score_))
print("GridSearchCV best parameter:", (grid_mlp.best_params_)) 

GridSearchCV max accuracy:0.71843
GridSearchCV best parameter: {'activation': 'logistic', 'alpha': 1, 'max_iter': 1000, 'solver': 'sgd'}


In [26]:
# Neural Network 모델 (Default 모델)
mlp = MLPClassifier(max_iter = 500, random_state = 0)

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold  

# StratifiedKFold의 random_state 옵션값을 특정 숫자(예: 0)로 고정
cross_validation = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
params = {'solver':['adam'], 'alpha':[1],\
         'activation':['tanh','relu', 'logistic']}

grid_mlp = GridSearchCV(mlp, param_grid=params, scoring='accuracy', cv=cross_validation, n_jobs=-1)
grid_mlp.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_mlp.best_score_))
print("GridSearchCV best parameter:", (grid_mlp.best_params_)) 

GridSearchCV max accuracy:0.72117
GridSearchCV best parameter: {'activation': 'logistic', 'alpha': 1, 'solver': 'adam'}


In [27]:
### adam 설정 최종 그리드서치
import time
start = time.time()

# Neural Network 모델 (Default 모델)
mlp = MLPClassifier(max_iter = 500, random_state = 0)

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold  

# StratifiedKFold의 random_state 옵션값을 특정 숫자(예: 0)로 고정
cross_validation = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

params = {'solver':['adam'],
          'alpha':[0.0001, 0.001, 0.01, 0.1, 1],
          'max_iter':[1000],
          'activation':['logistic'],
          'hidden_layer_sizes': [(100,), (100,100)]
         }

grid_mlp = GridSearchCV(mlp, param_grid=params, scoring='accuracy', cv=cross_validation, n_jobs=-1)
grid_mlp.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_mlp.best_score_))
print("GridSearchCV best parameter:", (grid_mlp.best_params_)) 

end = time.time()
print(f"Runtime of the program is {end - start}")

GridSearchCV max accuracy:0.72117
GridSearchCV best parameter: {'activation': 'logistic', 'alpha': 1, 'hidden_layer_sizes': (100,), 'max_iter': 1000, 'solver': 'adam'}
Runtime of the program is 224.51609778404236


In [28]:
best_clf = grid_mlp.best_estimator_
pred = best_clf.predict(X_test2)
print("Accuracy on test set:{:.5f}".format(accuracy_score(y_test, pred)))

from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test,best_clf.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

Accuracy on test set:0.70575
ROC AUC on test set:0.78030


In [29]:
### sgd 설정 최종 그리드서치
import time
start = time.time()

# Neural Network 모델 (Default 모델).
mlp = MLPClassifier(max_iter = 500, random_state = 0)

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold  

# StratifiedKFold의 random_state 옵션값을 특정 숫자(예: 0)로 고정합니다.
cross_validation = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

params = {'solver':['sgd'],
          'alpha':[0.0001, 0.001, 0.01, 0.1, 1],
          'max_iter':[1000],
          'activation':['logistic'],
          'hidden_layer_sizes': [(100,), (100,100)]
         }

grid_mlp = GridSearchCV(mlp, param_grid=params, scoring='accuracy', cv=cross_validation, n_jobs=-1)
grid_mlp.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_mlp.best_score_))
print("GridSearchCV best parameter:", (grid_mlp.best_params_)) 

end = time.time()
print(f"Runtime of the program is {end - start}")

GridSearchCV max accuracy:0.72149
GridSearchCV best parameter: {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (100, 100), 'max_iter': 1000, 'solver': 'sgd'}
Runtime of the program is 424.24806213378906


In [30]:
best_clf = grid_mlp.best_estimator_
pred = best_clf.predict(X_test2)
print("Accuracy on test set:{:.5f}".format(accuracy_score(y_test, pred)))

from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test,best_clf.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

Accuracy on test set:0.73104
ROC AUC on test set:0.78526
