In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('Lasso_selected_ch4.csv')   
df.shape

(12417, 133)

In [2]:
# SimpleInputer의 add_indicator 옵션을 제거한 버전.

data = df.drop(['EBizSystem2'], axis=1)   # 타겟변수를 제외한 입력변수를 data에 저장.
target = df['EBizSystem2']                # 타겟변수만 target에 저장.

# 50:50 data partition.
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split( 
    data, target, test_size=0.5, random_state=42)   # test_size=0.5임에 주의. 

# interval 변수의 null value를 평균(mean)으로 impute. 
from sklearn.impute import SimpleImputer
imp= SimpleImputer(strategy = 'mean')  
X_train2= imp.fit_transform(X_train) 
X_test2= imp.fit_transform(X_test)   # X_train과 and X_test 둘 다 imputation 적용해야 함에 유의.

print("X_train2 shape:", X_train2.shape) 
print("X_test2 shape:", X_test2.shape) 

X_train2 shape: (6208, 132)
X_test2 shape: (6209, 132)


In [3]:
# SVM model (default 모델)

from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score
clf_svm = SVC(kernel='rbf', C=1, gamma = 'auto', random_state=0, probability=True)
clf_svm.fit(X_train2, y_train)                                   # probability=True 에 주의.
pred = clf_svm.predict(X_test2)  # 학습된 Classifier로 테스트 데이터셋 자료이용해서 타겟변수 예측값 생성.
accuracy = accuracy_score(y_test, pred)

In [4]:
import time
start = time.time()

from sklearn.model_selection import GridSearchCV
params = {'kernel':['rbf','sigmoid'], 'C':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],\
         'gamma':['scale','auto']}

grid_svm = GridSearchCV(clf_svm, param_grid=params, scoring='accuracy', cv=3, n_jobs=-1)
grid_svm.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_svm.best_score_))
print("GridSearchCV best parameter:", (grid_svm.best_params_))

end = time.time()
print(f"Runtime of the program is {end - start}")

GridSearchCV max accuracy:0.71811
GridSearchCV best parameter: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Runtime of the program is 374.66829919815063


In [5]:
best_clf = grid_svm.best_estimator_
pred = best_clf.predict(X_test2)
print("Accuracy on test set:{:.5f}".format(accuracy_score(y_test, pred)))

from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test,best_clf.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

Accuracy on test set:0.72250
ROC AUC on test set:0.78587


In [None]:
from sklearn.model_selection import GridSearchCV
params = {'kernel':['linear'], 'C':[0.0001, 0.001, 0.01, 0.1, 1, 10]} 
                                # C=100을 주면 프로그램 구동이 끝없이 이어져서 부득이 해당값은 배제.  

grid_svm2 = GridSearchCV(clf_svm, param_grid=params, scoring='accuracy', cv=3, n_jobs=-1)
grid_svm2.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_svm2.best_score_))
print("GridSearchCV best parameter:", (grid_svm2.best_params_))

In [4]:
import time
start = time.time()

from sklearn.model_selection import GridSearchCV
params = {'kernel':['linear'], 'C':[0.0001, 0.001, 0.01, 0.1, 1, 10]} 
                                # C=100을 주면 프로그램 구동이 끝없이 이어져서 부득이 해당값은 배제.  

grid_svm2 = GridSearchCV(clf_svm, param_grid=params, scoring='accuracy', cv=3, n_jobs=-1)
grid_svm2.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_svm2.best_score_))
print("GridSearchCV best parameter:", (grid_svm2.best_params_))

end = time.time()
print(f"Runtime of the program is {end - start}")

GridSearchCV max accuracy:0.72278
GridSearchCV best parameter: {'C': 0.1, 'kernel': 'linear'}
Runtime of the program is 450.0407826900482


In [None]:
best_clf = grid_svm2.best_estimator_
pred = best_clf.predict(X_test2)
print("Accuracy on test set:{:.5f}".format(accuracy_score(y_test, pred)))

from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test,best_clf.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))