# C.7.14 SVM 모델

In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('2014DC2_dummy_indicator_friendly.csv')   
df.shape

(12417, 193)

In [2]:
# Imputation indicator가 생성됨에 주의

data = df.drop(['EBizSystem2'], axis=1)   # 타겟변수를 제외한 입력변수를 data에 저장
target = df['EBizSystem2']                # 타겟변수만 target에 저장

# 50:50 data partition.
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split( 
    data, target, test_size=0.5, random_state=42)   # test_size=0.5임에 주의 

# interval 변수의 null value를 평균(mean)으로 impute 및 add_indicator 포함 
from sklearn.impute import SimpleImputer
imp= SimpleImputer(strategy = 'mean', add_indicator=True)  
X_train2= imp.fit_transform(X_train) 
X_test2= imp.fit_transform(X_test)   # X_train과 and X_test 둘 다 imputation 적용해야 함에 유의

print("X_train2 shape:", X_train2.shape) 
print("X_test2 shape:", X_test2.shape) 

X_train2 shape: (6208, 214)
X_test2 shape: (6209, 214)


In [3]:
# linear SVM

from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score

svm = SVC(kernel='linear', C=1, random_state=0, probability=True) 
                                                # probability=True 에 주의
model = svm.fit(X_train2, y_train) 
pred = model.predict(X_test2)  # 학습된 Classifier로 테스트 데이터셋 자료를 투입해서 타겟변수 예측값 생성
accuracy = accuracy_score(y_test, pred)

print ("Accuracy on training set: {:.5f}".format(svm.score( X_train2 , y_train ))) 
print ("Accuracy on test set: {:.5f}".format(accuracy_score(y_test, pred)))

Accuracy on training set: 0.74533
Accuracy on test set: 0.73635


In [6]:
# 참조 코딩
# rbf SVM

from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score

svm = SVC(kernel='rbf', C=1, gamma = 'auto', random_state=0, probability=True) 
                                                             # probability=True 에 주의
model = svm.fit(X_train2, y_train) 
pred = model.predict(X_test2)  # 학습된 Classifier로 테스트 데이터셋 자료를 투입해서 타겟변수 예측값 생성
accuracy = accuracy_score(y_test, pred)

print ("Accuracy on training set: {:.5f}".format(svm.score( X_train2 , y_train ))) 
print ("Accuracy on test set: {:.5f}".format(accuracy_score(y_test, pred)))

Accuracy on training set: 0.78028
Accuracy on test set: 0.71912


In [8]:
# SVM model (default 모델)
svm = SVC(kernel='rbf', C=1, gamma = 'auto', random_state=0, probability=True) 
                                                             # probability=True 에 주의

In [9]:
import time
start = time.time()

# SVM model (default 모델)
svm = SVC(kernel='rbf', C=1, gamma = 'auto', random_state=0, probability=True) 
                                                                # probability=True 에 주의.

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold  

# StratifiedKFold의 random_state 옵션값을 특정 숫자(예: 0)로 고정
cross_validation = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
params = {'kernel':['rbf','sigmoid'], 'C':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
         'gamma':['scale','auto']}

grid_svm = GridSearchCV(svm, param_grid=params, scoring='accuracy', cv=cross_validation, n_jobs=-1)
grid_svm.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_svm.best_score_))
print("GridSearchCV best parameter:", (grid_svm.best_params_))

end = time.time()
print(f"Runtime of the program is {end - start}")

GridSearchCV max accuracy:0.71988
GridSearchCV best parameter: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Runtime of the program is 666.4072041511536


In [5]:
best_clf = grid_svm.best_estimator_
pred = best_clf.predict(X_test2)
print("Accuracy on test set:{:.5f}".format(accuracy_score(y_test, pred)))

from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test,best_clf.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

Accuracy on test set:0.72492
ROC AUC on test set:0.78707


In [6]:
import time
start = time.time()

# SVM model (default 모델)
svm = SVC(kernel='rbf', C=1, gamma = 'auto', random_state=0, probability=True) 
                                                             # probability=True 에 주의
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold  

# StratifiedKFold의 random_state 옵션값을 특정 숫자(예: 0)로 고정
cross_validation = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
params = {'kernel':['linear'], 'C':[0.0001, 0.001, 0.01, 0.1, 1, 10]} 
                                # C=100을 주면 프로그램 구동이 끝없이 이어져서 부득이 해당값은 배제  

grid_svm2 = GridSearchCV(svm, param_grid=params, scoring='accuracy', cv=cross_validation, n_jobs=-1)
grid_svm2.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_svm2.best_score_))
print("GridSearchCV best parameter:", (grid_svm2.best_params_))

end = time.time()
print(f"Runtime of the program is {end - start}")

GridSearchCV max accuracy:0.72229
GridSearchCV best parameter: {'C': 0.01, 'kernel': 'linear'}
Runtime of the program is 590.6808817386627


In [7]:
best_clf = grid_svm2.best_estimator_
pred = best_clf.predict(X_test2)
print("Accuracy on test set:{:.5f}".format(accuracy_score(y_test, pred)))

from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test,best_clf.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

Accuracy on test set:0.72991
ROC AUC on test set:0.78932
