# 4.7.7 서포트 벡터 머신(SVM) 모델

In [1]:
import pandas as pd
import numpy as np
# Read the housing dataset from a CSV file located in the working directory.
csv_path = 'house-standard.csv'
df = pd.read_csv(csv_path)
df.shape  # last expression of the cell: displays (n_rows, n_columns)

(20495, 93)

In [2]:
from sklearn.model_selection import train_test_split

# Input features: every column except the target column 'VALP_B1'.
data = df.drop(columns=['VALP_B1'])
# Target variable only (single-column selection, so this is a pandas Series).
target = df['VALP_B1']

# 50:50 hold-out split -- note that test_size=0.5, so half of the rows are
# reserved for testing. random_state pins the shuffle for repeatable splits.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.5,
    random_state=42,
)

# Report the resulting partition sizes.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

X_train shape: (10247, 92)
X_test shape: (10248, 92)


In [3]:
# Baseline SVM model (RBF kernel, C=1).
# NOTE: gamma='auto' is set explicitly here; it is NOT scikit-learn's current
# default (the default is gamma='scale'), so this is a baseline rather than a
# pure "default" model.

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# probability=True is what makes predict_proba() available on the fitted model.
# Per the scikit-learn documentation it adds an internal 5-fold cross-validation
# to fit(), so training is noticeably slower than with probability=False.
svm = SVC(kernel='rbf', C=1, gamma='auto', random_state=0, probability=True)

# fit() returns the estimator itself, so `model` and `svm` are the same object.
model = svm.fit(X_train, y_train)

# Predict the target on the held-out test set with the fitted classifier.
pred = model.predict(X_test)
accuracy = accuracy_score(y_test, pred)

print ("SVM Accuracy on training set:{:.5f}".format(model.score(X_train, y_train)))
# Reuse the accuracy computed above instead of scoring the predictions twice.
print ("SVM Accuracy on test set:{:.5f}".format(accuracy))

SVM Accuracy on training set:0.75349
SVM Accuracy on test set:0.73488


In [4]:
# Fresh, unfitted SVM used as the base estimator for the grid search below.
# gamma='auto' is set explicitly (scikit-learn's own default is 'scale').
svm = SVC(
    kernel='rbf',
    C=1,
    gamma='auto',
    random_state=0,
    probability=True,   # note: required for predict_proba() on the fitted model
)

In [5]:
# Run the grid search (sigmoid kernel). Beware: this takes a long time.
# The grid below has 4 values of C x 2 values of gamma = 8 candidates, each
# evaluated with 3-fold cross-validation, plus one final refit on X_train.

import time

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; unlike time.time() it is unaffected by system clock changes.
start = time.perf_counter()

# Pin StratifiedKFold's random_state to a fixed number (e.g. 0) so that the
# shuffled folds are the same on every run.
cross_validation = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
params = {'kernel':['sigmoid'], 'C':[0.0001, 0.01, 1, 10],
          'gamma':['auto','scale']}

# `svm` is the base estimator created in the preceding cell; the values in
# `params` override its kernel, C and gamma for each candidate.
# n_jobs=-1 runs the candidate fits in parallel on all available cores.
grid_svm = GridSearchCV(svm, param_grid=params, scoring='accuracy',
                        cv=cross_validation, n_jobs=-1)
grid_svm.fit(X_train, y_train)

# best_score_ is the mean cross-validated accuracy of the best candidate.
print("GridSearchCV max accuracy:{:.5f}".format(grid_svm.best_score_))
print("GridSearchCV best parameter:", (grid_svm.best_params_))

end = time.perf_counter()
# Elapsed time is reported in seconds.
print(f"Runtime of the program is {end - start}")

GridSearchCV max accuracy:0.73641
GridSearchCV best parameter: {'C': 1, 'gamma': 'auto', 'kernel': 'sigmoid'}
Runtime of the program is 321.6062841415405


In [6]:
# Re-create a fresh, unfitted SVM as the base estimator for the next grid search.
# gamma='auto' is set explicitly (scikit-learn's own default is 'scale').
svm = SVC(
    kernel='rbf',
    C=1,
    gamma='auto',
    random_state=0,
    probability=True,   # note: required for predict_proba() on the fitted model
)

In [7]:
# Re-run the grid search, this time with the RBF kernel. Beware: this takes a
# long time. The grid below has 4 values of C x 2 values of gamma = 8
# candidates, each evaluated with 3-fold cross-validation, plus one final
# refit on X_train.

import time

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; unlike time.time() it is unaffected by system clock changes.
start = time.perf_counter()

# Pin StratifiedKFold's random_state to a fixed number (e.g. 0) so that the
# shuffled folds are the same on every run.
cross_validation = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
params = {'kernel':['rbf'], 'C':[0.0001, 0.01, 1, 10],
         'gamma':['auto','scale']}

# `svm` is the base estimator created in the preceding cell; the values in
# `params` override its kernel, C and gamma for each candidate.
# n_jobs=-1 runs the candidate fits in parallel on all available cores.
grid_svm = GridSearchCV(svm, param_grid=params, scoring='accuracy',
                        cv=cross_validation, n_jobs=-1)
grid_svm.fit(X_train, y_train)

# best_score_ is the mean cross-validated accuracy of the best candidate.
print("GridSearchCV max accuracy:{:.5f}".format(grid_svm.best_score_))
print("GridSearchCV best parameter:", (grid_svm.best_params_))

end = time.perf_counter()
# Elapsed time is reported in seconds.
print(f"Runtime of the program is {end - start}")

GridSearchCV max accuracy:0.76110
GridSearchCV best parameter: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Runtime of the program is 419.0511739253998


In [8]:
from sklearn.metrics import roc_auc_score

# Take the estimator that GridSearchCV refit with the best parameter
# combination and score it on the held-out test set.
# (accuracy_score was imported in an earlier cell.)
best_clf = grid_svm.best_estimator_
pred = best_clf.predict(X_test)
print("Accuracy on test set:{:.5f}".format(accuracy_score(y_test, pred)))

# predict_proba() returns one column per class; column index 1 is used as the
# score for ROC AUC. NOTE(review): this presumes a binary target whose second
# class (best_clf.classes_[1]) is the positive one -- confirm.
test_proba = best_clf.predict_proba(X_test)
ROC_AUC = roc_auc_score(y_test, test_proba[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

Accuracy on test set:0.76883
ROC AUC on test set:0.84251
