In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('2014DC2_dummy_indicator_friendly.csv')   
df.shape

(12417, 193)

In [2]:
# Imputation indicator가 생성됨에 주의.

data = df.drop(['EBizSystem2'], axis=1)   # 타겟변수를 제외한 변수만 data 데이터프레임에 저장.
target = df['EBizSystem2']                # 타겟변수만 target 데이터프레임에 저장.

# 50:50 data partition.
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split( 
    data, target, test_size=0.5, random_state=42)   # test_size=0.5임에 주의. 

# interval 변수의 null value를 평균(mean)으로 impute 및 add_indicator 포함 
from sklearn.impute import SimpleImputer
imp= SimpleImputer(strategy = 'mean', add_indicator=True)  
X_train2= imp.fit_transform(X_train) 
X_test2= imp.fit_transform(X_test)   # X_train과 and X_test 둘 다 imputation 적용해야 함에 유의.

print("X_train2 shape:", X_train2.shape) 
print("X_test2 shape:", X_test2.shape) 

X_train2 shape: (6208, 214)
X_test2 shape: (6209, 214)


In [3]:
# linear SVM.

from sklearn.svm import SVC 
svm = SVC(kernel='linear', C=1).fit(X_train2, y_train) 

print ("Accuracy on training set: {:.5f}".format(svm.score( X_train2 , y_train ))) 
print ("Accuracy on test set: {:.5f}".format(svm.score( X_test2 , y_test ))) 

Accuracy on training set: 0.74533
Accuracy on test set: 0.73635


In [6]:
# SVM model (default 모델)

from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score
clf_svm = SVC(kernel='rbf', C=1, gamma = 'auto', random_state=0, probability=True) 
                                                                # probability=True 에 주의.
clf_svm.fit(X_train2, y_train)
pred = clf_svm.predict(X_test2)  # 학습된 Classifier로 테스트 데이터셋 자료이용해서 타겟변수 예측값 생성.
accuracy = accuracy_score(y_test, pred)

In [9]:
import time
start = time.time()

from sklearn.model_selection import GridSearchCV
params = {'kernel':['rbf','sigmoid'], 'C':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],\
         'gamma':['scale','auto']}

grid_svm = GridSearchCV(clf_svm, param_grid=params, scoring='accuracy', cv=3, n_jobs=-1)
grid_svm.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_svm.best_score_))
print("GridSearchCV best parameter:", (grid_svm.best_params_))

end = time.time()
print(f"Runtime of the program is {end - start}")

GridSearchCV max accuracy:0.71714
GridSearchCV best parameter: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Runtime of the program is 531.2303531169891


In [10]:
best_clf = grid_svm.best_estimator_
pred = best_clf.predict(X_test2)
print("Accuracy on test set:{:.5f}".format(accuracy_score(y_test, pred)))

Accuracy on test set:0.72492


In [7]:
import time
start = time.time()

from sklearn.model_selection import GridSearchCV
params = {'kernel':['linear'], 'C':[0.0001, 0.001, 0.01, 0.1, 1, 10]} 
                                # C=100을 주면 프로그램 구동이 끝없이 이어져서 부득이 해당값은 배제.  

grid_svm2 = GridSearchCV(clf_svm, param_grid=params, scoring='accuracy', cv=3, n_jobs=-1)
grid_svm2.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_svm2.best_score_))
print("GridSearchCV best parameter:", (grid_svm2.best_params_))

end = time.time()
print(f"Runtime of the program is {end - start}")

GridSearchCV max accuracy:0.71972
GridSearchCV best parameter: {'C': 0.1, 'kernel': 'linear'}
Runtime of the program is 400.68917083740234


In [8]:
best_clf = grid_svm2.best_estimator_
pred = best_clf.predict(X_test2)
print("Accuracy on test set:{:.5f}".format(accuracy_score(y_test, pred)))

from sklearn.metrics import roc_auc_score
ROC_AUC = roc_auc_score(y_test,best_clf.predict_proba(X_test2)[:, 1])
print("ROC AUC on test set:{:.5f}".format(ROC_AUC))

Accuracy on test set:0.73555
ROC AUC on test set:0.79058


In [8]:
print('회귀계수', best_clf.coef_)

회귀계수 [[ 4.91855948e-02  1.62280316e-02  1.63918942e-03  2.81831997e-02
  -1.10194365e-02 -1.64127953e-02  2.37904968e-02 -4.80028045e-02
  -3.59409264e-03  1.40587610e-02 -2.29128426e-02 -1.19322843e-02
  -4.09133460e-02  6.45977470e-02  1.24617920e-02 -1.63342892e-02
   1.33456189e-01  3.22110168e-02 -1.27284969e-02  6.06922117e-02
   1.35846124e-03  7.33931205e-04 -1.08070904e-02  5.25112262e-02
   4.14153688e-03  2.71502072e-03  4.42642847e-04 -9.43377489e-03
  -7.66832132e-03  1.60894072e-02  1.08302641e-02  1.63286029e-02
  -6.51998889e-03  1.18677162e-01  5.95108393e-02 -1.05762507e-02
   2.73058668e-01 -2.10716639e-02  2.93385046e-01 -2.91641201e-01
   9.97692520e-02  2.58649540e-01  2.22729812e-01 -1.23207942e+00
  -1.02115200e-02 -2.36369496e-02  3.80489891e-02 -4.16475187e-01
   1.33680015e-01  2.38777147e-01 -3.74575381e-02  9.75295529e-02
  -8.65182682e-02  1.73038378e-01  1.42777466e-01  1.01667441e-01
  -2.09988806e-01  7.87863119e-02  9.33365325e-02 -1.10812210e-01
  -1.

In [4]:
### 단순 참조. 커널이 linear일 때 c=100을 주면 프로그램 구동이 끝없이 이어진다.
### 아래 코드는 실행시키지 말 것.

import time
start = time.time()

from sklearn.model_selection import GridSearchCV
params = {'kernel':['linear'], 'C':[100]}

grid_svm3 = GridSearchCV(clf_svm, param_grid=params, scoring='accuracy', cv=3, n_jobs=-1)
grid_svm3.fit(X_train2, y_train)

print("GridSearchCV max accuracy:{:.5f}".format(grid_svm3.best_score_))
print("GridSearchCV best parameter:", (grid_svm3.best_params_))

end = time.time()
print(f"Runtime of the program is {end - start}")

KeyboardInterrupt: 