## SVM (1)
###### 분류, 회귀, 이상치 감지에 사용되는 지도학습 알고리즘

#### #01. 패키지 참조

In [1]:
from pandas import DataFrame, read_excel
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate

#### #02. 데이터 가져오기
###### 569개의 row, 31개의 column, 종속변수는 [0, 1] 로 구분되어 있다.
###### 30개의 독립변수를 통해 유방암 진단을 결정하는 데이터셋

In [2]:
origin = read_excel('https://data.hossam.kr/G02/breast_cancer.xlsx')
origin.head()


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


#### #03. 데이터 전처리

##### 전처리 과정에서 고민해 봐야 하는 단계
1. 결측치, 이상치 감지
2. 파생변수 생성여부 결정 및 수행

##### 독립변수, 종속변수 분리

In [3]:
x = origin.drop('target', axis=1)
y = origin['target']
x.shape, y.shape

((569, 30), (569,))

##### 이 단계에서 고려해야 하는 단계
###### 1.  표준화 적용 여부 (가급적 수행, before/after 결과 비교 권장)
###### 2. 2. 훈련 데이터와 검증 데이터 분할 (지도학습은 거의 필수라고 봐야 함)

In [4]:
scaler = StandardScaler()
std_x = scaler.fit_transform(x)
std_x[:5]

array([[ 1.09706398e+00, -2.07333501e+00,  1.26993369e+00,
         9.84374905e-01,  1.56846633e+00,  3.28351467e+00,
         2.65287398e+00,  2.53247522e+00,  2.21751501e+00,
         2.25574689e+00,  2.48973393e+00, -5.65265059e-01,
         2.83303087e+00,  2.48757756e+00, -2.14001647e-01,
         1.31686157e+00,  7.24026158e-01,  6.60819941e-01,
         1.14875667e+00,  9.07083081e-01,  1.88668963e+00,
        -1.35929347e+00,  2.30360062e+00,  2.00123749e+00,
         1.30768627e+00,  2.61666502e+00,  2.10952635e+00,
         2.29607613e+00,  2.75062224e+00,  1.93701461e+00],
       [ 1.82982061e+00, -3.53632408e-01,  1.68595471e+00,
         1.90870825e+00, -8.26962447e-01, -4.87071673e-01,
        -2.38458552e-02,  5.48144156e-01,  1.39236330e-03,
        -8.68652457e-01,  4.99254601e-01, -8.76243603e-01,
         2.63326966e-01,  7.42401948e-01, -6.05350847e-01,
        -6.92926270e-01, -4.40780058e-01,  2.60162067e-01,
        -8.05450380e-01, -9.94437403e-02,  1.80592744e+

#### #04. 학습 모델 구현 
##### 이 단계에서 표준화 적용 전후를 비교

In [5]:
# 표준화 적용 전
svc = SVC(random_state=777)
scores = cross_val_score(svc, x, y, cv=5)
print(scores)
print("교차검증 평균: ", scores.mean())
score_df = DataFrame(cross_validate(svc, x, y, cv=5))
score_df

[0.85087719 0.89473684 0.92982456 0.94736842 0.9380531 ]
교차검증 평균:  0.9121720229777983


Unnamed: 0,fit_time,score_time,test_score
0,0.006,0.002025,0.850877
1,0.005,0.002999,0.894737
2,0.006013,0.001961,0.929825
3,0.003999,0.003,0.947368
4,0.003975,0.004027,0.938053


In [6]:
# 표준화 적용 후
svc = SVC(random_state=777)
scores = cross_val_score(svc, std_x, y, cv=5)
print(scores)
print("교차검증 평균: ", scores.mean())
score_df = DataFrame(cross_validate(svc, std_x, y, cv=5))
score_df

[0.97368421 0.95614035 1.         0.96491228 0.97345133]
교차검증 평균:  0.9736376339077782


Unnamed: 0,fit_time,score_time,test_score
0,0.002999,0.002002,0.973684
1,0.002998,0.002,0.95614
2,0.004009,0.001992,1.0
3,0.003,0.001001,0.964912
4,0.002999,0.002,0.973451


#### 최적 파라미터 찾기

In [7]:
svc = SVC(random_state=777)
params = {
'C': [0.001, 0.01, 0.1, 1, 10, 100],
'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
}
grid_svc = GridSearchCV(svc, param_grid=params, cv=5)
grid_svc.fit(std_x, y)
print(grid_svc.best_params_)
result_df = DataFrame(grid_svc.cv_results_['params'])
result_df['mean_test_score'] = grid_svc.cv_results_['mean_test_score']
result_df.sort_values(by='mean_test_score', ascending=False)

{'C': 10, 'kernel': 'rbf'}


Unnamed: 0,C,kernel,mean_test_score
17,10.0,rbf,0.977177
8,0.1,linear,0.975408
13,1.0,rbf,0.973638
12,1.0,linear,0.970144
4,0.01,linear,0.96839
16,10.0,linear,0.966651
20,100.0,linear,0.959649
14,1.0,sigmoid,0.959603
19,10.0,poly,0.957864
23,100.0,poly,0.957833
