# 8. SVM(Support Vector Machine)
## 8.1 Linear SVM 


In [None]:
# 마진 실습
import numpy as np
import pandas as pd 

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score

# Load the breast-cancer dataset; return_X_y=True yields the (features, labels)
# pair directly instead of a Bunch object.
X, y = load_breast_cancer(return_X_y=True)

# Hold-out split, stratified on y so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

# Standardize the features: the scaler is fitted on the training part only and
# the same fitted scaler is then applied to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear-kernel SVC (LinearSVC() is the alternative estimator for this step).
# C is the hyperparameter to experiment with: try 0.1 and 0.01 below the
# default of 1, and 10 and 100 above it.
svc = SVC(kernel='linear', C=1, random_state=1)
svc.fit(X_train_scaled, y_train)

# Predictions for both parts of the split.
pred_train = svc.predict(X_train_scaled)
pred_test = svc.predict(X_test_scaled)

# The accuracies change with C; use C=1 as the baseline and vary it from there.
print('- svc train accuracy :', accuracy_score(y_train, pred_train))
print('- svc test  accuracy :', accuracy_score(y_test, pred_test))

@ 질문 
- 수업을 안들어서 그런데 return_X_y  뭘 의미?
- 전처리 시 판단법 
- 어떤 지점에 random_state를 넣어줘야하는가?
- SVM, SVC ? 
- svc = LinearSVC()  으로 처리해줘도 C값 하이퍼파라미터 줄수 있나?
- C 값이 의미하는게 정확히 뭐야? 오차의 허용범위? 
- 최적의 C값을 찾아주는건 없어? 그리드서치 


[결론]
- C 값이 커질수록 train값은 좋아지나 test 값이 줄어들면서 일반화가 어려워진다. ==> overfitting 
- C 값이 작을수록 train, test 둘다 성능이 안 좋아진다. ==> underfitting 


- 그렇다면 비선형데이터일때는 어떻게? --> 커널서포트 벡터 머신 

## 8.2 비선형데이터일때 
### 8.2.1 

# 비선형 데이터일때는?
- Kernel Support Vector Machine 
- Kernel Trick 
- RBF 


In [None]:
# RBF 예제 - recall_score, precision_score 
from sklearn.metrics import recall_score, precision_score 
from sklearn.metrics import roc_auc_score, average_precision_score

# RBF-kernel SVC example: fit on the scaled training data, then report
# accuracy, recall, precision, ROC-AUC and average precision for BOTH the train
# and the test part, so the two can be compared when tuning C and gamma.

# Split (repeated here so this cell only needs X and y from the earlier cell).
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)

# Preprocessing: fit the scaler on the training part only, reuse it for test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model: RBF kernel.
rbf_svc = SVC(kernel='rbf',  # 'rbf' is the default kernel
              C=1,  # smaller C -> softer margin, larger C -> harder margin
              gamma=0.1,  # smaller gamma -> underfitting, larger gamma -> overfitting
              probability=True,  # needed for predict_proba() (default: False)
              random_state=1
)
rbf_svc.fit(X_train_scaled, y_train)

# Predicted labels for both parts of the split.
pred_train = rbf_svc.predict(X_train_scaled)
pred_test = rbf_svc.predict(X_test_scaled)

# Experiment: keep C fixed at 1 and vary gamma: 1 -> 0.5 -> 0.1 -> 0.01.
print('- rbf_train accuracy :',accuracy_score(y_train, pred_train))
print('- rbf_test  accuracy :',accuracy_score(y_test, pred_test))

print('- rbf_train recall :',recall_score(y_train, pred_train))
print('- rbf_test  recall :',recall_score(y_test, pred_test))

# Precision is announced in the cell title and imported above, so report it too.
print('- rbf_train precision :',precision_score(y_train, pred_train))
print('- rbf_test  precision :',precision_score(y_test, pred_test))

# ROC-AUC / average precision need a score per sample, not a hard label.
# predict_proba() raises "predict_proba is not available when probability=False",
# and SVC defaults to probability=False, hence probability=True above.
# Column 1 holds the probability of the positive class.
pos_proba = rbf_svc.predict_proba(X_train_scaled)[:,1] 
print('- rbf_train roc_auc_score :',roc_auc_score(y_train, pos_proba))
print('- rbf_train average_precision_score :',average_precision_score(y_train, pos_proba))

# Same ranking metrics on the held-out part; a large train/test gap points to
# overfitting, while low values on both point to underfitting.
pos_proba_test = rbf_svc.predict_proba(X_test_scaled)[:,1]
print('- rbf_test  roc_auc_score :',roc_auc_score(y_test, pos_proba_test))
print('- rbf_test  average_precision_score :',average_precision_score(y_test, pos_proba_test))


@ 질문
- scaled를 넣어줘야 할 때는 언제? 
- proba를 왜 썼었지? roc_auc_score, average_precision_score를 쓰기위함 맞나?
- 근데 값을 보고 딱 오버피팅이다 언더피팅이다 감이 잘 안오는거 같아.. 
- 적절한 C와 감마 값을 구하기 위해서 필요한 것 ==> GridSearch 를 통해서 최적의 조합을 찾는다. 


# 최적의 조합을 찾아라 GridSearch
# GridSearch 

In [None]:
# GridSearch 예제 

# Grid search over kernel / C / gamma with 3-fold cross-validation, scored on
# both accuracy and ROC-AUC.
param = {
    'kernel' : ['rbf','linear'],
    'C' : [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma' : [0.001, 0.01, 0.1, 1, 10] # only matters for 'rbf'; has no effect for 'linear'
}
# NOTE(review): the 'roc_auc' scorer can presumably work from decision_function,
# so probability=True may not be required for scoring - confirm against the
# scikit-learn docs. It is kept so the refitted best estimator still offers
# predict_proba().
svc = SVC(random_state=1, probability= True)
gs_svc = GridSearchCV(svc,
                      param_grid = param,
                      scoring=['accuracy','roc_auc'],
                      # With several metrics the default refit=True raises
                      # "For multi-metric scoring, the parameter refit must be
                      # set to a scorer key ...". Naming the metric that decides
                      # the winner makes best_params_ / best_estimator_ available.
                      refit='accuracy',
                      cv = 3, 
                      n_jobs = -1 
)
gs_svc.fit(X_train_scaled, y_train)

print('- best_params_ : ', gs_svc.best_params_)

# Inspect every candidate as a table. With multi-metric scoring the columns are
# suffixed per metric, e.g. .sort_values('rank_test_accuracy').head(3)
pd.DataFrame(gs_svc.cv_results_)

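- refit 파라미터? : scoring에 여러 평가지표를 주면 어떤 지표를 기준으로 최적 모델을 다시 학습할지 refit에 지정해야 한다 (예: refit='accuracy'). 지정하지 않으면 아래와 같은 에러가 발생한다. 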

ValueError: For multi-metric scoring, the parameter refit must be set to a scorer key or a callable to refit an estimator with the best parameter setting on the whole data and make the best_* attributes available for that metric. If this is not needed, refit should be set to False explicitly. True was passed.

그리드 서치 자체에서 크로스 밸리데이션도 하기 때문에, 별도로 크로스 밸리데이션을 하지 않아도 된다는 이야기인가? 

In [None]:
# cross_val_score() 
from sklearn.model_selection import cross_val_score

# Cross-validate one fixed (C, gamma) setting on the scaled training data:
# 3 folds, accuracy as the metric. `result` holds one score per fold.
svc2 = SVC(C=10, gamma=0.01)

result = cross_val_score(svc2, X_train_scaled, y_train, scoring='accuracy', cv=3)

print('- result :', result)
print('- mean   :', result.mean())

# 예제  todo 
## iris DataSet으로 분류 --> 다시 확인 
- 다중 클래스 분류
- model : GridSearch
- 평가지표 : accuracy 
- 파이프라인

In [None]:
import numpy as np
import pandas as pd 
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Iris data as a (features, labels) pair.
X, y = load_iris(return_X_y=True)

# Stratified hold-out split so each part keeps the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

# Fit the scaler on the training part only, then apply it to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



--
# References      
> - 김성환, 엔코아 플레이데이터 (2021, 인공지능 개발자 3기 과정)