# **SVM**

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris, make_moons
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC


# Iris dataset: place the feature matrix and the class label side by side
# in one DataFrame (the label becomes the last column, 'target').
iris = load_iris()
df = pd.DataFrame(
    data=np.column_stack((iris.data, iris.target)),
    columns=['sepal length', 'sepal width', 'petal length', 'petal width', 'target'],
)

df

Unnamed: 0,sepal length,sepal width,petal length,petal width,target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0


In [2]:
# Features are every column except the label; the label is the 'target' column.
X = df.drop(columns='target')
y = df['target']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)
print(f'Train : {len(X_train)}개 | Test : {len(X_test)}개')

Train : 105개 | Test : 45개


In [3]:
# SVC : Support Vector Classifier
# Train a linear-kernel SVC on the raw (unscaled) training features.

model = SVC(kernel='linear')
# `fit` is left as the last expression, so the fitted estimator is echoed as the cell output.
model.fit(X_train, y_train)

SVC(kernel='linear')

In [4]:
# Predict labels for both splits with the fitted model.
pred_train, pred_test = model.predict(X_train), model.predict(X_test)

# Report accuracy on the training data and on the test data.
print(f'Training accuracy : {accuracy_score(y_train, pred_train):.4f} | Test accuracy : {accuracy_score(y_test, pred_test):.4f}')

Training accuracy : 0.9905 | Test accuracy : 0.9556


## **정규화**

In [5]:
# Display the original training features, before any scaling is applied.
X_train

Unnamed: 0,sepal length,sepal width,petal length,petal width
114,5.8,2.8,5.1,2.4
136,6.3,3.4,5.6,2.4
53,5.5,2.3,4.0,1.3
19,5.1,3.8,1.5,0.3
38,4.4,3.0,1.3,0.2
...,...,...,...,...
17,5.1,3.5,1.4,0.3
98,5.1,2.5,3.0,1.1
66,5.6,3.0,4.5,1.5
126,6.2,2.8,4.8,1.8


In [6]:
# Feature scaling with StandardScaler: rescale each feature to mean 0 and variance 1.
# The scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Show the scaled values as a DataFrame. Column labels are taken from X_train
# rather than re-typed, so they always match the features that were scaled.
pd.DataFrame(X_train_scaled, columns=X_train.columns)

Unnamed: 0,sepal length,sepal width,petal length,petal width
0,-0.087756,-0.615256,0.740209,1.587111
1,0.543367,0.858551,1.033278,1.587111
2,-0.466430,-1.843427,0.095457,0.091913
3,-0.971328,1.841088,-1.369889,-1.267359
4,-1.854900,-0.123987,-1.487117,-1.403286
...,...,...,...,...
100,-0.971328,1.104185,-1.428503,-1.267359
101,-0.971328,-1.352159,-0.490682,-0.179942
102,-0.340205,-0.123987,0.388526,0.363767
103,0.417142,-0.615256,0.564367,0.771548


### 중요!
- Test dataset은 train dataset을 기준으로 학습된 정규화 모델을 사용해야 함
- 만약 test data로 새로운 scaling 기준을 만들게 되면, train data와 test data의 scaling 기준이 서로 달라지게 되어 올바른 예측결과를 도출하지 못함
- 즉, 머신러닝 모델은 train data를 기반으로 학습되기 때문에 반드시 test data도 train data의 scaling 기준을 따라야 함

In [7]:
# Reuse the scaler that was fitted on the training data as-is:
# the test data is only transformed here, never used to refit the scaler.

X_test_scaled = scaler.transform(X_test)

In [8]:
# Train a fresh linear-kernel SVC on the standardized training features.
model = SVC(kernel='linear').fit(X_train_scaled, y_train)

# Predict on the scaled versions of both splits.
pred_train, pred_test = model.predict(X_train_scaled), model.predict(X_test_scaled)

# Accuracy on the training and test data after standard scaling.
print('After Standard scaling')
print(f'Training accuracy : {accuracy_score(y_train, pred_train):.4f} | Test accuracy : {accuracy_score(y_test, pred_test):.4f}')

After Standard scaling
Training accuracy : 0.9905 | Test accuracy : 0.9778


### 정규화 방법들
- MinMax scaling : 최댓값을 1로, 최솟값을 0으로 하고 모든 값들을 그 사이로 mapping 시키는 방법임
- 따라서 이상치에 매우 취약함 (이상치가 존재하면 나머지 정상 값들이 매우 좁은 범위로 압축되기 때문)
- Standard scaling : 데이터로부터 평균과 분산을 계산하여 평균이 0, 분산이 1이 되도록 변환하는 방법임. min-max보다는 이상치에 덜 취약하지만, 너무 큰 이상치가 있으면 마찬가지로 균형 잡힌 변환을 보장하지 못함
- RobustScaler : 평균과 분산 대신 중앙값(median)과 사분위 범위(IQR)를 사용하여 이상치의 영향을 최소화한 기법 (자세한 내용은 scikit-learn 문서를 참고)

In [9]:
# Min-max scaling: rescale so that the minimum becomes 0 and the maximum becomes 1.
# The scaler is fitted on the training split; the test split is only transformed with it.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Show the scaled values as a DataFrame. Column labels are taken from X_train
# rather than re-typed, so they always match the features that were scaled.
pd.DataFrame(X_train_scaled, columns=X_train.columns)

Unnamed: 0,sepal length,sepal width,petal length,petal width
0,0.424242,0.272727,0.694915,0.958333
1,0.575758,0.545455,0.779661,0.958333
2,0.333333,0.045455,0.508475,0.500000
3,0.212121,0.727273,0.084746,0.083333
4,0.000000,0.363636,0.050847,0.041667
...,...,...,...,...
100,0.212121,0.590909,0.067797,0.083333
101,0.212121,0.136364,0.338983,0.416667
102,0.363636,0.363636,0.593220,0.583333
103,0.545455,0.272727,0.644068,0.708333


In [10]:
# Train a fresh linear-kernel SVC on the min-max scaled training features.
model = SVC(kernel='linear').fit(X_train_scaled, y_train)

# Predict on the scaled versions of both splits.
pred_train, pred_test = model.predict(X_train_scaled), model.predict(X_test_scaled)

# Accuracy on the training and test data after min-max scaling.
print('After MinMax scaling')
print(f'Training accuracy : {accuracy_score(y_train, pred_train):.4f} | Test accuracy : {accuracy_score(y_test, pred_test):.4f}')

After MinMax scaling
Training accuracy : 0.9810 | Test accuracy : 0.9111


# **퀴즈 1**

*   make_moons 데이터에 대하여 SVM 모델을 생성하고 학습된 모델의 정확도를 도출하시오.


In [11]:
# Quiz data: generate the make_moons toy dataset (100 samples, noise 0.25, fixed seed).
# `make_moons` is imported in the notebook's top import cell together with the other imports.
# NOTE: X, y and the train/test names below rebind the Iris variables used earlier in the notebook.
X, y = make_moons(n_samples=100, noise=0.25, random_state=0)

# No test_size is given, so train_test_split uses its default split ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [12]:
# Train a linear-kernel SVC on the make_moons training split.
# NOTE(review): a linear decision boundary may underfit this dataset; a non-linear
# kernel (e.g. 'rbf') is probably worth comparing — confirm against the quiz's intent.
model = SVC(kernel='linear')
# `fit` is left as the last expression, so the fitted estimator is echoed as the cell output.
model.fit(X_train, y_train)

SVC(kernel='linear')

In [13]:
# Predict labels for both make_moons splits with the fitted model.
pred_train, pred_test = model.predict(X_train), model.predict(X_test)

# Report accuracy on the training data and on the test data.
print(f'Training accuracy : {accuracy_score(y_train, pred_train):.4f} | Test accuracy : {accuracy_score(y_test, pred_test):.4f}')

Training accuracy : 0.8667 | Test accuracy : 0.8400
