# Multiclass SVM 구현

In [103]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from mlxtend.classifier import EnsembleVoteClassifier

# Load the iris dataset through seaborn.
iris =  sns.load_dataset('iris') 
X= iris.iloc[:,:4] # features to learn from: the first four columns
y = iris.iloc[:,-1] # target: the last column
print(y)

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object


## data 전처리

### class one-hot encoding
&#x2022; setosa : 0 → [1, 0, 0]
<br>
&#x2022; versicolor : 1 → [0, 1, 0]
<br>
&#x2022; virginica : 2 → [0, 0, 1]

In [104]:
# One-hot encode the labels: one row per sample, one column per class.
# dtype=int is requested explicitly so the matrix holds 0/1 integers on every
# pandas version (recent releases default to booleans when dtype is omitted).
encoding_y = pd.get_dummies(y, dtype=int).values
print(encoding_y)

[[1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 

### data set 분리

In [105]:
# Split the features and one-hot labels into train and test sets
# (20% held out for testing, fixed random_state for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, encoding_y, test_size=0.2, random_state=48)

### data 정규화

In [106]:
# Feature standardization
def standardization(train, test):
    """Scale ``train`` and ``test`` with a StandardScaler fitted on ``train``.

    The scaler is fitted on the training data only and the same fitted
    transform is then applied to both sets, so the test data never
    influences the scaling parameters.

    Args:
        train: Training features.
        test: Test features.

    Returns:
        A ``(train_scaled, test_scaled)`` tuple.
    """
    scaler = StandardScaler().fit(train)
    return scaler.transform(train), scaler.transform(test)


X_train, X_test = standardization(X_train, X_test)

In [107]:
# Display the shape of the standardized training features.
X_train.shape

(120, 4)

In [108]:
# Display the shape of the standardized test features.
X_test.shape

(30, 4)

## 모델 구현

### one vs rest
클래스가 N개 있으면 모든 Class에 대해 1:N-1로 binary분류하여 이 클래스가 맞는지 아닌지를 판단하고 투표로 결정합니다. 이 때 N개의 분류기가 필요합니다.

In [188]:
class model:
    """One-vs-rest SVM: trains one binary ``SVC`` per one-hot target column."""

    def __init__(self, kernel, C, gamma):
        """Store the hyper-parameters that are forwarded to every ``SVC``."""
        self.kernel, self.gamma, self.C = kernel, gamma, C

    def fit(self, X_train, y_train):
        """Fit one binary classifier per class.

        Args:
            X_train: Training features.
            y_train: 2-D one-hot label array; column ``i`` is used as the
                binary target for the ``i``-th classifier.

        Returns:
            List of fitted ``SVC`` instances, ordered like the columns of
            ``y_train``. The list is returned, not stored on the instance.
        """
        n_classes = y_train.shape[1]
        return [
            SVC(
                kernel=self.kernel,
                C=self.C,
                gamma=self.gamma,
                random_state=42,
            ).fit(X_train, y_train[:, col])
            for col in range(n_classes)
        ]

### Soft vs Hard voting
<ul>
    <li><Strong>Hard voting</Strong>: 각각의 모델들이 결과를 예측하면 단순하게 가장 많은 표를 얻은 결과를 선택하는 것입니다.</li>
    <li><Strong>Soft voting</Strong>: 각 class별로 모델들이 예측한 probability를 합산해서 가장 높은 class를 선택하는 것입니다.</li>
</ul>

In [189]:
# Soft-voting style aggregation of the one-vs-rest classifiers.
def soft_voting(clfs, X_test, y_test=None):
    """Combine one-vs-rest decision values into a per-class score matrix.

    Classifier ``clfs[j]`` scores "class j vs. the rest". Its signed decision
    value ``d_j`` is credited to column ``j`` and ``1 - d_j`` to every other
    column, so for ``N`` classifiers::

        score[:, j] = d_j + (N - 1) - sum(d_k for k != j)

    The values come from ``decision_function`` and are therefore margins,
    not probabilities. Since each ``d_j`` enters its own column positively
    and every other column negatively, ``np.argmax(score, axis=-1)`` selects
    the classifier with the largest decision value.

    Args:
        clfs: Non-empty iterable of fitted binary classifiers exposing
            ``decision_function(X)`` that returns one value per sample.
        X_test: Samples to score; passed unchanged to every classifier.
        y_test: Unused. Accepted only so existing three-argument calls keep
            working; the output shape is derived from the classifiers'
            outputs, so labels are not needed to make predictions.

    Returns:
        ``numpy.ndarray`` of shape ``(n_samples, n_classifiers)``.

    Raises:
        ValueError: If ``clfs`` is empty.
    """
    clfs = list(clfs)
    if len(clfs) == 0:
        raise ValueError("clfs must contain at least one fitted classifier")

    # One column of decision values per classifier: shape (n_samples, N).
    margins = np.column_stack(
        [np.asarray(clf.decision_function(X_test), dtype=float) for clf in clfs]
    )
    n_classes = margins.shape[1]

    # For column j: the summed decision values of all *other* classifiers.
    others = margins.sum(axis=1, keepdims=True) - margins
    return margins + (n_classes - 1) - others

In [179]:
# Convert the one-hot test labels back to integer class indices
# (the column position of the largest entry in each row).
encoding_y_test = np.argmax(y_test, axis=-1)

### 사이킷런 SVM parameter
<ul>
    <li><Strong>Kernel</Strong>: Decision Boundary의 모양 결정(Linear, Polynomial, Sigmoid, RBF등)</li>
    <li><Strong>C</Strong>: Decision Boundary 일반화 vs Training data의 정확한 분류 사이의 trade-off 조정</li>
    <li><Strong>Gamma</Strong>: 결정경계의 굴곡에 영향을 주는 데이터의 범위를 정의</li>
</ul>

In [181]:
# Hyper-parameter grid; every (kernel, C, gamma) combination is evaluated by the search loop.
# NOTE(review): gamma is presumably ignored by SVC for kernel='linear', which would make
# those combinations redundant — confirm against the scikit-learn SVC documentation.
parameters = {'kernel': ['rbf','linear'], 'C': [0.001, 0.01, 0.1, 1, 10, 100],
             'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

In [190]:
# Exhaustive grid search: fit the one-vs-rest model for each (kernel, C, gamma)
# combination and record its accuracy as (accuracy, kernel, C, gamma).
# NOTE(review): each configuration is scored on X_test / encoding_y_test, so the
# best accuracy found here is selected on the test split itself — consider a
# separate validation split or cross-validation for model selection.
results = []
for kernel, C, gamma in (
    (k, c, g)
    for k in parameters['kernel']
    for c in parameters['C']
    for g in parameters['gamma']
):
    SVM = model(kernel, C, gamma)
    clfs = SVM.fit(X_train, y_train)

    # Aggregate the per-class scores and take the highest-scoring class.
    prob = soft_voting(clfs, X_test, y_test)
    predict = np.argmax(prob, axis=-1)

    acc_score = accuracy_score(encoding_y_test, predict)
    results.append((acc_score, kernel, C, gamma))

In [191]:
# Sort in place, ascending. Tuples compare element by element, so accuracy is
# the primary key and kernel, C, gamma only break ties.
results.sort()

In [194]:
# Last entry of the ascending-sorted list: the highest-ranked
# (accuracy, kernel, C, gamma) tuple.
results[-1]

(1.0, 'rbf', 10, 0.1)

### Grid Search 방식을 이용해서 최적의 파라미터를 도출해보았습니다. 
kernel='rbf', C=10, gamma=0.1일 때 테스트 세트에 대한 정확도(Acc)는 1.0입니다. 