# Multi Class SVM

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Load the iris dataset that ships with seaborn.
iris = sns.load_dataset('iris')

# Features: the first four columns. Target: the last column (species).
feature_columns = iris.columns[:4]
target_column = iris.columns[-1]
X = iris[feature_columns]
y = iris[target_column]
print(y)

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object


In [2]:
# Standardize the features with a StandardScaler fitted on X.
# NOTE(review): the scaler is fitted on the full dataset here, while the
# train/test split happens in a later cell, so test rows contribute to the
# scaling statistics. Consider fitting on the training rows only.
scal = StandardScaler()
scal.fit(X)
X = scal.transform(X)

### One vs Rest
* 하나와 나머지, 플러스가 나오면 그 하나에 속하는 것이고, 마이너스가 나오면 속하지 않는 것이다 
* 만약 세 분류기의 부호가 모두 같다면(동점이면) decision function 값을 비교해서, 값이 가장 큰 클래스(모두 마이너스인 경우 hyperplane에 가장 가까운 클래스)로 분류하겠다 

In [3]:
# One-vs-rest: one binary RBF-kernel SVM per class, all three created with
# the same hyperparameters (gamma=5, C=100).
svm_1, svm_2, svm_3 = (SVC(kernel='rbf', gamma=5, C=100) for _ in range(3))

In [4]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=48,
    test_size=0.2,
)

In [5]:
# One-hot encode the training labels: the result has one indicator column per
# species, and each column is later used as the binary target of one SVM.
y_train = pd.get_dummies(data=y_train)

In [6]:
# Preview the first five rows of the one-hot encoded training labels.
y_train.head(n=5)

Unnamed: 0,setosa,versicolor,virginica
110,0,0,1
69,0,1,0
148,0,0,1
39,1,0,0
53,0,1,0


In [7]:
# Fit one binary classifier per one-hot column:
# column 0 = setosa, column 1 = versicolor, column 2 = virginica.
ovr_models = (svm_1, svm_2, svm_3)
for column_index, model in enumerate(ovr_models):
    model.fit(X_train, y_train.iloc[:, column_index])

# decision_function returns a signed score for each test sample relative to
# the model's separating hyperplane (positive = predicted to be in the class).
for model in ovr_models:
    print(model.decision_function(X_test))

[-1.12470845 -0.86326953 -0.65281099 -0.50248821 -0.76284323 -0.87465573
  1.07709345 -0.99281647  0.47441336 -0.99842743 -0.83989348  0.15633457
  0.32871788 -0.97965464 -0.72385083 -0.92638376  1.28322481 -0.56240455
 -0.72719892 -0.99509775  0.43166724 -0.96451557 -0.82991366 -1.03020581
 -0.75166835  1.13461335  0.39943974 -1.04194106 -0.93376548 -1.06133798]
[ 1.36054602  0.62859488  0.0580828  -0.36987093 -0.25686625 -0.79229874
 -1.03195247 -0.87132049 -0.77957234  0.99327762 -0.73309581 -0.64607768
 -0.71843897 -0.80882458  0.47748663  0.69953462 -1.11879121 -0.1870504
 -0.24458236 -0.99337352 -0.76149141 -0.96812018  0.11310267  1.06892719
 -0.56749692 -1.05639624 -0.74817515 -1.0537858  -0.89721836  1.17126024]
[-1.26277651 -0.77964936 -0.40539105 -0.13070947  0.02643314  0.57365649
 -1.04443118  0.88027844 -0.69605517 -0.99074328  0.52442024 -0.51171892
 -0.61157919  0.78148257 -0.75517707 -0.76905925 -1.16380018 -0.25333938
 -0.0251979   0.98843144 -0.67129528  0.89577306 -

In [8]:
# Combine the three one-vs-rest classifiers by sign voting.
# A sample is assigned to a class only when exactly that class's classifier
# scores it positive and the other two score it negative; every other case is
# marked with the placeholder 3 and has to be resolved separately.

# Evaluate each decision function once for the whole test set instead of
# re-evaluating it on every loop iteration (the scores do not change).
scores_setosa = svm_1.decision_function(X_test)
scores_versicolor = svm_2.decision_function(X_test)
scores_virginica = svm_3.decision_function(X_test)

y_pred = []

for s_setosa, s_versicolor, s_virginica in zip(scores_setosa, scores_versicolor, scores_virginica):
    if s_setosa > 0 and s_versicolor < 0 and s_virginica < 0:
        y_pred.append("setosa")
    elif s_versicolor > 0 and s_setosa < 0 and s_virginica < 0:
        y_pred.append("versicolor")
    elif s_virginica > 0 and s_setosa < 0 and s_versicolor < 0:
        y_pred.append("virginica")
    else:
        # Undecided: not exactly one classifier claimed the sample.
        y_pred.append(3)

* 정확히 하나의 분류기만 플러스 부호인 경우 그 클래스에 속하는 것으로 처리 
* 그 외의 경우(예: 모두 같은 부호가 나오는 경우)는 일단 보류 (3으로 표시) 

In [9]:
# Display the predictions; entries equal to 3 are still-undecided samples.
y_pred

['versicolor',
 'versicolor',
 'versicolor',
 3,
 'virginica',
 'virginica',
 'setosa',
 'virginica',
 'setosa',
 'versicolor',
 'virginica',
 'setosa',
 'setosa',
 'virginica',
 'versicolor',
 'versicolor',
 'setosa',
 3,
 3,
 'virginica',
 'setosa',
 'virginica',
 'versicolor',
 'versicolor',
 'virginica',
 'setosa',
 'setosa',
 'virginica',
 'virginica',
 'versicolor']

In [10]:
# Are there test samples for which all three classifiers return the same
# sign, i.e. a complete tie? Print the position of each such sample.

# The sign arrays are computed once up front; the decision functions do not
# change between loop iterations.
sign_1 = np.sign(svm_1.decision_function(X_test))
sign_2 = np.sign(svm_2.decision_function(X_test))
sign_3 = np.sign(svm_3.decision_function(X_test))

for i in range(len(X_test)):
    if sign_1[i] == sign_2[i] == sign_3[i]:
        print(i)

3
17
18


In [11]:
# Print the three one-vs-rest scores (setosa, versicolor, virginica) for the
# tied test samples at positions 3, 17 and 18.
# Each decision function is evaluated once and then indexed, rather than
# being re-evaluated on the whole test set for every printed row.
scores_1 = svm_1.decision_function(X_test)
scores_2 = svm_2.decision_function(X_test)
scores_3 = svm_3.decision_function(X_test)

for i in [3, 17, 18]:
    print(scores_1[i], scores_2[i], scores_3[i])

-0.5024882107322184 -0.36987092513822756 -0.13070947139128608
-0.562404550285589 -0.18705039695337913 -0.2533393758266559
-0.7271989224328648 -0.24458235766281572 -0.025197903756274675


#### 세 값이 모두 음수이므로 decision function 값이 가장 큰 것 (= 절댓값이 가장 작은 것, hyperplane에 가장 가까운 것)으로 분류: 
* 3: 'virginica'
* 17: 'versicolor'
* 18: 'virginica'

In [12]:
# Resolve every prediction that still carries the placeholder 3.
# Rather than hard-coding row positions (which are only valid for one
# particular split), pick the class whose classifier gives the largest
# decision-function value. When all three values are negative this is the
# value with the smallest magnitude, i.e. the closest hyperplane.
class_names = ["setosa", "versicolor", "virginica"]  # order matches svm_1, svm_2, svm_3
ovr_scores = np.column_stack(
    [svm.decision_function(X_test) for svm in (svm_1, svm_2, svm_3)]
)

for row, label in enumerate(y_pred):
    if label == 3:
        y_pred[row] = class_names[int(np.argmax(ovr_scores[row]))]

In [13]:
# Accuracy of the hand-built one-vs-rest classifier on the held-out test set.
# `metrics` is imported in the import cell at the top of the notebook.
metrics.accuracy_score(y_test, y_pred)

0.8666666666666667

#### sklearn MultiClass SVM과 비교

In [14]:
# Baseline: let a single SVC handle the three-class problem directly.
# The split uses the same arguments (test_size, random_state) as the earlier
# train_test_split call, but keeps the original string labels as the target.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y, test_size=0.2, random_state=48)

# NOTE(review): C=5 here, whereas the one-vs-rest models above were created
# with C=100, so the two accuracies are not a like-for-like comparison.
svm_4 = SVC(kernel='rbf', C=5, gamma=5)
svm_4.fit(X_train_2, y_train_2)

# Store the baseline predictions under their own name so the one-vs-rest
# `y_pred` is not overwritten; otherwise re-running the earlier accuracy cell
# would silently score the baseline predictions instead.
y_pred_sklearn = svm_4.predict(X_test_2)

metrics.accuracy_score(y_test_2, y_pred_sklearn)

0.8333333333333334

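이 실험(테스트 샘플 30개, 단일 분할)에서는 직접 구현한 one vs rest 방법의 정확도(0.867)가 내장된 다중 클래스 SVC를 그냥 사용한 경우의 정확도(0.833)보다 높았다. 다만 두 모델의 C 값이 서로 다르므로(one vs rest: C=100, 내장 SVC: C=5), 이 결과만으로 one vs rest 방법의 분류 성능이 일반적으로 더 좋다고 결론짓기는 어렵다. 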