# Multiclass SVM 구현

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

# Load the iris dataset through seaborn.
iris = sns.load_dataset('iris')
# The first four columns are the features to learn from; the last column is the target.
X, y = iris.iloc[:, :4], iris.iloc[:, -1]
print(y)

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object


In [60]:
y.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [2]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

In [3]:
def standardization(train, test):
    """Standardize both splits with statistics learned from the training split.

    A single StandardScaler is fitted on ``train`` only and then applied to
    both ``train`` and ``test``, so the test split never contributes to the
    scaling parameters.

    Args:
        train: Training feature matrix.
        test: Test feature matrix.

    Returns:
        A ``(train_scaled, test_scaled)`` pair as produced by the scaler.
    """
    fitted_scaler = StandardScaler().fit(train)
    return fitted_scaler.transform(train), fitted_scaler.transform(test)

X_train, X_test = standardization(X_train, X_test)

In [4]:
X_train

array([[ 0.78522493,  0.32015325,  0.77221097,  1.04726529],
       [-0.26563371, -1.29989934,  0.0982814 , -0.11996537],
       [ 0.43493872,  0.78302542,  0.94069336,  1.43634218],
       [-0.84944407,  0.78302542, -1.24957775, -1.28719604],
       [-0.38239578, -1.7627715 ,  0.15444219,  0.13941922],
       [ 0.55170079, -0.374155  ,  1.05301496,  0.7878807 ],
       [ 0.31817664, -0.14271892,  0.65988937,  0.7878807 ],
       [ 0.20141457, -0.374155  ,  0.43524618,  0.39880381],
       [-1.66677857, -0.14271892, -1.36189934, -1.28719604],
       [-0.14887164, -0.60559109,  0.21060299,  0.13941922],
       [-0.14887164, -1.06846325, -0.12636179, -0.24965767],
       [ 0.31817664, -0.60559109,  0.15444219,  0.13941922],
       [ 0.66846286, -0.83702717,  0.88453256,  0.91757299],
       [ 0.0846525 , -0.14271892,  0.77221097,  0.7878807 ],
       [-0.49915786, -0.14271892,  0.43524618,  0.39880381],
       [-0.26563371, -0.60559109,  0.65988937,  1.04726529],
       [ 2.18636979,  1.

In [5]:
X_test

array([[-0.14887164, -0.374155  ,  0.26676379,  0.13941922],
       [ 0.31817664, -0.60559109,  0.54756778,  0.00972692],
       [ 0.31817664, -1.06846325,  1.05301496,  0.26911151],
       [-1.5500165 , -1.7627715 , -1.36189934, -1.15750374],
       [ 0.0846525 ,  0.32015325,  0.60372857,  0.7878807 ],
       [ 0.78522493, -0.14271892,  0.99685416,  0.7878807 ],
       [-0.84944407,  1.70876975, -1.24957775, -1.15750374],
       [ 0.20141457, -0.14271892,  0.60372857,  0.7878807 ],
       [-0.38239578,  2.63451409, -1.30573855, -1.28719604],
       [-0.38239578, -1.29989934,  0.15444219,  0.13941922],
       [ 0.66846286,  0.08871717,  0.99685416,  0.7878807 ],
       [-0.38239578,  1.0144615 , -1.36189934, -1.28719604],
       [-0.49915786,  0.78302542, -1.13725615, -1.28719604],
       [ 0.43493872, -0.60559109,  0.60372857,  0.7878807 ],
       [ 0.55170079, -1.7627715 ,  0.37908538,  0.13941922],
       [ 0.55170079,  0.55158933,  0.54756778,  0.52849611],
       [-1.19973028,  0.

One vs Rest 방식을 이용해서 멀티클래스 SVM을 구현해보자. 그러기 위해 먼저 원핫인코딩 과정을 거쳐 각각의 binary SVM에 사용될 타겟 열을 만들어 준다. 일반화된 멀티클래스 SVM 함수를 구현하기 전에 One vs Rest 방식을 한 단계씩 적용해본다.

In [6]:
y_train = y_train.values
y_train

array(['virginica', 'versicolor', 'virginica', 'setosa', 'versicolor',
       'virginica', 'virginica', 'versicolor', 'setosa', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'virginica', 'versicolor',
       'virginica', 'virginica', 'setosa', 'setosa', 'virginica',
       'setosa', 'versicolor', 'setosa', 'setosa', 'setosa', 'virginica',
       'setosa', 'virginica', 'setosa', 'virginica', 'versicolor',
       'virginica', 'versicolor', 'virginica', 'versicolor', 'virginica',
       'virginica', 'setosa', 'versicolor', 'versicolor', 'virginica',
       'virginica', 'versicolor', 'versicolor', 'virginica', 'virginica',
       'versicolor', 'versicolor', 'versicolor', 'virginica',
       'versicolor', 'versicolor', 'virginica', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'setosa', 'setosa',
       'versicolor', 'versicolor', 'setosa', 'setosa', 'virginica',
       'virginica', 'setosa', 'setosa', 'virginica', 'setosa',
       'versicolor', 'virginica',

In [7]:
y_train = y_train.reshape(-1, 1)

In [8]:
y_train

array([['virginica'],
       ['versicolor'],
       ['virginica'],
       ['setosa'],
       ['versicolor'],
       ['virginica'],
       ['virginica'],
       ['versicolor'],
       ['setosa'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['virginica'],
       ['virginica'],
       ['versicolor'],
       ['virginica'],
       ['virginica'],
       ['setosa'],
       ['setosa'],
       ['virginica'],
       ['setosa'],
       ['versicolor'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['virginica'],
       ['setosa'],
       ['virginica'],
       ['setosa'],
       ['virginica'],
       ['versicolor'],
       ['virginica'],
       ['versicolor'],
       ['virginica'],
       ['versicolor'],
       ['virginica'],
       ['virginica'],
       ['setosa'],
       ['versicolor'],
       ['versicolor'],
       ['virginica'],
       ['virginica'],
       ['versicolor'],
       ['versicolor'],
       ['virginica'],
       ['virginica'],
      

In [14]:
# One-hot encode the training target: one 0/1 column per class. Each column
# later serves as the target of one binary (one-vs-rest) SVM.
ohe = OneHotEncoder()
# Column names are taken from the encoder's own category list, which is in the
# same order as the encoded columns. A hard-coded label list would silently
# mislabel columns if the set or order of classes ever changed.
onehot_y = pd.DataFrame(ohe.fit_transform(y_train).toarray(),
                        columns=ohe.categories_[0])

In [15]:
onehot_y

Unnamed: 0,setosa,versicolor,virginica
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,0.0,0.0,1.0
3,1.0,0.0,0.0
4,0.0,1.0,0.0
...,...,...,...
115,0.0,1.0,0.0
116,0.0,1.0,0.0
117,0.0,1.0,0.0
118,0.0,1.0,0.0


In [11]:
# One-hot encode the test target with the encoder that was already fitted on
# the training target, so train and test share the same columns and order.

# np.asarray keeps this cell idempotent: it works whether y_test is still a
# pandas Series or was already turned into a column array by an earlier run.
y_test = np.asarray(y_test).reshape(-1, 1)

# transform (not fit_transform): refitting on the test labels would yield a
# different set of columns whenever a class is absent from the test split.
onehot_test = pd.DataFrame(ohe.transform(y_test).toarray(),
                           columns=ohe.categories_[0])

In [12]:
onehot_test

Unnamed: 0,setosa,versicolor,virginica
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,0.0,0.0,1.0
3,1.0,0.0,0.0
4,0.0,1.0,0.0
5,0.0,0.0,1.0
6,1.0,0.0,0.0
7,0.0,0.0,1.0
8,1.0,0.0,0.0
9,0.0,1.0,0.0


원핫인코딩을 마친 타겟 데이터 프레임의 한 열씩을 이용하여 binary svm을 적용해본다.

In [16]:
onehot_y['virginica']

0      1.0
1      0.0
2      1.0
3      0.0
4      0.0
      ... 
115    0.0
116    0.0
117    0.0
118    0.0
119    0.0
Name: virginica, Length: 120, dtype: float64

In [17]:
# First, train the binary model that decides "virginica or not".
# A grid search is used to find the best hyperparameters.

from sklearn.model_selection import GridSearchCV

# The same candidate values are searched for both C and gamma.
candidate_values = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {'C': list(candidate_values), 'gamma': list(candidate_values)}

model = SVC()
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
).fit(X_train, onehot_y['virginica'])

print("최적 매개변수: {}".format(grid_search.best_params_))

최적 매개변수: {'C': 10, 'gamma': 0.01}


In [32]:
#그리드 서치로 구한 최적의 매개변수 값을 이용하여 학습시킨다.
svc_virginica = SVC(kernel='rbf', gamma = 0.01, C = 10)
svc_virginica.fit(X_train, onehot_y['virginica'])

SVC(C=10, gamma=0.01)

In [33]:
#test 셋으로 학습시킨 모델에 적용하여 정확도를 확인해본다.
from sklearn import metrics

y_pred = svc_virginica.predict(X_test) # 훈련한 모델로 test셋을 시험
print('Accuracy Score:') 
print(metrics.accuracy_score(onehot_test['virginica'],y_pred))

Accuracy Score:
0.9666666666666667


테스트 셋에 대해 매우 높은 정확도를 보인다.

In [20]:
# Next, train the binary model that decides "versicolor or not".
# A grid search is used to find the best hyperparameters.

from sklearn.model_selection import GridSearchCV

# The same candidate values are searched for both C and gamma.
candidate_values = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {'C': list(candidate_values), 'gamma': list(candidate_values)}

model = SVC()
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
).fit(X_train, onehot_y['versicolor'])

print("최적 매개변수: {}".format(grid_search.best_params_))

최적 매개변수: {'C': 100, 'gamma': 0.01}


In [21]:
#그리드 서치로 구한 최적의 매개변수 값을 이용하여 학습시킨다.
svc_versicolor = SVC(kernel='rbf', gamma = 0.01, C = 100)
svc_versicolor.fit(X_train, onehot_y['versicolor'])

SVC(C=100, gamma=0.01)

In [22]:
#test 셋으로 학습시킨 모델에 적용하여 정확도를 확인해본다.

y_pred = svc_versicolor.predict(X_test) # 훈련한 모델로 test셋을 시험
print('Accuracy Score:') 
print(metrics.accuracy_score(onehot_test['versicolor'],y_pred))

Accuracy Score:
0.9333333333333333


In [23]:
# Finally, train the binary model that decides "setosa or not".
# A grid search is used to find the best hyperparameters.

from sklearn.model_selection import GridSearchCV

# The same candidate values are searched for both C and gamma.
candidate_values = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {'C': list(candidate_values), 'gamma': list(candidate_values)}

model = SVC()
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
).fit(X_train, onehot_y['setosa'])

print("최적 매개변수: {}".format(grid_search.best_params_))

최적 매개변수: {'C': 0.1, 'gamma': 0.1}


In [61]:
print(grid_search.best_params_['C'])

0.1


In [34]:
#그리드 서치로 구한 최적의 매개변수 값을 이용하여 학습시킨다.
svc_setosa = SVC(kernel='rbf', gamma = 0.1, C = 0.1)
svc_setosa.fit(X_train, onehot_y['setosa'])

SVC(C=0.1, gamma=0.1)

In [35]:
#test 셋으로 학습시킨 모델에 적용하여 정확도를 확인해본다.

y_pred = svc_setosa.predict(X_test) # 훈련한 모델로 test셋을 시험
print('Accuracy Score:') 
print(metrics.accuracy_score(onehot_test['setosa'],y_pred))

Accuracy Score:
1.0


이제 3가지의 binary svm을 모두 구현했으니 세가지 모형을 돌린 결과로 예측을 판별해주는 코드를 작성한다.

In [39]:
# For each binary model, collect its 0/1 prediction and its decision-function
# value on the test set.
setosa_pred, setosa_d = (svc_setosa.predict(X_test),
                         svc_setosa.decision_function(X_test))
virginica_pred, virginica_d = (svc_virginica.predict(X_test),
                               svc_virginica.decision_function(X_test))
versicolor_pred, versicolor_d = (svc_versicolor.predict(X_test),
                                 svc_versicolor.decision_function(X_test))

# One row per test sample: the three predictions first, then the three
# decision-function values, each group in setosa / versicolor / virginica order.
pred_df = pd.DataFrame(
    {
        'setosa_pred': setosa_pred,
        'versicolor_pred': versicolor_pred,
        'virginica_pred': virginica_pred,
        'setosa_d': setosa_d,
        'versicolor_d': versicolor_d,
        'virginica_d': virginica_d,
    }
)

pred_df라는 데이터 프레임을 생성한다. 이 데이터 프레임은 앞서 만들어둔 3가지 모형의 예측 결과(pred)와 decision_function 값을 함께 담고 있다.

In [40]:
pred_df

Unnamed: 0,setosa_pred,versicolor_pred,virginica_pred,setosa_d,versicolor_d,virginica_d
0,0.0,1.0,0.0,-0.894444,1.09537,-1.296696
1,0.0,1.0,0.0,-1.094077,1.495842,-1.070419
2,0.0,1.0,1.0,-1.291429,0.550942,0.092655
3,1.0,0.0,0.0,0.080967,-0.464622,-3.940741
4,0.0,0.0,0.0,-0.99789,-0.296286,-0.169793
5,0.0,0.0,1.0,-1.256461,-0.47764,0.505021
6,1.0,0.0,0.0,1.072398,-2.572162,-5.173826
7,0.0,0.0,1.0,-1.154036,-0.18656,0.017082
8,1.0,0.0,0.0,0.888722,-3.373779,-5.430779
9,0.0,1.0,0.0,-1.033879,1.054637,-1.024003


In [41]:
list(pred_df.loc[0][:3])

[0.0, 1.0, 0.0]

In [42]:
# Turn the three binary outputs into a single class label per test sample.
labels = ['setosa', 'versicolor', 'virginica']

# A clean one-hot vote (exactly one model answered "yes") maps straight to its class.
one_hot_votes = {
  (1.0, 0.0, 0.0): 'setosa',
  (0.0, 1.0, 0.0): 'versicolor',
  (0.0, 0.0, 1.0): 'virginica',
}

n = len(pred_df)
y_pred = []

for row in range(n):
  record = pred_df.loc[row]
  # The first three entries are the 0/1 predictions, the rest the decision values.
  vote = tuple(record[:3])
  if vote in one_hot_votes:
    y_pred.append(one_hot_votes[vote])
  else:
    # Ambiguous vote (no model or several models said "yes"):
    # fall back to the class with the largest decision-function value.
    y_pred.append(labels[np.argmax(list(record[3:]))])

print(y_pred)

['versicolor', 'versicolor', 'versicolor', 'setosa', 'virginica', 'virginica', 'setosa', 'virginica', 'setosa', 'versicolor', 'virginica', 'setosa', 'setosa', 'virginica', 'versicolor', 'versicolor', 'setosa', 'versicolor', 'versicolor', 'virginica', 'setosa', 'virginica', 'versicolor', 'versicolor', 'virginica', 'setosa', 'setosa', 'virginica', 'virginica', 'versicolor']


In [43]:
y_test

array([['versicolor'],
       ['versicolor'],
       ['virginica'],
       ['setosa'],
       ['versicolor'],
       ['virginica'],
       ['setosa'],
       ['virginica'],
       ['setosa'],
       ['versicolor'],
       ['virginica'],
       ['setosa'],
       ['setosa'],
       ['virginica'],
       ['versicolor'],
       ['versicolor'],
       ['setosa'],
       ['versicolor'],
       ['versicolor'],
       ['virginica'],
       ['setosa'],
       ['virginica'],
       ['versicolor'],
       ['versicolor'],
       ['virginica'],
       ['setosa'],
       ['setosa'],
       ['virginica'],
       ['virginica'],
       ['versicolor']], dtype=object)

In [44]:
print('Accuracy Score:') 
print(metrics.accuracy_score(y_test,y_pred))

Accuracy Score:
0.9333333333333333


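예측 결과를 확인해봤을 때, pred 값이 아닌 decision function의 값으로만 판단해도 성공적인 결과가 나오는 것을 확인할 수 있다. binary SVM의 pred 값은 decision function 값이 0보다 클 때만 1이 되므로, 한 모형만 1을 예측한 경우에도 decision function 값이 가장 큰 클래스를 고르는 것과 같은 결과가 된다. 따라서 일반화된 멀티클래스 svm을 구현할 때는 decision function만 고려하는 방식을 사용해도 될 것 같다.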

## 일반화된 multiclass svm 함수 만들기

In [109]:
def multiclass_svm(X_train, X_test, y_train, y_test):
  """Train one-vs-rest RBF SVMs and predict a class label for every test row.

  For each class a binary SVC (that class vs. the rest) is tuned with a
  5-fold grid search over ``C`` and ``gamma``. Each test sample is then
  assigned the class whose binary model returns the largest
  decision-function value.

  Args:
    X_train: Training features (unscaled); standardized inside the function.
    X_test: Test features (unscaled); scaled with the training statistics.
    y_train: Training labels, a pandas Series or any 1-D array-like.
    y_test: Unused. Kept only so existing calls keep working; the test labels
      never influence training or prediction.

  Returns:
    list: Predicted class label for each row of ``X_test``, in row order.

  Raises:
    ValueError: If ``y_train`` contains fewer than two distinct classes.
  """
  labels = np.asarray(y_train).ravel()
  # np.unique returns the classes sorted, giving ONE fixed class order that is
  # used both to build the binary targets and to decode the argmax below.
  # Series.unique() (order of appearance) must not be used here: mixing it with
  # a differently ordered encoding attaches predictions to the wrong labels.
  classes = np.unique(labels)
  if len(classes) < 2:
    raise ValueError("multiclass_svm needs at least two classes in y_train")

  X_train, X_test = standardization(X_train, X_test)

  param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

  decision_scores = []
  for cls in classes:
    # Binary one-vs-rest target: 1.0 for the current class, 0.0 for the rest.
    binary_target = (labels == cls).astype(float)

    grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
    grid_search.fit(X_train, binary_target)

    # With the default refit=True, best_estimator_ has already been retrained
    # on the full training set with the best parameters, so no second fit is
    # needed.
    decision_scores.append(
        grid_search.best_estimator_.decision_function(X_test))

  # Shape (n_test, n_classes); column j holds the scores for classes[j].
  scores = np.column_stack(decision_scores)
  return classes[scores.argmax(axis=1)].tolist()

In [72]:
# Load the iris dataset through seaborn.
iris = sns.load_dataset('iris')
# The first four columns are the features to learn from; the last column is the target.
X, y = iris.iloc[:, :4], iris.iloc[:, -1]
print(y)

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object


In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

In [110]:
y_pred = multiclass_svm(X_train, X_test, y_train, y_test)

In [111]:
y_pred

['versicolor',
 'versicolor',
 'versicolor',
 'setosa',
 'virginica',
 'virginica',
 'setosa',
 'virginica',
 'setosa',
 'versicolor',
 'virginica',
 'setosa',
 'setosa',
 'virginica',
 'versicolor',
 'versicolor',
 'setosa',
 'versicolor',
 'versicolor',
 'virginica',
 'setosa',
 'virginica',
 'versicolor',
 'versicolor',
 'virginica',
 'setosa',
 'setosa',
 'virginica',
 'virginica',
 'versicolor']

In [112]:
print('Accuracy Score:') 
print(metrics.accuracy_score(y_test,y_pred))

Accuracy Score:
0.9333333333333333


일반화를 위해 구현한 함수를 이용해도 성공적으로 예측하는 것을 볼 수 있다.