In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [27]:
# Load the iris example dataset through seaborn.
iris = sns.load_dataset("iris")

# Features are the first four columns; the target is the last column.
n_features = 4
X = iris.iloc[:, :n_features]
y = iris[iris.columns[-1]]

In [28]:
# One-hot encode the target (three species): one indicator column per species.
# Note: this rebinds `y` from the single label column to the indicator frame.
y = pd.get_dummies(y)
y

Unnamed: 0,setosa,versicolor,virginica
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
145,0,0,1
146,0,0,1
147,0,0,1
148,0,0,1


In [29]:
# Split into train and test sets at an 8:2 ratio.
# random_state is fixed so the same split is produced on every run.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=48,
)
X_train, X_test, y_train, y_test = split_parts

In [30]:
# Standardize the features.
# The scaler is fitted on the training split only, then the same fitted
# scaler is applied to both the training and the test split.
scale = StandardScaler().fit(X_train)
X_train_std = scale.transform(X_train)
X_test_std = scale.transform(X_test)

In [31]:
# Inspect the original feature frame. The scaling step stored its results in
# X_train_std / X_test_std, so X itself still holds the unscaled values.
X

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [32]:
# One-vs-Rest: three separate classifiers with identical settings.
# The 'rbf' kernel is the Gaussian kernel (the author's understanding is that
# it is used because the problem cannot be separated linearly).
svm_params = {'kernel': 'rbf', 'gamma': 5, 'C': 100}
svm_1, svm_2, svm_3 = (SVC(**svm_params) for _ in range(3))

In [41]:
# Fit each classifier on one column of the one-hot target, in column order:
# svm_1 <- column 0, svm_2 <- column 1, svm_3 <- column 2.
for position, clf in enumerate((svm_1, svm_2, svm_3)):
    clf.fit(X_train_std, y_train.iloc[:, position])

# fit() returns the estimator itself, so showing svm_3 here yields the same
# cell output as ending the cell with the svm_3.fit(...) call.
svm_3

SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=5, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [79]:
# Evaluate each classifier's decision function on the scaled test set.
# The author reads these values as the (signed) distance to the hyperplane.
dist1, dist2, dist3 = (
    clf.decision_function(X_test_std) for clf in (svm_1, svm_2, svm_3)
)

In [44]:
# Are there test rows where all three decision values share the same sign?
# That is the "tie" case for the one-vs-rest setup.
# np.sign returns the sign of each value.
# In the recorded run the 4th, 18th and 19th test rows qualify, which are the
# 0-based indices 3, 17 and 18 printed below.
# The author's reading: equal signs mean the sample lies on the same side of
# every classifier's hyperplane.
#
# Each decision function is evaluated once for the whole test set; the
# original loop re-evaluated it four times for every row.
signs = np.sign(np.column_stack([
    clf.decision_function(X_test_std) for clf in (svm_1, svm_2, svm_3)
]))
all_same_sign = (signs[:, 0] == signs[:, 1]) & (signs[:, 1] == signs[:, 2])
for i in np.flatnonzero(all_same_sign):
    print(i)

3
17
18


In [68]:
# Gather the three decision-value vectors into a single 2-D array:
# one row per test sample, one column per classifier (dist1, dist2, dist3).
dist = np.column_stack((dist1, dist2, dist3))

In [69]:
# Wrap the stacked decision values in a DataFrame with one named column per classifier.
column_names = ['y1', 'y2', 'y3']
df = pd.DataFrame(data=dist, columns=column_names)

In [70]:
# Preview the first rows of the decision-value table (columns y1, y2, y3).
df.head()

Unnamed: 0,y1,y2,y3
0,-1.1236,1.379466,-1.274571
1,-0.867825,0.641587,-0.78527
2,-0.655992,0.083346,-0.428302
3,-0.501943,-0.371038,-0.130155
4,-0.765411,-0.248726,0.02101


In [76]:
# Show the rows identified as ties: the 4th, 18th and 19th test rows,
# i.e. index labels 3, 17 and 18.
for row_label in (3, 17, 18):
    print(df.loc[row_label])

y1   -0.501943
y2   -0.371038
y3   -0.130155
Name: 3, dtype: float64
y1   -0.568279
y2   -0.165651
y3   -0.268853
Name: 17, dtype: float64
y1   -0.730927
y2   -0.227390
y3   -0.039268
Name: 18, dtype: float64
