## OVR & OVO 모듈 활용

### (1) 모듈 로딩 및 데이터 준비 <hr>

In [117]:
## 모듈 로딩
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

In [118]:
# Prepare the data: read the fish dataset from a CSV file located relative
# to the notebook's working directory.
data_file = '../data/fish.csv'

fishDF=pd.read_csv(data_file)
# Preview the first rows to confirm the columns loaded as expected.
fishDF.head()

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


### (2) 데이터셋 준비 <hr>

#### (2-1) 피쳐와 타겟 분리

In [119]:
# The 'Species' column is the prediction target; every remaining column
# is used as an input feature.
targetDF = fishDF['Species']
featureDF = fishDF.drop(columns='Species')
featureDF

Unnamed: 0,Weight,Length,Diagonal,Height,Width
0,242.0,25.4,30.0,11.5200,4.0200
1,290.0,26.3,31.2,12.4800,4.3056
2,340.0,26.5,31.1,12.3778,4.6961
3,363.0,29.0,33.5,12.7300,4.4555
4,430.0,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...
154,12.2,12.2,13.4,2.0904,1.3936
155,13.4,12.4,13.5,2.4300,1.2690
156,12.2,13.0,13.8,2.2770,1.2558
157,19.7,14.3,15.2,2.8728,2.0672


In [120]:
# Report the dimensions of the feature frame and the target series.
feature_shape = featureDF.shape
target_shape = targetDF.shape
print(f'featureDF.shape: {feature_shape}, targetDF.shape: {target_shape}')

featureDF.shape: (159, 5), targetDF.shape: (159,)


In [121]:
from sklearn.preprocessing import LabelEncoder

# Fit a label encoder on the species names so that each distinct class
# can be mapped to an integer code.
encoder = LabelEncoder()
encoder.fit(targetDF)

In [122]:
# Replace the species names with their integer codes.
# NOTE: from here on targetDF is a NumPy array, not a pandas Series.
targetDF = encoder.transform(targetDF)
targetDF

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5])

In [123]:
# Check the number of target classes.
# targetDF is a NumPy array after label encoding, so it is wrapped in a
# pandas Series to make nunique() available.
pd.Series(targetDF).nunique()

In [124]:
# Check the share of samples that belongs to each target class.
# targetDF is a NumPy array after label encoding, so it is wrapped in a
# pandas Series to make value_counts() available.
pd.Series(targetDF).value_counts() / targetDF.shape[0]
# The classes are imbalanced.

#### (2-2) 학습용/테스트용 데이터셋 준비

In [125]:
from sklearn.model_selection import train_test_split

In [126]:
# Hold out a test split. No test_size is passed, so the library default
# applies. The split is stratified on the encoded labels and uses a
# fixed seed so that it is repeatable.
split_parts = train_test_split(
    featureDF,
    targetDF,
    stratify=targetDF,
    random_state=11,
)
X_train, X_test, y_train, y_test = split_parts

In [127]:
# Confirm the sizes of the training and test partitions.
print(f'Training set: {X_train.shape}, {y_train.shape} test set: {X_test.shape}, {y_test.shape}')

Training set: (119, 5), (119,) test set: (40, 5), (40,)


### (3) 학습 진행<hr>

In [128]:
## Create the base estimator that the OvO / OvR wrappers below will use.
model = LogisticRegression(solver='liblinear')

#### (3-1) OvO 기반 학습 진행

In [129]:
# Wrap the base estimator in a one-vs-one scheme and fit it on the
# training split.
ovomodel = OneVsOneClassifier(model)
ovomodel.fit(X_train, y_train)

In [130]:
# Inspect the fitted OvO wrapper: class labels, input feature names, and
# the number of pairwise binary estimators it trained.
n_pairwise = len(ovomodel.estimators_)  # 7 * 6 / 2
print(f'ovomodel.classes_ : {ovomodel.classes_}')
print(f'ovomodel.feature_names_in_ : {ovomodel.feature_names_in_}')
print(f'ovomodel.estimators_ : {n_pairwise}개')

ovomodel.classes_ : [0 1 2 3 4 5 6]
ovomodel.feature_names_in_ : ['Weight' 'Length' 'Diagonal' 'Height' 'Width']
ovomodel.estimators_ : 21개


In [131]:
# Score of the OvO model on the training and test splits.
ovo_train_score = ovomodel.score(X_train, y_train)
ovo_test_score = ovomodel.score(X_test, y_test)
print(f'[Train Score] {ovo_train_score}\n[Test Score] {ovo_test_score}')

[Train Score] 0.957983193277311
[Test Score] 0.925


In [132]:
# Predict the class of the first two test rows with the OvO model.
ovomodel.predict(X_test[:2])

array([0, 1])

In [133]:
# Per-class decision values of the OvO model for the same two test rows.
ovomodel.decision_function(X_test[:2])

array([[ 6.32094951,  5.32872468,  2.32890163,  0.68506766,  3.322758  ,
        -0.33168462,  4.3140798 ],
       [ 4.26849104,  6.32543178,  2.3234672 ,  0.67951149,  5.319289  ,
         0.67104504,  1.85564622]])

#### (3-2) OvR 기반 학습 진행

In [134]:
# Wrap the same base estimator in a one-vs-rest scheme and fit it on the
# training split.
ovrmodel = OneVsRestClassifier(model)
ovrmodel.fit(X_train, y_train)

In [135]:
# Predict the class of the first two test rows with the OvR model.
ovrmodel.predict(X_test[:2])

array([0, 1])

In [136]:
# Per-class decision values of the OvR model for the same two test rows.
ovrmodel.decision_function(X_test[:2])


array([[  1.87053681,   0.13665969,  -7.34472734, -14.76498298,
         -0.86086327, -27.57113603,  -3.90345836],
       [ -1.40152254,   2.39014045,  -2.83220689, -12.23098559,
         -2.55867318, -15.03484394,  -4.32514035]])

In [137]:
# Score of the OvR model on the training and test splits.
ovr_train_score = ovrmodel.score(X_train, y_train)
ovr_test_score = ovrmodel.score(X_test, y_test)
print(f'[Train Score] {ovr_train_score}\n[Test Score] {ovr_test_score}')

[Train Score] 0.9495798319327731
[Test Score] 0.975


### (4) 모델 성능 평가 <hr>
- 정확도
- 정밀도
- 재현율
- F1-Score
- Confusion Matrix
- Classification Report

In [138]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

In [158]:
# Model used for the metric cells below.
# The target has 7 classes, but the liblinear solver only handles binary
# problems natively. Relying on LogisticRegression's implicit one-vs-rest
# fallback for liblinear is deprecated in recent scikit-learn releases,
# so the one-vs-rest scheme is made explicit with OneVsRestClassifier.
model = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
model.fit(X_train, y_train)

In [159]:
# Per-class precision / recall / F1 on the test split.
# zero_division=0 makes an undefined ratio (e.g. a class that is never
# predicted) show up as 0.
print(classification_report(y_test, model.predict(X_test), zero_division=0)) # argument order: ground truth first, then predictions

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00         3
           2       0.93      1.00      0.97        14
           3       1.00      1.00      1.00         4
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         4
           6       0.00      0.00      0.00         1

    accuracy                           0.97        40
   macro avg       0.85      0.86      0.85        40
weighted avg       0.95      0.97      0.96        40


In [160]:
# Micro-averaged F1 score on the test split.
print(f1_score(y_test, model.predict(X_test),average='micro'))

0.975


In [163]:
# Micro-averaged recall on the test split.
recall_score(y_test, model.predict(X_test), average='micro')

0.975

In [166]:
# Confusion matrix on the test split; arguments are passed as
# (ground truth, predictions).
confusion_matrix(y_test, model.predict(X_test))

array([[ 9,  0,  0,  0,  0,  0,  0],
       [ 0,  3,  0,  0,  0,  0,  0],
       [ 0,  0, 14,  0,  0,  0,  0],
       [ 0,  0,  0,  4,  0,  0,  0],
       [ 0,  0,  0,  0,  5,  0,  0],
       [ 0,  0,  0,  0,  0,  4,  0],
       [ 0,  0,  1,  0,  0,  0,  0]])