In [1]:
# 붓꽃 판별

## 패키지 로딩

In [2]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,classification_report,roc_auc_score

## 데이터 로드 및 확인

In [9]:
X, y = load_iris(return_X_y= True)
print(X.shape,y.shape)
# print(np.unique(y))
unique, counts = np.unique(y,return_counts=True)
print(unique)
print(counts)
print(dict(zip(unique, counts)))

(150, 4) (150,)
[0 1 2]
[50 50 50]
{0: 50, 1: 50, 2: 50}


In [10]:
df = pd.DataFrame(X, columns=load_iris().feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


## 학습 데이터와 평가 데이터 분할

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8,random_state=10,stratify=y)

In [15]:

print('y의 클래스별 데이터수: ' ,np.bincount(y))
print('y_train의 클래스별 데이터수: ' ,np.bincount(y_train))
print('y_test의 클래스별 데이터수: ' ,np.bincount(y_test))


y의 클래스별 데이터수:  [50 50 50]
y_train의 클래스별 데이터수:  [40 40 40]
y_test의 클래스별 데이터수:  [10 10 10]


## 모델 생성
- 다항분류를 위한 옵션
    - solver: 최적화 문제에 사용할 알고리즘 (default:lbfgs)
        >- 'newton-cg','sag', 'saga', 'lbfgs' 만 다항 손실을 처리한다. 즉, 멀티 클래스 분류 모델에 사용
        >- 'liblinear' 는 one-versus-rest 방식으로 제한된다.
    - multi_class: 다중 클래스 분류 문제의 상황에서 어떤 접근 방식을 취할지 결정
        >- 'ovr' : 이진분류기인 sigmoid 함수를 이용하여 결과 예측
        >- 'multinomial' : 각 클래스에 대한 softmax 함수를 이용하여 다중분류를 수행

In [34]:
# model = LogisticRegression(random_state=0,solver='liblinear',multi_class='ovr',max_iter=2000)
model = LogisticRegression(random_state=0,solver='lbfgs',multi_class='multinomial',max_iter=2000)

model.fit(X_train,y_train)

## 모델 예측

In [35]:
y_hat = model.predict(X_test)


In [36]:
proba = model.predict_proba(X_test)
print(np.round(proba[:5],3))

[[0.012 0.832 0.156]
 [0.971 0.029 0.   ]
 [0.018 0.71  0.272]
 [0.002 0.483 0.515]
 [0.242 0.754 0.004]]


In [37]:
df_proba = pd.DataFrame(proba[:5],columns= ['class-0','class-1','class-2'])
df_proba['result'] = np.argmax(proba[:5], axis=1)
print(y_hat[:5])
df_proba

[1 0 1 2 1]


Unnamed: 0,class-0,class-1,class-2,result
0,0.012015,0.832231,0.1557538,1
1,0.971461,0.028539,3.196184e-07,0
2,0.017753,0.709753,0.2724934,1
3,0.001906,0.483312,0.5147826,2
4,0.241615,0.753939,0.004445463,1


## 모델 평가

In [38]:
print(f'Accuracy: {accuracy_score(y_test,y_hat)}')
# ovo : One-Versus-One 방식으로 모든 고유한 쌍별 클래스 조합을 비교한다
auc = roc_auc_score(y_test,proba, multi_class = 'ovr')
print(f'AUC:{auc:.3f}')
con_mat = confusion_matrix(y_test, y_hat)
print(con_mat)
report = classification_report(y_test,y_hat)
print(report)

Accuracy: 1.0
AUC:1.000
[[10  0  0]
 [ 0 10  0]
 [ 0  0 10]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        10

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



## 하이퍼 파라메터 튜닝

In [41]:
from sklearn.model_selection import GridSearchCV

model = LogisticRegression()
param = {'C' : [0.01, 0.1, 1, 3, 5, 10],'penalty': ['l1','l2'], 'solver': ['newton-cg','lbfgs','sag','saga','liblinear']
        ,'multi_class':['ovr','multinomial']}
# n_jobs : 코어 사용 개수, -1로 설정하면 모든 코어 사용(default:1)
gs = GridSearchCV(model, param, cv = 5, scoring = 'accuracy', n_jobs = 4)
gs.fit(X,y)
print('최적 파라메터:', gs.best_params_)
print(f'최적 정확도:{gs.best_score_:.3f}')

최적 파라메터: {'C': 1, 'multi_class': 'multinomial', 'penalty': 'l1', 'solver': 'saga'}
최적 정확도:0.987


240 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 

## 최적 파라메터를 이용한 모델 평가

In [43]:
y_hat = gs.best_estimator_.predict(X_test)
print(f'Accuracy:{accuracy_score(y_test,y_hat):.3f}')
# ovo : One-Versus-One 방식으로 모든 고유한 쌍별 클래스 조합을 비교한다
auc = roc_auc_score(y_test,proba,multi_class='ovo')
print(f'AUC:{auc:.3f}')

Accuracy:1.000
AUC:1.000
