In [12]:
# 데이터 불러오기
import seaborn as sns # seaborn을 불러오고 SNS로 축약
import numpy as np

iris = sns.load_dataset('iris')  # iris라는 변수명으로 Iris data를 download
print(iris.head())
X = iris.drop('species', axis=1) # 'species'열을 drop하고 input X를 정의
y = iris['species']
print(np.unique(y))

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
['setosa' 'versicolor' 'virginica']


In [13]:
# y data를 범주형으로 변환
from sklearn.preprocessing import LabelEncoder    # LabelEncoder() method를 불러옴

classle = LabelEncoder()
y = classle.fit_transform(iris['species'].values) # species 열의 문자열을 categorical 값으로 전환
print(np.unique(y))

[0 1 2]


In [14]:
# 전체 data를 training set과 test set으로 split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=1, stratify=y)

In [15]:
# 표준화
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [20]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

Logit = LogisticRegression(C=1e2, random_state=1,
                          multi_class='ovr', max_iter=200)  # C = 1/λ. 디폴트: L2, One-versus-Rest
                        # C값이 클수록 λ이 작아지기 때문에 규제화의 강도가 줄어든다.
                        # ovr: one-vs-rest로 가장 큰 확률을 가진 범주를 해당 관측치에 할당 (multi_class='multinomial'도 가능)
l_1=Logit.fit(X_train_std, y_train)
y_train_pred = Logit.predict(X_train_std)
y_test_pred = Logit.predict(X_test_std)

In [21]:
# Accuracy score (정밀도=전체 표본에서 정확하게 예측을 맞춘 비율)
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test,y_test_pred))    
print(accuracy_score(y_train,y_train_pred))  

0.9777777777777777
0.9428571428571428


In [22]:
# Confusion matrix
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, y_test_pred))  # Confusion matrix

[[15  0  0]
 [ 0 15  0]
 [ 0  1 14]]


In [23]:
Logit = LogisticRegression(C=1e2, random_state=1, max_iter=200, multi_class = 'ovr')  # C = 1/λ. 디폴트: L2, Auto
# One-versus-rest로 진행하는 경우 multi_class='ovr'이라고 명시해줘야 함.
Logit.fit(X_train, y_train)
y_train_pred = Logit.predict(X_train)
y_test_pred = Logit.predict(X_test)
y_test_pred_proba=Logit.predict_proba(X_test)  # predict_proba: predict probability
print(y_test_pred[:5])
print(y_test_pred_proba[:5])

[2 0 0 2 1]
[[5.92893708e-10 5.87048155e-04 9.99412951e-01]
 [9.98963464e-01 1.03653599e-03 1.98612888e-20]
 [9.99802215e-01 1.97784985e-04 3.43495956e-21]
 [3.09188776e-05 4.37402877e-01 5.62566204e-01]
 [2.14214878e-05 9.99312629e-01 6.65949451e-04]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [24]:
print(accuracy_score(y_test,y_test_pred))   
print(accuracy_score(y_train,y_train_pred)) 
print(confusion_matrix(y_test, y_test_pred)) 

1.0
0.9809523809523809
[[15  0  0]
 [ 0 15  0]
 [ 0  0 15]]


학습된 로지스틱 회귀모형을 저장하고 다시 불러내어 데이터에 직접 적용하기로 한다. <br>
save.pkl 파일에 저장한 뒤, joblib.load() 함수를 이용해 이를 객체화하고 predict하는 함수 등을 적용해볼 수 있다.

In [25]:
!pip install joblib



In [26]:
import joblib

joblib.dump(Logit,'save.pkl')
logit_from_joblib=joblib.load('save.pkl')
logit_pred=logit_from_joblib.predict(X_test)
print(accuracy_score(y_test,logit_pred))
print(confusion_matrix(y_test,logit_pred))
print(logit_pred)

1.0
[[15  0  0]
 [ 0 15  0]
 [ 0  0 15]]
[2 0 0 2 1 1 2 1 2 0 0 2 0 1 0 1 2 1 1 2 2 0 1 2 1 1 1 2 0 2 0 0 1 1 2 2 0
 0 0 1 2 2 1 0 0]
