# 22. 분류: 로지스틱 회귀분석(Logistic Regression)

In [84]:
import pandas as pd
import numpy as np
from statsmodels.api import Logit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


In [85]:
# Load the iris measurements from the shared reference directory and preview
# the first rows.
df = pd.read_csv('../../ref_files/csv_file/iris.csv')
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [86]:
# List the distinct class labels present in the Species column.
df['Species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [87]:
# Binary target for the classifiers below: 1 where the row is a setosa,
# 0 for every other species.
df['is_setosa'] = df['Species'].eq('setosa').astype(int)
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species,is_setosa
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1


In [88]:
# Fit a statsmodels logit of is_setosa on the first two columns of df.
# NOTE(review): no constant column is passed as part of exog and statsmodels
# does not add one on its own, so this model is fitted without an intercept.
# Confirm that is intended before comparing these coefficients with the
# scikit-learn fit later in the notebook, which does estimate an intercept.
model = Logit(
    endog=df['is_setosa'],
    exog=df.iloc[:, :2],
).fit()
model

Optimization terminated successfully.
         Current function value: 0.036374
         Iterations 11


<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x19502d17ee0>

In [89]:
# Fitted coefficients, one per exog column passed to Logit.
model.params

Sepal.Length    -7.529945
Sepal.Width     13.130734
dtype: float64

In [90]:
# p-values associated with each fitted coefficient.
model.pvalues

Sepal.Length    0.000828
Sepal.Width     0.000989
dtype: float64

In [91]:
# Predicted probability of is_setosa == 1 for the first three rows, using the
# same two columns the model was fitted on.
pred = model.predict(exog=df.iloc[:3, :2])
pred

0    0.999477
1    0.923824
2    0.998678
dtype: float64

In [92]:
# Turn the probabilities into 0/1 class labels with a 0.5 cut-off.
(pred > 0.5).astype(int)

0    1
1    1
2    1
dtype: int32

In [93]:
# Refit the same two-feature problem with scikit-learn. This rebinds `model`,
# so the statsmodels result from the earlier cells is no longer reachable
# under that name. fit() returns the estimator itself, so construction and
# fitting can be chained.
model = LogisticRegression(random_state=123).fit(
    X=df.iloc[:, :2],
    y=df['is_setosa'],
)
model

In [94]:
# Fitted feature coefficients of the scikit-learn model (one row, one entry
# per input column).
model.coef_

array([[-3.38829757,  3.1645277 ]])

In [95]:
# Fitted intercept term of the scikit-learn model.
model.intercept_

array([8.32330389])

In [96]:
# Class probabilities for the first three rows. predict_proba returns one
# column per class in model.classes_ order, so with a 0/1 target the columns
# are [P(is_setosa == 0), P(is_setosa == 1)].
pred = model.predict_proba(df.iloc[:3, :2])
pred

array([[0.10727976, 0.89272024],
       [0.22895365, 0.77104635],
       [0.07413821, 0.92586179]])

In [97]:
# Keep only the second column, i.e. the probability of the positive class
# (is_setosa == 1).
pred = pred[:, 1]
pred

array([0.89272024, 0.77104635, 0.92586179])

In [98]:
# Turn the positive-class probabilities into 0/1 labels with a 0.5 cut-off.
(pred > 0.5).astype(int)

array([1, 1, 1])

In [99]:
# Positive-class probability (second predict_proba column) for every row of
# the dataset, computed from the same two feature columns used for fitting.
pred = model.predict_proba(df.iloc[:, :2])[:, 1]
pred

array([8.92720238e-01, 7.71046348e-01, 9.25861792e-01, 9.27383226e-01,
       9.41260955e-01, 9.14366510e-01, 9.70588854e-01, 8.94844544e-01,
       9.30340069e-01, 8.22106028e-01, 8.50082658e-01, 9.43687032e-01,
       8.25355057e-01, 9.62572636e-01, 7.90722967e-01, 9.49498438e-01,
       9.14366510e-01, 8.92720238e-01, 7.37925296e-01, 9.55561442e-01,
       6.86948409e-01, 9.40011474e-01, 9.84162879e-01, 8.15466254e-01,
       9.43687032e-01, 7.05867903e-01, 8.94844544e-01, 8.55697295e-01,
       8.12075145e-01, 9.25861792e-01, 8.66401735e-01, 6.86948409e-01,
       9.75366839e-01, 9.51601779e-01, 8.22106028e-01, 8.18809824e-01,
       6.82116219e-01, 9.57423588e-01, 9.48259025e-01, 8.58438460e-01,
       9.21120027e-01, 5.87678957e-01, 9.71839851e-01, 9.21120027e-01,
       9.55561442e-01, 8.25355057e-01, 9.55561442e-01, 9.46018652e-01,
       8.88358175e-01, 8.61136004e-01, 5.12600003e-03, 3.78582845e-02,
       5.24138947e-03, 4.59168341e-02, 7.84539075e-03, 1.06280067e-01,
      

In [100]:
from sklearn.metrics import roc_auc_score

In [101]:
# ROC AUC of the predicted probabilities against the true labels.
# NOTE: pred was computed on the same rows the model was fitted on, so this is
# an in-sample score rather than an estimate of out-of-sample performance.
roc_auc_score(y_true=df['is_setosa'], y_score=pred)

0.9999999999999999

In [102]:
# In-sample accuracy when a row is labelled setosa only if its predicted
# probability exceeds 0.8.
# NOTE(review): this cut-off is stricter than the 0.5 used in the earlier
# thresholding cells -- confirm the difference is deliberate.
accuracy_score(
    y_true=df['is_setosa'],
    y_pred=(pred > 0.8).astype(int),
)

0.9466666666666667