In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [3]:
from sklearn.datasets import load_iris

# Load the bundled iris data set and print its plain-text description.
iris = load_iris()
print(iris["DESCR"])

Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris d

- 붓꽃 분류 문제에서 클래스가 setosa, versicolor인 데이터만 사용하고 (setosa=0, versicolor=1) 독립변수로는 꽃받침 길이(Sepal Length)와 상수항만 사용하여 StatsModels 패키지의 로지스틱 회귀 모형으로 결과를 예측하고 보고서를 출력한다.
- 위 결과를 confusion matrix와 classification report로 표현한다.
- 이 모형에 대해 ROC 커브를 그리고 AUC를 구한다. 이때 Scikit-Learn의 LogisticRegression을 사용하지 않고 위에서 StatsModels로 구한 모형을 사용한다.

In [160]:
# Assemble the four measurements and the class label into one frame, then
# expose sepal length under an identifier-safe column name (appended last,
# with the original "sepal length (cm)" column removed).
dfX = pd.DataFrame(iris.data, columns=iris.feature_names)
dfy = pd.DataFrame(iris.target, columns=["y"])
df = pd.concat([dfX, dfy], axis=1)
df = df.assign(sepal_length=df['sepal length (cm)']).drop(columns='sepal length (cm)')
df.tail()

Unnamed: 0,sepal width (cm),petal length (cm),petal width (cm),y,sepal_length
145,3.0,5.2,2.3,2,6.7
146,2.5,5.0,1.9,2,6.3
147,3.0,5.2,2.0,2,6.5
148,3.4,5.4,2.3,2,6.2
149,3.0,5.1,1.8,2,5.9


In [161]:
# Keep only the sepal-length column as the explanatory variable.
dfX = df.loc[:, 'sepal_length']

In [162]:
# Target labels with the class-2 rows removed, leaving only classes 0 and 1.
dfy = df.loc[df['y'] != 2, 'y']

In [163]:
# Re-join predictor and target on the index. dfy has no rows for class 2, so
# after the outer join those rows carry NaN in "y"; drop them by that
# condition instead of assuming the rows to keep are exactly the first 100.
df = pd.concat([dfX, dfy], axis=1).dropna(subset=['y'])

In [61]:
# Logistic regression of y on sepal length via the formula interface; the
# summary printed below reports the fitted intercept and slope.
model = sm.Logit.from_formula(formula="y ~ sepal_length", data=df)
result = model.fit(disp=0)  # disp=0: do not print optimizer convergence messages
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  100
Model:                          Logit   Df Residuals:                       98
Method:                           MLE   Df Model:                            1
Date:                Sat, 14 Jul 2018   Pseudo R-squ.:                  0.5368
Time:                        11:37:31   Log-Likelihood:                -32.106
converged:                       True   LL-Null:                       -69.315
                                        LLR p-value:                 6.320e-18
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      -27.8315      5.434     -5.122      0.000     -38.481     -17.182
sepal_length     5.1403      1.007      5.107      0.000       3.168       7.113


In [None]:
from sklearn.metrics import confusion_matrix

In [120]:
# Ground-truth labels as a plain NumPy array; showing the shape confirms the
# number of samples.
y_true = df.loc[:, 'y'].values
y_true.shape

(100,)

In [148]:
# In-sample predictions of the fitted model, reshaped to a single column
# (n_samples, 1).
y_pred = result.predict().reshape(-1, 1)

In [138]:
from sklearn.preprocessing import Binarizer

# Thresholding transformer: values above 0.5 become 1, all others become 0.
binarizer = Binarizer(threshold=0.5, copy=True)
binarizer

Binarizer(copy=True, threshold=0.5)

In [153]:
# Convert the predicted values into hard 0/1 labels in a single step.
# y_pred is overwritten here, so the un-thresholded predictions are no
# longer available under this name afterwards.
y_pred = binarizer.fit_transform(y_pred)

In [155]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_true, y_pred=y_pred)

array([[45,  5],
       [ 6, 44]])

In [158]:
from sklearn.datasets import make_classification  # NOTE(review): not used in any visible cell

# Put an intercept column named "const" in front of the predictor.
dfX = sm.add_constant(dfX, prepend=True)
dfX.tail()

Unnamed: 0,const,sepal_length
145,1.0,6.7
146,1.0,6.3
147,1.0,6.5
148,1.0,6.2
149,1.0,5.9


In [164]:
# Add the intercept column ("const") in front of the existing columns.
df = sm.add_constant(df, prepend=True)
df.tail()

Unnamed: 0,const,sepal_length,y
95,1.0,5.7,1.0
96,1.0,5.7,1.0
97,1.0,6.2,1.0
98,1.0,5.1,1.0
99,1.0,5.7,1.0


In [166]:
# Split the frame into the response and the design matrix
# (intercept column plus sepal length).
dfy = df.loc[:, 'y']
dfX = df.loc[:, ['const', 'sepal_length']]

In [167]:
# If you get an error saying the chisqprob function does not exist,
# run the following two lines first.
# from scipy import stats
# stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

# Logistic regression built from an explicit response (endog) and design
# matrix (exog) rather than a formula string.
logit_mod = sm.Logit(endog=dfy, exog=dfX)
logit_res = logit_mod.fit(disp=0)  # disp=0: do not print optimizer convergence messages
print(logit_res.summary())

                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  100
Model:                          Logit   Df Residuals:                       98
Method:                           MLE   Df Model:                            1
Date:                Sat, 14 Jul 2018   Pseudo R-squ.:                  0.5368
Time:                        12:10:22   Log-Likelihood:                -32.106
converged:                       True   LL-Null:                       -69.315
                                        LLR p-value:                 6.320e-18
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const          -27.8315      5.434     -5.122      0.000     -38.481     -17.182
sepal_length     5.1403      1.007      5.107      0.000       3.168       7.113


In [170]:
from sklearn.metrics import roc_curve

# Score the ROC curve with the StatsModels fit (logit_res), as the exercise
# statement requires, instead of a scikit-learn classifier. The previous
# version referenced `model1`, which is not defined at this point when the
# notebook is run top to bottom.
# The predicted probabilities serve as the ranking score for roc_curve.
fpr1, tpr1, thresholds1 = roc_curve(dfy, logit_res.predict(dfX))

In [169]:
# LogisticRegression is not imported in any earlier visible cell, so import
# it here; otherwise this cell fails with a NameError on a fresh kernel.
from sklearn.linear_model import LogisticRegression

# scikit-learn logistic regression fitted on the same design matrix/response.
model1 = LogisticRegression().fit(dfX, dfy)

In [172]:
# Confusion matrix of the thresholded predictions
# (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_true, y_pred=y_pred)

array([[45,  5],
       [ 6, 44]])

In [171]:
from sklearn.metrics import auc

# Area under the ROC curve given its false-positive / true-positive rates.
auc(x=fpr1, y=tpr1)

0.9325999999999999

In [24]:
# Rebuild the full three-class data: dfX holds the four measurements, dfy the
# class label. df combines both, with sepal length moved to a trailing
# "sepal_length" column in place of "sepal length (cm)".
dfX = pd.DataFrame(iris.data, columns=iris.feature_names)
dfy = pd.DataFrame(iris.target, columns=["y"])
df = pd.concat([dfX, dfy], axis=1)
df = df.assign(sepal_length=df['sepal length (cm)']).drop(columns='sepal length (cm)')
df.tail()

Unnamed: 0,sepal width (cm),petal length (cm),petal width (cm),y,sepal_length
145,3.0,5.2,2.3,2,6.7
146,2.5,5.0,1.9,2,6.3
147,3.0,5.2,2.0,2,6.5
148,3.4,5.4,2.3,2,6.2
149,3.0,5.1,1.8,2,5.9


In [63]:
# QuadraticDiscriminantAnalysis was used below without being imported.
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)

# np.ravel flattens the one-column label frame to 1-D, which avoids
# scikit-learn's warning about a column-vector y.
qda = QuadraticDiscriminantAnalysis(store_covariance=True).fit(dfX, np.ravel(dfy))
# n_components may be at most min(n_features, n_classes - 1) = min(4, 2) = 2;
# the previous value of 3 is out of range and is rejected by recent
# scikit-learn releases.
lda = LinearDiscriminantAnalysis(n_components=2, solver="svd", store_covariance=True).fit(dfX, np.ravel(dfy))

  y = column_or_1d(y, warn=True)


In [75]:
# Training-set class predictions from each discriminant model.
y_pred_qda = qda.predict(dfX)
y_pred_lda = lda.predict(dfX)

In [70]:
# Import only the name that is used; a wildcard import hides where names
# come from and can silently shadow existing variables.
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of LDA on the data it was fitted on.
print(classification_report(dfy, lda.predict(dfX)))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        50
          1       0.98      0.96      0.97        50
          2       0.96      0.98      0.97        50

avg / total       0.98      0.98      0.98       150



In [71]:
# Import only the name that is used; a wildcard import hides where names
# come from and can silently shadow existing variables.
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of QDA on the data it was fitted on.
print(classification_report(dfy, qda.predict(dfX)))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        50
          1       0.98      0.96      0.97        50
          2       0.96      0.98      0.97        50

avg / total       0.98      0.98      0.98       150



In [76]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of QDA.
# Compare against the labels the model was fitted on (dfy). y_true from the
# earlier logistic-regression cells holds only the 100 two-class labels and
# does not match the length of these three-class predictions.
confusion_matrix(np.ravel(dfy), y_pred_qda)

array([[50,  0,  0],
       [ 0, 48,  2],
       [ 0,  1, 49]])

In [77]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of LDA.
# Compare against the labels the model was fitted on (dfy). y_true from the
# earlier logistic-regression cells holds only the 100 two-class labels and
# does not match the length of these three-class predictions.
confusion_matrix(np.ravel(dfy), y_pred_lda)

array([[50,  0,  0],
       [ 0, 48,  2],
       [ 0,  1, 49]])