# 로지스틱회귀 + 더미변수

In [1]:
from pandas import read_excel, DataFrame, merge
from matplotlib import pyplot as plt
import seaborn as sb
import numpy as np
from patsy import dmatrix

import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))
from helper import my_logit, scalling

## 데이터

In [2]:
# Load the dataset from the remote Excel workbook and preview its first rows.
# The preview shows the columns 합격여부, 필기점수, 학부성적 and 병원경력.
df = read_excel("https://data.hossam.kr/E05/gradeuate.xlsx")
df.head()

Unnamed: 0,합격여부,필기점수,학부성적,병원경력
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [3]:
# Encode the categorical column 병원경력 with patsy's C() marker.
# The printed design matrix has an Intercept column plus indicator columns
# for levels 2, 3 and 4, so level 1 acts as the reference category.
dv = dmatrix('C(병원경력)', df)
dv

DesignMatrix with shape (400, 4)
  Intercept  C(병원경력)[T.2]  C(병원경력)[T.3]  C(병원경력)[T.4]
          1             0             1             0
          1             0             1             0
          1             0             0             0
          1             0             0             1
          1             0             0             1
          1             1             0             0
          1             0             0             0
          1             1             0             0
          1             0             1             0
          1             1             0             0
          1             0             0             1
          1             0             0             0
          1             0             0             0
          1             1             0             0
          1             0             0             0
          1             0             1             0
          1             0             0          

In [6]:
# Turn the patsy design matrix into a plain DataFrame, discard column 0
# (the Intercept column of the matrix) and give the remaining indicator
# columns readable names: 1 -> level 2, 2 -> level 3, 3 -> level 4.
# Note carried over from the original author: labels such as '1~5' or '3~5'
# could not be used because the '~' character is not recognized
# (presumably by the formula handling downstream -- TODO confirm).
dummy_df = (
    DataFrame(np.asarray(dv))
    .drop(columns=0)
    .rename(columns={1: '고수', 2: '중수', 3: '하수'})
)
dummy_df.head()

Unnamed: 0,고수,중수,하수
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0


In [9]:
# Display df again; it still holds the raw 병원경력 column at this point.
df

Unnamed: 0,합격여부,필기점수,학부성적,병원경력
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.00,1
3,1,640,3.19,4
4,0,520,2.93,4
...,...,...,...,...
395,0,620,4.00,2
396,0,560,3.04,3
397,0,460,2.63,2
398,0,700,3.65,2


In [8]:
# mdf ("merged df"): df without the raw 병원경력 column, combined with the
# dummy columns by matching row index on both sides.
mdf = merge(
    left=df.drop(columns='병원경력'),
    right=dummy_df,
    left_index=True,
    right_index=True,
)
mdf.head()

Unnamed: 0,합격여부,필기점수,학부성적,고수,중수,하수
0,0,380,3.61,0.0,1.0,0.0
1,1,660,3.67,0.0,1.0,0.0
2,1,800,4.0,0.0,0.0,0.0
3,1,640,3.19,0.0,0.0,1.0
4,0,520,2.93,0.0,0.0,1.0


# 로지스틱 회귀 분석 (모듈기능 활용)

In [10]:
# Run the project helper my_logit with 합격여부 as the response (y) and the two
# numeric columns plus the three dummy columns as predictors (x).
# The printed summary below is a "Logit Regression Results" table.
logit_result = my_logit(mdf, y='합격여부', x=['필기점수','학부성적','고수','중수','하수'])
print(logit_result.summary)

Optimization terminated successfully.
         Current function value: 0.573147
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                   합격여부   No. Observations:                  400
Model:                          Logit   Df Residuals:                      394
Method:                           MLE   Df Model:                            5
Date:                Mon, 31 Jul 2023   Pseudo R-squ.:                 0.08292
Time:                        14:49:41   Log-Likelihood:                -229.26
converged:                       True   LL-Null:                       -249.99
Covariance Type:            nonrobust   LLR p-value:                 7.578e-08
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -3.9900      1.140     -3.500      0.000      -6.224      -1.756
필기점수           0.0023      0.

## 혼동행렬

In [11]:
# Show the confusion-matrix table stored on the helper's result object
# (rows True/False, columns Negative/Positive in the output below).
logit_result.cmdf

Unnamed: 0,Negative,Positive
True,254,30
False,97,19


## 오즈비

In [12]:
# Show the odds-ratio table stored on the helper's result object,
# one row per model term including the Intercept.
logit_result.odds_rate_df

Unnamed: 0,odds_rate
Intercept,0.0185
필기점수,1.002267
학부성적,2.234545
고수,0.508931
중수,0.261792
하수,0.211938


## 결과값

In [19]:
# Show the summary metrics (pseudo R-squared, accuracy, precision, recall, ...)
# transposed so that each metric is printed on its own row.
logit_result.result_df.T

Unnamed: 0,0
설명력(Pseudo-Rsqe),0.082922
정확도(Accuracy),0.71
정밀도(Precision),0.612245
"재현율(Recall, TPR)",0.23622
"위양성율(Fallout, FPR)",0.069597
"특이성(Specificity, TNR)",0.930403
RAS,0.583312
f1_score,0.340909
결과값,


# 표준화(scalling)

> 표준화 : 각 특성(변수)의 단위 차이를 없애고 값의 크기만으로 서로 비교할 수 있도록, 평균 0·표준편차 1이 되게 변환하는 것

## 특정 컬럼 표준화

In [20]:
# Pass only the two continuous columns to the project helper scalling;
# the dummy columns are left out. The output shows values centered around
# zero -- presumably z-score standardization; confirm in the helper.
sdf = scalling(mdf.filter(['필기점수', '학부성적']))
sdf.head()

Unnamed: 0,필기점수,학부성적
0,-1.800263,0.579072
1,0.626668,0.736929
2,1.840134,1.605143
3,0.453316,-0.525927
4,-0.586797,-1.209974


In [21]:
# Overwrite the two continuous columns of mdf in place with their scaled
# counterparts from sdf; the dummy columns and the response stay untouched.
for scaled_col in ('필기점수', '학부성적'):
    mdf[scaled_col] = sdf[scaled_col]
mdf.head()

Unnamed: 0,합격여부,필기점수,학부성적,고수,중수,하수
0,0,-1.800263,0.579072,0.0,1.0,0.0
1,1,0.626668,0.736929,0.0,1.0,0.0
2,1,1.840134,1.605143,0.0,0.0,0.0
3,1,0.453316,-0.525927,0.0,0.0,1.0
4,0,-0.586797,-1.209974,0.0,0.0,1.0


## 로지스틱 회귀

In [22]:
# Re-run the same model on mdf after the two continuous columns were scaled.
# In the printed summary the fit statistics (Pseudo R-squ. 0.08292,
# Log-Likelihood -229.26) match the unscaled run; the coefficients differ.
logit_result = my_logit(mdf, y='합격여부', x=['필기점수','학부성적','고수','중수','하수'])
print(logit_result.summary)

Optimization terminated successfully.
         Current function value: 0.573147
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                   합격여부   No. Observations:                  400
Model:                          Logit   Df Residuals:                      394
Method:                           MLE   Df Model:                            5
Date:                Mon, 31 Jul 2023   Pseudo R-squ.:                 0.08292
Time:                        14:54:52   Log-Likelihood:                -229.26
converged:                       True   LL-Null:                       -249.99
Covariance Type:            nonrobust   LLR p-value:                 7.578e-08
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0664      0.266      0.250      0.802      -0.454       0.587
필기점수           0.2613      0.

In [24]:
# Show the summary metrics of the model fitted on the scaled data,
# transposed so that each metric is printed on its own row.
logit_result.result_df.T

Unnamed: 0,0
설명력(Pseudo-Rsqe),0.082922
정확도(Accuracy),0.71
정밀도(Precision),0.612245
"재현율(Recall, TPR)",0.23622
"위양성율(Fallout, FPR)",0.069597
"특이성(Specificity, TNR)",0.930403
RAS,0.583312
f1_score,0.340909


> 기존 02-로지스틱회귀.ipynb의 분석 결과에 비해 다소 개선되었지만 큰 차이는 보이지 않는다. 또한 표준화 전후로 설명력·정확도 등 평가 지표는 동일하며, 달라지는 것은 회귀계수의 크기뿐이다.