# 로지스틱 회귀 + 더미변수
## #01. 작업준비
### 패키지 가져오기



In [12]:
import pandas as pd
from statsmodels.formula.api import logit
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, accuracy_score, precision_score, f1_score,recall_score
from patsy import dmatrix
import seaborn as sb
from matplotlib import pyplot as plt
import numpy as np
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))
from helper import my_logit, scailing

### 그래프 초기화

In [13]:
# Apply all matplotlib defaults in one call: pick the font family by
# platform (AppleGothic on macOS, Malgun Gothic elsewhere), set the base font
# size and default figure size, and use the ASCII hyphen rather than the
# Unicode minus glyph for negative numbers.
plt.rcParams.update({
    "font.family": 'AppleGothic' if sys.platform == 'darwin' else 'Malgun Gothic',
    "font.size": 12,
    "figure.figsize": (10, 5),
    "axes.unicode_minus": False,
})

### 데이터 가져오기
방사선학 석사과정 대학원생을 모집하였다. 이때 지원한 방사선사의 대학원 합격 여부에 영향을 미치는 주요 요인이 무엇인지 분석하라.

단, 독립변수는 정규분포를 따른다고 가정한다.


|변수|	구분	|설명|
|--|--|--|
|합격여부|	범주형|	1=합격, 0=불합격|
|필기점수|	연속형|	800점 만점|
|학부성적|	연속형|	4.0 만점|
|병원경력|	범주형|	1: 10년 이상, 2: 5~10년, 3: 1~5년, 4: 1년 미만|


In [14]:
# Download the Excel dataset from the remote URL into a DataFrame and
# display it as the cell output.
data_url = "https://data.hossam.kr/E05/gradeuate.xlsx"
df = pd.read_excel(data_url)
df

Unnamed: 0,합격여부,필기점수,학부성적,병원경력
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.00,1
3,1,640,3.19,4
4,0,520,2.93,4
...,...,...,...,...
395,0,620,4.00,2
396,0,560,3.04,3
397,0,460,2.63,2
398,0,700,3.65,2


## #02. 더미변수 생성




In [15]:
# Build a patsy design matrix from df, treating 병원경력 as a categorical
# factor via C(...).
dv = dmatrix("C(병원경력)", data=df)


In [19]:
# Convert the design matrix to a plain DataFrame, discard column 0
# (presumably patsy's intercept column -- confirm), and give the three
# remaining indicator columns readable names.
dummy_df = (
    pd.DataFrame(np.asarray(dv))
    .drop(columns=0)
    .rename(columns={1: '고수', 2: "중수", 3: "하수"})
)
dummy_df.head()

Unnamed: 0,고수,중수,하수
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0


In [20]:
# Swap the raw 병원경력 column for its dummy columns: remove it from df,
# then attach dummy_df by matching row indexes on both sides.
base_df = df.drop(columns='병원경력')
mdf = pd.merge(base_df, dummy_df, left_index=True, right_index=True)
mdf

Unnamed: 0,합격여부,필기점수,학부성적,고수,중수,하수
0,0,380,3.61,0.0,1.0,0.0
1,1,660,3.67,0.0,1.0,0.0
2,1,800,4.00,0.0,0.0,0.0
3,1,640,3.19,0.0,0.0,1.0
4,0,520,2.93,0.0,0.0,1.0
...,...,...,...,...,...,...
395,0,620,4.00,1.0,0.0,0.0
396,0,560,3.04,0.0,1.0,0.0
397,0,460,2.63,1.0,0.0,0.0
398,0,700,3.65,1.0,0.0,0.0


## #03. 로지스틱 회귀분석 (모듈기능 활용)

In [23]:
# Fit the logit model through the project helper, using the two unscaled
# continuous predictors plus the three dummy columns, then print the
# fitted model's summary table.
logit_result = my_logit(
    mdf,
    x=['필기점수', '학부성적', '고수', '중수', '하수'],
    y='합격여부',
)
print(logit_result.summary())

Optimization terminated successfully.
         Current function value: 0.573147
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                   합격여부   No. Observations:                  400
Model:                          Logit   Df Residuals:                      394
Method:                           MLE   Df Model:                            5
Date:                Mon, 31 Jul 2023   Pseudo R-squ.:                 0.08292
Time:                        14:45:27   Log-Likelihood:                -229.26
converged:                       True   LL-Null:                       -249.99
Covariance Type:            nonrobust   LLR p-value:                 7.578e-08
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -3.9900      1.140     -3.500      0.000      -6.224      -1.756
필기점수           0.0023      0.

In [24]:
# Display the cmdf attribute of the helper's result: a 2x2 table with
# True/False rows and Positive/Negative columns (presumably the confusion
# matrix -- confirm against the helper module).
logit_result.cmdf

Unnamed: 0,Positive,Negative
True,254,19
False,97,30


In [25]:
# Display odds_rate_df: one odds_rate value per model term, including the
# Intercept row.
logit_result.odds_rate_df

Unnamed: 0,odds_rate
Intercept,0.0185
필기점수,1.002267
학부성적,2.234545
고수,0.508931
중수,0.261792
하수,0.211938


In [26]:
# Display prs; its value matches the "Pseudo R-squ." entry printed in the
# model summary.
logit_result.prs

0.08292194794441865

In [27]:
# NOTE(review): this cell repeats the previous one and shows the same prs
# value; it looks redundant.
logit_result.prs

0.08292194794441865

In [28]:
# Display the helper's one-row metrics table (pseudo R-squared, accuracy,
# precision, recall, fallout, specificity, RAS, f1_score).
logit_result.result_df

Unnamed: 0,설명력(Pseudo-Rsqe),정확도(Accuracy),정밀도(Precision),"재현율(Recall, TPR)","위양성율(Fallout, FPR)","특이성(Specificity, TNR)",RAS,f1_score
0,0.082922,0.71,0.612245,0.23622,0.105634,0.894366,0.583312,0.340909


### 표준화 적용


In [29]:
# Hand only the two continuous predictors to the project's scailing helper
# and preview the rescaled values (presumably z-score standardization --
# confirm in the helper module).
sdf = scailing(mdf.filter(items=['필기점수', '학부성적']))
sdf.head()

Unnamed: 0,필기점수,학부성적
0,-1.800263,0.579072
1,0.626668,0.736929
2,1.840134,1.605143
3,0.453316,-0.525927
4,-0.586797,-1.209974


In [30]:
# Overwrite the two continuous columns of mdf with their scaled versions
# from sdf, leaving the target and dummy columns untouched.
for scaled_col in ('필기점수', '학부성적'):
    mdf[scaled_col] = sdf[scaled_col]
mdf.head()


Unnamed: 0,합격여부,필기점수,학부성적,고수,중수,하수
0,0,-1.800263,0.579072,0.0,1.0,0.0
1,1,0.626668,0.736929,0.0,1.0,0.0
2,1,1.840134,1.605143,0.0,0.0,0.0
3,1,0.453316,-0.525927,0.0,0.0,1.0
4,0,-0.586797,-1.209974,0.0,0.0,1.0


In [32]:
# Refit the same logit specification now that 필기점수 and 학부성적 hold
# scaled values in mdf, then print the new summary.
logit_result = my_logit(
    mdf,
    x=['필기점수', '학부성적', '고수', '중수', '하수'],
    y='합격여부',
)
print(logit_result.summary())

Optimization terminated successfully.
         Current function value: 0.573147
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                   합격여부   No. Observations:                  400
Model:                          Logit   Df Residuals:                      394
Method:                           MLE   Df Model:                            5
Date:                Mon, 31 Jul 2023   Pseudo R-squ.:                 0.08292
Time:                        14:47:10   Log-Likelihood:                -229.26
converged:                       True   LL-Null:                       -249.99
Covariance Type:            nonrobust   LLR p-value:                 7.578e-08
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0664      0.266      0.250      0.802      -0.454       0.587
필기점수           0.2613      0.

In [33]:
# Display the metrics table for the refit on scaled predictors; the values
# shown are identical to those of the unscaled fit.
logit_result.result_df

Unnamed: 0,설명력(Pseudo-Rsqe),정확도(Accuracy),정밀도(Precision),"재현율(Recall, TPR)","위양성율(Fallout, FPR)","특이성(Specificity, TNR)",RAS,f1_score
0,0.082922,0.71,0.612245,0.23622,0.105634,0.894366,0.583312,0.340909
