#### 로지스틱 회귀

- 오즈(odds)
    - 로지스틱 회귀 분석은 확률의 오즈를 선형모형으로 모델링하는 개념
    - 어떤 사건이 발생할 확률과 그 사건이 발생하지 않을 확률 비율
    - odds = P / 1 - P
        - P: 사건이 일어날 확률 (0~1)
        - 1 - P: 사건이 일어나지 않을 확률
    

In [1]:
# 대학교 입학 데이터

import numpy as np
import pandas as pd

admission_data = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/admission.csv')
print(admission_data.shape)
print(admission_data.head())

(400, 5)
   admit  gre   gpa  rank gender
0      0  380  3.61     3      M
1      1  660  3.67     3      F
2      1  800  4.00     1      F
3      1  640  3.19     4      M
4      0  520  2.93     4      M


In [2]:
# 데이터에서 입학이 허가될 확률의 오즈 구하기

p_hat = admission_data['admit'].mean()
print(np.round(p_hat / (1 - p_hat), 3))

0.465


In [None]:
# 범주형 변수를 사용한 오즈 계산

unique_ranks = sorted(admission_data['rank'].unique())
print(unique_ranks)

# rank는 1에서 4등급 까지 존재

[1, 2, 3, 4]


In [6]:
grouped_data = admission_data.groupby('rank').agg(p_admit = ('admit', 'mean'))
grouped_data['odds'] = grouped_data['p_admit'] / (1 - grouped_data['p_admit'])
print(grouped_data)

       p_admit      odds
rank                    
1     0.540984  1.178571
2     0.357616  0.556701
3     0.231405  0.301075
4     0.179104  0.218182


#### 오즈(Odds)를 사용한 확률 역산


In [11]:
print(np.round(1.178 / (1.178 + 1) ,3))

0.541


#### 로지스틱 회귀계수 예측과 해석

- 로지스틱 회귀 분석의 계수를 구하는 내용은 MLE를 유도하는 것

In [15]:
odds_data = admission_data.groupby('rank').agg(p_admit = ('admit', 'mean')).reset_index()
odds_data['odds'] = odds_data['p_admit'] / (1 - odds_data['p_admit'])
odds_data['log_odds'] = np.log(odds_data['odds'])
print(odds_data)

   rank   p_admit      odds  log_odds
0     1  0.540984  1.178571  0.164303
1     2  0.357616  0.556701 -0.585727
2     3  0.231405  0.301075 -1.200395
3     4  0.179104  0.218182 -1.522427


In [17]:
# rank 변수는 범주형이지만 순서가 있는 변수 이므로 수치형으로 취급하고 진행

import statsmodels.formula.api as smf

model = smf.ols("log_odds ~rank", data=odds_data).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               log_odds   R-squared:                       0.972
Model:                            OLS   Adj. R-squared:                  0.957
Method:                 Least Squares   F-statistic:                     68.47
Date:                Tue, 17 Jun 2025   Prob (F-statistic):             0.0143
Time:                        20:15:41   Log-Likelihood:                 3.2107
No. Observations:                   4   AIC:                            -2.421
Df Residuals:                       2   BIC:                            -3.649
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.6327      0.188      3.368      0.0

  warn("omni_normtest is not valid with less than 8 observations; %i "


#### 로지스틱 회귀 분석 과정

In [18]:
admission_data = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/admission.csv')
import statsmodels.formula.api as smf
import statsmodels.api as sm

admission_data['rank'] = admission_data['rank'].astype('category')
admission_data['gender'] = admission_data['gender'].astype('category')

In [20]:
admission_data.head()

Unnamed: 0,admit,gre,gpa,rank,gender
0,0,380,3.61,3,M
1,1,660,3.67,3,F
2,1,800,4.0,1,F
3,1,640,3.19,4,M
4,0,520,2.93,4,M


#### 방법 1 Formula API 활용

In [None]:
model = smf.logit('admit ~ gpa + gpa + rank + gender', data=admission_data).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.578471
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  admit   No. Observations:                  400
Model:                          Logit   Df Residuals:                      394
Method:                           MLE   Df Model:                            5
Date:                Tue, 17 Jun 2025   Pseudo R-squ.:                 0.07440
Time:                        20:22:03   Log-Likelihood:                -231.39
converged:                       True   LL-Null:                       -249.99
Covariance Type:            nonrobust   LLR p-value:                 5.461e-07
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      -3.4211      1.108     -3.087      0.002      -5.593      -1.249
rank[T.2]      -0.6771    

In [24]:
model = smf.glm('admit ~ gre + gpa + rank + gender', data=admission_data, family=sm.families.Binomial()).fit()

#### 방법 2

In [25]:
# 범주형 변수 더미 변수로 변환

admission_data = pd.get_dummies(admission_data, columns=['rank', 'gender'], drop_first=True)

# bool 타입 int 변환
admission_data[['rank_2', 'rank_3', 'rank_4', 'gender_M']] = admission_data[['rank_2', 'rank_3', 'rank_4', 'gender_M']].astype(int)

# 독립변수와 종속변수 설정
X = admission_data[['gre', 'gpa', 'rank_2','rank_3', 'rank_4', 'gender_M']]
y = admission_data['admit']

# 상수항 추가
X = sm.add_constant(X)

# Logit 모델 적합 (로지스틱 회귀)
model = sm.Logit(y, X).fit()
print(model.summary())
#model = sm.GLM(y, X, family = sm.families.Binomial()).fit()

Optimization terminated successfully.
         Current function value: 0.573066
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  admit   No. Observations:                  400
Model:                          Logit   Df Residuals:                      393
Method:                           MLE   Df Model:                            6
Date:                Tue, 17 Jun 2025   Pseudo R-squ.:                 0.08305
Time:                        20:30:24   Log-Likelihood:                -229.23
converged:                       True   LL-Null:                       -249.99
Covariance Type:            nonrobust   LLR p-value:                 2.283e-07
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.9536      1.149     -3.442      0.001      -6.205      -1.702
gre            0.0023      0.

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [45]:
import pandas as pd
import numpy as np

# 예제 데이터 생성
np.random.seed(42)
n_samples = 210
X = np.random.randn(n_samples, 4)
y = (X[:, 0] + X[:, 1] * 0.5 + np.random.randn(n_samples) * 0.5 > 0).astype(int)
df = pd.DataFrame(X, columns = ['weight', 'height', 'age', 'income'])
df['gender'] = y

print(df.head())

     weight    height       age    income  gender
0  0.496714 -0.138264  0.647689  1.523030       1
1 -0.234153 -0.234137  1.579213  0.767435       0
2 -0.469474  0.542560 -0.463418 -0.465730       0
3  0.241962 -1.913280 -1.724918 -0.562288       0
4 -1.012831  0.314247 -0.908024 -1.412304       0


In [46]:
import statsmodels.formula.api as smf

model = smf.logit("gender ~ weight", data = df).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.426786
         Iterations 7


0,1,2,3
Dep. Variable:,gender,No. Observations:,210.0
Model:,Logit,Df Residuals:,208.0
Method:,MLE,Df Model:,1.0
Date:,"Tue, 17 Jun 2025",Pseudo R-squ.:,0.3836
Time:,21:19:27,Log-Likelihood:,-89.625
converged:,True,LL-Null:,-145.41
Covariance Type:,nonrobust,LLR p-value:,4.4450000000000006e-26

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.2743,0.188,1.459,0.145,-0.094,0.643
weight,2.6078,0.361,7.229,0.000,1.901,3.315


In [40]:
np.exp(model.params['weight'])

13.569314589014752

In [41]:
df.head()

Unnamed: 0,weight,height,age,income,gender
0,0.496714,-0.138264,0.647689,1.52303,1
1,-0.234153,-0.234137,1.579213,0.767435,0
2,-0.469474,0.54256,-0.463418,-0.46573,0
3,0.241962,-1.91328,-1.724918,-0.562288,0
4,-1.012831,0.314247,-0.908024,-1.412304,0


In [43]:
import statsmodels.formula.api as smf

model = smf.logit("gender ~ weight + height + age + income", data=df).fit()

model.summary()

Optimization terminated successfully.
         Current function value: 0.284256
         Iterations 8


0,1,2,3
Dep. Variable:,gender,No. Observations:,210.0
Model:,Logit,Df Residuals:,205.0
Method:,MLE,Df Model:,4.0
Date:,"Tue, 17 Jun 2025",Pseudo R-squ.:,0.5895
Time:,21:09:55,Log-Likelihood:,-59.694
converged:,True,LL-Null:,-145.41
Covariance Type:,nonrobust,LLR p-value:,5.161e-36

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.6549,0.262,2.499,0.012,0.141,1.169
weight,4.0574,0.578,7.021,0.000,2.925,5.190
height,2.1131,0.372,5.677,0.000,1.384,2.843
age,0.2838,0.220,1.287,0.198,-0.148,0.716
income,-0.1859,0.243,-0.764,0.445,-0.662,0.291


In [44]:
-2 * (model.llf)

119.38771008100466

In [53]:
# 1번 문제의 모델(몸무게를 독립변수로 사용한) 데이터를 학습 데이터와 평가 데이터(90개로 설정)로 분류 한 후, 오분류율을 계산하라

from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=90, random_state=42)

print(train.shape, test.shape)

logit_model = smf.logit("gender ~ weight", data=train).fit()

(120, 5) (90, 5)
Optimization terminated successfully.
         Current function value: 0.476228
         Iterations 7


In [54]:
########

In [55]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes

diabetes = load_diabetes(as_frame = True)
df = diabetes.frame
print(df.head())

        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  target  
0 -0.002592  0.019907 -0.017646   151.0  
1 -0.039493 -0.068332 -0.092204    75.0  
2 -0.002592  0.002861 -0.025930   141.0  
3  0.034309  0.022688 -0.009362   206.0  
4 -0.002592 -0.031988 -0.046641   135.0  


In [84]:
# 타겟 값 변경
df['target'] = (df['target'] > df['target'].median()).astype(int)

import statsmodels.formula.api as smf

model = smf.logit("target ~ age + sex + bmi + bp", data=df).fit()

model.summary()

p_values = model.pvalues

Optimization terminated successfully.
         Current function value: 0.543957
         Iterations 6


In [64]:
p_values = model.pvalues

In [67]:
p_values[p_values >= 0.05]

Intercept    0.679658
age          0.654140
sex          0.051523
dtype: float64

In [77]:
import statsmodels.formula.api as smf

new_model = smf.logit('target ~ bmi+bp', data=df).fit()

new_model.summary()

Optimization terminated successfully.
         Current function value: 0.548382
         Iterations 6


0,1,2,3
Dep. Variable:,target,No. Observations:,442.0
Model:,Logit,Df Residuals:,439.0
Method:,MLE,Df Model:,2.0
Date:,"Tue, 17 Jun 2025",Pseudo R-squ.:,0.2089
Time:,21:53:16,Log-Likelihood:,-242.39
converged:,True,LL-Null:,-306.37
Covariance Type:,nonrobust,LLR p-value:,1.626e-28

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0403,0.111,0.363,0.717,-0.177,0.258
bmi,21.2153,2.934,7.230,0.000,15.464,26.966
bp,12.0247,2.627,4.578,0.000,6.876,17.173


In [78]:
new_model.params.mean()

11.09340765301607

In [83]:
new_model.params

Intercept     0.040255
bmi          21.215278
bp           12.024689
dtype: float64

In [None]:
# 오즈비를 구하세요!
# 오즈비는 np.exp(회귀계수)

np.exp(new_model.params.mean())

65736.37163328465

In [85]:
model.params

Intercept     0.046241
age           1.130876
sex          -4.767892
bmi          21.100453
bp           12.996251
dtype: float64

In [86]:
np.exp( 1.130876)

3.0983694833319757