# 14_2.Logistic Regression(분류)(연습문제1)
- https://www.statsmodels.org/stable/discretemod.html

## 1.기본 package 설정

In [None]:
# Make Korean text render in plots: install the Nanum font package,
# force a rebuild of the system font cache, then delete matplotlib's cache
# directory (presumably so matplotlib re-scans the fonts on next import).
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

In [None]:
!pip install pingouin

#  *** Restart the runtime after this cell

In [30]:
# 1. Basics
import numpy as np  # numerical computing
import matplotlib.pyplot as plt # plotting
import seaborn as sns # statistical visualization

# 2. Data loading
import pandas as pd # reads the CSV into a DataFrame

# 3. Statistical analysis packages
import pingouin as pg
from scipy import stats
import statsmodels.api as sm

In [31]:
# Global plotting defaults
# Apply the seaborn theme first and the font override afterwards
# (presumably set_theme would otherwise replace the font setting).
sns.set_theme(style="darkgrid")

plt.rcParams.update({
    'font.family': 'NanumBarunGothic',  # font that contains Korean glyphs
    'axes.unicode_minus': False,        # draw '-' as an ASCII hyphen on axes
})

## 2.데이터 불러오기

### 2.1 데이터 프레임으로 저장
- 원본데이터(csv)를 dataframe 형태로 가져오기(pandas)

In [32]:
# Load the HR data set (CP949-encoded CSV hosted on GitHub) into a DataFrame
DATA_URL = 'https://raw.githubusercontent.com/leecho-bigdata/statistics-python/main/14_2.HR.csv'
lr_df = pd.read_csv(DATA_URL, encoding="cp949")
lr_df.head()

Unnamed: 0,id,이직의도,신체적건강,심리적건강,조직만족,이직경험
0,1,1,43,18,28,2
1,2,1,54,27,28,2
2,3,1,60,30,26,1
3,4,1,57,17,23,1
4,5,1,60,30,29,2


### 2.2 범주형 변수 처리
- 가변수 처리시 문자로 처리를 해야 변수명 구분이 쉬움

In [33]:
# Recode the numeric codes (1 / 2) of both categorical variables as text labels
# and store them as pandas categoricals, so the dummy-variable names generated
# later are easy to read.
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: that chained in-place
# form is deprecated in pandas 2.x and leaves lr_df unchanged when
# Copy-on-Write is enabled.
code_labels = {1: '없음', 2: '있음'}
for col in ['이직의도', '이직경험']:
    lr_df[col] = lr_df[col].replace(code_labels).astype('category')
lr_df

Unnamed: 0,id,이직의도,신체적건강,심리적건강,조직만족,이직경험
0,1,없음,43,18,28,있음
1,2,없음,54,27,28,있음
2,3,없음,60,30,26,없음
3,4,없음,57,17,23,없음
4,5,없음,60,30,29,있음
...,...,...,...,...,...,...
95,96,있음,33,15,18,있음
96,97,있음,41,26,16,없음
97,98,있음,27,15,14,없음
98,99,있음,45,22,23,있음


### 2.3 자료구조 살펴보기

In [34]:
# (number of rows, number of columns) of the loaded data
lr_df.shape

(100, 6)

In [35]:
# Per-column dtype and non-null count
lr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      100 non-null    int64   
 1   이직의도    100 non-null    category
 2   신체적건강   100 non-null    int64   
 3   심리적건강   100 non-null    int64   
 4   조직만족    100 non-null    int64   
 5   이직경험    100 non-null    category
dtypes: category(2), int64(4)
memory usage: 3.7 KB


In [36]:
# Column names of the DataFrame
lr_df.columns

Index(['id', '이직의도', '신체적건강', '심리적건강', '조직만족', '이직경험'], dtype='object')

## 3.기술통계

In [37]:
# Descriptive statistics of the numeric columns over the whole sample
# (not grouped), rounded to 3 decimals and transposed to one row per variable
lr_df.describe().round(3).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,100.0,50.5,29.011,1.0,25.75,50.5,75.25,100.0
신체적건강,100.0,44.86,7.448,26.0,41.0,45.0,48.0,60.0
심리적건강,100.0,21.78,4.877,9.0,18.0,22.0,24.0,30.0
조직만족,100.0,22.95,4.208,12.0,21.0,24.0,26.0,32.0


In [38]:
# Frequency table for each categorical variable listed here
categorical_features = ['이직의도']

for col in categorical_features:
    print(f"---- {col} ----")
    results = lr_df[col].value_counts()
    # trailing " \n" keeps a blank line between consecutive tables
    print(f"{results} \n")

---- 이직의도 ----
없음    72
있음    28
Name: 이직의도, dtype: int64 



## 4.Logistic Regression
- https://www.statsmodels.org/stable/examples/notebooks/generated/ols.html
- 수치형 + 범주형
- patsy의 dmatrices 사용


### 4.1 회귀분석

In [39]:
# Predictors of the logit model; C(...) marks 이직경험 as categorical
# in the formula.
columns = ['신체적건강', '심리적건강', '조직만족', 'C(이직경험)']

# Formula of the form "response ~ x1 + x2 + ...", with 이직의도 as the response
predictors = " + ".join(columns)
formula = f"이직의도 ~ {predictors}"
formula

'이직의도 ~ 신체적건강 + 심리적건강 + 조직만족 + C(이직경험)'

In [43]:
# Build the response (y) and design (X) matrices from the formula with patsy,
# returned as pandas DataFrames
from patsy import dmatrices

y, X = dmatrices(formula, lr_df, return_type='dataframe')

In [44]:
# Preview the first rows of the design matrix
X.head()

Unnamed: 0,Intercept,C(이직경험)[T.있음],신체적건강,심리적건강,조직만족
0,1.0,1.0,43.0,18.0,28.0
1,1.0,1.0,54.0,27.0,28.0
2,1.0,0.0,60.0,30.0,26.0
3,1.0,0.0,57.0,17.0,23.0
4,1.0,1.0,60.0,30.0,29.0


In [45]:
# The response matrix holds one indicator column per level of 이직의도
# ('이직의도[없음]' and '이직의도[있음]'). Keep only the '있음' indicator so the
# model estimates P(이직의도 = 있음).
# Selecting the wanted column by name, rather than dropping the other one,
# means re-running this cell no longer raises KeyError once
# '이직의도[없음]' has already been removed.
y = y[['이직의도[있음]']]
y.head()

Unnamed: 0,이직의도[있음]
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [46]:
# Specify the logit model: y is the binary response, X the design matrix
model = sm.Logit(endog=y, exog=X)
# Fit the model; convergence information is printed by fit()
result = model.fit()

Optimization terminated successfully.
         Current function value: 0.277245
         Iterations 8


In [47]:
# Print the summary table of the fitted model
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:               이직의도[있음]   No. Observations:                  100
Model:                          Logit   Df Residuals:                       95
Method:                           MLE   Df Model:                            4
Date:                Sat, 09 Mar 2024   Pseudo R-squ.:                  0.5324
Time:                        09:59:44   Log-Likelihood:                -27.725
converged:                       True   LL-Null:                       -59.295
Covariance Type:            nonrobust   LLR p-value:                 6.336e-13
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        14.8034      3.605      4.107      0.000       7.739      21.868
C(이직경험)[T.있음]     2.0283      0.852      2.381      0.017       0.359       3.698
신체적건강            -0.0561      0.067     

### 4.2 odds

In [48]:
# Show the fitted coefficients, then their exponentials
# (exp of a logit coefficient is the odds ratio for that term)
coefficients = result.params

print("===== 계수 =====")
print(coefficients)
print("\n")
print("===== odds =====")
print(np.exp(coefficients))

===== 계수 =====
Intercept        14.803356
C(이직경험)[T.있음]     2.028263
신체적건강            -0.056097
심리적건강            -0.065524
조직만족             -0.603206
dtype: float64


===== odds =====
Intercept        2.685443e+06
C(이직경험)[T.있음]    7.600869e+00
신체적건강            9.454476e-01
심리적건강            9.365762e-01
조직만족             5.470551e-01
dtype: float64


## 5.가정검정
- https://ethanweed.github.io/pythonbook/05.04-regression.html#regressionnormality
- 잔차의 등분산성: Breusch-Pagan
- 잔차의 정규성: Jarque-Bera, Omnibus(D'Agostino's test)
- 독립성(자기상관): Durbin-Watson
- 다중공선성(VIF): Cond. No

### 5.1 다중 공선성
- VIF가 10 이상인 변수는 삭제

In [49]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor for every column of the design matrix X
# (the Intercept column is included because it is part of X)
design_values = X.values
vif = pd.DataFrame({
    "VIF Factor": [
        variance_inflation_factor(design_values, idx)
        for idx in range(design_values.shape[1])
    ],
    "features": X.columns,
})
print(vif.round(1))

   VIF Factor       features
0        54.2      Intercept
1         1.0  C(이직경험)[T.있음]
2         1.9          신체적건강
3         1.9          심리적건강
4         1.1           조직만족
