# 14_2.Logistic Regression(분류)(연습문제2)
- https://www.statsmodels.org/stable/discretemod.html

## 1.기본 package 설정

In [None]:
# Make Korean text render in plots: install the Nanum fonts, rebuild the
# system font cache, then delete matplotlib's cache directory
# (presumably so matplotlib rescans the fonts on next import — verify).
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

In [None]:
# Install pingouin (imported as `pg` in the import cell).
!pip install pingouin

#  *** Restart the runtime after the installation finishes

In [1]:
# 1.기본
import numpy as np  # numpy 패키지 가져오기
import matplotlib.pyplot as plt # 시각화 패키지 가져오기
import seaborn as sns # 시각화

# 2.데이터 가져오기
import pandas as pd # csv -> dataframe으로 전환

# 3.통계분석 package
import pingouin as pg
from scipy import stats
import statsmodels.api as sm

In [2]:
# Global plotting defaults
# Seaborn theme goes first and the font is set afterwards, keeping the
# original order (set_theme may overwrite font settings — confirm before
# reordering).
sns.set_theme(style="darkgrid")

# Use a font that contains Korean glyphs
plt.rcParams['font.family'] = 'NanumBarunGothic'
# Draw the minus sign as a plain ASCII hyphen instead of the Unicode minus
plt.rc('axes', unicode_minus=False)

## 2.데이터 불러오기

### 2.1 데이터 프레임으로 저장
- 원본데이터(csv)를 dataframe 형태로 가져오기(pandas)

In [3]:
# Location of the raw CSV; the file is encoded as cp949
DATA_URL = 'https://raw.githubusercontent.com/leecho-bigdata/statistics-python/main/14_3.UniversalBank.csv'

lr_df = pd.read_csv(DATA_URL, encoding="cp949")
lr_df.head()

Unnamed: 0,id,대출의도,나이,경력,수입,카드사용액,카드보유유무
0,1,0,25,1,49,1.6,0
1,2,0,45,19,34,1.5,0
2,3,0,39,15,11,1.0,0
3,4,0,35,9,100,2.7,0
4,5,0,35,8,45,1.0,1


### 2.2 범주형 변수 처리
- 가변수 처리시 문자로 처리를 해야 변수명 구분이 쉬움

In [15]:
# Recode the 0/1 indicator columns as labelled categories so that the dummy
# variables created later carry readable level names.
#
# The recoded Series is assigned back to the column rather than calling
# replace(..., inplace=True) on lr_df[...]: that form is a chained in-place
# update, which recent pandas versions warn about and which leaves lr_df
# unchanged when Copy-on-Write is enabled.
lr_df['대출의도'] = lr_df['대출의도'].replace({0: '거절', 1: '수락'}).astype('category')
lr_df['카드보유유무'] = lr_df['카드보유유무'].replace({0: '없음', 1: '있음'}).astype('category')

lr_df

Unnamed: 0,id,대출의도,나이,경력,수입,카드사용액,카드보유유무
0,1,거절,25,1,49,1.6,없음
1,2,거절,45,19,34,1.5,없음
2,3,거절,39,15,11,1.0,없음
3,4,거절,35,9,100,2.7,없음
4,5,거절,35,8,45,1.0,있음
...,...,...,...,...,...,...,...
195,196,거절,34,10,13,1.0,없음
196,197,거절,48,24,165,5.0,있음
197,198,거절,55,31,9,0.7,없음
198,199,거절,27,3,59,0.0,없음


### 2.3 자료구조 살펴보기

In [16]:
# (number of rows, number of columns) of the data frame
lr_df.shape

(200, 7)

In [17]:
# Per-column dtype and non-null count, plus memory usage
lr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      200 non-null    int64   
 1   대출의도    200 non-null    category
 2   나이      200 non-null    int64   
 3   경력      200 non-null    int64   
 4   수입      200 non-null    int64   
 5   카드사용액   200 non-null    float64 
 6   카드보유유무  200 non-null    category
dtypes: category(2), float64(1), int64(4)
memory usage: 8.6 KB


In [18]:
# Column labels of the data frame
lr_df.columns

Index(['id', '대출의도', '나이', '경력', '수입', '카드사용액', '카드보유유무'], dtype='object')

## 3.기술통계

In [19]:
# Descriptive statistics of the numeric columns, rounded to three decimals,
# with one row per variable (no grouping is applied here)
lr_df.describe().round(decimals=3).transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,200.0,100.5,57.879,1.0,50.75,100.5,150.25,200.0
나이,200.0,45.015,11.316,24.0,36.0,46.0,54.0,67.0
경력,200.0,19.89,11.262,-1.0,11.0,20.0,29.0,41.0
수입,200.0,73.835,46.148,8.0,39.0,64.5,109.75,194.0
카드사용액,200.0,2.061,1.849,0.0,0.7,1.5,2.7,8.9


In [20]:
# Frequency table for each categorical column listed below
categorical_features = ['대출의도']

for feature in categorical_features:
    print(f"---- {feature} ----")
    counts = lr_df[feature].value_counts()
    print(counts, "\n")

---- 대출의도 ----
거절    181
수락     19
Name: 대출의도, dtype: int64 



## 4.Logistic Regression
- https://www.statsmodels.org/stable/examples/notebooks/generated/ols.html
- 수치형 + 범주형
- patsy의 dmatrices 사용


### 4.1 회귀분석

In [29]:
# Full candidate predictor list, kept for reference:
# columns = ['나이', '경력', '수입', '카드사용액', 'C(카드보유유무)']

# '나이' is left out to remove multicollinearity
columns = [
    '경력',
    '수입',
    '카드사용액',
    'C(카드보유유무)',
]

# Formula string: response on the left, predictors joined by " + "
formula = f"대출의도 ~ {' + '.join(columns)}"
formula

'대출의도 ~ 경력 + 수입 + 카드사용액 + C(카드보유유무)'

In [30]:
# Build the response (y) and predictor (X) matrices from the formula with
# patsy, returned as pandas DataFrames
from patsy import dmatrices

y, X = dmatrices(formula, lr_df, return_type='dataframe')

In [31]:
# Preview the first rows of the predictor matrix
X.head()

Unnamed: 0,Intercept,C(카드보유유무)[T.있음],경력,수입,카드사용액
0,1.0,0.0,1.0,49.0,1.6
1,1.0,0.0,19.0,34.0,1.5
2,1.0,0.0,15.0,11.0,1.0
3,1.0,0.0,9.0,100.0,2.7
4,1.0,1.0,8.0,45.0,1.0


In [32]:
# Discard the '대출의도[거절]' column of the response matrix and keep the rest
# as the target passed to the model
y = y.drop(columns='대출의도[거절]')
y.head()

Unnamed: 0,대출의도[수락]
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [33]:
# Set up the logit model: y is the response, X the predictor matrix
model = sm.Logit(endog=y, exog=X)
# Fit the model
result = model.fit()

Optimization terminated successfully.
         Current function value: 0.161036
         Iterations 9


In [34]:
# Print the summary table of the fitted model
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:               대출의도[수락]   No. Observations:                  200
Model:                          Logit   Df Residuals:                      195
Method:                           MLE   Df Model:                            4
Date:                Sat, 09 Mar 2024   Pseudo R-squ.:                  0.4871
Time:                        09:58:45   Log-Likelihood:                -32.207
converged:                       True   LL-Null:                       -62.791
Covariance Type:            nonrobust   LLR p-value:                 1.648e-12
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -7.4939      1.516     -4.943      0.000     -10.465      -4.523
C(카드보유유무)[T.있음]    -0.1935      0.680     -0.285      0.776      -1.526       1.139
경력                 -0.0354      

### 4.2 odds

In [35]:
# Fitted coefficients of the logit model
coefficients = result.params

print("===== 계수 =====")
print(coefficients)
print("\n")

# Exponentiated coefficients (for a logit model these are odds ratios)
print("===== odds =====")
print(np.exp(coefficients))

===== 계수 =====
Intercept         -7.493916
C(카드보유유무)[T.있음]   -0.193535
경력                -0.035417
수입                 0.047949
카드사용액              0.182931
dtype: float64


===== odds =====
Intercept          0.000556
C(카드보유유무)[T.있음]    0.824041
경력                 0.965202
수입                 1.049117
카드사용액              1.200731
dtype: float64


## 5.가정검정
- https://ethanweed.github.io/pythonbook/05.04-regression.html#regressionnormality
- 잔차의 등분산성: Breusch-Pagan
- 잔차의 정규성: Jarque-Bera, Omnibus(D'Agostino's test)
- 독립성(자기상관): Durbin-Watson
- 다중공선성(VIF): Cond. No

### 5.1 다중 공선성
- VIF가 10 이상인 변수는 제거

In [36]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Convert the predictor matrix to a NumPy array once, then compute the
# variance inflation factor of every column (the Intercept column included)
exog = X.values
n_columns = exog.shape[1]

vif = pd.DataFrame()
vif["VIF Factor"] = [
    variance_inflation_factor(exog, position)
    for position in range(n_columns)
]
vif["features"] = X.columns
print(vif.round(1))

   VIF Factor         features
0         7.4        Intercept
1         1.0  C(카드보유유무)[T.있음]
2         1.0               경력
3         1.7               수입
4         1.8            카드사용액


In [37]:
# 나이 제거하고 다시 수행