# 패키지

In [12]:
from pandas import read_excel, DataFrame

# 로지스틱 회귀 모듈1
from statsmodels.formula.api import logit
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, accuracy_score, recall_score, precision_score, f1_score

# 시각확
import seaborn as sb
from matplotlib import pyplot as plt
import numpy as np
import sys

# 문제 1

피마 인디언 당뇨병 발병여부를 예측할 수 있는 분석 모델을 구현하기 위해 아래와 같은 항목들을 조사하였다. 

분석하라.

단, 모든 독립변수는 명목형 변수를 포함하지 않으며 정규분포를 만족한다고 가정한다.

In [13]:
df = read_excel("https://data.hossam.kr/E05/indian_diabetes.xlsx")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [15]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [16]:
print(df.columns)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


## #1. 로지스틱 회귀

In [17]:
df1 = df.copy()
model = logit("Outcome ~ Pregnancies + Glucose + BloodPressure + SkinThickness + Insulin + BMI + DiabetesPedigreeFunction + Age", data=df1)
fit = model.fit()
print(fit.summary())

Optimization terminated successfully.
         Current function value: 0.470993
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  768
Model:                          Logit   Df Residuals:                      759
Method:                           MLE   Df Model:                            8
Date:                Mon, 31 Jul 2023   Pseudo R-squ.:                  0.2718
Time:                        15:37:04   Log-Likelihood:                -361.72
converged:                       True   LL-Null:                       -496.74
Covariance Type:            nonrobust   LLR p-value:                 9.652e-54
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept                   -8.4047      0.717    -11.728      0.000      -9.809

### 의사결정계수 확인

In [18]:
fit.prsquared

0.27180966859224576

In [19]:
df1.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

### DF 생성

In [23]:
df1["예측값"] = fit.predict(df1.drop("Outcome", axis=1))
df1['예측결과'] = df1['예측값'] > 0.5
df1

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,예측값,예측결과
0,6,148,72,35,0,33.6,0.627,50,1,0.721727,True
1,1,85,66,29,0,26.6,0.351,31,0,0.048642,False
2,8,183,64,0,0,23.3,0.672,32,1,0.796702,True
3,1,89,66,23,94,28.1,0.167,21,0,0.041625,False
4,0,137,40,35,168,43.1,2.288,33,1,0.902184,True
...,...,...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0,0.317115,False
764,2,122,70,27,0,36.8,0.340,27,0,0.318969,False
765,5,121,72,23,112,26.2,0.245,30,0,0.170416,False
766,1,126,60,0,0,30.1,0.349,47,1,0.284976,False


### 혼동행렬

In [24]:
cm = confusion_matrix(df1['Outcome'], df1['예측결과'])
tn, fp, fn, tp = cm.ravel()

print("혼동행렬 구분\n" "tn :",tn, "fp :",fp, "fn :",fn, "tp :",tp)

cmdf = DataFrame([[tn, tp], [fn, fp]], index=['True', 'False'], columns=['Negative', 'Positive'])
cmdf

혼동행렬 구분
tn : 445 fp : 55 fn : 112 tp : 156


Unnamed: 0,Negative,Positive
True,445,156
False,112,55


### 데이터 값 시각화

In [25]:
ras = roc_auc_score(df1['Outcome'], df1['예측결과'])

# 정확도
acc = accuracy_score(df1['Outcome'], df1['예측결과'])

# 정밀도
pre = precision_score(df1['Outcome'], df1['예측결과'])

# 재현율
recall = recall_score(df1['Outcome'], df1['예측결과'])

# F1 score
f1 = f1_score(df1['Outcome'], df1['예측결과'])

# 위양성율
fallout = fp / (fp + tn)

# 특이성
spe = 1 - fallout

result_df = DataFrame({'설명력(Pseudo-Rsqe)': [fit.prsquared], '정확도(Accuracy)':[acc], '정밀도(Precision)':[pre], '재현율(Recall, TPR)':[recall], '위양성율(Fallout, FPR)': [fallout], '특이성(Specificity, TNR)':[spe], 'RAS': [ras], 'f1_score':[f1]})

result_df.T

Unnamed: 0,0
설명력(Pseudo-Rsqe),0.27181
정확도(Accuracy),0.782552
정밀도(Precision),0.739336
"재현율(Recall, TPR)",0.58209
"위양성율(Fallout, FPR)",0.11
"특이성(Specificity, TNR)",0.89
RAS,0.736045
f1_score,0.651357


### 오즈비(Odds Rate) 구하기

#### 계수 값만 추출

In [26]:
coef = fit.params
coef

Intercept                  -8.404696
Pregnancies                 0.123182
Glucose                     0.035164
BloodPressure              -0.013296
SkinThickness               0.000619
Insulin                    -0.001192
BMI                         0.089701
DiabetesPedigreeFunction    0.945180
Age                         0.014869
dtype: float64

#### 오즈비 계산

In [27]:
odds_rate = np.exp(coef)
odds_rate

Intercept                   0.000224
Pregnancies                 1.131091
Glucose                     1.035789
BloodPressure               0.986792
SkinThickness               1.000619
Insulin                     0.998809
BMI                         1.093847
DiabetesPedigreeFunction    2.573276
Age                         1.014980
dtype: float64

In [29]:
myresult = fit.predict(df1)
myresult

0      0.721727
1      0.048642
2      0.796702
3      0.041625
4      0.902184
         ...   
763    0.317115
764    0.318969
765    0.170416
766    0.284976
767    0.072014
Length: 768, dtype: float64