In [1]:
import pandas as pd
import numpy as np

# Folder that holds the input CSVs. Keeping it in one constant means the
# notebook can be pointed at another machine/location by editing a single line
# instead of every read_csv call.
DATA_DIR = "C:/Users/JEONGHEE/Desktop/당뇨병플젝"

# One frame per sex; both files are read with pandas' default CSV settings.
male_df = pd.read_csv(f"{DATA_DIR}/male.csv")
female_df = pd.read_csv(f"{DATA_DIR}/female.csv")

In [2]:
# Display the (rows, columns) shape of the male frame as a quick load check.
male_df.shape

(36558, 29)

In [3]:
# Display the (rows, columns) shape of the female frame as a quick load check.
female_df.shape

(70308, 29)

In [5]:
# Step 1. 정규성 검정 (Shapiro-Wilk test; 이 노트북의 아래 셀에서는 Kolmogorov–Smirnov 검정으로 수행)
# Step 2. 등분산성 검정 (Levene’s test; 정규성이 모두 기각되어 아래 코드에서는 수행하지 않고 Mann-Whitney U test로 진행)

# → 정규성 만족 & 등분산성 만족: 독립표본 t-test
# → 정규성 만족 & 등분산성 X : Welch’s t-test
# → 정규성 X : Mann-Whitney U test (aka Wilcoxon rank-sum)

# 남자

## 등분산성.정규성 검정

In [4]:
from scipy.stats import ttest_ind, chi2_contingency

# Name of the outcome column; it is skipped when variables are typed and is used to split the two groups.
target_col = 'target'

def classify_variable(series, threshold=10):
    """Label a column as 'continuous' or 'categorical'.

    Parameters
    ----------
    series : pandas.Series
        Column to classify.
    threshold : int, default 10
        A numeric column with at most this many distinct non-null values is
        treated as categorical.

    Returns
    -------
    str
        'continuous' for a numeric column with more than ``threshold`` distinct
        values; 'categorical' for every other column (including all
        non-numeric columns).
    """
    # Short-circuit keeps nunique() from being evaluated for non-numeric columns.
    numeric = pd.api.types.is_numeric_dtype(series)
    if numeric and series.nunique() > threshold:
        return 'continuous'
    return 'categorical'

# Type every predictor column of the male frame (all columns except the
# target) with classify_variable's default threshold.
predictor_cols = [c for c in male_df.columns if c != target_col]
var_types = {c: classify_variable(male_df[c]) for c in predictor_cols}

continuous_vars = [name for name, kind in var_types.items() if kind == 'continuous']
categorical_vars = [name for name, kind in var_types.items() if kind == 'categorical']

# Split the male frame by outcome label: rows with target 0 and rows with target 1.
target_values = male_df[target_col]
group0 = male_df[target_values == 0]
group1 = male_df[target_values == 1]

In [6]:
# Continuous variables -> two-sample t-test between group0 and group1.
# NOTE(review): ttest_ind is called without equal_var, i.e. with SciPy's
# default (Student's t-test); the plan comments in this notebook call for
# Welch's test when variances differ — confirm which one is intended.
ttest_results = []
for var in continuous_vars:
    try:
        mean0, std0 = group0[var].mean(), group0[var].std()
        mean1, std1 = group1[var].mean(), group1[var].std()
        # nan_policy='omit' makes the test ignore missing values.
        _, p_val = ttest_ind(group0[var], group1[var], nan_policy='omit')
        ttest_results.append({
            '변수': var,
            '비당뇨 (평균±SD)': f'{mean0:.2f} ± {std0:.2f}',
            '당뇨 (평균±SD)': f'{mean1:.2f} ± {std1:.2f}',
            # Compare with `<` so a NaN p-value is printed as 'nan' instead of
            # being mislabelled '<0.0001' (any comparison with NaN is False).
            'p-value': '<0.0001' if p_val < 0.0001 else f'{p_val:.4f}'
        })
    except Exception as exc:
        # Still best-effort, but report what was dropped instead of silently
        # losing the variable; KeyboardInterrupt is no longer trapped.
        print(f'[skipped] {var}: {type(exc).__name__}: {exc}')
        continue

# One row per successfully tested variable.
ttest_df = pd.DataFrame(ttest_results)

# Categorical variables -> chi-square test of independence against the target.
chi2_results = []
for var in categorical_vars:
    try:
        # Contingency table: levels of the variable (rows) x target values (columns).
        table = pd.crosstab(male_df[var], male_df[target_col])
        _, p_val, _, _ = chi2_contingency(table)
        chi2_results.append({
            '변수': var,
            # Compare with `<` so a NaN p-value is printed as 'nan' instead of
            # being mislabelled '<0.0001' (any comparison with NaN is False).
            'p-value': '<0.0001' if p_val < 0.0001 else f'{p_val:.4f}'
        })
    except Exception as exc:
        # Still best-effort, but report what was dropped instead of silently
        # losing the variable; KeyboardInterrupt is no longer trapped.
        print(f'[skipped] {var}: {type(exc).__name__}: {exc}')
        continue

# One row per successfully tested variable.
chi2_df = pd.DataFrame(chi2_results)


# Print each results table under its heading (heading and table on separate lines).
print("연속형 변수 t-test 결과:", ttest_df, sep="\n")

print("\n범주형 변수 카이제곱 결과:", chi2_df, sep="\n")

연속형 변수 t-test 결과:
             변수          비당뇨 (평균±SD)           당뇨 (평균±SD)  p-value
0       DS1_AGE         53.14 ± 8.73         54.75 ± 8.20  <0.0001
1    DS1_WALKFQ  50696.16 ± 37051.81  53627.14 ± 35997.25   0.0019
2     DS1_WALKT  50714.95 ± 37026.11  53643.59 ± 35972.75   0.0019
3     DS1_WAIST         85.47 ± 7.47         88.87 ± 7.67  <0.0001
4       DS1_HIP         96.02 ± 5.60         97.22 ± 6.16  <0.0001
5     DS1_PULSE         68.38 ± 9.68        71.68 ± 10.58  <0.0001
6       DS1_SBP       125.44 ± 14.34       130.96 ± 14.96  <0.0001
7       DS1_DBP         78.70 ± 9.77         81.19 ± 9.70  <0.0001
8       DS1_BMI         24.34 ± 2.73         25.45 ± 2.92  <0.0001
9       DS1_PBF         23.29 ± 4.45         25.28 ± 4.24  <0.0001
10  DS1_BODYFAT         16.45 ± 4.75         18.59 ± 5.21  <0.0001
11   DS1_MUSCLE         49.01 ± 5.11         49.64 ± 5.57  <0.0001
12   DS1_VISFAT          2.49 ± 1.02          2.96 ± 1.18  <0.0001
13      DS1_WHR          0.89 ± 0.05        

In [8]:
from scipy.stats import kstest

ks_results = []

# Normality check on the whole male frame (both outcome groups pooled).
# NOTE(review): the normal's mean/std are estimated from the same sample that
# is being tested; the plain K-S p-value assumes a fully specified reference
# distribution, so a Lilliefors-type correction may be more appropriate — confirm.
for var in continuous_vars:
    try:
        data = male_df[var].dropna()
        mean = data.mean()
        std = data.std()

        # K-S test against a normal distribution with the sample's own mean and std.
        stat, p_val = kstest(data, 'norm', args=(mean, std))

        ks_results.append({
            '변수': var,
            'Kolmogorov-Smirnov p-value': f'{p_val:.4f}',
            # Normality is reported as satisfied only when p > 0.05.
            '정규성 판단': '만족' if p_val > 0.05 else '불만족'
        })
    except Exception as exc:
        # Still best-effort, but report what was dropped instead of silently
        # losing the variable; KeyboardInterrupt is no longer trapped.
        print(f'[skipped] {var}: {type(exc).__name__}: {exc}')
        continue

ks_df = pd.DataFrame(ks_results)


print("Kolmogorov–Smirnov 정규성 검정 결과:")
print(ks_df)

Kolmogorov–Smirnov 정규성 검정 결과:
             변수 Kolmogorov-Smirnov p-value 정규성 판단
0       DS1_AGE                     0.0000    불만족
1    DS1_WALKFQ                     0.0000    불만족
2     DS1_WALKT                     0.0000    불만족
3     DS1_WAIST                     0.0000    불만족
4       DS1_HIP                     0.0000    불만족
5     DS1_PULSE                     0.0000    불만족
6       DS1_SBP                     0.0000    불만족
7       DS1_DBP                     0.0000    불만족
8       DS1_BMI                     0.0000    불만족
9       DS1_PBF                     0.0000    불만족
10  DS1_BODYFAT                     0.0000    불만족
11   DS1_MUSCLE                     0.0000    불만족
12   DS1_VISFAT                     0.0000    불만족
13      DS1_WHR                     0.0000    불만족
14     DS1_GLU0                     0.0000    불만족
15    DS1_HBA1C                     0.0000    불만족
16   total_exer                     0.0000    불만족


In [9]:
# ##### 전부 불만족 
# #### Mann–Whitney U test 적용

In [11]:
from scipy.stats import mannwhitneyu

mw_results = []

# Non-parametric comparison of group0 vs group1 for every continuous variable.
# NOTE(review): the recorded output shows a median of 77777 for DS1_WALKFQ and
# DS1_WALKT in both groups, which looks like a placeholder code rather than a
# measurement — confirm against the data dictionary before interpreting them.
for var in continuous_vars:
    try:
        # Missing values are removed per group before testing.
        data0 = group0[var].dropna()
        data1 = group1[var].dropna()
        _, p_val = mannwhitneyu(data0, data1, alternative='two-sided')
        mw_results.append({
            '변수': var,
            '비당뇨 (중앙값)': round(data0.median(), 2),
            '당뇨 (중앙값)': round(data1.median(), 2),
            # Compare with `<` so a NaN p-value is printed as 'nan' instead of
            # being mislabelled '<0.0001' (any comparison with NaN is False).
            'p-value': '<0.0001' if p_val < 0.0001 else f'{p_val:.4f}'
        })
    except Exception as exc:
        # Still best-effort, but report what was dropped instead of silently
        # losing the variable; KeyboardInterrupt is no longer trapped.
        print(f'[skipped] {var}: {type(exc).__name__}: {exc}')
        continue


mw_df = pd.DataFrame(mw_results)
print("Mann–Whitney U test (비모수 검정) 결과:\n")
print(mw_df)

Mann–Whitney U test (비모수 검정) 결과:

             변수  비당뇨 (중앙값)  당뇨 (중앙값)  p-value
0       DS1_AGE      53.00     55.00  <0.0001
1    DS1_WALKFQ   77777.00  77777.00   0.0003
2     DS1_WALKT   77777.00  77777.00   0.0021
3     DS1_WAIST      85.50     89.00  <0.0001
4       DS1_HIP      96.00     97.00  <0.0001
5     DS1_PULSE      68.00     71.00  <0.0001
6       DS1_SBP     125.00    130.00  <0.0001
7       DS1_DBP      80.00     80.00  <0.0001
8       DS1_BMI      24.27     25.25  <0.0001
9       DS1_PBF      23.70     25.60  <0.0001
10  DS1_BODYFAT      16.40     18.20  <0.0001
11   DS1_MUSCLE      48.60     49.20  <0.0001
12   DS1_VISFAT       2.40      2.80  <0.0001
13      DS1_WHR       0.89      0.91  <0.0001
14     DS1_GLU0      92.00    134.00  <0.0001
15    DS1_HBA1C       5.50      6.70  <0.0001
16   total_exer       1.50      0.75   0.0147


In [12]:
###### p-value 유의미한 변수 17개 

## 다중공선성 

In [15]:
# 남자 다중공선성
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant


# Show floats with two decimals in every DataFrame printed from here on
# (this is a global pandas display option).
pd.set_option('display.float_format', '{:.2f}'.format)

# Continuous predictors screened for multicollinearity. 'target' is the
# dependent variable and is left out; DS1_HBA1C is not in this list either.
# NOTE(review): the original comment said these variables satisfied equal
# variance, but no equal-variance test is visible in this notebook — confirm.
vif_columns = [
    'DS1_AGE', 'DS1_WALKFQ', 'DS1_WALKT',
    'DS1_WAIST', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
    'DS1_DBP', 'DS1_BMI', 'DS1_PBF', 'DS1_BODYFAT',
    'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
    'DS1_GLU0', 'total_exer',
]
X = male_df[vif_columns]

# Add the intercept column, so the design matrix has a 'const' entry as well.
X_const = add_constant(X)

# One VIF per column of the design matrix, 'const' included.
design_matrix = X_const.values
vif_data = pd.DataFrame({
    'Variable': X_const.columns,
    'VIF': [
        variance_inflation_factor(design_matrix, idx)
        for idx in range(design_matrix.shape[1])
    ],
})
print(vif_data)

       Variable        VIF
0         const   18132.32
1       DS1_AGE       1.29
2    DS1_WALKFQ 2395348.52
3     DS1_WALKT 2395320.79
4     DS1_WAIST     141.78
5       DS1_HIP      67.08
6     DS1_PULSE       1.05
7       DS1_SBP       2.39
8       DS1_DBP       2.34
9       DS1_BMI       5.39
10      DS1_PBF      22.46
11  DS1_BODYFAT      34.91
12   DS1_MUSCLE       5.99
13   DS1_VISFAT       5.36
14      DS1_WHR      60.44
15     DS1_GLU0       1.05
16   total_exer       1.06


# 여자

## 등분산성.정규성 검정

In [29]:

from scipy.stats import ttest_ind, chi2_contingency, kstest, mannwhitneyu

# Re-read the female CSV. This is the same path already loaded into female_df
# in the first cell, so the frame is rebuilt from disk here.
female_df = pd.read_csv("C:/Users/JEONGHEE/Desktop/당뇨병플젝/female.csv")


# Name of the outcome column; it is skipped when variables are typed and is used to split the two groups.
target_col = 'target'
def classify_variable(series, threshold=10):
    """Label a column as 'continuous' or 'categorical'.

    Parameters
    ----------
    series : pandas.Series
        Column to classify.
    threshold : int, default 10
        A numeric column with at most this many distinct non-null values is
        treated as categorical.

    Returns
    -------
    str
        'continuous' for a numeric column with more than ``threshold`` distinct
        values; 'categorical' for every other column (including all
        non-numeric columns).
    """
    # Short-circuit keeps nunique() from being evaluated for non-numeric columns.
    numeric = pd.api.types.is_numeric_dtype(series)
    if numeric and series.nunique() > threshold:
        return 'continuous'
    return 'categorical'


# Type every predictor column of the female frame (all columns except the
# target). These assignments reuse the names from the male section, so
# var_types, continuous_vars, categorical_vars, group0 and group1 refer to the
# female data from this cell onward.
predictor_cols = [c for c in female_df.columns if c != target_col]
var_types = {c: classify_variable(female_df[c]) for c in predictor_cols}
continuous_vars = [name for name, kind in var_types.items() if kind == 'continuous']
categorical_vars = [name for name, kind in var_types.items() if kind == 'categorical']

# Split the female frame by outcome label: rows with target 0 and rows with target 1.
target_values = female_df[target_col]
group0 = female_df[target_values == 0]
group1 = female_df[target_values == 1]

In [34]:
# 1. Continuous variables -> two-sample t-test between group0 and group1.
# NOTE(review): ttest_ind is called without equal_var, i.e. with SciPy's
# default (Student's t-test); the plan comments in this notebook call for
# Welch's test when variances differ — confirm which one is intended.
ttest_results = []
for var in continuous_vars:
    try:
        mean0, std0 = group0[var].mean(), group0[var].std()
        mean1, std1 = group1[var].mean(), group1[var].std()
        # nan_policy='omit' makes the test ignore missing values.
        _, p_val = ttest_ind(group0[var], group1[var], nan_policy='omit')
        ttest_results.append({
            '변수': var,
            '비당뇨 (평균±SD)': f'{mean0:.2f} ± {std0:.2f}',
            '당뇨 (평균±SD)': f'{mean1:.2f} ± {std1:.2f}',
            # Compare with `<` so a NaN p-value is printed as 'nan' instead of
            # being mislabelled '<0.0001' (any comparison with NaN is False).
            'p-value': '<0.0001' if p_val < 0.0001 else f'{p_val:.4f}'
        })
    except Exception as exc:
        # Still best-effort, but report what was dropped instead of silently
        # losing the variable; KeyboardInterrupt is no longer trapped.
        print(f'[skipped] {var}: {type(exc).__name__}: {exc}')
        continue

# One row per successfully tested variable.
ttest_df = pd.DataFrame(ttest_results)

# 2. Categorical variables -> chi-square test of independence against the target.
chi2_results = []
for var in categorical_vars:
    try:
        # Contingency table: levels of the variable (rows) x target values (columns).
        table = pd.crosstab(female_df[var], female_df[target_col])
        _, p_val, _, _ = chi2_contingency(table)
        chi2_results.append({
            '변수': var,
            # Compare with `<` so a NaN p-value is printed as 'nan' instead of
            # being mislabelled '<0.0001' (any comparison with NaN is False).
            'p-value': '<0.0001' if p_val < 0.0001 else f'{p_val:.4f}'
        })
    except Exception as exc:
        # Still best-effort, but report what was dropped instead of silently
        # losing the variable; KeyboardInterrupt is no longer trapped.
        print(f'[skipped] {var}: {type(exc).__name__}: {exc}')
        continue

# One row per successfully tested variable.
chi2_df = pd.DataFrame(chi2_results)

# Print each results table under its heading (heading and table on separate lines).
print("연속형 변수 t-test 결과:", ttest_df, sep="\n")

print("\n범주형 변수 카이제곱 결과:", chi2_df, sep="\n")


연속형 변수 t-test 결과:
             변수          비당뇨 (평균±SD)           당뇨 (평균±SD)  p-value
0       DS1_AGE         51.97 ± 7.88         55.84 ± 7.75  <0.0001
1    DS1_WALKFQ  47291.38 ± 37968.63  50414.99 ± 37150.63   0.0009
2     DS1_WALKT  47311.73 ± 37943.28  50433.47 ± 37125.55   0.0009
3     DS1_WAIST         78.11 ± 8.09         84.29 ± 8.48  <0.0001
4       DS1_HIP         93.55 ± 5.63         95.74 ± 6.66  <0.0001
5     DS1_PULSE         69.24 ± 9.19         72.20 ± 9.91  <0.0001
6       DS1_SBP       120.29 ± 15.17       128.65 ± 16.33  <0.0001
7       DS1_DBP         74.53 ± 9.73         78.55 ± 9.81  <0.0001
8       DS1_BMI         23.54 ± 2.90         25.75 ± 3.41  <0.0001
9       DS1_PBF         30.50 ± 4.36         33.55 ± 4.09  <0.0001
10  DS1_BODYFAT         17.82 ± 4.51         21.17 ± 5.23  <0.0001
11   DS1_MUSCLE         36.54 ± 3.48         37.58 ± 4.06  <0.0001
12   DS1_VISFAT          2.09 ± 0.88          2.72 ± 1.04  <0.0001
13      DS1_WHR          0.83 ± 0.06        

In [31]:
ks_results2 = []

# Normality check on the whole female frame (both outcome groups pooled).
# NOTE(review): the normal's mean/std are estimated from the same sample that
# is being tested; the plain K-S p-value assumes a fully specified reference
# distribution, so a Lilliefors-type correction may be more appropriate — confirm.
for var in continuous_vars:
    try:
        data = female_df[var].dropna()
        mean = data.mean()
        std = data.std()
        # K-S test against a normal distribution with the sample's own mean and std.
        stat, p_val = kstest(data, 'norm', args=(mean, std))
        ks_results2.append({
            '변수': var,
            'Kolmogorov-Smirnov p-value': f'{p_val:.4f}',
            # Normality is reported as satisfied only when p > 0.05.
            '정규성 판단': '만족' if p_val > 0.05 else '불만족'
        })
    except Exception as exc:
        # Still best-effort, but report what was dropped instead of silently
        # losing the variable; KeyboardInterrupt is no longer trapped.
        print(f'[skipped] {var}: {type(exc).__name__}: {exc}')
        continue

ks_df2 = pd.DataFrame(ks_results2)

print("\nKolmogorov–Smirnov 정규성 검정 결과:")
print(ks_df2)


Kolmogorov–Smirnov 정규성 검정 결과:
             변수 Kolmogorov-Smirnov p-value 정규성 판단
0       DS1_AGE                     0.0000    불만족
1    DS1_WALKFQ                     0.0000    불만족
2     DS1_WALKT                     0.0000    불만족
3     DS1_WAIST                     0.0000    불만족
4       DS1_HIP                     0.0000    불만족
5     DS1_PULSE                     0.0000    불만족
6       DS1_SBP                     0.0000    불만족
7       DS1_DBP                     0.0000    불만족
8       DS1_BMI                     0.0000    불만족
9       DS1_PBF                     0.0000    불만족
10  DS1_BODYFAT                     0.0000    불만족
11   DS1_MUSCLE                     0.0000    불만족
12   DS1_VISFAT                     0.0000    불만족
13      DS1_WHR                     0.0000    불만족
14     DS1_GLU0                     0.0000    불만족
15    DS1_HBA1C                     0.0000    불만족
16   total_exer                     0.0000    불만족


In [27]:
# ##### 전부 불만족 
# #### Mann–Whitney U test 적용

In [32]:
mw_results2 = []

# Non-parametric comparison of group0 vs group1 for every continuous variable.
# NOTE(review): the recorded output shows a median of 77777 for DS1_WALKFQ and
# DS1_WALKT in both groups, which looks like a placeholder code rather than a
# measurement — confirm against the data dictionary before interpreting them.
for var in continuous_vars:
    try:
        # Missing values are removed per group before testing.
        data0 = group0[var].dropna()
        data1 = group1[var].dropna()
        _, p_val = mannwhitneyu(data0, data1, alternative='two-sided')
        mw_results2.append({
            '변수': var,
            '비당뇨 (중앙값)': round(data0.median(), 2),
            '당뇨 (중앙값)': round(data1.median(), 2),
            # Compare with `<` so a NaN p-value is printed as 'nan' instead of
            # being mislabelled '<0.0001' (any comparison with NaN is False).
            'p-value': '<0.0001' if p_val < 0.0001 else f'{p_val:.4f}'
        })
    except Exception as exc:
        # Still best-effort, but report what was dropped instead of silently
        # losing the variable; KeyboardInterrupt is no longer trapped.
        print(f'[skipped] {var}: {type(exc).__name__}: {exc}')
        continue

mw_df2 = pd.DataFrame(mw_results2)

print("\nMann–Whitney U test (비모수 검정) 결과:")
print(mw_df2)


Mann–Whitney U test (비모수 검정) 결과:
             변수  비당뇨 (중앙값)  당뇨 (중앙값)  p-value
0       DS1_AGE      51.00     56.00  <0.0001
1    DS1_WALKFQ   77777.00  77777.00  <0.0001
2     DS1_WALKT   77777.00  77777.00   0.0006
3     DS1_WAIST      78.00     84.00  <0.0001
4       DS1_HIP      93.00     95.00  <0.0001
5     DS1_PULSE      68.00     71.00  <0.0001
6       DS1_SBP     120.00    128.00  <0.0001
7       DS1_DBP      74.00     80.00  <0.0001
8       DS1_BMI      23.26     25.31  <0.0001
9       DS1_PBF      30.60     33.60  <0.0001
10  DS1_BODYFAT      17.40     20.60  <0.0001
11   DS1_MUSCLE      36.30     37.30  <0.0001
12   DS1_VISFAT       2.00      2.60  <0.0001
13      DS1_WHR       0.83      0.88  <0.0001
14     DS1_GLU0      89.00    129.00  <0.0001
15    DS1_HBA1C       5.50      6.70  <0.0001
16   total_exer       0.00      0.00   0.0150


In [33]:
#### 전부 유의미함

## 다중공선성

In [36]:
# 여자 다중공선성
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Show floats with two decimals in every DataFrame printed from here on
# (this is a global pandas display option).
pd.set_option('display.float_format', '{:.2f}'.format)

# Continuous predictors screened for multicollinearity. 'target' is the
# dependent variable and is left out; DS1_HBA1C is not in this list either.
# NOTE(review): the original comment said these variables satisfied equal
# variance, but no equal-variance test is visible in this notebook — confirm.
X = female_df[['DS1_AGE', 'DS1_WALKFQ', 'DS1_WALKT',
               'DS1_WAIST', 'DS1_HIP', 'DS1_PULSE', 'DS1_SBP',
               'DS1_DBP', 'DS1_BMI', 'DS1_PBF', 'DS1_BODYFAT',
               'DS1_MUSCLE', 'DS1_VISFAT', 'DS1_WHR',
               'DS1_GLU0', 'total_exer']]

# A stray unmatched ')' that used to follow the selection above was removed;
# it made the whole cell a SyntaxError.
# Add the intercept column, so the design matrix has a 'const' entry as well.
X_const = add_constant(X)

# One VIF per column of the design matrix, 'const' included.
vif_data = pd.DataFrame()
vif_data['Variable'] = X_const.columns
vif_data['VIF'] = [variance_inflation_factor(X_const.values, i) for i in range(X_const.shape[1])]

# Print the VIF table.
print(vif_data)

       Variable        VIF
0         const   25700.86
1       DS1_AGE       1.42
2    DS1_WALKFQ 3607118.53
3     DS1_WALKT 3607064.54
4     DS1_WAIST     282.81
5       DS1_HIP     101.41
6     DS1_PULSE       1.05
7       DS1_SBP       2.83
8       DS1_DBP       2.66
9       DS1_BMI       8.53
10      DS1_PBF      23.09
11  DS1_BODYFAT      41.42
12   DS1_MUSCLE       6.59
13   DS1_VISFAT       5.23
14      DS1_WHR     145.58
15     DS1_GLU0       1.08
16   total_exer       1.05
