In [1]:
import pandas as pd
from ydata_profiling import ProfileReport
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Qt5Agg')
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False
import seaborn as sns

from sklearn.impute import KNNImputer

In [2]:
file_path = ['hn19_all_raw.csv', 'hn20_all_raw.csv', 'hn21_all_raw.csv']
df_19 = pd.read_csv("../data/" + file_path[0], encoding='UTF-8', low_memory=False)
df_20 = pd.read_csv("../data/" + file_path[1], encoding='UTF-8', low_memory=False)
df_21 = pd.read_csv("../data/" + file_path[2], encoding='UTF-8', low_memory=False)

df = pd.concat([df_19, df_20, df_21])

In [3]:
def see(a):
    i = df[a].isnull().sum()
    k = df[a].value_counts()
    print("nullity: ", end='')
    print(i)
    print("\n---")
    df[a].info()
    print("\n---")
    print(k)

In [4]:
col = [
    'sex',
    'age',
    'DI2_dg',
    'DI2_2',
    'DI3_dg',
    'DI5_dg',
    'DE1_dg',
    'DE1_31',
    'DE1_32',
    'HE_glu',
    'HE_HbA1c',
    'BD1_11',
    'BD2_1',
    'sm_presnt',
    'BE3_31',
    'BE3_32',
    'BE3_33',
    'HE_DMfh1',
    'HE_DMfh2',
    'HE_HP',
    'HE_ht',
    'HE_wt',
    'HE_wc',
    'HE_BMI',
    'HE_obe',
    'HE_HCHOL',
    'HE_HTG',
    'N_EN',
    'N_PROT',
    'N_FAT',
    'N_CHO'
]

# DI2_dg: 이상지질혈증 의사진단 여부
# DI2_2: 이상지질혈증 약복용
# DI3_dg: 뇌졸중 의사진단 여부
# DI5_dg: 심근경색증 의사진단 여부
# DE1_dg: 당뇨병 의사진단 여부
# DE1_31: 인슐린 주사
# DE1_32: 당뇨병약 
# HE_glu: 공복혈당
# HE_HbA1c: 당화혈색소
# BD1_11: 1년간 음주빈도
# BD2_1: 한 번에 마시는 음주량
# sm_presnt: 현재흡연율
# BE3_31: 1주일간 걷기 일수
# BE3_32: 걷기 지속 시간(시간)
# BE3_33: 걷기 지속 시간(분)
# HE_DMfh1: 당뇨병 의사진단 여부(부)
# HE_DMfh2: 당뇨병 의사진단 여부(모)
# HE_HP: 고혈압 유병여부(19세이상)
# HE_ht: 신장
# HE_wt: 체중
# HE_wc: 허리둘레
# HE_BMI: 체질량지수
# HE_obe: 비만 유병여부(19세이상)
# HE_HCHOL: 고콜레스테롤혈증 유병여부(19세이상)
# HE_HTG: 고중성지방혈증 유병여부(19세이상)
# Y_BTH_WT: 출생 시 체중
# N_EN: 에너지
# N_PROT: 단백질
# N_FAT: 지방
# N_CHO: 탄수화물

In [5]:
df = df[df['age'] >= 20]
df = df[df['sex'] == 1] # 남자 데이터 분리
df = df[col]

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8270 entries, 0 to 7086
Data columns (total 31 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sex        8270 non-null   int64  
 1   age        8270 non-null   int64  
 2   DI2_dg     7869 non-null   float64
 3   DI2_2      7869 non-null   float64
 4   DI3_dg     7869 non-null   float64
 5   DI5_dg     7869 non-null   float64
 6   DE1_dg     7869 non-null   float64
 7   DE1_31     7869 non-null   float64
 8   DE1_32     7869 non-null   float64
 9   HE_glu     7676 non-null   float64
 10  HE_HbA1c   7672 non-null   float64
 11  BD1_11     7869 non-null   float64
 12  BD2_1      7869 non-null   float64
 13  sm_presnt  7783 non-null   float64
 14  BE3_31     7869 non-null   float64
 15  BE3_32     7869 non-null   float64
 16  BE3_33     7869 non-null   float64
 17  HE_DMfh1   7830 non-null   float64
 18  HE_DMfh2   7830 non-null   float64
 19  HE_HP      7775 non-null   float64
 20  HE_ht      77

In [7]:
df = df.copy()
df = df.apply(pd.to_numeric)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8270 entries, 0 to 7086
Data columns (total 31 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sex        8270 non-null   int64  
 1   age        8270 non-null   int64  
 2   DI2_dg     7869 non-null   float64
 3   DI2_2      7869 non-null   float64
 4   DI3_dg     7869 non-null   float64
 5   DI5_dg     7869 non-null   float64
 6   DE1_dg     7869 non-null   float64
 7   DE1_31     7869 non-null   float64
 8   DE1_32     7869 non-null   float64
 9   HE_glu     7676 non-null   float64
 10  HE_HbA1c   7672 non-null   float64
 11  BD1_11     7869 non-null   float64
 12  BD2_1      7869 non-null   float64
 13  sm_presnt  7783 non-null   float64
 14  BE3_31     7869 non-null   float64
 15  BE3_32     7869 non-null   float64
 16  BE3_33     7869 non-null   float64
 17  HE_DMfh1   7830 non-null   float64
 18  HE_DMfh2   7830 non-null   float64
 19  HE_HP      7775 non-null   float64
 20  HE_ht      77

In [8]:
df.head()

Unnamed: 0,sex,age,DI2_dg,DI2_2,DI3_dg,DI5_dg,DE1_dg,DE1_31,DE1_32,HE_glu,...,HE_wt,HE_wc,HE_BMI,HE_obe,HE_HCHOL,HE_HTG,N_EN,N_PROT,N_FAT,N_CHO
0,1,61,0.0,8.0,0.0,0.0,0.0,8.0,8.0,136.0,...,81.6,98.5,25.987394,4.0,0.0,0.0,,,,
1,1,28,0.0,8.0,0.0,0.0,0.0,8.0,8.0,81.0,...,51.7,63.3,16.900942,1.0,0.0,0.0,,,,
2,1,53,0.0,8.0,0.0,0.0,0.0,8.0,8.0,90.0,...,63.1,74.7,19.781829,2.0,0.0,0.0,1794.462141,73.01152,33.617174,295.234107
8,1,60,1.0,1.0,0.0,0.0,1.0,0.0,1.0,173.0,...,60.4,83.7,21.657284,2.0,1.0,0.0,912.820666,39.587765,16.231734,158.612039
9,1,80,0.0,8.0,0.0,0.0,0.0,8.0,8.0,108.0,...,74.5,100.0,26.681078,4.0,0.0,0.0,1294.804956,53.934273,24.375475,221.231167


In [9]:
target_list = ['DE1_31','DE1_32','DE1_dg', 'HE_glu', 'HE_HbA1c']
target_df = df[target_list]
target_df.head()
print(target_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 8270 entries, 0 to 7086
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   DE1_31    7869 non-null   float64
 1   DE1_32    7869 non-null   float64
 2   DE1_dg    7869 non-null   float64
 3   HE_glu    7676 non-null   float64
 4   HE_HbA1c  7672 non-null   float64
dtypes: float64(5)
memory usage: 387.7 KB
None


In [10]:
# null 값이 있는 행 삭제
df = df.dropna(subset=target_list)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7669 entries, 0 to 7086
Data columns (total 31 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sex        7669 non-null   int64  
 1   age        7669 non-null   int64  
 2   DI2_dg     7669 non-null   float64
 3   DI2_2      7669 non-null   float64
 4   DI3_dg     7669 non-null   float64
 5   DI5_dg     7669 non-null   float64
 6   DE1_dg     7669 non-null   float64
 7   DE1_31     7669 non-null   float64
 8   DE1_32     7669 non-null   float64
 9   HE_glu     7669 non-null   float64
 10  HE_HbA1c   7669 non-null   float64
 11  BD1_11     7669 non-null   float64
 12  BD2_1      7669 non-null   float64
 13  sm_presnt  7594 non-null   float64
 14  BE3_31     7669 non-null   float64
 15  BE3_32     7669 non-null   float64
 16  BE3_33     7669 non-null   float64
 17  HE_DMfh1   7636 non-null   float64
 18  HE_DMfh2   7636 non-null   float64
 19  HE_HP      7590 non-null   float64
 20  HE_ht      75

In [11]:
target_df = target_df.dropna().reset_index(drop=True)
target_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7669 entries, 0 to 7668
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   DE1_31    7669 non-null   float64
 1   DE1_32    7669 non-null   float64
 2   DE1_dg    7669 non-null   float64
 3   HE_glu    7669 non-null   float64
 4   HE_HbA1c  7669 non-null   float64
dtypes: float64(5)
memory usage: 299.7 KB


In [12]:
da_drug = []
for i, j in zip(df.DE1_31, df.DE1_32):
    if i == 1 or j == 1:
        da_drug.append(1)
    else:
        da_drug.append(0)
da_drug = np.array(da_drug)

target_df2 = target_df.copy()
target_df2['da_drug'] = da_drug
print(target_df2.head())
print(target_df2.info())

   DE1_31  DE1_32  DE1_dg  HE_glu  HE_HbA1c  da_drug
0     8.0     8.0     0.0   136.0       6.4        0
1     8.0     8.0     0.0    81.0       5.0        0
2     8.0     8.0     0.0    90.0       5.4        0
3     0.0     1.0     1.0   173.0       8.3        1
4     8.0     8.0     0.0   108.0       5.4        0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7669 entries, 0 to 7668
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   DE1_31    7669 non-null   float64
 1   DE1_32    7669 non-null   float64
 2   DE1_dg    7669 non-null   float64
 3   HE_glu    7669 non-null   float64
 4   HE_HbA1c  7669 non-null   float64
 5   da_drug   7669 non-null   int32  
dtypes: float64(5), int32(1)
memory usage: 329.7 KB
None


In [13]:
target_df2['da_drug'].value_counts()

da_drug
0    6793
1     876
Name: count, dtype: int64

In [14]:
temp = ['sex','age']
feature_df = df[temp]
feature_df.columns = ['sex','age']
feature_df.sample(5,random_state=42)

Unnamed: 0,sex,age
740,1,64
4944,1,53
2183,1,52
1244,1,46
5244,1,62


In [15]:
#list 생성
diabetes = []
for i in range(len(target_df2)):
    if (target_df2.loc[i,'DE1_31']!=1) & (target_df2.loc[i,'DE1_32']!=1) & (target_df2.loc[i,'DE1_dg']!=1) & (target_df2.loc[i,'HE_glu']<126) & (target_df2.loc[i,'HE_HbA1c']<6.5):
        diabetes.append(0)
    else:
        diabetes.append(1)
diabetes = np.array(diabetes)
#column 추가
target_df = target_df2.copy()
target_df['diabetes'] = diabetes
target_df.sample(5,random_state=42)

Unnamed: 0,DE1_31,DE1_32,DE1_dg,HE_glu,HE_HbA1c,da_drug,diabetes
2968,8.0,8.0,0.0,119.0,5.9,0,0
4409,8.0,8.0,0.0,111.0,6.0,0,0
3450,8.0,8.0,0.0,100.0,5.7,0,0
3139,8.0,8.0,0.0,91.0,5.5,0,0
1729,8.0,8.0,0.0,98.0,5.5,0,0


In [16]:
df = df.drop(columns=target_list,axis=1)
df.shape

(7669, 26)

In [17]:
# 폭음 여부
heavy_drink = []
for index, row in df.iterrows():
    if (row['sex'] == 1) & (row['BD1_11'] in [3, 4, 5, 6]) & (row['BD2_1'] in [4, 5]):
        heavy_drink.append(1)
    elif (row['sex'] == 2) & (row['BD1_11'] in [3, 4, 5, 6]) & (row['BD2_1'] in [3, 4, 5]):
        heavy_drink.append(1)
    else:
        heavy_drink.append(0)
heavy_drink = np.array(heavy_drink)
feature_df['heavy_drink'] = heavy_drink

In [18]:
# 현재흡연여부
df['sm_presnt'].replace({np.nan: 0}, inplace=True)
feature_df['smoke'] = df['sm_presnt'].astype(int)
feature_df.sample(5,random_state=42)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sm_presnt'].replace({np.nan: 0}, inplace=True)


Unnamed: 0,sex,age,heavy_drink,smoke
740,1,64,1,0
4944,1,53,0,0
2183,1,52,1,1
1244,1,46,0,0
5244,1,62,0,0


In [19]:
genetic = []
for index, row in df.iterrows():
    if (row['HE_DMfh1']==1) & (row['HE_DMfh2']==1):
        genetic.append(2)
    elif (row['HE_DMfh1']==1) & (row['HE_DMfh2']!=1):
        genetic.append(1)
    elif (row['HE_DMfh1']!=1) & (row['HE_DMfh2']==1):
        genetic.append(1)
    else:
        genetic.append(0)
genetic = np.array(genetic)
#column 추가
feature_df['genetic_diabetes'] = genetic
feature_df.sample(5,random_state=42)

Unnamed: 0,sex,age,heavy_drink,smoke,genetic_diabetes
740,1,64,1,0,0
4944,1,53,0,0,0
2183,1,52,1,1,0
1244,1,46,0,0,1
5244,1,62,0,0,0


In [20]:
df['abdominal_obesity'] = ((df['sex'] == 1) & (df['HE_wc'] >= 90)) | ((df['sex'] == 2) & (df['HE_wc'] >= 85))
feature_df['abdominal_obesity'] = df['abdominal_obesity'].astype(int)
feature_df.sample(5,random_state=42)

Unnamed: 0,sex,age,heavy_drink,smoke,genetic_diabetes,abdominal_obesity
740,1,64,1,0,0,1
4944,1,53,0,0,0,1
2183,1,52,1,1,0,1
1244,1,46,0,0,1,0
5244,1,62,0,0,0,0


In [21]:
feature_df['Hypercholesterolemia'] = df['HE_HCHOL'].astype('Int64')
feature_df['hypertriglyceridemia'] = df['HE_HTG'].astype('Int64')
feature_df['high_bloodpressure'] = df['HE_HP'].astype('Int64')
feature_df.replace(pd.NA, np.nan, inplace=True)
# 결과 확인
feature_df.sample(5, random_state=42)

Unnamed: 0,sex,age,heavy_drink,smoke,genetic_diabetes,abdominal_obesity,Hypercholesterolemia,hypertriglyceridemia,high_bloodpressure
740,1,64,1,0,0,1,0,,3
4944,1,53,0,0,0,1,0,0.0,2
2183,1,52,1,1,0,1,1,0.0,3
1244,1,46,0,0,1,0,1,1.0,2
5244,1,62,0,0,0,0,0,0.0,1


In [22]:
# 뇌졸중
feature_df['stroke'] = df['DI3_dg'].replace({9: np.nan}).astype("Int64")
# 심근경색증
feature_df['myocardial_infarction'] = df['DI5_dg'].replace({9: np.nan}).astype('Int64')
feature_df.sample(5, random_state=42)

Unnamed: 0,sex,age,heavy_drink,smoke,genetic_diabetes,abdominal_obesity,Hypercholesterolemia,hypertriglyceridemia,high_bloodpressure,stroke,myocardial_infarction
740,1,64,1,0,0,1,0,,3,0,0
4944,1,53,0,0,0,1,0,0.0,2,0,0
2183,1,52,1,1,0,1,1,0.0,3,0,0
1244,1,46,0,0,1,0,1,1.0,2,0,0
5244,1,62,0,0,0,0,0,0.0,1,0,0


In [23]:
# 체중, BMI
exam_list0 = ['HE_wt','HE_BMI']
exam_list1 = ['weight','BMI']

feature_df[exam_list1] = df[exam_list0].astype(float)
feature_df.sample(5,random_state=42)

Unnamed: 0,sex,age,heavy_drink,smoke,genetic_diabetes,abdominal_obesity,Hypercholesterolemia,hypertriglyceridemia,high_bloodpressure,stroke,myocardial_infarction,weight,BMI
740,1,64,1,0,0,1,0,,3,0,0,76.4,27.526016
4944,1,53,0,0,0,1,0,0.0,2,0,0,85.6,27.760337
2183,1,52,1,1,0,1,1,0.0,3,0,0,71.9,23.995774
1244,1,46,0,0,1,0,1,1.0,2,0,0,64.7,23.967854
5244,1,62,0,0,0,0,0,0.0,1,0,0,54.6,19.094374


In [24]:
#비만도
feature_df['Obesity'] = df['HE_obe'].astype('Int64')
feature_df.sample(5,random_state=42)

Unnamed: 0,sex,age,heavy_drink,smoke,genetic_diabetes,abdominal_obesity,Hypercholesterolemia,hypertriglyceridemia,high_bloodpressure,stroke,myocardial_infarction,weight,BMI,Obesity
740,1,64,1,0,0,1,0,,3,0,0,76.4,27.526016,4
4944,1,53,0,0,0,1,0,0.0,2,0,0,85.6,27.760337,4
2183,1,52,1,1,0,1,1,0.0,3,0,0,71.9,23.995774,3
1244,1,46,0,0,1,0,1,1.0,2,0,0,64.7,23.967854,3
5244,1,62,0,0,0,0,0,0.0,1,0,0,54.6,19.094374,2


In [25]:
see('HE_obe')

nullity: 70

---
<class 'pandas.core.series.Series'>
Index: 7669 entries, 0 to 7086
Series name: HE_obe
Non-Null Count  Dtype  
--------------  -----  
7599 non-null   float64
dtypes: float64(1)
memory usage: 119.8 KB

---
HE_obe
4.0    2763
2.0    2150
3.0    1987
5.0     426
1.0     195
6.0      78
Name: count, dtype: int64


In [26]:
feature_df['stroke'].value_counts()

stroke
0    7003
1     203
Name: count, dtype: Int64

In [27]:
def walking_practice(row):
    # '88', '99'를 가지고 있는 경우는 null로 처리
    if row['BE3_31'] in [88, 99] or row['BE3_32'] in [88, 99] or row['BE3_33'] in [88, 99]:
        return np.nan
    # BE3_31(걷기 일수)가 5 이상이고, BE3_32(걷기 지속 시간(시간))이 1 이상이거나 BE3_33(걷기 지속 시간(분))이 30 이상인 경우 걷기 실천으로 판단
    elif row['BE3_31'] >= 5 and (row['BE3_32'] >= 1 or row['BE3_33'] >= 30):
        return 1
    else:
        return 0

feature_df['walking_practice'] = df.apply(walking_practice, axis=1).astype('Int64')
feature_df.sample(5,random_state=42)

Unnamed: 0,sex,age,heavy_drink,smoke,genetic_diabetes,abdominal_obesity,Hypercholesterolemia,hypertriglyceridemia,high_bloodpressure,stroke,myocardial_infarction,weight,BMI,Obesity,walking_practice
740,1,64,1,0,0,1,0,,3,0,0,76.4,27.526016,4,1.0
4944,1,53,0,0,0,1,0,0.0,2,0,0,85.6,27.760337,4,
2183,1,52,1,1,0,1,1,0.0,3,0,0,71.9,23.995774,3,0.0
1244,1,46,0,0,1,0,1,1.0,2,0,0,64.7,23.967854,3,1.0
5244,1,62,0,0,0,0,0,0.0,1,0,0,54.6,19.094374,2,1.0


In [28]:
float_data = [
    'age',
    'weight',
    'BMI',
    'N_EN',
    'N_PROT', 
    'N_FAT', 
    'N_CHO'
]

In [29]:
feature_df[['N_EN', 'N_PROT', 'N_FAT', 'N_CHO']] = df[['N_EN', 'N_PROT', 'N_FAT', 'N_CHO']]

In [30]:
feature_df['diabetes'] = target_df['diabetes']
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7669 entries, 0 to 7086
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   sex                    7669 non-null   int64  
 1   age                    7669 non-null   int64  
 2   heavy_drink            7669 non-null   int32  
 3   smoke                  7669 non-null   int32  
 4   genetic_diabetes       7669 non-null   int32  
 5   abdominal_obesity      7669 non-null   int32  
 6   Hypercholesterolemia   7419 non-null   Int64  
 7   hypertriglyceridemia   6297 non-null   Int64  
 8   high_bloodpressure     7590 non-null   Int64  
 9   stroke                 7206 non-null   Int64  
 10  myocardial_infarction  7202 non-null   Int64  
 11  weight                 7643 non-null   float64
 12  BMI                    7599 non-null   float64
 13  Obesity                7599 non-null   Int64  
 14  walking_practice       5867 non-null   Int64  
 15  N_EN     

In [31]:
y = feature_df.diabetes
X = feature_df.drop('diabetes', axis=1)

# Replace empty strings with NaN
feature_df.replace('', float('nan'), inplace=True)

# Initialize and fit KNNImputer
knn = KNNImputer(n_neighbors=5, weights="uniform")
k_df = pd.DataFrame(knn.fit_transform(feature_df), columns=feature_df.columns)

# 각 열의 데이터 타입을 변환
for col in k_df.columns:
    if col in float_data:
        k_df[col] = k_df[col].astype(float)
    else:
        k_df[col] = k_df[col].astype(int)
        k_df[col] = k_df[col].astype('category')

In [32]:
feature_df = k_df.copy()

In [33]:
feature_df.isnull().sum()

sex                      0
age                      0
heavy_drink              0
smoke                    0
genetic_diabetes         0
abdominal_obesity        0
Hypercholesterolemia     0
hypertriglyceridemia     0
high_bloodpressure       0
stroke                   0
myocardial_infarction    0
weight                   0
BMI                      0
Obesity                  0
walking_practice         0
N_EN                     0
N_PROT                   0
N_FAT                    0
N_CHO                    0
diabetes                 0
dtype: int64

In [34]:
feature_df.head()

Unnamed: 0,sex,age,heavy_drink,smoke,genetic_diabetes,abdominal_obesity,Hypercholesterolemia,hypertriglyceridemia,high_bloodpressure,stroke,myocardial_infarction,weight,BMI,Obesity,walking_practice,N_EN,N_PROT,N_FAT,N_CHO,diabetes
0,1,61.0,1,0,0,1,0,0,3,0,0,81.6,25.987394,4,0,2218.006445,74.957913,38.122252,307.407566,1
1,1,28.0,0,0,0,0,0,0,1,0,0,51.7,16.900942,1,0,2371.351477,110.223887,66.416694,297.000961,0
2,1,53.0,1,1,0,0,0,0,2,0,0,63.1,19.781829,2,0,1794.462141,73.01152,33.617174,295.234107,0
3,1,60.0,0,1,1,0,1,0,1,0,0,60.4,21.657284,2,0,912.820666,39.587765,16.231734,158.612039,0
4,1,80.0,0,0,0,1,0,0,3,0,0,74.5,26.681078,4,0,1294.804956,53.934273,24.375475,221.231167,0


In [35]:
# 에너지 섭취 분율 계산 함수
def calculate_energy_ratio(row):
    # 에너지 섭취량 계산
    energy_intake = (row['N_PROT'] * 4) + (row['N_FAT'] * 9) + (row['N_CHO'] * 4)
    
    # 단백질, 지방, 탄수화물 에너지 섭취 분율 계산
    protein_energy_ratio = (row['N_PROT'] * 4) / energy_intake if energy_intake != 0 else 0
    fat_energy_ratio = (row['N_FAT'] * 9) / energy_intake if energy_intake != 0 else 0
    carbohydrate_energy_ratio = (row['N_CHO'] * 4) / energy_intake if energy_intake != 0 else 0
    
    return protein_energy_ratio, fat_energy_ratio, carbohydrate_energy_ratio

# 에너지 섭취 분율 계산하여 새로운 열로 추가
feature_df['protein_energy_ratio'], feature_df['fat_energy_ratio'], feature_df['carbohydrate_energy_ratio'] = zip(*feature_df.apply(calculate_energy_ratio, axis=1))
feature_df['energy'] = feature_df['N_EN']
feature_df.drop(columns=['N_EN', 'N_PROT', 'N_FAT', 'N_CHO'], inplace=True)
feature_df.sample(5,random_state=42)

Unnamed: 0,sex,age,heavy_drink,smoke,genetic_diabetes,abdominal_obesity,Hypercholesterolemia,hypertriglyceridemia,high_bloodpressure,stroke,myocardial_infarction,weight,BMI,Obesity,walking_practice,diabetes,protein_energy_ratio,fat_energy_ratio,carbohydrate_energy_ratio,energy
2968,1,64.0,1,0,0,1,0,0,3,0,0,76.4,27.526016,4,1,0,0.146743,0.184146,0.669111,2030.09448
4409,1,53.0,0,0,0,1,0,0,2,0,0,85.6,27.760337,4,0,0,0.251222,0.122778,0.625999,1911.255009
3450,1,52.0,1,1,0,1,1,0,3,0,0,71.9,23.995774,3,0,1,0.146965,0.287983,0.565052,2434.829993
3139,1,46.0,0,0,1,0,1,1,2,0,0,64.7,23.967854,3,1,0,0.172411,0.476508,0.351081,2295.785023
1729,1,62.0,0,0,0,0,0,0,1,0,0,54.6,19.094374,2,1,1,0.1588,0.175871,0.665329,1755.349164


In [36]:
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7669 entries, 0 to 7668
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   sex                        7669 non-null   category
 1   age                        7669 non-null   float64 
 2   heavy_drink                7669 non-null   category
 3   smoke                      7669 non-null   category
 4   genetic_diabetes           7669 non-null   category
 5   abdominal_obesity          7669 non-null   category
 6   Hypercholesterolemia       7669 non-null   category
 7   hypertriglyceridemia       7669 non-null   category
 8   high_bloodpressure         7669 non-null   category
 9   stroke                     7669 non-null   category
 10  myocardial_infarction      7669 non-null   category
 11  weight                     7669 non-null   float64 
 12  BMI                        7669 non-null   float64 
 13  Obesity                    7669 n

In [37]:
# # 프로파일링 보고서 생성
# profile = ProfileReport(feature_df, title="Profiling Report")
# # 보고서 저장 (선택사항)
# profile.to_file("../resources/Profiling Report_small.html")

In [38]:
# diabetes를 맨 마지막열로 변경 & 남자만 있으므로  sex 열 삭제
feature_df.drop(['diabetes', 'sex'], axis=1, inplace=True)
feature_df['diabetes'] = target_df['diabetes'].astype('category')
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7669 entries, 0 to 7668
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   age                        7669 non-null   float64 
 1   heavy_drink                7669 non-null   category
 2   smoke                      7669 non-null   category
 3   genetic_diabetes           7669 non-null   category
 4   abdominal_obesity          7669 non-null   category
 5   Hypercholesterolemia       7669 non-null   category
 6   hypertriglyceridemia       7669 non-null   category
 7   high_bloodpressure         7669 non-null   category
 8   stroke                     7669 non-null   category
 9   myocardial_infarction      7669 non-null   category
 10  weight                     7669 non-null   float64 
 11  BMI                        7669 non-null   float64 
 12  Obesity                    7669 non-null   category
 13  walking_practice           7669 n

In [39]:
feature_df.to_csv("../data/data_small_man.csv", index=False)

In [49]:
corr0 = feature_df.corr(method='pearson')
corr0.iloc[0:,-1:].style.background_gradient(cmap='coolwarm',vmin=-1,vmax=1)

Unnamed: 0,diabetes
age,0.285866
heavy_drink,-0.032691
smoke,-0.001816
genetic_diabetes,0.110993
abdominal_obesity,0.165902
Hypercholesterolemia,0.191937
hypertriglyceridemia,0.061431
high_bloodpressure,0.224774
stroke,0.078795
myocardial_infarction,0.103204


In [41]:
y = feature_df.diabetes
X = feature_df.drop('diabetes', axis=1)

# 히트맵 그리기
plt.figure(figsize=(15, 15))
mask = np.triu(np.ones_like(X.corr(), dtype=bool))
sns.heatmap(X.corr(), annot=True)
plt.savefig('../resources/heatmap_small_man.png')
plt.show()

In [42]:
feature_df.drop(['weight', 'Obesity'], axis=1, inplace=True)

In [48]:
corr0 = feature_df.corr(method='pearson')
corr0.iloc[0:,-1:].style.background_gradient(cmap='coolwarm',vmin=-1,vmax=1)

Unnamed: 0,diabetes
age,0.285866
heavy_drink,-0.032691
smoke,-0.001816
genetic_diabetes,0.110993
abdominal_obesity,0.165902
Hypercholesterolemia,0.191937
hypertriglyceridemia,0.061431
high_bloodpressure,0.224774
stroke,0.078795
myocardial_infarction,0.103204


In [43]:
y = feature_df.diabetes
X = feature_df.drop('diabetes', axis=1)

# 히트맵 그리기
plt.figure(figsize=(15, 15))
mask = np.triu(np.ones_like(X.corr(), dtype=bool))
sns.heatmap(X.corr(), annot=True)
plt.savefig('../resources/heatmap_small_man_repair.png')
plt.show()

In [44]:
feature_df.to_csv("../data/data_small_man.csv", index=False)

In [45]:
# # 프로파일링 보고서 생성
# profile = ProfileReport(feature_df, title="Profiling Report man")
# # 보고서 저장 (선택사항)
# profile.to_file("../resources/Profiling Report_small_man.html")

In [46]:
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7669 entries, 0 to 7668
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   age                        7669 non-null   float64 
 1   heavy_drink                7669 non-null   category
 2   smoke                      7669 non-null   category
 3   genetic_diabetes           7669 non-null   category
 4   abdominal_obesity          7669 non-null   category
 5   Hypercholesterolemia       7669 non-null   category
 6   hypertriglyceridemia       7669 non-null   category
 7   high_bloodpressure         7669 non-null   category
 8   stroke                     7669 non-null   category
 9   myocardial_infarction      7669 non-null   category
 10  BMI                        7669 non-null   float64 
 11  walking_practice           7669 non-null   category
 12  protein_energy_ratio       7669 non-null   float64 
 13  fat_energy_ratio           7669 n

- 당뇨병: 공복혈당이 126 mg/dL 이상이거나 의사로부터 당뇨병을 진단받았거나 당뇨병약제로 치료 중이거나 당화혈색소가 6.5% 이상

- 조절 불가 위험요인
•나이
•성별
•가족력(당뇨병)

- 만성질환 위험요인
•고혈압: 수축기혈압이 140 mmHg 이상이거나 이완기혈압이 90 mmHg 이상 또는 고혈압약제를 복용
•고콜레스테롤혈증: LDL 콜레스테롤이 100 mg/dL 이상이거나 콜레스테롤강하제를 복용
•낮은 HDL 콜레스테롤: 남자 ＜ 40 mg/dL, 여자 ＜ 50 mg/dL 또는 이상지질혈증약 복용
•심혈관질환(뇌졸중, 관상동맥질환 등)

- 생활습관 위험요인
- 비만: ➀ <18.5 저체중 ➁ 18.5-22.9 정상체중 ➂ 23.0-24.9 비만전단계 ➃ 25.0-29.9 1단계 비만 ➄ 30.0-34.9 2단계 비만 ➅ 35.0 이상 3단계 비만
- 복부비만(허리둘레 기준): 남자 90 cm 이상, 여자 85 cm 이상인 경우
- 흡연:
•현재흡연: 평생 총 5갑(100개비)이상의 담배를 피웠으며 현재도 담배를 피우고 있는 경우
•과거흡연: 평생 총 5갑(100개비)이상의 담배를 피웠으나 현재는 담배를 피우지 않는 경우
•비흡연: 평생 총 5갑(100개비)이상의 담배를 피운 적이 없는 경우

- 음주:  1회 평균 음주량이 남자 7잔, 여자 5잔 이상이며, 주 2회 이상 음주하는 분
- 운동(걷기 실천률): 최근 1주일 동안 걷기를 1일 총 30분 이상, 주 5일 이상 실천

- 에너지 과잉 섭취 분율: 에너지 섭취량이 필요추정량의 125% 이상인 분율
- 단백질 에너지 섭취 분율: {(단백질 섭취량×4)+(지방 섭취량×9)+(탄수화물 섭취량×4)}에 대한(단백질 섭취량×4)의 분율
- 지방 에너지 섭취 분율: {(단백질 섭취량×4)+(지방 섭취량×9)+(탄수화물 섭취량×4)}에 대한(지방 섭취량×9)의 분율
- 탄수화물 에너지 섭취 분율: {(단백질 섭취량×4)+(지방 섭취량×9)+(탄수화물 섭취량×4)}에 대한(탄수화물 섭취량×4)의 분율