In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The dataset path used when loading the CSV lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler

In [None]:
# Location of the diabetes dataset CSV inside the mounted Drive folder.
file_path = '/content/drive/MyDrive/teamproject/Diabetes-Classification-Project/diabetes_dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Sanity check: (rows, columns) followed by the first ten rows.
# The original author's note records the expected shape as (100000, 31).
print(data.shape, data.head(10), sep="\n")

In [None]:
### Build train_data / train_labels / test_data / test_labels

# Target column for the multi-class task.
label = data['diabetes_stage']

# Columns excluded from the feature matrix (the target itself plus 'diagnosed_diabetes').
not_use = ['diabetes_stage', 'diagnosed_diabetes']
features = data.drop(columns=not_use)

# 80% train / 20% test, stratified on the label, with a fixed seed.
train_data, test_data, train_labels, test_labels = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=0,
    stratify=label,
)

# Report the shape of every split.
for split_name, split in (
    ("train_data", train_data),
    ("train_labels", train_labels),
    ("test_data", test_data),
    ("test_labels", test_labels),
):
    print(f"{split_name}: {split.shape}")

In [None]:
### Missing-value check (author's note: the diabetes dataset contains no missing values)
# Print the total number of null entries in each split.
for split_name, split in (
    ("train_data", train_data),
    ("train_labels", train_labels),
    ("test_data", test_data),
    ("test_labels", test_labels),
):
    print(f"{split_name} 결측치 개수: {split.isnull().sum().sum()}")

In [None]:
### Feature normalization

# Unordered (nominal) categorical features -> one-hot encoding
orderless_categorical_feature = [
    'gender',
    'ethnicity',
    'employment_status',
    'smoking_status',
    'family_history_diabetes',
    'hypertension_history',
    'cardiovascular_history',
]

# Encode each split separately, then cast to int so the indicator columns
# hold 0/1 instead of booleans.
train_olcf_encode = pd.get_dummies(train_data[orderless_categorical_feature]).astype(int)
test_olcf_encode = pd.get_dummies(test_data[orderless_categorical_feature]).astype(int)



# Ordered categorical features -> integer codes via OrdinalEncoder, then 0~1 scaling
order_categorical_feature = ['education_level', 'income_level']

# Pull only the ordinal columns from the original feature frame, using each split's index.
train_ocf_data = features.loc[train_data.index, order_categorical_feature].copy()
test_ocf_data = features.loc[test_data.index, order_categorical_feature].copy()

# Category order, lowest to highest, for each ordinal column.
edu_order = ['No formal', 'Highschool', 'Graduate', 'Postgraduate']
inc_order = ['Low', 'Lower-Middle', 'Middle', 'Upper-Middle', 'High']

# Values outside the lists above are encoded as -1 instead of raising.
ord_enc = OrdinalEncoder(
    categories=[edu_order, inc_order],
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

# Fit on the training split only; the test split reuses the fitted encoder.
train_ocf_encoded = pd.DataFrame(
    ord_enc.fit_transform(train_ocf_data),
    index=train_ocf_data.index,
    columns=order_categorical_feature,
)
test_ocf_encoded = pd.DataFrame(
    ord_enc.transform(test_ocf_data),
    index=test_ocf_data.index,
    columns=order_categorical_feature,
)

# Min-max scale the integer codes; the scaler is likewise fitted on train only.
scaled_columns = [f"{c}_scaled" for c in order_categorical_feature]
scaler = MinMaxScaler()
train_ocf_scaled = pd.DataFrame(
    scaler.fit_transform(train_ocf_encoded),
    index=train_ocf_encoded.index,
    columns=scaled_columns,
)
test_ocf_scaled = pd.DataFrame(
    scaler.transform(test_ocf_encoded),
    index=test_ocf_encoded.index,
    columns=scaled_columns,
)

# Align the test one-hot columns with the train layout; columns missing from test are added as 0.
test_olcf_encode = test_olcf_encode.reindex(columns=train_olcf_encode.columns, fill_value=0)

# Combine the pieces column-wise after discarding each frame's index.
# NOTE(review): at this point train_data / test_data still hold the split's original
# columns, and both *_normalized frames are reassigned in a later cell.
train_parts = [train_data, train_ocf_scaled, train_olcf_encode]
test_parts = [test_data, test_ocf_scaled, test_olcf_encode]

train_data_normalized = pd.concat(
    [part.reset_index(drop=True) for part in train_parts],
    axis="columns",
)
test_data_normalized = pd.concat(
    [part.reset_index(drop=True) for part in test_parts],
    axis="columns",
)


# Numerical features -> z-score standardization: (x - mean) / std
## age, alcohol_consumption_per_week, physical_activity_minutes_per_week, diet_score, sleep_hours_per_day, screen_time_hours_per_day, bmi, waist_to_hip_ratio
## systolic_bp, diastolic_bp, heart_rate, cholesterol_total, hdl_cholesterol, ldl_cholesterol, triglycerides, glucose_fasting, glucose_postprandial, insulin_level
## hba1c, diabetes_risk_score
numerical_feature = ['age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week', 'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi', 'waist_to_hip_ratio',
'systolic_bp', 'diastolic_bp', 'heart_rate', 'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides', 'glucose_fasting', 'glucose_postprandial', 'insulin_level',
'hba1c', 'diabetes_risk_score']

# Both statistics are computed on the training split only and reused for the test split.
mean = train_data[numerical_feature].mean(axis=0)
# Fix: this previously called .mean(), which divided by the mean rather than the
# standard deviation and therefore did not standardize the features.
std = train_data[numerical_feature].std(axis=0)

# NOTE: train_data / test_data are rebound to the standardized numerical columns only,
# and later cells read them under these names. The original columns are no longer
# reachable through these names afterwards, so re-run the split cell before re-running this one.
train_data = (train_data[numerical_feature] - mean) / std
test_data = (test_data[numerical_feature] - mean) / std

# Feature counts per group (ordinal, nominal, numerical) and their total.
print(len(order_categorical_feature))
print(len(orderless_categorical_feature))
print(len(numerical_feature))
print(len(orderless_categorical_feature) + len(order_categorical_feature) + len(numerical_feature))

In [None]:
# Numerical features: preview the first rows of the train and test splits.
for numeric_split in (train_data, test_data):
    print(numeric_split.head())



In [81]:
# Align the test dummy (one-hot) columns to the train layout.
## Columns absent from test are created and filled with 0.
test_olcf_encode = test_olcf_encode.reindex(columns=train_olcf_encode.columns, fill_value=0)

# Assemble the final matrices column-wise: standardized numerical features,
# min-max scaled ordinal features, then one-hot encoded nominal features.
# Fix: the scaled ordinal columns (education_level, income_level) were previously
# left out here, so those two features never reached the final dataset.
# Indexes are reset so the pieces line up by row position.
train_data_normalized = pd.concat([train_data.reset_index(drop=True),
                                   train_ocf_scaled.reset_index(drop=True),
                                   train_olcf_encode.reset_index(drop=True)], axis=1)
test_data_normalized  = pd.concat([test_data.reset_index(drop=True),
                                   test_ocf_scaled.reset_index(drop=True),
                                   test_olcf_encode.reset_index(drop=True)], axis=1)


In [None]:

# Preview the combined train/test frames and the one-hot encoded nominal features.
for heading, frame in (
    ("*정규화된 train 데이터 샘플", train_data_normalized),
    ("\n*정규화된 test 데이터 샘플 ", test_data_normalized),
    ("\n*무순서 범주형(원-핫 인코딩)", train_olcf_encode),
):
    print(heading)
    print(frame.head())

In [None]:
# Preview the scaled ordinal features for the train split, then the test split.
for ordinal_frame in (train_ocf_scaled, test_ocf_scaled):
    print("\n*순서형 범주형 데이터")
    print(ordinal_frame.head())

# Final shapes of the combined feature matrices.
print("\ntrain_data_normalized shape:", train_data_normalized.shape)
print("test_data_normalized  shape:", test_data_normalized.shape)