In [None]:
# Mount Google Drive at '/content/drive'; the dataset path read later in this
# notebook is located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Location of the diabetes dataset CSV on the mounted Google Drive.
file_path = '/content/drive/MyDrive/teamproject/Diabetes-Classification-Project/diabetes_dataset.csv'
data = pd.read_csv(file_path)

# Print the (rows, columns) shape -- noted as (100000, 31) by the original author --
# followed by the first 10 rows as a quick look at the data.
print(data.shape, data.head(10), sep='\n')

In [None]:
### Split the dataset into train_data, train_labels, test_data, test_labels

# Columns that are not used as input features: the target column itself and
# 'diagnosed_diabetes'.
not_use = ['diabetes_stage', 'diagnosed_diabetes']

# Separate the label ('diabetes_stage', multi-class classification target).
label = data['diabetes_stage']
features = data.drop(columns=not_use)

# Train/test split: 80% train, 20% test, stratified on the label, fixed seed.
train_data, test_data, train_labels, test_labels = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=0,
    stratify=label,
)

# Report the shape of each of the four resulting pieces.
print(f"train_data: {train_data.shape}")
print(f"train_labels: {train_labels.shape}")
print(f"test_data: {test_data.shape}")
print(f"test_labels: {test_labels.shape}")

In [None]:
### Check for missing values -> the original author notes the diabetes dataset has none
# Feature frames: count every missing cell across all columns.
# Label series: count the missing entries directly.
print(f"train_data 결측치 개수: {train_data.isna().to_numpy().sum()}")
print(f"train_labels 결측치 개수: {train_labels.isna().sum()}")
print(f"test_data 결측치 개수: {test_data.isna().to_numpy().sum()}")
print(f"test_labels 결측치 개수: {test_labels.isna().sum()}")

In [None]:
### Feature preprocessing (categorical encoding + numerical standardization)

# Unordered categorical features -> one-hot encoding
## gender, ethnicity, employment_status, smoking_status, family_history_diabetes, hypertension_history, cardiovascular_history
orderless_categorical_feature = ['gender', 'ethnicity', 'employment_status', 'smoking_status', 'family_history_diabetes', 'hypertension_history', 'cardiovascular_history']

train_olcf_encode = pd.get_dummies(train_data[orderless_categorical_feature])
# The test split is encoded independently, so a category that is absent from one
# split would give the two frames different columns. Force the test frame onto the
# training columns: columns missing from test are filled with 0, and columns that
# appear only in test are dropped.
test_olcf_encode = pd.get_dummies(test_data[orderless_categorical_feature]).reindex(
    columns=train_olcf_encode.columns, fill_value=0
)

# Ordered categorical features -> label (ordinal) encoding
## education_level, income_level
# TODO: these columns are only declared here; no encoding is applied in this cell yet.
order_categorical_feature = ['education_level', 'income_level']

# Numerical features -> standardization
## age, alcohol_consumption_per_week, physical_activity_minutes_per_week, diet_score, sleep_hours_per_day, screen_time_hours_per_day, bmi, waist_to_hip_ratio
## systolic_bp, diastolic_bp, heart_rate, cholesterol_total, hdl_cholesterol, ldl_cholesterol, triglycerides, glucose_fasting, glucose_postprandial, insulin_level
## hba1c, diabetes_risk_score
numerical_feature = ['age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week', 'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi', 'waist_to_hip_ratio',
'systolic_bp', 'diastolic_bp', 'heart_rate', 'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides', 'glucose_fasting', 'glucose_postprandial', 'insulin_level',
'hba1c', 'diabetes_risk_score']

# Statistics are computed on the training split only and reused for the test split.
mean = train_data[numerical_feature].mean(axis=0)
# Fix: this previously called .mean(), so the data was divided by the mean instead
# of the standard deviation.
std = train_data[numerical_feature].std(axis=0)

# NOTE(review): train_data / test_data are replaced by the standardized numerical
# columns only. The one-hot frames above are not merged back in, and because the
# categorical columns are gone afterwards, re-running this cell requires re-running
# the train/test split cell first.
train_data = (train_data[numerical_feature] - mean) / std
test_data = (test_data[numerical_feature] - mean) / std

# Feature counts per group and their total.
print(len(order_categorical_feature))
print(len(orderless_categorical_feature))
print(len(numerical_feature))
print(len(orderless_categorical_feature) + len(order_categorical_feature) + len(numerical_feature))

In [None]:
# Numerical data: preview the first rows of the train frame, then the test frame.
print(train_data.head(), test_data.head(), sep='\n')