In [4]:
import sys
from pathlib import Path

# The project root is taken to be two directory levels above the current
# working directory.
PROJECT_ROOT = Path.cwd().parent.parent

# Put the project root at the front of the import search path, but only once,
# so re-running this cell does not add duplicate entries.
project_root_str = str(PROJECT_ROOT)
if project_root_str not in sys.path:
    sys.path.insert(0, project_root_str)

In [5]:
import importlib

import pandas as pd

import src.config as cfg

# Re-import the config module so edits made to it since the kernel started
# are picked up without a restart.
importlib.reload(cfg)

# Location of the feature CSV inside the processed-data directory taken from the config.
PROCESSED_DATA_DIR = cfg.PROCESSED_DATA_DIR
FEATURE_ENGINEER_FILE = PROCESSED_DATA_DIR / 'features_engineer.csv'

df = pd.read_csv(FEATURE_ENGINEER_FILE)

In [6]:
# Split the frame into the feature matrix (everything except 'Outcome')
# and the target vector ('Outcome').
X = df.drop(columns=['Outcome'])
y = df['Outcome']

In [7]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,New_Glucose_Diabetes,New_Glucose_Normal,...,New_BMI_Obese II,New_BMI_Obese III,New_BMI_Overweight,New_BMI_Underweight,New_Blood_Pressure_High Blood Pressure Stage I,New_Blood_Pressure_High Blood Pressure Stage II,New_Blood_Pressure_Hypertensive Crisis,New_Blood_Pressure_Normal,New_Insulin_Abnormal,New_Insulin_Normal
0,6,148.0,72.0,35.0,169.5,33.6,0.627,50,False,False,...,False,False,False,False,False,False,False,True,True,False
1,1,85.0,66.0,29.0,102.5,26.6,0.351,31,False,True,...,False,False,True,False,False,False,False,True,False,True
2,8,183.0,64.0,32.0,169.5,23.3,0.672,32,False,False,...,False,False,False,False,False,False,False,True,True,False
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,False,True,...,False,False,True,False,False,False,False,True,False,True
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,False,True,...,False,True,False,False,False,False,False,True,True,False


# Chia tập dữ liệu

Do tập dữ liệu bị mất cân bằng nhãn lớp nên thực hiện chia theo chiến lược **Stratified Sampling (Lấy mẫu phân tầng)**

In [8]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the rows for testing. `stratify=y` keeps the class
# proportions of the target the same in both partitions, and the fixed
# `random_state` makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Chuẩn hóa bằng RobustScaler

Tập dữ liệu chứa số lượng lớn các giá trị ngoại lai nên thực hiện chuẩn hóa RobustScaler trên các trường thuộc tính kiểu Numerical

RobustScaler chuẩn hóa dữ liệu bằng cách sử dụng tứ phân vị (quartiles) thay vì giá trị trung bình và độ lệch chuẩn, giúp nó ổn định (robust) hơn trước các giá trị ngoại lai. Cụ thể:
$$x_{\mathrm{scaled}} = \frac{x - \mathrm{median}}{Q_{3} - Q_{1}}$$

Trong đó:
* $Q_3$ là tứ phân vị thứ ba (quantile 75%)
* $Q_1$ là tứ phân vị thứ nhất (quantile 25%)
* $\mathrm{median}$ là giá trị trung vị

In [9]:
from sklearn.preprocessing import RobustScaler

In [10]:
# Indicator columns that are kept out of the numeric scaling step,
# grouped by the feature family they belong to.
categorical_columns = [
    # Glucose level
    'New_Glucose_Diabetes',
    'New_Glucose_Normal',
    'New_Glucose_Prediabetes',
    # BMI class
    'New_BMI_Healthy',
    'New_BMI_Obese I',
    'New_BMI_Obese II',
    'New_BMI_Obese III',
    'New_BMI_Overweight',
    'New_BMI_Underweight',
    # Blood pressure class
    'New_Blood_Pressure_High Blood Pressure Stage I',
    'New_Blood_Pressure_High Blood Pressure Stage II',
    'New_Blood_Pressure_Hypertensive Crisis',
    'New_Blood_Pressure_Normal',
    # Insulin class
    'New_Insulin_Abnormal',
    'New_Insulin_Normal',
]

In [13]:
# Set the indicator columns aside for both partitions so that only the
# remaining columns go through the scaler.
X_train_categorical = X_train.loc[:, categorical_columns]
X_test_categorical = X_test.loc[:, categorical_columns]

# What is left in X_train / X_test are the columns to be scaled.
# NOTE: this cell is not idempotent - a second run fails because the
# indicator columns have already been removed.
X_train = X_train.drop(columns=categorical_columns)
X_test = X_test.drop(columns=categorical_columns)

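**Chuẩn hóa trên các trường Numerical**

Scaler chỉ được `fit` trên tập train, sau đó dùng các tham số đã học để `transform` tập test nhằm tránh rò rỉ dữ liệu (data leakage).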

In [14]:
# The scaler returns plain arrays, so remember the labels of both partitions
# up front in order to rebuild DataFrames afterwards.
X_train_columns, X_train_index = X_train.columns, X_train.index
X_test_columns, X_test_index = X_test.columns, X_test.index

r_scaler = RobustScaler()

# Fit on the training rows only; the test rows are transformed with the
# statistics learned from the training rows.
X_train = pd.DataFrame(
    r_scaler.fit_transform(X_train),
    index=X_train_index,
    columns=X_train_columns,
)
X_test = pd.DataFrame(
    r_scaler.transform(X_test),
    index=X_test_index,
    columns=X_test_columns,
)

In [15]:
# Re-attach the indicator columns to the right of the scaled columns;
# rows are matched on the index labels that were preserved through scaling.
X_train, X_test = (
    pd.concat(parts, axis='columns')
    for parts in (
        (X_train, X_train_categorical),
        (X_test, X_test_categorical),
    )
)

In [16]:
# Display the training features to inspect the result of the preceding steps.
X_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,New_Glucose_Diabetes,New_Glucose_Normal,...,New_BMI_Obese II,New_BMI_Obese III,New_BMI_Overweight,New_BMI_Underweight,New_Blood_Pressure_High Blood Pressure Stage I,New_Blood_Pressure_High Blood Pressure Stage II,New_Blood_Pressure_Hypertensive Crisis,New_Blood_Pressure_Normal,New_Insulin_Abnormal,New_Insulin_Normal
353,-0.4,-0.658537,-0.625,-2.285714,-0.888060,-0.585915,0.500951,-0.294118,False,True,...,False,False,True,False,False,False,False,True,False,True
711,0.4,0.219512,0.375,-0.142857,-1.201493,-0.315493,0.143310,0.647059,False,True,...,False,False,True,False,False,False,False,True,False,True
373,-0.2,-0.292683,-0.875,1.714286,-0.126866,0.281690,-0.399493,-0.235294,False,True,...,False,False,False,False,False,False,False,True,False,True
46,-0.4,0.707317,-1.000,-0.142857,0.000000,-0.304225,0.460368,0.000000,False,False,...,False,False,True,False,False,False,False,True,False,True
682,-0.6,-0.536585,-0.500,1.571429,0.037313,1.374648,-0.041852,-0.411765,False,True,...,False,True,False,False,False,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451,-0.2,0.414634,-0.125,0.571429,1.000000,-0.394366,0.404566,-0.352941,False,True,...,False,False,True,False,False,False,False,True,True,False
113,0.2,-1.000000,-0.625,-0.142857,0.000000,0.180282,0.021560,-0.235294,False,True,...,False,False,False,False,False,False,False,True,False,True
556,-0.4,-0.487805,-0.125,1.714286,0.000000,0.642254,-0.417248,0.058824,False,True,...,True,False,False,False,False,False,False,True,False,True
667,1.4,-0.146341,-0.125,-0.142857,1.000000,-0.552113,-0.612555,0.647059,False,True,...,False,False,True,False,False,False,False,True,True,False


In [20]:
# All split artefacts live in one sub-directory of the configured data directory.
DATA_DIR = cfg.DATA_DIR
TRAIN_TEST_SPLIT_DIR = DATA_DIR / 'train_test_split'

# Feature files.
X_TRAIN_FILE, X_TEST_FILE = (
    TRAIN_TEST_SPLIT_DIR / file_name for file_name in ('X_train.csv', 'X_test.csv')
)

# Target files.
Y_TRAIN_FILE, Y_TEST_FILE = (
    TRAIN_TEST_SPLIT_DIR / file_name for file_name in ('y_train.csv', 'y_test.csv')
)

In [21]:
# pandas' to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (a no-op when it is already there).
Path(TRAIN_TEST_SPLIT_DIR).mkdir(parents=True, exist_ok=True)

# Persist the four partitions. index=False leaves the row labels out of the
# files, so the X_* and y_* files correspond to each other by row position.
X_train.to_csv(X_TRAIN_FILE, index=False)
X_test.to_csv(X_TEST_FILE, index=False)
y_train.to_csv(Y_TRAIN_FILE, index=False)
y_test.to_csv(Y_TEST_FILE, index=False)