# Feature Engineering

In [20]:
import json

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# Load the cleaned dataset produced by notebook 01, report its shape, and
# preview the first rows.
data = pd.read_csv('../data/processed/diamonds_clean.csv')
print("Kích thước dữ liệu:", data.shape)
data.head()

Kích thước dữ liệu: (46425, 10)


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,IDEAL,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,PREMIUM,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.29,PREMIUM,I,VS2,62.4,58.0,334,4.2,4.23,2.63
3,0.31,GOOD,J,SI2,63.3,58.0,335,4.34,4.35,2.75
4,0.24,VERY GOOD,J,VVS2,62.8,57.0,336,3.94,3.96,2.48


In [21]:
# Column roles used by the cells below: the prediction target, the
# categorical columns that get ordinal codes, and the numeric columns that
# get standardised.
target_col = 'price'
categorical_cols = [
    'cut',
    'color',
    'clarity',
]
numerical_cols = [
    'carat',
    'depth',
    'table',
    'x',
    'y',
    'z',
]

In [22]:
# Ordinal-encode the categorical columns: each list fixes the ordering, and a
# label's 1-based position in its list becomes its integer code.
cut_order = ['FAIR', 'GOOD', 'VERY GOOD', 'PREMIUM', 'IDEAL']
color_order = ['J', 'I', 'H', 'G', 'F', 'E', 'D']
clarity_order = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

ordinal_orders = {
    'cut': cut_order,
    'color': color_order,
    'clarity': clarity_order,
}

for col, order in ordinal_orders.items():
    codes = {label: rank for rank, label in enumerate(order, 1)}
    # Series.map silently turns every label missing from the dict into NaN
    # (e.g. when this cell is re-run on an already-encoded frame), so fail
    # loudly instead of corrupting the column. Pre-existing NaN values are
    # left alone, exactly as before.
    unexpected = set(data[col].dropna().unique()) - codes.keys()
    if unexpected:
        raise ValueError(
            f"Column {col!r} contains values outside its ordinal order: "
            f"{sorted(map(str, unexpected))}"
        )
    data[col] = data[col].map(codes)

In [23]:
# Preview the first rows of `data` to inspect the current column values.
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,5,6,2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,4,6,3,59.8,61.0,326,3.89,3.84,2.31
2,0.29,4,2,4,62.4,58.0,334,4.2,4.23,2.63
3,0.31,2,1,2,63.3,58.0,335,4.34,4.35,2.75
4,0.24,3,1,6,62.8,57.0,336,3.94,3.96,2.48


In [24]:
# Path of the cleaned input file that this notebook loads in its first cell.
# The name is kept so any cell that refers to `clean_path` still works.
clean_path = "../data/processed/diamonds_clean.csv"

# Write the ordinal-encoded frame to its own file instead of overwriting the
# input. Saving over diamonds_clean.csv replaced the string labels with
# integer codes, so a second run of the notebook would load those codes,
# fail to map them, and write all-NaN categorical columns back to disk.
encoded_path = "../data/processed/diamonds_encoded.csv"
data.to_csv(encoded_path, index=False)

In [25]:
# Separate the target column from the remaining feature columns.
y = data[target_col]
X = data.drop(columns=target_col)

In [26]:
# Standardise the numeric feature columns with StandardScaler.
# NOTE(review): the scaler is fitted on every row before the K-fold split in
# the next cell, so each validation fold contributes to the statistics used
# to scale its own training fold (data leakage). Consider fitting the scaler
# inside each fold instead — confirm how downstream notebooks consume X first.
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

In [27]:
# Five shuffled folds with a fixed random_state.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Materialise every (train indices, validation indices) pair up front, then
# report the size of each split.
folds = list(kf.split(X))
for fold, (train_idx, val_idx) in enumerate(folds, 1):
    print(f"Fold {fold}: Train {len(train_idx)} mẫu, Val {len(val_idx)} mẫu")

Fold 1: Train 37140 mẫu, Val 9285 mẫu
Fold 2: Train 37140 mẫu, Val 9285 mẫu
Fold 3: Train 37140 mẫu, Val 9285 mẫu
Fold 4: Train 37140 mẫu, Val 9285 mẫu
Fold 5: Train 37140 mẫu, Val 9285 mẫu


In [28]:
# Persist the feature matrix and the target as separate CSV files.
X.to_csv('../data/X_processed.csv', index=False)
y.to_csv('../data/y_processed.csv', index=False)

# Save the train/validation indices of every fold. The index arrays are
# converted with .tolist() so that json.dump can serialise them.
# (`json` is imported in the notebook's top import cell.)
fold_indices = [{'train': train.tolist(), 'val': val.tolist()} for train, val in folds]
with open('../data/kfold_indices.json', 'w') as f:
    json.dump(fold_indices, f)
