In [1]:
from sklearn.neighbors import KNeighborsRegressor
from models.neighbors import KNeighborsRegressor as MyKNeighborsRegressor

from sklearn.tree import  DecisionTreeRegressor
from models.tree import DecisionTreeRegressor as MyDecisionTreeRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.datasets import fetch_california_housing

from models.utils import save_metrics_to_csv
import numpy as np


In [2]:
path_regression_metrics = "../reports/regression_metrics.csv"

In [3]:
data  = fetch_california_housing(as_frame=True)
df_housing = data.frame

## 📊 Mô tả các cột dữ liệu

| Tên cột              | Giải thích |
|----------------------|------------|
| **MedInc**           | **Thu nhập trung vị** của cư dân trong khu vực (tính theo đơn vị: chục nghìn USD). Ví dụ: `8.3252` tương đương với khoảng `$83,252`. |
| **HouseAge**         | **Tuổi trung bình của các căn nhà** trong khu vực (tính bằng năm). |
| **AveRooms**         | **Số phòng trung bình** mỗi hộ dân trong khu vực. |
| **AveBedrms**        | **Số phòng ngủ trung bình** mỗi hộ dân. |
| **Population**       | **Tổng dân số** trong khu vực. |
| **AveOccup**         | **Số người trung bình** sống trong một hộ. |
| **Latitude**         | **Vĩ độ** của khu vực (tọa độ địa lý). |
| **Longitude**        | **Kinh độ** của khu vực (tọa độ địa lý). |
| **MedHouseVal**      | **Giá trị nhà trung vị** trong khu vực (tính theo đơn vị: trăm nghìn USD). |

In [8]:
print("First 5 rows of the dataset:")
print(df_housing.head())

First 5 rows of the dataset:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  RoomsPerPerson  
0    -122.23        4.526        2.732919  
1    -122.22        3.585        2.956685  
2    -122.24        3.521        2.957661  
3    -122.25        3.413        2.283154  
4    -122.25        3.422        2.879646  


In [7]:
print("\nDataset Info:")
df_housing.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [4]:
df_housing['RoomsPerPerson'] = df_housing['AveRooms'] / df_housing['AveOccup']
df_housing = df_housing.replace([np.inf, -np.inf], np.nan).dropna()

In [5]:
X = df_housing.drop('MedHouseVal', axis=1)
y = df_housing['MedHouseVal']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
model = KNeighborsRegressor(n_neighbors=3)
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
train_metrics = {
    "MSE": mean_squared_error(y_train, y_train_pred),
    "MAE": mean_absolute_error(y_train, y_train_pred),
}
save_metrics_to_csv(path_regression_metrics, "KNN", "train", train_metrics)

y_test_pred = model.predict(X_test)
test_metrics = {
    "MSE": mean_squared_error(y_test, y_test_pred),
    "MAE": mean_absolute_error(y_test, y_test_pred),
}
save_metrics_to_csv(path_regression_metrics, "KNN", "test", test_metrics)

NameError: name 'csv' is not defined

In [13]:
model = MyKNeighborsRegressor(k=3)
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
print("\n--- Train Evaluation ---")
print("MSE:", mean_squared_error(y_train, y_train_pred))
print("MAE:", mean_absolute_error(y_train, y_train_pred))
print("R² Score:", r2_score(y_train, y_train_pred))

# 4. Dự đoán trên tập Test
y_test_pred = model.predict(X_test)
print("\n--- Test Evaluation ---")
print("MSE:", mean_squared_error(y_test, y_test_pred))
print("MAE:", mean_absolute_error(y_test, y_test_pred))
print("R² Score:", r2_score(y_test, y_test_pred))


--- Train Evaluation ---
MSE: 0.5516148036960722
MAE: 0.5634146299660853
R² Score: 0.5873550310506078

--- Test Evaluation ---
MSE: 1.151877871209149
MAE: 0.8235785457041344
R² Score: 0.12097834314639977


In [14]:
model = DecisionTreeRegressor(max_depth=5)
model.fit(X, y)

y_pred = model.predict(X)
print(y_pred)

mse = mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
r2 = r2_score(y, y_pred)

print("MSE:", mse)
print("MAE:", mae)
print("R² Score:", r2)

[4.76503848 3.99102448 4.25729486 ... 0.86923132 0.86923132 0.86923132]
MSE: 0.4905640694739824
MAE: 0.5107515840572718
R² Score: 0.6315842747781715


In [16]:
model = MyDecisionTreeRegressor(max_depth=5)
model.fit(X, y)

# Dự đoán
y_pred = model.predict(X)

# Tính Loss
mse = mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
r2 = r2_score(y, y_pred)
print("Dự đoán:   ", y_pred.tolist())
print("Chênh lệch:", (y - y_pred).tolist())

print("MSE:", mse)
print("MAE:", mae)
print("R² Score:", r2)

Thực tế:    [4.526, 3.585, 3.521, 3.413, 3.422, 2.697, 2.992, 2.414, 2.267, 2.611, 2.815, 2.418, 2.135, 1.913, 1.592, 1.4, 1.525, 1.555, 1.587, 1.629, 1.475, 1.598, 1.139, 0.997, 1.326, 1.075, 0.938, 1.055, 1.089, 1.32, 1.223, 1.152, 1.104, 1.049, 1.097, 0.972, 1.045, 1.039, 1.914, 1.76, 1.554, 1.5, 1.188, 1.888, 1.844, 1.823, 1.425, 1.375, 1.875, 1.125, 1.719, 0.938, 0.975, 1.042, 0.875, 0.831, 0.875, 0.853, 0.803, 0.6, 0.757, 0.75, 0.861, 0.761, 0.735, 0.784, 0.844, 0.813, 0.85, 1.292, 0.825, 0.952, 0.75, 0.675, 1.375, 1.775, 1.021, 1.083, 1.125, 1.313, 1.625, 1.125, 1.125, 1.375, 1.188, 0.982, 1.188, 1.625, 1.375, 5.00001, 1.625, 1.375, 1.625, 1.875, 1.792, 1.3, 1.838, 1.25, 1.7, 1.931, 2.578, 2.734, 2.375, 3.5, 3.357, 3.134, 2.685, 2.594, 2.757, 2.25, 2.625, 2.185, 2.55, 2.241, 2.431, 2.316, 2.185, 2.341, 3.276, 3.476, 3.661, 3.35, 3.736, 3.895, 3.911, 3.373, 2.952, 2.923, 4.115, 3.115, 3.259, 3.926, 3.193, 3.333, 3.352, 3.512, 3.689, 3.659, 3.667, 3.628, 4.833, 3.314, 3.235, 2.167