In [42]:
import pandas as pd 
import wandb
import os
import joblib
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [43]:
run = wandb.init(project="predict_house_price", job_type="model_download")

artifact = run.use_artifact('predict_house_price/xgb_model:latest', type='model')
artifact_dir = artifact.download()

model = joblib.load(f"{artifact_dir}/xgb_model.pkl")


[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [44]:
run = wandb.init(project="predict_house_price", job_type="download_pca_data")

artifact = run.use_artifact('pca_X_y_train_test:latest', type='processed_data')

artifact_dir = artifact.download()

X_train = pd.read_csv(os.path.join(artifact_dir, "X_train.csv"), index_col=0)
X_test = pd.read_csv(os.path.join(artifact_dir, "X_test.csv"), index_col=0)
y_train = pd.read_csv(os.path.join(artifact_dir, "y_train.csv"), index_col=0)
y_test = pd.read_csv(os.path.join(artifact_dir, "y_test.csv"), index_col=0)

run.finish()

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[34m[1mwandb[0m:   4 of 4 files downloaded.  


In [45]:
# GridSearch để tìm tham số tốt
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 1],
    'colsample_bytree': [0.7, 1]
}

xgb = xgb.XGBRegressor(random_state=42)
grid_search = GridSearchCV(xgb, param_grid, cv=3, scoring="neg_root_mean_squared_error", verbose=1, n_jobs=-1)

grid_search.fit(X_train, y_train)

# In kết quả
print("Best Parameters:", grid_search.best_params_)

# Đánh giá trên tập test
best_model = grid_search.best_estimator_
y_pred = np.expm1(best_model.predict(X_test))
y_true = np.expm1(y_test)

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)
print(f"📈 RMSE (real scale): {rmse:,.2f}")
print(f"📊 R² Score (real scale): {r2:.4f}")


Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best Parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1}
📈 RMSE (real scale): 26,263.18
📊 R² Score (real scale): 0.8856


In [46]:
run = wandb.init(project="predict_house_price", job_type="download_preprocessed")

artifact = run.use_artifact('preprocessed_data:latest', type='processed_data')

artifact_dir = artifact.download()

preprocessed = pd.read_csv(os.path.join(artifact_dir, "preprocessed_data.csv"))

run.finish()


[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


In [47]:
preprocessed["TotalSF"] = preprocessed["1stFlrSF"] + preprocessed["2ndFlrSF"] + preprocessed["TotalBsmtSF"]
preprocessed["Age"] = preprocessed["YrSold"] - preprocessed["YearBuilt"]
preprocessed["RemodAge"] = preprocessed["YrSold"] - preprocessed["YearRemodAdd"]
preprocessed["HasGarage"] = (preprocessed["GarageArea"] > 0).astype(int)
preprocessed["HasBsmt"] = (preprocessed["TotalBsmtSF"] > 0).astype(int)
preprocessed["OverallLivQual"] = preprocessed["GrLivArea"] * preprocessed["OverallQual"]

In [48]:
# 2. Tách X và y
X = preprocessed.drop(columns=["SalePrice"])
y = np.log1p(preprocessed["SalePrice"])

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [50]:
run = wandb.init(project="predict_house_price", job_type="save_tuned_data")

os.makedirs("tuned_data", exist_ok=True)

X_train.to_csv("tuned_data/X_train.csv")
X_test.to_csv("tuned_data/X_test.csv")
y_train.to_csv("tuned_data/y_train.csv")
y_test.to_csv("tuned_data/y_test.csv")

artifact = wandb.Artifact(
    name="tuned_X_y_train_test",  # tên mới cho artifact
    type="processed_data",        # type vẫn giữ processed_data
    description="Dataset after tuning features and processing",
)

artifact.add_file("tuned_data/X_train.csv")
artifact.add_file("tuned_data/X_test.csv")
artifact.add_file("tuned_data/y_train.csv")
artifact.add_file("tuned_data/y_test.csv")

run.log_artifact(artifact)

run.finish()


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


In [58]:
import re

def clean_column(name):
    # Chỉ giữ chữ cái, số và dấu _
    return re.sub(r'[^A-Za-z0-9_]', '', name)

X_train.columns = [clean_column(col) for col in X_train.columns]
X_test.columns = [clean_column(col) for col in X_test.columns]


In [59]:
import sys
import subprocess

try:
    from lightgbm import LGBMRegressor
except ModuleNotFoundError:
    print("lightgbm chưa được cài đặt. Đang tiến hành cài đặt...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lightgbm"])
    from lightgbm import LGBMRegressor

from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Base learners
estimators = [
    ("xgb", XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=4,
                         subsample=0.7, colsample_bytree=0.7, random_state=42)),
    ("lgb", LGBMRegressor(n_estimators=200, learning_rate=0.05, max_depth=4,
                          subsample=0.7, colsample_bytree=0.7, random_state=42)),
    ("ridge", Ridge(alpha=10))
]
# Final estimator
stack_model = StackingRegressor(estimators=estimators, final_estimator=Ridge(alpha=5))
stack_model.fit(X_train, y_train)

# Predict & evaluate
y_pred = np.expm1(stack_model.predict(X_test))
y_true = np.expm1(y_test)

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

print(f"✅ RMSE: {rmse:,.2f}")
print(f"📊 R² Score: {r2:.4f}")



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000693 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4788
[LightGBM] [Info] Number of data points in the train set: 1052, number of used features: 136
[LightGBM] [Info] Start training from score 12.001088
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000867 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4540
[LightGBM] [Info] Number of data points in the train set: 841, number of used features: 125
[LightGBM] [Info] Start training from score 11.996388
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4538
[LightGBM] [Info] Number of data points in the train se

In [65]:
if "SalePrice" in preprocessed.columns:
    clean_data = preprocessed.drop(columns=["SalePrice"])

# Xuất ra file CSV
clean_data.to_csv("test_input_for_api.csv", index=False)

In [66]:
import wandb
import joblib

run = wandb.init(project="predict_house_price", job_type="upload_test_input")

artifact = wandb.Artifact(
    name="test_input_for_api",     # Tên artifact
    type="test_data",              # Loại artifact
    description="Test input data for API prediction"
)

artifact.add_file("test_input_for_api.csv")

run.log_artifact(artifact)

run.finish()


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


In [67]:
print("Expected features:", model.n_features_in_)

Expected features: 202
