# Include libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Install a global "ignore" filter so no Python warning is shown from here on
# (presumably to keep cell output uncluttered).
# NOTE(review): this also hides deprecation and numerical warnings raised by the
# libraries imported above — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Load data

In [None]:
# Read the three dataset splits in a fixed order: train, validation, test.
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './hanoi_real_estate_prediction/data/train.csv',
        './hanoi_real_estate_prediction/data/val.csv',
        './hanoi_real_estate_prediction/data/test.csv',
    )
)

# Stack the train and validation rows into one frame (row labels are kept as-is);
# the bare expression on the last line displays the frame as the cell output.
df = pd.concat([df_train, df_val])
df

In [None]:
# Pull the regression target out of each labelled split, then drop it from the features.
y_train = df_train['price'].copy()
y_val = df_val['price'].copy()
df_train = df_train.drop(columns=['price'])
df_val = df_val.drop(columns=['price'])

# Column groups are inferred from the dtypes of the combined train+validation frame:
# object/bool columns are treated as categorical, every other dtype as numerical.
categorical_dtypes = ['object', 'bool']
features_categorical = df.select_dtypes(include=categorical_dtypes).copy()
features_numerical = df.select_dtypes(exclude=categorical_dtypes).copy()

categorical_cols = features_categorical.columns.to_list()
numerical_cols = features_numerical.columns.to_list()
# 'price' is the target, so it must not be scaled as an input feature.
numerical_cols.remove('price')

# One-hot encode categoricals (categories unseen at fit time are ignored at
# transform time) and standardise numericals.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols),
    ],
    # Any column not listed in the two groups above is forwarded unchanged.
    remainder='passthrough',
)

# Fit the encoder/scaler on the training split only, then apply the fitted
# transformation unchanged to the validation and test splits.
x_train = preprocessor.fit_transform(df_train)
x_val = preprocessor.transform(df_val)
x_test = preprocessor.transform(df_test)

In [None]:
# Sanity check: report the shape of every feature matrix and target vector.
print(f"Training x: {x_train.shape}")
print(f"Training y: {y_train.shape}")

print(f"Validation x: {x_val.shape}")
print(f"Validation y: {y_val.shape}")

# The test split has no target, so only its feature matrix is reported.
print(f"Test x: {x_test.shape}")

# Extreme Gradient Boosting

## Effect of max depth

In [None]:
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error

# Tree depths to compare; every other booster setting is held fixed.
depth_values = list(range(2, 11))

# Convert data to xgboost type
dtrain = xgb.DMatrix(x_train, label=y_train)
dval = xgb.DMatrix(x_val, label=y_val)

# Settings shared by every run of the sweep.
base_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.01,
}

train_rmse_list, val_rmse_list = [], []

for depth in depth_values:
    params = {**base_params, 'max_depth': depth}

    # Train for the full 2000 rounds (no early stopping) without per-round logging.
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=2000,
        evals=[(dtrain, 'train'), (dval, 'eval')],
        verbose_eval=False,
    )

    # Score the fitted booster on both splits.
    train_rmse = root_mean_squared_error(y_train, model.predict(dtrain))
    val_rmse = root_mean_squared_error(y_val, model.predict(dval))

    train_rmse_list.append(train_rmse)
    val_rmse_list.append(val_rmse)

    print(f"max_depth={depth:2d} | Train RMSE = {train_rmse:.4f} | Val RMSE = {val_rmse:.4f}")

# Plot train vs. validation RMSE against depth (labels are in Vietnamese).
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(depth_values, train_rmse_list, marker='o', label='Tập huấn luyện')
ax.plot(depth_values, val_rmse_list, marker='o', label='Tập kiểm tra')
ax.set_title("Ảnh hưởng của độ sâu tối đa mỗi cây đến RMSE")
ax.set_xlabel("Độ sâu tối đa của mỗi cây")
ax.set_ylabel("RMSE")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


## Effect of min_child_weight

In [None]:
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error

# Candidate values 1..15 for min_child_weight; every other setting is held fixed
# (max_depth is not set here, so the library default applies).
min_child_weight_values = list(range(1, 16))

dtrain = xgb.DMatrix(x_train, label=y_train)
dval = xgb.DMatrix(x_val, label=y_val)

# Settings shared by every run of the sweep.
base_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.01,
}

train_rmse_list, val_rmse_list = [], []

for min_child_weight in min_child_weight_values:
    params = {**base_params, 'min_child_weight': min_child_weight}

    # NOTE(review): the early-stopping patience equals num_boost_round, so it
    # presumably can never trigger — confirm whether a smaller value was intended.
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=2000,
        evals=[(dtrain, 'train'), (dval, 'eval')],
        early_stopping_rounds=2000,
        verbose_eval=False,
    )

    # Score the fitted booster on both splits.
    train_rmse = root_mean_squared_error(y_train, model.predict(dtrain))
    val_rmse = root_mean_squared_error(y_val, model.predict(dval))

    train_rmse_list.append(train_rmse)
    val_rmse_list.append(val_rmse)

    print(f"min_child_weight={min_child_weight:2d} | Train RMSE = {train_rmse:.4f} | Val RMSE = {val_rmse:.4f}")

# Plot train vs. validation RMSE against min_child_weight (labels are in Vietnamese).
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(min_child_weight_values, train_rmse_list, marker='o', label='Dữ liệu huấn luyện')
ax.plot(min_child_weight_values, val_rmse_list, marker='o', label='Dữ liệu kiểm tra')
ax.set_title("Ảnh hưởng của tổng trọng số tối thiểu mỗi nút lá đến RMSE")
ax.set_xlabel("Tổng trọng số tối thiểu mỗi nút lá")
ax.set_ylabel("RMSE")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


## Final model

In [None]:
import xgboost as xgb
# root_mean_squared_error is used below, so it is imported here: the cell must
# not depend on one of the earlier sweep cells having been run first.
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

# Wrap the preprocessed train/validation matrices for the native XGBoost API.
dtrain = xgb.DMatrix(x_train, label=y_train)
dval = xgb.DMatrix(x_val, label=y_val)

# Final hyper-parameters: squared-error regression tracked with RMSE.
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.1,
    'max_depth': 3,
    'min_child_weight': 6,
    'subsample': 0.9,  # each boosting round sees a 90% row sample
}

# Both splits are evaluated every round. Per the XGBoost docs, the last entry
# (the validation set) is the one that drives early stopping.
evals = [(dtrain, 'train'), (dval, 'eval')]

# Train for at most 2000 rounds, stopping once the validation RMSE has not
# improved for 100 consecutive rounds.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=2000,
    evals=evals,
    early_stopping_rounds=100,
)

# Predict with the trees up to and including the best round only, so the rounds
# trained after the best validation score are excluded.
y_pred = model.predict(dval, iteration_range=(0, model.best_iteration + 1))

rmse = root_mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation R²: {r2:.4f}")


In [None]:
import pandas as pd

# Score the preprocessed test features with the final booster.
dtest = xgb.DMatrix(x_test)

# Use only the trees up to the best early-stopping round — the same iteration
# range used when computing the validation metrics — so the submission comes
# from the model that was actually evaluated, not the over-trained full booster.
y_pred = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))

# NOTE(review): 'Id' is simply the 0-based row position of the prediction —
# confirm this matches the identifier expected by the submission format.
submission = pd.DataFrame({
    'Id': range(len(y_pred)),
    'TARGET': y_pred
})

# Write the predictions without the DataFrame index column.
submission.to_csv('submission.csv', index=False)
