# Import libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Install a blanket "ignore" filter so library warnings do not clutter the
# notebook output.
# NOTE(review): this hides every warning category, including ones that may
# point at real problems (deprecations, dtype issues) - consider narrowing it.
warnings.simplefilter("ignore")

# Load data

In [None]:
# Read the pre-split train / validation / test CSV files, in that order.
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'hanoi_real_estate_prediction/data/train.csv',
        'hanoi_real_estate_prediction/data/val.csv',
        'hanoi_real_estate_prediction/data/test.csv',
    )
)

# Stack the train and validation rows into a single frame (original row
# labels are kept, so index values may repeat).
df = pd.concat([df_train, df_val], axis=0)

# Bare expression: rendered as the cell output when run in a notebook.
df

In [None]:
# Partition the columns of the combined train+validation frame by dtype:
# object/bool columns are treated as categorical, everything else as numerical.
features_categorical = df.select_dtypes(include=['object', 'bool']).copy()
features_numerical = df.select_dtypes(exclude=['object', 'bool']).copy()

categorical_cols = list(features_categorical.columns)
numerical_cols = list(features_numerical.columns)
# 'price' is the regression target, so it must not be scaled as a feature.
numerical_cols.remove('price')

# One-hot encode categoricals (categories unseen at fit time are ignored
# instead of raising) and standardize the numerical columns.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
numerical_transformer = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols),
    ],
    remainder='passthrough',
)

# Split the target off the feature frames.
# NOTE: df_train / df_val are rebound without 'price', so re-running this cell
# without reloading the data will fail on the 'price' lookup.
y_train = df_train['price'].copy()
df_train = df_train.drop(columns='price')

y_val = df_val['price'].copy()
df_val = df_val.drop(columns='price')

# Fit the preprocessing on the training split only; validation and test are
# transformed with the statistics/categories learned from training data.
x_train = preprocessor.fit_transform(df_train)
x_val = preprocessor.transform(df_val)
x_test = preprocessor.transform(df_test)

In [None]:
# Sanity check: report the shape of each transformed feature matrix and of
# the matching target vector (the test split has no target here).
print(f"Training x: {x_train.shape}")
print(f"Training y: {y_train.shape}")

print(f"Validation x: {x_val.shape}")
print(f"Validation y: {y_val.shape}")

print(f"Test x: {x_test.shape}")

# LightGBM (Light Gradient Boosting Machine)

## Effect of the number of leaves

In [None]:
import lightgbm as lgb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import root_mean_squared_error


# Hyperparameters shared by every run of this sweep; only num_leaves varies.
params_base = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.01,
    'verbosity': -1
}


num_leaves_values = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
train_losses = []
val_losses = []

for num_leaves in num_leaves_values:
    params = {**params_base, 'num_leaves': num_leaves}

    # A fresh Dataset per run so each model builds it under its own params.
    train_data = lgb.Dataset(x_train, label=y_train)

    # No valid_sets here: the sweep uses neither early stopping nor the
    # per-round evaluation history, so evaluating both splits on each of the
    # 2000 rounds was wasted work. RMSE is computed once after training.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=2000
    )

    # Without early stopping there is no best iteration to select, so the
    # predictions always use every boosted tree.
    y_train_pred = model.predict(x_train)
    y_val_pred = model.predict(x_val)

    train_losses.append(root_mean_squared_error(y_train, y_train_pred))
    val_losses.append(root_mean_squared_error(y_val, y_val_pred))

# Train vs. validation RMSE as a function of the maximum number of leaves.
plt.figure(figsize=(8,5))
plt.plot(num_leaves_values, train_losses, label='Tập huấn luyện', marker='o')
plt.plot(num_leaves_values, val_losses, label='Tập kiểm tra', marker='o')
plt.xlabel('Số nút lá tối đa')
plt.ylabel('RMSE')
plt.title('Ảnh hưởng của số nút lá tối đa đến RMSE')
plt.xticks(num_leaves_values)
plt.legend()
plt.tight_layout()
plt.grid(True)
plt.show()


## Effect of max depth

In [None]:
import lightgbm as lgb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import root_mean_squared_error


# Hyperparameters shared by every run of this sweep; only max_depth varies.
params_base = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.01,
    'verbosity': -1
}


max_depth_values = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
train_losses = []
val_losses = []

for max_depth in max_depth_values:
    params = {**params_base, 'max_depth': max_depth}

    # A fresh Dataset per run so each model builds it under its own params.
    train_data = lgb.Dataset(x_train, label=y_train)

    # No valid_sets here: the sweep uses neither early stopping nor the
    # per-round evaluation history, so evaluating both splits on each of the
    # 2000 rounds was wasted work. RMSE is computed once after training.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=2000,
    )

    # Without early stopping there is no best iteration to select, so the
    # predictions always use every boosted tree.
    y_train_pred = model.predict(x_train)
    y_val_pred = model.predict(x_val)

    train_losses.append(root_mean_squared_error(y_train, y_train_pred))
    val_losses.append(root_mean_squared_error(y_val, y_val_pred))

# Train vs. validation RMSE as a function of the maximum tree depth.
plt.figure(figsize=(8,5))
plt.plot(max_depth_values, train_losses, label='Tập huấn luyện', marker='o')
plt.plot(max_depth_values, val_losses, label='Tập kiểm tra', marker='o')
plt.xlabel('Độ sâu tối đa')
plt.ylabel('RMSE')
plt.title('Ảnh hưởng của độ sâu tối đa đến RMSE')
plt.xticks(max_depth_values)
plt.legend()
plt.tight_layout()
plt.grid(True)
plt.show()


## Effect of min_data_in_leaf

In [None]:
import lightgbm as lgb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import root_mean_squared_error


# Hyperparameters shared by every run of this sweep; only min_data_in_leaf
# varies.
params_base = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.01,
    'verbosity': -1
}


min_data_in_leaf_values = [ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
train_losses = []
val_losses = []

for min_data_in_leaf in min_data_in_leaf_values:
    params = {**params_base, 'min_data_in_leaf': min_data_in_leaf}

    # A fresh Dataset per run so each model builds it under its own params.
    # NOTE(review): min_data_in_leaf presumably also influences Dataset
    # construction (feature pre-filtering), so keep this inside the loop -
    # confirm against the LightGBM parameter docs before hoisting it.
    train_data = lgb.Dataset(x_train, label=y_train)

    # No valid_sets here: the sweep uses neither early stopping nor the
    # per-round evaluation history, so evaluating both splits on each of the
    # 2000 rounds was wasted work. RMSE is computed once after training.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=2000,
    )

    # Without early stopping there is no best iteration to select, so the
    # predictions always use every boosted tree.
    y_train_pred = model.predict(x_train)
    y_val_pred = model.predict(x_val)

    train_losses.append(root_mean_squared_error(y_train, y_train_pred))
    val_losses.append(root_mean_squared_error(y_val, y_val_pred))

# Train vs. validation RMSE as a function of the minimum samples per leaf.
plt.figure(figsize=(8,5))
plt.plot(min_data_in_leaf_values, train_losses, label='Tập huấn luyện', marker='o')
plt.plot(min_data_in_leaf_values, val_losses, label='Tập kiểm tra', marker='o')
plt.xlabel('Số điểm dữ liệu tối thiểu trong mỗi lá')
plt.ylabel('RMSE')
plt.title('Ảnh hưởng của số điểm dữ liệu tối thiểu mỗi lá đến RMSE')
plt.xticks(min_data_in_leaf_values)
plt.legend()
# Same layout call as the other two sweep plots, for consistent figures.
plt.tight_layout()
plt.grid(True)
plt.show()


## Final model

In [None]:
import lightgbm as lgb
from sklearn.metrics import r2_score


def r2_eval(preds, train_data):
    """Compute R² of `preds` against the labels stored in a LightGBM Dataset.

    Returns a ``(name, value, is_higher_better)`` tuple, the shape LightGBM
    expects from a custom evaluation function.

    NOTE: this helper is defined but never passed to ``lgb.train`` below
    (there is no ``feval=`` argument), so it does not affect training or
    early stopping.
    """
    return 'r2', r2_score(train_data.get_label(), preds), True


# Final hyperparameters for the submitted model.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.01,
    'num_leaves': 30,
    'max_depth': 8,
    'min_data_in_leaf': 20,
    'verbosity': -1
}

train_data = lgb.Dataset(x_train, label=y_train)
val_data = lgb.Dataset(x_val, label=y_val)

# Train for up to 2000 rounds, stopping once the validation RMSE has not
# improved for 500 consecutive rounds; log the metric every 100 rounds.
model = lgb.train(
    params,
    train_data,
    num_boost_round=2000,
    valid_sets=[val_data],
    valid_names=['valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=500),
        lgb.log_evaluation(period=100),
    ],
)

# Score the validation split using the best iteration found by early stopping.
y_pred = model.predict(x_val, num_iteration=model.best_iteration)
r2 = r2_score(y_val, y_pred)
print(f"Validation R²: {r2:.4f}")


In [None]:
import pandas as pd

# Predict the target for the test split with the final model.
# NOTE: this rebinds y_pred, replacing the validation predictions.
y_pred = model.predict(x_test)

# Build the submission table: a 0-based positional 'Id' column followed by
# the predicted 'TARGET'.
# NOTE(review): 'Id' is the row position, not an identifier read from the
# test file - confirm this matches the expected submission format.
submission = pd.DataFrame({'TARGET': y_pred})
submission.insert(0, 'Id', range(len(y_pred)))

submission.to_csv('submission.csv', index=False)
