In [1]:
import pandas as pd
import numpy as np

# **1. Random Forest**


In [3]:
data = pd.read_csv("../clean_data.csv").drop("Unnamed: 0", axis = 1)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28409 entries, 0 to 28408
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   28409 non-null  object 
 1   Price (billion VND)  28409 non-null  float64
 2   Area (m2)            28409 non-null  float64
 3   Property Type        28409 non-null  object 
 4   Bedrooms             28409 non-null  int64  
 5   Bathrooms            28409 non-null  int64  
 6   Address              28409 non-null  object 
 7   Law Document         28409 non-null  object 
 8   Post Date            28409 non-null  object 
 9   Latitude             28409 non-null  float64
 10  Longitude            28409 non-null  float64
 11  Postal Code          28409 non-null  float64
 12  Importance           28409 non-null  float64
 13  Place Rank           28409 non-null  float64
 14  City                 28409 non-null  object 
dtypes: float64(7), int64(2), object(6)
m

## **1.1. Data Preprocessing**

### **1.1.1. Making changes to certain fields**

In [5]:
# Dropping unnecessary fields
data.drop(["ID", "Address"], axis = 1, inplace = True)

In [6]:
# Extracting the quarter and year from Post Date
data["Year"] = data["Post Date"].apply(lambda x: int(x.split("-")[0]))
data["Quarter"] = data["Post Date"].apply(lambda x: 1 if 1 <= int(x.split("-")[1]) <= 3
                                          else 2 if 4 <= int(x.split("-")[1]) <= 6
                                          else 3 if 7 <= int(x.split("-")[1]) <= 9
                                          else 4)

In [7]:
data.drop("Post Date", axis = 1, inplace = True)

### **1.1.2. Scaling and Numerical Encoding**

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28409 entries, 0 to 28408
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Price (billion VND)  28409 non-null  float64
 1   Area (m2)            28409 non-null  float64
 2   Property Type        28409 non-null  object 
 3   Bedrooms             28409 non-null  int64  
 4   Bathrooms            28409 non-null  int64  
 5   Law Document         28409 non-null  object 
 6   Latitude             28409 non-null  float64
 7   Longitude            28409 non-null  float64
 8   Postal Code          28409 non-null  float64
 9   Importance           28409 non-null  float64
 10  Place Rank           28409 non-null  float64
 11  City                 28409 non-null  object 
 12  Year                 28409 non-null  int64  
 13  Quarter              28409 non-null  int64  
dtypes: float64(7), int64(4), object(3)
memory usage: 3.0+ MB


In [9]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

In [10]:
numerical_fields = data.drop("Price (billion VND)", axis = 1).select_dtypes(include = ["int64", "float64"]).columns
scaler = StandardScaler()
data[numerical_fields] = scaler.fit_transform(data[numerical_fields])

In [11]:
data.head()

Unnamed: 0,Price (billion VND),Area (m2),Property Type,Bedrooms,Bathrooms,Law Document,Latitude,Longitude,Postal Code,Importance,Place Rank,City,Year,Quarter
0,1.77,-0.514786,Apartment,-0.924916,-1.611125,Valid Documents,-1.462842,1.335916,1.498438,-0.88062,1.284855,HCM,1.799852,0.398068
1,20.0,1.082414,Apartment,-0.924916,-0.743176,Valid Documents,-1.451113,1.488025,1.432554,0.188526,-0.506473,HCM,1.799852,0.398068
2,18.0,0.684554,Apartment,-0.924916,-0.743176,Valid Documents,-1.45036,1.543152,1.495161,0.188526,-0.506473,HCM,1.799852,0.398068
3,10.5,-0.199827,Apartment,-1.549241,-1.611125,Valid Documents,-1.45036,1.543152,1.495161,0.188526,-0.506473,HCM,1.799852,0.398068
4,16.5,0.70792,Apartment,-0.924916,-0.743176,Valid Documents,-1.45036,1.543152,1.495161,0.188526,-0.506473,HCM,1.799852,0.398068


In [12]:
def numerical_convert(data: pd.DataFrame, categorical_encoder) -> any:
    """ Return the converted DataFrame and the transformation dictionary """
    converted_data = data.copy()
    object_cols = data.select_dtypes(include=["object"]).columns
    for col in object_cols:
        converted_data[col] = categorical_encoder.fit_transform(data[[col]])
    return converted_data

In [13]:
# Encoding categorical fields
data = numerical_convert(data, OrdinalEncoder())
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28409 entries, 0 to 28408
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Price (billion VND)  28409 non-null  float64
 1   Area (m2)            28409 non-null  float64
 2   Property Type        28409 non-null  float64
 3   Bedrooms             28409 non-null  float64
 4   Bathrooms            28409 non-null  float64
 5   Law Document         28409 non-null  float64
 6   Latitude             28409 non-null  float64
 7   Longitude            28409 non-null  float64
 8   Postal Code          28409 non-null  float64
 9   Importance           28409 non-null  float64
 10  Place Rank           28409 non-null  float64
 11  City                 28409 non-null  float64
 12  Year                 28409 non-null  float64
 13  Quarter              28409 non-null  float64
dtypes: float64(14)
memory usage: 3.0 MB


Unnamed: 0,Price (billion VND),Area (m2),Property Type,Bedrooms,Bathrooms,Law Document,Latitude,Longitude,Postal Code,Importance,Place Rank,City,Year,Quarter
0,1.77,-0.514786,0.0,-0.924916,-1.611125,4.0,-1.462842,1.335916,1.498438,-0.88062,1.284855,0.0,1.799852,0.398068
1,20.0,1.082414,0.0,-0.924916,-0.743176,4.0,-1.451113,1.488025,1.432554,0.188526,-0.506473,0.0,1.799852,0.398068
2,18.0,0.684554,0.0,-0.924916,-0.743176,4.0,-1.45036,1.543152,1.495161,0.188526,-0.506473,0.0,1.799852,0.398068
3,10.5,-0.199827,0.0,-1.549241,-1.611125,4.0,-1.45036,1.543152,1.495161,0.188526,-0.506473,0.0,1.799852,0.398068
4,16.5,0.70792,0.0,-0.924916,-0.743176,4.0,-1.45036,1.543152,1.495161,0.188526,-0.506473,0.0,1.799852,0.398068


## **1.2. Training my model**

In [14]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, max_error
from pprint import pprint

In [172]:
def evaluate_with_kfold(model, X, y, n_splits=5) -> None:
    """
    Evaluate the model using KFold cross-validation and display performance metrics.

    Parameters:
        model: The regression model to evaluate.
        X: Features (numpy array or pandas DataFrame).
        y: Target values (numpy array or pandas Series).
        n_splits: Number of folds for KFold cross-validation.

    Returns:
        None
    """
    # Initialize KFold
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Metrics storage
    mae_list = []
    mse_list = []
    max_error_list = []
    r2_list = []

    # Cross-validation
    for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Train the model
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Compute metrics
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        me = max_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Store metrics
        mae_list.append(mae)
        mse_list.append(mse)
        max_error_list.append(me)
        r2_list.append(r2)

        # Print metrics for the current fold
        print(f"Fold {fold}:")
        print(f"  Mean Absolute Error: {mae:.2f}")
        print(f"  Mean Squared Error: {mse:.2f}")
        print(f"  Max Error: {me:.2f}")
        print(f"  R2 Score: {r2:.2f}")

    # Print average metrics across all folds
    print("\nOverall Model Performance (Across Folds):")
    print(f"  Mean Absolute Error: {np.mean(mae_list):.2f} ± {np.std(mae_list):.2f}")
    print(f"  Mean Squared Error: {np.mean(mse_list):.2f} ± {np.std(mse_list):.2f}")
    print(f"  Max Error: {np.mean(max_error_list):.2f} ± {np.std(max_error_list):.2f}")
    print(f"  R2 Score: {np.mean(r2_list):.2f} ± {np.std(r2_list):.2f}")


In [15]:
X = data.drop(columns = ["Price (billion VND)"])
y = data["Price (billion VND)"]

# Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [174]:
X_test.to_csv("test_features.csv")
y_test.to_csv("test_targets.csv")

### **1.2.1. Base model**

In [175]:
base_model = RandomForestRegressor(random_state = 42)
base_model.fit(X_train, y_train)

In [176]:
evaluate_with_kfold(base_model, X_test, y_test)

Fold 1:
  Mean Absolute Error: 1.91
  Mean Squared Error: 9.75
  Max Error: 18.10
  R2 Score: 0.53
Fold 2:
  Mean Absolute Error: 1.98
  Mean Squared Error: 9.74
  Max Error: 15.68
  R2 Score: 0.55
Fold 3:
  Mean Absolute Error: 2.00
  Mean Squared Error: 9.93
  Max Error: 14.79
  R2 Score: 0.52
Fold 4:
  Mean Absolute Error: 2.10
  Mean Squared Error: 10.93
  Max Error: 15.94
  R2 Score: 0.46
Fold 5:
  Mean Absolute Error: 1.96
  Mean Squared Error: 9.69
  Max Error: 16.15
  R2 Score: 0.53

Overall Model Performance (Across Folds):
  Mean Absolute Error: 1.99 ± 0.06
  Mean Squared Error: 10.01 ± 0.47
  Max Error: 16.13 ± 1.09
  R2 Score: 0.52 ± 0.03


In [177]:
pprint(base_model.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


### **1.2.2. RandomizedSearchCV**

In [178]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 300, num = 20)]

# Number of features to consider at every split
max_features = ['log2', 'sqrt', 1.0]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(100, 200, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, None],
 'max_features': ['log2', 'sqrt', 1.0],
 'min_samples_leaf': [1, 2],
 'min_samples_split': [2, 5],
 'n_estimators': [100,
                  110,
                  121,
                  131,
                  142,
                  152,
                  163,
                  173,
                  184,
                  194,
                  205,
                  215,
                  226,
                  236,
                  247,
                  257,
                  268,
                  278,
                  289,
                  300]}


In [179]:
rf_random = RandomizedSearchCV(estimator = RandomForestRegressor(),
                               param_distributions = random_grid,
                               n_iter = 100  , cv = 3,
                               verbose=2, random_state=42,
                               n_jobs = -1)

In [180]:
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [181]:
rf_random.best_params_

{'n_estimators': 236,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'log2',
 'max_depth': 150,
 'bootstrap': False}

In [186]:
rf_best = RandomForestRegressor(
    min_samples_split = 5,
    min_samples_leaf = 2,
    max_depth = 150,
    max_features = "log2",
    n_estimators = 236,
    bootstrap = False
)
evaluate = evaluate_with_kfold(rf_best, X_train, y_train, n_splits=5)

Fold 1:
  Mean Absolute Error: 1.86
  Mean Squared Error: 9.17
  Max Error: 15.88
  R2 Score: 0.56
Fold 2:
  Mean Absolute Error: 1.77
  Mean Squared Error: 8.29
  Max Error: 15.83
  R2 Score: 0.59
Fold 3:
  Mean Absolute Error: 1.84
  Mean Squared Error: 9.20
  Max Error: 17.97
  R2 Score: 0.57
Fold 4:
  Mean Absolute Error: 1.82
  Mean Squared Error: 8.77
  Max Error: 18.14
  R2 Score: 0.59
Fold 5:
  Mean Absolute Error: 1.82
  Mean Squared Error: 8.49
  Max Error: 14.97
  R2 Score: 0.60

Overall Model Performance (Across Folds):
  Mean Absolute Error: 1.82 ± 0.03
  Mean Squared Error: 8.78 ± 0.36
  Max Error: 16.56 ± 1.27
  R2 Score: 0.58 ± 0.02


In [18]:
rf_best.fit(X_test, y_test)
for i in range(rf_best.n_features_in_):
  print("%s: %.10f" % (rf_best.feature_names_in_[i], rf_best.feature_importances_[i]))

Area (m2): 0.2218824005
Property Type: 0.1632279778
Bedrooms: 0.0941235483
Bathrooms: 0.0678439796
Law Document: 0.0163208414
Latitude: 0.1022254531
Longitude: 0.1055591431
Postal Code: 0.0769522203
Importance: 0.0184232137
Place Rank: 0.0120223493
City: 0.0140529376
Year: 0.0779004184
Quarter: 0.0294655170


### **1.2.3. Test set evaluation**

In [19]:
y_pred = rf_best.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
me = max_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Test Set Evaluation:")
print(f"Mean Absolute Error (MAE): {mae:.3f}")
print(f"Mean Squared Error (MSE): {mse:.3f}")
print(f"Max Error (MSE): {me:.3f}")
print(f"R² Score: {r2:.3f}")

Test Set Evaluation:
Mean Absolute Error (MAE): 0.791
Mean Squared Error (MSE): 1.662
Max Error (MSE): 9.920
R² Score: 0.920


## **1.3. Save model**

In [189]:
import pickle
pickle.dump(rf_best, open("RandomForest.h5", "wb"))