In [22]:
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

In [2]:
# Read the combined listings, then discard the "Unnamed: 0" column
# (presumably a leftover index from an earlier to_csv — confirm).
data = pd.read_csv("combinedData.csv")
data = data.drop(columns=["Unnamed: 0"])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28409 entries, 0 to 28408
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   28409 non-null  object 
 1   Price (billion VND)  28409 non-null  float64
 2   Area (m2)            28409 non-null  float64
 3   Property Type        28409 non-null  object 
 4   Bedrooms             28409 non-null  int64  
 5   Bathrooms            28409 non-null  int64  
 6   Address              28409 non-null  object 
 7   Law Document         28409 non-null  object 
 8   Post Date            28409 non-null  object 
 9   Latitude             28409 non-null  float64
 10  Longitude            28409 non-null  float64
 11  Postal Code          28409 non-null  float64
 12  Importance           28409 non-null  float64
 13  Place Rank           28409 non-null  float64
 14  City                 28409 non-null  object 
dtypes: float64(7), int64(2), object(6)
m

# **1. Finalizing the labels**

## **1.1. Property Type**

In [3]:
# Distinct property-type labels before normalisation.
data.loc[:, "Property Type"].unique()

array(['Căn hộ', 'Căn hộ Studio', 'Nhà phố', 'can ho chung cu',
       'nha rieng', 'nha pho du an', 'biet thu'], dtype=object)

In [4]:
# Map the accented spellings onto unaccented labels in a single pass.
data["Property Type"] = data["Property Type"].replace(
    {
        "Căn hộ": "can ho chung cu",
        "Căn hộ Studio": "can ho studio",
        "Nhà phố": "nha pho du an",
    }
)

In [5]:
# Distinct property-type labels after normalisation.
data.loc[:, "Property Type"].unique()

array(['can ho chung cu', 'can ho studio', 'nha pho du an', 'nha rieng',
       'biet thu'], dtype=object)

## **1.2. Law Document**

In [6]:
# Distinct legal-document labels before normalisation.
data.loc[:, "Law Document"].unique()

array(['So hong', 'Hop đong', 'Giay đo', 'KXĐ', 'not provided',
       'Giay to hop le', 'Đang hop thuc hoa', 'Giay tay',
       'Chu quyen tu nhan', 'Khong xac đinh'], dtype=object)

In [7]:
# Collapse the two alternative "unknown" spellings into "Khong xac đinh".
data["Law Document"] = data["Law Document"].replace(
    {"KXĐ": "Khong xac đinh", "not provided": "Khong xac đinh"}
)

In [8]:
# Distinct legal-document labels after normalisation.
data.loc[:, "Law Document"].unique()

array(['So hong', 'Hop đong', 'Giay đo', 'Khong xac đinh',
       'Giay to hop le', 'Đang hop thuc hoa', 'Giay tay',
       'Chu quyen tu nhan'], dtype=object)

## **1.3. Post Date**

In [9]:
# Split "Post Date" on "-" once and store the first three pieces as integers.
# NOTE(review): "Date" receives the third piece — presumably the day of month
# of a YYYY-MM-DD string; confirm the source format.
post_date_parts = data["Post Date"].str.split("-")
for position, column in enumerate(["Year", "Month", "Date"]):
    data[column] = post_date_parts.str[position].astype("int64")

In [10]:
# The raw string column is no longer needed once Year/Month/Date exist.
data.drop(columns=["Post Date"], inplace=True)

In [11]:
# Re-inspect the frame: "Post Date" should be gone and Year/Month/Date added.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28409 entries, 0 to 28408
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   28409 non-null  object 
 1   Price (billion VND)  28409 non-null  float64
 2   Area (m2)            28409 non-null  float64
 3   Property Type        28409 non-null  object 
 4   Bedrooms             28409 non-null  int64  
 5   Bathrooms            28409 non-null  int64  
 6   Address              28409 non-null  object 
 7   Law Document         28409 non-null  object 
 8   Latitude             28409 non-null  float64
 9   Longitude            28409 non-null  float64
 10  Postal Code          28409 non-null  float64
 11  Importance           28409 non-null  float64
 12  Place Rank           28409 non-null  float64
 13  City                 28409 non-null  object 
 14  Year                 28409 non-null  int64  
 15  Month                28409 non-null 

# **2. Training my Random Forest model**

## **2.1. Preprocessing data**

### **2.1.1. Scaling numerical fields**

In [12]:
# ID and Address are left out of the modelling frame.
used_data = data.drop(columns=["ID", "Address"])

# Standardise every float64/int64 column.
# NOTE(review): this selection includes "Price (billion VND)", which is used as
# the target later, and the scaler is fitted on all rows before the train/test
# split — confirm both are intended.
numerical_fields = used_data.select_dtypes(include=["float64", "int64"]).columns
scaler = StandardScaler().fit(used_data[numerical_fields])
used_data[numerical_fields] = scaler.transform(used_data[numerical_fields])

In [13]:
# Display the frame after scaling; the categorical columns are still strings here.
used_data

Unnamed: 0,Price (billion VND),Area (m2),Property Type,Bedrooms,Bathrooms,Law Document,Latitude,Longitude,Postal Code,Importance,Place Rank,City,Year,Month,Date
0,-0.835496,-0.514786,can ho chung cu,-0.924916,-1.611125,So hong,-1.462842,1.335916,1.498438,-0.880620,1.284855,HCM,1.799852,0.709435,-0.565353
1,3.143606,1.082414,can ho chung cu,-0.924916,-0.743176,Hop đong,-1.451113,1.488025,1.432554,0.188526,-0.506473,HCM,1.799852,0.387465,1.660362
2,2.707062,0.684554,can ho chung cu,-0.924916,-0.743176,Hop đong,-1.450360,1.543152,1.495161,0.188526,-0.506473,HCM,1.799852,0.387465,-1.385353
3,1.070020,-0.199827,can ho chung cu,-1.549241,-1.611125,Hop đong,-1.450360,1.543152,1.495161,0.188526,-0.506473,HCM,1.799852,0.387465,-1.385353
4,2.379653,0.707920,can ho chung cu,-0.924916,-0.743176,Hop đong,-1.450360,1.543152,1.495161,0.188526,-0.506473,HCM,1.799852,0.387465,-1.385353
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28404,2.161381,0.154182,biet thu,2.196706,-0.743176,Khong xac đinh,0.689577,-0.373589,-0.694185,0.188642,-0.506473,HN,1.799852,-0.578444,-0.331067
28405,-1.167270,2.042654,biet thu,1.572382,0.992724,Khong xac đinh,0.684676,-0.587367,-0.637818,0.188642,-0.506473,HN,1.799852,-0.256474,-1.619639
28406,3.580151,0.954382,biet thu,-0.300592,-0.743176,Khong xac đinh,0.695583,-0.683411,-0.686202,-0.880504,1.284855,HN,1.799852,-0.256474,-0.448210
28407,3.143606,0.154182,biet thu,-0.924916,0.124774,Khong xac đinh,0.683845,-0.701014,-0.689479,0.188642,-0.506473,HN,1.799852,0.387465,-0.799639


### **2.1.2. Encoding categorical fields**

In [14]:
def numerical_convert(data: pd.DataFrame, categorical_encoder) -> pd.DataFrame:
    """Return a copy of ``data`` with every object-dtype column encoded.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame; it is copied, never modified.
    categorical_encoder
        Object exposing ``fit_transform(frame)`` that returns a 2-D array with
        one encoded column per input column (e.g. scikit-learn's
        ``OrdinalEncoder``). It is fitted once on all object columns together,
        so after the call it holds the fitted state for every encoded column
        instead of only the last one.

    Returns
    -------
    pd.DataFrame
        The converted copy. Columns that are not object-dtype are unchanged.
    """
    converted_data = data.copy()
    object_cols = data.select_dtypes(include=["object"]).columns
    if len(object_cols) == 0:
        # Nothing to encode; do not fit the encoder on an empty selection.
        return converted_data

    encoded = categorical_encoder.fit_transform(data[object_cols])
    for position, col in enumerate(object_cols):
        # Assign one column at a time so each column takes the encoded dtype.
        converted_data[col] = encoded[:, position]
    return converted_data

In [15]:
# Encode the remaining string columns with an ordinal encoder.
input_data = numerical_convert(
    data=used_data,
    categorical_encoder=OrdinalEncoder(),
)

In [16]:
# Display the fully numeric frame that is used to build X and y.
input_data

Unnamed: 0,Price (billion VND),Area (m2),Property Type,Bedrooms,Bathrooms,Law Document,Latitude,Longitude,Postal Code,Importance,Place Rank,City,Year,Month,Date
0,-0.835496,-0.514786,1.0,-0.924916,-1.611125,6.0,-1.462842,1.335916,1.498438,-0.880620,1.284855,0.0,1.799852,0.709435,-0.565353
1,3.143606,1.082414,1.0,-0.924916,-0.743176,4.0,-1.451113,1.488025,1.432554,0.188526,-0.506473,0.0,1.799852,0.387465,1.660362
2,2.707062,0.684554,1.0,-0.924916,-0.743176,4.0,-1.450360,1.543152,1.495161,0.188526,-0.506473,0.0,1.799852,0.387465,-1.385353
3,1.070020,-0.199827,1.0,-1.549241,-1.611125,4.0,-1.450360,1.543152,1.495161,0.188526,-0.506473,0.0,1.799852,0.387465,-1.385353
4,2.379653,0.707920,1.0,-0.924916,-0.743176,4.0,-1.450360,1.543152,1.495161,0.188526,-0.506473,0.0,1.799852,0.387465,-1.385353
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28404,2.161381,0.154182,0.0,2.196706,-0.743176,5.0,0.689577,-0.373589,-0.694185,0.188642,-0.506473,1.0,1.799852,-0.578444,-0.331067
28405,-1.167270,2.042654,0.0,1.572382,0.992724,5.0,0.684676,-0.587367,-0.637818,0.188642,-0.506473,1.0,1.799852,-0.256474,-1.619639
28406,3.580151,0.954382,0.0,-0.300592,-0.743176,5.0,0.695583,-0.683411,-0.686202,-0.880504,1.284855,1.0,1.799852,-0.256474,-0.448210
28407,3.143606,0.154182,0.0,-0.924916,0.124774,5.0,0.683845,-0.701014,-0.689479,0.188642,-0.506473,1.0,1.799852,0.387465,-0.799639


## **2.2. Preparing the data**

In [17]:
# Target is the price column (already standardised by the scaling step);
# every other column is a feature.
y = input_data["Price (billion VND)"]
X = input_data.drop("Price (billion VND)", axis=1)

# Hold out 20% of the rows as the test set, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## **2.3. Training the base model**

In [18]:
def evaluate(model, X_test, y_test) -> None:
    """Print MAE, MSE and R2 of ``model``'s predictions for ``X_test``.

    ``model`` only needs a ``predict`` method. The scores are computed against
    ``y_test`` and printed with two decimals; nothing is returned.
    """
    predictions = model.predict(X_test)
    scores = (
        ("Mean Absolute Error", mean_absolute_error(y_test, predictions)),
        ("Mean Squared Error", mean_squared_error(y_test, predictions)),
        ("R2 score", r2_score(y_test, predictions)),
    )
    print('Model Performance:')
    for label, value in scores:
        print(f"{label}: {value:0.2f}")

In [19]:
# Baseline: a forest with default hyper-parameters and a fixed seed.
base_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
evaluate(base_model, X_test, y_test)

Model Performance:
Mean Absolute Error: 0.41
Mean Squared Error: 0.44
R2 score: 0.56


In [20]:
# Show the hyper-parameters the baseline forest was built with
# (pprint is imported with the other imports at the top of the notebook).
pprint(base_model.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


## **2.4. RandomizedSearchCV training**

In [23]:
# Candidate values the randomized search samples from.

# Forest size: 100 to 200 trees in steps of 10
n_estimators = list(range(100, 201, 10))

# Feature-subset rule applied at every split
max_features = ['log2', 'sqrt']

# Depth limit: 10 to 110 in steps of 10, plus None
max_depth = [*range(10, 111, 10), None]

# Minimum number of samples a node needs before it may be split
min_samples_split = [2, 5]

# Minimum number of samples that must remain in a leaf
min_samples_leaf = [1, 2]

# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

# Search space, keyed by RandomForestRegressor parameter name
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['log2', 'sqrt'],
 'min_samples_leaf': [1, 2],
 'min_samples_split': [2, 5],
 'n_estimators': [100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200]}


In [None]:
# Define the model. The forest gets its own seed: the search's random_state
# only governs which parameter combinations are sampled, so an unseeded
# estimator would make the fitted forests (and the scores) vary between runs.
model = RandomForestRegressor(random_state=42)
rf_random = RandomizedSearchCV(
    estimator=model,
    param_distributions=random_grid,
    n_iter=80,        # number of parameter combinations to sample
    cv=3,             # 3-fold cross-validation per combination
    verbose=2,
    random_state=42,
    n_jobs=-1,        # run fits in parallel on all available cores
)

# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the randomized search.
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': None,
 'bootstrap': False}

In [None]:
# Score the best estimator found by the search on the held-out test split.
best_random = rf_random.best_estimator_
evaluate(model=best_random, X_test=X_test, y_test=y_test)

Model Performance:
Mean Absolute Error: 0.39
Mean Squared Error: 0.40
R2 score: 0.60
