# **Artificial Neural Network**

# **1. Setting up the environments**

In [496]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, Add, Activation
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, max_error, r2_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# **2. Inspecting our data**

In [384]:
data = pd.read_csv("clean_data.csv").drop("Unnamed: 0", axis=1)

In [385]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28409 entries, 0 to 28408
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   28409 non-null  object 
 1   Price (billion VND)  28409 non-null  float64
 2   Area (m2)            28409 non-null  float64
 3   Property Type        28409 non-null  object 
 4   Bedrooms             28409 non-null  int64  
 5   Bathrooms            28409 non-null  int64  
 6   Address              28409 non-null  object 
 7   Law Document         28409 non-null  object 
 8   Post Date            28409 non-null  object 
 9   Latitude             28409 non-null  float64
 10  Longitude            28409 non-null  float64
 11  Postal Code          28409 non-null  float64
 12  Importance           28409 non-null  float64
 13  Place Rank           28409 non-null  float64
 14  City                 28409 non-null  object 
dtypes: float64(7), int64(2), object(6)
m

In [386]:
data.head()

Unnamed: 0,ID,Price (billion VND),Area (m2),Property Type,Bedrooms,Bathrooms,Address,Law Document,Post Date,Latitude,Longitude,Postal Code,Importance,Place Rank,City
0,ABC118222,1.77,54.1,Apartment,2,1,"Số 4, Phong Phú, Bình Chánh",Valid Documents,2024-09-11,10.714894,106.640598,72900.0,6.7e-05,30.0,HCM
1,A01198772,20.0,104.0,Apartment,2,2,"Nguyễn Thị Minh Khai, Bến Nghé, Quận 1",Valid Documents,2024-08-30,10.770974,106.70247,71010.0,0.0534,26.0,HCM
2,A0289931,18.0,91.57,Apartment,2,2,"Mai Chí Thọ, Thủ Thiêm, Quận 2",Valid Documents,2024-08-04,10.774573,106.724893,72806.0,0.0534,26.0,HCM
3,A0287287,10.5,63.94,Apartment,1,1,"Mai Chí Thọ, Thủ Thiêm, Quận 2",Valid Documents,2024-08-04,10.774573,106.724893,72806.0,0.0534,26.0,HCM
4,A02198668,16.5,92.3,Apartment,2,2,"Mai Chí Thọ, Thủ Thiêm, Quận 2",Valid Documents,2024-08-04,10.774573,106.724893,72806.0,0.0534,26.0,HCM


# **3.Data Preprocessing**

In [387]:
# Dropping unused properties
data.drop(columns = ["ID", "Address"], axis = 1, inplace = True)

## **3.1. Scaling numerical data**

In [388]:
# Extract dates, months, and years 
data["Year"] = data["Post Date"].apply(lambda x: int(x.split('-')[0]))
data["Month"] = data["Post Date"].apply(lambda x: int(x.split('-')[1]))
data["Date"] = data["Post Date"].apply(lambda x: int(x.split('-')[2]))
data.drop("Post Date", axis = 1, inplace= True)

In [389]:
# Our target is "Price (billion VND)"
numerical_fields = data.drop("Price (billion VND)", axis = 1).select_dtypes(include = ["int64", "float64"]).columns
scaler = StandardScaler()
data[numerical_fields] = scaler.fit_transform(data[numerical_fields])

In [390]:
data.head()

Unnamed: 0,Price (billion VND),Area (m2),Property Type,Bedrooms,Bathrooms,Law Document,Latitude,Longitude,Postal Code,Importance,Place Rank,City,Year,Month,Date
0,1.77,-0.514786,Apartment,-0.924916,-1.611125,Valid Documents,-1.462842,1.335916,1.498438,-0.88062,1.284855,HCM,1.799852,0.709435,-0.565353
1,20.0,1.082414,Apartment,-0.924916,-0.743176,Valid Documents,-1.451113,1.488025,1.432554,0.188526,-0.506473,HCM,1.799852,0.387465,1.660362
2,18.0,0.684554,Apartment,-0.924916,-0.743176,Valid Documents,-1.45036,1.543152,1.495161,0.188526,-0.506473,HCM,1.799852,0.387465,-1.385353
3,10.5,-0.199827,Apartment,-1.549241,-1.611125,Valid Documents,-1.45036,1.543152,1.495161,0.188526,-0.506473,HCM,1.799852,0.387465,-1.385353
4,16.5,0.70792,Apartment,-0.924916,-0.743176,Valid Documents,-1.45036,1.543152,1.495161,0.188526,-0.506473,HCM,1.799852,0.387465,-1.385353


## **3.2. Encoding categorical data**

In [391]:
def numerical_convert(data: pd.DataFrame, categorical_encoder) -> any:
    """ Return the converted DataFrame and the transformation dictionary """
    converted_data = data.copy()
    object_cols = data.select_dtypes(include=["object"]).columns
    for col in object_cols:
        converted_data[col] = categorical_encoder.fit_transform(data[[col]])
    return converted_data

In [393]:
data = numerical_convert(data, OrdinalEncoder())

In [394]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28409 entries, 0 to 28408
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Price (billion VND)  28409 non-null  float64
 1   Area (m2)            28409 non-null  float64
 2   Property Type        28409 non-null  float64
 3   Bedrooms             28409 non-null  float64
 4   Bathrooms            28409 non-null  float64
 5   Law Document         28409 non-null  float64
 6   Latitude             28409 non-null  float64
 7   Longitude            28409 non-null  float64
 8   Postal Code          28409 non-null  float64
 9   Importance           28409 non-null  float64
 10  Place Rank           28409 non-null  float64
 11  City                 28409 non-null  float64
 12  Year                 28409 non-null  float64
 13  Month                28409 non-null  float64
 14  Date                 28409 non-null  float64
dtypes: float64(15)
memory usage: 3.3 MB


# **4. Model training**

In [395]:
import tensorflow as tf
tf.random.set_seed(42)
np.random.seed(42)

In [396]:
# Splitting data into training and test sets
X = data.drop("Price (billion VND)", axis = 1)
y = data["Price (billion VND)"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [491]:
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5)
early_stopping = EarlyStopping(monitor="loss", patience=10, restore_best_weights=True)

In [505]:
X_val.to_csv("ANN_features.csv")
y_val.to_csv("ANN_results.csv")

In [492]:
# Fine-tuning with KFold Cross-validation
def evaluate_with_kfold(model, X, y, n_splits=5) -> None:
    """
    Evaluate the model using KFold cross-validation and display performance metrics.

    Parameters:
        model: The regression model to evaluate.
        X: Features (numpy array or pandas DataFrame).
        y: Target values (numpy array or pandas Series).
        n_splits: Number of folds for KFold cross-validation.

    Returns:
        None
    """
    # Initialize KFold
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Metrics storage
    mae_list = []
    mse_list = []
    max_error_list = []
    r2_list = []

    # Cross-validation
    for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Train the model
        model.fit(X_train, y_train, 
                  epochs = 300,
                  batch_size=350, 
                  callbacks = [early_stopping, reduce_lr],
                  shuffle = False, 
                  verbose=1)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Compute metrics
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        me = max_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Store metrics
        mae_list.append(mae)
        mse_list.append(mse)
        max_error_list.append(me)
        r2_list.append(r2)

    # Print average metrics across all folds
    print("\nOverall Model Performance (Across Folds):")
    print(f"  Mean Absolute Error: {np.mean(mae_list):.2f} ± {np.std(mae_list):.2f}")
    print(f"  Mean Squared Error: {np.mean(mse_list):.2f} ± {np.std(mse_list):.2f}")
    print(f"  Max Error: {np.mean(max_error_list):.2f} ± {np.std(max_error_list):.2f}")
    print(f"  R2 Score: {np.mean(r2_list):.2f} ± {np.std(r2_list):.2f}")


## **Neural Network**

In [490]:
# Building model
nn_model = Sequential([
    Dense(1024, activation="relu"),
    BatchNormalization(),
    Dropout(0.5),
    Dense(512, activation="relu"),
    BatchNormalization(),
    Dropout(0.5),
    Dense(32, activation="relu"), 
    Dense(16, activation="relu"),
    Dense(1)
])

nn_model.compile(optimizer=Adam(learning_rate=0.001), loss='huber', metrics=['mae'])

In [416]:
evaluate_with_kfold(nn_model, X_train, y_train)

Epoch 1/300
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - loss: 2.9394 - mae: 3.3926 - learning_rate: 0.0010
Epoch 2/300
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 2.0276 - mae: 2.4519 - learning_rate: 0.0010
Epoch 3/300
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 1.9423 - mae: 2.3640 - learning_rate: 0.0010
Epoch 4/300
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 1.8846 - mae: 2.3047 - learning_rate: 0.0010
Epoch 5/300
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 1.8439 - mae: 2.2625 - learning_rate: 0.0010
Epoch 6/300
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 1.8325 - mae: 2.2494 - learning_rate: 0.0010
Epoch 7/300
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 1.8063 - mae: 2.2212 - learning_rate: 0.0010
Epoch 8/300
[1m52/52[0m 

In [417]:
# Test set evaluation
y_pred = nn_model.predict(X_val)

mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
me = max_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"Test Set Evaluation:")
print(f"Mean Absolute Error (MAE): {mae:.3f}")
print(f"Mean Squared Error (MSE): {mse:.3f}")
print(f"Max Error (MSE): {me:.3f}")
print(f"R² Score: {r2:.3f}")

[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Test Set Evaluation:
Mean Absolute Error (MAE): 2.047
Mean Squared Error (MSE): 11.245
Max Error (MSE): 18.039
R² Score: 0.458


In [418]:
nn_model.save("ANN.h5")

