## Training attention based model

In [1]:
%matplotlib inline

import sys
sys.path.append("../../")
import tensorflow as tf
from train_model import train

from sklearn.linear_model import LinearRegression
from yellowbrick.regressor import ResidualsPlot, PredictionError

import seaborn as sns
import matplotlib.pyplot as plt

from matplotlib import rcParams
rcParams['figure.figsize'] = (8, 4)
rcParams['figure.dpi'] = 100
rcParams['font.size'] = 8
rcParams['font.family'] = 'sans-serif'
rcParams['axes.facecolor'] = '#ffffff'
rcParams['lines.linewidth'] = 2.0

In [2]:
import os
file_path = r"D:\Final_file\ASI-main\output\models\BJ\asi_BJ_weights.hdf5"
from tensorflow.keras import backend as K
K.clear_session()

if os.path.exists(file_path):
    os.remove(file_path)
    print(f"Removed the file: {file_path}")
else:
    print(f"The file {file_path} does not exist")

Removed the file: D:\Final_file\ASI-main\output\models\BJ\asi_BJ_weights.hdf5


In [3]:
##BEST IT

hyperparameter={
"num_nearest":30,
"sigma":10,
"geointerpolation": 'asi_multi',
'type_compat_funct_eucli':'kernel_gaussiano',
'Num_heads':4,
"learning_rate":0.001,
"batch_size":32,
"num_neuron":60,
"num_layers":3,
"size_embedded":60,
"num_nearest_geo":15,
"num_nearest_eucli":15,
"id_dataset":'BJ',
"epochs":200,
"optimier":'adam',
"validation_split":0.1,
"label":'asi_BJ',
"early_stopping": False,
'scale_log':False,
"graph_label":'matrix',
}

In [4]:
from tensorflow.keras import backend as K
K.clear_session()

In [5]:
spatial = train(**hyperparameter)

In [None]:
dataset,\
result,\
fit,\
embedded_train,\
embedded_test,\
predict_regression_train,\
predict_regression_test = spatial()

In [7]:
print('################# Test ##########################')
print('MALE test:.... {}'.format(result[0]))
print('RMSE test:.... {}'.format(result[1]))
print('MAPE test:.... {}'.format(result[2]))
print('################# Train ##########################')
print('MALE train:.... {}'.format(result[3]))
print('RMSE train:.... {}'.format(result[4]))
print('MAPE train:.... {}'.format(result[5]))

################# Test ##########################
MALE test:.... 4751.664026242338
RMSE test:.... 7863.479323742264
MAPE test:.... 7.591164363198836
################# Train ##########################
MALE train:.... 4696.1917338089015
RMSE train:.... 7885.88575257439
MAPE train:.... 7.326200880747932


In [45]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

## Base models benchmarking 

In [54]:
X_train ,X_test, y_train , y_test = dataset.X_train,dataset.X_test,dataset.y_train,dataset.y_test

In [55]:


# Initialize lists to store test set evaluation metrics for each model
test_mae_list = {}
test_mse_list = {}
test_rmse_list = {}
test_r2_list = {}
test_mape_list = {}
test_male_list = {} 
# Define the models and their respective parameter grids
models = {
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {}
    },
    'KNN': {
        'model': KNeighborsRegressor(),
        'params': {'n_neighbors': [5, 10, 15, 20]}
    },
    'Decision Tree': {
        'model': DecisionTreeRegressor(),
        'params': {'max_depth': [5, 9, 12, 15]}
    },
    'Random Forest': {
        'model': RandomForestRegressor(),
        'params': {'n_estimators': [50, 100, 150], 'max_depth': [8, 12, 16]}
    },
    'SVM': {
        'model': SVR(),
        'params': {'C': [10, 100], 'gamma': ['scale', 'auto']}
    },
    'LightGBM': {
        'model': LGBMRegressor(),
        'params': {'n_estimators': [200,400,1000], 'learning_rate': [0.05]}
    },
    'CatBoost': {
        'model': CatBoostRegressor(verbose=0),
        'params': {'depth': [ 8, 10], 'learning_rate': [0.05],'n_estimators': [200,400,1000]}
    },
    'XGBoost': {
        'model': XGBRegressor( learning_rate=0.05, random_state=42),
        'params': {'max_depth': [5, 7, 9], 'learning_rate': [0.05],'n_estimators': [200,400,1000]}
    }
}

# Define the K-fold cross-validator
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# For each model
for name, model_info in models.items():
    test_mae_list[name] = []
    test_mse_list[name] = []
    test_rmse_list[name] = []
    test_r2_list[name] = []
    test_mape_list[name] = []
    test_male_list[name] = [] 

    for train_idx, val_idx in cv.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
        y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

        # Grid Search for hyperparameter tuning
        grid = GridSearchCV(estimator=model_info['model'], param_grid=model_info['params'], cv=3, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)
        grid_result = grid.fit(X_train_fold, y_train_fold)
        best_model = grid_result.best_estimator_

        # Make predictions on the test set
        y_pred_test = best_model.predict(X_test)

        # Evaluate the model on the test set
        test_mae = mean_absolute_error((y_test), (y_pred_test))
        test_mse = mean_squared_error((y_test), (y_pred_test))
        test_rmse = np.sqrt(test_mse)
        test_r2 = r2_score((y_test), (y_pred_test))
        test_mape = mean_absolute_percentage_error((y_test), (y_pred_test))
        test_male = mean_absolute_error(np.log1p(y_test), np.log1p(y_pred_test)) 

        # Store the test set metrics in the lists
        test_mae_list[name].append(test_mae)
        test_mse_list[name].append(test_mse)
        test_rmse_list[name].append(test_rmse)
        test_r2_list[name].append(test_r2)
        test_mape_list[name].append(test_mape)
        test_male_list[name].append(test_male)

    # Calculate the average and best metrics for the test set
    avg_test_mae = np.mean(test_mae_list[name])
    avg_test_mse = np.mean(test_mse_list[name])
    avg_test_rmse = np.mean(test_rmse_list[name])
    avg_test_r2 = np.mean(test_r2_list[name])
    avg_test_mape = np.mean(test_mape_list[name])
    avg_test_male = np.mean(test_male_list[name])

    best_test_mae = np.min(test_mae_list[name])
    best_test_mse = np.min(test_mse_list[name])
    best_test_rmse = np.min(test_rmse_list[name])
    best_test_r2 = np.max(test_r2_list[name])
    best_test_mape = np.min(test_mape_list[name])
    best_test_male = np.min(test_male_list[name])

    print(f"Test Set Evaluation for {name}")
    print(f"Average Test MAE: {avg_test_mae}, Best Test MAE: {best_test_mae}")
    print(f"Average Test MSE: {avg_test_mse}, Best Test MSE: {best_test_mse}")
    print(f"Average Test RMSE: {avg_test_rmse}, Best Test RMSE: {best_test_rmse}")
    print(f"Average Test R2: {avg_test_r2}, Best Test R2: {best_test_r2}")
    print(f"Average Test MAPE: {avg_test_mape}, Best Test MAPE: {best_test_mape}")
    print(f"Average Test MALE: {avg_test_male}, Best Test MALE: {best_test_male}")
    print("\n")


Test Set Evaluation for Linear Regression
Average Test MAE: 15516.859641943287, Best Test MAE: 15510.25690570184
Average Test MSE: 422501728.5886174, Best Test MSE: 422159152.18374324
Average Test RMSE: 20554.846351967375, Best Test RMSE: 20546.51192255618
Average Test R2: 0.3761539363926765, Best Test R2: 0.37665976850460514
Average Test MAPE: 25.547598887443872, Best Test MAPE: 25.521808043681737
Average Test MALE: 0.23909405064858286, Best Test MALE: 0.23894196734972284


Test Set Evaluation for KNN
Average Test MAE: 7310.020479859896, Best Test MAE: 7269.561015761821
Average Test MSE: 140610870.6633156, Best Test MSE: 137777323.13610506
Average Test RMSE: 11857.808890472083, Best Test RMSE: 11737.85854132282
Average Test R2: 0.7923806407686466, Best Test R2: 0.7965645229903824
Average Test MAPE: 11.294469950766327, Best Test MAPE: 11.224936959737635
Average Test MALE: 0.10869885670875852, Best Test MALE: 0.1081307005625147


Test Set Evaluation for Decision Tree
Average Test MAE: 1

## Testing the embeddings

In [42]:
X_train ,X_test, y_train , y_test = embedded_train,embedded_test,dataset.y_train,dataset.y_test


In [43]:
from sklearn.preprocessing import StandardScaler

# Original feature names

# Initialize the Standard Scaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [44]:


# Initialize lists to store test set evaluation metrics for each model
test_mae_list = {}
test_mse_list = {}
test_rmse_list = {}
test_r2_list = {}
test_mape_list = {}
test_male_list = {} 
# Define the models and their respective parameter grids
models = {
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {}
    },
    'KNN': {
        'model': KNeighborsRegressor(),
        'params': {'n_neighbors': [5, 10, 15, 40]}
    },
    'Decision Tree': {
        'model': DecisionTreeRegressor(),
        'params': {'max_depth': [5, 9, 12, 40]}
    },
    'Random Forest': {
        'model': RandomForestRegressor(),
        'params': {'n_estimators': [50, 100, 150], 'max_depth': [8, 12, 16]}
    },
    'SVM': {
        'model': SVR(),
        'params': {'C': [10, 100], 'gamma': ['scale', 'auto']}
    },
    'LightGBM': {
        'model': LGBMRegressor(),
        'params': {'n_estimators': [100,200,400], 'learning_rate': [0.05, 0.1]}
    },
    'CatBoost': {
        'model': CatBoostRegressor(verbose=0),
        'params': {'depth': [6, 8, 10], 'learning_rate': [0.01, 0.05],'n_estimators': [100,200,400,800]}
    },
    'XGBoost': {
        'model': XGBRegressor( learning_rate=0.05, random_state=42),
        'params': {'max_depth': [5, 7, 9], 'learning_rate': [0.01, 0.05, 0.1],'n_estimators': [100,200,400,800]}
    }
}

# Define the K-fold cross-validator
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# For each model
for name, model_info in models.items():
    test_mae_list[name] = []
    test_mse_list[name] = []
    test_rmse_list[name] = []
    test_r2_list[name] = []
    test_mape_list[name] = []
    test_male_list[name] = [] 

    for train_idx, val_idx in cv.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
        y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

        # Grid Search for hyperparameter tuning
        grid = GridSearchCV(estimator=model_info['model'], param_grid=model_info['params'], cv=3, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)
        grid_result = grid.fit(X_train_fold, y_train_fold)
        best_model = grid_result.best_estimator_

        # Make predictions on the test set
        y_pred_test = best_model.predict(X_test)

        # Evaluate the model on the test set
        test_mae = mean_absolute_error((y_test), (y_pred_test))
        test_mse = mean_squared_error((y_test), (y_pred_test))
        test_rmse = np.sqrt(test_mse)
        test_r2 = r2_score((y_test), (y_pred_test))
        test_mape = mean_absolute_percentage_error((y_test), (y_pred_test))
        test_male = mean_absolute_error(np.log1p(y_test), np.log1p(y_pred_test)) 

        # Store the test set metrics in the lists
        test_mae_list[name].append(test_mae)
        test_mse_list[name].append(test_mse)
        test_rmse_list[name].append(test_rmse)
        test_r2_list[name].append(test_r2)
        test_mape_list[name].append(test_mape)
        test_male_list[name].append(test_male)

    # Calculate the average and best metrics for the test set
    avg_test_mae = np.mean(test_mae_list[name])
    avg_test_mse = np.mean(test_mse_list[name])
    avg_test_rmse = np.mean(test_rmse_list[name])
    avg_test_r2 = np.mean(test_r2_list[name])
    avg_test_mape = np.mean(test_mape_list[name])
    avg_test_male = np.mean(test_male_list[name])

    best_test_mae = np.min(test_mae_list[name])
    best_test_mse = np.min(test_mse_list[name])
    best_test_rmse = np.min(test_rmse_list[name])
    best_test_r2 = np.max(test_r2_list[name])
    best_test_mape = np.min(test_mape_list[name])
    best_test_male = np.min(test_male_list[name])

    print(f"Test Set Evaluation for {name}")
    print(f"Average Test MAE: {avg_test_mae}, Best Test MAE: {best_test_mae}")
    print(f"Average Test MSE: {avg_test_mse}, Best Test MSE: {best_test_mse}")
    print(f"Average Test RMSE: {avg_test_rmse}, Best Test RMSE: {best_test_rmse}")
    print(f"Average Test R2: {avg_test_r2}, Best Test R2: {best_test_r2}")
    print(f"Average Test MAPE: {avg_test_mape}, Best Test MAPE: {best_test_mape}")
    print(f"Average Test MALE: {avg_test_male}, Best Test MALE: {best_test_male}")
    print("\n")


Test Set Evaluation for Linear Regression
Average Test MAE: 4772.366677019483, Best Test MAE: 4746.529726494089
Average Test MSE: 61630708.495675005, Best Test MSE: 61285168.143553704
Average Test RMSE: 7850.502425308563, Best Test RMSE: 7828.4844091020395
Average Test R2: 0.9089990116234684, Best Test R2: 0.909509220159676
Average Test MAPE: 7.624320321845824, Best Test MAPE: 7.568865503848032
Average Test MALE: 0.07395975804263254, Best Test MALE: 0.07347002106812871


Test Set Evaluation for KNN
Average Test MAE: 4880.169730735552, Best Test MAE: 4872.636449211909
Average Test MSE: 62981984.65811026, Best Test MSE: 62799840.62786263
Average Test RMSE: 7936.114912881063, Best Test RMSE: 7924.635046982456
Average Test R2: 0.9070037811717547, Best Test R2: 0.907272726429468
Average Test MAPE: 7.776341090362996, Best Test MAPE: 7.7635892042766
Average Test MALE: 0.07536436584161497, Best Test MALE: 0.07525315914303446


Test Set Evaluation for Decision Tree
Average Test MAE: 4893.081338