## Training the attention-based model

In [1]:
%matplotlib inline

import sys
sys.path.append("../../")

from train_model import train

from sklearn.linear_model import LinearRegression
from yellowbrick.regressor import ResidualsPlot, PredictionError

import seaborn as sns
import matplotlib.pyplot as plt

from matplotlib import rcParams
# Notebook-wide matplotlib defaults, applied in a single call
# (same keys, values and order as individual assignments).
rcParams.update({
    'figure.figsize': (8, 4),
    'figure.dpi': 100,
    'font.size': 8,
    'font.family': 'sans-serif',
    'axes.facecolor': '#ffffff',
    'lines.linewidth': 2.0,
})

In [2]:
import os

from tensorflow.keras import backend as K

# File that this cell deletes; judging by its name it holds model weights
# saved by an earlier run.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
file_path = r"D:\Final_file\ASI-main\output\models\IT\asi_IT_weights.hdf5"

# Clear the Keras session before the file is removed (same order as before).
K.clear_session()

# Attempt the removal directly instead of testing os.path.exists() first:
# that leaves no window in which the file can disappear between the check
# and the delete, and a missing file is still reported rather than raised.
try:
    os.remove(file_path)
except FileNotFoundError:
    print(f"The file {file_path} does not exist")
else:
    print(f"Removed the file: {file_path}")

Removed the file: D:\Final_file\ASI-main\output\models\IT\asi_IT_weights.hdf5


In [3]:
# Best configuration for the IT dataset.
# The mapping is expanded into keyword arguments for train() in the next cell;
# insertion order is the same as in the literal it replaces.
hyperparameter = dict(
    num_nearest=40,
    sigma=2,
    geointerpolation='asi_multi',
    type_compat_funct_eucli='identity',
    Num_heads=8,
    learning_rate=0.001,
    batch_size=32,
    num_neuron=60,
    num_layers=5,
    size_embedded=50,
    num_nearest_geo=30,
    num_nearest_eucli=25,
    id_dataset='IT',
    epochs=300,
    # Spelling kept as-is: presumably the keyword train() expects - verify before renaming.
    optimier='adam',
    validation_split=0.1,
    label='asi_IT',
    early_stopping=False,
    scale_log=False,
    graph_label='matrix',
)

In [4]:
# Expand the settings dict into keyword arguments for train(); the object it
# returns is called (spatial()) in the next cell to obtain the results.
spatial = train(**hyperparameter)

In [None]:
# Call the object built in the previous cell and unpack its seven return values.
(
    dataset,
    result,
    fit,
    embedded_train,
    embedded_test,
    predict_regression_train,
    predict_regression_test,
) = spatial()

In [6]:
# asi
# Report the six values held in `result`: indices 0-2 are printed as test
# scores, indices 3-5 as train scores.
# NOTE(review): the recorded output shows the same MALE value for test and
# train - confirm that result[3] really is the train MALE.
print('################# Test ##########################')
print(f'MALE test:.... {result[0]}')
print(f'RMSE test:.... {result[1]}')
print(f'MAPE test:.... {result[2]}')
print('################# Train ##########################')
print(f'MALE train:.... {result[3]}')
print(f'RMSE train:.... {result[4]}')
print(f'MAPE train:.... {result[5]}')

################# Test ##########################
MALE test:.... 0.13318920943380927
RMSE test:.... 46473.58619153165
MAPE test:.... 13.980159078004906
################# Train ##########################
MALE train:.... 0.13318920943380927
RMSE train:.... 44279.78419996523
MAPE train:.... 12.575263850756183


In [6]:
# asi_multi
# Report the six values held in `result`: indices 0-2 are printed as test
# scores, indices 3-5 as train scores.
# NOTE(review): the recorded output shows the same MALE value for test and
# train - confirm that result[3] really is the train MALE.
print('################# Test ##########################')
print(f'MALE test:.... {result[0]}')
print(f'RMSE test:.... {result[1]}')
print(f'MAPE test:.... {result[2]}')
print('################# Train ##########################')
print(f'MALE train:.... {result[3]}')
print(f'RMSE train:.... {result[4]}')
print(f'MAPE train:.... {result[5]}')

################# Test ##########################
MALE test:.... 0.13128807562459377
RMSE test:.... 45797.649009432156
MAPE test:.... 13.779126627211536
################# Train ##########################
MALE train:.... 0.13128807562459377
RMSE train:.... 44220.15179854315
MAPE train:.... 12.734757197189131


In [7]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor


In [28]:
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
path = r'D:\Final_file\ASI-main\datasets\IT\data.npz'
# The loaded object is indexed by array name ('X_train', 'X_test', ...) in the next cell.
data = np.load(path)

In [29]:
# Pull the four train/test arrays out of the loaded archive, reading the keys
# in the same order as before.
X_train, X_test, y_train, y_test = (
    data[key] for key in ('X_train', 'X_test', 'y_train', 'y_test')
)

In [30]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
# Fit the min-max scaler on the training features only, then apply that same
# fitted scaler to the test features.
# NOTE: X_train / X_test are rebound to their scaled versions here.
scaler = MinMaxScaler()
X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)

In [8]:

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Computes ``mean(|(y_true - y_pred) / y_true|) * 100``.

    Parameters
    ----------
    y_true : array-like
        Observed values. They are used as the divisor, so a zero entry yields
        an infinite or NaN term rather than an exception.
    y_pred : array-like
        Predicted values, compared element-wise with ``y_true``.

    Returns
    -------
    float
        The error as a percentage (e.g. ``10.0`` means 10 %).
    """
    # Coerce to arrays so plain lists and tuples work as well; inputs that are
    # already ndarrays pass through unchanged.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


def mean_absolute_log_error(y_true, y_pred):
    """Mean absolute error between the shifted logarithms of targets and predictions.

    Each input is clipped from below at 1e-10, shifted by +1 and passed through
    the natural logarithm; the two transformed inputs are then compared with
    ``mean_absolute_error``.
    """
    lower_bound = 1e-10

    def _shifted_log(values):
        # log(max(v, 1e-10) + 1): the clip keeps the argument of the log above 1.
        return np.log(np.maximum(values, lower_bound) + 1)

    return mean_absolute_error(_shifted_log(y_true), _shifted_log(y_pred))


## Benchmarking the baseline models

In [34]:



# Benchmark classical regressors on the features currently held in
# X_train / X_test.
#
# For every model, each outer KFold split supplies a different training subset;
# a 3-fold GridSearchCV tunes the hyperparameters on that subset and the refit
# winner is scored on the fixed test set (X_test, y_test).
# NOTE(review): "Best Test" is the best of the ten scores measured on the test
# set itself, so it is an optimistic figure; the averages are the safer number.

# Test-set metrics per model: {model name: [one value per outer fold]}
test_mae_list = {}
test_mse_list = {}
test_rmse_list = {}
test_r2_list = {}
test_mape_list = {}
test_male_list = {}

# Metric label -> its per-model store, in the order the report is printed.
metric_stores = {
    'MAE': test_mae_list,
    'MSE': test_mse_list,
    'RMSE': test_rmse_list,
    'R2': test_r2_list,
    'MAPE': test_mape_list,
    'MALE': test_male_list,
}

# Candidate models and their search grids. Every estimator that takes a
# random_state is seeded, matching the seeded KFold and XGBoost settings, so
# that a re-run uses the same seeds throughout.
models = {
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {}
    },
    'KNN': {
        'model': KNeighborsRegressor(),
        'params': {'n_neighbors': [5, 10, 15, 20]}
    },
    'Decision Tree': {
        'model': DecisionTreeRegressor(random_state=42),
        'params': {'max_depth': [5, 9, 12, 15]}
    },
    'Random Forest': {
        'model': RandomForestRegressor(random_state=42),
        'params': {'n_estimators': [50, 100, 150], 'max_depth': [8, 12, 16]}
    },
    'SVM': {
        'model': SVR(),
        'params': {'C': [10, 100], 'gamma': ['scale', 'auto']}
    },
    'LightGBM': {
        'model': LGBMRegressor(random_state=42),
        'params': {'n_estimators': [200, 400, 1000], 'learning_rate': [0.05]}
    },
    'CatBoost': {
        'model': CatBoostRegressor(verbose=0, random_state=42),
        'params': {'depth': [8, 10], 'learning_rate': [0.05], 'n_estimators': [200, 400, 1000]}
    },
    'XGBoost': {
        'model': XGBRegressor(learning_rate=0.05, random_state=42),
        'params': {'max_depth': [5, 7, 9], 'learning_rate': [0.05], 'n_estimators': [200, 400, 1000]}
    }
}

# Outer splitter: only used to draw ten different training subsets.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

for name, model_info in models.items():
    for store in metric_stores.values():
        store[name] = []

    # The held-out indices of each split are never scored, so the validation
    # slices are not materialised.
    for train_idx, val_idx in cv.split(X_train, y_train):
        X_train_fold = X_train[train_idx]
        y_train_fold = y_train[train_idx]

        # Inner grid search for hyperparameter tuning on this training subset.
        grid = GridSearchCV(
            estimator=model_info['model'],
            param_grid=model_info['params'],
            cv=3,
            scoring='neg_mean_squared_error',
            verbose=0,
            n_jobs=-1,
        )
        grid_result = grid.fit(X_train_fold, y_train_fold)
        best_model = grid_result.best_estimator_

        # Score the tuned model on the fixed test set.
        y_pred_test = best_model.predict(X_test)
        test_mse = mean_squared_error(y_test, y_pred_test)
        fold_scores = {
            'MAE': mean_absolute_error(y_test, y_pred_test),
            'MSE': test_mse,
            'RMSE': np.sqrt(test_mse),
            'R2': r2_score(y_test, y_pred_test),
            'MAPE': mean_absolute_percentage_error(y_test, y_pred_test),
            'MALE': mean_absolute_log_error(y_test, y_pred_test),
        }
        for metric, value in fold_scores.items():
            metric_stores[metric][name].append(value)

    # Per-model report: mean over the ten folds and the single best fold.
    print(f"Test Set Evaluation for {name}")
    for metric, store in metric_stores.items():
        scores = store[name]
        # R2 is the only metric here where larger is better.
        best = np.max(scores) if metric == 'R2' else np.min(scores)
        print(f"Average Test {metric}: {np.mean(scores)}, Best Test {metric}: {best}")
    print("\n")



Test Set Evaluation for Linear Regression
Average Test MAE: 58093.08058097004, Best Test MAE: 58064.76742870805
Average Test MSE: 5812678571.317408, Best Test MSE: 5810110355.881016
Average Test RMSE: 76240.92357961097, Best Test RMSE: 76224.07989527336
Average Test R2: 0.7422545297611219, Best Test R2: 0.7423684094273045
Average Test MAPE: 29.400632407979543, Best Test MAPE: 29.352031314845984
Average Test MALE: 0.3886035663677529, Best Test MALE: 0.38527007344245323


Test Set Evaluation for KNN
Average Test MAE: 61266.84217335058, Best Test MAE: 60793.735640362225
Average Test MSE: 7252818072.698311, Best Test MSE: 7163435942.79412
Average Test RMSE: 85163.21845646147, Best Test RMSE: 84637.08373280663
Average Test R2: 0.6783959440095145, Best Test R2: 0.6823593214473773
Average Test MAPE: 27.96700025082716, Best Test MAPE: 27.73906339881466
Average Test MALE: 0.24898910298269414, Best Test MALE: 0.24736744501670985


Test Set Evaluation for Decision Tree
Average Test MAE: 50538.047

## Testing the embeddings

In [9]:
# Point the benchmark inputs at the embeddings returned by spatial() and at the
# targets stored on `dataset`.
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing whatever
# those names held before this cell ran.
X_train, X_test = embedded_train, embedded_test
y_train, y_test = dataset.y_train, dataset.y_test

In [10]:


# Benchmark classical regressors on the features currently held in
# X_train / X_test (the embeddings assigned in the previous cell).
#
# For every model, each outer KFold split supplies a different training subset;
# a 3-fold GridSearchCV tunes the hyperparameters on that subset and the refit
# winner is scored on the fixed test set (X_test, y_test).
# NOTE(review): "Best Test" is the best of the ten scores measured on the test
# set itself, so it is an optimistic figure; the averages are the safer number.

# Test-set metrics per model: {model name: [one value per outer fold]}
test_mae_list = {}
test_mse_list = {}
test_rmse_list = {}
test_r2_list = {}
test_mape_list = {}
test_male_list = {}

# Metric label -> its per-model store, in the order the report is printed.
metric_stores = {
    'MAE': test_mae_list,
    'MSE': test_mse_list,
    'RMSE': test_rmse_list,
    'R2': test_r2_list,
    'MAPE': test_mape_list,
    'MALE': test_male_list,
}

# Candidate models and their search grids. Every estimator that takes a
# random_state is seeded, matching the seeded KFold and XGBoost settings, so
# that a re-run uses the same seeds throughout.
models = {
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {}
    },
    'KNN': {
        'model': KNeighborsRegressor(),
        'params': {'n_neighbors': [5, 10, 15, 20]}
    },
    'Decision Tree': {
        'model': DecisionTreeRegressor(random_state=42),
        'params': {'max_depth': [5, 9, 12, 15]}
    },
    'Random Forest': {
        'model': RandomForestRegressor(random_state=42),
        'params': {'n_estimators': [50, 100, 150], 'max_depth': [8, 12, 16]}
    },
    'SVM': {
        'model': SVR(),
        'params': {'C': [10, 100], 'gamma': ['scale', 'auto']}
    },
    'LightGBM': {
        'model': LGBMRegressor(random_state=42),
        'params': {'n_estimators': [200, 400, 1000], 'learning_rate': [0.05]}
    },
    'CatBoost': {
        'model': CatBoostRegressor(verbose=0, random_state=42),
        'params': {'depth': [8, 10], 'learning_rate': [0.05], 'n_estimators': [200, 400, 1000]}
    },
    'XGBoost': {
        'model': XGBRegressor(learning_rate=0.05, random_state=42),
        'params': {'max_depth': [5, 7, 9], 'learning_rate': [0.05], 'n_estimators': [200, 400, 1000]}
    }
}

# Outer splitter: only used to draw ten different training subsets.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

for name, model_info in models.items():
    for store in metric_stores.values():
        store[name] = []

    # The held-out indices of each split are never scored, so the validation
    # slices are not materialised.
    for train_idx, val_idx in cv.split(X_train, y_train):
        X_train_fold = X_train[train_idx]
        y_train_fold = y_train[train_idx]

        # Inner grid search for hyperparameter tuning on this training subset.
        grid = GridSearchCV(
            estimator=model_info['model'],
            param_grid=model_info['params'],
            cv=3,
            scoring='neg_mean_squared_error',
            verbose=0,
            n_jobs=-1,
        )
        grid_result = grid.fit(X_train_fold, y_train_fold)
        best_model = grid_result.best_estimator_

        # Score the tuned model on the fixed test set.
        y_pred_test = best_model.predict(X_test)
        test_mse = mean_squared_error(y_test, y_pred_test)
        fold_scores = {
            'MAE': mean_absolute_error(y_test, y_pred_test),
            'MSE': test_mse,
            'RMSE': np.sqrt(test_mse),
            'R2': r2_score(y_test, y_pred_test),
            'MAPE': mean_absolute_percentage_error(y_test, y_pred_test),
            # Use the notebook's mean_absolute_log_error helper (as the
            # baseline benchmark does) rather than a bare np.log1p: the helper
            # clips at 1e-10 before taking the log, so a prediction at or
            # below -1 cannot turn the metric into NaN/-inf.
            'MALE': mean_absolute_log_error(y_test, y_pred_test),
        }
        for metric, value in fold_scores.items():
            metric_stores[metric][name].append(value)

    # Per-model report: mean over the ten folds and the single best fold.
    print(f"Test Set Evaluation for {name}")
    for metric, store in metric_stores.items():
        scores = store[name]
        # R2 is the only metric here where larger is better.
        best = np.max(scores) if metric == 'R2' else np.min(scores)
        print(f"Average Test {metric}: {np.mean(scores)}, Best Test {metric}: {best}")
    print("\n")


Test Set Evaluation for Linear Regression
Average Test MAE: 31180.073168155726, Best Test MAE: 31158.668807608345
Average Test MSE: 2103890502.852539, Best Test MSE: 2101100031.6004324
Average Test RMSE: 45868.18264631477, Best Test RMSE: 45837.757706943215
Average Test R2: 0.9067094042212048, Best Test R2: 0.9068331391424185
Average Test MAPE: 13.980914104332777, Best Test MAPE: 13.944543179821991
Average Test MALE: 0.13189330401560767, Best Test MALE: 0.13176323722085304


Test Set Evaluation for KNN
Average Test MAE: 32219.40468709573, Best Test MAE: 32141.967795924968
Average Test MSE: 2186569268.0298276, Best Test MSE: 2176044445.331173
Average Test RMSE: 46760.704931323606, Best Test RMSE: 46648.091550793084
Average Test R2: 0.9030432670096026, Best Test R2: 0.9035099581129145
Average Test MAPE: 14.350143294479162, Best Test MAPE: 14.324406347290578
Average Test MALE: 0.13544054767885055, Best Test MALE: 0.13520619193407202


Test Set Evaluation for Decision Tree
Average Test MAE