In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
import os
from typing import Union, Tuple

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [37]:
# Load the EMEWS table (path is relative to the notebook's working directory)
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/EMEWS_cleaned_with_nan.csv")
df.head(n=5)

Unnamed: 0,day,date,total_number_of_patients,total_number_of_emews,zone_a_mwr_patients,zone_a_mwr_cat_3,zone_a_mwr_cat_4,zone_a_mwr_sets_of_emews,zone_a_mwr_deescalations,zone_a_mwr_escalations,...,zone_a__cat_3,zone_a__sets_of_emews,zone_a__deescalations,zone_a__escalations,zone_b/c_patients,zone_b/c_cat_2,zone_b/c_cat_3,zone_b/c_sets_of_emews,zone_b/c_deescalations,zone_b/c_escalations
0,thursday,2024-01-04 00:00:00,100.0,302.0,64.0,53.0,11.0,192.0,10.0,6.0,...,9.0,50.0,9.0,1.0,15.0,,,60.0,1.0,5.0
1,friday,2024-01-05 00:00:00,112.0,220.0,59.0,59.0,0.0,90.0,10.0,0.0,...,12.0,60.0,10.0,0.0,27.0,12.0,15.0,70.0,0.0,5.0
2,saturday,2024-01-06 00:00:00,69.0,175.0,36.0,36.0,0.0,58.0,3.0,2.0,...,0.0,67.0,5.0,2.0,15.0,,,50.0,0.0,5.0
3,sunday,2024-01-07 00:00:00,74.0,206.0,30.0,27.0,3.0,41.0,2.0,11.0,...,6.0,61.0,7.0,3.0,16.0,,,50.0,0.0,5.0
4,monday,2024-01-08 00:00:00,87.0,267.0,41.0,38.0,3.0,116.0,10.0,1.0,...,5.0,73.0,8.0,1.0,23.0,10.0,13.0,78.0,0.0,4.0


In [38]:
# Make sure the frame has a row for 2024-01-17 so that the imputers used later
# can fill it in (presumably this day is absent from the CSV -- TODO confirm).
date_req = pd.to_datetime('2024-01-17')
df['date'] = pd.to_datetime(df['date'])

# Placeholder row: every column is NaN except the two calendar fields.
new_row = pd.DataFrame([{col: np.nan for col in df.columns}])
new_row['date'] = date_req
new_row['day'] = date_req.day_name().lower()

# Insert the placeholder only when the date is not present yet. Without this
# guard every re-run of the cell stacks one more all-NaN row for the same day.
if not df['date'].eq(date_req).any():
    df = pd.concat([new_row, df], ignore_index=True)

# Restore chronological order and a clean 0..n-1 index; the later 90/10 split
# is positional and relies on this ordering.
df = df.sort_values('date').reset_index(drop=True)

In [39]:
# Positional hold-out: the first 90 % of the rows become the training set and
# the remaining rows the test set (no shuffling).
target_cols = ['total_number_of_patients', 'total_number_of_emews']

split_idx = int(0.9 * len(df))
train_data, test_data = df.iloc[:split_idx], df.iloc[split_idx:]

# Features are every column except the two totals; the totals are the targets.
X_train, y_train = train_data.drop(columns=target_cols), train_data[target_cols]
X_test, y_test = test_data.drop(columns=target_cols), test_data[target_cols]

In [40]:
def apply_imputation(
        X_train: pd.DataFrame, 
        X_test: pd.DataFrame,
        imputer: Union[KNNImputer, IterativeImputer, SimpleImputer]
        ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Impute missing values in the numeric columns of a train/test pair.

    The imputer is fitted on the numeric columns of ``X_train`` only and then
    applied to the same columns of ``X_test``. For ``KNNImputer`` and
    ``IterativeImputer`` (including subclasses) the numeric data is
    standardised with a ``StandardScaler`` fitted on the training data before
    imputation and mapped back to the original scale afterwards. Imputed
    values are rounded with ``np.round`` and cast to ``int``. Non-numeric
    columns are returned unchanged.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features; its numeric dtypes decide which columns are imputed.
    X_test : pd.DataFrame
        Test features; must contain the numeric columns of ``X_train``.
    imputer : KNNImputer | IterativeImputer | SimpleImputer
        Unfitted (or re-fittable) imputer. It is fitted in place.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        Copies of ``X_train`` and ``X_test`` with the numeric columns replaced
        by the imputed integer values. The inputs are not modified.

    Raises
    ------
    ValueError
        If the imputer returns a different number of columns than it was
        given (e.g. a column that is entirely NaN in the training data was
        dropped), because the result could no longer be mapped back to the
        column names.
    """
    num_cols = X_train.select_dtypes(include='number').columns

    X_train_num = X_train[num_cols]
    X_test_num = X_test[num_cols]

    # isinstance (not an exact type match) so subclasses of these imputers are
    # scaled as well.
    needs_scaling = isinstance(imputer, (KNNImputer, IterativeImputer))

    if needs_scaling:
        # Presumably so that large-magnitude columns do not dominate the
        # neighbour / regression based imputation -- TODO confirm intent.
        scaler = StandardScaler()
        X_train_num = scaler.fit_transform(X_train_num)
        X_test_num = scaler.transform(X_test_num)

    X_train_imputed = imputer.fit_transform(X_train_num)
    X_test_imputed = imputer.transform(X_test_num)

    # Fail early with a readable message instead of an opaque shape error from
    # the scaler or from the pandas column assignment below.
    if X_train_imputed.shape[1] != len(num_cols):
        raise ValueError(
            f"Imputer returned {X_train_imputed.shape[1]} columns for "
            f"{len(num_cols)} numeric input columns; a column that is entirely "
            "NaN in the training data may have been dropped."
        )

    if needs_scaling:
        X_train_imputed = scaler.inverse_transform(X_train_imputed)
        X_test_imputed = scaler.inverse_transform(X_test_imputed)

    # The numeric columns are treated as whole-number counts.
    X_train_imputed = np.round(X_train_imputed).astype(int)
    X_test_imputed = np.round(X_test_imputed).astype(int)

    # Replace only the numeric columns; the copies already carry the
    # non-numeric columns of the inputs.
    X_train_copy = X_train.copy()
    X_train_copy[num_cols] = X_train_imputed

    X_test_copy = X_test.copy()
    X_test_copy[num_cols] = X_test_imputed

    return X_train_copy, X_test_copy

In [41]:
# Registry of every imputer to compare, keyed by a short label.
# Insertion order matters: it fixes the row order of the result tables.

# 1) Univariate fills.
imputation_strategies = {
    label: SimpleImputer(strategy=strategy)
    for label, strategy in (('mean', 'mean'), ('median', 'median'), ('mode', 'most_frequent'))
}

# 2) Iterative ("MICE"-labelled) imputers; None selects IterativeImputer's
#    default estimator.
mice_estimators = {
    'mice': None,
    'mice_hgb': HistGradientBoostingRegressor(random_state=42),
    'mice_lr': LinearRegression(),
    'mice_rf': RandomForestRegressor(random_state=42),
    'mice_svr': SVR(),
}
for label, estimator in mice_estimators.items():
    imputation_strategies[label] = IterativeImputer(estimator=estimator, random_state=42)

# 3) Neighbour-based variants for k = 3, 4, 5: plain KNN imputation and
#    iterative imputation driven by a KNN regressor, each with uniform and
#    distance weighting.
for i in (3, 4, 5):
    imputation_strategies.update({
        f'knn_{i}': KNNImputer(n_neighbors=i),
        f'knn_{i}_distance': KNNImputer(n_neighbors=i, weights='distance'),
        f'mice_knn_{i}': IterativeImputer(
            estimator=KNeighborsRegressor(n_neighbors=i), random_state=42),
        f'mice_knn_{i}_distance': IterativeImputer(
            estimator=KNeighborsRegressor(n_neighbors=i, weights='distance'), random_state=42),
    })
    

# Per-zone columns whose row-wise sum is used to back-fill a missing total.
patient_count_cols = ['zone_a_mwr_patients', 'zone_a__patients', 'zone_b/c_patients']
emews_count_cols = ['zone_a_mwr_sets_of_emews', 'zone_a__sets_of_emews', 'zone_b/c_sets_of_emews']
total_from_parts = {
    'total_number_of_patients': patient_count_cols,
    'total_number_of_emews': emews_count_cols,
}

# One (train, test) pair per imputation strategy, keyed by the strategy label.
datasets = {}

for name, imputer in imputation_strategies.items():
    X_train_imp, X_test_imp = apply_imputation(X_train, X_test, imputer)

    # Re-attach the (un-imputed) targets to the imputed features.
    train_df = pd.concat([X_train_imp, y_train], axis=1)
    test_df = pd.concat([X_test_imp, y_test], axis=1)

    for frame in (train_df, test_df):
        frame['date'] = pd.to_datetime(frame['date'])
        # Where a total is missing, replace it with the sum of the imputed
        # per-zone counts; totals that are present are left as they are.
        for total_col, part_cols in total_from_parts.items():
            frame[total_col] = frame[total_col].fillna(frame[part_cols].sum(axis=1))

    datasets[name] = (train_df, test_df)

In [42]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

def r2_rounded(y_true, y_pred):
    """Return the R2 score of the predictions after ``np.round`` has turned
    them into whole numbers (halves go to the nearest even value)."""
    whole_preds = np.round(y_pred)
    return r2_score(y_true, whole_preds)

def rmse_rounded(y_true, y_pred):
    """Return the root-mean-squared error of the predictions after they have
    been rounded to whole numbers with ``np.round``."""
    mse = mean_squared_error(y_true, np.round(y_pred))
    return np.sqrt(mse)

def r2_ceil(y_true, y_pred):
    """Return the R2 score of the predictions after each one has been rounded
    up to the next whole number with ``np.ceil``."""
    ceiled_preds = np.ceil(y_pred)
    return r2_score(y_true, ceiled_preds)

def rmse_ceil(y_true, y_pred):
    """Return the root-mean-squared error of the predictions after each one
    has been rounded up to the next whole number with ``np.ceil``."""
    mse = mean_squared_error(y_true, np.ceil(y_pred))
    return np.sqrt(mse)

def mae_rounded(y_true, y_pred):
    """Return the mean absolute error of the predictions after they have been
    rounded to whole numbers with ``np.round``."""
    whole_preds = np.round(y_pred)
    return mean_absolute_error(y_true, whole_preds)

def mae_ceil(y_true, y_pred):
    """Return the mean absolute error of the predictions after each one has
    been rounded up to the next whole number with ``np.ceil``."""
    ceiled_preds = np.ceil(y_pred)
    return mean_absolute_error(y_true, ceiled_preds)

In [43]:
# NOTE(review): wildcard import kept because cells outside this view may rely on
# other PyCaret names; prefer an explicit import list once that is confirmed.
from pycaret.regression import *
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit


def evaluate_imputed_datasets(datasets, target, exclude=('lightgbm', 'par', 'dummy', 'lar')):
    """Run a PyCaret model comparison on every imputed dataset for one target.

    For each ``(train_df, test_df)`` pair a PyCaret experiment is set up with
    ``test_df`` as the predefined hold-out and time-series folds, the models
    are compared by R2, and the top model is scored on ``test_df`` with raw,
    rounded and ceiled predictions.

    Parameters
    ----------
    datasets : dict
        Maps an imputation label to a ``(train_df, test_df)`` tuple.
    target : str
        Name of the target column present in both frames.
    exclude : sequence of str, optional
        PyCaret model ids left out of the comparison.

    Returns
    -------
    pd.DataFrame
        One row per dataset, in the iteration order of ``datasets``.
    """
    records = []

    for name, (train_df, test_df) in datasets.items():
        print(f"\n🔍 Processing dataset: {name}")

        setup(
            data=train_df,
            target=target,
            session_id=42,
            verbose=False,
            test_data=test_df,  # Use predefined test set
            fold_strategy='timeseries',
            data_split_shuffle=False,
            fold_shuffle=False
        )

        best_model = compare_models(exclude=list(exclude), sort='R2')
        # pull() returns the score grid compare_models just produced; its first
        # row belongs to the best model by the chosen sort metric.
        top_model_results = pull().iloc[0]

        preds = predict_model(best_model, data=test_df)

        y_true = test_df[target].values
        y_pred = preds['prediction_label'].values

        records.append({
            'dataset': name,
            'model': top_model_results['Model'],
            # NOTE(review): the "*_train" values come from the compare_models
            # score grid, i.e. presumably fold-averaged CV scores rather than
            # training-set scores; names kept for downstream compatibility.
            'R2_train': top_model_results['R2'],
            'R2_test': r2_score(y_true, y_pred),
            'RMSE_train': top_model_results['RMSE'],
            'RMSE_test': np.sqrt(mean_squared_error(y_true, y_pred)),
            'MAE_train': top_model_results['MAE'],
            'MAE_test': mean_absolute_error(y_true, y_pred),
            'R2_rounded': r2_rounded(y_true, y_pred),
            'RMSE_rounded': rmse_rounded(y_true, y_pred),
            'MAE_rounded': mae_rounded(y_true, y_pred),
            'R2_ceil': r2_ceil(y_true, y_pred),
            'RMSE_ceil': rmse_ceil(y_true, y_pred),
            'MAE_ceil': mae_ceil(y_true, y_pred)
        })

    return pd.DataFrame(records)


results_df = evaluate_imputed_datasets(datasets, 'total_number_of_patients')
results_df = results_df.sort_values(by='R2_test', ascending=False)
print("\n📊 Final Results:")
results_df



🔍 Processing dataset: mean


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,2.8704,35.2056,5.5136,0.912,0.3106,0.0955,0.019
lasso,Lasso Regression,4.4972,44.1738,6.2896,0.8892,0.372,0.1604,0.293
llar,Lasso Least Angle Regression,4.4972,44.1731,6.2895,0.8892,0.3719,0.1604,0.016
en,Elastic Net,4.5194,44.9587,6.3342,0.8877,0.372,0.1589,0.015
br,Bayesian Ridge,4.4659,46.5528,6.4912,0.8823,0.3799,0.1551,0.016
ridge,Ridge Regression,4.827,53.017,6.9475,0.8675,0.3777,0.1528,0.015
lr,Linear Regression,4.8881,54.4063,7.0154,0.865,0.377,0.1537,0.603
catboost,CatBoost Regressor,4.4126,57.9278,6.8521,0.8617,0.326,0.1312,0.474
et,Extra Trees Regressor,4.536,60.2922,7.1393,0.8581,0.3201,0.1341,0.043
gbr,Gradient Boosting Regressor,5.005,63.3691,7.3965,0.8386,0.3218,0.158,0.027


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,3.3223,28.8363,5.3699,0.9309,0.1128,0.0547



🔍 Processing dataset: median


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,2.8616,36.7857,5.5823,0.9095,0.319,0.0941,0.019
lasso,Lasso Regression,4.4388,44.6495,6.3034,0.8887,0.3671,0.157,0.017
llar,Lasso Least Angle Regression,4.4386,44.6472,6.3032,0.8887,0.3671,0.157,0.016
en,Elastic Net,4.464,45.3953,6.3462,0.8875,0.3679,0.155,0.015
br,Bayesian Ridge,4.393,46.869,6.49,0.8829,0.3765,0.1501,0.025
ridge,Ridge Regression,4.7103,52.2162,6.8538,0.8715,0.3753,0.144,0.015
lr,Linear Regression,4.7729,53.7288,6.9302,0.8687,0.3748,0.145,0.38
catboost,CatBoost Regressor,4.4935,58.6576,6.9145,0.862,0.3284,0.1328,13.523
et,Extra Trees Regressor,4.8278,66.1164,7.4615,0.8453,0.3297,0.1436,0.053
gbr,Gradient Boosting Regressor,5.096,65.9519,7.6217,0.8303,0.3362,0.1637,0.031


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,3.28,32.7253,5.7206,0.9223,0.1166,0.0548



🔍 Processing dataset: mode


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,2.9933,38.6195,6.0425,0.901,0.387,0.0743,0.022
en,Elastic Net,4.1181,43.8629,6.2623,0.886,0.392,0.1254,0.019
lasso,Lasso Regression,4.1878,44.4209,6.3093,0.8836,0.3957,0.1303,0.019
llar,Lasso Least Angle Regression,4.1881,44.4263,6.3096,0.8836,0.3957,0.1304,0.018
br,Bayesian Ridge,4.0071,46.4913,6.4043,0.8722,0.3952,0.1242,0.03
et,Extra Trees Regressor,4.488,59.0609,7.0212,0.8641,0.3248,0.1344,0.053
catboost,CatBoost Regressor,4.4428,58.8796,6.9131,0.8618,0.3296,0.1286,0.427
gbr,Gradient Boosting Regressor,5.1756,70.6787,7.6804,0.8273,0.3673,0.1629,0.028
rf,Random Forest Regressor,5.3377,73.801,7.986,0.826,0.3532,0.1625,0.064
xgboost,Extreme Gradient Boosting,5.7702,81.5205,8.3609,0.8016,0.4007,0.1809,0.028


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,3.6033,36.7412,6.0614,0.9127,0.0966,0.0611



🔍 Processing dataset: mice


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,3.1378,29.7448,4.991,0.9261,0.2525,0.1075,0.021
br,Bayesian Ridge,4.6122,44.1376,6.2519,0.8879,0.3326,0.1608,0.021
en,Elastic Net,4.7508,44.8463,6.2679,0.8877,0.3275,0.1634,0.017
lasso,Lasso Regression,4.7759,44.6456,6.2655,0.8874,0.3287,0.1657,0.019
llar,Lasso Least Angle Regression,4.7763,44.6509,6.2658,0.8874,0.3287,0.1657,0.015
ridge,Ridge Regression,5.2022,57.6982,6.9504,0.8643,0.3282,0.1622,0.019
et,Extra Trees Regressor,4.7914,62.8181,7.3825,0.8504,0.3146,0.1454,0.053
lr,Linear Regression,5.3614,67.9452,7.2818,0.8471,0.3311,0.165,0.019
catboost,CatBoost Regressor,4.8313,67.241,7.6683,0.8324,0.3428,0.1464,0.51
gbr,Gradient Boosting Regressor,5.3175,69.8447,7.8245,0.8254,0.31,0.171,0.033


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,3.989,47.656,6.9033,0.8895,0.1096,0.0659



🔍 Processing dataset: mice_hgb


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,4.4552,43.162,6.3337,0.8822,0.3057,0.1605,0.019
lasso,Lasso Regression,5.3567,55.6603,7.1338,0.8526,0.3754,0.1865,0.016
llar,Lasso Least Angle Regression,5.3567,55.6571,7.1336,0.8526,0.3755,0.1865,0.017
br,Bayesian Ridge,5.3872,54.2945,7.1467,0.8505,0.3919,0.1906,0.015
en,Elastic Net,5.4291,56.7689,7.2203,0.8487,0.383,0.1877,0.017
gbr,Gradient Boosting Regressor,6.2829,82.0869,8.5346,0.7861,0.3457,0.2031,0.032
knn,K Neighbors Regressor,6.6543,90.4308,8.9818,0.7852,0.421,0.2011,0.015
et,Extra Trees Regressor,6.1394,89.8903,8.8089,0.7838,0.3348,0.1868,0.042
catboost,CatBoost Regressor,6.1904,85.9074,8.6868,0.7808,0.3599,0.193,0.474
rf,Random Forest Regressor,6.5557,92.1639,9.0592,0.765,0.3883,0.2087,0.415


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,5.4242,64.7295,8.0455,0.8651,0.1264,0.0947



🔍 Processing dataset: mice_lr


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,3.1821,30.031,5.1476,0.9234,0.2526,0.1087,0.018
br,Bayesian Ridge,4.7851,46.8873,6.4341,0.8821,0.3401,0.1675,0.016
lasso,Lasso Regression,5.004,49.7097,6.5535,0.8775,0.3376,0.1716,0.016
llar,Lasso Least Angle Regression,5.0042,49.723,6.5541,0.8774,0.3376,0.1716,0.016
en,Elastic Net,4.9937,50.754,6.5832,0.8762,0.3361,0.1694,0.015
ridge,Ridge Regression,5.384,59.1967,7.0491,0.8617,0.3325,0.1667,0.015
lr,Linear Regression,5.4149,59.835,7.0852,0.8604,0.3322,0.1681,0.018
et,Extra Trees Regressor,4.7838,62.4267,7.3623,0.8508,0.3147,0.1396,0.041
catboost,CatBoost Regressor,4.7817,66.0295,7.5378,0.8356,0.3319,0.1454,0.385
gbr,Gradient Boosting Regressor,5.2786,72.359,7.8236,0.8247,0.2986,0.1682,0.031


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,3.7988,37.33,6.1098,0.9111,0.1082,0.0645



🔍 Processing dataset: mice_rf


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,4.4709,47.0641,6.5031,0.8744,0.2767,0.1586,0.019
lasso,Lasso Regression,5.5458,58.5594,7.3315,0.8462,0.3544,0.1934,0.015
llar,Lasso Least Angle Regression,5.5454,58.5532,7.3312,0.8462,0.3544,0.1933,0.016
en,Elastic Net,5.5985,59.4476,7.3789,0.8443,0.3565,0.1936,0.016
br,Bayesian Ridge,5.6408,60.5464,7.482,0.8393,0.3649,0.1965,0.015
et,Extra Trees Regressor,5.5898,85.0685,8.3828,0.8066,0.3244,0.17,0.042
catboost,CatBoost Regressor,5.7132,83.741,8.5341,0.7939,0.3649,0.1812,0.419
knn,K Neighbors Regressor,6.4,85.8552,8.7763,0.7924,0.4336,0.1985,0.015
rf,Random Forest Regressor,6.4626,94.4125,9.097,0.7664,0.3914,0.2143,0.055
ridge,Ridge Regression,6.4601,104.9251,8.9535,0.764,0.3796,0.198,0.016


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,5.5319,69.7923,8.3542,0.844,0.1398,0.0947



🔍 Processing dataset: mice_svr


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,4.2127,43.207,6.2083,0.8778,0.2902,0.1525,0.019
lasso,Lasso Regression,5.1495,53.2757,6.8642,0.8584,0.3469,0.1794,0.016
llar,Lasso Least Angle Regression,5.1497,53.2776,6.8643,0.8584,0.3469,0.1794,0.016
en,Elastic Net,5.1979,53.9818,6.9146,0.8577,0.3478,0.18,0.015
br,Bayesian Ridge,5.2039,53.2696,6.9224,0.8574,0.3555,0.1829,0.015
ridge,Ridge Regression,5.7781,64.2536,7.5616,0.8375,0.3638,0.1837,0.015
lr,Linear Regression,5.8749,68.109,7.7077,0.8309,0.3647,0.1845,0.017
et,Extra Trees Regressor,6.1884,87.9118,8.791,0.7846,0.3409,0.1959,0.043
catboost,CatBoost Regressor,6.3583,91.3752,8.9513,0.7695,0.3664,0.2019,0.393
gbr,Gradient Boosting Regressor,6.3631,90.5525,9.0386,0.7646,0.3564,0.2038,0.03


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,5.2641,59.1284,7.6895,0.8611,0.1375,0.0977



🔍 Processing dataset: knn_3


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,4.508,44.8214,6.5305,0.8701,0.3433,0.1681,0.019
lasso,Lasso Regression,5.2748,51.9794,7.0289,0.8537,0.3989,0.1918,0.016
llar,Lasso Least Angle Regression,5.2748,51.9801,7.0289,0.8537,0.3989,0.1918,0.016
en,Elastic Net,5.3716,52.7875,7.0901,0.8513,0.4067,0.1928,0.015
br,Bayesian Ridge,5.4376,54.3853,7.2026,0.8454,0.4158,0.196,0.016
ridge,Ridge Regression,6.106,68.4896,7.9218,0.8197,0.4137,0.2019,0.016
lr,Linear Regression,6.2266,74.4994,8.1286,0.8098,0.413,0.203,0.017
et,Extra Trees Regressor,6.1083,86.5962,8.8147,0.7847,0.3414,0.1952,0.042
catboost,CatBoost Regressor,6.2715,89.7718,8.8426,0.7768,0.365,0.1991,0.426
rf,Random Forest Regressor,6.5483,92.3319,9.118,0.7684,0.3817,0.2098,0.057


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,5.002,51.8751,7.2024,0.8823,0.1359,0.0911



🔍 Processing dataset: knn_3_distance


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,4.5129,45.499,6.5771,0.8669,0.3539,0.1676,0.019
lasso,Lasso Regression,5.3384,53.0913,7.1045,0.8507,0.3968,0.1935,0.016
llar,Lasso Least Angle Regression,5.3385,53.0916,7.1045,0.8507,0.3968,0.1935,0.015
en,Elastic Net,5.4386,53.9828,7.17,0.8481,0.4031,0.1951,0.015
br,Bayesian Ridge,5.4921,55.4303,7.2673,0.8427,0.4055,0.1983,0.016
ridge,Ridge Regression,6.1202,65.7312,7.8409,0.8239,0.4097,0.2032,0.014
lr,Linear Regression,6.2492,70.0312,8.0093,0.8168,0.4088,0.2046,0.019
et,Extra Trees Regressor,6.0823,86.9537,8.779,0.7876,0.3363,0.1947,0.044
gbr,Gradient Boosting Regressor,6.5747,91.659,8.9741,0.7746,0.3552,0.2132,0.03
catboost,CatBoost Regressor,6.2703,90.4444,8.9026,0.7731,0.3704,0.2008,0.744


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,4.7323,49.032,7.0023,0.8887,0.1294,0.0856



🔍 Processing dataset: mice_knn_3


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,4.6267,49.5116,6.7238,0.8632,0.2911,0.175,0.019
lasso,Lasso Regression,5.4408,54.8172,7.1017,0.8533,0.3442,0.1926,0.016
llar,Lasso Least Angle Regression,5.4408,54.8185,7.1018,0.8533,0.3442,0.1926,0.016
en,Elastic Net,5.5046,55.5205,7.1545,0.8519,0.3467,0.1927,0.015
br,Bayesian Ridge,5.5559,56.6468,7.2441,0.8475,0.3482,0.1973,0.015
ridge,Ridge Regression,6.0944,82.8586,8.1983,0.8061,0.3628,0.1951,0.016
et,Extra Trees Regressor,6.0537,86.9541,8.7987,0.7867,0.3404,0.191,0.043
gbr,Gradient Boosting Regressor,6.4927,87.8412,8.8684,0.7795,0.3552,0.2039,0.03
lr,Linear Regression,6.2491,99.2492,8.5978,0.7789,0.3542,0.1962,0.017
catboost,CatBoost Regressor,6.2492,86.6391,8.8612,0.7779,0.3684,0.1955,0.412


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,5.0764,49.3203,7.0228,0.8865,0.1308,0.102



🔍 Processing dataset: mice_knn_3_distance


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,4.5954,50.69,6.806,0.8583,0.286,0.1692,0.02
lasso,Lasso Regression,5.3309,55.6556,7.1592,0.8499,0.3335,0.1867,0.015
llar,Lasso Least Angle Regression,5.331,55.6564,7.1594,0.8499,0.3335,0.1867,0.016
en,Elastic Net,5.4141,56.6841,7.2228,0.8476,0.3359,0.1881,0.015
br,Bayesian Ridge,5.4743,57.1612,7.2822,0.8442,0.3402,0.192,0.015
ridge,Ridge Regression,6.0253,77.4066,8.0758,0.8121,0.3516,0.1938,0.016
lr,Linear Regression,6.1824,89.2976,8.4,0.7922,0.3451,0.195,0.017
et,Extra Trees Regressor,6.0284,86.9832,8.8209,0.7842,0.3444,0.1924,0.043
knn,K Neighbors Regressor,6.5935,91.534,9.154,0.7737,0.4438,0.2053,0.016
catboost,CatBoost Regressor,6.3863,89.852,8.9702,0.7704,0.38,0.1989,0.389


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,5.4414,57.2032,7.5633,0.8697,0.1285,0.1021



🔍 Processing dataset: knn_4


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,4.595,45.0563,6.5176,0.8721,0.3651,0.1703,0.018
lasso,Lasso Regression,5.3477,52.7437,7.0521,0.8548,0.4226,0.1933,0.015
llar,Lasso Least Angle Regression,5.3477,52.7441,7.0522,0.8548,0.4226,0.1933,0.016
en,Elastic Net,5.392,53.0353,7.0781,0.8539,0.4226,0.1934,0.015
br,Bayesian Ridge,5.4965,55.1262,7.2176,0.8466,0.4209,0.1979,0.015
et,Extra Trees Regressor,6.024,85.9288,8.7061,0.7888,0.3395,0.1933,0.044
ridge,Ridge Regression,6.2963,89.1649,8.5118,0.7879,0.4194,0.2044,0.015
catboost,CatBoost Regressor,6.1922,88.6684,8.839,0.7741,0.369,0.197,0.392
rf,Random Forest Regressor,6.4846,91.701,9.0446,0.7713,0.3793,0.2088,0.054
knn,K Neighbors Regressor,6.7984,93.8395,9.3113,0.768,0.4479,0.2097,0.015


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,4.702,51.4907,7.1757,0.8832,0.1377,0.0838



🔍 Processing dataset: knn_4_distance


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,4.5006,43.7541,6.4096,0.8753,0.3522,0.1678,0.019
lasso,Lasso Regression,5.3591,52.6465,7.0437,0.8551,0.4189,0.1924,0.016
llar,Lasso Least Angle Regression,5.3592,52.6487,7.0438,0.8551,0.4189,0.1924,0.016
en,Elastic Net,5.4096,52.8764,7.0711,0.8541,0.4218,0.1924,0.015
br,Bayesian Ridge,5.5097,54.9819,7.2056,0.847,0.4219,0.1965,0.015
et,Extra Trees Regressor,6.0408,84.1637,8.6334,0.7939,0.3395,0.1906,0.048
catboost,CatBoost Regressor,6.1162,88.3817,8.7786,0.7761,0.3683,0.197,0.382
rf,Random Forest Regressor,6.48,91.752,9.0362,0.7707,0.3748,0.2087,0.059
ridge,Ridge Regression,6.4312,101.5855,8.8062,0.7678,0.4136,0.2042,0.017
knn,K Neighbors Regressor,6.7731,94.5109,9.3546,0.766,0.448,0.209,0.016


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,4.8067,52.2722,7.23,0.8814,0.1378,0.0858



🔍 Processing dataset: mice_knn_4


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,4.1135,40.1145,6.1037,0.8883,0.288,0.1509,0.019
lasso,Lasso Regression,5.2012,50.7142,6.8533,0.8668,0.3571,0.1775,0.015
llar,Lasso Least Angle Regression,5.2016,50.7245,6.8538,0.8668,0.3571,0.1775,0.016
en,Elastic Net,5.2575,51.9245,6.9287,0.8641,0.3606,0.1765,0.015
br,Bayesian Ridge,5.2066,51.6299,6.9462,0.8617,0.3658,0.1774,0.015
ridge,Ridge Regression,5.7423,61.5158,7.4567,0.8438,0.3767,0.1812,0.015
lr,Linear Regression,5.82,63.9738,7.553,0.8395,0.3754,0.1824,0.016
et,Extra Trees Regressor,5.926,86.0648,8.7938,0.7872,0.351,0.188,0.043
catboost,CatBoost Regressor,6.1223,86.5696,8.8123,0.7812,0.3633,0.1856,0.411
gbr,Gradient Boosting Regressor,6.3765,91.8181,9.0962,0.7669,0.3627,0.1969,0.031


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,4.2857,45.516,6.7466,0.8945,0.1105,0.0731



🔍 Processing dataset: mice_knn_4_distance


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,4.0469,39.9894,6.0642,0.8883,0.2828,0.1472,0.019
lasso,Lasso Regression,5.2166,51.3565,6.8603,0.8639,0.3502,0.1757,0.018
llar,Lasso Least Angle Regression,5.2165,51.3554,6.8603,0.8639,0.3502,0.1757,0.016
en,Elastic Net,5.2795,52.4098,6.9209,0.8621,0.3466,0.1753,0.016
br,Bayesian Ridge,5.2275,52.1596,6.9449,0.8599,0.3524,0.1768,0.015
ridge,Ridge Regression,5.8001,64.8264,7.4942,0.8412,0.3493,0.1801,0.017
lr,Linear Regression,5.8887,68.1309,7.6128,0.8355,0.3527,0.1815,0.018
et,Extra Trees Regressor,5.8037,84.4576,8.7104,0.7889,0.3397,0.184,0.041
catboost,CatBoost Regressor,6.2604,88.7026,8.9791,0.7719,0.3795,0.1921,0.391
knn,K Neighbors Regressor,6.6833,93.0114,9.2197,0.7694,0.4493,0.21,0.016


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,4.2019,43.4209,6.5895,0.8993,0.1035,0.0727



🔍 Processing dataset: knn_5


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,4.2639,40.288,6.1207,0.8875,0.3168,0.1552,0.019
lasso,Lasso Regression,5.223,50.1681,6.8743,0.862,0.3854,0.1836,0.015
llar,Lasso Least Angle Regression,5.223,50.1672,6.8742,0.862,0.3854,0.1836,0.016
en,Elastic Net,5.284,50.7701,6.9173,0.8606,0.3883,0.1842,0.016
br,Bayesian Ridge,5.3573,52.2731,7.027,0.8549,0.396,0.1878,0.016
ridge,Ridge Regression,6.0381,71.7212,7.9228,0.8215,0.4184,0.1927,0.015
et,Extra Trees Regressor,5.9074,82.0046,8.4875,0.7992,0.344,0.1883,0.042
lr,Linear Regression,6.2233,85.6422,8.3247,0.7983,0.4195,0.1958,0.017
catboost,CatBoost Regressor,5.8587,81.4247,8.4654,0.7955,0.3642,0.1865,0.411
gbr,Gradient Boosting Regressor,6.2114,88.2373,8.8748,0.7757,0.348,0.1987,0.032


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,4.4361,48.9068,6.9933,0.8854,0.127,0.0812



🔍 Processing dataset: knn_5_distance


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,4.3095,41.2713,6.2037,0.8828,0.3142,0.1605,0.019
lasso,Lasso Regression,5.2593,50.4775,6.8961,0.8611,0.3741,0.1866,0.015
llar,Lasso Least Angle Regression,5.2593,50.4785,6.8961,0.8611,0.3741,0.1866,0.015
en,Elastic Net,5.322,50.9434,6.9356,0.8596,0.3786,0.1864,0.015
br,Bayesian Ridge,5.398,52.6943,7.0559,0.8532,0.3888,0.1902,0.016
ridge,Ridge Regression,5.8503,57.9351,7.4071,0.8439,0.4052,0.1909,0.015
lr,Linear Regression,5.9593,60.854,7.5445,0.839,0.4051,0.1928,0.018
et,Extra Trees Regressor,5.9388,81.9322,8.5246,0.796,0.3379,0.1904,0.042
catboost,CatBoost Regressor,5.9729,81.5751,8.4604,0.7915,0.3627,0.1905,0.384
rf,Random Forest Regressor,6.4386,91.6689,9.0426,0.7682,0.3807,0.2071,0.055


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,4.3095,44.7533,6.6898,0.8958,0.122,0.0785



🔍 Processing dataset: mice_knn_5


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,4.1172,40.1828,6.0195,0.891,0.2866,0.1466,0.021
lasso,Lasso Regression,5.1736,51.5467,6.8303,0.8663,0.3556,0.1743,0.015
llar,Lasso Least Angle Regression,5.1737,51.548,6.8303,0.8663,0.3556,0.1743,0.016
en,Elastic Net,5.2199,52.3709,6.8849,0.8643,0.3552,0.1748,0.016
br,Bayesian Ridge,5.2351,52.1978,6.9228,0.8618,0.3624,0.1772,0.015
ridge,Ridge Regression,5.7695,64.0572,7.4979,0.8409,0.3433,0.184,0.016
lr,Linear Regression,5.833,66.2445,7.5805,0.8371,0.3513,0.1849,0.377
knn,K Neighbors Regressor,6.4412,88.2117,9.0129,0.7823,0.4168,0.1964,0.019
et,Extra Trees Regressor,6.0078,88.3351,8.8474,0.7822,0.3495,0.191,0.042
catboost,CatBoost Regressor,6.1174,87.9433,8.7539,0.78,0.3753,0.191,0.375


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,4.3058,41.2616,6.4235,0.902,0.1134,0.0778



🔍 Processing dataset: mice_knn_5_distance


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,4.1768,40.5104,6.0863,0.8905,0.3226,0.1436,0.019
lasso,Lasso Regression,5.0223,47.8063,6.646,0.8738,0.3684,0.1662,0.016
llar,Lasso Least Angle Regression,5.022,47.8024,6.6457,0.8738,0.3684,0.1662,0.016
en,Elastic Net,5.0486,48.4546,6.6772,0.8735,0.3655,0.1645,0.015
br,Bayesian Ridge,5.0541,49.6795,6.7632,0.8693,0.3673,0.1656,0.016
ridge,Ridge Regression,5.5908,65.2551,7.4203,0.8449,0.3693,0.1689,0.015
lr,Linear Regression,5.7523,73.2213,7.6652,0.8316,0.3681,0.1712,0.017
catboost,CatBoost Regressor,5.822,82.0012,8.4638,0.7971,0.3601,0.1777,0.396
et,Extra Trees Regressor,5.8852,85.9053,8.7481,0.7881,0.3424,0.1889,0.043
gbr,Gradient Boosting Regressor,6.2386,87.7008,8.8532,0.7776,0.3448,0.1952,0.031


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,4.4232,40.6881,6.3787,0.9037,0.1047,0.0765



📊 Final Results:


Unnamed: 0,dataset,model,R2_train,R2_test,RMSE_train,RMSE_test,MAE_train,MAE_test,R2_rounded,RMSE_rounded,MAE_rounded,R2_ceil,RMSE_ceil,MAE_ceil
0,mean,Huber Regressor,0.912,0.93088,5.5136,5.369945,2.8704,3.322315,0.931326,5.352569,3.25,0.927052,5.516641,3.233333
1,median,Huber Regressor,0.9095,0.922313,5.5823,5.720601,2.8616,3.28003,0.920395,5.790797,3.333333,0.919643,5.818075,3.25
2,mode,Huber Regressor,0.901,0.912652,6.0425,6.061448,2.9933,3.603343,0.914255,6.005553,3.566667,0.90637,6.275614,3.716667
5,mice_lr,Huber Regressor,0.9234,0.911111,5.1476,6.109827,3.1821,3.798831,0.911381,6.100546,3.75,0.906619,6.262321,3.816667
19,mice_knn_5_distance,Huber Regressor,0.8905,0.903712,6.0863,6.378724,4.1768,4.42323,0.903329,6.3914,4.383333,0.901159,6.462714,4.433333
18,mice_knn_5,Huber Regressor,0.891,0.902049,6.0195,6.423521,4.1172,4.30581,0.901562,6.439462,4.266667,0.897685,6.565059,4.4
15,mice_knn_4_distance,Huber Regressor,0.8883,0.899324,6.0642,6.589455,4.0469,4.201934,0.900107,6.56379,4.15,0.89462,6.741662,4.283333
17,knn_5_distance,Huber Regressor,0.8828,0.895837,6.2037,6.689791,4.3095,4.309459,0.895379,6.704476,4.283333,0.893091,6.777413,4.333333
14,mice_knn_4,Huber Regressor,0.8883,0.894467,6.1037,6.746551,4.1135,4.285714,0.893692,6.771263,4.283333,0.89091,6.8593,4.35
3,mice,Huber Regressor,0.9261,0.889505,4.991,6.903329,3.1378,3.989016,0.888128,6.946222,4.05,0.885423,7.029699,3.95


In [44]:
# Same table, ranked by the R2 obtained with rounded predictions (best first).
results_df.sort_values(by=['R2_rounded'], ascending=[False])

Unnamed: 0,dataset,model,R2_train,R2_test,RMSE_train,RMSE_test,MAE_train,MAE_test,R2_rounded,RMSE_rounded,MAE_rounded,R2_ceil,RMSE_ceil,MAE_ceil
0,mean,Huber Regressor,0.912,0.93088,5.5136,5.369945,2.8704,3.322315,0.931326,5.352569,3.25,0.927052,5.516641,3.233333
1,median,Huber Regressor,0.9095,0.922313,5.5823,5.720601,2.8616,3.28003,0.920395,5.790797,3.333333,0.919643,5.818075,3.25
2,mode,Huber Regressor,0.901,0.912652,6.0425,6.061448,2.9933,3.603343,0.914255,6.005553,3.566667,0.90637,6.275614,3.716667
5,mice_lr,Huber Regressor,0.9234,0.911111,5.1476,6.109827,3.1821,3.798831,0.911381,6.100546,3.75,0.906619,6.262321,3.816667
19,mice_knn_5_distance,Huber Regressor,0.8905,0.903712,6.0863,6.378724,4.1768,4.42323,0.903329,6.3914,4.383333,0.901159,6.462714,4.433333
18,mice_knn_5,Huber Regressor,0.891,0.902049,6.0195,6.423521,4.1172,4.30581,0.901562,6.439462,4.266667,0.897685,6.565059,4.4
15,mice_knn_4_distance,Huber Regressor,0.8883,0.899324,6.0642,6.589455,4.0469,4.201934,0.900107,6.56379,4.15,0.89462,6.741662,4.283333
17,knn_5_distance,Huber Regressor,0.8828,0.895837,6.2037,6.689791,4.3095,4.309459,0.895379,6.704476,4.283333,0.893091,6.777413,4.333333
14,mice_knn_4,Huber Regressor,0.8883,0.894467,6.1037,6.746551,4.1135,4.285714,0.893692,6.771263,4.283333,0.89091,6.8593,4.35
3,mice,Huber Regressor,0.9261,0.889505,4.991,6.903329,3.1378,3.989016,0.888128,6.946222,4.05,0.885423,7.029699,3.95


In [45]:
# predict_model is used below, so it is imported explicitly here instead of
# relying on a wildcard import executed by an earlier cell.
from pycaret.regression import setup, compare_models, pull, predict_model
import pandas as pd

# One summary record per imputed dataset, for the EMEWS-count target.
results = []

for name, (train_df, test_df) in datasets.items():
    print(f"\n🔍 Processing dataset: {name}")

    # test_df is passed as the predefined hold-out; folds are time-ordered and
    # nothing is shuffled.
    setup(
        data=train_df,
        target='total_number_of_emews',
        session_id=42,
        verbose=False,
        test_data=test_df,  # Use predefined test set,
        fold_strategy='timeseries',
        data_split_shuffle=False,
        fold_shuffle=False
    )

    best_model = compare_models(exclude=['lightgbm', 'par', 'dummy', 'lar'], sort='R2')
    # pull() returns the score grid compare_models just produced; row 0 is the
    # best model by R2.
    leaderboard = pull()
    top_model_results = leaderboard.iloc[0]

    preds = predict_model(best_model, data=test_df)
    
    y_true = test_df['total_number_of_emews'].values
    y_pred = preds['prediction_label'].values

    # Standard metrics on the raw (unrounded) predictions
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)

    # Same metrics after rounding / ceiling the predictions to whole numbers
    r2_r = r2_rounded(y_true, y_pred)
    rmse_r = rmse_rounded(y_true, y_pred)
    r2_c = r2_ceil(y_true, y_pred)
    rmse_c = rmse_ceil(y_true, y_pred)
    mae_r = mae_rounded(y_true, y_pred)
    mae_c = mae_ceil(y_true, y_pred)

    # NOTE(review): the "*_train" values are taken from the compare_models
    # score grid, i.e. presumably fold-averaged CV scores rather than
    # training-set scores -- confirm before reporting them as "train".
    results.append({
        'dataset': name,
        'model': top_model_results['Model'],
        'R2_train': top_model_results['R2'],
        'R2_test': r2,
        'RMSE_train': top_model_results['RMSE'],
        'RMSE_test': rmse,
        'MAE_train': top_model_results['MAE'],
        'MAE_test': mae,
        'R2_rounded': r2_r,
        'RMSE_rounded': rmse_r,
        'MAE_rounded': mae_r,
        'R2_ceil': r2_c,
        'RMSE_ceil': rmse_c,
        'MAE_ceil': mae_c
    })

results_df_2 = pd.DataFrame(results)
results_df_2 = results_df_2.sort_values(by='R2_test', ascending=False)
print("\n📊 Final Results:")
results_df_2



🔍 Processing dataset: mean


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,7.0997,312.9918,16.1848,0.9267,0.4353,0.0557,0.021
br,Bayesian Ridge,10.1029,336.305,16.7265,0.9205,0.4533,0.0953,0.016
en,Elastic Net,10.8984,356.328,17.0704,0.9178,0.4084,0.1034,0.017
lasso,Lasso Regression,11.8486,396.5323,18.1623,0.9101,0.4235,0.1093,0.02
llar,Lasso Least Angle Regression,11.8492,396.5089,18.162,0.9101,0.4236,0.1093,0.02
catboost,CatBoost Regressor,13.7131,518.1401,20.8403,0.8885,0.4054,0.1143,0.777
ridge,Ridge Regression,14.0581,521.6504,20.7265,0.8811,0.4902,0.1271,0.017
gbr,Gradient Boosting Regressor,14.3899,550.0217,21.8917,0.874,0.3961,0.1267,0.03
lr,Linear Regression,14.8685,600.3832,21.6515,0.8683,0.4935,0.1341,0.02
et,Extra Trees Regressor,14.4494,637.185,22.4175,0.8676,0.2789,0.1128,0.043


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,9.1381,274.8858,16.5797,0.9457,0.0924,0.0472



🔍 Processing dataset: median


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,6.8021,291.6858,15.9132,0.9298,0.4655,0.0542,0.02
br,Bayesian Ridge,9.4512,310.191,16.0317,0.927,0.4214,0.0893,0.017
en,Elastic Net,10.2008,322.4664,16.2984,0.9254,0.396,0.0969,0.015
lasso,Lasso Regression,11.1603,366.6739,17.5064,0.9165,0.4121,0.1019,0.016
llar,Lasso Least Angle Regression,11.1616,366.7303,17.5072,0.9165,0.4121,0.102,0.016
catboost,CatBoost Regressor,13.5691,497.4554,20.3825,0.8932,0.3884,0.1103,0.359
ridge,Ridge Regression,12.9101,473.8482,19.8896,0.8881,0.4438,0.1161,0.016
gbr,Gradient Boosting Regressor,14.4315,524.3023,21.2653,0.8818,0.3798,0.1215,0.029
lr,Linear Regression,13.5859,536.313,20.7041,0.8777,0.4558,0.1217,0.02
et,Extra Trees Regressor,14.5643,629.5166,22.3636,0.8682,0.2776,0.1137,0.043


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,8.3589,246.7574,15.7085,0.9511,0.0922,0.045



🔍 Processing dataset: mode


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,6.9152,228.2244,14.7901,0.9407,0.4608,0.0496,0.021
en,Elastic Net,8.8302,234.4652,14.5249,0.9395,0.375,0.0766,0.015
lasso,Lasso Regression,9.1437,251.3727,14.9391,0.9359,0.3671,0.0807,0.016
llar,Lasso Least Angle Regression,9.1412,251.2543,14.9362,0.9359,0.3671,0.0806,0.017
br,Bayesian Ridge,8.8343,253.5492,15.0601,0.9296,0.3755,0.076,0.015
gbr,Gradient Boosting Regressor,11.9452,393.1532,18.2989,0.913,0.3533,0.0968,0.03
catboost,CatBoost Regressor,12.1371,451.3796,19.308,0.9032,0.3376,0.0886,0.363
et,Extra Trees Regressor,12.2949,474.4643,19.5149,0.9009,0.2713,0.098,0.043
ridge,Ridge Regression,10.23,356.1598,17.4247,0.8927,0.3945,0.0867,0.017
xgboost,Extreme Gradient Boosting,13.8316,520.2562,20.7532,0.8873,0.3251,0.1112,0.026


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,10.919,354.1548,18.819,0.9289,0.0918,0.0528



🔍 Processing dataset: mice


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,7.9838,249.9081,14.1079,0.944,0.3709,0.0739,0.02
br,Bayesian Ridge,10.4988,318.6171,15.7339,0.9279,0.4171,0.1015,0.016
en,Elastic Net,11.2844,358.4178,16.3633,0.9214,0.3877,0.1087,0.016
lasso,Lasso Regression,12.2812,399.2612,17.4197,0.9133,0.4146,0.1147,0.018
llar,Lasso Least Angle Regression,12.2804,399.1036,17.4181,0.9133,0.4146,0.1147,0.016
catboost,CatBoost Regressor,14.7433,586.4527,22.4901,0.8677,0.4226,0.1228,0.378
ridge,Ridge Regression,14.957,658.3021,21.0312,0.8658,0.5056,0.1347,0.016
et,Extra Trees Regressor,14.3276,640.6808,22.7883,0.8639,0.2955,0.1119,0.044
gbr,Gradient Boosting Regressor,15.5456,660.578,23.2978,0.8547,0.3676,0.1343,0.03
rf,Random Forest Regressor,15.5384,708.7309,23.8901,0.8485,0.3586,0.1249,0.06


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,10.7292,289.7338,17.0216,0.9424,0.0929,0.0564



🔍 Processing dataset: mice_hgb


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,12.0681,380.3682,18.2981,0.906,0.3555,0.1211,0.02
br,Bayesian Ridge,13.1048,405.9382,18.9844,0.8974,0.4332,0.1311,0.016
en,Elastic Net,14.171,494.6189,20.4013,0.8801,0.4676,0.1372,0.016
gbr,Gradient Boosting Regressor,15.8148,644.0542,23.1939,0.8565,0.3911,0.1336,0.031
catboost,CatBoost Regressor,15.8693,693.8949,23.7905,0.8513,0.4274,0.1379,0.4
lasso,Lasso Regression,16.0419,634.4454,22.6294,0.8511,0.4917,0.1496,0.016
llar,Lasso Least Angle Regression,16.0439,635.171,22.6387,0.8509,0.4917,0.1496,0.016
rf,Random Forest Regressor,14.9859,676.2608,23.7193,0.8491,0.3534,0.1149,0.056
et,Extra Trees Regressor,16.3684,772.2537,25.324,0.8306,0.2957,0.131,0.042
omp,Orthogonal Matching Pursuit,18.4929,783.7464,26.6297,0.8169,0.4777,0.1615,0.016


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,15.776,592.6742,24.3449,0.8919,0.1167,0.0831



🔍 Processing dataset: mice_lr


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,8.1957,274.444,15.0253,0.9375,0.3987,0.0755,0.019
br,Bayesian Ridge,10.7774,359.848,16.4008,0.9222,0.4209,0.1014,0.02
en,Elastic Net,12.258,533.7108,18.2835,0.8943,0.3907,0.1134,0.016
ridge,Ridge Regression,15.1833,581.3602,20.3862,0.8774,0.4697,0.1383,0.018
lasso,Lasso Regression,13.708,651.273,20.0653,0.8737,0.4219,0.1228,0.019
llar,Lasso Least Angle Regression,13.7095,651.7115,20.068,0.8736,0.4219,0.1228,0.016
lr,Linear Regression,15.5863,611.1191,20.7737,0.8719,0.4613,0.1439,0.02
gbr,Gradient Boosting Regressor,14.9023,602.6918,22.2138,0.8684,0.3668,0.1273,0.031
catboost,CatBoost Regressor,14.0769,576.156,22.1638,0.8674,0.4117,0.1171,0.374
rf,Random Forest Regressor,14.841,683.5369,23.2405,0.8566,0.3508,0.1188,0.054


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,9.7643,267.8629,16.3665,0.9456,0.0911,0.0525



🔍 Processing dataset: mice_rf


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,11.5867,374.7733,18.0595,0.9083,0.4103,0.1169,0.021
br,Bayesian Ridge,12.8957,412.77,18.9628,0.8995,0.4019,0.1296,0.02
en,Elastic Net,13.6239,434.731,19.4525,0.8951,0.4458,0.1348,0.016
lasso,Lasso Regression,14.879,520.6134,20.8179,0.8801,0.452,0.1415,0.016
llar,Lasso Least Angle Regression,14.8879,522.124,20.8344,0.8799,0.4521,0.1416,0.016
catboost,CatBoost Regressor,15.7385,627.3166,23.4655,0.8573,0.4475,0.1414,0.362
gbr,Gradient Boosting Regressor,15.7466,623.5908,23.3013,0.8563,0.3813,0.1354,0.031
rf,Random Forest Regressor,15.6988,724.7419,24.7013,0.839,0.3898,0.1247,0.055
et,Extra Trees Regressor,16.0057,747.945,24.8682,0.836,0.3185,0.1285,0.045
xgboost,Extreme Gradient Boosting,17.4691,779.0971,26.0286,0.8252,0.389,0.1448,0.027


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,15.2997,591.8436,24.3278,0.8904,0.1165,0.0791



🔍 Processing dataset: mice_svr


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,11.0167,340.3655,17.1627,0.9143,0.3821,0.1076,0.02
br,Bayesian Ridge,12.6545,374.9348,18.1473,0.907,0.4368,0.1252,0.016
en,Elastic Net,13.7477,437.596,19.2258,0.8958,0.4637,0.1341,0.016
lasso,Lasso Regression,15.5676,533.2565,20.9932,0.877,0.4912,0.1442,0.015
llar,Lasso Least Angle Regression,15.5656,533.091,20.9908,0.877,0.4911,0.1442,0.016
gbr,Gradient Boosting Regressor,15.387,633.6449,22.752,0.863,0.3623,0.1271,0.031
catboost,CatBoost Regressor,15.6067,661.6368,23.0315,0.8608,0.4041,0.1295,0.388
ridge,Ridge Regression,16.9965,627.3027,22.6965,0.8551,0.5249,0.1572,0.016
et,Extra Trees Regressor,15.5891,711.9916,24.1052,0.8466,0.2951,0.1255,0.045
lr,Linear Regression,17.9561,725.2171,23.7457,0.8392,0.5335,0.1637,0.02


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,14.8094,536.0976,23.1538,0.8934,0.1186,0.0797



🔍 Processing dataset: knn_3


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,12.5906,413.4108,19.2868,0.8926,0.4181,0.1343,0.021
br,Bayesian Ridge,13.2555,418.4585,19.3801,0.8908,0.4527,0.1419,0.015
en,Elastic Net,13.7985,431.1569,19.7376,0.8878,0.5039,0.146,0.015
llar,Lasso Least Angle Regression,14.5626,465.3175,20.5018,0.8815,0.5043,0.1496,0.017
lasso,Lasso Regression,14.5635,465.3972,20.5032,0.8814,0.5043,0.1496,0.015
gbr,Gradient Boosting Regressor,15.4937,634.5819,22.9928,0.859,0.3831,0.1319,0.03
ridge,Ridge Regression,17.1311,633.9232,23.2225,0.8477,0.5472,0.1688,0.016
catboost,CatBoost Regressor,16.4346,731.8284,24.7729,0.8384,0.4395,0.1483,0.377
rf,Random Forest Regressor,16.1488,740.7885,25.01,0.8349,0.3744,0.1349,0.056
lr,Linear Regression,18.0657,736.1244,24.2783,0.8315,0.5504,0.1749,0.021


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,14.0484,454.4959,21.3189,0.9103,0.1113,0.0769



🔍 Processing dataset: knn_3_distance


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,12.3361,415.3147,19.1855,0.8943,0.4198,0.1304,0.021
br,Bayesian Ridge,13.3256,426.5673,19.5239,0.8895,0.4498,0.1421,0.015
en,Elastic Net,14.0091,441.2345,19.913,0.8862,0.5173,0.1472,0.015
lasso,Lasso Regression,14.7619,476.5242,20.716,0.8795,0.5137,0.1511,0.016
llar,Lasso Least Angle Regression,14.762,476.5961,20.7165,0.8795,0.5136,0.1511,0.017
gbr,Gradient Boosting Regressor,15.3898,634.6124,23.318,0.8549,0.3967,0.1331,0.031
ridge,Ridge Regression,17.0056,597.1199,22.8966,0.8533,0.5374,0.169,0.018
lr,Linear Regression,17.9341,688.2506,23.9329,0.8387,0.5457,0.1755,0.02
catboost,CatBoost Regressor,16.4801,743.4292,24.7895,0.8374,0.4499,0.1475,0.361
rf,Random Forest Regressor,16.1819,739.0916,24.99,0.8353,0.3723,0.1337,0.059


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,14.1076,481.0221,21.9322,0.905,0.107,0.0751



🔍 Processing dataset: mice_knn_3


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,12.0323,427.8083,18.8496,0.8995,0.5268,0.1117,0.019
br,Bayesian Ridge,13.8251,455.9075,19.672,0.89,0.5261,0.1303,0.016
en,Elastic Net,14.9462,491.0557,20.3964,0.8821,0.4832,0.1411,0.016
lasso,Lasso Regression,16.3066,553.947,21.8295,0.8676,0.517,0.152,0.016
llar,Lasso Least Angle Regression,16.3084,554.0446,21.831,0.8676,0.517,0.152,0.017
gbr,Gradient Boosting Regressor,16.425,684.9029,24.1803,0.8417,0.3407,0.1412,0.029
catboost,CatBoost Regressor,16.9552,728.2215,24.9447,0.837,0.4299,0.1496,0.372
rf,Random Forest Regressor,17.1786,809.9194,25.3141,0.8294,0.3731,0.1392,0.059
ridge,Ridge Regression,18.5977,772.495,24.6589,0.8254,0.5351,0.171,0.018
et,Extra Trees Regressor,16.831,819.1869,25.894,0.823,0.3092,0.1377,0.046


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,14.5454,486.9692,22.0674,0.9031,0.1131,0.078



🔍 Processing dataset: mice_knn_3_distance


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,12.1057,428.0152,18.9099,0.8983,0.5024,0.1131,0.021
br,Bayesian Ridge,13.6966,458.1012,19.6993,0.8891,0.5129,0.1296,0.017
en,Elastic Net,14.7441,498.9991,20.5577,0.8806,0.4785,0.1384,0.02
lasso,Lasso Regression,16.2554,583.1504,22.2166,0.8631,0.5069,0.1491,0.016
llar,Lasso Least Angle Regression,16.2539,583.0388,22.2152,0.8631,0.5069,0.1491,0.019
et,Extra Trees Regressor,16.1712,769.8227,25.1373,0.8342,0.3081,0.1312,0.044
gbr,Gradient Boosting Regressor,16.5484,757.1042,24.8193,0.8333,0.3715,0.1393,0.031
ridge,Ridge Regression,18.2555,731.5712,24.3438,0.8321,0.537,0.1663,0.016
catboost,CatBoost Regressor,17.0635,789.5853,25.6074,0.827,0.4385,0.1476,0.751
rf,Random Forest Regressor,16.9214,833.3485,25.507,0.8269,0.3798,0.137,0.057


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,15.2996,519.0923,22.7836,0.8969,0.1189,0.0825



🔍 Processing dataset: knn_4


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,12.0033,381.3209,18.3456,0.9032,0.3991,0.1283,0.019
br,Bayesian Ridge,13.4621,420.6406,19.207,0.8924,0.4553,0.1437,0.017
en,Elastic Net,14.0118,440.8486,19.6426,0.8887,0.5116,0.1474,0.022
lasso,Lasso Regression,15.0224,488.9674,20.7565,0.8798,0.52,0.153,0.016
llar,Lasso Least Angle Regression,15.0233,489.0904,20.758,0.8798,0.5199,0.1529,0.016
gbr,Gradient Boosting Regressor,15.5355,654.4955,23.4293,0.8537,0.3531,0.1349,0.03
catboost,CatBoost Regressor,16.0125,683.2987,23.9501,0.8505,0.4222,0.138,0.381
rf,Random Forest Regressor,15.9129,713.4969,24.5898,0.8403,0.3775,0.1319,0.063
et,Extra Trees Regressor,16.5164,801.4015,25.7293,0.8231,0.3159,0.136,0.051
ridge,Ridge Regression,18.2078,852.3897,25.1368,0.8156,0.53,0.1756,0.016


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,13.8938,501.7806,22.4005,0.9008,0.1101,0.0729



🔍 Processing dataset: knn_4_distance


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,11.908,388.3775,18.4629,0.9013,0.3992,0.127,0.019
br,Bayesian Ridge,13.3643,419.7664,19.1661,0.8927,0.4562,0.1408,0.016
en,Elastic Net,13.9678,437.7383,19.6095,0.8892,0.5149,0.1446,0.016
lasso,Lasso Regression,14.8484,483.6122,20.623,0.8814,0.5071,0.1487,0.017
llar,Lasso Least Angle Regression,14.8494,483.8588,20.6258,0.8813,0.5069,0.1487,0.016
gbr,Gradient Boosting Regressor,15.5648,663.3288,23.4435,0.8525,0.3689,0.132,0.031
catboost,CatBoost Regressor,15.7918,678.3776,23.9151,0.8507,0.4358,0.1356,0.394
rf,Random Forest Regressor,15.8655,713.5927,24.5759,0.8407,0.3546,0.1314,0.057
ridge,Ridge Regression,18.0935,827.7618,24.9012,0.8198,0.5316,0.1721,0.017
et,Extra Trees Regressor,16.7688,811.2572,25.8904,0.8198,0.3018,0.136,0.043


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,13.6985,485.0928,22.0248,0.904,0.1076,0.0726



🔍 Processing dataset: mice_knn_4


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,11.255,376.0101,17.8051,0.9107,0.4304,0.1067,0.019
br,Bayesian Ridge,12.6825,409.0283,18.5652,0.8991,0.4354,0.1217,0.016
en,Elastic Net,13.8611,466.8673,19.4919,0.8878,0.4437,0.1315,0.021
lasso,Lasso Regression,15.1945,523.599,20.9325,0.8748,0.4638,0.1418,0.017
llar,Lasso Least Angle Regression,15.2107,525.6432,20.9594,0.8744,0.4637,0.1419,0.016
gbr,Gradient Boosting Regressor,16.3437,665.4586,23.4631,0.8525,0.3887,0.1436,0.031
catboost,CatBoost Regressor,16.5111,702.8933,24.4607,0.8439,0.4239,0.1449,0.377
ridge,Ridge Regression,17.2065,637.0756,23.0384,0.8412,0.5031,0.1604,0.016
rf,Random Forest Regressor,16.9719,792.7382,25.2452,0.8312,0.3708,0.1389,0.057
et,Extra Trees Regressor,16.4467,784.6471,25.388,0.8304,0.3187,0.134,0.055


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,11.9982,392.969,19.8234,0.9216,0.0984,0.0621



🔍 Processing dataset: mice_knn_4_distance


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,11.0591,367.8918,17.6699,0.9106,0.3865,0.1052,0.019
br,Bayesian Ridge,12.5014,405.6402,18.4375,0.9024,0.4634,0.1155,0.019
en,Elastic Net,13.5636,459.2525,19.2803,0.8927,0.4347,0.1241,0.016
lasso,Lasso Regression,15.1387,527.6898,20.8198,0.878,0.4668,0.1351,0.017
llar,Lasso Least Angle Regression,15.1399,527.705,20.8201,0.878,0.4667,0.1351,0.017
ridge,Ridge Regression,16.959,636.954,22.643,0.8538,0.4815,0.1515,0.018
gbr,Gradient Boosting Regressor,16.2264,668.8469,23.7587,0.8504,0.3498,0.1415,0.031
lr,Linear Regression,17.6439,710.0642,23.44,0.8416,0.4678,0.1579,0.024
catboost,CatBoost Regressor,16.6522,732.9965,24.9065,0.8378,0.4489,0.1447,0.351
et,Extra Trees Regressor,16.684,788.3734,25.4216,0.8291,0.3171,0.1359,0.048


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,12.3862,414.8762,20.3685,0.9172,0.0966,0.0634



🔍 Processing dataset: knn_5


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,11.4838,373.0608,17.9533,0.9077,0.4186,0.1201,0.019
br,Bayesian Ridge,12.9858,396.6748,18.6063,0.8986,0.4319,0.1364,0.016
en,Elastic Net,13.7002,415.6394,19.0911,0.895,0.4751,0.1408,0.016
lasso,Lasso Regression,14.5655,453.0059,20.1488,0.8872,0.4825,0.1459,0.017
llar,Lasso Least Angle Regression,14.5647,452.9037,20.1476,0.8872,0.4825,0.1459,0.019
gbr,Gradient Boosting Regressor,15.4761,619.2167,22.8783,0.8601,0.3994,0.1349,0.034
ridge,Ridge Regression,17.0515,608.2416,22.7235,0.8549,0.5167,0.1648,0.016
catboost,CatBoost Regressor,15.6303,675.9812,23.7683,0.8509,0.4302,0.1347,0.363
lr,Linear Regression,18.0527,710.9605,23.8247,0.8386,0.5234,0.1717,0.027
rf,Random Forest Regressor,16.2001,746.7124,24.9137,0.8357,0.3613,0.1322,0.065


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,13.0133,458.9906,21.4241,0.9091,0.1042,0.0684



🔍 Processing dataset: knn_5_distance


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,11.4999,378.4649,18.1295,0.9044,0.3876,0.1228,0.019
br,Bayesian Ridge,12.9846,401.6093,18.711,0.8975,0.4301,0.1366,0.017
en,Elastic Net,13.7862,424.316,19.2653,0.8933,0.4683,0.1412,0.034
lasso,Lasso Regression,14.5552,456.6677,20.2226,0.8862,0.477,0.1458,0.017
llar,Lasso Least Angle Regression,14.5576,456.8931,20.2258,0.8861,0.4769,0.1458,0.017
ridge,Ridge Regression,16.6219,562.8243,22.2034,0.8619,0.5084,0.1625,0.379
gbr,Gradient Boosting Regressor,15.5746,638.2687,23.0135,0.8591,0.3703,0.134,0.034
catboost,CatBoost Regressor,15.5995,681.9715,23.9019,0.851,0.4326,0.1342,0.376
lr,Linear Regression,17.6477,640.1075,23.1478,0.8492,0.5202,0.1701,0.027
rf,Random Forest Regressor,16.4614,771.8556,25.2154,0.8308,0.3523,0.1357,0.057


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,12.5524,443.853,21.0678,0.9121,0.1025,0.0662



🔍 Processing dataset: mice_knn_5


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,10.7259,349.7127,17.0167,0.9174,0.2974,0.1007,0.019
br,Bayesian Ridge,12.0233,377.0984,17.642,0.9111,0.3692,0.1122,0.016
en,Elastic Net,13.1277,418.6419,18.362,0.9038,0.4076,0.1222,0.015
lasso,Lasso Regression,14.6597,502.5966,19.9966,0.8884,0.4337,0.1315,0.016
llar,Lasso Least Angle Regression,14.6608,502.6347,19.9974,0.8884,0.4337,0.1315,0.017
ridge,Ridge Regression,16.4216,584.955,21.8207,0.8653,0.4545,0.1531,0.018
lr,Linear Regression,17.1267,644.5716,22.5195,0.8552,0.4628,0.1593,0.025
catboost,CatBoost Regressor,15.8938,678.6878,24.0344,0.8488,0.4341,0.1374,0.432
et,Extra Trees Regressor,15.5972,723.3684,24.4331,0.8431,0.3066,0.1244,0.045
gbr,Gradient Boosting Regressor,16.1926,714.2176,24.5222,0.8369,0.4263,0.1351,0.032


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,13.329,454.2332,21.3127,0.9094,0.1046,0.0694



🔍 Processing dataset: mice_knn_5_distance


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,10.8676,341.6913,17.0667,0.9172,0.321,0.1036,0.02
br,Bayesian Ridge,11.9136,375.8389,17.6038,0.9108,0.3998,0.1103,0.016
en,Elastic Net,12.7554,396.9452,18.0617,0.9067,0.4282,0.1168,0.015
lasso,Lasso Regression,14.2618,468.7436,19.6102,0.8928,0.4425,0.1268,0.017
llar,Lasso Least Angle Regression,14.2615,468.7058,19.6098,0.8928,0.4425,0.1268,0.017
ridge,Ridge Regression,16.0662,563.5427,21.4935,0.8701,0.512,0.143,0.016
lr,Linear Regression,17.016,661.5854,22.5658,0.8544,0.5206,0.1507,0.03
catboost,CatBoost Regressor,16.0979,698.9238,24.1989,0.8467,0.4243,0.1399,0.723
gbr,Gradient Boosting Regressor,16.704,715.957,24.5518,0.8401,0.3928,0.1435,0.031
et,Extra Trees Regressor,16.1554,770.6927,24.9104,0.8382,0.306,0.1314,0.044


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,13.5792,452.3763,21.2691,0.9098,0.1026,0.0707



📊 Final Results:


Unnamed: 0,dataset,model,R2_train,R2_test,RMSE_train,RMSE_test,MAE_train,MAE_test,R2_rounded,RMSE_rounded,MAE_rounded,R2_ceil,RMSE_ceil,MAE_ceil
1,median,Huber Regressor,0.9298,0.951102,15.9132,15.708514,6.8021,8.358915,0.950945,15.733722,8.283333,0.95179,15.597543,8.183333
0,mean,Huber Regressor,0.9267,0.945662,16.1848,16.579682,7.0997,9.138087,0.945557,16.595682,9.083333,0.946789,16.406808,9.083333
5,mice_lr,Huber Regressor,0.9375,0.945629,15.0253,16.366516,8.1957,9.764273,0.945618,16.36816,9.783333,0.946647,16.212649,9.75
3,mice,Huber Regressor,0.944,0.942365,14.1079,17.021567,7.9838,10.729172,0.94269,16.973509,10.7,0.943463,16.858727,10.716667
2,mode,Huber Regressor,0.9407,0.928914,14.7901,18.819001,6.9152,10.91903,0.928918,18.818431,10.933333,0.930387,18.623014,10.65
14,mice_knn_4,Huber Regressor,0.9107,0.921605,17.8051,19.823445,11.255,11.998198,0.921735,19.806985,12.016667,0.92256,19.702369,11.916667
15,mice_knn_4_distance,Huber Regressor,0.9106,0.917191,17.6699,20.36851,11.0591,12.386157,0.917067,20.383817,12.4,0.918414,20.217567,12.25
17,knn_5_distance,Huber Regressor,0.9044,0.912094,18.1295,21.067819,11.4999,12.552424,0.911887,21.092653,12.566667,0.913544,20.89338,12.533333
8,knn_3,Huber Regressor,0.8926,0.910302,19.2868,21.31891,12.5906,14.048378,0.910732,21.267738,14.05,0.910318,21.317051,14.083333
19,mice_knn_5_distance,Huber Regressor,0.9172,0.90977,17.0667,21.269141,10.8676,13.579168,0.909862,21.258332,13.516667,0.910447,21.189227,13.616667


In [46]:
# Show the PyCaret results ordered from the highest to the lowest R2_rounded value.
results_df_2.sort_values('R2_rounded', ascending=False)

Unnamed: 0,dataset,model,R2_train,R2_test,RMSE_train,RMSE_test,MAE_train,MAE_test,R2_rounded,RMSE_rounded,MAE_rounded,R2_ceil,RMSE_ceil,MAE_ceil
1,median,Huber Regressor,0.9298,0.951102,15.9132,15.708514,6.8021,8.358915,0.950945,15.733722,8.283333,0.95179,15.597543,8.183333
5,mice_lr,Huber Regressor,0.9375,0.945629,15.0253,16.366516,8.1957,9.764273,0.945618,16.36816,9.783333,0.946647,16.212649,9.75
0,mean,Huber Regressor,0.9267,0.945662,16.1848,16.579682,7.0997,9.138087,0.945557,16.595682,9.083333,0.946789,16.406808,9.083333
3,mice,Huber Regressor,0.944,0.942365,14.1079,17.021567,7.9838,10.729172,0.94269,16.973509,10.7,0.943463,16.858727,10.716667
2,mode,Huber Regressor,0.9407,0.928914,14.7901,18.819001,6.9152,10.91903,0.928918,18.818431,10.933333,0.930387,18.623014,10.65
14,mice_knn_4,Huber Regressor,0.9107,0.921605,17.8051,19.823445,11.255,11.998198,0.921735,19.806985,12.016667,0.92256,19.702369,11.916667
15,mice_knn_4_distance,Huber Regressor,0.9106,0.917191,17.6699,20.36851,11.0591,12.386157,0.917067,20.383817,12.4,0.918414,20.217567,12.25
17,knn_5_distance,Huber Regressor,0.9044,0.912094,18.1295,21.067819,11.4999,12.552424,0.911887,21.092653,12.566667,0.913544,20.89338,12.533333
8,knn_3,Huber Regressor,0.8926,0.910302,19.2868,21.31891,12.5906,14.048378,0.910732,21.267738,14.05,0.910318,21.317051,14.083333
19,mice_knn_5_distance,Huber Regressor,0.9172,0.90977,17.0667,21.269141,10.8676,13.579168,0.909862,21.258332,13.516667,0.910447,21.189227,13.616667


In [47]:
# for name, (train_df, test_df) in datasets.items():
#     path = f'data/imputed/'
#     os.makedirs(path, exist_ok=True)

#     df = pd.concat([train_df, test_df], axis=0).sort_values(by='date').reset_index(drop=True)

#     df.to_csv(path+f'{name}_df.csv', index=False)

In [48]:
# Stack the train and test partitions into a single frame with a fresh index.
full_df = pd.concat([train_data, test_data], ignore_index=True)

# Separate the two overall totals (the targets) from the remaining columns.
y_full = full_df[['total_number_of_patients', 'total_number_of_emews']]
X_full = full_df.drop(columns=list(y_full.columns))

# Per-zone columns that are summed later to fill in a missing total.
patient_cols = [
    'zone_a_mwr_patients',
    'zone_a__patients',
    'zone_b/c_patients',
]
emews_cols = [
    'zone_a_mwr_sets_of_emews',
    'zone_a__sets_of_emews',
    'zone_b/c_sets_of_emews',
]

In [49]:
def apply_final_imputation(X: pd.DataFrame, imputer) -> pd.DataFrame:
    """Fill missing values in the numeric columns of ``X`` and round them to integers.

    Parameters
    ----------
    X : pd.DataFrame
        Frame to impute. It is left untouched; non-numeric columns are copied
        through unchanged.
    imputer
        Object exposing ``fit_transform``. It is fitted on ``X`` by this call.
        ``KNNImputer`` and ``IterativeImputer`` instances are fitted on
        standardised values, and their output is mapped back to the original
        scale before rounding.

    Returns
    -------
    pd.DataFrame
        Copy of ``X`` in which every imputed numeric column holds rounded
        integers. Numeric columns without a single observed value are not
        imputed and are returned as they were.
    """
    X_final = X.copy()
    num_cols = X.select_dtypes(include='number').columns
    if len(num_cols) == 0:
        # Nothing numeric to impute.
        return X_final

    # scikit-learn imputers drop columns that are entirely missing by default,
    # which would make the result narrower than `num_cols` and break the
    # write-back below. Such columns offer nothing to impute from, so they are
    # left out of the imputation altogether.
    has_observed = X[num_cols].notna().any(axis=0).to_numpy(dtype=bool)
    impute_cols = num_cols[has_observed]
    if len(impute_cols) == 0:
        return X_final

    X_num = X[impute_cols]
    needs_scaling = isinstance(imputer, (KNNImputer, IterativeImputer))

    if needs_scaling:
        # Standardise before imputing and undo it afterwards -- presumably so
        # that no column dominates these imputers purely because of its scale.
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_num)
        X_imputed = imputer.fit_transform(X_scaled)
        X_imputed = scaler.inverse_transform(X_imputed)
    else:
        X_imputed = imputer.fit_transform(X_num)

    # Every imputed column is stored as a whole number, including values that
    # were observed rather than filled in.
    X_imputed = np.round(X_imputed).astype(int)
    X_final[impute_cols] = X_imputed
    return X_final

In [50]:
# One fully imputed frame per strategy, keyed by the strategy name.
imputed_datasets = {}

for name, imputer in imputation_strategies.items():
    print(f"🔧 Imputing with: {name}")

    # Impute the feature columns, then put the two target totals back beside them.
    X_imputed = apply_final_imputation(X_full.copy(), imputer)
    df_imputed = pd.concat([X_imputed, y_full.copy()], axis=1)

    # Where a total is missing, fall back to the sum of its per-zone columns.
    for total_col, zone_cols in (
        ('total_number_of_patients', patient_cols),
        ('total_number_of_emews', emews_cols),
    ):
        zone_sum = df_imputed[zone_cols].sum(axis=1)
        df_imputed[total_col] = df_imputed[total_col].fillna(zone_sum)

    imputed_datasets[name] = df_imputed

🔧 Imputing with: mean
🔧 Imputing with: median
🔧 Imputing with: mode
🔧 Imputing with: mice
🔧 Imputing with: mice_hgb
🔧 Imputing with: mice_lr
🔧 Imputing with: mice_rf
🔧 Imputing with: mice_svr
🔧 Imputing with: knn_3
🔧 Imputing with: knn_3_distance
🔧 Imputing with: mice_knn_3
🔧 Imputing with: mice_knn_3_distance
🔧 Imputing with: knn_4
🔧 Imputing with: knn_4_distance
🔧 Imputing with: mice_knn_4
🔧 Imputing with: mice_knn_4_distance
🔧 Imputing with: knn_5
🔧 Imputing with: knn_5_distance
🔧 Imputing with: mice_knn_5
🔧 Imputing with: mice_knn_5_distance


In [51]:
# Write every imputed frame to its own CSV file under data/imputed2/.
path = 'data/imputed2/'
# The target directory is the same for every dataset, so create it once up front.
os.makedirs(path, exist_ok=True)

for name, dataframe in imputed_datasets.items():
    dataframe.to_csv(os.path.join(path, f'{name}_df.csv'), index=False)