In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [4]:
dengue = pd.read_csv('Cleaned_Dengue_Data.csv')

print(dengue.head())
print(dengue.info())

    tave   tmin   tmax  heat_index   pr  wind_speed     rh  solar_rad  uv_rad  \
0  24.90  23.05  27.55       25.77  2.2        5.23  74.81     154.55   18.75   
1  24.67  22.73  27.29       25.38  0.0        4.65  72.92     158.34   19.29   
2  24.63  22.76  27.25       25.46  6.4        5.20  75.66     153.93   18.64   
3  25.38  23.15  28.33       26.46  0.0        3.72  74.96     178.14   21.64   
4  25.27  23.12  28.03       26.20  0.0        3.10  73.44     180.83   21.71   

   dentist_nearest  ...  optician_nearest  pharmacy_nearest  pop_count_total  \
0          1461.16  ...           1327.64            153.05         13551.19   
1         10000.00  ...           9998.30            949.48         20156.26   
2          1800.19  ...           1743.96            300.98         51784.80   
3          1461.16  ...           1327.64            153.05         13551.19   
4         10000.00  ...           9998.30            949.48         20156.26   

   pop_density_mean  dentist_cou

In [3]:
X = dengue.iloc[:, :-1]
y = dengue.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Train set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"Total size: {X_train.shape[0] + X_test.shape[0]}")

Train set size: 1050
Test set size: 450
Total size: 1500


In [4]:
knn = KNeighborsRegressor()

param_grid_KNN = {
    'n_neighbors': np.arange(1, 31, 2),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40, 50],
    'metric': ['minkowski', 'manhattan', 'euclidean', 'chebyshev'],
    'p': [1, 2]
}

grid_search = GridSearchCV(knn, param_grid_KNN, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_knn = grid_search.best_estimator_
print("Best Parameters: ", grid_search.best_params_)

Best Parameters:  {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'n_neighbors': 9, 'p': 1, 'weights': 'distance'}


  _data = np.array(data, dtype=dtype, copy=copy,


In [5]:
y_pred = best_knn.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Mean Squared Error: 115.04937089999743
Root Mean Squared Error (RMSE): 10.726106977836713
Mean Absolute Error: 6.4229297072277065
R-squared: 0.23164328564669334


In [6]:
decision_tree = DecisionTreeRegressor()

param_grid_DT = {
    'criterion': ['friedman_mse', 'poisson'],
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'max_leaf_nodes': [None, 10, 20, 50, 100],
    'min_impurity_decrease': [0.0, 0.01, 0.05],
    'ccp_alpha': [0.0, 0.01, 0.05]
}

grid_search = GridSearchCV(decision_tree, param_grid_DT, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_dt = grid_search.best_estimator_
print("Best Parameters: ", grid_search.best_params_)

Best Parameters:  {'ccp_alpha': 0.0, 'criterion': 'poisson', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': 50, 'min_impurity_decrease': 0.01, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [7]:
y_pred = best_dt.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Mean Squared Error: 135.50791877395778
Root Mean Squared Error (RMSE): 11.640786862319823
Mean Absolute Error: 6.538607641335257
R-squared: 0.09501096421888144


In [8]:
random_forest = RandomForestRegressor()

param_grid_RFR = {
    'n_estimators': [100, 200], 
    'max_depth': [None, 10, 20], 
    'min_samples_split': [2, 5], 
    'min_samples_leaf': [1, 2],  
    'max_features': ['sqrt', 'log2'],
    'criterion': ['friedman_mse', 'poisson'], 
    'bootstrap': [True],  
    'max_leaf_nodes': [None, 20, 50], 
    'min_impurity_decrease': [0.0, 0.01],  
    'ccp_alpha': [0.0, 0.01]  
}

grid_search = GridSearchCV(random_forest, param_grid_RFR, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_rfr = grid_search.best_estimator_
print("Best Parameters: ", grid_search.best_params_)

Best Parameters:  {'bootstrap': True, 'ccp_alpha': 0.01, 'criterion': 'poisson', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [9]:
y_pred = best_rfr.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Mean Squared Error: 96.87056335435727
Root Mean Squared Error (RMSE): 9.842284458110184
Mean Absolute Error: 6.070284109835237
R-squared: 0.35305037138182693


Highest value in column case_total: 116.0
Lowest value in column case_total: 0.0
