In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
dengue = pd.read_csv('Cleaned_Dengue_Data.csv')

print(dengue.head())
print(dengue.info())

     tave    tmin    tmax  heat_index     pr  wind_speed      rh  solar_rad  \
0  25.556  23.066  28.834      26.434  0.480       3.714  68.282    190.046   
1  24.922  22.712  27.774      25.506  0.286       3.138  67.308    197.286   
2  25.180  22.716  28.438      25.976  0.498       3.692  68.996    189.620   
3  26.608  23.820  30.206      28.102  0.516       2.912  72.240    196.138   
4  26.208  23.546  29.464      27.344  0.468       2.182  70.792    197.318   

   uv_rad  dentist_nearest  ...  wind_speed-3.0     rh-1.0     rh-2.0  rh-3.0  \
0  22.378          10000.0  ...           2.064  78.245000  78.820000  78.188   
1  23.014          10000.0  ...           1.524  77.398333  78.062000  78.890   
2  22.312          10000.0  ...           2.070  78.933333  79.504000  78.668   
3  23.314          10000.0  ...           3.084  68.282000  78.245000  78.820   
4  23.300          10000.0  ...           2.536  67.308000  77.398333  78.062   

   solar_rad-1.0  solar_rad-2.0  solar

In [3]:
X = dengue.iloc[:, :-1]
y = dengue.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Train set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"Total size: {X_train.shape[0] + X_test.shape[0]}")

Train set size: 226
Test set size: 98
Total size: 324


In [4]:
knn = KNeighborsRegressor()

param_grid_KNN = {
    'n_neighbors': np.arange(1, 31, 2),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40, 50],
    'metric': ['minkowski', 'manhattan', 'euclidean', 'chebyshev'],
    'p': [1, 2]
}

grid_search = GridSearchCV(knn, param_grid_KNN, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_knn = grid_search.best_estimator_
print("Best Parameters: ", grid_search.best_params_)

Best Parameters:  {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'n_neighbors': 3, 'p': 1, 'weights': 'distance'}


In [5]:
y_pred = best_knn.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Mean Squared Error: 8.344262830551994
Root Mean Squared Error (RMSE): 2.8886437701025014
Mean Absolute Error: 2.2577451448743107
R-squared: 0.4939783462194517


In [6]:
decision_tree = DecisionTreeRegressor()

param_grid_DT = {
    'criterion': ['friedman_mse', 'poisson'],
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'max_leaf_nodes': [None, 10, 20, 50, 100],
    'min_impurity_decrease': [0.0, 0.01, 0.05],
    'ccp_alpha': [0.0, 0.01, 0.05]
}

grid_search = GridSearchCV(decision_tree, param_grid_DT, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_dt = grid_search.best_estimator_
print("Best Parameters: ", grid_search.best_params_)

Best Parameters:  {'ccp_alpha': 0.0, 'criterion': 'poisson', 'max_depth': 10, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 5}


In [7]:
y_pred = best_dt.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Mean Squared Error: 0.1689725256676747
Root Mean Squared Error (RMSE): 0.411062678514694
Mean Absolute Error: 0.30353798185941083
R-squared: 0.9897529885361752


In [8]:
random_forest = RandomForestRegressor()

param_grid_RFR = {
    'n_estimators': [100, 200], 
    'max_depth': [None, 10, 20], 
    'min_samples_split': [2, 5], 
    'min_samples_leaf': [1, 2],  
    'max_features': ['sqrt', 'log2'],
    'criterion': ['friedman_mse', 'poisson'], 
    'bootstrap': [True],  
    'max_leaf_nodes': [None, 20, 50], 
    'min_impurity_decrease': [0.0, 0.01],  
    'ccp_alpha': [0.0, 0.01]  
}

grid_search = GridSearchCV(random_forest, param_grid_RFR, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_rfr = grid_search.best_estimator_
print("Best Parameters: ", grid_search.best_params_)

Best Parameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'max_depth': 20, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [9]:
y_pred = best_rfr.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Mean Squared Error: 0.49717047696690025
Root Mean Squared Error (RMSE): 0.7051031676052095
Mean Absolute Error: 0.5261904761904806
R-squared: 0.9698500596068818
