In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

typhoid = pd.read_csv('Cleaned_Typhoid_Data.csv')

print(typhoid.head())
print(typhoid.info())

   Unnamed: 0    tave    tmin    tmax  heat_index     pr  wind_speed      rh  \
0          36  25.556  23.066  28.834      26.434  0.480       3.714  68.282   
1          37  24.922  22.712  27.774      25.506  0.286       3.138  67.308   
2          38  25.180  22.716  28.438      25.976  0.498       3.692  68.996   
3          39  26.608  23.820  30.206      28.102  0.516       2.912  72.240   
4          40  26.208  23.546  29.464      27.344  0.468       2.182  70.792   

   solar_rad  uv_rad  ...  wind_speed-3.0     rh-1.0     rh-2.0  rh-3.0  \
0    190.046  22.378  ...           2.064  78.245000  78.820000  78.188   
1    197.286  23.014  ...           1.524  77.398333  78.062000  78.890   
2    189.620  22.312  ...           2.070  78.933333  79.504000  78.668   
3    196.138  23.314  ...           3.084  68.282000  78.245000  78.820   
4    197.318  23.300  ...           2.536  67.308000  77.398333  78.062   

   solar_rad-1.0  solar_rad-2.0  solar_rad-3.0  uv_rad-1.0  uv_rad-2

In [2]:
X = typhoid.iloc[:, :-1]
y = typhoid.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Train set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"Total size: {X_train.shape[0] + X_test.shape[0]}")

Train set size: 226
Test set size: 98
Total size: 324


In [3]:
knn = KNeighborsRegressor()

param_grid_KNN = {
    'n_neighbors': np.arange(1, 31, 2),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40, 50],
    'metric': ['minkowski', 'manhattan', 'euclidean', 'chebyshev'],
    'p': [1, 2]
}

grid_search = GridSearchCV(knn, param_grid_KNN, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_knn = grid_search.best_estimator_
print("Best Parameters: ", grid_search.best_params_)

y_pred = best_knn.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Best Parameters:  {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
Mean Squared Error: 8.325132132434181
Root Mean Squared Error (RMSE): 2.8853305066203734
Mean Absolute Error: 2.253269744048722
R-squared: 0.49513849034435964


In [4]:
decision_tree = DecisionTreeRegressor()

param_grid_DT = {
    'criterion': ['friedman_mse', 'poisson'],
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'max_leaf_nodes': [None, 10, 20, 50, 100],
    'min_impurity_decrease': [0.0, 0.01, 0.05],
    'ccp_alpha': [0.0, 0.01, 0.05]
}

grid_search = GridSearchCV(decision_tree, param_grid_DT, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_dt = grid_search.best_estimator_
print("Best Parameters: ", grid_search.best_params_)

y_pred = best_dt.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Best Parameters:  {'ccp_alpha': 0.0, 'criterion': 'poisson', 'max_depth': 10, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 5}
Mean Squared Error: 0.16854206997984347
Root Mean Squared Error (RMSE): 0.4105387557586293
Mean Absolute Error: 0.30540816326530607
R-squared: 0.9897790926874297


In [5]:
random_forest = RandomForestRegressor()

param_grid_RFR = {
    'n_estimators': [100, 200],  
    'max_depth': [None, 10, 20],  
    'min_samples_split': [2, 5],  
    'min_samples_leaf': [1, 2], 
    'max_features': ['sqrt', 'log2'],  
    'criterion': ['friedman_mse', 'poisson'], 
    'bootstrap': [True],  #
    'max_leaf_nodes': [None, 20, 50],  
    'min_impurity_decrease': [0.0, 0.01],  
    'ccp_alpha': [0.0, 0.01]  
}

grid_search = GridSearchCV(random_forest, param_grid_RFR, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_rfr = grid_search.best_estimator_
print("Best Parameters: ", grid_search.best_params_)

y_pred = best_rfr.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Best Parameters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'poisson', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Mean Squared Error: 0.43144377563319725
Root Mean Squared Error (RMSE): 0.6568437985040259
Mean Absolute Error: 0.4854157312925212
R-squared: 0.9738359280750518
