In [37]:
import os
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, GridSearchCV


# Load the CSV file
file_path = os.path.join("..", "Final Data", "FINAL-THESIS-DATA.csv")
df = pd.read_csv(file_path)

In [38]:
# Add lag features for climate and dengue cases
for i in range(1, 3):
    df[f'Lag_{i}_Months_Temperature'] = df.groupby('Barangay')['Temperature'].shift(i).fillna(0)
    df[f'Lag_{i}_Months_Rainfall'] = df.groupby('Barangay')['Rainfall'].shift(i).fillna(0)
    df[f'Lag_{i}_Months_Humidity'] = df.groupby('Barangay')['Humidity'].shift(i).fillna(0)
    df[f'Lag_{i}_Months_Cases'] = df.groupby('Barangay')['Dengue Cases'].shift(i).fillna(0)
    
# Rolling Mean (Trend Features)
df["Cases_Rolling_Mean"] = df.groupby("Barangay")["Dengue Cases"].rolling(3).mean().reset_index(level=0, drop=True)

# Cyclical Encoding for Month
df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)

In [39]:
# Define Features and Target
features = [
    'Temperature', 'Rainfall', 'Humidity', 'x', 'y', 'Year',  'Cases_Rolling_Mean',
    'Month_sin', 'Month_cos'
] + [f'Lag_{i}_Months_Temperature' for i in range(1, 3)] \
  + [f'Lag_{i}_Months_Rainfall' for i in range(1, 3)] \
  + [f'Lag_{i}_Months_Humidity' for i in range(1, 3)] \
  + [f'Lag_{i}_Months_Cases' for i in range(1, 3)]

target = 'Dengue Cases'

# Drop rows with NaN from lag and rolling
df = df.dropna()

In [40]:
from sklearn.model_selection import TimeSeriesSplit

# Set up TimeSeriesSplit with 3 splits
tscv = TimeSeriesSplit(n_splits=3)

for fold, (train_index, test_index) in enumerate(tscv.split(df), 1):
    train_data = df.iloc[train_index]
    test_data = df.iloc[test_index]
    
    # Extract features and target for train and test sets
    x_train = train_data[features]
    y_train = train_data[target]
    x_test = test_data[features]
    y_test = test_data[target]
    
    # Print info about the current fold
    print(f"\nFold {fold}:")
    print(f"Training set: {len(train_data)} rows")
    print(f"From: {train_data.index[0]} To: {train_data.index[-1]}")
    print(f"Testing set: {len(test_data)} rows")
    print(f"From: {test_data.index[0]} To: {test_data.index[-1]}")
    
    # Train and evaluate your models (KNN, XGBoost, Hybrid)



Fold 1:
Training set: 1562 rows
From: 88 To: 1649
Testing set: 1562 rows
From: 1650 To: 3211

Fold 2:
Training set: 3124 rows
From: 88 To: 3211
Testing set: 1562 rows
From: 3212 To: 4773

Fold 3:
Training set: 4686 rows
From: 88 To: 4773
Testing set: 1562 rows
From: 4774 To: 6335


In [41]:
# Define possible values for k (n_neighbors) and CV values to test
k_values = range(3, 15)
cv_values = [5, 10, 15]

# Track best overall result
best_score = -np.inf
best_k = None
best_cv = None
best_model = None

# Store all results for printing
all_cv_results = {}

# Loop through each CV value
for cv in cv_values:
    print(f"\nCross-validation with CV={cv} folds...")

    # Set up grid search
    grid = GridSearchCV(
        estimator=KNeighborsRegressor(metric='euclidean'),
        param_grid={'n_neighbors': list(k_values)},
        cv=cv,
        scoring='r2',
        n_jobs=-1
    )

    # Fit model
    grid.fit(x_train, y_train)

    # Store and print best result for this CV
    best_k_cv = grid.best_params_['n_neighbors']
    best_score_cv = grid.best_score_
    print(f"Best R² Score for CV={cv}: {best_score_cv:.4f} with k={best_k_cv}\n")

    # Save all results in DataFrame
    results = pd.DataFrame(grid.cv_results_)[['param_n_neighbors', 'mean_test_score']]
    results.columns = ['k', f'CV={cv} Mean R²']
    results.set_index('k', inplace=True)
    all_cv_results[f'CV={cv}'] = results

    # Print the full table of results for this CV
    print(results)

    # Update best overall model
    if best_score_cv > best_score:
        best_score = best_score_cv
        best_k = best_k_cv
        best_cv = cv
        best_model = grid.best_estimator_

# Print best overall configuration
print(f"\nBest k: {best_k}, Best CV: {best_cv} (R² Score: {best_score:.4f})")



Cross-validation with CV=5 folds...
Best R² Score for CV=5: 0.6029 with k=4

    CV=5 Mean R²
k               
3       0.601557
4       0.602855
5       0.602103
6       0.597262
7       0.589166
8       0.581022
9       0.576197
10      0.567891
11      0.562766
12      0.557517
13      0.551944
14      0.547723

Cross-validation with CV=10 folds...
Best R² Score for CV=10: 0.5393 with k=5

    CV=10 Mean R²
k                
3        0.532928
4        0.538466
5        0.539310
6        0.535376
7        0.522471
8        0.516742
9        0.516493
10       0.514912
11       0.500259
12       0.486829
13       0.475977
14       0.465906

Cross-validation with CV=15 folds...
Best R² Score for CV=15: 0.5295 with k=4

    CV=15 Mean R²
k                
3        0.516853
4        0.529457
5        0.518429
6        0.511417
7        0.512092
8        0.501002
9        0.498620
10       0.495818
11       0.489014
12       0.484394
13       0.480245
14       0.471766

Best k: 4, Best CV:

In [42]:
# Train final model with best k and best CV
final_kf = KFold(n_splits=best_cv, shuffle=True, random_state=47)
knn_model = KNeighborsRegressor(n_neighbors=best_k, metric='euclidean')
knn_model.fit(x_train, y_train)
print("Final KNN model trained successfully with optimal K-Fold!")

# Predict dengue cases
test_data['Predicted Cases'] = np.round(knn_model.predict(x_test)).astype(int)  # Round to whole numbers

# Select relevant columns for output
output_df = test_data[['Barangay_No', 'Barangay', 'Year', 'Month', 'Dengue Cases', 'Predicted Cases']]

# Print results
print(output_df.to_string(index=False))

Final KNN model trained successfully with optimal K-Fold!
 Barangay_No         Barangay  Year  Month  Dengue Cases  Predicted Cases
          22            PALAO  2020      1             6                6
          23      PANOROGANAN  2020      1             0                0
          24        POBLACION  2020      1            22               14
          25          PUGA-AN  2020      1             0                0
          26         ROGONGON  2020      1             4                1
          27       SAN MIGUEL  2020      1             6                6
          28        SAN ROQUE  2020      1             6                4
          29      SANTA ELENA  2020      1             4                5
          30   SANTA FILOMENA  2020      1             2                3
          31         SANTIAGO  2020      1            10                4
          32    SANTO ROSARIO  2020      1             0                2
          33    SARAY-TIBANGA  2020      1            

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predicted Cases'] = np.round(knn_model.predict(x_test)).astype(int)  # Round to whole numbers


In [43]:
# Model Evaluation
def evaluate_model(actual, predicted):
    mae = mean_absolute_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(actual, predicted)

    print("\nModel Evaluation Metrics:")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"R² Score: {r2:.4f}")

    return mae, mse, rmse, r2

# Evaluate the model
mae, mse, rmse, r2 = evaluate_model(test_data['Dengue Cases'], test_data['Predicted Cases'])



Model Evaluation Metrics:
Mean Absolute Error (MAE): 0.85
Mean Squared Error (MSE): 3.17
Root Mean Squared Error (RMSE): 1.7800
R² Score: 0.6138
