In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import StackingRegressor
import xgboost as xgb
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
import json
from pathlib import Path


# Load the CSV file
file_path = os.path.join("..", "Final Data", "FINAL-THESIS-DATA.csv")
df = pd.read_csv(file_path)


# Add lag features
for i in range(1, 3):
    df[f'Lag_{i}_Months_Temperature'] = df.groupby('Barangay')['Temperature'].shift(i)
    df[f'Lag_{i}_Months_Rainfall'] = df.groupby('Barangay')['Rainfall'].shift(i)
    df[f'Lag_{i}_Months_Humidity'] = df.groupby('Barangay')['Humidity'].shift(i)
    df[f'Lag_{i}_Months_Cases'] = df.groupby('Barangay')['Dengue Cases'].shift(i)

# Rolling mean
df["Cases_Rolling_Mean"] = df.groupby("Barangay")["Dengue Cases"].rolling(3).mean().reset_index(level=0, drop=True)

# Cyclical encoding
df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)

# Define Features and Target
features = [
    'Temperature', 'Rainfall', 'Humidity', 'x', 'y', 'Year', 'Cases_Rolling_Mean',
    'Month_sin', 'Month_cos'
] + [f'Lag_{i}_Months_Temperature' for i in range(1, 3)] \
  + [f'Lag_{i}_Months_Rainfall' for i in range(1, 3)] \
  + [f'Lag_{i}_Months_Humidity' for i in range(1, 3)] \
  + [f'Lag_{i}_Months_Cases' for i in range(1, 3)]

target = 'Dengue Cases'

# Drop rows with NaN from lag and rolling
df = df.dropna()

# TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3)

for train_index, test_index in tscv.split(df):
    train_data = df.iloc[train_index]
    test_data = df.iloc[test_index]

    x_train = train_data[features]
    y_train = train_data[target]
    x_test = test_data[features]
    y_test = test_data[target]

# Base Estimator: KNN
knn = KNeighborsRegressor(n_neighbors=4, metric='euclidean')

# Apply 5-fold cross-validation (no output or storage)
cross_val_score(knn, x_train, y_train, cv=5, scoring='neg_mean_absolute_error')

# KNN Predictions
knn.fit(x_train, y_train)
test_data['Predicted Cases (KNN)'] = np.round(knn.predict(x_test)).astype(int)

# Hybrid model: KNN + XGBoost
xgb_final = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=700,
    learning_rate=0.01,
    max_depth=6,
    subsample=0.7,
    colsample_bytree=0.8,
    reg_alpha=10,     # L1 regularization
    reg_lambda=10,      # L2 regularization
    random_state=47,
)

hybrid_model = StackingRegressor(
    estimators=[('knn', knn)],
    final_estimator=xgb_final,
    passthrough=True,
    n_jobs=-1
)

hybrid_model.fit(x_train, y_train)
test_data['Predicted Cases (Hybrid)'] = np.round(hybrid_model.predict(x_test)).astype(int)


# Forecast Future Monthly Dengue Cases for January 2023 
future_predictions = []

barangays = df['Barangay'].unique()
for barangay in barangays:
    hist = df[df['Barangay'] == barangay].sort_values(['Year', 'Month']).copy()
    temp_df = hist.tail(3).copy()  # use last 3 months to simulate forward

    last = temp_df.iloc[-1]
    new_month = 1
    new_year = 2023

    # Build input row for January 2023
    recent_cases = temp_df['Dengue Cases'].values[-3:]
    recent_temps = temp_df['Temperature'].values[-3:]
    recent_rain = temp_df['Rainfall'].values[-3:]
    recent_humid = temp_df['Humidity'].values[-3:]

    input_row = {
        'Barangay': barangay,
        'Year': new_year,
        'Month': new_month,
        'Temperature': recent_temps[-1],
        'Rainfall': recent_rain[-1],
        'Humidity': recent_humid[-1],
        'x': last['x'],
        'y': last['y'],
        'Cases_Rolling_Mean': np.mean(recent_cases),
        'Month_sin': np.sin(2 * np.pi * new_month / 12),
        'Month_cos': np.cos(2 * np.pi * new_month / 12),
        'Lag_1_Months_Temperature': recent_temps[-1],
        'Lag_2_Months_Temperature': recent_temps[-2],
        'Lag_1_Months_Rainfall': recent_rain[-1],
        'Lag_2_Months_Rainfall': recent_rain[-2],
        'Lag_1_Months_Humidity': recent_humid[-1],
        'Lag_2_Months_Humidity': recent_humid[-2],
        'Lag_1_Months_Cases': recent_cases[-1],
        'Lag_2_Months_Cases': recent_cases[-2],
    }

    X_input = pd.DataFrame([input_row])[features]
    predicted = hybrid_model.predict(X_input)[0]
    predicted_rounded = np.round(predicted).astype(int)

    # Add prediction and risk label
    input_row['Dengue Cases'] = predicted
    input_row['Predicted Dengue Cases'] = predicted_rounded


    # Assign risk label based on final rule
    if predicted_rounded == 0:
        input_row['Risk Label'] = 'Low'
    elif 1 <= predicted_rounded <= 12:
        input_row['Risk Label'] = 'Medium'
    else:
        input_row['Risk Label'] = 'High'


    future_predictions.append(input_row)

# Combine and sort results
future_df = pd.DataFrame(future_predictions)
output_str = future_df[['Barangay', 'Year', 'Month', 'Predicted Dengue Cases', 'Risk Label']].sort_values(['Barangay']).to_string(index=False)
print(output_str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predicted Cases (KNN)'] = np.round(knn.predict(x_test)).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predicted Cases (Hybrid)'] = np.round(hybrid_model.predict(x_test)).astype(int)


        Barangay  Year  Month  Predicted Dengue Cases Risk Label
           ABUNO  2023      1                       1     Medium
           ACMAC  2023      1                       1     Medium
   BAGONG SILANG  2023      1                       1     Medium
        BONBONON  2023      1                       0        Low
         BUNAWAN  2023      1                       0        Low
         BURU-UN  2023      1                       3     Medium
        DALIPUGA  2023      1                       1     Medium
      DEL CARMEN  2023      1                       4     Medium
       DIGKILAAN  2023      1                       0        Low
       DITUCALAN  2023      1                       0        Low
           DULAG  2023      1                       0        Low
      HINAPLANON  2023      1                       8     Medium
         HINDANG  2023      1                       0        Low
      KABACSANAN  2023      1                       0        Low
      KALILANGAN  2023   

In [2]:
# Select only relevant columns from test_data for hybrid model
test_data_export = test_data.copy()
test_data_export = test_data_export[[
    'Barangay', 'Year', 'Month', 'Dengue Cases', 'Predicted Cases (Hybrid)'
]]

# Prepare future_df to match test_data_export format
future_export = future_df[[
    'Barangay', 'Year', 'Month', 'Predicted Dengue Cases'
]].rename(columns={
    'Predicted Dengue Cases': 'Predicted Cases (Hybrid)'
})

# Add placeholder for actual dengue cases (unknown in future)
future_export['Dengue Cases'] = np.nan

# Reorder columns to match test_data_export
future_export = future_export[[
    'Barangay', 'Year', 'Month', 'Dengue Cases', 'Predicted Cases (Hybrid)'
]]

# Combine and save
combined_df = pd.concat([test_data_export, future_export], ignore_index=True)
combined_df.to_csv("Hybrid_test_and_2023-01-predictions.csv", index=False)

# Notify after saving
print("Saved done.")


Saved done.


In [3]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluation on historical test data
mae = mean_absolute_error(y_test, test_data['Predicted Cases (Hybrid)'])
rmse = np.sqrt(mean_squared_error(y_test, test_data['Predicted Cases (Hybrid)']))
r2 = r2_score(y_test, test_data['Predicted Cases (Hybrid)'])

print("\nðŸ“Š Hybrid Model Evaluation on Historical Test Set:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"RÂ²: {r2:.2f}")



ðŸ“Š Hybrid Model Evaluation on Historical Test Set:
MAE: 0.40
RMSE: 0.97
RÂ²: 0.88


In [4]:
import pandas as pd
import json
from pathlib import Path

# Load CSVs
historical_df = pd.read_csv("actual_cases_2011_to_2019.csv")
future_df = pd.read_csv("Hybrid_test_and_2023-01-predictions.csv")

# Normalize barangay names
historical_df['Barangay'] = historical_df['Barangay'].str.strip().str.upper()
future_df['Barangay'] = future_df['Barangay'].str.strip().str.upper()

# Get average thresholds from historical data
average_per_range = {'Low': [], 'Medium': [], 'High': []}

for _, group in historical_df.groupby('Year'):
    x = group['Dengue Cases'].mean()
    s = group['Dengue Cases'].std()
    upper = x + 1.5 * s

    low_avg = group[group['Dengue Cases'] < x]['Dengue Cases'].mean() if not group[group['Dengue Cases'] < x].empty else 0
    medium_avg = group[(group['Dengue Cases'] >= x) & (group['Dengue Cases'] <= upper)]['Dengue Cases'].mean() if not group[(group['Dengue Cases'] >= x) & (group['Dengue Cases'] <= upper)].empty else 0
    high_avg = group[group['Dengue Cases'] > upper]['Dengue Cases'].mean() if not group[group['Dengue Cases'] > upper].empty else 0

    average_per_range['Low'].append(low_avg)
    average_per_range['Medium'].append(medium_avg)
    average_per_range['High'].append(high_avg)

avg_low = round(sum(average_per_range['Low']) / len(average_per_range['Low']))
avg_high = round(sum(average_per_range['High']) / len(average_per_range['High']))

# Get coordinates
if 'y' in historical_df.columns and 'x' in historical_df.columns:
    barangay_coords = historical_df.drop_duplicates(subset='Barangay')[['Barangay', 'y', 'x']]
    barangay_coords = barangay_coords.rename(columns={'y': 'lat', 'x': 'lng'})
    barangay_coords = barangay_coords.set_index('Barangay').to_dict(orient='index')
else:
    barangay_coords = {}

# Filter and convert 2020â€“2023 data
final_json_data = []

for _, row in future_df.iterrows():
    year = int(row['Year'])
    if year >= 2020:
        predicted = row['Predicted Cases (Hybrid)']
        actual_cases = int(row['Dengue Cases']) if not pd.isna(row['Dengue Cases']) else None
        barangay = row['Barangay']
        coords = barangay_coords.get(barangay, {'lat': None, 'lng': None})
        month = int(row['Month'])

        # Risk level assignment
        if predicted < avg_low:
            risk = "Low"
        elif predicted < avg_high:
            risk = "Medium"
        else:
            risk = "High"

        final_json_data.append({
            "name": barangay,
            "predictedCases_Hybrid": predicted,
            "actualCases": actual_cases,
            "riskLevel": risk,
            "lat": coords['lat'],
            "lng": coords['lng'],
            "month": month,
            "year": year
        })

# Save to JSON
save_dir = Path.cwd() / "map"
save_dir.mkdir(parents=True, exist_ok=True)
output_path = save_dir / "Hybrid_2020_to_2023_only.json"

with open(output_path, "w") as f:
    json.dump(final_json_data, f, indent=4)

print("âœ… JSON file saved:", output_path)


âœ… JSON file saved: c:\Users\Kristine\KNN-XGBOOST_DENGUE_ML\THREE IN ONE\map\Hybrid_2020_to_2023_only.json
