# In [None]:  (Jupyter cell prompt left over from the notebook export)
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt

# Read the raw dengue table that every experiment below starts from
dengue_data = pd.read_csv(filepath_or_buffer='dataset/dataset.csv')

# Missing-value handling strategies, keyed by a short name.
# Each value is either a scikit-learn imputer or, for 'ffill' / 'bfill', the
# bare strategy name: those two are applied with the DataFrame's own
# ffill()/bfill() methods instead of an imputer object.
# Keys must be unique -- a repeated key in a dict literal silently overwrites
# the earlier entry.
missing_value_strategies = {
    'constant_0': SimpleImputer(strategy='constant', fill_value=0),
    'mean': SimpleImputer(strategy='mean'),
    'median': SimpleImputer(strategy='median'),
    'mode': SimpleImputer(strategy='most_frequent'),
    'ffill': 'ffill',
    'bfill': 'bfill',
}

# Hold-out fractions and RNG seeds tried for every imputation strategy
test_sizes, random_states = [0.2, 0.25], [0, 42]

# Running record of the best combination seen so far; the sentinels guarantee
# that the first evaluated combination replaces them
best_combination = None
best_mse, best_r2 = np.inf, -np.inf

# One dict per evaluated combination is appended here
results = list()

# July-November columns are summed into the target and the lag feature; the
# remaining month columns are dropped from the feature matrix alongside them.
season_months = ['July', 'August', 'September', 'October', 'November']
other_months = ['January', 'February', 'March', 'April', 'May', 'June', 'December']

# Every (test size, random state) pair, in the same order nested loops would give
split_settings = [(size, seed) for size in test_sizes for seed in random_states]

# Evaluate every combination of imputation strategy, test size and random state
for strategy_name, strategy in missing_value_strategies.items():
    # Start each strategy from an untouched copy of the raw data
    data = dengue_data.copy()

    # Fill missing values: pandas fills for 'ffill'/'bfill', an imputer otherwise
    if strategy_name == 'ffill':
        data.ffill(inplace=True)
    elif strategy_name == 'bfill':
        data.bfill(inplace=True)
    else:
        # The imputer is applied to every column from the third one onward
        # (presumably the first two are identifier columns -- confirm)
        data.iloc[:, 2:] = strategy.fit_transform(data.iloc[:, 2:])

    # Replace district names with integer codes
    label_encoder = LabelEncoder()
    data['District'] = label_encoder.fit_transform(data['District'])

    # Target: total cases over July-November for each row
    data['Target_July_to_November'] = data[season_months].sum(axis=1)

    # Lag feature: July-November total of the preceding row within each district
    # NOTE(review): this is the previous year only if rows are ordered by year
    # inside each district -- confirm against the dataset
    previous_rows = data.groupby('District')[season_months].shift(1)
    data['Previous_Year_July_to_November'] = previous_rows.sum(axis=1).fillna(0)

    # Features are whatever remains once the month columns and target are removed
    features = data.drop(columns=other_months + season_months + ['Target_July_to_November'])
    features['Previous_Year_July_to_November'] = data['Previous_Year_July_to_November']
    target = data['Target_July_to_November']

    # Zero-fill anything the chosen strategy left missing
    features = features.fillna(0)
    target = target.fillna(0)

    for test_size, random_state in split_settings:
        # Hold out a test set
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=test_size, random_state=random_state
        )

        # Fit the regressor on the training portion
        model = AdaBoostRegressor(n_estimators=100, random_state=random_state)
        model.fit(X_train, y_train)

        # Score on the held-out portion
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Log this run
        record = {
            'Strategy': strategy_name,
            'Test_Size': test_size,
            'Random_State': random_state,
            'MSE': mse,
            'R2': r2
        }
        results.append(record)

        # Keep the run with the lowest MSE; a higher R2 breaks exact ties
        lower_mse = mse < best_mse
        tie_with_higher_r2 = mse == best_mse and r2 > best_r2
        if lower_mse or tie_with_higher_r2:
            best_combination = dict(record)
            best_mse, best_r2 = mse, r2

# Tabulate every evaluated combination, lowest MSE first
results_df = pd.DataFrame(results).sort_values(by='MSE')

# Print the full table, then the winning combination
print("All combinations evaluated:", results_df, sep="\n")
print("\nBest combination:", best_combination, sep="\n")

# Rebuild the modelling table from the raw data using the best combination
data = dengue_data.copy()

# Apply the best strategy.
# The names tested here must match the keys of missing_value_strategies
# ('ffill' / 'bfill'). Those two entries map to plain strings rather than
# imputer objects, so falling through to the imputer branch for them would
# fail on `.fit_transform`.
strategy_name = best_combination['Strategy']
if strategy_name == 'ffill':
    data.ffill(inplace=True)
elif strategy_name == 'bfill':
    data.bfill(inplace=True)
else:
    imputer = missing_value_strategies[strategy_name]
    # The imputer is applied to every column from the third one onward
    data.iloc[:, 2:] = imputer.fit_transform(data.iloc[:, 2:])

# Encode categorical data with a fresh encoder; the same object is used later
# to decode district labels for plotting
label_encoder = LabelEncoder()
data['District'] = label_encoder.fit_transform(data['District'])

# Target: total cases over July-November for each row
data['Target_July_to_November'] = data[['July', 'August', 'September', 'October', 'November']].sum(axis=1)

# Lag feature: July-November total of the preceding row within each district
data['Previous_Year_July_to_November'] = data.groupby('District')[
    ['July', 'August', 'September', 'October', 'November']
].shift(1).sum(axis=1).fillna(0)

# Features are whatever remains once the month columns and target are removed
features = data.drop(columns=[
    'January', 'February', 'March', 'April', 'May', 'June', 'December',
    'July', 'August', 'September', 'October', 'November', 'Target_July_to_November'
])
features['Previous_Year_July_to_November'] = data['Previous_Year_July_to_November']
target = data['Target_July_to_November']

# NOTE(review): unlike the search loop, remaining missing values are NOT
# zero-filled here (the features/target fillna(0) calls were disabled).
# Confirm the final estimator is meant to receive NaNs.

# Split data using the best parameters
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=best_combination['Test_Size'], random_state=best_combination['Random_State']
)

# Train the final model.
# Alternative estimators left disabled in this script: AdaBoostRegressor,
# RandomForestRegressor and GradientBoostingRegressor, each with
# n_estimators=100 and random_state=best_combination['Random_State'].

# Hyperparameter tuning for XGBRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Exhaustive grid: 4 * 4 * 5 * 5 * 5 * 4 = 8000 candidates, each cross-validated
# on 5 folds
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
    'max_depth': [3, 4, 5, 6, 7],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'min_child_weight': [1, 2, 3, 4]
}
xgb = XGBRegressor()
grid_search = GridSearchCV(xgb, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

# With the default refit=True, GridSearchCV has already retrained the best
# candidate on all of X_train, so best_estimator_ is used as-is; fitting it
# again would only repeat that work.
final_model = grid_search.best_estimator_

# Predict on the held-out split with the tuned model
y_pred = final_model.predict(X_test)

# Score those predictions with the same two metrics used during the search
final_mse, final_r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

print(
    "\nFinal Model Evaluation:",
    f"Mean Squared Error (MSE): {final_mse}",
    f"R-squared (R²): {final_r2}",
    sep="\n",
)

# Visualize predictions vs actual cases: one row per test sample
district_predictions = pd.DataFrame({
    'District': X_test['District'],
    'Predicted_Cases': y_pred,
    'Actual_Cases': y_test.values
}).reset_index(drop=True)

# Decode district labels back to their original names
district_predictions['District'] = label_encoder.inverse_transform(district_predictions['District'])

# Create the figure once and hand its axes to pandas. Without an explicit
# `ax`, DataFrame.plot opens a figure of its own, so a preceding plt.figure()
# call would leave a stray empty figure that plt.show() also displays.
fig, ax = plt.subplots(figsize=(14, 8))
district_predictions.sort_values('Actual_Cases').plot(
    kind='bar', x='District', y=['Predicted_Cases', 'Actual_Cases'], ax=ax
)
ax.set_title('Predicted vs Actual Dengue Cases (July to November) by District', fontsize=16)
ax.set_xlabel('District', fontsize=12)
ax.set_ylabel('Number of Cases (July to November)', fontsize=12)
ax.legend(['Predicted Cases', 'Actual Cases'], fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=90)
fig.tight_layout()
plt.show()
