# In [None]:
# AdaBoost Model for Dengue Prediction

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor

# Importing the dataset
# The CSV is read without a header row, so its first line arrives as ordinary
# data; that line is then promoted to the column labels and excluded from the
# body of the table.
raw_table = pd.read_csv('/home/anik/CSE445-Project/DenguePrediction/dataset/dataset.csv', header=None)
raw_table.columns = raw_table.iloc[0]
# Every row after the label row is kept; the 'Year' column is removed.
dataset = raw_table[1:].drop(columns=['Year'])

# Encoding categorical data
# 'District' is replaced by integer codes; this happens before the numeric
# coercion below, which would otherwise turn non-numeric text into NaN.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(dataset['District'])
dataset['District'] = label_encoder.transform(dataset['District'])

# Handling missing data
# Coerce every column to a numeric dtype; values that cannot be parsed
# become NaN.
dataset = dataset.apply(lambda column: pd.to_numeric(column, errors='coerce'))
# Fill gaps with each column's most frequent value (first row of mode()).
# Other strategies previously tried here: forward fill, column mean,
# column median.
most_frequent = dataset.mode().iloc[0]
dataset = dataset.fillna(most_frequent)

# Build the target: the row-wise sum of the columns at positions 37..48
# (presumably the individual case-count columns - TODO confirm against the
# CSV layout). The new 'Total Cases' column is added and the source columns
# are then dropped.
case_block = dataset.iloc[:, 37:49]
dataset['Total Cases'] = case_block.sum(axis=1)
dataset = dataset.drop(columns=case_block.columns)

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
# Features are every column except the last one; the last column is the
# target.
X = dataset.iloc[:, :-1].to_numpy()
Y = dataset.iloc[:, -1].to_numpy()
# 80/20 split with a fixed seed so the partition is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=0,
)



# Hyperparameter tuning(Grid Search)
from sklearn.model_selection import GridSearchCV
# parameters = {
#     'n_estimators': [50, 100, 150, 200, 250],
#     'learning_rate': [0.01, 0.1, 0.5, 1],
#     'loss': ['linear', 'square', 'exponential'],
# }
# grid_search = GridSearchCV(estimator = AdaBoostRegressor(), param_grid = parameters, scoring = 'r2', cv = 10, n_jobs = -1)
# grid_search = grid_search.fit(X_train, Y_train)
# best_parameters = grid_search.best_params_

# Predict on the test set(grid search)
#Y_pred = grid_search.best_estimator_.predict(X_test)
#Y_pred = AdaBoostRegressor().fit(X_train, Y_train).predict(X_test)

# Hyperparameter tuning (Random Search)
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Search space sampled by the randomized search.
parameters = {
    'n_estimators': randint(50, 250),  # integers drawn from [50, 250)
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'loss': ['linear', 'square', 'exponential'],
}
# 100 sampled configurations, each scored by 10-fold cross-validated R2.
# Both the search and the estimator are seeded (the train/test split is
# seeded the same way), so repeated runs give the same best parameters and
# the same metrics.
random_search = RandomizedSearchCV(
    estimator=AdaBoostRegressor(random_state=0),
    param_distributions=parameters,
    n_iter=100,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    random_state=0,
)
random_search = random_search.fit(X_train, Y_train)
best_parameters = random_search.best_params_

# Predict on the test set (random search)
# Use the tuned estimator (refit on the full training set by the search) so
# that the evaluated predictions come from the same model that the SHAP and
# LIME sections explain via random_search.best_estimator_.
Y_pred = random_search.best_estimator_.predict(X_test)
# Untuned baseline, for comparison only:
# Y_pred = AdaBoostRegressor().fit(X_train, Y_train).predict(X_test)

# Hyperparameter tuning(Bayesian Optimization)
from skopt import BayesSearchCV
# parameters = {
#     'n_estimators': (50, 250),
#     'learning_rate': (0.01, 1.0, 'log-uniform'),
#     'loss': ['linear', 'square', 'exponential'],
# }
# bayes_search = BayesSearchCV(estimator = AdaBoostRegressor(), search_spaces = parameters, n_iter = 100, scoring = 'r2', cv = 10, n_jobs = -1)
# bayes_search = bayes_search.fit(X_train, Y_train)
# best_parameters = bayes_search.best_params_

# Predict on the test set(bayesian optimization)
# Y_pred = bayes_search.best_estimator_.predict(X_test)


# Evaluate the model
# Report the coefficient of determination and the mean squared error of the
# test-set predictions, one metric per line.
from sklearn.metrics import r2_score, mean_squared_error
r2_value = r2_score(Y_test, Y_pred)
print(f"R2 Score: {r2_value}")
mse_value = mean_squared_error(Y_test, Y_pred)
print(f"MSE: {mse_value}")

# Explaining the model with SHAP
import shap
# The tuned estimator's predict function is explained; feature labels are all
# column names except the last (target) column.
tuned_predict = random_search.best_estimator_.predict
feature_labels = dataset.columns[:-1]
# NOTE(review): the whole training matrix is passed as background data.
# KernelExplainer is model-agnostic and its run time grows with the size of
# the background set - consider a summarised background (shap.sample /
# shap.kmeans) if this step is slow.
explainer = shap.KernelExplainer(tuned_predict, X_train)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, feature_names=feature_labels)

# Explaining the model with LIME
import lime
import lime.lime_tabular
# Tabular explainer in regression mode, fitted on the training matrix and
# labelled with every column name except the last (target) column.
feature_labels = dataset.columns[:-1]
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train,
    mode='regression',
    feature_names=feature_labels,
)
# Index of the test row to explain.
i = 0
# Explain the tuned estimator's prediction for that row using the 5 most
# influential features, then render the result in the notebook.
tuned_predict = random_search.best_estimator_.predict
exp = explainer.explain_instance(X_test[i], tuned_predict, num_features=5)
exp.show_in_notebook(show_table=True)


