### Importing the Libraries


In [166]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset


In [167]:
dataset = pd.read_csv('dataset/dataset.csv', header = None)
dataset.columns = dataset.iloc[0]
dataset = dataset[1:]
dataset = dataset.drop(columns=['Year'])

### Encoding Categorical Data


In [168]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
dataset['District'] = label_encoder.fit_transform(dataset['District'])

### Handling Missing Values


In [169]:
dataset = dataset.apply(pd.to_numeric, errors='coerce')
# dataset.fillna(dataset.median(), inplace=True)
# dataset.fillna(dataset.mode(), inplace=True)
dataset.fillna(dataset.mean(), inplace=True)
# dataset.fillna(0, inplace=True) # fill with 0
# dataset.ffill(inplace=True)
dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
dataset = dataset.drop(dataset.columns[37:49], axis=1)

### Handling Missing Values : Interpolation

In [170]:
dataset = dataset.apply(pd.to_numeric, errors='coerce')
dataset.interpolate(method='linear', inplace=True)
dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
dataset = dataset.drop(dataset.columns[37:49], axis=1)

### Test/Train Split


In [171]:
from sklearn.model_selection import train_test_split

X = dataset.iloc[:, 0:-1].values
Y = dataset.iloc[:, -1].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

### Hyperparameter Tuning

#### Random Search

In [172]:
# from sklearn.ensemble import AdaBoostRegressor
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.metrics import mean_squared_error


# # Define the parameter grid for Random Search
# param_grid = {
#     'n_estimators': [50, 100, 200, 300, 500],
#     'learning_rate': np.linspace(0.01, 1.0, 10),
#     'estimator__max_depth': [1, 3, 5, 10, 15],  # Tuning the weak learner
#     'estimator__min_samples_split': [2, 5, 10],  # Weak learner split
#     'estimator__min_samples_leaf': [1, 2, 5]
# }

# # Initialize the weak learner (Decision Tree)
# base_estimator = DecisionTreeRegressor(random_state=42)

# # Initialize the AdaBoost Regressor with the weak learner
# ada_model = AdaBoostRegressor(estimator=base_estimator, random_state=42)

# # Initialize RandomizedSearchCV
# random_search = RandomizedSearchCV(
#     estimator=ada_model,
#     param_distributions=param_grid,
#     n_iter=50,  # Number of random combinations to try
#     scoring='neg_mean_squared_error',  # Evaluation metric
#     cv=5,  # 5-fold cross-validation
#     verbose=1,
#     random_state=42,
#     n_jobs=-1  # Utilize all CPU cores
# )

# # Fit RandomizedSearchCV
# random_search.fit(X_train, Y_train)

# # Get the best parameters and model performance
# print(f"Best parameters: {random_search.best_params_}")
# print(f"Best RMSE (CV): {np.sqrt(-random_search.best_score_)}")

# # Evaluate the best model on the test set
# best_model = random_search.best_estimator_
# y_pred = best_model.predict(X_test)
# test_rmse = mean_squared_error(Y_test, y_pred, squared=False)
# print(f"Test RMSE: {test_rmse}")


### Training Model


In [173]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
#base_estimator = DecisionTreeRegressor(max_depth=15, min_samples_split=2, min_samples_leaf=1)
base_estimator = DecisionTreeRegressor(max_depth=10, min_samples_split=5, min_samples_leaf=5)
regressor = AdaBoostRegressor(estimator=base_estimator, n_estimators=200, learning_rate=0.01)
# regressor = AdaBoostRegressor()

regressor.fit(X_train, Y_train)

# Best parameters: {'n_estimators': 200, 'learning_rate': 0.01, 'estimator__min_samples_split': 10, 'estimator__min_samples_leaf': 5, 'estimator__max_depth': 10}



#### Comparing Values


In [174]:
Y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2, suppress=True)
print(np.concatenate((Y_pred.reshape(len(Y_pred),1), Y_test.reshape(len(Y_test),1)),1))


[[75.14 76.51]
 [79.91 80.22]
 [79.46 80.81]
 [76.54 76.91]
 [78.36 77.87]
 [80.05 80.34]
 [80.29 80.58]
 [76.07 79.35]
 [73.48 72.45]
 [80.08 80.58]
 [82.69 82.74]
 [78.84 78.84]
 [78.84 78.84]
 [85.3  85.52]
 [78.84 78.84]
 [78.14 79.68]
 [79.13 80.16]
 [79.02 78.42]
 [79.41 78.87]
 [81.74 81.74]
 [80.35 83.03]
 [74.97 73.86]
 [80.35 83.03]
 [78.84 78.84]
 [78.84 78.84]
 [77.38 78.02]
 [79.82 81.13]
 [78.84 78.84]
 [79.09 79.19]
 [80.04 80.1 ]
 [79.67 81.13]
 [78.84 78.84]
 [81.89 81.81]
 [84.02 84.13]
 [77.18 73.97]
 [78.84 78.84]
 [75.63 75.15]
 [79.6  79.74]
 [80.05 80.86]
 [82.38 82.39]
 [83.98 83.32]
 [75.17 74.74]
 [79.45 81.33]
 [79.02 82.45]
 [78.84 78.84]
 [79.02 80.26]
 [78.84 78.84]
 [77.98 79.1 ]
 [77.04 80.61]
 [75.41 75.54]
 [82.24 81.81]
 [78.27 78.39]
 [78.84 78.84]
 [78.84 78.84]
 [82.39 82.65]
 [83.32 84.45]
 [77.92 78.32]
 [83.84 84.68]
 [72.77 72.75]
 [79.14 79.8 ]
 [74.6  74.61]
 [78.84 78.84]
 [78.94 79.1 ]
 [79.12 79.63]
 [81.51 81.58]
 [84.14 84.77]
 [84.36 84

In [175]:
from sklearn.metrics import r2_score
r2_score(Y_test, Y_pred)

0.8362251436518191

In [176]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true, dtype=float), np.array(y_pred, dtype=float)
    non_zero_indices = y_true != 0
    return np.mean(np.abs((y_true[non_zero_indices] - y_pred[non_zero_indices]) / y_true[non_zero_indices])) * 100


mean_absolute_percentage_error(Y_test, Y_pred)

0.9233720144033435