### Importing the Libraries


In [11]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset


In [12]:
dataset = pd.read_csv('dataset/dataset.csv', header = None)
dataset.columns = dataset.iloc[0]
dataset = dataset[1:]
dataset = dataset.drop(columns=['Year'])

### Encoding Categorical Data


In [13]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
dataset['District'] = label_encoder.fit_transform(dataset['District'])

### Handling Missing Values


In [14]:
dataset = dataset.apply(pd.to_numeric, errors='coerce')
dataset.fillna(dataset.median(), inplace=True)
dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
dataset = dataset.drop(dataset.columns[37:49], axis=1)

### Handling Missing Values : Interpolation

In [15]:
dataset = dataset.apply(pd.to_numeric, errors='coerce')
dataset.interpolate(method='linear', inplace=True)
dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
dataset = dataset.drop(dataset.columns[37:49], axis=1)

### Test/Train Split


In [16]:
from sklearn.model_selection import train_test_split

X = dataset.iloc[:, 0:-1].values
Y = dataset.iloc[:, -1].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

### Hyperparameter Tuning

#### Random Search

In [17]:
# from sklearn.ensemble import AdaBoostRegressor
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.metrics import mean_squared_error


# # Define the parameter grid for Random Search
# param_grid = {
#     'n_estimators': [50, 100, 200, 300, 500],
#     'learning_rate': np.linspace(0.01, 1.0, 10),
#     'estimator__max_depth': [1, 3, 5, 10, 15],  # Tuning the weak learner
#     'estimator__min_samples_split': [2, 5, 10],  # Weak learner split
#     'estimator__min_samples_leaf': [1, 2, 5]
# }

# # Initialize the weak learner (Decision Tree)
# base_estimator = DecisionTreeRegressor(random_state=42)

# # Initialize the AdaBoost Regressor with the weak learner
# ada_model = AdaBoostRegressor(estimator=base_estimator, random_state=42)

# # Initialize RandomizedSearchCV
# random_search = RandomizedSearchCV(
#     estimator=ada_model,
#     param_distributions=param_grid,
#     n_iter=50,  # Number of random combinations to try
#     scoring='neg_mean_squared_error',  # Evaluation metric
#     cv=5,  # 5-fold cross-validation
#     verbose=1,
#     random_state=42,
#     n_jobs=-1  # Utilize all CPU cores
# )

# # Fit RandomizedSearchCV
# random_search.fit(X_train, Y_train)

# # Get the best parameters and model performance
# print(f"Best parameters: {random_search.best_params_}")
# print(f"Best RMSE (CV): {np.sqrt(-random_search.best_score_)}")

# # Evaluate the best model on the test set
# best_model = random_search.best_estimator_
# y_pred = best_model.predict(X_test)
# test_rmse = mean_squared_error(Y_test, y_pred, squared=False)
# print(f"Test RMSE: {test_rmse}")


### Training Model


In [18]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
#base_estimator = DecisionTreeRegressor(max_depth=15, min_samples_split=2, min_samples_leaf=1)
base_estimator = DecisionTreeRegressor(max_depth=10, min_samples_split=5, min_samples_leaf=5)
regressor = AdaBoostRegressor(estimator=base_estimator, n_estimators=200, learning_rate=0.01)

regressor.fit(X_train, Y_train)

# Best parameters: {'n_estimators': 200, 'learning_rate': 0.01, 'estimator__min_samples_split': 10, 'estimator__min_samples_leaf': 5, 'estimator__max_depth': 10}



#### Comparing Values


In [19]:
Y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2, suppress=True)
print(np.concatenate((Y_pred.reshape(len(Y_pred),1), Y_test.reshape(len(Y_test),1)),1))


[[75.07 76.51]
 [79.71 80.22]
 [79.37 80.81]
 [76.49 76.91]
 [78.16 77.87]
 [79.97 80.34]
 [79.83 80.58]
 [75.81 79.35]
 [73.43 72.45]
 [79.56 80.58]
 [82.7  82.74]
 [79.39 79.39]
 [79.39 79.39]
 [85.32 85.52]
 [79.39 79.39]
 [78.88 79.68]
 [78.83 80.16]
 [79.4  78.42]
 [79.29 78.87]
 [81.74 81.74]
 [78.58 83.03]
 [75.02 73.86]
 [78.58 83.03]
 [79.39 79.39]
 [79.39 79.39]
 [77.14 78.02]
 [79.62 81.13]
 [79.39 79.39]
 [79.27 79.19]
 [79.81 80.1 ]
 [79.71 81.13]
 [79.39 79.39]
 [81.92 81.81]
 [83.79 84.13]
 [79.04 73.97]
 [79.39 79.39]
 [75.28 75.15]
 [79.41 79.74]
 [79.99 80.86]
 [82.42 82.39]
 [83.87 83.32]
 [75.06 74.74]
 [79.65 81.33]
 [79.29 82.45]
 [79.39 79.39]
 [79.33 80.26]
 [79.39 79.39]
 [78.07 79.1 ]
 [76.95 80.61]
 [75.36 75.54]
 [82.29 81.81]
 [77.5  78.39]
 [79.39 79.39]
 [79.39 79.39]
 [82.34 82.65]
 [83.47 84.45]
 [78.18 78.32]
 [83.59 84.68]
 [73.   72.75]
 [79.1  79.8 ]
 [74.98 74.61]
 [79.39 79.39]
 [79.12 79.1 ]
 [79.3  79.63]
 [81.53 81.58]
 [84.09 84.77]
 [84.24 84

In [20]:
from sklearn.metrics import r2_score
r2_score(Y_test, Y_pred)

0.7702551521861192

In [21]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true, dtype=float), np.array(y_pred, dtype=float)
    non_zero_indices = y_true != 0
    return np.mean(np.abs((y_true[non_zero_indices] - y_pred[non_zero_indices]) / y_true[non_zero_indices])) * 100


mean_absolute_percentage_error(Y_test, Y_pred)

1.048617628414995