### Importing the Libraries


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset


In [2]:
dataset = pd.read_csv('dataset/dataset.csv', header = None)
dataset.columns = dataset.iloc[0]
dataset = dataset[1:]
dataset = dataset.drop(columns=['Year']) 

### Encoding Categorical Data


In [3]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
dataset['District'] = label_encoder.fit_transform(dataset['District'])

### Handling Missing Values


In [4]:
dataset = dataset.apply(pd.to_numeric, errors='coerce')
dataset.fillna(dataset.median(), inplace=True)
dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
dataset = dataset.drop(dataset.columns[37:49], axis=1)

### Test/Train Split


In [5]:
from sklearn.model_selection import train_test_split

X = dataset.iloc[:, 0:-1].values
Y = dataset.iloc[:, -1].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

### Hyperparameter Tuning


#### Bayesian Optimization


In [6]:
# import optuna
# import xgboost as xgb
# from sklearn.metrics import mean_squared_error

# # Define the objective function for Optuna
# def objective(trial):
#     # Suggest hyperparameters
#     param = {
#         "n_estimators": trial.suggest_int("n_estimators", 50, 500),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#         "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
#         "subsample": trial.suggest_float("subsample", 0.5, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
#         "gamma": trial.suggest_float("gamma", 0.0, 5.0),
#         "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
#         "reg_lambda": trial.suggest_float("reg_lambda", 1.0, 10.0),
#     }

#     # Create and train the model with suggested parameters
#     model = xgb.XGBRegressor(**param, random_state=42, n_jobs=-1)
#     model.fit(X_train, Y_train)

#     # Make predictions and calculate RMSE
#     y_pred = model.predict(X_test)
#     rmse = mean_squared_error(Y_test, y_pred, squared=False)

#     return rmse

# # Create an Optuna study and optimize it
# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=50)

# # Print the best parameters and RMSE
# print(f"Best parameters: {study.best_params}")
# print(f"Best RMSE: {study.best_value}")



#### Random Search


In [7]:
# import xgboost as xgb
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.metrics import mean_squared_error

# # Define the parameter grid for Random Search
# param_grid = {
#     'n_estimators': [50, 100, 200, 300, 500],
#     'learning_rate': np.linspace(0.01, 0.3, 10),
#     'max_depth': [3, 5, 7, 9, 10],
#     'min_child_weight': [1, 3, 5, 7, 10],
#     'subsample': np.linspace(0.5, 1.0, 6),
#     'colsample_bytree': np.linspace(0.5, 1.0, 6),
#     'gamma': np.linspace(0, 5, 10),
#     'reg_alpha': np.linspace(0, 1, 5),
#     'reg_lambda': np.linspace(1, 10, 5),
# }

# # Initialize the XGBoost regressor
# xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1)

# # Initialize RandomizedSearchCV
# random_search = RandomizedSearchCV(
#     estimator=xgb_model,
#     param_distributions=param_grid,
#     n_iter=50,  # Number of random combinations to try
#     scoring='neg_mean_squared_error',  # Metric for evaluation
#     cv=5,  # 5-fold cross-validation
#     verbose=1,
#     random_state=42,
#     n_jobs=-1,  # Utilize all CPU cores
# )

# # Fit RandomizedSearchCV
# random_search.fit(X_train, Y_train)

# # Get the best parameters and model performance
# print(f"Best parameters: {random_search.best_params_}")
# print(f"Best RMSE: {np.sqrt(-random_search.best_score_)}")

# # Evaluate the model on the test set
# best_model = random_search.best_estimator_
# y_pred = best_model.predict(X_test)
# test_rmse = mean_squared_error(Y_test, y_pred, squared=False)
# print(f"Test RMSE: {test_rmse}")

### Training Model


In [8]:
import xgboost as xgb
regressor = xgb.XGBRegressor(colsample_bytree=1, learning_rate=0.042222222222222223,
                           max_depth=7, alpha=10, n_estimators=500, min_child_weight=1, subsample=0.5, gamma=3.3333333333333335, reg_alpha=0, reg_lambda=1)

# regressor = xgb.XGBRegressor(colsample_bytree=0.7857872949109472, learning_rate=0.010344501343048346,
#                            max_depth=10, alpha=10, n_estimators=257, min_child_weight=2, subsample=0.850598292345319, gamma=3.1518448315907417, reg_alpha=0.9279458085493363, reg_lambda=7.180908650941852)

# regressor = xgb.XGBRegressor(colsample_bytree=0.7, learning_rate=0.042222222222222223,
#                            max_depth=10, alpha=10, n_estimators=50, min_child_weight=1, subsample=1, gamma=2.7777777777777777, reg_alpha=0.5, reg_lambda=10)

regressor.fit(X, Y)

#### Comparing Values


In [9]:
Y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2, suppress=True)
print(np.concatenate((Y_pred.reshape(len(Y_pred),1), Y_test.reshape(len(Y_test),1)),1))


[[  349.92   336.  ]
 [  610.55   493.  ]
 [ 6858.54  6950.  ]
 [ 1674.38  1448.  ]
 [ 2901.26  2841.  ]
 [  559.02   491.  ]
 [   42.02    27.  ]
 [  -76.87     1.  ]
 [   48.65    10.  ]
 [  482.51   278.  ]
 [ 3425.38  3398.  ]
 [  129.08   111.5 ]
 [  119.77   116.5 ]
 [  193.17    96.  ]
 [  279.92   140.  ]
 [   -3.94    10.  ]
 [ 5435.5   5621.  ]
 [  818.94   762.  ]
 [ 2184.22  2128.  ]
 [  -30.13    67.  ]
 [  -32.88     0.  ]
 [  297.61   318.  ]
 [   56.47     0.  ]
 [  229.17   142.5 ]
 [  214.61   125.5 ]
 [ 4470.95  4533.  ]
 [  452.15   361.  ]
 [  216.56   322.5 ]
 [   14.25     0.  ]
 [  -30.12     0.  ]
 [ 6221.96  6349.  ]
 [  768.76   664.5 ]
 [  249.09     0.  ]
 [  478.91   467.  ]
 [  -13.28     0.  ]
 [  232.22   276.5 ]
 [  451.17   335.  ]
 [ -128.42     0.  ]
 [ 1199.88  1053.  ]
 [ 1147.89  1163.  ]
 [   47.99    84.  ]
 [ -138.61    17.  ]
 [  710.33   739.  ]
 [  126.14    62.  ]
 [  250.66   245.5 ]
 [   90.94     4.  ]
 [ 2574.38  2946.5 ]
 [  356.37   

In [10]:
from sklearn.metrics import r2_score
r2_score(Y_test, Y_pred)

0.9979540206772833

In [11]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true, dtype=float), np.array(y_pred, dtype=float)
    non_zero_indices = y_true != 0
    return np.mean(np.abs((y_true[non_zero_indices] - y_pred[non_zero_indices]) / y_true[non_zero_indices])) * 100


mean_absolute_percentage_error(Y_test, Y_pred)

192.3296165983041