### Importing the Libraries


In [13]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset


In [14]:
dataset = pd.read_csv('dataset/dataset.csv', header = None)
dataset.columns = dataset.iloc[0]
dataset = dataset[1:]
dataset = dataset.drop(columns=['Year'])

### Encoding Categorical Data


In [15]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
dataset['District'] = label_encoder.fit_transform(dataset['District'])

### Handling Missing Values


In [16]:
dataset = dataset.apply(pd.to_numeric, errors='coerce')
dataset.fillna(dataset.median(), inplace=True)
dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
dataset = dataset.drop(dataset.columns[37:49], axis=1)

### Handling Missing Values : Interpolation

In [17]:
dataset = dataset.apply(pd.to_numeric, errors='coerce')
dataset.interpolate(method='linear', inplace=True)
dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
dataset = dataset.drop(dataset.columns[37:49], axis=1)

### Test/Train Split


In [18]:
from sklearn.model_selection import train_test_split

X = dataset.iloc[:, 0:-1].values
Y = dataset.iloc[:, -1].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

### Feature Scaling

In [19]:
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train[:, 5:] = sc.fit_transform(X_train[:, 5:])
# X_test[:, 5:] = sc.transform(X_test[:, 5:])

### Hyperparameter Tuning

#### Bayesian Optimization

In [20]:
# import optuna
# import xgboost as xgb
# from sklearn.metrics import mean_squared_error

# # Define the objective function for Optuna
# def objective(trial):
#     # Suggest hyperparameters
#     param = {
#         "n_estimators": trial.suggest_int("n_estimators", 50, 500),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#         "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
#         "subsample": trial.suggest_float("subsample", 0.5, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
#         "gamma": trial.suggest_float("gamma", 0.0, 5.0),
#         "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
#         "reg_lambda": trial.suggest_float("reg_lambda", 1.0, 10.0),
#     }

#     # Create and train the model with suggested parameters
#     model = xgb.XGBRegressor(**param, random_state=42, n_jobs=-1)
#     model.fit(X_train, Y_train)

#     # Make predictions and calculate RMSE
#     y_pred = model.predict(X_test)
#     rmse = mean_squared_error(Y_test, y_pred, squared=False)

#     return rmse

# # Create an Optuna study and optimize it
# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=50)

# # Print the best parameters and RMSE
# print(f"Best parameters: {study.best_params}")
# print(f"Best RMSE: {study.best_value}")



#### Random Search

In [21]:
# import xgboost as xgb
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.metrics import mean_squared_error

# # Define the parameter grid for Random Search
# param_grid = {
#     'n_estimators': [50, 100, 200, 300, 500],
#     'learning_rate': np.linspace(0.01, 0.3, 10),
#     'max_depth': [3, 5, 7, 9, 10],
#     'min_child_weight': [1, 3, 5, 7, 10],
#     'subsample': np.linspace(0.5, 1.0, 6),
#     'colsample_bytree': np.linspace(0.5, 1.0, 6),
#     'gamma': np.linspace(0, 5, 10),
#     'reg_alpha': np.linspace(0, 1, 5),
#     'reg_lambda': np.linspace(1, 10, 5),
# }

# # Initialize the XGBoost regressor
# xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1)

# # Initialize RandomizedSearchCV
# random_search = RandomizedSearchCV(
#     estimator=xgb_model,
#     param_distributions=param_grid,
#     n_iter=50,  # Number of random combinations to try
#     scoring='neg_mean_squared_error',  # Metric for evaluation
#     cv=5,  # 5-fold cross-validation
#     verbose=1,
#     random_state=42,
#     n_jobs=-1,  # Utilize all CPU cores
# )

# # Fit RandomizedSearchCV
# random_search.fit(X_train, Y_train)

# # Get the best parameters and model performance
# print(f"Best parameters: {random_search.best_params_}")
# print(f"Best RMSE: {np.sqrt(-random_search.best_score_)}")

# # Evaluate the model on the test set
# best_model = random_search.best_estimator_
# y_pred = best_model.predict(X_test)
# test_rmse = mean_squared_error(Y_test, y_pred, squared=False)
# print(f"Test RMSE: {test_rmse}")

### Training Model


In [22]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
poly_reg = PolynomialFeatures(degree = 3)
X_poly = poly_reg.fit_transform(X_train)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, Y_train)

#### Comparing Values


In [23]:
Y_pred = lin_reg_2.predict(poly_reg.transform(X_test))
np.set_printoptions(precision=2, suppress=True)
print(np.concatenate((Y_pred.reshape(len(Y_pred),1), Y_test.reshape(len(Y_test),1)),1))

[[ 75.93  76.51]
 [ 78.53  80.22]
 [ 80.73  80.81]
 [ 80.57  76.91]
 [ 75.94  77.87]
 [ 76.46  80.34]
 [ 75.    80.58]
 [ 77.56  79.35]
 [ 72.97  72.45]
 [ 78.93  80.58]
 [ 82.83  82.74]
 [ 79.38  79.39]
 [ 79.35  79.39]
 [ 88.42  85.52]
 [ 79.29  79.39]
 [ 78.41  79.68]
 [ 74.12  80.16]
 [ 81.01  78.42]
 [ 78.05  78.87]
 [ 82.43  81.74]
 [ 79.27  83.03]
 [ 69.81  73.86]
 [ 81.63  83.03]
 [ 79.33  79.39]
 [ 79.36  79.39]
 [ 79.12  78.02]
 [ 80.83  81.13]
 [ 79.36  79.39]
 [ 82.15  79.19]
 [ 74.99  80.1 ]
 [ 79.7   81.13]
 [ 79.39  79.39]
 [ 81.47  81.81]
 [100.78  84.13]
 [ 77.73  73.97]
 [ 79.39  79.39]
 [ 74.12  75.15]
 [ 78.38  79.74]
 [ 82.47  80.86]
 [ 79.38  82.39]
 [ 81.7   83.32]
 [ 77.25  74.74]
 [ 79.68  81.33]
 [ 81.53  82.45]
 [ 79.19  79.39]
 [ 81.85  80.26]
 [ 79.37  79.39]
 [ 77.63  79.1 ]
 [ 78.57  80.61]
 [ 70.47  75.54]
 [ 99.28  81.81]
 [ 79.72  78.39]
 [ 79.4   79.39]
 [ 79.89  79.39]
 [ 77.19  82.65]
 [ 84.9   84.45]
 [ 78.36  78.32]
 [ 84.68  84.68]
 [ 70.84  72.7

In [24]:
from sklearn.metrics import r2_score
r2_score(Y_test, Y_pred)

-1.1127439730044926

In [25]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true, dtype=float), np.array(y_pred, dtype=float)
    non_zero_indices = y_true != 0
    return np.mean(np.abs((y_true[non_zero_indices] - y_pred[non_zero_indices]) / y_true[non_zero_indices])) * 100


mean_absolute_percentage_error(Y_test, Y_pred)

2.8693963558313875