In [41]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the dataset
# The CSV is read without a header; its first row is then promoted to the
# column names and removed from the data rows.
raw_dataset = pd.read_csv('dataset/dataset.csv', header=None)
dataset = raw_dataset.iloc[1:].copy()
dataset.columns = raw_dataset.iloc[0]
dataset = dataset.drop(columns=['Year'])  # the year column is not used

# Encode the categorical data
# District values are replaced by the integer codes LabelEncoder assigns.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
dataset = dataset.assign(District=label_encoder.fit_transform(dataset['District']))

# Handle missing data
# Every column is parsed as numeric (unparseable cells become NaN), then each
# gap is filled with the value from the row above it.
# Alternatives left disabled: sklearn SimpleImputer(strategy='mean'), or
# filling with the per-column mode.
dataset = dataset.apply(pd.to_numeric, errors='coerce').ffill()

# Split the dataset into the independent and dependent variables
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.metrics import mean_squared_error, r2_score

# Positional range [start, stop) of the columns that are summed into the target.
CASE_COL_START, CASE_COL_STOP = 37, 49

# iloc slices truncate silently, so make sure the frame really is wide enough;
# otherwise the target would be built from fewer columns than intended.
if dataset.shape[1] < CASE_COL_STOP:
    raise ValueError(
        f'Expected at least {CASE_COL_STOP} columns, found {dataset.shape[1]}'
    )

# Target: row-wise sum of the selected columns.
total_cases = dataset.iloc[:, CASE_COL_START:CASE_COL_STOP].sum(axis=1)

# Remove the summed columns BY POSITION. The column names come straight from
# the first CSV row, so they are not guaranteed to be unique; dropping by label
# would also remove any other column that happens to share one of those names.
keep_mask = np.ones(dataset.shape[1], dtype=bool)
keep_mask[CASE_COL_START:CASE_COL_STOP] = False
dataset = dataset.iloc[:, keep_mask].copy()
dataset['Total Cases'] = total_cases

X = dataset.iloc[:, 0:-1].values # Independent Variables (everything except Total Cases)
Y = dataset.iloc[:, -1].values # Dependent Variable (Total Cases)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

# Resample the training split with replacement (bootstrapping) to obtain a
# training set of n_samples rows. Only the training split is resampled; the
# test split is left untouched.

# Combine X_train and Y_train so features and target are resampled together
train_data = np.hstack((X_train, Y_train.reshape(-1, 1)))

# Perform bootstrapping
n_samples = 384  # Desired number of samples
bootstrapped_data = resample(train_data, replace=True, n_samples=n_samples, random_state=0)

# Split the bootstrapped data back into X and Y
X_train_bootstrapped = bootstrapped_data[:, :-1] # All columns except the last one
Y_train_bootstrapped = bootstrapped_data[:, -1]  # The last column

# Standardize the features: the scaler is fitted on the bootstrapped training
# features only and then applied unchanged to the test features.
scaler = StandardScaler()
X_train_bootstrapped = scaler.fit_transform(X_train_bootstrapped)
X_test = scaler.transform(X_test)

# Train the model using the bootstrapped dataset
# Estimators imported so they can be swapped in for comparison.
from sklearn.ensemble import (
    AdaBoostRegressor,
    BaggingRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import LinearRegression
import xgboost as xgb

# Disabled alternatives to the active model below:
#   AdaBoostRegressor(n_estimators=100, random_state=0)
#   RandomForestRegressor(n_estimators=100, random_state=0)
#   GradientBoostingRegressor(n_estimators=100, random_state=0)
#   BaggingRegressor(n_estimators=100, random_state=0)
#   LinearRegression()
model = xgb.XGBRegressor()
model.fit(X_train_bootstrapped, Y_train_bootstrapped)

# Predict the results on the held-out test features
Y_pred = model.predict(X_test)

# Evaluate the model: both metrics compare the test targets with the predictions
mse, r2 = (metric(Y_test, Y_pred) for metric in (mean_squared_error, r2_score))

print(f'MSE: {mse}')
print(f'R2: {r2}')

MSE: 39682463.35063705
R2: -3.622945938237333
