### Importing the Libraries


In [99]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import mean_absolute_error

### Importing the dataset


In [100]:
dataset = pd.read_csv('dataset/dataset.csv', header = None)
dataset.columns = dataset.iloc[0]
dataset = dataset[1:]
dataset = dataset.drop(columns=['Year'])

### Encoding Categorical Data


In [101]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
dataset['District'] = label_encoder.fit_transform(dataset['District'])

### Handling Missing Values


In [102]:
# dataset = dataset.apply(pd.to_numeric, errors='coerce')
# dataset.fillna(dataset.median(), inplace=True)
# dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
# dataset = dataset.drop(dataset.columns[37:49], axis=1)

### Handling Missing Values : Interpolation

In [103]:
# dataset = dataset.apply(pd.to_numeric, errors='coerce')
# dataset.interpolate(method='linear', inplace=True)
# dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
# dataset = dataset.drop(dataset.columns[37:49], axis=1)

dataset = dataset.apply(pd.to_numeric, errors='coerce')
# dataset.fillna(dataset.median(), inplace=True)
# dataset.fillna(dataset.mode(), inplace=True)
# dataset.interpolate(method='linear', inplace=True)
dataset.ffill(inplace=True)
dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
dataset = dataset.drop(dataset.columns[37:49], axis=1)

### Test/Train Split


In [104]:
from sklearn.model_selection import train_test_split

X = dataset.iloc[:, 0:-1].values
Y = dataset.iloc[:, -1].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

### Hyperparameter Tuning

#### Bayesian Optimization

In [105]:
from skopt import BayesSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from skopt.space import Categorical, Integer

# Assume X_train, X_test, Y_train, Y_test are already defined

# Handle missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)

# Define the parameter grid for Bayesian Optimization
param_grid = {
    'criterion': Categorical(['squared_error', 'friedman_mse', 'absolute_error', 'poisson']),
    'max_depth': Integer(5, 50),
    'min_samples_split': Integer(2, 15),
    'min_samples_leaf': Integer(1, 10),
    'max_features': Categorical([None, 'sqrt', 'log2']),
    'max_leaf_nodes': Integer(10, 100)
}

# Initialize the BayesSearchCV object
bayes_search = BayesSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    search_spaces=param_grid,
    n_iter=30,  # Use fewer iterations for simplicity
    cv=3,       # 3-fold cross-validation
    random_state=42,
    verbose=0   # No extra output
)

# Perform the search to find the best parameters
bayes_search.fit(X_train_imputed, Y_train)

# Print the best parameters
print("Best parameters found: ", bayes_search.best_params_)


Best parameters found:  OrderedDict({'criterion': 'absolute_error', 'max_depth': 50, 'max_features': None, 'max_leaf_nodes': 20, 'min_samples_leaf': 8, 'min_samples_split': 7})


#### Random Search

In [106]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from scipy.stats import randint

# Assume X_train, X_test, Y_train, Y_test are already defined

# Handle missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)

# Define the parameter grid for Randomized Search
param_distributions = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'max_depth': randint(5, 50),
    'min_samples_split': randint(2, 15),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2', None],
    'max_leaf_nodes': randint(10, 100)
}

# Initialize the RandomizedSearchCV object
random_search = RandomizedSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_distributions=param_distributions,
    n_iter=30,       # Number of random parameter combinations to try
    scoring='neg_mean_squared_error',  # Use negative MSE as the scoring metric
    cv=3,            # 3-fold cross-validation
    random_state=42, # For reproducibility
    verbose=0,       # No extra output
    n_jobs=-1        # Use all available cores for parallel processing
)

# Fit the RandomizedSearchCV object to the training data
random_search.fit(X_train_imputed, Y_train)

# Print the best parameters
print("Best parameters found: ", random_search.best_params_)


Best parameters found:  {'criterion': 'absolute_error', 'max_depth': 45, 'max_features': None, 'max_leaf_nodes': 82, 'min_samples_leaf': 8, 'min_samples_split': 13}


#### Hyperband


In [111]:
from sklearn.experimental import enable_halving_search_cv  # Needed for HalvingGridSearchCV
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer

# Assume X_train, X_test, Y_train, Y_test are already defined

# Handle missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)

# Define the parameter grid for Hyperband
param_grid = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'max_depth': [5, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2', None],
    'max_leaf_nodes': [10, 20, 50, 100]
}

# Initialize the HalvingGridSearchCV object
hyperband_search = HalvingGridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=param_grid,
    factor=3,             # How aggressively to reduce the search space
    scoring='neg_mean_squared_error',  # Scoring metric for regression
    cv=3,                 # 3-fold cross-validation
    verbose=1,            # Show progress during the search
    n_jobs=-1,            # Use all available cores
    random_state=42       # Ensure reproducibility
)

# Fit the Hyperband search object to the training data
hyperband_search.fit(X_train_imputed, Y_train)

# Print the best parameters
print("Best parameters found: ", hyperband_search.best_params_)


n_iterations: 4
n_required_iterations: 7
n_possible_iterations: 4
min_resources_: 6
max_resources_: 307
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 2160
n_resources: 6
Fitting 3 folds for each of 2160 candidates, totalling 6480 fits


  _data = np.array(data, dtype=dtype, copy=copy,


----------
iter: 1
n_candidates: 720
n_resources: 18
Fitting 3 folds for each of 720 candidates, totalling 2160 fits
----------
iter: 2
n_candidates: 240
n_resources: 54
Fitting 3 folds for each of 240 candidates, totalling 720 fits
----------
iter: 3
n_candidates: 80
n_resources: 162
Fitting 3 folds for each of 80 candidates, totalling 240 fits
Best parameters found:  {'criterion': 'absolute_error', 'max_depth': 5, 'max_features': 'log2', 'max_leaf_nodes': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}


### Training Model


In [112]:
# Best parameters found:  OrderedDict({'criterion': 'absolute_error', 'max_depth': 50, 'max_features': None, 'max_leaf_nodes': 20, 'min_samples_leaf': 8, 'min_samples_split': 7})
# # regressor =  DecisionTreeRegressor(criterion = 'absolute_error', max_depth = 50, max_features = None, max_leaf_nodes = 20, min_samples_leaf = 8, min_samples_split = 7)

# Best parameters found:  {'criterion': 'absolute_error', 'max_depth': 45, 'max_features': None, 'max_leaf_nodes': 82, 'min_samples_leaf': 8, 'min_samples_split': 13}
# regressor =  DecisionTreeRegressor(criterion = 'absolute_error', max_depth = 45, max_features = None, max_leaf_nodes = 82, min_samples_leaf = 8, min_samples_split = 13)

# Best parameters found:  {'criterion': 'absolute_error', 'max_depth': 5, 'max_features': 'log2', 'max_leaf_nodes': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}
regressor =  DecisionTreeRegressor(criterion = 'absolute_error', max_depth = 5, max_features = 'log2', max_leaf_nodes = 10, min_samples_leaf = 2, min_samples_split = 10)

regressor.fit(X_train, Y_train)


#### Comparing Values


In [113]:
Y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2, suppress=True)
print(np.concatenate((Y_pred.reshape(len(Y_pred), 1), Y_test.reshape(len(Y_test), 1)), 1))


[[  128.    336. ]
 [  827.    493. ]
 [ 1549.   6950. ]
 [ 1857.   1448. ]
 [ 4974.   2841. ]
 [ 3299.    491. ]
 [  128.     27. ]
 [  128.      1. ]
 [  128.     10. ]
 [ 1517.    290. ]
 [ 2932.   3407. ]
 [  353.5   144. ]
 [  128.    109. ]
 [  128.     96. ]
 [  353.5   140. ]
 [  128.     10. ]
 [  128.   5628. ]
 [  128.    762. ]
 [  128.   2224. ]
 [  128.     67. ]
 [  128.      0. ]
 [  128.    318. ]
 [  128.      0. ]
 [  128.    152. ]
 [ 1857.    382. ]
 [  128.   4533. ]
 [  128.    361. ]
 [ 1857.    924. ]
 [  128.      0. ]
 [  128.      0. ]
 [ 1857.   6354. ]
 [ 1616.   1010. ]
 [  353.5     0. ]
 [  353.5   476. ]
 [  128.      0. ]
 [ 1616.    529. ]
 [  128.    335. ]
 [  353.5     0. ]
 [ 1549.   1053. ]
 [ 2932.   1551. ]
 [  128.     84. ]
 [  128.     17. ]
 [ 4974.    739. ]
 [  128.     62. ]
 [  128.    281. ]
 [  128.      4. ]
 [  353.5  4541. ]
 [  128.    261. ]
 [  128.    895. ]
 [  128.    485. ]
 [  353.5 12745. ]
 [  353.5   353. ]
 [ 1616.    

In [114]:
from sklearn.metrics import r2_score
r2_score(Y_test, Y_pred)

-0.12838058167884236

In [115]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true, dtype=float), np.array(y_pred, dtype=float)
    non_zero_indices = y_true != 0
    return np.mean(np.abs((y_true[non_zero_indices] - y_pred[non_zero_indices]) / y_true[non_zero_indices])) * 100


mean_absolute_percentage_error(Y_test, Y_pred)

384.2897308656154