### Importing the Libraries


In [90]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import mean_absolute_error

### Importing the dataset


In [91]:
dataset = pd.read_csv('dataset/dataset.csv', header = None)
dataset.columns = dataset.iloc[0]
dataset = dataset[1:]
dataset = dataset.drop(columns=['Year'])

### Encoding Categorical Data


In [92]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
dataset['District'] = label_encoder.fit_transform(dataset['District'])

### Handling Missing Values


In [93]:
# dataset = dataset.apply(pd.to_numeric, errors='coerce')
# dataset.fillna(dataset.median(), inplace=True)
# dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
# dataset = dataset.drop(dataset.columns[37:49], axis=1)

### Handling Missing Values : Interpolation

In [94]:
# dataset = dataset.apply(pd.to_numeric, errors='coerce')
# dataset.interpolate(method='linear', inplace=True)
# dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
# dataset = dataset.drop(dataset.columns[37:49], axis=1)

dataset = dataset.apply(pd.to_numeric, errors='coerce')
# dataset.fillna(dataset.median(), inplace=True)
dataset.fillna(dataset.mode(), inplace=True)
# dataset.interpolate(method='linear', inplace=True)
# dataset.ffill(inplace=True)
dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
dataset = dataset.drop(dataset.columns[37:49], axis=1)

### Test/Train Split


In [95]:
from sklearn.model_selection import train_test_split

X = dataset.iloc[:, 0:-1].values
Y = dataset.iloc[:, -1].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

### Hyperparameter Tuning

#### Random Search

In [96]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

Y_train = pd.to_numeric(Y_train, errors='coerce')
y_test = pd.to_numeric(Y_test, errors='coerce')
# X_train = np.nan_to_num(X_train)
# X_test = np.nan_to_num(X_test)
# Y_train = np.nan_to_num(Y_train)
# Y_test = np.nan_to_num(Y_test)


# Define the parameter grid for Random Search
param_grid = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'max_depth': [None, 5, 10, 15, 20, 30, 50],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 10, 20, 30, 50, 100],
}

# Initialize the DecisionTreeRegressor
dt_model = DecisionTreeRegressor(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=dt_model,
    param_distributions=param_grid,
    n_iter=50,  # Number of random combinations to try
    scoring='neg_mean_squared_error',  # Metric for evaluation
    cv=5,  # 5-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1,  # Utilize all CPU cores
)

# Fit RandomizedSearchCV
random_search.fit(X_train, Y_train)

# Get the best parameters and model performance
print(f"Best parameters: {random_search.best_params_}")
print(f"Best RMSE (CV): {np.sqrt(-random_search.best_score_)}")

# Evaluate the model on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
test_rmse = mean_squared_error(Y_test, y_pred, squared=False)
print(f"Test RMSE: {test_rmse}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters: {'min_samples_split': 10, 'min_samples_leaf': 10, 'max_leaf_nodes': 50, 'max_features': None, 'max_depth': 50, 'criterion': 'squared_error'}
Best RMSE (CV): 7689.818914020574
Test RMSE: 5137.005948519605


65 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
65 fits failed with the following error:
Traceback (most recent call last):
  File "/home/anik/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/anik/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/anik/miniconda3/lib/python3.12/site-packages/sklearn/tree/_classes.py", line 1377, in fit
    super()._fit(
  File "/home/anik/miniconda3/lib/python3.12/site-packages/sklearn/tree/_classes.py", 

### Training Model


In [97]:
from sklearn.tree import DecisionTreeRegressor
#regressor = DecisionTreeRegressor(criterion = 'absolute_error', max_depth = 15, max_features = 'sqrt', max_leaf_nodes = 30, min_samples_leaf = 5, min_samples_split = 15)
# regressor = DecisionTreeRegressor(criterion = 'friedman_mse', max_depth = 20, max_features = None, max_leaf_nodes = None, min_samples_leaf = 1, min_samples_split = 15)
regressor = DecisionTreeRegressor()
regressor.fit(X_train, Y_train)


#### Comparing Values


In [98]:
Y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2, suppress=True)
print(np.concatenate((Y_pred.reshape(len(Y_pred), 1), Y_test.reshape(len(Y_test), 1)), 1))


[[  141.   336.]
 [ 2205.   493.]
 [  711.  6950.]
 [ 1063.  1448.]
 [ 6853.  2841.]
 [  827.   491.]
 [    4.    27.]
 [   26.     1.]
 [    6.    10.]
 [  916.   277.]
 [  341.  3397.]
 [    3.    77.]
 [  422.    82.]
 [  132.    96.]
 [  200.   140.]
 [  259.    10.]
 [  359.  5620.]
 [ 1281.   762.]
 [  241.  2127.]
 [   33.    67.]
 [ 2994.     0.]
 [  200.   318.]
 [   18.     0.]
 [  254.   108.]
 [  101.    91.]
 [ 1692.  4533.]
 [   83.   361.]
 [  103.   288.]
 [ 8809.     0.]
 [    0.     0.]
 [ 2114.  6348.]
 [  914.   630.]
 [17719.     0.]
 [ 4249.   466.]
 [  182.     0.]
 [  256.   242.]
 [  141.   335.]
 [  620.     0.]
 [  711.  1053.]
 [ 7573.  1162.]
 [   11.    84.]
 [  102.    17.]
 [  129.   739.]
 [  241.    62.]
 [  295.   211.]
 [    0.     4.]
 [   43.  2912.]
 [    0.   261.]
 [  241.   877.]
 [  182.   485.]
 [17719. 12743.]
 [    0.   353.]
 [ 1482.    73.]
 [   43.   367.]
 [ 1005. 13223.]
 [   51.   356.]
 [   44.    40.]
 [   33.    43.]
 [  247.   111

In [99]:
from sklearn.metrics import r2_score
r2_score(Y_test, Y_pred)

-1.3289938775076178

In [100]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true, dtype=float), np.array(y_pred, dtype=float)
    non_zero_indices = y_true != 0
    return np.mean(np.abs((y_true[non_zero_indices] - y_pred[non_zero_indices]) / y_true[non_zero_indices])) * 100


mean_absolute_percentage_error(Y_test, Y_pred)

510.7788818046423