### Importing the Libraries


In [307]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset


In [308]:
dataset = pd.read_csv('dataset/dataset.csv', header = None)
dataset.columns = dataset.iloc[0]
dataset = dataset[1:]
dataset = dataset.drop(columns=['Year']) 

### Encoding Categorical Data


In [309]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
dataset['District'] = label_encoder.fit_transform(dataset['District'])

### Handling Missing Values


In [310]:
dataset = dataset.apply(pd.to_numeric, errors='coerce')
dataset.fillna(dataset.median(), inplace=True)
dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
dataset = dataset.drop(dataset.columns[37:49], axis=1)

### Test/Train Split


In [311]:
from sklearn.model_selection import train_test_split

X = dataset.iloc[:, 0:-1].values
Y = dataset.iloc[:, -1].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

### Hyperparameter Tuning

#### Random Search

In [312]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

Y_train = pd.to_numeric(Y_train, errors='coerce')
y_test = pd.to_numeric(Y_test, errors='coerce')
# X_train = np.nan_to_num(X_train)
# X_test = np.nan_to_num(X_test)
# Y_train = np.nan_to_num(Y_train)
# Y_test = np.nan_to_num(Y_test)


# Define the parameter grid for Random Search
param_grid = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'max_depth': [None, 5, 10, 15, 20, 30, 50],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 10, 20, 30, 50, 100],
}

# Initialize the DecisionTreeRegressor
dt_model = DecisionTreeRegressor(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=dt_model,
    param_distributions=param_grid,
    n_iter=50,  # Number of random combinations to try
    scoring='neg_mean_squared_error',  # Metric for evaluation
    cv=5,  # 5-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1,  # Utilize all CPU cores
)

# Fit RandomizedSearchCV
random_search.fit(X_train, Y_train)

# Get the best parameters and model performance
print(f"Best parameters: {random_search.best_params_}")
print(f"Best RMSE (CV): {np.sqrt(-random_search.best_score_)}")

# Evaluate the model on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
test_rmse = mean_squared_error(Y_test, y_pred, squared=False)
print(f"Test RMSE: {test_rmse}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters: {'min_samples_split': 15, 'min_samples_leaf': 5, 'max_leaf_nodes': 30, 'max_features': 'sqrt', 'max_depth': 15, 'criterion': 'absolute_error'}
Best RMSE (CV): 7321.489479553103
Test RMSE: 3082.0931046683445




### Training Model


In [313]:
from sklearn.tree import DecisionTreeRegressor
#regressor = DecisionTreeRegressor(criterion = 'absolute_error', max_depth = 15, max_features = 'sqrt', max_leaf_nodes = 30, min_samples_leaf = 5, min_samples_split = 15)
regressor = DecisionTreeRegressor(criterion = 'friedman_mse', max_depth = 20, max_features = None, max_leaf_nodes = None, min_samples_leaf = 1, min_samples_split = 15)

regressor.fit(X_train, Y_train)


#### Comparing Values


In [314]:
Y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2, suppress=True)
print(np.concatenate((Y_pred.reshape(len(Y_pred),1), Y_test.reshape(len(Y_test),1)),1))


[[  119.96   336.  ]
 [ 3001.5    493.  ]
 [ 1241.43  6950.  ]
 [ 3941.67  1448.  ]
 [ 5552.29  2841.  ]
 [ 1241.43   491.  ]
 [    9.38    27.  ]
 [ 2853.09     1.  ]
 [  146.89    10.  ]
 [ 1745.35   278.  ]
 [ 2171.75  3398.  ]
 [  293.07   111.5 ]
 [  293.07   116.5 ]
 [   41.57    96.  ]
 [  461.94   140.  ]
 [  259.      10.  ]
 [    2.83  5621.  ]
 [  483.64   762.  ]
 [  675.67  2128.  ]
 [   41.57    67.  ]
 [ 2853.09     0.  ]
 [  146.89   318.  ]
 [ 2853.09     0.  ]
 [  247.55   142.5 ]
 [  119.96   125.5 ]
 [  483.64  4533.  ]
 [  293.07   361.  ]
 [  119.96   322.5 ]
 [  119.96     0.  ]
 [    9.38     0.  ]
 [ 2145.5   6349.  ]
 [ 9454.83   664.5 ]
 [ 5575.3      0.  ]
 [  803.83   467.  ]
 [ 3779.75     0.  ]
 [  675.67   276.5 ]
 [  119.96   335.  ]
 [ 5575.3      0.  ]
 [ 1241.43  1053.  ]
 [ 5575.3   1163.  ]
 [    2.83    84.  ]
 [  119.96    17.  ]
 [ 1241.43   739.  ]
 [    9.38    62.  ]
 [  293.07   245.5 ]
 [    9.38     4.  ]
 [  293.07  2946.5 ]
 [    2.83   

In [315]:
from sklearn.metrics import r2_score
r2_score(Y_test, Y_pred)

-0.4045605557045744

In [316]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true, dtype=float), np.array(y_pred, dtype=float)
    non_zero_indices = y_true != 0
    return np.mean(np.abs((y_true[non_zero_indices] - y_pred[non_zero_indices]) / y_true[non_zero_indices])) * 100


mean_absolute_percentage_error(Y_test, Y_pred)

4601.2563279213555