### Importing the Libraries


In [124]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import mean_absolute_error

### Importing the dataset


In [125]:
dataset = pd.read_csv('dataset/dataset.csv', header = None)
dataset.columns = dataset.iloc[0]
dataset = dataset[1:]
dataset = dataset.drop(columns=['Year'])

### Encoding Categorical Data


In [126]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
dataset['District'] = label_encoder.fit_transform(dataset['District'])

### Handling Missing Values


In [127]:
dataset = dataset.apply(pd.to_numeric, errors='coerce')
dataset.fillna(dataset.median(), inplace=True)
dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
dataset = dataset.drop(dataset.columns[37:49], axis=1)

### Handling Missing Values : Interpolation

In [128]:
dataset = dataset.apply(pd.to_numeric, errors='coerce')
dataset.interpolate(method='linear', inplace=True)
dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
dataset = dataset.drop(dataset.columns[37:49], axis=1)

### Test/Train Split


In [129]:
from sklearn.model_selection import train_test_split

X = dataset.iloc[:, 0:-1].values
Y = dataset.iloc[:, -1].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

### Hyperparameter Tuning

#### Random Search

In [130]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

Y_train = pd.to_numeric(Y_train, errors='coerce')
y_test = pd.to_numeric(Y_test, errors='coerce')
# X_train = np.nan_to_num(X_train)
# X_test = np.nan_to_num(X_test)
# Y_train = np.nan_to_num(Y_train)
# Y_test = np.nan_to_num(Y_test)


# Define the parameter grid for Random Search
param_grid = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'max_depth': [None, 5, 10, 15, 20, 30, 50],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 10, 20, 30, 50, 100],
}

# Initialize the DecisionTreeRegressor
dt_model = DecisionTreeRegressor(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=dt_model,
    param_distributions=param_grid,
    n_iter=50,  # Number of random combinations to try
    scoring='neg_mean_squared_error',  # Metric for evaluation
    cv=5,  # 5-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1,  # Utilize all CPU cores
)

# Fit RandomizedSearchCV
random_search.fit(X_train, Y_train)

# Get the best parameters and model performance
print(f"Best parameters: {random_search.best_params_}")
print(f"Best RMSE (CV): {np.sqrt(-random_search.best_score_)}")

# Evaluate the model on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
test_rmse = mean_squared_error(Y_test, y_pred, squared=False)
print(f"Test RMSE: {test_rmse}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters: {'min_samples_split': 2, 'min_samples_leaf': 2, 'max_leaf_nodes': 100, 'max_features': None, 'max_depth': 5, 'criterion': 'poisson'}
Best RMSE (CV): 1.7016478002874074
Test RMSE: 1.573233606123639




### Training Model


In [131]:
from sklearn.tree import DecisionTreeRegressor
#regressor = DecisionTreeRegressor(criterion = 'absolute_error', max_depth = 15, max_features = 'sqrt', max_leaf_nodes = 30, min_samples_leaf = 5, min_samples_split = 15)
regressor = DecisionTreeRegressor(criterion = 'friedman_mse', max_depth = 20, max_features = None, max_leaf_nodes = None, min_samples_leaf = 1, min_samples_split = 15)

regressor.fit(X_train, Y_train)


#### Comparing Values


In [132]:
Y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2, suppress=True)
print(np.concatenate((Y_pred.reshape(len(Y_pred), 1), Y_test.reshape(len(Y_test), 1)), 1))


[[74.18 76.51]
 [77.72 80.22]
 [79.09 80.81]
 [77.11 76.91]
 [77.72 77.87]
 [79.81 80.34]
 [80.46 80.58]
 [77.22 79.35]
 [74.18 72.45]
 [80.46 80.58]
 [82.62 82.74]
 [79.39 79.39]
 [79.39 79.39]
 [85.44 85.52]
 [79.39 79.39]
 [80.18 79.68]
 [78.04 80.16]
 [74.18 78.42]
 [79.8  78.87]
 [81.6  81.74]
 [81.6  83.03]
 [75.32 73.86]
 [81.6  83.03]
 [79.39 79.39]
 [79.64 79.39]
 [77.72 78.02]
 [80.46 81.13]
 [79.64 79.39]
 [72.52 79.19]
 [80.46 80.1 ]
 [80.46 81.13]
 [79.39 79.39]
 [81.51 81.81]
 [85.44 84.13]
 [79.32 73.97]
 [79.39 79.39]
 [74.22 75.15]
 [79.   79.74]
 [80.46 80.86]
 [82.62 82.39]
 [83.06 83.32]
 [75.32 74.74]
 [79.81 81.33]
 [80.18 82.45]
 [79.39 79.39]
 [80.18 80.26]
 [80.18 79.39]
 [77.22 79.1 ]
 [76.19 80.61]
 [74.18 75.54]
 [82.62 81.81]
 [79.   78.39]
 [79.39 79.39]
 [80.18 79.39]
 [82.62 82.65]
 [85.44 84.45]
 [77.11 78.32]
 [84.51 84.68]
 [73.63 72.75]
 [79.8  79.8 ]
 [75.32 74.61]
 [79.39 79.39]
 [77.22 79.1 ]
 [79.8  79.63]
 [81.6  81.58]
 [84.51 84.77]
 [85.44 84

In [133]:
from sklearn.metrics import r2_score
r2_score(Y_test, Y_pred)

0.7118020833051459

In [134]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true, dtype=float), np.array(y_pred, dtype=float)
    non_zero_indices = y_true != 0
    return np.mean(np.abs((y_true[non_zero_indices] - y_pred[non_zero_indices]) / y_true[non_zero_indices])) * 100


mean_absolute_percentage_error(Y_test, Y_pred)

1.2031310183725057