### Importing the Libraries


In [66]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset


In [67]:
# Read the CSV with no header inference, so the file's first row arrives as data.
dataset = pd.read_csv('dataset/dataset.csv', header=None)
# Promote that first row to the column labels ...
dataset.columns = dataset.iloc[0]
# ... then discard the promoted row together with the 'Year' column.
dataset = dataset.iloc[1:].drop(columns=['Year'])

### Encoding Categorical Data


In [68]:
from sklearn.preprocessing import LabelEncoder
# Replace the 'District' column with the integer codes produced by a LabelEncoder.
# The fitted encoder is kept in `label_encoder` so the mapping stays available.
label_encoder = LabelEncoder()
dataset = dataset.assign(District=label_encoder.fit_transform(dataset['District']))

### Handling Missing Values


In [69]:
# Coerce every column to a numeric dtype; values that cannot be parsed become NaN.
dataset = dataset.apply(pd.to_numeric, errors='coerce')

# Fill the gaps in each column with that column's first mode.
# (Median fill and forward fill were tried as alternatives and are not used.)
# NOTE(review): the fill values are computed on the whole table, before the
# train/test split that happens later in the notebook, so test rows influence them.
dataset = dataset.fillna(dataset.mode().iloc[0])

# Build the regression target as the row-wise sum of the columns at positions 37..48,
# then remove those source columns.
# Presumably these are twelve monthly case counts - confirm against the CSV layout.
# The labels to drop are looked up after 'Total Cases' has been added, as before.
dataset['Total Cases'] = dataset.iloc[:, 37:49].sum(axis=1)
dataset = dataset.drop(columns=dataset.columns[37:49])

### Handling Missing Values: Interpolation (disabled alternative)

In [70]:
# dataset = dataset.apply(pd.to_numeric, errors='coerce')
# dataset.interpolate(method='linear', inplace=True)
# dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
# dataset = dataset.drop(dataset.columns[37:49], axis=1)

### Test/Train Split


In [71]:
from sklearn.model_selection import train_test_split

# Features are every column except the last; the target is the last column.
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=0,
)

### Feature Scaling

In [72]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both splits.
sc_X = StandardScaler().fit(X_train)
X_train, X_test = sc_X.transform(X_train), sc_X.transform(X_test)

### Grid Search

In [73]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestRegressor

# # Initialize the regressor
# regressor = RandomForestRegressor()

# # Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': [None, 'sqrt', 'log2']  # 'auto' was removed in scikit-learn 1.3; None (all features) is its regressor equivalent
# }

# # Initialize the GridSearchCV object
# grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# # Fit the grid search to the data
# grid_search.fit(X_train, Y_train)

# # Print the best parameters and the best score
# print("Best Parameters:", grid_search.best_params_)
# print("Best Score:", grid_search.best_score_)

### Training Model


In [74]:
from sklearn.ensemble import RandomForestRegressor
# Hyperparameters come from the grid search's reported best combination:
# Best Parameters: {'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300}
# Earlier attempts, no longer used: the library defaults, and a configuration with
# n_estimators=378, random_state=0, max_depth=37, min_samples_leaf=1,
# min_samples_split=5, max_features=None.
regressor = RandomForestRegressor(
    n_estimators=300,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=4,
    max_features='log2',
    random_state=42,
)
regressor.fit(X_train, Y_train)

#### Comparing Values (predicted vs. actual)


In [75]:
# Show two decimals and no scientific notation when printing arrays.
np.set_printoptions(precision=2, suppress=True)

# Predict on the held-out rows and print them side by side:
# first column = prediction, second column = true value.
Y_pred = regressor.predict(X_test)
print(np.column_stack((Y_pred, Y_test)))


[[ 1838.57   336.  ]
 [ 3935.41   493.  ]
 [ 3643.03  6950.  ]
 [ 7474.21  1448.  ]
 [ 7175.74  2841.  ]
 [ 3880.22   491.  ]
 [ 1071.1     27.  ]
 [ 1095.66     1.  ]
 [  524.92    10.  ]
 [ 2080.17   277.  ]
 [ 4240.24  3397.  ]
 [  584.21    77.  ]
 [  706.7     82.  ]
 [ 1276.8     96.  ]
 [ 2549.6    140.  ]
 [  612.64    10.  ]
 [ 1132.49  5620.  ]
 [ 1849.31   762.  ]
 [ 2240.59  2127.  ]
 [  999.82    67.  ]
 [ 1497.73     0.  ]
 [  690.16   318.  ]
 [ 1250.4      0.  ]
 [  766.36   108.  ]
 [ 3678.89    91.  ]
 [ 2662.44  4533.  ]
 [ 1342.63   361.  ]
 [ 2457.26   288.  ]
 [ 3439.91     0.  ]
 [ 1032.75     0.  ]
 [ 2941.39  6348.  ]
 [ 3278.35   630.  ]
 [ 2833.67     0.  ]
 [ 3120.39   466.  ]
 [ 1308.26     0.  ]
 [  626.48   242.  ]
 [ 5578.43   335.  ]
 [  857.81     0.  ]
 [ 5075.15  1053.  ]
 [ 3905.2   1162.  ]
 [  676.69    84.  ]
 [  816.56    17.  ]
 [ 4917.59   739.  ]
 [  966.36    62.  ]
 [  375.25   211.  ]
 [  617.9      4.  ]
 [ 1200.75  2912.  ]
 [  601.39   

In [76]:
from sklearn.metrics import r2_score
# Coefficient of determination (R^2) of the predictions on the held-out test split.
# 1.0 is a perfect fit; 0.0 matches always predicting the mean of Y_test; negative is worse than that.
r2_score(Y_test, Y_pred)

-0.15712767896231727

In [77]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent) of ``y_pred``.

    Entries whose true value is exactly zero are left out of the average,
    because their relative error is undefined (division by zero).

    Parameters
    ----------
    y_true : array-like
        Ground-truth values; converted to a float array.
    y_pred : array-like
        Predicted values with the same shape as ``y_true``; converted to a
        float array.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the entries with a
        non-zero true value, or NaN when there is no such entry (empty input
        or all-zero ``y_true``).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Only rows with a non-zero true value have a defined percentage error.
    non_zero_indices = y_true != 0
    true_kept = y_true[non_zero_indices]
    pred_kept = y_pred[non_zero_indices]

    # Nothing left to average: return NaN explicitly rather than letting
    # np.mean() of an empty array emit RuntimeWarnings on its way to NaN.
    if true_kept.size == 0:
        return np.nan

    return np.mean(np.abs((true_kept - pred_kept) / true_kept)) * 100


# MAPE (in percent) on the test split; rows whose true value is zero are excluded by the helper.
mean_absolute_percentage_error(Y_test, Y_pred)

2740.459752462284