### Importing the Libraries


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset


In [2]:
# Read the CSV with no header row, then promote the first row to column
# labels and keep only the rows after it.
raw_table = pd.read_csv('dataset/dataset.csv', header=None)
header_row = raw_table.iloc[0]
dataset = raw_table.iloc[1:].set_axis(header_row, axis=1)

# 'Year' is removed before any encoding or modelling happens.
dataset = dataset.drop(columns=['Year'])

### Encoding Categorical Data


In [3]:
from sklearn.preprocessing import LabelEncoder

# Replace each 'District' value with the integer code LabelEncoder assigns it.
# The fitted encoder stays available as `label_encoder`.
label_encoder = LabelEncoder()
district_labels = dataset['District']
dataset['District'] = label_encoder.fit_transform(district_labels)

### Handling Missing Values and Building the Target


In [4]:
# Positional range of the columns that are summed into 'Total Cases' and then
# removed from the feature set. One shared slice keeps the sum and the drop in
# agreement; the original summed 37:50 but dropped 37:49, which only matched
# when the frame had exactly 49 columns (pandas clamps out-of-range slices).
# NOTE(review): presumably these are the 12 per-period case-count columns --
# confirm against the CSV header.
CASE_COLUMNS = slice(37, 49)

# Anything that cannot be parsed as a number becomes NaN ...
dataset = dataset.apply(pd.to_numeric, errors='coerce')
# ... and every NaN is then replaced by its column's median.
# NOTE(review): this also imputes the case-count columns, so the target can
# contain median-filled values, and the medians are computed before the
# train/test split -- confirm this is intended.
dataset = dataset.fillna(dataset.median())

case_labels = dataset.columns[CASE_COLUMNS]
dataset['Total Cases'] = dataset.iloc[:, CASE_COLUMNS].sum(axis=1)
# Drop the summed columns so they cannot leak into the features.
dataset = dataset.drop(columns=case_labels)

### Test/Train Split


In [5]:
from sklearn.model_selection import train_test_split

# Last column is the regression target; every other column is a feature.
feature_frame = dataset.iloc[:, :-1]
target_column = dataset.iloc[:, -1]
X = feature_frame.values
Y = target_column.values

# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=0,
)

### Training Model


In [6]:
from sklearn.ensemble import RandomForestRegressor

# Hyperparameters collected in one place so they are easy to read and adjust.
forest_params = dict(
    n_estimators=378,
    max_depth=37,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features=None,
    random_state=0,
)
regressor = RandomForestRegressor(**forest_params)
regressor.fit(X_train, Y_train)

#### Comparing Predicted vs. Actual Values


In [7]:
Y_pred = regressor.predict(X_test)

# Two-column table: predictions on the left, true targets on the right.
np.set_printoptions(precision=2, suppress=True)
predicted_vs_actual = np.column_stack((Y_pred, Y_test))
print(predicted_vs_actual)


[[ 1431.37   336.  ]
 [ 2960.42   493.  ]
 [ 3215.66  6950.  ]
 [24228.71  1448.  ]
 [ 4691.59  2841.  ]
 [ 6559.19   491.  ]
 [ 1199.8     27.  ]
 [  697.33     1.  ]
 [  253.3     10.  ]
 [ 2160.33   278.  ]
 [ 4211.1   3398.  ]
 [  225.02   111.5 ]
 [ 1435.36   116.5 ]
 [ 3019.66    96.  ]
 [  835.53   140.  ]
 [  380.46    10.  ]
 [  336.85  5621.  ]
 [ 5966.89   762.  ]
 [ 1140.96  2128.  ]
 [ 1085.02    67.  ]
 [ 2941.75     0.  ]
 [  490.34   318.  ]
 [ 2647.68     0.  ]
 [  416.31   142.5 ]
 [ 7467.54   125.5 ]
 [ 1189.91  4533.  ]
 [  469.65   361.  ]
 [ 7369.66   322.5 ]
 [ 5811.66     0.  ]
 [  557.04     0.  ]
 [ 2277.65  6349.  ]
 [ 7320.89   664.5 ]
 [ 3919.44     0.  ]
 [ 4148.05   467.  ]
 [ 1583.33     0.  ]
 [  422.7    276.5 ]
 [ 9170.92   335.  ]
 [  773.72     0.  ]
 [ 8881.5   1053.  ]
 [ 3929.93  1163.  ]
 [  441.87    84.  ]
 [ 1285.71    17.  ]
 [ 7987.58   739.  ]
 [  547.77    62.  ]
 [  293.2    245.5 ]
 [  156.31     4.  ]
 [  843.21  2946.5 ]
 [  262.52   

In [8]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the held-out rows.
r2_score(y_true=Y_test, y_pred=Y_pred)

-1.7283836361255065

In [9]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred``, in percent.

    Entries whose true value is exactly zero are left out of the average,
    because their percentage error would require dividing by zero.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values; must hold as many elements as ``y_true``.

    Returns
    -------
    numpy.float64
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the entries where
        ``y_true`` is non-zero, or NaN when there are no such entries.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # Flatten both inputs so that a column vector (n, 1) paired with a flat
    # (n,) array is compared element-wise rather than broadcast to (n, n).
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            "y_true and y_pred must have the same number of elements "
            f"(got {y_true.size} and {y_pred.size})"
        )

    non_zero_indices = y_true != 0
    if not non_zero_indices.any():
        # Nothing to average: return NaN explicitly instead of letting
        # np.mean warn about an empty slice.
        return np.float64(np.nan)

    relative_errors = (
        y_true[non_zero_indices] - y_pred[non_zero_indices]
    ) / y_true[non_zero_indices]
    return np.mean(np.abs(relative_errors)) * 100


# MAPE on the held-out rows; rows whose true value is zero are excluded.
mean_absolute_percentage_error(y_true=Y_test, y_pred=Y_pred)

2082.1027939636056