### Importing the Libraries


In [41]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset


In [42]:
dataset = pd.read_csv('dataset/dataset.csv', header = None)
dataset.columns = dataset.iloc[0]
dataset = dataset[1:]
dataset = dataset.drop(columns=['Year']) 

### Encoding Categorical Data


In [43]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
dataset['District'] = label_encoder.fit_transform(dataset['District'])

### Handling Missing Values


In [44]:
dataset = dataset.apply(pd.to_numeric, errors='coerce')
dataset.fillna(dataset.median(), inplace=True)
dataset['Total Cases'] = dataset.iloc[:, 37:50].sum(axis=1)
dataset = dataset.drop(dataset.columns[37:49], axis=1)

In [45]:
# Ensure month columns are accessible by checking for NaN values in a sample
print("Checking for NaN in month columns:")
print(dataset[['January', 'February', 'March']].head())

# Restructure the dataset for individual month prediction
temperature_features = [col for col in dataset.columns if 'Temperature' in col]
data = []
for _, row in dataset.iterrows():
    for month in ["January", "February", "March", "April", "May", "June", "July", 
                  "August", "September", "October", "November", "December"]:
        if pd.notna(row[month]):  # Ensure month data is available
            # Combine temperature data, month, and district as features, with cases as target
            features = row[temperature_features].tolist() + [month, row["District"]]
            target = row[month]
            data.append(features + [target])

# Create a DataFrame for the reshaped data
reshaped_data = pd.DataFrame(data, columns=temperature_features + ["Month", "District", "Cases"])

# One-hot encode the 'Month' column
reshaped_data = pd.get_dummies(reshaped_data, columns=["Month"], drop_first=True)

# Define features (X) and target (y)
X = reshaped_data.drop("Cases", axis=1)
Y = reshaped_data["Cases"]


Checking for NaN in month columns:


KeyError: "None of [Index(['January', 'February', 'March'], dtype='object', name=0)] are in the [columns]"

### Test/Train Split


In [None]:
from sklearn.model_selection import train_test_split

X = dataset.iloc[:, 0:-1].values
Y = dataset.iloc[:, -1].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

### Training Model


In [None]:
import xgboost as xgb
regressor = xgb.XGBRegressor(colsample_bytree=1, learning_rate=0.042222222222222223,
                           max_depth=7, alpha=10, n_estimators=500, min_child_weight=1, subsample=0.5, gamma=3.3333333333333335, reg_alpha=0, reg_lambda=1)

# regressor = xgb.XGBRegressor(colsample_bytree=0.7857872949109472, learning_rate=0.010344501343048346,
#                            max_depth=10, alpha=10, n_estimators=257, min_child_weight=2, subsample=0.850598292345319, gamma=3.1518448315907417, reg_alpha=0.9279458085493363, reg_lambda=7.180908650941852)

# regressor = xgb.XGBRegressor(colsample_bytree=0.7, learning_rate=0.042222222222222223,
#                            max_depth=10, alpha=10, n_estimators=50, min_child_weight=1, subsample=1, gamma=2.7777777777777777, reg_alpha=0.5, reg_lambda=10)

regressor.fit(X, Y)

#### Comparing Values


In [None]:
Y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2, suppress=True)
print(np.concatenate((Y_pred.reshape(len(Y_pred),1), Y_test.reshape(len(Y_test),1)),1))


[[  349.92   336.  ]
 [  610.55   493.  ]
 [ 6858.54  6950.  ]
 [ 1674.38  1448.  ]
 [ 2901.26  2841.  ]
 [  559.02   491.  ]
 [   42.02    27.  ]
 [  -76.87     1.  ]
 [   48.65    10.  ]
 [  482.51   278.  ]
 [ 3425.38  3398.  ]
 [  129.08   111.5 ]
 [  119.77   116.5 ]
 [  193.17    96.  ]
 [  279.92   140.  ]
 [   -3.94    10.  ]
 [ 5435.5   5621.  ]
 [  818.94   762.  ]
 [ 2184.22  2128.  ]
 [  -30.13    67.  ]
 [  -32.88     0.  ]
 [  297.61   318.  ]
 [   56.47     0.  ]
 [  229.17   142.5 ]
 [  214.61   125.5 ]
 [ 4470.95  4533.  ]
 [  452.15   361.  ]
 [  216.56   322.5 ]
 [   14.25     0.  ]
 [  -30.12     0.  ]
 [ 6221.96  6349.  ]
 [  768.76   664.5 ]
 [  249.09     0.  ]
 [  478.91   467.  ]
 [  -13.28     0.  ]
 [  232.22   276.5 ]
 [  451.17   335.  ]
 [ -128.42     0.  ]
 [ 1199.88  1053.  ]
 [ 1147.89  1163.  ]
 [   47.99    84.  ]
 [ -138.61    17.  ]
 [  710.33   739.  ]
 [  126.14    62.  ]
 [  250.66   245.5 ]
 [   90.94     4.  ]
 [ 2574.38  2946.5 ]
 [  356.37   

In [None]:
from sklearn.metrics import r2_score
r2_score(Y_test, Y_pred)

0.9979540206772833

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true, dtype=float), np.array(y_pred, dtype=float)
    non_zero_indices = y_true != 0
    return np.mean(np.abs((y_true[non_zero_indices] - y_pred[non_zero_indices]) / y_true[non_zero_indices])) * 100


mean_absolute_percentage_error(Y_test, Y_pred)

192.3296165983041