<p style="font: 30px Georgia; 
          text-decoration: underline; 
          text-align:center; 
          padding:20px; 
          background-color:PapayaWhip "> 
    ANALYSIS &amp; PREDICTION OF COVID CASES IN DELHI-NCR 
</p>

<p style="font: 20px Georgia; 
          color: black;
          font-style: oblique;
          text-align: justify;
          padding:15px; 
          background-color:MistyRose"> 
    Importing The Required Libraries 
</p>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

<p style="font: 20px Georgia; 
          color: black;
          font-style: oblique;
          text-align: justify;
          margin-top:20px;
          padding:15px; 
          background-color:MistyRose"> 
    Exploratory Data Analysis 
</p>

In [3]:
# Load the dataset from the Excel workbook in the working directory.
# Reading .xlsx files needs the optional `openpyxl` dependency; pandas raises
# ImportError when it is not installed.
dataset = pd.read_excel('Dataset.xlsx')

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.

In [None]:
# Preview the first 10 rows of the loaded frame.
dataset.head(10)

In [None]:
# Print column dtypes and non-null counts.
dataset.info()

In [None]:
# Summary statistics, transposed so that each column of the frame becomes a row.
dataset.describe().T

In [None]:
# Draw one bar chart per non-date column against DATE.
# The panel list is derived from the frame itself instead of a hard-coded count
# of 13 and a positional `i + 1` offset, so the cell keeps working if columns are
# added, removed or reordered.
value_columns = [col for col in dataset.columns if col != 'DATE']

# squeeze=False always returns a 2-D array of axes, even for a single panel.
fig, ax = plt.subplots(len(value_columns), 1,
                       figsize=(10, 4 * len(value_columns)), squeeze=False)

for axis, column in zip(ax[:, 0], value_columns):
    axis.bar(dataset['DATE'], dataset[column], 0.6, color='Salmon')
    axis.set_title('DATE vs ' + column)
    axis.set_xlabel('DATE')
    axis.set_ylabel(column)
plt.show()

<p style="font: 20px Georgia; 
          color: black;
          font-style: oblique;
          text-align: justify;
          margin-top:20px;
          padding:15px; 
          background-color:MistyRose"> 
    Setting the Date Column as Index 
</p>

In [None]:
# Convert the DATE column to pandas datetimes (overwrites the column in place).
dataset['DATE'] = pd.to_datetime(dataset['DATE'])

In [None]:
# Use DATE as the index. After this cell DATE is no longer a column, so this cell
# (and any earlier cell that reads dataset['DATE']) cannot be re-run without
# reloading the data first.
dataset = dataset.set_index('DATE')

In [None]:
# Show only the first rows to confirm the new DATE index instead of dumping the
# whole frame into the notebook output.
dataset.head()

<p style="font: 20px Georgia; 
          color: black;
          font-style: oblique;
          text-align: justify;
          margin-top:20px;
          padding:15px; 
          background-color:MistyRose"> 
    Performing Seasonal Decomposition 
</p>

In [None]:
# Additive seasonal decomposition of the daily case counts, shown as four
# stacked panels: observed series, trend, seasonal component and residuals.
case_series = dataset['NUMBER OF COVID-19 CASES']
decomposition = seasonal_decompose(case_series, model='additive')

fig, ax = plt.subplots(4, figsize=(15, 10), constrained_layout=True)

# The first three components are drawn as lines against the DATE index.
line_components = (
    (decomposition.observed, 'Observed'),
    (decomposition.trend, 'Trend'),
    (decomposition.seasonal, 'Seasonality'),
)
for panel, (component, component_label) in zip(ax, line_components):
    panel.plot(component, label=component_label)
    panel.legend(loc='best', fontsize=16)

# Residuals are drawn as points against their row position, not the date.
residuals = decomposition.resid
ax[3].scatter(x=range(len(residuals)), y=residuals, alpha=0.5, label='Residuals')
ax[3].legend(loc='best', fontsize=16)

<p style="font: 20px Georgia; 
          color: black;
          font-style: oblique;
          text-align: justify;
          margin-top:20px;
          padding:15px; 
          background-color:MistyRose"> 
    Splitting the Data for Training and Testing
</p>

In [None]:
# Separate the regression target from the predictor columns.
target_column = 'NUMBER OF COVID-19 CASES'
Y = dataset[target_column]
X = dataset.drop(columns=target_column)

In [None]:
# Hold out a 120-day window (rows -200 up to, but not including, -80) as test data
# and train on the remaining rows only. Previously the training set was the full
# frame, so every test row had already been seen during fitting and the reported
# scores were in-sample rather than a measure of generalisation.
# NOTE(review): rows after the window are later in time than the test rows; for a
# strictly chronological evaluation train on X.iloc[:-200] / Y.iloc[:-200] instead.
is_test = np.zeros(len(X), dtype=bool)
is_test[-200:-80] = True

x_train, x_test = X.iloc[~is_test, :].values, X.iloc[is_test, :].values
y_train, y_test = Y.iloc[~is_test].values, Y.iloc[is_test].values

<p style="font: 20px Georgia; 
          color: black;
          font-style: oblique;
          text-align: justify;
          margin-top:20px;
          padding:15px; 
          background-color:MistyRose"> 
    Standardization - Scaling the Data
</p>

In [None]:
# Learn the per-feature mean and variance from the training matrix only, then
# apply that same scaling to both the training and the test matrices.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

<p style="font: 20px Georgia; 
          color: black;
          font-style: oblique;
          text-align: justify;
          margin-top:20px;
          padding:15px; 
          background-color:MistyRose"> 
    Dimensionality Reduction
</p>

In [None]:
# Project the scaled features onto principal components. The projection is
# fitted on the training matrix and then reused unchanged for the test matrix.
n_pca_components = 12
pca = PCA(n_components=n_pca_components)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

<p style="font: 20px Georgia; 
          color: black;
          font-style: oblique;
          text-align: justify;
          margin-top:20px;
          padding:15px; 
          background-color:MistyRose"> 
    Random Forest Model
</p>

In [None]:
# Fit a 50-tree random forest (fixed seed for repeatable results) on the
# training data and predict the held-out window.
rf = RandomForestRegressor(
    n_estimators=50,
    random_state=40,
)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

In [None]:
# Score the predictions against the held-out targets, rounded to 3 decimals.
mae, mse, r2 = (
    np.round(metric(y_test, y_pred), 3)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

for metric_label, metric_value in (
    ('Mean Absolute Error:', mae),
    ('Mean Squared Error:', mse),
    ('R2 score:', r2),
):
    print(metric_label, metric_value)

In [None]:
# Overlay the true test targets (red) and the model predictions (green).
fig = plt.figure(figsize=(16, 6))
plt.plot(y_test, color='red', label='Real')
plt.plot(y_pred, color='green', label='Prediction')
plt.title(f'Mean Absolute Error: {mae}', fontsize=16)
plt.legend(fontsize=16)
plt.show()

<p style="font: 20px Georgia; 
          color: black;
          font-style: oblique;
          text-align: justify;
          margin-top:20px;
          padding:15px; 
          background-color:MistyRose"> 
    Light Gradient Boosted Machine Model
</p>

In [None]:
# Fit the LightGBM regressor (2000 boosting rounds at a small learning rate)
# on the training data and predict the held-out window.
lgbm_params = dict(
    random_state=100,
    num_leaves=100,
    learning_rate=0.01,
    n_estimators=2000,
    max_depth=100,
    min_child_samples=20,
)
lgbm = LGBMRegressor(**lgbm_params)
lgbm.fit(x_train, y_train)
y_pred = lgbm.predict(x_test)

In [None]:
# Score the LightGBM predictions against the held-out targets (3 decimals).
scores = {
    'Mean Absolute Error:': np.round(mean_absolute_error(y_test, y_pred), 3),
    'Mean Squared Error:': np.round(mean_squared_error(y_test, y_pred), 3),
    'R2 score:': np.round(r2_score(y_test, y_pred), 3),
}
mae, mse, r2 = scores.values()

for score_label, score_value in scores.items():
    print(score_label, score_value)

In [None]:
# Overlay the true test targets (red) and the LightGBM predictions (green).
fig = plt.figure(figsize=(16, 6))
plt.plot(y_test, color='red', label='Real')
plt.plot(y_pred, color='green', label='Prediction')
plt.title(f'Real vs Prediction - MAE {mae}', fontsize=16)
plt.legend(fontsize=16)
plt.show()

In [None]:
# Importance scores of the fitted LightGBM model, one entry per input feature.
# The fitted attribute carries a trailing underscore (scikit-learn convention);
# `feature_importances` without it raises AttributeError.
feature_value = lgbm.feature_importances_

In [None]:
# NOTE(review): these two assignments replace model-derived importance values with
# hand-typed numbers, so everything displayed or plotted from `feature_value`
# afterwards no longer reflects the fitted model. No source for 6734 / 2992 is
# visible in this notebook -- confirm the intent or remove this cell.
feature_value[2] = 6734
feature_value[9] = 2992

In [None]:
# Display the importance array in its current state.
feature_value

In [None]:
# Build a (Value, Feature) table sorted by ascending importance.
# The model was fitted on the PCA-transformed matrix, so each importance belongs
# to a principal component rather than to an original column of X; the rows are
# therefore labelled PC1..PCn instead of being paired with X.columns.
# (Also removes a stray trailing `f` that made this cell a SyntaxError.)
component_names = [f'PC{idx + 1}' for idx in range(len(feature_value))]
feature_imp = pd.DataFrame(sorted(zip(feature_value, component_names)), columns=['Value', 'Feature'])

In [None]:
# Display the importance table.
feature_imp

In [None]:
# Horizontal bar chart of the importance values, largest first.
# The seaborn font scaling must be applied before the figure and its text are
# created. It used to run after the plot was drawn, so it had no effect on a
# fresh Restart & Run All and only showed up when the cell was run a second time.
sns.set(font_scale=10)
plt.figure(figsize=(200, 60))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.show()