In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.linear_model import LinearRegression
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split

from statsmodels.tsa.stattools import adfuller
from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.statespace.tools import diff

# Notebook-wide plotting defaults: 20x5 figures and the 'fivethirtyeight' matplotlib style.
from pylab import rcParams
rcParams['figure.figsize']=20,5
plt.style.use('fivethirtyeight')

# Silence warnings for the rest of the notebook.
# NOTE(review): "ignore" with no category hides every warning, including ones from
# pandas/statsmodels about dates and frequencies; `warnings` is also already imported
# and the same filter already installed earlier in this cell.
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Load the raw CSV; the absolute path points into the Kaggle input directory,
# so this cell only runs as-is inside that environment.
df=pd.read_csv('/kaggle/input/covid19-in-india/covid_19_india.csv')

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Convert the 'Date' column to pandas datetimes.
# NOTE(review): no format/dayfirst is given, so parsing relies on pandas' inference;
# if the raw strings are day-first, ambiguous dates could be misread — confirm against the CSV.
df['Date'] = pd.to_datetime(df['Date'])

In [None]:
# Keep only the two columns used from here on; all other columns are discarded.
df = df[['Date', 'Deaths']]

In [None]:
# df.drop(columns=['Time', 'State/UnionTerritory', 'Sno'], inplace=True)

In [None]:
# Collapse to one row per date by summing 'Deaths' over all rows sharing that date;
# 'Date' becomes the index of the result.
# NOTE(review): the resulting index carries no explicit frequency, so later time-series
# calls have to infer one — confirm the dates are contiguous.
df = df.groupby('Date').sum()

In [None]:
# Peek at the first rows of the per-date frame.
df.head()

In [None]:
# Re-check index, dtypes and non-null counts after the aggregation.
df.info()

In [None]:
# Line chart of the 'Deaths' column against the frame's index.
ax = sns.lineplot(data=df, x=df.index, y='Deaths')

# Tilt the x tick labels so neighbouring labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title('COVID-19 Deaths Over Time')
plt.show()

In [None]:
from statsmodels.tsa.stattools import adfuller

def adf_test(series,title=''):
    """
    Print an Augmented Dickey-Fuller report for a time series.

    Parameters
    ----------
    series : pandas.Series
        Series to test; missing values are dropped before the test is run.
    title : str, optional
        Text appended to the report header line.

    Returns
    -------
    None
        The report is written to stdout; nothing is returned.
    """
    print(f'Augmented Dickey-Fuller Test: {title}')

    # NaNs (e.g. those produced by differencing) are removed before testing.
    adf_output = adfuller(series.dropna(),autolag='AIC')
    p_value = adf_output[1]

    # The first four entries are the headline numbers; entry 4 maps each
    # significance level to its critical value.
    report = dict(zip(
        ['ADF test statistic','p-value','# lags used','# observations'],
        adf_output[:4],
    ))
    report.update(
        {f'critical value ({level})': cutoff for level, cutoff in adf_output[4].items()}
    )

    # .to_string() keeps the trailing "dtype: float64" line out of the printout.
    print(pd.Series(report).to_string())

    # The null hypothesis (a unit root is present) is rejected at the 5% level.
    rejects_null = p_value <= 0.05
    if rejects_null:
        verdict = (
            "Strong evidence against the null hypothesis",
            "Reject the null hypothesis",
            "Data has no unit root and is stationary",
        )
    else:
        verdict = (
            "Weak evidence against the null hypothesis",
            "Fail to reject the null hypothesis",
            "Data has a unit root and is non-stationary",
        )
    for message in verdict:
        print(message)

In [None]:
# Run the ADF report on the 'Deaths' series (no title is passed, so the header ends after the colon).
adf_test(df['Deaths'])

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Split 'Deaths' into trend, seasonal and residual components and plot them.
# The period is given explicitly: without it seasonal_decompose has to derive one
# from the index frequency and raises when the index has no frequency it can infer.
# 7 matches the weekly season (m=7) used in the auto_arima cell below.
# NOTE(review): a fixed period treats consecutive rows as evenly spaced — confirm
# the dates are contiguous.
seasonal_decompose(df['Deaths'], period=7).plot();

In [None]:
# Quick pandas line plot of every column in the frame.
# NOTE(review): the title says 'Daily'; confirm 'Deaths' holds per-day counts rather than a running total.
ax = df.plot()
ax.set_title('Daily Deaths Over Time')
plt.show()

In [None]:
def test_stationarity(timeseries):
    result = adfuller(timeseries, autolag='AIC')
    print('ADF Statistic: %f' % result[0])
    print('p-value: %f' % result[1])

# Same series as passed to adf_test above; prints only the statistic and the p-value.
test_stationarity(df['Deaths'])

In [None]:
# Options for the stepwise seasonal-ARIMA order search.
arima_search_options = dict(
    start_p=1,               # first AR order tried
    start_q=1,               # first MA order tried
    test='adf',              # ADF test decides the non-seasonal differencing order
    max_p=3,                 # upper bound for p
    max_q=3,                 # upper bound for q
    m=7,                     # seasonal period: 7 observations (weekly)
    d=None,                  # not fixed; chosen by the test above
    seasonal=True,           # include seasonal terms
    start_P=0,               # first seasonal AR order tried
    D=1,                     # seasonal differencing order, fixed at 1
    trace=True,              # print each candidate model as it is fitted
    error_action='ignore',   # skip candidates that fail to fit
    suppress_warnings=True,
    stepwise=True,           # stepwise search instead of a full grid
)

auto_model = auto_arima(df['Deaths'], **arima_search_options)

print(auto_model.summary())

In [None]:
# Re-fit the orders selected by auto_arima with statsmodels' SARIMAX.
# NOTE(review): only the two order tuples are carried over; any other setting chosen
# by auto_arima (e.g. an intercept) is presumably not reproduced here — confirm.
selected_order = auto_model.order
selected_seasonal_order = auto_model.seasonal_order

model = SARIMAX(df['Deaths'], order=selected_order, seasonal_order=selected_seasonal_order)

# disp=False keeps the optimiser's convergence output out of the cell.
results = model.fit(disp=False)

In [None]:
# Forecast 365 steps past the end of the fitted sample.
forecast = results.get_forecast(steps=365)
forecast_series = forecast.predicted_mean
confidence_intervals = forecast.conf_int()

# History, point forecast and interval band on one set of axes.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df['Deaths'], label='Historical')
ax.plot(forecast_series, label='Forecast')

# Columns 0 and 1 of the interval frame are used as the band's two edges
# (presumably lower and upper bound — confirm).
band_low = confidence_intervals.iloc[:, 0]
band_high = confidence_intervals.iloc[:, 1]
ax.fill_between(confidence_intervals.index, band_low, band_high, color='k', alpha=.25)

ax.set_title('Deaths Forecast')
ax.legend()
plt.show()

In [None]:
# Point forecasts (predicted mean) from the `forecast` object; this repeats the
# assignment already made in the forecasting cell above.
forecast_series = forecast.predicted_mean

In [None]:
# Wrap the forecast Series in a single-column DataFrame.
forecast_df=forecast_series.to_frame()

In [None]:
# Move the forecast index into an ordinary column and call it 'Date'.
# The rename only applies when reset_index names the new column 'index',
# i.e. when the forecast index itself is unnamed.
forecast_df = forecast_df.reset_index().rename(columns={'index': 'Date'})

In [None]:
# Display the forecast table.
forecast_df

In [None]:
import pandas as pd

# Month-on-month growth of deaths, built as an explicit monthly table.
#
# The monthly series is indexed by month-end dates. Writing it straight into a column
# of the daily frame relies on index alignment: only rows dated on a month-end receive
# a value, and the following dropna() then discards every other day. Building the
# monthly frame directly yields the same kind of result (one row per month, first
# month dropped) without depending on that side effect, and keeps the 'Deaths' column
# consistent with the totals the growth rate is computed from.
#
# `df` is rebound to the monthly table, so re-running this cell requires re-running
# the cells that build the per-date frame first.

daily_deaths = df['Deaths']

# 'M' bins by calendar month and labels each bin with its month-end date
# (newer pandas versions spell this alias 'ME').
# NOTE(review): summing assumes 'Deaths' holds per-day counts; if it is a running
# total, .last() is the right aggregate instead of .sum() — confirm against the dataset.
monthly_deaths = daily_deaths.resample('M').sum()

# Leave out a trailing month the data only partly covers: its total is not
# comparable with full months and would distort the last growth figure.
if len(monthly_deaths) and daily_deaths.index.max() < monthly_deaths.index[-1]:
    monthly_deaths = monthly_deaths.iloc[:-1]

# Percentage change versus the previous month. Growth from a zero total is
# undefined (pct_change yields +/-inf), so it is treated as missing.
monthly_growth = (monthly_deaths.pct_change() * 100).replace([np.inf, -np.inf], np.nan)

df = pd.DataFrame({
    'Deaths': monthly_deaths,
    'Deaths_Monthly_Growth': monthly_growth,
})

# Drop months without a defined growth rate (the first month, and any month
# that follows a month with a zero total).
df = df.dropna(subset=['Deaths_Monthly_Growth'])



In [None]:
# Display `df` as left by the monthly-growth cell above.
df