In [12]:
import pandas as pd
import requests
import json
import numpy as np
import matplotlib.pyplot as plt

%matplotlib notebook
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 500)

import warnings
warnings.filterwarnings('ignore')

In [2]:
url = "https://api.covid19india.org/data.json"
r = requests.get(url)
data = r.json()

In [3]:
data = data['cases_time_series']

In [4]:
df = pd.DataFrame(data)

In [5]:
df = df.astype({'totalconfirmed': 'int64', 'totalrecovered': 'int64', 'totaldeceased': 'int64',
               'dailyconfirmed': 'int64', 'dailyrecovered': 'int64', 'dailydeceased': 'int64'})

In [6]:
df.columns

Index(['dailyconfirmed', 'dailydeceased', 'dailyrecovered', 'date', 'dateymd',
       'totalconfirmed', 'totaldeceased', 'totalrecovered'],
      dtype='object')

In [7]:
columns_to_keep = ['dateymd', 'dailyconfirmed']

In [8]:
df = df[columns_to_keep]

In [9]:
df.rename(columns = {'dateymd':'Date', 'dailyconfirmed':'Confirmed'}, inplace = True)

In [10]:
df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format = True)
indexdf = df.set_index(['Date'])

In [13]:
indexdf.plot()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1fe5d593d68>

### Determine Rolling Statistics

In [14]:
rolmean = indexdf.rolling(window = 7).mean()
rolstd = indexdf.rolling(window = 7).std()

In [15]:
plt.figure()
orig = plt.plot(indexdf, color = 'blue', label = 'Original')
mean = plt.plot(rolmean, color = 'red', label = 'Rolling Mean')
st = plt.plot(rolstd, color = 'black', label = 'Rolling Std')
plt.legend(loc = 'best')
plt.title('Rolling  Mean & Standard Deviation')
plt.show()

<IPython.core.display.Javascript object>

### Augumented Dickey Fuller test

In [16]:
from statsmodels.tsa.stattools import adfuller
print('Results of Dickey-Fuller Test:')
dftest = adfuller(indexdf.Confirmed, autolag = 'AIC')
dfoutput = pd.Series(dftest[0:4], index = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
for key, value in dftest[4].items():
    dfoutput['Critical Value (%s)' %key] = value
print(dfoutput)

Results of Dickey-Fuller Test:
Test Statistic                  -1.720497
p-value                          0.420584
#Lags Used                      17.000000
Number of Observations Used    349.000000
Critical Value (1%)             -3.449227
Critical Value (5%)             -2.869857
Critical Value (10%)            -2.571201
dtype: float64


## Estimating Trends

### Log Scale

In [17]:
new_index_df = indexdf.copy()
new_index_df['Confirmed'] = new_index_df['Confirmed'].replace(0, 1)
indexdf_logscale = np.log(new_index_df)
test_stationarity(indexdf_logscale)

NameError: name 'test_stationarity' is not defined

In [None]:
movingAverage = indexdf_logscale.rolling(window = 7).mean()
movingSTD = indexdf_logscale.rolling(window = 7).std()

In [None]:
plt.figure()
orig = plt.plot(indexdf_logscale, color = 'blue', label = 'Log_Scale')
mean = plt.plot(movingAverage, color = 'red', label = 'Rolling Mean')
st = plt.plot(movingSTD, color = 'black', label = 'Rolling Std')
plt.legend(loc = 'best')
plt.title('Rolling  Mean & Standard Deviation')
plt.show()

### Log Scale Minus Moving Average

In [None]:
df_index_logscale_movingaverage = indexdf_logscale - movingAverage
df_index_logscale_movingaverage.dropna(inplace = True)

In [None]:
def test_stationarity(timeseries):
    
    #Determining rolling statistics
    rolmean = timeseries.rolling(window = 7).mean()
    rolstd = timeseries.rolling(window = 7).std()
    
    #plotting rolling statistics
    plt.figure()
    orig = plt.plot(timeseries, color = 'blue', label = 'Original')
    mean = plt.plot(rolmean, color = 'red', label = 'Rolling Mean')
    st = plt.plot(rolstd, color = 'black', label = 'Rolling Std')
    plt.legend(loc = 'best')
    plt.title('Rolling  Mean & Standard Deviation')
    plt.show()
    
    #Dickey_Fuller test:
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries.Confirmed, autolag = 'AIC')
    dfoutput = pd.Series(dftest[0:4], index = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' %key] = value
    print(dfoutput)

In [None]:
test_stationarity(df_index_logscale_movingaverage)

### Log Scale Minus Exponential Decay Weighted Average

In [None]:
exp_decay_weighted_avg = indexdf_logscale.ewm(halflife = 2).mean()
plt.figure()
plt.plot(indexdf_logscale)
plt.plot(exp_decay_weighted_avg)
plt.show()

In [None]:
df_index_logscale_exp_decay_weighted_avg = indexdf_logscale - exp_decay_weighted_avg
test_stationarity(df_index_logscale_exp_decay_weighted_avg)

### Log Difference Shifting

In [None]:
df_log_diff_shift = indexdf_logscale - indexdf_logscale.shift()
plt.figure()
plt.plot(df_log_diff_shift)

In [None]:
df_log_diff_shift.dropna(inplace = True)
test_stationarity(df_log_diff_shift)

### Square Root 

In [None]:
indexdf_sqrt = np.sqrt(indexdf_logscale)
rolmean = indexdf_sqrt.rolling(window = 7).mean()
rolstd = indexdf_sqrt.rolling(window = 7).std()
plt.figure()
plt.plot(indexdf_sqrt)
plt.plot(rolmean)
plt.plot(rolstd)

In [None]:
test_stationarity(indexdf_sqrt)

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(indexdf_sqrt)
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

plt.figure()
plt.subplot(411)
plt.plot(indexdf_sqrt, label = 'Original')
plt.legend(loc = 'best')
plt.subplot(412)
plt.plot(trend, label = 'Trend')
plt.legend(loc = 'best')
plt.subplot(413)
plt.plot(seasonal, label = 'Seasonality')
plt.legend(loc = 'best')
plt.subplot(414)
plt.plot(residual, label = 'Residual')
plt.legend(loc = 'best')

In [None]:
indexdf_sqrt_decompose = residual
indexdf_sqrt_decompose.dropna(inplace = True)
test_stationarity(indexdf_sqrt_decompose)

In [None]:
indexdf_sqrt_diff = indexdf_sqrt - indexdf_sqrt.shift(50)
plt.figure()
plt.plot(indexdf_sqrt_diff)

In [None]:
plt.figure()
pd.plotting.autocorrelation_plot(indexdf_sqrt_diff)

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plt.figure()
plt.subplot(211)
plot_acf(indexdf_sqrt_diff, ax=plt.gca())
plt.subplot(212)
plot_pacf(indexdf_sqrt_diff, ax=plt.gca())
plt.show()

In [None]:
from statsmodels.tsa.arima_model import ARIMA, ARMAResults

In [None]:
model = ARIMA(indexdf_sqrt, order = (2,2,2))
results_AR = model.fit()
plt.figure()
plt.plot(indexdf_sqrt)
plt.plot(results_AR.fittedvalues, color = 'red')
#plt.title('RSS: %.4f'% sum((results_AR.fittedvalues - indexdf_sqrt)**2))