In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from statsmodels.tsa.arima_model import ARIMA
from matplotlib import pyplot
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [3]:
from matplotlib import pyplot as plt


In [4]:
data= pd.read_csv('../input/hourdata/Power_consumption.csv')

In [5]:
data.head()

In [6]:
## We will take AC 1 first and try to forecast it.
## We will also consider the temperature variable (tempC).

In [7]:
## Before forecasting, we will do a time series analysis first and look at seasonality etc.


In [8]:
value= data[['AC 1','tempC']]

In [9]:
value.shape

In [10]:
## We might add holidays as well here 

In [11]:
holiday = pd.read_html('https://www.calendarlabs.com/holidays/india/2021')


In [12]:
len(holiday)

In [13]:
holiday=holiday[1]

In [14]:
holiday

In [15]:
def remove_char(x):
    """Return ``x`` with its last six characters removed.

    Applied to the scraped holiday ``DATE`` strings before they are parsed.
    NOTE(review): presumably the six characters are a fixed-width suffix that the
    calendar page appends to each date -- confirm against the scraped table.
    An input of six characters or fewer yields an empty string.
    """
    suffix_len = 6
    return x[:-suffix_len]

In [16]:
holiday['DATE'] = holiday['DATE'].apply(remove_char)


In [17]:
holiday.head()

In [18]:
from datetime import datetime


def mdy_to_ymd(d):
    """Convert a date string such as ``'Jan 26, 2021'`` to ISO ``'2021-01-26'``.

    ``d`` must match ``'%b %d, %Y'`` (abbreviated month name, day, four-digit
    year); ``datetime.strptime`` raises ``ValueError`` for anything else.
    """
    parsed = datetime.strptime(d, '%b %d, %Y')
    return parsed.strftime('%Y-%m-%d')

In [19]:
holiday['DATE'] = holiday['DATE'].apply(mdy_to_ymd)


In [20]:
holiday.head()

In [21]:
# Re-date the scraped 2021 holiday calendar to 2019 by subtracting two calendar years.
# NOTE(review): this assumes each holiday falls on the same month/day in 2019 as in
# 2021; holidays whose dates move from year to year would be flagged on the wrong
# day -- confirm, or scrape the 2019 calendar instead.
holiday['DATE'] = pd.to_datetime(holiday.DATE) - pd.offsets.DateOffset(years=2)


In [22]:
holiday.drop(['DAY'],axis=1,inplace=True)

In [23]:
holiday.head()

In [24]:
holiday['holiday_flag']=1

In [25]:
holiday.drop(['HOLIDAY'],axis=1,inplace=True)

In [26]:
dates=holiday['DATE'].dt.date.tolist()

In [27]:
data['date_time']= pd.to_datetime(data['date_time'])

In [28]:
# Index the frame by timestamp and conform it to a regular hourly grid.
# An explicit Hour offset is used instead of the 'H' string alias, which recent
# pandas releases deprecate in favour of 'h'; the offset object behaves the same
# on old and new versions.
# NOTE(review): asfreq inserts all-NaN rows for any hour missing from the CSV --
# confirm the raw data has no gaps, since the ACF/ADF cells read 'AC 1' directly.
data = data.set_index('date_time').asfreq(pd.offsets.Hour())


In [29]:
data.head()

In [30]:
#data.loc['2019-08-01':'2019-08-15']
data['date']= pd.to_datetime(data.index).date

In [31]:
data['holiday_flag']=0

In [32]:
pd.to_datetime(dates[0])

In [33]:
# Flag every hourly row whose calendar day appears in the holiday list.
# The match is done on normalized timestamps (datetime64 vs datetime64) rather than
# by comparing the object-dtype 'date' column against a Timestamp: pandas 2.x treats
# a datetime.date and a Timestamp as unequal, so that comparison flags nothing.
# A vectorized mask also marks days that have a single row (the former
# `len(indexes) > 1` check skipped them) and no longer rebinds the notebook-level
# name `value`, which an earlier cell uses for the AC 1 / tempC frame.
# Assumes data is indexed by a timezone-naive DatetimeIndex, as produced by the
# set_index('date_time') cell.
holiday_days = pd.to_datetime(dates)
is_holiday = data.index.normalize().isin(holiday_days)
data.loc[is_holiday, 'holiday_flag'] = 1
            

In [34]:
plot_acf(data['AC 1'],lags=24)  # Daily lag
plt.show()

In [35]:
plot_acf(data['AC 1'],lags=168)  # weekly lag
plt.show()

In [36]:
# Partial autocorrelation of AC 1 over the first 24 lags (one day of hourly samples).
plot_pacf(data['AC 1'],lags=24)
plt.show()

In [37]:
# Partial autocorrelation of AC 1 over 168 lags (one week of hourly samples).
plot_pacf(data['AC 1'],lags=168)
plt.show()

In [38]:
## The PACF drops off straight after the 1st lag, which suggests an AR term.
## The autocorrelation plot shows hints of seasonality.

In [39]:
data.sort_index(inplace= True)

In [40]:
from statsmodels.tsa.seasonal import seasonal_decompose
# Additive trend / seasonal / residual decomposition of the hourly AC 1 series.
# No period is passed, so statsmodels has to derive it from the index frequency
# set by asfreq -- NOTE(review): confirm the inferred period is the intended one.
decompose_data = seasonal_decompose(data['AC 1'], model="additive")
decompose_data.plot();  # trailing ';' keeps the figure repr out of the cell output

In [41]:
seasonality=decompose_data.seasonal
seasonality.plot(color='green')

In [42]:
#df = pd.DataFrame(
#        {'date_time': pd.date_range('2019-08-01', '2019-10-01', freq='1H', closed='left')}
 #    )

In [43]:
daily=data[['AC 1']].resample('D').mean()

In [44]:
from statsmodels.tsa.seasonal import seasonal_decompose
# Same additive decomposition, now on the daily means of AC 1 (`daily`).
# This rebinds `decompose_data`, replacing the hourly decomposition result.
# NOTE(review): the period is again left for statsmodels to infer from the
# daily index frequency -- confirm it matches the intended seasonality.
decompose_data = seasonal_decompose(daily, model="additive")
decompose_data.plot();  # trailing ';' keeps the figure repr out of the cell output

In [45]:
seasonality=decompose_data.seasonal
seasonality.plot(color='green')

In [46]:
# We can easily see the seasonality pattern when the data is resampled to daily frequency.

In [47]:
from statsmodels.tsa.stattools import adfuller
# Augmented Dickey-Fuller unit-root test on AC 1, with the lag order chosen by AIC.
dftest = adfuller(data['AC 1'], autolag = 'AIC')

# The scalar results sit at positions 0-3 of the returned tuple; zip stops after
# the four labels, so later tuple entries are not printed here.
summary_labels = (
    "1. ADF : ",
    "2. P-Value : ",
    "3. Num Of Lags : ",
    "4. Num Of Observations Used For ADF Regression and Critical Values Calculation :",
)
for label, stat in zip(summary_labels, dftest):
    print(label, stat)

# Position 4 is a mapping of significance level -> critical value.
print("5. Critical Values :")
for level, threshold in dftest[4].items():
    print("\t", level, ": ", threshold)


In [48]:
# The p-value is already below 0.05, so the unit-root null is rejected and the series can be treated as stationary.

In [49]:
data.columns

In [50]:
# Keep only the target and the holiday regressor, then split chronologically:
# 1 Aug - 15 Sep 2019 for fitting, 16 - 30 Sep 2019 for evaluation.
# .copy() makes train/test independent frames, so adding columns to them later
# (e.g. the 'predicted' column) does not raise SettingWithCopyWarning or depend
# on whether pandas handed back a view of model_data.
model_data = data[['AC 1','holiday_flag']]
train = model_data.loc['2019-08-01':'2019-09-15'].copy()
test = model_data.loc['2019-09-16':'2019-09-30'].copy()

In [51]:
train['AC 1'].plot(figsize=(25,4))
test['AC 1'].plot(figsize=(25,4))

In [52]:
# Target series and regressors (intercept + holiday flag) for the training window.
endog = train['AC 1']
exog = sm.add_constant(train[['holiday_flag']])

# Seasonal ARIMA with exogenous regressors: (1,1,1) non-seasonal terms and a
# (1,1,0) seasonal part with a 168-step season (one week of hourly samples).
# NOTE(review): both differencing orders are 1 even though the ADF cell concluded
# the series is stationary -- confirm the differencing is intended.
mod = SARIMAX(
    endog=endog,
    exog=exog,
    order=(1, 1, 1),
    seasonal_order=(1, 1, 0, 168),
)
model_fit = mod.fit()
model_fit.summary()

In [53]:
train['AC 1'].plot(figsize=(25,10))
model_fit.fittedvalues.plot()
plt.show()

In [54]:
# Out-of-sample forecast covering the whole test window. Steps are counted from
# the start of the training sample, so the first forecast position is len(train).
test_exog = sm.add_constant(test[['holiday_flag']])
first_step = len(train)
last_step = first_step + len(test) - 1
predict = model_fit.predict(start=first_step, end=last_step, exog=test_exog)

# Attach the forecasts positionally (via .values) next to the observed values.
test['predicted'] = predict.values
test.tail(5)

In [55]:
test[['AC 1','predicted']].plot()

In [56]:
del model_fit

In [57]:
## We have built a simple model here without using temperature.
## The holiday flag as an exogenous variable doesn't add anything, as it is almost always 0.

In [58]:
## We will try to forecast the same series with FB Prophet as well (simple forecasting).

In [59]:
data.to_csv('hourly_data.csv',header=True,index=True)