In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
from google.colab import files


# Useful links

**numpy** https://www.numpy.org/

**pandas** https://pandas.pydata.org/

**matplotlib** https://matplotlib.org/

**colab** https://colab.research.google.com/notebooks/welcome.ipynb

# **Getting the data**
The data is released by the Open Power System Data platform under a Creative Commons Attribution International license, and is based on the consumption and production of several German small businesses and households.


In [0]:
# Shortened URL kept for reference; presumably it points to the dataset CSV that is
# uploaded by hand in the next cell -- TODO confirm. None of the visible cells read it.
link = "https://bit.ly/2L5Kof1"

In [0]:
# Open Colab's file-upload widget and keep whatever it returns in `uploaded`.
# NOTE(review): `files.upload()` presumably returns a {filename: bytes} mapping --
# confirm against the google.colab docs before parsing it (e.g. through io.BytesIO).
uploaded = files.upload()

Saving ProductionConsumptionGermanIndustrialParc.csv to ProductionConsumptionGermanIndustrialParc.csv


# Handling missing values



In [0]:
def plot_df(df, title):
  """Plot `df` on one large figure, label the axes and show the result.

  Args:
    df: object exposing a pandas-style ``.plot`` method (a DataFrame in this notebook).
    title: text placed above the axes.
  """
  # Work on the Axes handed back by pandas instead of pyplot's implicit "current axes".
  axes = df.plot(title=title, figsize=(15, 10), lw=2)
  axes.set_xlabel('Date')
  axes.set_ylabel('Power (KW)')
  axes.legend(loc='best')
  plt.show()

In [0]:
def plot_multiple_features(df, subplots, fig_w, fig_h):
  """Plot each column of `df` in its own panel of a square grid of axes.

  Args:
    df: DataFrame whose columns are plotted, one column per panel.
    subplots: number of rows and of columns of the grid (``subplots x subplots`` panels).
    fig_w: figure width forwarded to ``figsize``.
    fig_h: figure height forwarded to ``figsize``.

  Columns beyond ``subplots ** 2`` are skipped because ``zip`` stops at the shorter
  of its two inputs; panels without a column stay empty.
  """
  # squeeze=False always yields a 2-D array of Axes. Without it a 1x1 grid comes back
  # as a bare Axes object and `.flatten()` raises AttributeError.
  fig, axes = plt.subplots(subplots, subplots, figsize=(fig_w, fig_h), squeeze=False)
  for column, ax in zip(df.columns, axes.flatten()):
    df[[column]].plot(ax=ax, title=column, fontsize=18)
    # Shrink the tick labels so dense date ticks stay readable in small panels.
    ax.tick_params(axis='x', labelsize=4)
    ax.tick_params(axis='y', labelsize=4, labelrotation=75)
    # The per-panel title already identifies the series; hide the x-axis label.
    ax.xaxis.label.set_visible(False)
 

# Let's try the interactive plots using plotly

**plotly** https://plot.ly/python/

**seaborn** https://seaborn.pydata.org/

In [0]:
# NOTE(review): `plotly.plotly` was moved to the separate `chart_studio` package in
# plotly 4, so this import presumably only works with plotly < 4 -- confirm the
# version this notebook is meant to run against.
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import cufflinks
# Switch cufflinks and plotly to offline/notebook mode (connected=True is passed to both).
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression of a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

def configure_plotly_browser_state():
  """Emit the RequireJS configuration used by plotly in the current output cell.

  Displays an HTML snippet that loads require.js and maps the `plotly` module
  name to the plot.ly CDN build. It is called at the top of the plotting cell.
  """
  import IPython
  requirejs_setup = IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        ''')
  display(requirejs_setup)

In [0]:
# Inject the plotly/RequireJS setup into this cell's output before drawing.
configure_plotly_browser_state()
# Month-level mean of every column, drawn as overlaid interactive bars.
monthly_mean = global_df.resample('M').mean()
monthly_mean.iplot(
    kind='bar', barmode='overlay',
    xTitle='Date', yTitle='Average per month',
    title='global metrics in Kwh')

In [0]:
import seaborn as sns
# Pairwise relationship grid over the columns of global_df; the returned grid object
# is kept in `dist` so the underlying figure can be resized on the next line.
dist = sns.pairplot(global_df)
dist.fig.set_size_inches(12,12)

In [0]:
from pylab import rcParams
rcParams['figure.figsize'] = 6, 4
# Overlay the distributions of the two series on the same axes.
# Missing readings are dropped rather than replaced by 0: zero-filling adds an
# artificial spike at 0 that distorts both the histogram and the KDE.
# Each curve gets a label so the legend tells the two colours apart.
for column, colour in (('global_consumption', 'orange'), ('grid_import', 'green')):
    sns.distplot(global_df[column].dropna(), hist=True, kde=True, color=colour, label=column,
                 hist_kws={'edgecolor': 'black'},
                 kde_kws={'linewidth': 5})
plt.legend()

# Time series analysis
## Stationarity of time series data

**statsmodels** https://www.statsmodels.org/

In [0]:
from statsmodels.tsa.stattools import adfuller
print('Dickey Fuller Test \n Null hypothezsis states that the time series are non-stationary, and the alternative hypothesis states that time series are stationary')
# Daily mean consumption; days without any reading are back-filled from the next
# available day. `.bfill()` replaces `fillna(method="bfill")`, whose `method`
# argument is deprecated in recent pandas releases, and the 'D' alias matches the
# spelling used by the other resampling cells.
daily_consumption_filled = global_df['global_consumption'].resample('D').mean().bfill()
ad_fuller_stat = adfuller(daily_consumption_filled, autolag='AIC')
# The first four entries are scalars; the fifth is a dict of critical values.
plot_ad_fuller = pd.Series(ad_fuller_stat[0:4],index=['Test statistics','p-value','number of lags used','number of observations'])
for key,value in ad_fuller_stat[4].items():
    plot_ad_fuller['Critical value (%s)'%key] = value
print(plot_ad_fuller)

In [0]:
# Daily mean consumption with empty days back-filled. `.bfill()` replaces
# `fillna(method="bfill")`, whose `method` argument is deprecated in recent pandas.
data_to_plot = global_df['global_consumption'].resample('D').mean().bfill()
# 30-day rolling mean and standard deviation: a visual check of whether the level
# and the spread of the series drift over time.
ma =  data_to_plot.rolling(30).mean()
mstd = data_to_plot.rolling(30).std()

plt.plot(data_to_plot, color = 'blue', label = "Original value")
plt.plot(ma, color= 'red',  label = "Rolling mean")
plt.plot(mstd, color= 'green',  label = "Rolling std")
plt.legend()

In [0]:
from statsmodels.graphics.tsaplots import plot_acf
# Back-fill empty days first, exactly as the ADF-test and rolling-statistics cells
# do: any NaN left in the series would propagate into the autocorrelation estimates.
# The returned figure is bound to `a`, presumably so the notebook does not render it twice.
a=plot_acf(global_df['global_consumption'].resample('D').mean().bfill(), title= 'Autocorrelation consumption')

In [0]:
from pylab import rcParams
rcParams['figure.figsize'] = 20, 10
import statsmodels.api as sm
# seasonal_decompose rejects series containing missing values, so empty days are
# back-filled first -- the same gap handling the ADF-test and rolling-statistics cells use.
consumption_decomposed= sm.tsa.seasonal_decompose(global_df['global_consumption'].resample('D').mean().bfill())
# Draws the observed / trend / seasonal / residual panels on one figure.
fig = consumption_decomposed.plot()
plt.show()

In [0]:
from statsmodels.tsa.api import SimpleExpSmoothing


rcParams['figure.figsize'] = 20, 10
# Label-based slices on a DatetimeIndex include BOTH end points. The training window
# therefore stops just before 2017-01-01 00:00:00; ending it on that timestamp put the
# same observation in train and test at once.
lagged_consumption_by_day = resampled_by_day_consumption["global_consumption_lagged_once"]
train = lagged_consumption_by_day['2016-02-11 10:15:00':'2016-12-31 23:59:59'].fillna(0)
test = lagged_consumption_by_day['2017-01-01 00:00:00':'2017-02-09 10:45:00'].fillna(0)

# Fixed smoothing level (no optimisation); forecast over the whole test period.
model = SimpleExpSmoothing(train).fit(smoothing_level=0.9)
pred = model.predict(start=test.index[0], end=test.index[-1])
# The figure title shows the smoothing level actually used by the fitted model.
plt.title(model.model.params['smoothing_level'])
# plot
plt.plot(train.index, train, label='Train')
plt.plot(test.index, test, label='Test')
plt.plot(train.index, model.fittedvalues, label='Fitted values')
plt.plot(pred.index, pred, label='Simple Exponential Smoothing')
plt.legend(loc='best')

In [0]:
from statsmodels.tsa.api import ExponentialSmoothing

# Label-based slices on a DatetimeIndex include BOTH end points. The training window
# therefore stops just before 2017-01-01 00:00:00; ending it on that timestamp put the
# same observation in train and test at once.
consumption_by_day = resampled_by_day_consumption["global_consumption"]
train = consumption_by_day['2016-02-11 10:15:00':'2016-12-31 23:59:59'].fillna(0)
test = consumption_by_day['2017-01-01 00:00:00':'2017-02-09 10:45:00'].fillna(0)
# Additive trend and additive weekly (7-day) seasonality.
# NOTE(review): a Box-Cox transform needs strictly positive data, so any gap that
# fillna(0) turned into 0 would presumably make this fit fail -- confirm the slice has no NaN.
model = ExponentialSmoothing(train, seasonal='add', trend='add',seasonal_periods=7).fit(use_boxcox=True)
pred = model.predict(start=test.index[0], end=test.index[-1])

plt.plot(train.index, train, label='Train')
plt.plot(test.index, test, label='Test')
#plt.plot(train.index, model.fittedvalues, label='Fitted values')
plt.plot(pred.index, pred, label='Holt-Winters')
plt.legend(loc='best')

# Advanced predictive modeling

**keras**  https://keras.io/

**sklearn** https://scikit-learn.org/stable/

In [0]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout

In [0]:
from keras.preprocessing.sequence import TimeseriesGenerator
# train_scaled is passed as both data and targets with length=7 and batch_size=1, so
# each batch holds one 7-step window and its target (presumably the value that follows
# the window -- confirm against the TimeseriesGenerator docs).
generator = TimeseriesGenerator(train_scaled, train_scaled, length=7,  batch_size=1)
# Sanity check: print every (window => target) pair the generator produces.
for i in range(len(generator)):
    x, y = generator[i]
    print('%s => %s' % (x, y))

In [0]:
predictor = Sequential()
# One LSTM layer reading windows of 7 time steps x 1 feature, followed by a single
# output unit that produces the next-value prediction.
predictor.add(LSTM(units=32, input_shape=(7,1)))
predictor.add(Dense(units=1))

predictor.compile(optimizer="adam", loss="mse")

# One epoch = one full pass over the generator. The previous steps_per_epoch=1 drew a
# single 1-sample batch per epoch, i.e. only 50 samples over the whole 50-epoch run.
predictor.fit_generator(generator, steps_per_epoch=len(generator), epochs=50, verbose=1)

In [0]:
# Training values followed by test values, as one continuous series.
dataset_validation = pd.concat((train['global_consumption'], test["global_consumption"]), axis=0)
# Keep the test period plus the 7 observations just before it, so that the very first
# test point already has a complete 7-step input window. Shape: (len(test) + 7, 1).
test_inputs = dataset_validation[len(dataset_validation) - len(test) - 7:].values.reshape(-1,1)
# One window per test point: rows i-7 .. i-1 are the inputs for row i. The range must
# run to len(test_inputs) (= len(test) + 7); stopping at len(test) built only
# len(test) - 7 windows and left the last 7 test points without a prediction.
test_final = [test_inputs[i-7:i, 0] for i in range(7, len(test_inputs))]
test_values = np.array(test_final)
# Add the trailing feature axis expected by the LSTM: (samples, 7, 1).
test_values_reshaped = np.reshape(test_values, (test_values.shape[0], test_values.shape[1], 1))
# NOTE(review): the training generator was built from `train_scaled`, but no scaling
# is applied to these inputs in this cell -- confirm the same scaler is applied
# before predicting, otherwise the inputs are on a different scale than the training data.
prediction = predictor.predict(test_values_reshaped)
