In [31]:
import datetime
import os
from collections import OrderedDict

import boto3
import numpy as np
import pandas as pd
import pymysql
from keras.layers import LSTM
from keras.layers import Dense
from keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
from sqlalchemy import create_engine

In [32]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table.

    Each output row holds `n_in` lagged observations followed by `n_out`
    observations starting at time t.

    Parameters:
        data: anything `pd.DataFrame` accepts (flat list, nested list,
            1-D or 2-D array, DataFrame). One column per variable.
        n_in: number of lag steps used as input (t-n_in ... t-1).
        n_out: number of steps used as output (t ... t+n_out-1).
        dropnan: when True, rows containing NaN introduced by shifting
            are removed.

    Returns:
        DataFrame whose columns are named 'var<j>(t-<i>)', 'var<j>(t)' and
        'var<j>(t+<i>)'.
    """
    df = pd.DataFrame(data)
    # Count variables on the frame itself: this is correct for flat lists,
    # nested lists and 1-D arrays, where `data.shape[1]` or the "list means
    # one variable" shortcut would crash or mislabel the columns.
    n_vars = df.shape[1]
    cols, names = [], []

    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names.extend('var%d(t-%d)' % (j + 1, lag) for j in range(n_vars))

    # forecast sequence (t, t+1, ... t+n)
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names.extend('var%d(t)' % (j + 1) for j in range(n_vars))
        else:
            names.extend('var%d(t+%d)' % (j + 1, step) for j in range(n_vars))

    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names

    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg


In [33]:
def prepare_data(series, n_lag, n_seq):
    """Scale a single-variable series to [-1, 1] and frame it for supervised learning.

    Parameters:
        series: object exposing `.values` (e.g. a pandas Series or a
            one-column DataFrame) holding one observation per time step.
        n_lag: number of lagged steps used as model input.
        n_seq: number of future steps used as model output.

    Returns:
        (scaler, supervised_values): the fitted MinMaxScaler, needed later to
        invert the scaling, and the supervised table as a 2-D array.
    """
    # extract raw values
    raw_values = series.values
    # The scaler works on 2-D (n_samples, n_features) input, so the column
    # shape has to be established BEFORE scaling rather than after it.
    raw_values = raw_values.reshape(len(raw_values), 1)

    # rescale values to -1, 1
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaled_values = scaler.fit_transform(raw_values)

    # transform into supervised learning problem X, y
    supervised = series_to_supervised(scaled_values, n_lag, n_seq)
    supervised_values = supervised.values

    return scaler, supervised_values

In [34]:
def create_lstm(X, y, n_lag, n_seq, n_batch, n_neurons):
    """Build and compile a single-layer stateful LSTM sized for X and y.

    Parameters:
        X: 3-D input array; its 2nd and 3rd dimensions set the input shape.
        y: 2-D target array; its 2nd dimension sets the output width.
        n_lag, n_seq: accepted for signature compatibility; not used here.
        n_batch: fixed batch size baked into the stateful layer.
        n_neurons: number of LSTM units.

    Returns:
        The compiled (untrained) Sequential model.
    """
    input_signature = (n_batch, X.shape[1], X.shape[2])
    output_width = y.shape[1]

    network = Sequential()
    network.add(LSTM(n_neurons, batch_input_shape=input_signature, stateful=True))
    network.add(Dense(output_width))
    network.compile(loss='mean_squared_error', optimizer='adam')
    return network

In [35]:
def fit_lstm(nb_epoch, model, n_batch, X=None, y=None):
    """Train a stateful model one epoch at a time, resetting state after each epoch.

    Parameters:
        nb_epoch: number of passes over the training data.
        model: object exposing `fit(...)` and `reset_states()`.
        n_batch: batch size forwarded to `model.fit`.
        X, y: training inputs and targets. When omitted, the notebook-level
            variables `X` and `y` are used, which keeps older three-argument
            calls working.

    Raises:
        NameError: if X or y is neither passed nor defined at notebook level.
    """
    if X is None or y is None:
        notebook_scope = globals()
        missing = [label for label, value in (('X', X), ('y', y))
                   if value is None and label not in notebook_scope]
        if missing:
            raise NameError(
                "fit_lstm needs %s: pass it as an argument or define it at notebook level"
                % ' and '.join(missing))
        X = notebook_scope['X'] if X is None else X
        y = notebook_scope['y'] if y is None else y

    # fit network - reset state between epochs
    for _ in range(nb_epoch):
        model.fit(X, y, epochs=1, batch_size=n_batch, verbose=1, shuffle=False)
        model.reset_states()

In [36]:
def forecast_lstm(model, X, n_batch):
    """Run one prediction pass and return the first predicted row as a list.

    The model's state is reset after predicting. Only row 0 of the
    prediction array is returned.
    """
    predictions = model.predict(X, batch_size=n_batch)
    model.reset_states()
    first_row = predictions[0, :]
    return list(first_row)

In [37]:
def inverse_transform(series, forecast, scaler):
    """Map a scaled forecast back through the scaler, then exponentiate it.

    Parameters:
        series: not used by this function.
        forecast: sequence of scaled forecast values.
        scaler: object exposing `inverse_transform` on a (1, n) array.

    Returns:
        1-D array with one value per forecast step.
    """
    # the scaler is handed a single row holding every forecast step
    as_row = np.array(forecast).reshape(1, -1)

    # invert scaling
    # NOTE(review): np.exp implies the scaler was fitted on log-transformed
    # values; prepare_data in this notebook applies no log - confirm callers do.
    unscaled = scaler.inverse_transform(as_row)
    restored = np.exp(unscaled)

    return restored[0, :]

In [38]:
def extrapolate_timeslots(start_date, num_days=1):
    """Build the (forecast_date, timeslot) grid for the days following start_date.

    Parameters:
        start_date: value supporting `+ datetime.timedelta` and `.date()`
            (e.g. a datetime.datetime).
        num_days: number of days to generate, beginning the day after
            start_date.

    Returns:
        DataFrame with columns "forecast_date" and "timeslot"; every day
        contributes timeslots 1..96.
    """
    slots_in_day = 96
    day_column = []
    slot_column = []

    for offset in range(1, num_days + 1):
        day = (start_date + datetime.timedelta(days=offset)).date()
        day_column.extend([day] * slots_in_day)
        slot_column.extend(range(1, slots_in_day + 1))

    grid = {
        "forecast_date": day_column,
        "timeslot": slot_column
    }
    return pd.DataFrame.from_dict(grid)

In [39]:
# Database connection settings.
# Non-secret values can be overridden through the environment and otherwise keep
# their previous defaults. The password is read ONLY from the environment so it
# is never stored in the notebook; a missing YM_DB_PASSWORD raises KeyError here.
# NOTE(review): the password that used to be committed in this cell should be
# treated as compromised and rotated.
db_host = os.environ.get("YM_DB_HOST", "yieldmanagement.c2jaydssljuc.us-west-2.rds.amazonaws.com")
db_username = os.environ.get("YM_DB_USERNAME", "ymdsmaster")
db_password = os.environ["YM_DB_PASSWORD"]
db_name = os.environ.get("YM_DB_NAME", "yieldmanagement")
db_port = os.environ.get("YM_DB_PORT", "6174")

conn = create_engine(
    'mysql+pymysql://' + db_username + ':' + db_password + '@' + db_host + ':' + db_port + '/' + db_name,
    echo=False)

# Query window and channel selection used by the data-loading cell
start_date = '2017-07-01'
end_date = '2017-12-31'
channel = 'ESPN'
network_number = "33"  # This is ESPN

In [40]:
# Build the impressions query for the configured date window and channel
query = "".join([
    "SELECT network_label, start_time,start_date,household_impressions, timeslot FROM yieldmanagement.rentrak_impressions_old where start_date BETWEEN '",
    start_date,
    "' AND '",
    end_date,
    "' AND network_label IN ('",
    channel,
    "') ORDER BY network_label, start_time;",
])

# Load the rows, index them by start_date and keep only the
# household_impressions and timeslot columns
series = (
    pd.read_sql(query, con=conn)
    .set_index(keys=['start_date'])
    .drop(['network_label', 'start_time'], axis=1)
)


In [41]:
# Sanity-check the loaded frame: dimensions, column labels, then the first rows.
# NOTE(review): the recorded output shows each print as a tuple, which is what a
# Python 2 print statement yields for a parenthesised pair - confirm the kernel version.
print("series shape", series.shape) 
print("series columns", series.columns)
series.head()

('series shape', (17664, 2))
('series columns', Index([u'household_impressions', u'timeslot'], dtype='object'))


Unnamed: 0_level_0,household_impressions,timeslot
start_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-07-01,45365,1
2017-07-01,43367,2
2017-07-01,42509,3
2017-07-01,39906,4
2017-07-01,42388,5


In [42]:
# NOTE(review): this head() is not the last expression of the cell, so its result is discarded.
series.head()
# Go from long to wide: the values of `timeslot` become a column level.
# NOTE(review): `series` is rebound to the pivoted frame, so re-running this cell
# operates on already pivoted data rather than on the loaded rows.
series = series.pivot(columns='timeslot')

In [43]:
# Keep only the first two pivoted columns (displayed as household_impressions for timeslots 1 and 2)
data = series.iloc[:,0:2]

In [44]:
data.head()

Unnamed: 0_level_0,household_impressions,household_impressions
timeslot,1,2
start_date,Unnamed: 1_level_2,Unnamed: 2_level_2
2017-07-01,45365,43367
2017-07-02,180097,182934
2017-07-03,78902,69084
2017-07-04,92381,78948
2017-07-05,65659,61542


In [45]:
# Plain array view of `data`, passed to series_to_supervised in a later cell
values = data.values

In [46]:
values.shape

(184, 2)

In [63]:
data.head()

Unnamed: 0_level_0,household_impressions,household_impressions
timeslot,1,2
start_date,Unnamed: 1_level_2,Unnamed: 2_level_2
2017-07-01,45365,43367
2017-07-02,180097,182934
2017-07-03,78902,69084
2017-07-04,92381,78948
2017-07-05,65659,61542


In [64]:
# NOTE(review): `df` is only assigned by the cell `df = series_to_supervised(values,1,3)`,
# which appears further down; the execution counts (64 here, 47 there) show this cell was
# run out of order, and on a fresh top-to-bottom run `df` would not exist yet.
df.head()

Unnamed: 0,var1(t-1),var2(t-1),var1(t),var2(t),var1(t+1),var2(t+1),var1(t+2),var2(t+2)
1,45365.0,43367.0,180097,182934,78902.0,69084.0,92381.0,78948.0
2,180097.0,182934.0,78902,69084,92381.0,78948.0,65659.0,61542.0
3,78902.0,69084.0,92381,78948,65659.0,61542.0,55148.0,49327.0
4,92381.0,78948.0,65659,61542,55148.0,49327.0,50932.0,46587.0
5,65659.0,61542.0,55148,49327,50932.0,46587.0,55319.0,50486.0


In [47]:
# Frame the two-column array with 1 lagged step as input and 3 steps as output.
# NOTE(review): the literals 1 and 3 repeat the values later given to n_lag and n_seq.
df = series_to_supervised(values,1,3)

In [48]:
df.head()

Unnamed: 0,var1(t-1),var2(t-1),var1(t),var2(t),var1(t+1),var2(t+1),var1(t+2),var2(t+2)
1,45365.0,43367.0,180097,182934,78902.0,69084.0,92381.0,78948.0
2,180097.0,182934.0,78902,69084,92381.0,78948.0,65659.0,61542.0
3,78902.0,69084.0,92381,78948,65659.0,61542.0,55148.0,49327.0
4,92381.0,78948.0,65659,61542,55148.0,49327.0,50932.0,46587.0
5,65659.0,61542.0,55148,49327,50932.0,46587.0,55319.0,50486.0


In [49]:
# Parameters for the supervised framing and the train/test split
n_lag = 1    # lagged steps used as model input
n_seq = 3    # steps predicted per sample
n_test = 10  # trailing supervised rows held out for testing

In [68]:
def prepare_data1(series, n_test, n_lag, n_seq):
    """Frame a (multi-variable) series for supervised learning and split it.

    No scaling is applied; values are passed through as they are.

    Parameters:
        series: object exposing `.values`, one column per variable.
        n_test: number of trailing supervised rows reserved for testing.
        n_lag: number of lagged steps used as model input.
        n_seq: number of steps used as model output.

    Returns:
        (train, test): 2-D arrays; `test` holds the last `n_test` rows.
    """
    # extract raw values
    raw_values = series.values
    # Accept any number of variables: only a 1-D input needs to be lifted to
    # a single column, 2-D input already has one column per variable.
    if raw_values.ndim == 1:
        raw_values = raw_values.reshape(-1, 1)

    # transform into supervised learning problem X, y
    supervised = series_to_supervised(raw_values, n_lag, n_seq)
    supervised_values = supervised.values

    # split into train and test sets
    # An explicit split index keeps n_test == 0 meaningful (all rows train,
    # none test); slicing with -0 would put every row in the test set.
    split_at = max(len(supervised_values) - n_test, 0)
    train, test = supervised_values[:split_at], supervised_values[split_at:]
    return train, test

In [76]:
# Frame `data` and hold out the last n_test supervised rows
train, test = prepare_data1(series=data, n_test=n_test, n_lag=n_lag, n_seq=n_seq)

In [77]:
train.shape

(171, 8)

In [78]:
test.shape

(10, 8)

In [81]:
# The first two columns are the lagged inputs; the remaining columns are the targets
X_train = train[:, :2]
y_train = train[:, 2:]
X_test = test[:, :2]
y_test = test[:, 2:]

In [82]:
# Insert a middle axis of length 1 so the inputs become (rows, 1, columns),
# the 3-D layout the LSTM's batch_input_shape is built from.
# NOTE(review): this overwrites X_train / X_test; re-running the cell on the
# already 3-D arrays would request shape (rows, 1, 1) - run it once per kernel.
X_train = X_train.reshape(X_train.shape[0],1,X_train.shape[1])
X_test = X_test.reshape(X_test.shape[0],1,X_test.shape[1])

In [89]:
# Confirm the 3-D input shapes before building the network
print("X_train Shape: {}".format(X_train.shape))
print("X_test Shape: {}".format(X_test.shape))

X_train Shape: (171, 1, 2)
X_test Shape: (10, 1, 2)


In [98]:
# Variables for the LSTM model
n_batch = 1
nb_epoch = 2
n_neurons = 128

# Creating the LSTM model
model = Sequential()
model.add(LSTM(n_neurons, batch_input_shape=(n_batch, X_train.shape[1], X_train.shape[2]), stateful=True, return_sequences=True))
model.add(LSTM(500))
model.add(Dense(y_train.shape[1]))
model.compile(loss='mean_squared_error', optimizer='adam',metrics=['accuracy'])


In [99]:
# Train for 10 epochs with batches of n_batch samples; shuffle=False keeps the rows in time order.
# NOTE(review): `nb_epoch` from the model-setup cell is not used here.
# NOTE(review): the first LSTM layer is stateful and no reset_states() call appears
# between epochs - confirm carrying state across epochs is intended.
model.fit(X_train, y_train, epochs=10, batch_size=n_batch, verbose=1, shuffle=False)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff7824f8550>

In [100]:
# Predict the target columns for every held-out row, using the same batch size as training.
# NOTE(review): the recorded predictions are ~540 for every row while y_test is in the
# 1e5 range; the inputs are unscaled (prepare_data1 applies no scaler) - investigate.
forecast = model.predict(X_test, batch_size=n_batch)

In [102]:
y_test

array([[  83520.,   75734.,   85466.,   78397.,   92250.,   83728.],
       [  85466.,   78397.,   92250.,   83728.,  171923.,  165819.],
       [  92250.,   83728.,  171923.,  165819.,  105952.,  100245.],
       [ 171923.,  165819.,  105952.,  100245.,  195689.,  163366.],
       [ 105952.,  100245.,  195689.,  163366.,  360417.,  286445.],
       [ 195689.,  163366.,  360417.,  286445.,  275068.,  211802.],
       [ 360417.,  286445.,  275068.,  211802.,  300847.,  291940.],
       [ 275068.,  211802.,  300847.,  291940.,  430185.,  438421.],
       [ 300847.,  291940.,  430185.,  438421.,  568304.,  403368.],
       [ 430185.,  438421.,  568304.,  403368.,  392774.,  339257.]])

In [103]:
forecast

array([[ 540.1416626 ,  552.71087646,  538.10894775,  548.9128418 ,
         537.76245117,  545.99719238],
       [ 540.14276123,  552.71185303,  538.10986328,  548.91387939,
         537.76342773,  545.99822998],
       [ 540.14276123,  552.71185303,  538.10986328,  548.91387939,
         537.76342773,  545.99822998],
       [ 540.14276123,  552.71185303,  538.10986328,  548.91387939,
         537.76342773,  545.99822998],
       [ 540.14276123,  552.71185303,  538.10986328,  548.91387939,
         537.76342773,  545.99822998],
       [ 540.14276123,  552.71185303,  538.10986328,  548.91387939,
         537.76342773,  545.99822998],
       [ 540.14239502,  552.71142578,  538.10955811,  548.91339111,
         537.76312256,  545.99786377],
       [ 540.14294434,  552.71209717,  538.11016846,  548.91412354,
         537.76367188,  545.99847412],
       [ 540.14324951,  552.71240234,  538.1104126 ,  548.91442871,
         537.76403809,  545.99865723],
       [ 540.14160156,  552.71063232,