In [None]:
import json, os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm
from statsmodels.tsa.api import VAR
from datetime import timedelta, datetime, time

from aquabyte.data_access_utils import RDSAccessUtils

# Never truncate displayed DataFrames: max_rows=None makes pandas show every row.
pd.set_option('display.max_rows', None)

%%javascript
// Replace the output area's scroll check with one that always answers "no";
// presumably this keeps long outputs from being put into a scrolling box — TODO confirm.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
# Build the database helper from the JSON credentials file whose path is given by
# the PROD_SQL_CREDENTIALS environment variable. The context manager guarantees
# the file handle is closed (the bare open() previously leaked it).
with open(os.environ['PROD_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))

In [None]:
# List every pen together with the name of its site, ordered by pen id.
# NOTE(review): the accessibility predicate sits in the LEFT JOIN's ON clause, so
# pens that are not accessible are still returned (with a NULL site_name) rather
# than filtered out — confirm whether it was meant to be a WHERE condition.
query = """
    select p.id as id, s.name as site_name, p.name as pen_name from pens p
    left join sites s
    on p.site_id = s.id
    and p.is_accessible_to_customer is true
    order by p.id;
"""

df_pens = rds_access_utils.extract_from_database(query)

pen_ids = [ 66 ] #56 # 37 # 56, 60,

# Keep only the requested pens, as (id, site_name, pen_name) tuples in query order.
selected_pens = df_pens[df_pens['id'].isin(pen_ids)]
pen_infos = list(
    selected_pens[['id', 'site_name', 'pen_name']].itertuples(index=False, name=None)
)

print(pen_infos)

# Fail with an explicit message instead of an opaque IndexError on pen_infos[0].
if not pen_infos:
    raise ValueError('None of the requested pen ids %s were returned by the pens query' % (pen_ids, ))

# The rest of the notebook analyses the first matching pen only.
pen_id, site_name, pen_name = pen_infos[0]

In [None]:
# Event-log entries for the selected pen that started after 2020-01-01.
eventsSql = """
    select event_type, started_at, ended_at from event_logs 
    where pen_id = %i
    and started_at > '2020-01-01';
"""
query = eventsSql % (pen_id, )

df_events = rds_access_utils.extract_from_database(query)

# Bare last expression so the notebook renders the frame as the cell output.
df_events

In [None]:
# Per-day summary rows for the selected pen from 2020-01-01 onwards.
summariesSql = """
    select date, female_avg, female_moving_avg, moving_avg, moving_moving_avg, num_lati_fish, num_moving_avg_lati_fish
    from day_summaries a
    where a.pen_id = %i
    and a.date >= '2020-01-01';
"""
query = summariesSql % (pen_id, )

summaries_raw = rds_access_utils.extract_from_database(query)

# Index the rows by their (parsed) date and put them in chronological order;
# the original 'date' column is kept alongside the new index.
summaries_raw.index = pd.to_datetime(summaries_raw['date'])
day_summaries = summaries_raw.sort_index()

In [None]:
# Smooth the daily summaries with an i-day rolling mean. rolling() is
# trailing, so the result is shifted back by half the window (12 * i hours, always
# a whole number) and then re-sampled to one value per calendar day.
i = 5
day_summaries_rolling = (
    day_summaries.rolling('%iD' % (i, )).mean()
    .shift(-12 * i, freq='h')
    .resample('D')
    .apply(lambda x:x.tail(1) if x.shape[0] else np.nan)
)

# Analysis window (exclusive start, exclusive end) for each pen that has one.
penDateWindows = {
    66: ('2020-03-04', '2020-04-09'),
    56: ('2020-03-04', '2020-04-26'),
    60: ('2020-03-04', '2020-04-26'),
}

# Previously an unknown pen fell through both branches and only failed later with
# a NameError on `subset`; fail here with an explanation instead.
if pen_id not in penDateWindows:
    raise ValueError('No analysis date window is configured for pen_id %s' % (pen_id, ))

windowStart, windowEnd = penDateWindows[pen_id]
subset = (day_summaries.index > windowStart) & (day_summaries.index < windowEnd)
subsetRolling = (day_summaries_rolling.index > windowStart) & (day_summaries_rolling.index < windowEnd)

# Raw series (gaps forward-filled) and smoothed series restricted to the window.
seriesCombinedOriginal = day_summaries[['female_avg', 'moving_avg']][subset].ffill()
seriesCombinedRolling = day_summaries_rolling[['female_avg', 'moving_avg']][subsetRolling]


In [None]:
# model_fit = model.fit(maxlags=5, ic='aic')
# # model_fit.plot()
# # model_fit.plot_acorr()
# lag_order = model_fit.k_ar
# predictionDays
# print(history)
# print(history[-lag_order:], predictionDays[0])
# output = model_fit.forecast(history.values[-lag_order:], predictionDays[0])
# np.sum(output, 0)


In [None]:
def testSeriesMultivariate(series, trainSplitPct, predictionDays, isExp, AR):
    originalX = series.copy()
    X = series.copy()
    
    for field in isExp:
        X[field] = np.log(X[field])

    diffX = X.diff().dropna()
    
    startPredictDays = 10

    size = int(len(diffX) * trainSplitPct)
    train, test = diffX.ix[0:size], diffX.ix[size:len(diffX)]
    history = train
    predictions = {}
    day0Predictions = originalX.ix[0:(size + startPredictDays)]
    day0PredictionsLower = originalX.ix[0:(size + startPredictDays)]
    day0PredictionsUpper = originalX.ix[0:(size + startPredictDays)]

    maxPredictionDay = 0
    

    for predictionDay in predictionDays:
        maxPredictionDay = max(maxPredictionDay, predictionDay)
        predictions[predictionDay] = pd.concat([originalX.ix[0:(size + predictionDay)]])
        
    for t in range(len(test)):
        nextDate = originalX.index[size + t + 1]
        model = VAR(history)
        model_fit = model.fit(maxlags = AR, ic='aic') #model.fit(AR)
        lag_order = model_fit.k_ar
        if lag_order == 0:
            model_fit = model.fit(1)
        output, output_lower, output_upper = model_fit.forecast_interval(history.values[-lag_order:], maxPredictionDay, 0.25)
        
        for predictionDay in predictionDays:
            if t > len(test) - predictionDay:
                continue
            predictionDate = originalX.index[size + t + predictionDay]
            prediction = X.ix[0] + np.sum(history) + np.sum(output[:predictionDay], 0)
            
            for field in isExp:
                prediction[field] = np.exp(prediction[field])
                
            if t == startPredictDays:
                cov = model_fit.forecast_cov(maxPredictionDay)
                
                for i in range(predictionDay):
                    date = originalX.index[size + startPredictDays + i + 1]
                    datePrediction = X.ix[0] + np.sum(history) + np.sum(output[:i], 0)
                    datePredictionLower = X.ix[0] + np.sum(history) + np.sum(output_lower[:i], 0)
                    datePredictionUpper = X.ix[0] + np.sum(history) + np.sum(output_upper[:i], 0)
                    #print(datePrediction)
                    SE = np.sqrt(np.diagonal(np.sum(cov[:i,:,:], 0)))
                    currentCOV = np.diagonal(cov[i,:,:])
                    expSE = np.sqrt(np.sum((np.exp(currentCOV) - 1) * np.exp(2 * np.mean(history) + currentCOV), 0))
                    #print(marginCI)
                    #datePredictionLower = X.ix[0] + np.sum(history) + np.sum(output[:(i-1)], 0) + np.sum(output_lower[(i-1):i], 0) + marginCI
                    #datePredictionUpper = X.ix[0] + np.sum(history) + np.sum(output[:(i-1)], 0) + np.sum(output_upper[(i-1):i], 0) - marginCI
                   
                    day0Predictions.loc[date] = datePrediction
                    day0PredictionsLower.loc[date] = np.maximum(datePrediction - SE * 1.96, datePredictionLower)
                    day0PredictionsUpper.loc[date] = np.minimum(datePrediction + SE * 1.96, datePredictionUpper)
                    
                    datePredictionExp = np.exp(datePrediction)
                    datePredictionLowerExp = np.maximum(np.exp(datePrediction) - expSE * 1.96, np.exp(datePredictionLower))
                    datePredictionUpperExp = np.minimum(np.exp(datePrediction) + expSE * 1.96, np.exp(datePredictionUpper))
                    
                    for field in isExp:
                        day0Predictions.loc[date, field] = datePredictionExp[field]
                        day0PredictionsLower.loc[date, field] = datePredictionLowerExp[field]
                        day0PredictionsUpper.loc[date, field] = datePredictionUpperExp[field]
           
            predictions[predictionDay].loc[predictionDate] = prediction
            
#             print(series.ix[0])
#             print(history)
#             print(np.sum(history))
#             print(np.sum(output[:predictionDay], 0))
#             print(nextDate, predictionDate)
#             print(prediction)

        history.loc[nextDate] = test.ix[t]

#     for predictionDay in predictionDays:
#         for field in isExp:
#             predictions[predictionDay][field] = np.exp(predictions[predictionDay][field])

    return predictions, day0Predictions, day0PredictionsLower, day0PredictionsUpper
    
#    return train, test, predictions

In [None]:
trainSplitPct = 0.5
predictionDays = [14]

# Pen 66 is modelled on the raw scale with up to 3 lags; every other pen models
# female_avg on the log scale with at most 1 lag.
if pen_id == 66:
    predictions, day0Predictions, day0PredictionsLower, day0PredictionsUpper = testSeriesMultivariate(seriesCombinedRolling, trainSplitPct, predictionDays, [], 3)
else:
    predictions, day0Predictions, day0PredictionsLower, day0PredictionsUpper = testSeriesMultivariate(seriesCombinedRolling, trainSplitPct, predictionDays, ['female_avg'], 1)

fig, ax = plt.subplots(2)

fig.set_size_inches(15, 20)

# Width (in days) of the extra smoothing applied to the forecasts, and the number
# of leading rows that smoothing leaves without a full window.
i = 2
halfI = int((i + 1)/ 2)

# One panel per modelled column: (axes, column, title template, y-axis label).
# The titles name the model actually plotted here (VAR, not ARIMA).
panels = [
    (ax[0], 'female_avg', '%s %s Adult Female Predictions (VAR)', 'Adult Female Count'),
    (ax[1], 'moving_avg', '%s %s Moving Predictions (VAR)', 'Moving Count'),
]

for index, predictionDay in enumerate(predictionDays):
    predictionSeries = predictions[predictionDay]
    # Number of leading rows that are observations rather than forecasts.
    totalLen = int((len(predictionSeries) - 1) * trainSplitPct) + predictionDay
    # Centred i-day rolling mean: shift back by half the window (12 * i hours).
    predictionSeriesRolling = (
        predictionSeries.rolling('%iD' % (i, )).mean()
        .shift(-12 * i, freq='h')
        .resample('D')
        .apply(lambda x:x.tail(1) if x.shape[0] else np.nan)
    )
    # Positional writes through .iloc on the frame itself (the removed .ix
    # accessor was previously used on a column selection, i.e. chained assignment).
    predictionSeriesRolling.iloc[0:halfI] = np.nan
    for field in ('female_avg', 'moving_avg'):
        column = predictionSeriesRolling.columns.get_loc(field)
        predictionSeriesRolling.iloc[halfI:(totalLen + halfI), column] = seriesCombinedRolling[field].values[0:totalLen]

    for axis, field, titleTemplate, yLabel in panels:
        axis.plot(predictionSeries.index, predictionSeries[field], color = 'red', label = '%i Day Original' % (predictionDay, ))
        axis.plot(predictionSeriesRolling.index, predictionSeriesRolling[field].values, color = 'blue', label = '%i Day Rolling' % (predictionDay, ))
        axis.plot(seriesCombinedOriginal.index, seriesCombinedOriginal[field], color = 'black', linestyle = '--', label = 'Actual')
        axis.plot(seriesCombinedRolling.index, seriesCombinedRolling[field], color = 'black', linewidth = 5, label = 'Rolling')
        axis.plot(day0Predictions.index, day0Predictions[field], color = 'purple', linewidth = 2, label = 'Daily Predictions')
        axis.fill_between(day0Predictions.index, day0PredictionsLower[field], day0PredictionsUpper[field], color='b', alpha=.1)
        axis.set_title(titleTemplate % (site_name, pen_name))
        axis.set_xlabel('Date')
        axis.set_ylabel(yLabel)
        axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

    # Zero reference line on the adult-female panel only.
    ax[0].axhline(0)


In [None]:
# OLD: the cells below are the earlier per-series ARIMA version of the analysis above.

In [None]:
# fig, ax = plt.subplots(2)

# fig.set_size_inches(15, 20)

# ax[0].plot(seriesAFRolling.index, seriesAFRolling.values, color='orange', label = 'AF Rolling')
# ax[0].plot(seriesAFOriginal.index, seriesAFOriginal.values, color='red', label = 'AF Original')
#ax[1].plot(seriesAFRolling.index, np.log(seriesAFRolling.values), color='orange', label = 'AF')



In [None]:
def testSeries(series, trainSplitPct, modelOrder, predictionDays, isExp):
    X = series

    if isExp:
        X = np.log(X)

    size = int(len(X) * trainSplitPct)
    train, test = X[0:size], X[size:len(X)]
    history = [x for x in train]
    predictions = {}

    maxPredictionDay = 0

    for predictionDay in predictionDays:
        maxPredictionDay = max(maxPredictionDay, predictionDay)
        predictions[predictionDay] = []

    dates = [0]

    for t in range(len(test)):
        model = ARIMA(history, order = modelOrder)
        model_fit = model.fit(disp = 0)
        output = model_fit.forecast(steps = maxPredictionDay)

        for predictionDay in predictionDays:
            prediction = output[0][predictionDay - 1]

            if isExp:
                prediction = np.exp(prediction)

            predictions[predictionDay].append(prediction)

        history.append(test[t])

    if isExp:
        train = np.exp(train)
        test = np.exp(test)

    return train, test, predictions

In [None]:
trainSplitPct = 0.5
order = (4, 1, 0) # ARIMA (p, d, q): 4 AR lags, 1 difference (non-stationary), 0 MA terms
predictionDays = [14]

# Smoothed series are what gets modelled; the raw series are kept for plotting.
seriesAFRolling = seriesCombinedRolling['female_avg']
seriesMRolling = seriesCombinedRolling['moving_avg']
seriesAF = seriesCombinedOriginal['female_avg']
seriesM = seriesCombinedOriginal['moving_avg']

# female_avg is fitted on the log scale for every pen except 66;
# moving_avg is always fitted on the raw scale.
useLogScaleAF = pen_id != 66
trainAF, testAF, predictionsAF = testSeries(seriesAFRolling, trainSplitPct, order, predictionDays, useLogScaleAF)
trainM, testM, predictionsM = testSeries(seriesMRolling, trainSplitPct, order, predictionDays, False)

In [None]:
fig, ax = plt.subplots(2)

fig.set_size_inches(15, 20)

# Width (in days) of the extra smoothing applied to the forecasts, and the number
# of leading rows that smoothing leaves without a full window.
i = 2
halfI = int((i + 1)/ 2)


def _plotArimaPanel(axis, rollingSeries, train, test, predictions, labelTemplate, title, yLabel):
    """Draw the smoothed forecasts for every horizon plus the actual rolling series.

    For each horizon in ``predictionDays`` the plotted series is the training
    values, the first ``horizon - 1`` test values, and then the forecasts that
    still fall inside the observed date range. It is smoothed with a centred
    ``i``-day rolling mean, and the observed part is overwritten with the
    values of ``rollingSeries``.
    """
    for index, predictionDay in enumerate(predictionDays):
        lead = predictionDay - 1
        observed = test[0:lead]
        # Computed per series instead of reusing a value left over from another loop.
        totalLen = len(train) + len(observed)
        forecasts = predictions[predictionDay]
        # Explicit end position: `[0:-lead]` is an empty slice when lead == 0.
        usable = max(len(forecasts) - lead, 0)
        predictionValues = np.concatenate((train, observed, forecasts[0:usable]))
        # The values come from the rolling series, so they are labelled with its
        # index (previously the index of the non-rolling series was used).
        predictionSeries = pd.Series(predictionValues, index = rollingSeries.index)
        predictionSeriesRolling = (
            predictionSeries.rolling('%iD' % (i, )).mean()
            .shift(-12 * i, freq='h')
            .resample('D')
            .apply(lambda x:x.tail(1) if x.shape[0] else np.nan)
        )
        # Positional writes; .ix is no longer available in pandas.
        predictionSeriesRolling.iloc[0:halfI] = np.nan
        predictionSeriesRolling.iloc[halfI:(totalLen + halfI)] = rollingSeries.values[0:totalLen]
        axis.plot(predictionSeriesRolling.index, predictionSeriesRolling.values, color = 'C%i' % (index + 1, ), label = labelTemplate % (predictionDay, ))

    axis.plot(rollingSeries.index, rollingSeries.values, color = 'black', linewidth = 5, label = 'Actual')
    axis.set_title(title)
    axis.set_xlabel('Date')
    axis.set_ylabel(yLabel)
    axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)


_plotArimaPanel(
    ax[0], seriesAFRolling, trainAF, testAF, predictionsAF,
    '%i Day Rolling',
    '%s %s Adult Female Predictions (ARIMA)' % (site_name, pen_name),
    'Adult Female Count',
)
_plotArimaPanel(
    ax[1], seriesMRolling, trainM, testM, predictionsM,
    '%i Day',
    '%s %s Moving Predictions (ARIMA)' % (site_name, pen_name),
    'Moving Count',
)

plt.show()