In [None]:
import json, os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm
from statsmodels.tsa.api import VAR
from datetime import timedelta, datetime, time

from research.utils.data_access_utils import RDSAccessUtils

# Read the DB credentials from the JSON file named by PROD_SQL_CREDENTIALS.
# A context manager is used so the file handle is closed as soon as the
# credentials have been parsed instead of being left to the garbage collector.
with open(os.environ['PROD_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))

In [None]:
%%javascript
// Override the output area's scroll check so it always answers "no",
// whatever the number of output lines.
IPython.OutputArea.prototype._should_scroll = (lines) => false;

In [None]:
# Fetch every pen together with the name of the site it belongs to.
# NOTE(review): `p.is_accessible_to_customer is true` is part of the LEFT JOIN's
# ON clause, so it does not filter pens out -- it only leaves `site_name` NULL
# for pens that are not accessible. Move it to a WHERE clause if the intent
# was to exclude those pens; confirm before changing.
query = """
    select p.id as id, s.name as site_name, p.name as pen_name from pens p
    left join sites s
    on p.site_id = s.id
    and p.is_accessible_to_customer is true
    order by p.id;
"""

df_pens = rds_access_utils.extract_from_database(query)

pen_ids = [ 60 ] #56 # 37 # 56, 60,

# Keep only the requested pens, as (id, site_name, pen_name) tuples in query order.
selected_pens = df_pens[df_pens['id'].isin(pen_ids)]
pen_infos = list(
    selected_pens[['id', 'site_name', 'pen_name']].itertuples(index=False, name=None)
)

print(pen_infos)

# Fail with an explicit message rather than an IndexError on pen_infos[0].
if not pen_infos:
    raise ValueError('None of the requested pen ids %s were returned by the pens query' % (pen_ids, ))

# The rest of the notebook works on the first matching pen only.
pen_id, site_name, pen_name = pen_infos[0]

In [None]:
# Daily weight summaries for the selected pen, from 2020-04-01 onwards.
query_template = """
    select date, weight_avg, weight_moving_avg
    from day_summaries a
    where a.pen_id = %i
    and a.date >= '2020-04-01';
"""
query = query_template % (pen_id, )

day_summaries = rds_access_utils.extract_from_database(query)

# Index the rows by their (parsed) date and put them in chronological order;
# the original `date` column is kept alongside the new index.
day_summaries = (
    day_summaries
    .set_index(pd.to_datetime(day_summaries['date']))
    .sort_index()
)

#day_summaries

In [None]:
# Daily average weight with gaps filled by carrying the last known value forward.
# `.ffill()` replaces `fillna(method='ffill')`, whose `method=` argument is
# deprecated in current pandas; the result is the same.
seriesCombinedOriginal = day_summaries['weight_avg'].ffill()
# Moving-average weight exactly as stored in day_summaries (not gap-filled).
seriesCombinedRolling = day_summaries['weight_moving_avg']

In [None]:
def testSeriesMultivariate(series, trainSplitPct, predictionDays, isExp, AR):
    originalX = series.copy()
    X = series.copy()
    
    if isExp:
        X = np.log(X)

    diffX = X.diff().dropna()
    
    startPredictDays = 10

    size = int(len(diffX) * trainSplitPct)
    train, test = diffX.ix[0:size], diffX.ix[size:len(diffX)]
    history = train
    predictions = {}
    day0Predictions = originalX.ix[0:(size + startPredictDays)]
    day0PredictionsLower = originalX.ix[0:(size + startPredictDays)]
    day0PredictionsUpper = originalX.ix[0:(size + startPredictDays)]

    maxPredictionDay = 0
    

    for predictionDay in predictionDays:
        maxPredictionDay = max(maxPredictionDay, predictionDay)
        predictions[predictionDay] = pd.concat([originalX.ix[0:(size + predictionDay)]])
        
    for t in range(len(test)):
        nextDate = originalX.index[size + t + 1]
        #print(history)
        model = ARIMA(history, order=AR)
        model_fit = model.fit(disp=0)
        
        forecast, stderr, conf_int = model_fit.forecast(maxPredictionDay)
        output = forecast
        output_lower = conf_int[:,0]
        output_upper = conf_int[:,1]
        
#         print(output)
#         model_fit = model.fit(maxlags = AR, ic='aic') #model.fit(AR)
#         lag_order = model_fit.k_ar
#         if lag_order == 0:
#             model_fit = model.fit(1)
#         output, output_lower, output_upper = model_fit.forecast_interval(history.values[-lag_order:], maxPredictionDay, 0.25)
        
        for predictionDay in predictionDays:
            if t > len(test) - predictionDay:
                continue
            predictionDate = originalX.index[size + t + predictionDay]
            prediction = X.ix[0] + np.sum(history) + np.sum(output[:predictionDay], 0)
            
            if isExp:
                prediction = np.exp(prediction)
                
            if t == startPredictDays:
                #cov = model_fit.forecast_cov(maxPredictionDay)
                
                for i in range(predictionDay):
                    date = originalX.index[size + startPredictDays + i + 1]
                    datePrediction = X.ix[0] + np.sum(history) + np.sum(output[:i], 0)
                    datePredictionLower = X.ix[0] + np.sum(history) + np.sum(output_lower[:i], 0)
                    datePredictionUpper = X.ix[0] + np.sum(history) + np.sum(output_upper[:i], 0)
                    
                    SE = stderr[i]
                    currentCOV = stderr ** 2
                    expSE = np.sqrt(np.sum((np.exp(currentCOV) - 1) * np.exp(2 * np.mean(history) + currentCOV), 0))
                    
                    day0Predictions.loc[date] = datePrediction
                    day0PredictionsLower.loc[date] = np.maximum(datePrediction - SE * 1.96, datePredictionLower)
                    day0PredictionsUpper.loc[date] = np.minimum(datePrediction + SE * 1.96, datePredictionUpper)
                    
                    datePredictionExp = np.exp(datePrediction)
                    datePredictionLowerExp = np.maximum(np.exp(datePrediction) - expSE * 1.96, np.exp(datePredictionLower))
                    datePredictionUpperExp = np.minimum(np.exp(datePrediction) + expSE * 1.96, np.exp(datePredictionUpper))
                    
                    
                    if isExp:
                        day0Predictions.loc[date] = datePredictionExp
                        day0PredictionsLower.loc[date] = datePredictionLowerExp
                        day0PredictionsUpper.loc[date] = datePredictionUpperExp
           
            predictions[predictionDay].loc[predictionDate] = prediction
            
#             print(series.ix[0])
#             print(history)
#             print(np.sum(history))
#             print(np.sum(output[:predictionDay], 0))
#             print(nextDate, predictionDate)
#             print(prediction)

        history.loc[nextDate] = test.ix[t]

#     for predictionDay in predictionDays:
#         for field in isExp:
#             predictions[predictionDay][field] = np.exp(predictions[predictionDay][field])

    return predictions, day0Predictions, day0PredictionsLower, day0PredictionsUpper
    
#    return train, test, predictions

In [None]:
trainSplitPct = 0.5
predictionDays = [14]

# Back-test in log space (isExp=True) with ARIMA order (4, 0, 0).
predictions, day0Predictions, day0PredictionsLower, day0PredictionsUpper = testSeriesMultivariate(
    seriesCombinedRolling, trainSplitPct, predictionDays, True, (4, 0, 0))

fig, ax = plt.subplots(2)

fig.set_size_inches(15, 20)

# Rolling window length in days, and half of it rounded up.
i = 7
halfI = (i + 1) // 2

for index, predictionDay in enumerate(predictionDays):
    predictionSeries = predictions[predictionDay]
    totalLen = int((len(predictionSeries) - 1) * trainSplitPct) + predictionDay

    # Trailing i-day mean, moved back by half a window (12 * i hours) and then
    # reduced to the last value of each calendar day.
    predictionSeriesRolling = (
        predictionSeries
        .rolling('%iD' % (i, )).mean()
        .shift(-12 * i, freq='h')
        .resample('D').apply(lambda x: x.tail(1) if x.shape[0] else np.nan)
    )
    # Positional (`.iloc`) writes: blank the first halfI rows, then overwrite
    # the next totalLen rows with the observed rolling series.
    predictionSeriesRolling.iloc[0:halfI] = np.nan
    predictionSeriesRolling.iloc[halfI:(totalLen + halfI)] = seriesCombinedRolling.values[0:totalLen]

    ax[0].plot(predictionSeries.index, predictionSeries, color = 'red', label = '%i Day Original' % (predictionDay, ))
    ax[0].plot(predictionSeriesRolling.index, predictionSeriesRolling.values, color = 'blue', label = '%i Day Rolling' % (predictionDay, ))
    ax[0].plot(seriesCombinedOriginal.index, seriesCombinedOriginal, color = 'black', linestyle = '--', label = 'Actual')
    ax[0].plot(seriesCombinedRolling.index, seriesCombinedRolling, color = 'black', linewidth = 5, label = 'Rolling')
    ax[0].plot(day0Predictions.index, day0Predictions, color = 'purple', linewidth = 2, label = 'Daily Predictions')
    ax[0].fill_between(day0Predictions.index, day0PredictionsLower, day0PredictionsUpper, color='b', alpha=.1)
    ax[0].set_title('%s %s Average Weight (ARIMA)' % (site_name, pen_name))
    ax[0].set_xlabel('Date')
    ax[0].set_ylabel('Average Weight')
    ax[0].legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    ax[0].axhline(0)
    

## 