In [None]:
import json, os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors

from sklearn.linear_model import LinearRegression
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error

from datetime import timedelta, datetime

from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils

In [None]:
# Build the database helper from the JSON credentials file whose path is given
# by the PROD_SQL_CREDENTIALS environment variable. The file is opened in a
# context manager so its handle is closed as soon as the JSON has been parsed
# (the bare `json.load(open(...))` left the handle open).
with open(os.environ['PROD_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))

In [None]:
# Pull the full sites table and display it.
sites_query = """
    select * from sites;
"""

df_sites = rds_access_utils.extract_from_database(sites_query)

df_sites

In [None]:
# Pen to analyse; the queries in the following cells filter on this id.
# (66 is the alternative id the original cell kept commented out.)
pen_id = 56

In [None]:
# Look up the site and pen names for `pen_id`; both are used in the plot
# titles later in the notebook.
query = """
    select s.name as site_name, p.name as pen_name from sites s
    inner join pens p on p.site_id = s.id
    where p.id = %i;
""" % (pen_id, )

df_sites = rds_access_utils.extract_from_database(query)

# Fail with a readable message instead of an indexing error when the pen id
# does not match any row.
if df_sites.empty:
    raise ValueError('No pen found with id %i' % (pen_id, ))

# `DataFrame.ix` was removed in pandas 1.0; read the first returned row by
# position instead.
site_name = df_sites['site_name'].iloc[0]
pen_name = df_sites['pen_name'].iloc[0]

print(site_name, pen_name)

In [None]:
# Fetch the event log entries for this pen that started after 2020-03-01.
# The WELLBOAT_TREATMENT rows are used further down to blank out and mark
# treatment windows.
events_query = """
    select event_type, started_at, ended_at from event_logs 
    where pen_id = %i
    and started_at > '2020-03-01';
""" % (pen_id, )

df_events = rds_access_utils.extract_from_database(events_query)

df_events

In [None]:
# Fetch the QA'd, non-skipped annotations for this pen captured after
# 2020-03-01, then index them by capture time in chronological order.
annotations_query = """
    select captured_at, annotation_metadata 
    from annotations a
    where a.pen_id = %i
    and a.is_qa = true
    and a.is_skipped = false
    and a.captured_at > '2020-03-01';
""" % (pen_id, )

lice_counts = (
    rds_access_utils.extract_from_database(annotations_query)
    # drop=False keeps `captured_at` available as a regular column as well.
    .set_index('captured_at', drop=False)
    .sort_index()
)

lice_counts

In [None]:
# Fetch the stored per-day summary rows for this pen from 2020-03-01 onwards;
# they are plotted as the reference series against the recomputed averages.
day_summaries_query = """
    select date, female_moving_avg, moving_moving_avg, num_lati_fish, num_moving_avg_lati_fish
    from day_summaries a
    where a.pen_id = %i
    and a.date >= '2020-03-01';
""" % (pen_id, )

day_summaries = (
    rds_access_utils.extract_from_database(day_summaries_query)
    # drop=False keeps `date` available as a regular column as well.
    .set_index('date', drop=False)
    .sort_index()
)

day_summaries

In [None]:
# Unpack the adjusted lice counts stored under `liceCounts` in each
# annotation's metadata into flat numeric columns.
#
# The previous row-by-row `lice_counts.ix[index, col] = value` assignment
# addressed rows by their `captured_at` label, so if two annotations shared a
# timestamp every one of them ended up with the values of the last such row.
# `DataFrame.ix` was also removed in pandas 1.0. Building each column
# positionally avoids both problems and is a single pass over the data.
# Assumes `annotation_metadata` holds already-parsed dicts — TODO confirm.
lice_count_fields = [metadata['liceCounts'] for metadata in lice_counts['annotation_metadata']]

# dtype=float maps a missing (None) count to NaN, matching what the old
# element-wise assignment into a float column produced.
lice_counts['movingCountAdjusted'] = np.array(
    [fields['movingCountAdjusted'] for fields in lice_count_fields], dtype=float)
lice_counts['adultFemaleCountAdjusted'] = np.array(
    [fields['adultFemaleCountAdjusted'] for fields in lice_count_fields], dtype=float)

# One per annotation; summed over rolling windows later to get sample sizes.
lice_counts['count'] = 1.0

lice_counts

In [None]:
# Fit a log-linear (exponential) trend to the per-annotation adjusted counts
# captured before the cutoff date, separately for adult-female and moving
# lice, and store the fitted values next to the observations.
subset = lice_counts.index < '2020-04-12'
timescale = lice_counts.index[subset]
start = np.min(timescale)

# The regressor is the number of seconds since the first annotation in the
# fitting window, as a single-feature column matrix.
secs_since_start = (timescale - start).total_seconds().to_numpy()
X = secs_since_start.reshape(-1, 1)

# Regress log(1 + count) so that zero counts remain finite.
adultFemaleY = lice_counts.adultFemaleCountAdjusted[subset]
logAdultFemaleY = np.log1p(adultFemaleY)
movingY = lice_counts.movingCountAdjusted[subset]
logMovingY = np.log1p(movingY)

adultFemaleReg = LinearRegression().fit(X, logAdultFemaleY)
movingReg = LinearRegression().fit(X, logMovingY)

logAdultFemaleY_pred = adultFemaleReg.predict(X)
logMovingY_pred = movingReg.predict(X)

# Undo the log(1 + y) transform to get predictions on the count scale.
adultFemaleY_pred = np.expm1(logAdultFemaleY_pred)
movingY_pred = np.expm1(logMovingY_pred)

# `DataFrame.ix` was removed in pandas 1.0; `.loc` accepts the same boolean
# mask. Rows outside the fitting window are left as NaN.
lice_counts.loc[subset, 'adultFemaleCountPred'] = adultFemaleY_pred
lice_counts.loc[subset, 'movingCountPred'] = movingY_pred

In [None]:
def _last_per_day(rolled):
    """Downsample a rolling result to daily rows, keeping each day's last value (NaN for days with no rows)."""
    return rolled.resample('D').apply(lambda x:x.tail(1) if x.shape[0] else np.nan)


def _add_daily_trend(frame, cutoff):
    """Fit log-linear trends to the daily averages in `frame` and store the fitted values.

    Rows dated before `cutoff` with a finite `adultFemaleCountAdjusted` form
    the fitting window. For each of the adult-female and moving averages,
    log(1 + value) is regressed on the seconds elapsed since the first row of
    the window, and the back-transformed fit is written to
    `adultFemaleCountDailyPred` / `movingCountDailyPred` for those rows only.
    `frame` is modified in place.
    """
    subset = (frame.index < cutoff) & (np.isfinite(frame.adultFemaleCountAdjusted))
    timescale = frame.index[subset]
    secs_since_start = (timescale - np.min(timescale)).total_seconds().to_numpy()
    X = secs_since_start.reshape(-1, 1)

    column_pairs = (
        ('adultFemaleCountAdjusted', 'adultFemaleCountDailyPred'),
        ('movingCountAdjusted', 'movingCountDailyPred'),
    )
    for observed_col, fitted_col in column_pairs:
        log_y = np.log(frame[observed_col][subset] + 1)
        reg = LinearRegression().fit(X, log_y)
        # `DataFrame.ix` was removed in pandas 1.0; `.loc` takes the same mask.
        frame.loc[subset, fitted_col] = np.exp(reg.predict(X)) - 1


columns = [ 'movingCountAdjusted', 'adultFemaleCountAdjusted', 'adultFemaleCountPred', 'movingCountPred' ]
lice_counts.index = pd.to_datetime(lice_counts.captured_at, format='%Y-%m-%d')

moving_averages = []

# Moving-average window length in days.
i = 5
window = '%iD' % (i, )
# Half the window in hours, kept as an int (the original `-24 * i / 2` passed
# a float as the shift period). Shifting by this amount centres the trailing
# window on its timestamp.
half_window_hours = 12 * i
fit_cutoff = '2020-04-12'

rolling_count = lice_counts['count'].rolling(window).sum()
rolling_mean = lice_counts[columns].rolling(window).mean()

# Trailing and centred daily moving averages, each with the number of
# annotations that fell inside the window.
daily_avg = _last_per_day(rolling_mean)
daily_avg['count'] = _last_per_day(rolling_count)

daily_center_avg = _last_per_day(rolling_mean.shift(-half_window_hours, freq='h'))
daily_center_avg['count'] = _last_per_day(rolling_count.shift(-half_window_hours, freq='h'))

# The same regression was previously copy-pasted once per frame.
_add_daily_trend(daily_avg, fit_cutoff)
_add_daily_trend(daily_center_avg, fit_cutoff)

# Blank out the averages that fall inside a wellboat treatment. The plotting
# cell keeps only rows whose 'count' is >= 100, so 'count' is forced to 100 to
# keep the blanked rows in the plotted series, where their NaN values break
# the line.
center_offset = timedelta(hours=-half_window_hours)
for _, event in df_events.iterrows():
    if event['event_type'] != 'WELLBOAT_TREATMENT':
        continue

    subset = (daily_avg.index > event['started_at']) & (daily_avg.index < event['ended_at'])
    daily_avg.loc[subset, :] = np.nan
    daily_avg.loc[subset, 'count'] = 100

    # The centred series is shifted back by half a window, so its treatment
    # window is shifted by the same amount.
    subset_center = (daily_center_avg.index > event['started_at'] + center_offset) & (daily_center_avg.index < event['ended_at'] + center_offset)
    daily_center_avg.loc[subset_center, :] = np.nan
    daily_center_avg.loc[subset_center, 'count'] = 100

moving_averages.append((i, daily_avg, daily_center_avg))


In [None]:
cmap = plt.cm.rainbow
norm = matplotlib.colors.Normalize(vmin=1, vmax=10)


def _plot_count_panel(axis, adjusted_col, raw_pred_col, daily_pred_col,
                      summary_col, summary_label, title, ylabel):
    """Draw one panel comparing the recomputed averages with the stored day summaries.

    For every `(window_days, trailing, centered)` entry of `moving_averages`
    this plots the trailing and centred moving averages of `adjusted_col`
    plus the three regression curves, restricted to rows whose window
    'count' is at least 100. The stored `day_summaries[summary_col]` series
    is drawn in black on top, and each WELLBOAT_TREATMENT event in
    `df_events` is marked with red vertical lines at its start and end.
    """
    dashed = {'linestyle': '--', 'marker': '*'}

    for window_days, trailing, centered in moving_averages:
        subset = (trailing['count'] >= 100)
        # The centred frame starts half a window earlier; clip it to the
        # first date of the trailing frame.
        subset_center = (centered.index >= np.min(trailing.index)) & (centered['count'] >= 100)

        axis.plot(trailing.index[subset], trailing[adjusted_col][subset],
                  color=cmap(norm(window_days)),
                  label='%i day MA' % (window_days, ), **dashed)
        axis.plot(centered.index[subset_center], centered[adjusted_col][subset_center],
                  color=cmap(norm(window_days + 2)),
                  label='%i day Centered MA' % (window_days, ), **dashed)
        # The three regression curves used to share the label 'Regression',
        # which made the legend ambiguous; each now names what was fitted.
        axis.plot(centered.index[subset_center], centered[raw_pred_col][subset_center],
                  color='orange',
                  label='Regression on raw counts (%i day Centered MA)' % (window_days, ), **dashed)
        axis.plot(trailing.index[subset], trailing[daily_pred_col][subset],
                  color='purple',
                  label='Regression on %i day MA' % (window_days, ), **dashed)
        axis.plot(centered.index[subset_center], centered[daily_pred_col][subset_center],
                  color='blue',
                  label='Regression on %i day Centered MA' % (window_days, ), **dashed)

    axis.plot(day_summaries.index, day_summaries[summary_col], linestyle='solid',
              linewidth=5, marker='o', color='black', label=summary_label)
    axis.set_title(title)
    axis.set_xlabel('Date')
    axis.set_ylabel(ylabel)
    axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

    for _, event in df_events.iterrows():
        if event['event_type'] == 'WELLBOAT_TREATMENT':
            axis.axvline(x=event['started_at'], color='r')
            axis.axvline(x=event['ended_at'], color='r')


fig, ax = plt.subplots(2)

fig.set_size_inches(15, 20)

_plot_count_panel(
    ax[0],
    adjusted_col='adultFemaleCountAdjusted',
    raw_pred_col='adultFemaleCountPred',
    daily_pred_col='adultFemaleCountDailyPred',
    summary_col='female_moving_avg',
    summary_label='Current AF',
    title='%s %s Adult Female Counts' % (site_name, pen_name),
    ylabel='Adult Female Count',
)

# This reference line was previously labelled 'Current AF' on the mobile
# panel too, although it plots the moving (mobile) average.
_plot_count_panel(
    ax[1],
    adjusted_col='movingCountAdjusted',
    raw_pred_col='movingCountPred',
    daily_pred_col='movingCountDailyPred',
    summary_col='moving_moving_avg',
    summary_label='Current Mobile',
    title='%s %s Mobile Counts' % (site_name, pen_name),
    ylabel='Mobile Count',
)

plt.show()

In [None]:
moving_average = moving_averages[0]

# Take the stored moving-count average strictly between the two dates below
# and inspect its autocorrelation before fitting a time-series model.
window_start, window_end = (
    datetime.strptime(day, '%Y-%m-%d').date() for day in ('2020-03-01', '2020-04-26')
)
after_start = day_summaries.index > window_start
before_end = day_summaries.index < window_end
subset = after_start & before_end
series = day_summaries.loc[subset, 'moving_moving_avg']

pd.plotting.autocorrelation_plot(series)

In [None]:
# Fit an ARIMA(5, 1, 0) model to the whole series and inspect its residuals.
# NOTE(review): `fit(disp=0)` belongs to the legacy ARIMA class imported from
# `statsmodels.tsa.arima_model`; newer statsmodels releases replaced it with
# `statsmodels.tsa.arima.model.ARIMA` — confirm the pinned version.
model = ARIMA(series, order=(5,1,0))
model_fit = model.fit(disp=0)
print(model_fit.summary())

residuals = pd.DataFrame(model_fit.resid)

# Residuals over time first, then their kernel density estimate.
for plot_kind in ('line', 'kde'):
    residuals.plot(kind=plot_kind)
    plt.show()

print(residuals.describe())

In [None]:
# Walk-forward evaluation: the first half of the series is the initial
# training history; for every later observation the model is refit on all
# history so far and the 1-, 7- and 14-step-ahead forecasts are recorded.
X = series.values
size = int(len(X) * 0.5)
train, test = X[:size], X[size:]
history = list(train)
predictions1, predictions2, predictions3 = [], [], []

dates = [0]

for t, obs in enumerate(test):
    model = ARIMA(history, order=(5,1,0))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast(steps = 14)
    # output[0] is indexed as the array of point forecasts, one per step.
    point_forecasts = output[0]
    predictions1.append(point_forecasts[0])
    predictions2.append(point_forecasts[7 - 1])
    predictions3.append(point_forecasts[14 - 1])
    # Reveal the true value only after forecasting from the older history.
    history.append(obs)


In [None]:
fig, ax = plt.subplots(2)

fig.set_size_inches(15, 20)

forecast_ax = ax[0]

# Each curve spans the full series: the training half, then (for the longer
# horizons) the first observed test values as filler, then the forecasts,
# trimmed so every forecast sits on the date it was made for.
curves = [
    ('1 Day', 'red', np.concatenate((train, predictions1))),
    ('7 Day', 'orange', np.concatenate((train, test[0:6], predictions2[0:-6]))),
    ('14 Day', 'yellow', np.concatenate((train, test[0:13], predictions3[0:-13]))),
    ('Actual', 'green', np.concatenate((train, test))),
]
for curve_label, curve_color, curve_values in curves:
    forecast_ax.plot(series.index, curve_values, color=curve_color, label = curve_label)

forecast_ax.set_title('%s %s Moving Predictions (ARIMA)' % (site_name, pen_name))
forecast_ax.set_xlabel('Date')
forecast_ax.set_ylabel('Moving Count')

forecast_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

plt.show()

In [None]:
# MCMC based predictions