In [None]:
import pandas as pd
import numpy as np
import random
from matplotlib import pyplot as plt
from pandas.plotting import autocorrelation_plot

from statsmodels.tsa.stattools import acf, pacf
import json, os
import matplotlib.dates as mdates

from statsmodels.stats.stattools import durbin_watson
from scipy.stats import ttest_ind, normaltest
from decimal import Decimal
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)

In [None]:
# Read the production DB credentials from the JSON file named by the
# PROD_SQL_CREDENTIALS environment variable. The context manager closes the
# file as soon as it is parsed (a bare open() left the handle dangling).
with open(os.environ['PROD_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))

# All event-log rows whose start time falls inside the analysis window.
query = """
    select * from event_logs where started_at between '2019-04-01' and '2019-05-15';
"""
df_events = rds_access_utils.extract_from_database(query)

In [None]:
# Fetch the reconciled lice-annotation rows for pen_id 1 captured between
# 2019-04-01 and 2019-05-15 into `df`.
query = """
    select * from lati_fish_detections_lice_annotations_reconciled
    where pen_id=1 and captured_at between '2019-04-01' and '2019-05-15';
"""
df = rds_access_utils.extract_from_database(query)


<h1> Prepare the dataset </h1>

In [None]:
# Order the annotations chronologically and index them by capture time so the
# time-based rolling window and daily resampling below can operate on them.
df = df.sort_values('captured_at')
df.index = pd.to_datetime(df.captured_at)

# Calendar-day key ('YYYY-MM-DD') for every event. strftime drops any
# time-of-day component; astype(str) would keep it, and such a key could never
# equal the day keys stored in df_daily['date'] below.
df_events['occurred_on_date'] = pd.to_datetime(df_events.started_at).dt.strftime('%Y-%m-%d')

columns = ['adult_female_count_adjusted', 'moving_count_adjusted']
# 7-day trailing mean per row, reduced to the last value of each calendar day.
# Days with no rows are filled from the next available day. .bfill() replaces
# fillna(method='backfill'), which newer pandas releases deprecate.
df_daily = df[columns].rolling('7D').mean().resample('D').apply(lambda x:x.tail(1) if x.shape[0] else np.nan).bfill()
df_daily['date'] = df_daily.index.strftime('%Y-%m-%d')

# Manual counts read from a local CSV and indexed by their 'Date' column.
f_manual = 'manual_counts.csv'
df_manual = pd.read_csv(f_manual)
df_manual.index = pd.to_datetime(df_manual['Date'])



In [None]:
# Weights of the weekly recurrence used by construct_fixed_df:
#   fixed[i] = last_week_fixed * fixed[i-7] + this_week_moving * moving[i] + noise
coefficients = {
    'last_week_fixed': 0.62,
    'this_week_moving': 0.08
}

def construct_fixed_df(df_daily, df_events, reset_count=0.05):
    """Return a copy of ``df_daily`` with a simulated ``fixed_count`` column.

    The first row is set to ``reset_count``. For every later row ``i``:

    * if an event of type ``'DELOUSING'`` occurred on that row's ``date``, the
      value is drawn from ``random.gauss(reset_count, 0.02)`` and the row
      becomes the most recent reset;
    * otherwise, if fewer than 7 rows have passed since the most recent reset
      (the first row counts as a reset), the value is again drawn from
      ``random.gauss(reset_count, 0.02)``;
    * otherwise the value is
      ``max(c_fixed * fixed[i-7] + c_moving * moving[i] + gauss(0, 0.05), 0)``
      using the module-level ``coefficients``.

    Parameters
    ----------
    df_daily : pandas.DataFrame
        Must contain the columns ``date`` and ``moving_count_adjusted``.
        It is not modified.
    df_events : pandas.DataFrame
        Must contain the columns ``occurred_on_date`` (compared for equality
        with ``df_daily.date``) and ``event_type``.
    reset_count : float, optional
        Level the fixed count is reset to after a delousing.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``df_daily`` plus the ``fixed_count`` column. An empty
        input yields an empty frame that still has the column.

    Notes
    -----
    Draws come from the ``random`` module; call ``random.seed`` beforehand for
    reproducible output.
    """
    df_fixed = df_daily.copy(deep=True)
    n_days = df_fixed.shape[0]

    # Values are accumulated in a plain array and attached once at the end.
    # Writing through df.col.iloc[i] is chained assignment, which pandas does
    # not guarantee to reach the frame (and never does under copy-on-write).
    fixed = np.full(n_days, np.nan)
    if n_days == 0:
        df_fixed['fixed_count'] = fixed
        return df_fixed

    # Every date that has at least one DELOUSING event, computed once. Checking
    # only the first event of a date would miss a delousing logged after
    # another event on the same day.
    delousing_dates = set(
        df_events.loc[df_events.event_type == 'DELOUSING', 'occurred_on_date']
    )
    dates = df_fixed['date'].to_numpy()
    moving = df_fixed['moving_count_adjusted'].to_numpy(dtype=float)

    fixed[0] = reset_count
    reset_idx = 0

    for i in range(1, n_days):
        if dates[i] in delousing_dates:
            fixed[i] = random.gauss(reset_count, 0.02)
            reset_idx = i
        elif reset_idx > i - 7:
            # Still inside the week that follows the latest reset.
            fixed[i] = random.gauss(reset_count, 0.02)
        else:
            # Argument order matters: max(nan, 0) keeps the NaN, so a missing
            # moving count stays missing instead of turning into 0.
            fixed[i] = max(coefficients['last_week_fixed'] * fixed[i - 7] +
                           coefficients['this_week_moving'] * moving[i] +
                           random.gauss(0, 0.05), 0)

    df_fixed['fixed_count'] = fixed
    return df_fixed
        
    

In [None]:
df_fixed = construct_fixed_df(df_daily, df_events)

In [None]:
# Blank out the mobile count for 2019-05-14 .. 2019-06-03. A single .loc call
# writes into df_fixed itself; assigning through df_fixed[start:end].<column>
# targets a temporary slice and is not guaranteed to reach the original frame.
df_fixed.loc['2019-05-14':'2019-06-03', 'moving_count_adjusted'] = np.nan

In [None]:
plt.plot(df.moving_count)
plt.show()

In [None]:
df_events

In [None]:

df_fixed.moving_count_adjusted

In [None]:
# Daily ASLCS mobile count with a dashed red marker on every delousing date.
plt.figure(figsize=(20, 10))
plt.plot(df_fixed.index, df_fixed.moving_count_adjusted, marker='o', label='ASLCS Mobile Count')
# plt.plot(df_manual['Moving Count'].dropna(), marker='o', label='MM Mobile Count')
delousing_dates = df_events[df_events.event_type == 'DELOUSING'].occurred_on_date.values
for idx, date in enumerate(delousing_dates):
    # Only the first marker is labelled so the legend shows a single entry.
    legend_kwargs = {'label': 'Delicing Event'} if idx == 0 else {}
    plt.axvline(x=date, color='r', linestyle='--', **legend_kwargs)
plt.title('ASLCS Mobile Counts vs. MM Mobile Counts')
plt.xlabel('Date')
plt.ylabel('Mobile Lice Count')
# txt = '''Aquabyte moving counts versus manual moving counts for Blom Kjeppevikholmen, Merd 2, 
# from 2018-12-19 to 2019-03-29'''
# plt.figtext(0.5, 0.01, txt, wrap=True, horizontalalignment='center', fontsize=12)
plt.legend()
plt.grid()
plt.show()

In [None]:
# Plot the adult-female, mobile and simulated stationary series from df_fixed on
# one figure, each divided by 4.0 (the reason for the divisor is not visible in
# this notebook — TODO confirm), plus dashed red lines for selected events.
plt.figure(figsize=(20, 10))
plt.plot(df_fixed.index, df_fixed.adult_female_count_adjusted / 4.0, marker='o', label='Adult Female Count')
plt.plot(df_fixed.index, df_fixed.moving_count_adjusted / 4.0, marker='o', label='Mobile Count')
plt.plot(df_fixed.index, df_fixed.fixed_count / 4.0, marker='o', label='Stationary Count')
# NOTE(review): events are selected here with description == 'delicing' and the
# first match is skipped ([1:]), whereas construct_fixed_df and the mobile-count
# plot select event_type == 'DELOUSING' and use every match — confirm which
# filter is intended.
for idx, date in enumerate(df_events[df_events.description == 'delicing'].occurred_on_date.values[1:]):
    # Only the first drawn line gets a label so the legend has a single entry.
    if idx == 0:
        plt.axvline(x=date, color='r', linestyle='--', label='Delicing Event')
    else:
        plt.axvline(x=date, color='r', linestyle='--')

plt.title('Aquabyte Lice Counts')
plt.xlabel('Date')
plt.ylabel('Lice Count')
plt.legend(loc='upper left')
plt.grid()
plt.show()

In [None]:
# Plot only the simulated stationary series (df_fixed.fixed_count divided by 4;
# the reason for the divisor is not visible in this notebook — TODO confirm)
# with dashed red lines for selected events.
plt.figure(figsize=(20, 10))
# plt.plot(df_fixed.index, df_fixed.adult_female_count_adjusted, marker='o', label='Adult Female Count')
# plt.plot(df_fixed.index, df_fixed.moving_count_adjusted, marker='o', label='Moving Count')
plt.plot(df_fixed.index, df_fixed.fixed_count / 4, marker='o', label='Stationary Count')
# NOTE(review): events are selected here with description == 'delicing' and the
# first match is skipped ([1:]), whereas construct_fixed_df selects
# event_type == 'DELOUSING' and uses every match — confirm which filter is
# intended.
for idx, date in enumerate(df_events[df_events.description == 'delicing'].occurred_on_date.values[1:]):
    # Only the first drawn line gets a label so the legend has a single entry.
    if idx == 0:
        plt.axvline(x=date, color='r', linestyle='--', label='Delicing Event')
    else:
        plt.axvline(x=date, color='r', linestyle='--')

plt.title('Aquabyte Stationary Count')
plt.xlabel('Date')
plt.ylabel('Stationary Lice Count')
plt.legend(loc='upper left')
plt.grid()
plt.show()

In [None]:
df_fixed

In [None]:
df_events

In [None]:
plt.hist(df_fixed.adult_female_count_adjusted)
plt.show()

In [None]:
df_manual

In [None]:
# NOTE(review): this rebinds df_daily to the un-resampled 7-day rolling mean of
# df[columns] (one row per row of df, no 'date' column), replacing the daily
# frame built during dataset preparation. Calling construct_fixed_df(df_daily, ...)
# after this cell would fail on the missing 'date' column — confirm the name
# reuse is intended.
df_daily = df[columns].rolling('7D').mean()

In [None]:
df_daily

In [None]:
df.head()

In [None]:
# Number of non-null adult-female annotations per calendar day. Days with no
# samples become NaN and are then filled from the next day that has samples.
# - .bfill() replaces fillna(method='backfill'), deprecated in newer pandas.
# - The result is a Series, which has a name rather than columns; assigning to
#   .columns only attached an unused attribute, so the Series is renamed instead.
df_daily_count = (
    df['adult_female_count_adjusted']
    .dropna()
    .resample('D')
    .apply(lambda x: x.shape[0] if x.shape[0] else np.nan)
    .bfill()
    .rename('count')
)




In [None]:
df_daily_count

In [None]:
# Mean daily sample size over days after 2019-01-01. The filter is spelled out
# here because `mask` is first assigned in a later cell, so relying on it fails
# on a fresh top-to-bottom run.
df_daily_count[df_daily_count.index > '2019-01-01'].mean()

In [None]:
# Bar chart of the daily sample size.
fig, ax = plt.subplots(figsize=(30, 10))
# `mask` is not used by this chart but is kept: later cells index
# df_daily_count with it.
mask = df_daily_count.index > '2019-01-01'
ax.bar(df_daily_count.index, df_daily_count, width=0.8)
ax.set_title('Aquabyte Daily Sample Size')
# ax.axhline(y=df_daily_count.mean(), color='r', linestyle='--', label='Average daily sample size')
ax.set_xlabel('Date')
ax.set_ylabel('Daily Sample Size')
# No legend call: nothing drawn here carries a label (the axhline above is
# disabled), so plt.legend() only produced a "no artists with labels" warning.

# The caption's date range is taken from the plotted data; the previously
# hard-coded range did not match the window queried at the top of the notebook.
# NOTE(review): the site / pen name in the caption is unchanged — confirm it
# matches the pen being queried.
first_day = df_daily_count.index.min().strftime('%Y-%m-%d')
last_day = df_daily_count.index.max().strftime('%Y-%m-%d')
txt = '''Aquabyte daily sample sizes for Blom Kjeppevikholmen, Merd 2, from {} to {}
'''.format(first_day, last_day)
plt.figtext(0.5, 0.01, txt, wrap=True, horizontalalignment='center', fontsize=14)
fig.autofmt_xdate()


plt.grid()
plt.show()

In [None]:
# Fraction of the days selected by `mask` whose sample size exceeds 100.
masked_counts = df_daily_count[mask]
masked_counts[masked_counts > 100].shape[0] / masked_counts.shape[0]

In [None]:
# Average number of usable samples per day, broken down by hour of day.
mask = (df.index > '2019-02-21') & (df.adult_female_count_adjusted >= 0)
tdf = df[mask].copy()
# Shift the hour by one and wrap with % 24 so a 23:xx capture lands on hour 0
# rather than on a non-existent hour 24.
# NOTE(review): the axis label says CEST (UTC+2) but the shift is +1 hour —
# confirm the time zone of captured_at and the intended offset.
tdf['hour'] = (tdf.index.hour + 1) % 24
tdf = tdf[['adult_female_count_adjusted', 'hour']].dropna()
# Per-hour sample count divided by the number of daily rows after 2019-02-21.
xdf = tdf.groupby(tdf.hour)['adult_female_count_adjusted'].count() / df_fixed[df_fixed.index > '2019-02-21'].shape[0]
plt.figure(figsize=(10, 5))
plt.bar(xdf.index, xdf)
plt.title('Aquabyte Daily Sample Size by Hour of Day')
plt.xlabel('Hour of day (CEST)')
plt.ylabel('Average conditional daily sample size')
plt.grid()
plt.show()

In [None]:
# Assign each image width to the centre of its 250-pixel-wide bucket
# (0-249 -> 125, 250-499 -> 375, ...).
df['image_width_px_bucket'] = df.image_width_px.floordiv(250).mul(250).add(125)

In [None]:
# `kdf` is created in the following cell and displayed again once its 'weight'
# column has been added; displaying it at this point raised NameError on a
# fresh top-to-bottom run, so this cell intentionally shows nothing.

In [None]:

# Per image-width bucket: number of rows selected by `mask`, divided by the
# number of daily rows in df_fixed after 2019-02-21.
n_days = df_fixed[df_fixed.index > '2019-02-21'].shape[0]
bucket_counts = df[mask].groupby('image_width_px_bucket')['id'].count()
kdf = pd.DataFrame(bucket_counts / n_days)

In [None]:
plt.figure(figsize=(10, 5))
plt.bar(kdf.index, kdf.id, width=240)
plt.title('Aquabyte Daily Sample Size by Fish Image Size')
plt.xlabel('Image width (in pixels)')
plt.ylabel('Average conditional daily sample size')
plt.grid()

In [None]:
# Scale each bucket's pixel width (the kdf index) by 3.45e-6 and by 0.9 / 0.0138.
# NOTE(review): the constants look like a pixel pitch in metres and a
# distance / focal-length ratio, which would make this a physical length rather
# than a weight — confirm the units and the column name.
kdf['weight'] = (kdf.index) * (3.45e-6) * (0.9 / 0.0138)

In [None]:
kdf

In [None]:
normaltest(df['image_width_px_bucket'])

In [None]:
plt.hist(df['image_width_px_bucket'])

In [None]:
kdf.id

In [None]:
plt.figure(figsize=(20, 10))
plt.hist(df.image_width_px)
plt.grid()
plt.show()

In [None]:
# Autocorrelation of the daily ASLCS mobile count, one lag per day.
plt.figure(figsize=(10, 5))
# Drop NaNs first, as every df_manual autocorrelation call in this notebook
# does: a single NaN in the series makes every autocorrelation value NaN.
autocorrelation_plot(df_fixed.moving_count_adjusted.dropna())
plt.title('ASLCS Mobile Count ACF')
plt.xlabel('Temporal Lag (Days)')
# txt = '''Aquabyte moving count autocorrelation values for different lags (each lag period corresponds to a single day).
# The solid and dashed lines correspond to 95th and 99th percentile confidence bands, respectively.
# '''
# plt.figtext(0.5, 0.01, txt, wrap=True, horizontalalignment='center', fontsize=12)

In [None]:
plt.figure(figsize=(10, 5))
autocorrelation_plot(df_manual['Moving Count'].dropna())
plt.title('Manual Mobile Count ACF')
plt.xlabel('Temporal lag (Weeks)')
# txt = '''Manual moving count autocorrelation values for different lags (each lag period corresponds to roughly 1 week).
# The solid and dashed lines correspond to 95th and 99th percentile confidence bands, respectively.
# '''
# plt.figtext(0.5, 0.01, txt, wrap=True, horizontalalignment='center', fontsize=12)

In [None]:
df_manual['Moving Count'].dropna().autocorr(3)

In [None]:
# Durbin-Watson statistic of the daily mobile count. NaNs are dropped first (as
# the df_manual call does); otherwise a single NaN makes the statistic NaN.
durbin_watson(df_fixed.moving_count_adjusted.dropna())

In [None]:
durbin_watson(df_manual['Fixed Count'].dropna())

In [None]:
# Simulate 1000 steps of a series whose increment is 0.8 times the previous
# increment plus standard-normal noise, starting from the values 0 and 1.
x_values = list(range(1000))
y_values = [0, 1]
for i in range(2, 1000):
    momentum = 0.8*(y_values[i-1] - y_values[i-2])
    y = y_values[i-1] + momentum + 1.*random.gauss(0, 1)
    y_values.append(y)
    




In [None]:
plt.scatter(x_values, y_values)

In [None]:
# Two side-by-side scatter plots of sin(x) on [0, 50): the left panel samples
# every 0.1, the right panel every 1.
plt.figure(figsize=(20, 6))
for panel, step in enumerate((0.1, 1.), start=1):
    plt.subplot(1, 2, panel)
    x = np.arange(0, 50, step)
    y = np.sin(x)
    plt.scatter(x, y)

txt = '''Samples captured from f(x) = sin(x). Left plot displays samples at intervals of size 0.1, 
while the right plot displays samples at intervals of size 1. 
'''
plt.figtext(0.5, -0.07, txt, wrap=True, horizontalalignment='center', fontsize=14)

In [None]:
x = np.arange(0, 50, 1.)
y = np.sin(x)
plt.scatter(x, y)

In [None]:
df_manual['Moving Count'].dropna()

In [None]:
# Welch t-test against the daily sample sizes after 2019-01-01. The day filter
# is written out because `mask` was re-assigned for the hour-of-day chart to a
# per-row mask over df, which cannot index df_daily_count.
# NOTE(review): the first argument is a single integer (the number of manual
# counts), not a sample of values — confirm which comparison was intended.
ttest_ind(df_manual['Moving Count'].dropna().shape[0], df_daily_count[df_daily_count.index > '2019-01-01'], equal_var=False)

In [None]:
# Welch t-test of nine samples of size 10 against the daily sample sizes after
# 2019-01-01. The day filter is written out because `mask` was re-assigned for
# the hour-of-day chart to a per-row mask over df, which cannot index
# df_daily_count.
ttest_ind(np.array([10]*9), df_daily_count[df_daily_count.index > '2019-01-01'], equal_var=False)

In [None]:
from scipy.stats import norm

In [None]:
df_manual['Moving Count'].dropna()

In [None]:
# Autocorrelation of the manual mobile counts at lags 1-7, each printed with
# 1 - Phi(ac * sqrt(n)) as a one-sided p-value.
manual_moving = df_manual['Moving Count'].dropna()
n_obs = manual_moving.shape[0]
# The ACF does not depend on the loop variable, so it is computed once. nlags is
# passed explicitly (capped at n - 1) because the library default has differed
# between statsmodels releases and may be smaller than the largest lag used.
max_lag = min(7, n_obs - 1)
manual_acf = acf(manual_moving, nlags=max_lag) if max_lag >= 1 else []
for lag in range(1, max_lag + 1):
    ac = manual_acf[lag]
    print(ac, 1 - norm.cdf(ac * n_obs**.5))

In [None]:
# Print LaTeX table rows "lag & autocorrelation & p-value" for the daily mobile
# count.
moving_daily = df_fixed['moving_count_adjusted'].dropna()
n_obs = moving_daily.shape[0]
requested_lags = [1, 2, 3, 4, 5, 10, 15, 20, 30, 40, 50]
# A lag needs at least lag + 1 observations; lags the series is too short for
# are skipped instead of indexing past the end of the ACF array.
usable_lags = [lag for lag in requested_lags if lag < n_obs]
# Computed once, with nlags set explicitly so the array always reaches the
# largest lag used (the library default may be smaller).
acf_values = acf(moving_daily, nlags=max(usable_lags)) if usable_lags else []
for lag in usable_lags:
    ac = acf_values[lag]
    # One-sided tail probability from the unrounded statistic; rounding is
    # applied only to the value that is printed.
    cdf = norm.cdf(ac * n_obs**.5)
    p = min(cdf, 1 - cdf)
    mantissa, exp = ('%.2E' % p).split('E')
    # Braces around the exponent: in LaTeX, 10^-02 superscripts only the "-".
    p_str = '{0} \\times 10^{{{1}}}'.format(mantissa, int(exp))
    print('{} & {} & ${}$ \\\\'.format(lag, round(ac, 2), p_str))

In [None]:
# Print LaTeX table rows "lag & autocorrelation & p-value" for the manual
# mobile counts.
manual_moving = df_manual['Moving Count'].dropna()
n_obs = manual_moving.shape[0]
requested_lags = [1, 2, 3, 4, 5, 6, 7, 8, 9]
# A lag needs at least lag + 1 observations; lags the series is too short for
# are skipped instead of indexing past the end of the ACF array.
usable_lags = [lag for lag in requested_lags if lag < n_obs]
# Computed once, with nlags set explicitly so the array always reaches the
# largest lag used (the library default may be smaller).
acf_values = acf(manual_moving, nlags=max(usable_lags)) if usable_lags else []
for lag in usable_lags:
    ac = acf_values[lag]
    # One-sided tail probability from the unrounded statistic; rounding is
    # applied only to the value that is printed.
    cdf = norm.cdf(ac * n_obs**.5)
    p = min(cdf, 1 - cdf)
    mantissa, exp = ('%.2E' % p).split('E')
    # Braces around the exponent: in LaTeX, 10^-02 superscripts only the "-".
    p_str = '{0} \\times 10^{{{1}}}'.format(mantissa, int(exp))
    print('{} & {} & ${}$ \\\\'.format(lag, round(ac, 2), p_str))