In [None]:
import datetime
import pandas as pd
import numpy as np
import random
from matplotlib import pyplot as plt
from pandas.plotting import autocorrelation_plot
from sklearn.linear_model import LinearRegression

from statsmodels.tsa.stattools import acf, pacf
import json, os
import matplotlib.dates as mdates

from statsmodels.stats.stattools import durbin_watson
from scipy.stats import ttest_ind, normaltest
from decimal import Decimal
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
# Notebook display limits: render up to 500 rows, and allow columns up to
# 500 characters wide, before pandas truncates the output.
pd.options.display.max_rows = 500
pd.options.display.max_colwidth = 500

In [None]:
# Load the CSV with its first column as the index, ordered by that index.
# Later cells filter this frame by `localityNo` to get the manual counts.
df_bw = (
    pd.read_csv('2012_2020_18-fixed-3.csv', index_col=0)
    .sort_index()
)

In [None]:
# Fetch every row of the `sites` table; a later cell looks the chosen site up
# in this frame by name.
# NOTE(review): `rds_access_utils` is not created in any cell visible here
# (only the RDSAccessUtils class is imported) -- confirm it is instantiated
# before this cell so the notebook survives Restart & Run All.
query = """
    select * from sites;
"""

df_sites = rds_access_utils.extract_from_database(query)

# Last expression of the cell: render the table for inspection.
df_sites

In [None]:
# query = """
#     select * from pens;
# """

# df_pens = rds_access_utils.extract_from_database(query)

# df_pens

In [None]:
# Site to analyse; uncomment one of the alternatives to switch.
site_name = 'Seglberget'
#site_name = 'Tittelsnes'
#site_name = 'Aplavika'

# Rows of the sites table whose name matches exactly.
site = df_sites[df_sites.name == site_name]

# Fail with a readable message instead of the bare IndexError that
# `.values[0]` would raise when the name is misspelled or not in the table.
if site.empty:
    raise ValueError('No site named %r found in df_sites' % (site_name, ))

# If several rows share the name, the first one is used.
siteId = site['id'].values[0]
localityNo = site['government_site_number'].values[0]

# Manual-count rows whose locality number matches the selected site.
manualCountsForLocality = df_bw[df_bw.localityNo == localityNo]

In [None]:
# Fetch the pens that belong to the selected site.
query = """
    select * from pens where site_id=%i;
""" % (siteId, )
df_pens = rds_access_utils.extract_from_database(query)

# Pen ids as a plain Python list; the next cell queries each pen in turn.
penIds = df_pens['id'].values.tolist()

In [None]:
# Collect one day_summaries frame per pen. A pen whose query fails is reported
# and left out rather than aborting the whole run.
penDFs = []

for penId in penIds:
    query = """
        select * from day_summaries
        where pen_id = %i;
    """ % (penId, )

    # Echo the SQL so the cell output shows which pen is being fetched.
    print(query)

    try:
        df_ab = rds_access_utils.extract_from_database(query)
    except Exception as fetch_error:
        print('Not including pen id %i' % (penId, ))
        print(fetch_error)
    else:
        # Disabled per-pen daily resampling, kept for reference:
        #df_ab.index = pd.to_datetime(df_ab.captured_at)
        #df_ab = df_ab.sort_index()
        #df_daily = df_ab[columns].rolling('1D').mean().resample('D').apply(lambda x:x.tail(1) if x.shape[0] else np.nan).fillna(method='backfill')
        #df_daily['date'] = pd.to_datetime(df_daily.index, format='%Y-%m-%d').astype(str)
        penDFs.append(df_ab)

In [None]:
# Columns carried into the site-level averages.
columns = ['female_avg', 'moving_avg', 'female_moving_avg', 'moving_moving_avg']

# Stack all pens into one frame indexed by the parsed `date` column, ordered by
# date, keeping only rows that have both plain averages.
siteDF = pd.concat(tuple(penDFs))
siteDF.index = pd.to_datetime(siteDF.date)
siteDF = siteDF.sort_index().dropna(subset=['female_avg', 'moving_avg'])

# NOTE(review): rows from every pen share this index, so the same date may
# appear more than once -- confirm that a per-row 1-day rolling mean is the
# intended site-level aggregate.
siteAvgDF = siteDF[columns].rolling('1D').mean()

# First and last calendar dates covered; used to bound the weekly comparison.
siteAvgDFStartDate = siteAvgDF.index.min().date()
siteAvgDFEndDate = siteAvgDF.index.max().date()

# Timezone-naive copy of the index as an ordinary column for date filtering.
siteAvgDF['date'] = siteAvgDF.index.tz_localize(None)

In [None]:
# Render the site-level rolling averages for visual inspection.
siteAvgDF

In [None]:
# For each manual count at this locality, average the Aquabyte site-level
# series over the matching window and keep the pair for comparison.
# Each appended row is:
#   (window start, manual adult-female, manual mobile,
#    Aquabyte female_avg mean, mean of its positive values, female_moving_avg mean,
#    Aquabyte moving_avg mean, mean of its positive values, moving_moving_avg mean)
results = []

for index, manualCount in manualCountsForLocality.iterrows():
    # Monday of the reported (year, week), and the date six days before it.
    # NOTE(review): %W numbers weeks from the first Monday of the year, which
    # is not the ISO-8601 week number. If the manual counts use ISO weeks the
    # window is shifted in some years (ISO parsing would be "%G-W%V-%u") --
    # confirm against the data source before changing.
    d = '%i-W%i' % (manualCount.year, manualCount.week)
    endDate = datetime.datetime.strptime(d + '-1', "%Y-W%W-%w").date()
    startDate = endDate - datetime.timedelta(days=6)

    # Skip weeks that are not fully inside the range covered by siteAvgDF.
    if not (startDate >= siteAvgDFStartDate and endDate <= siteAvgDFEndDate):
        continue

    # siteAvgDF.date is a datetime64 column; pandas rejects ordering
    # comparisons between datetime64 and datetime.date with a TypeError, so
    # compare against Timestamps (midnight of each bound) instead.
    # The window is half-open: (startDate, endDate].
    windowStart = pd.Timestamp(startDate)
    windowEnd = pd.Timestamp(endDate)
    subset = (siteAvgDF.date > windowStart) & (siteAvgDF.date <= windowEnd)

    aquabyteAFSubset = siteAvgDF.female_avg[subset]
    aquabyteAFSubset2 = aquabyteAFSubset[aquabyteAFSubset > 0]
    aquabyteAFSubset3 = siteAvgDF.female_moving_avg[subset]

    aquabyteMSubset = siteAvgDF.moving_avg[subset]
    aquabyteMSubset2 = aquabyteMSubset[aquabyteMSubset > 0]
    aquabyteMSubset3 = siteAvgDF.moving_moving_avg[subset]

    # Require at least two positive adult-female values in the window.
    if len(aquabyteAFSubset2) < 2:
        continue

    results.append((
        startDate,
        manualCount.avgAdultFemaleLice,
        manualCount.avgMobileLice,
        np.mean(aquabyteAFSubset),
        np.mean(aquabyteAFSubset2),
        np.mean(aquabyteAFSubset3),
        np.mean(aquabyteMSubset),
        np.mean(aquabyteMSubset2),
        np.mean(aquabyteMSubset3),
    ))

# Transpose the list of rows into a list of columns (empty list if no rows).
results = list(zip(*results))

In [None]:
# `results` is an empty list when no week qualified; fail with a readable
# message instead of a bare IndexError on results[0].
if not results:
    raise ValueError('No weekly comparisons available for site %s' % (site_name, ))

# Unpack the result columns (one array per column, one entry per week).
dates = np.array(results[0])
manualCountsAF = np.array(results[1])
manualCountsM = np.array(results[2])
aquabyteCountsAF = np.array(results[3])
aquabyteCountsNonZeroAF = np.array(results[4])
aquabyteCountsMovingAF = np.array(results[5])
aquabyteCountsM = np.array(results[6])
aquabyteCountsNonZeroM = np.array(results[7])
aquabyteCountsMovingM = np.array(results[8])

# Toggle: True analyses adult-female lice, False analyses mobile lice.
analyzeAF = False

if analyzeAF:
    liceType = 'Adult Female'
    manualCounts = manualCountsAF
    aquabyteCounts = aquabyteCountsAF
    aquabyteCountsNonZero = aquabyteCountsNonZeroAF
    aquabyteCountsMoving = aquabyteCountsMovingAF
else:
    liceType = 'Mobile'
    manualCounts = manualCountsM
    aquabyteCounts = aquabyteCountsM
    aquabyteCountsNonZero = aquabyteCountsNonZeroM
    aquabyteCountsMoving = aquabyteCountsMovingM

# Least-squares line through the origin (no intercept term) predicting the
# Aquabyte moving-average series from the manual counts.
# NOTE(review): the fit targets aquabyteCountsMoving, while the first panel
# scatters aquabyteCounts -- confirm this pairing is intended.
X = manualCounts.reshape(-1, 1)
Y = aquabyteCountsMoving
reg = LinearRegression(fit_intercept = False).fit(X, Y)

print(reg.intercept_, reg.coef_)
print(reg.score(X, Y))

predY = reg.predict(X)

fig, ax = plt.subplots(3)

fig.set_size_inches(10, 30)

# Panel 0: Aquabyte against manual counts, with the fitted line overlaid.
ax[0].scatter(manualCounts, aquabyteCounts, label = 'Aquabyte')
ax[0].plot(manualCounts, predY, '-', label = 'Fit through origin (Aquabyte moving avg)')
ax[0].set_title('%s: Aquabyte vs Manual (%s)' % (site_name, liceType, ))
ax[0].set_xlabel('Manual Count')
ax[0].set_ylabel('Aquabyte Count')
ax[0].legend()

# Panel 1: per-week difference, with a dashed zero reference line.
ax[1].scatter(dates, aquabyteCounts - manualCounts)
ax[1].plot(dates, np.zeros(len(dates)), '--')
ax[1].set_title('%s: Aquabyte - Manual Delta (%s)' % (site_name, liceType, ))
ax[1].set_xlabel('Date')
ax[1].set_ylabel('Count')

# Panel 2: the series over time; labelled so the legend identifies each line.
ax[2].plot(dates, manualCounts, linestyle = '--', marker = 'o', color = 'red', label = 'Manual')
ax[2].plot(dates, aquabyteCounts, linestyle = '-', marker = 'o', color = 'blue', label = 'Aquabyte')
#ax[2].plot(dates, aquabyteCountsNonZero, linestyle = '-', marker = 'o', color = 'green')
ax[2].plot(dates, aquabyteCountsMoving, linestyle = '-', marker = 'o', color = 'purple', label = 'Aquabyte (moving avg)')
ax[2].set_title('%s: Counts over time (%s)' % (site_name, liceType, ))
ax[2].set_xlabel('Date')
ax[2].set_ylabel('Count')
ax[2].legend();