In [None]:
from collections import defaultdict
import json
import os
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

from research.utils.data_access_utils import RDSAccessUtils

In [None]:
# Build the RDS accessor from the JSON credentials file whose path is given by the
# PROD_RESEARCH_SQL_CREDENTIALS environment variable. The context manager makes sure
# the file handle is closed as soon as the credentials have been parsed.
with open(os.environ['PROD_RESEARCH_SQL_CREDENTIALS']) as credentials_file:
    rds = RDSAccessUtils(json.load(credentials_file))

In [None]:
# Select every column from `annotations` for rows flagged is_qa in the 'plali-test'
# group that were created after 2020-08-01 08:00:00, leaving out rows annotated by
# the three listed email addresses.
# NOTE(review): the LIKE pattern contains no % or _ wildcard, so it looks like an
# exact match on group_id was intended — confirm.
query = """
    select *
    from annotations
    where annotated_by_email not in ('jane@aquabyte.ai', 'alok@aquabyte.ai', 'reina@aquabyte.ai')
    and group_id like 'plali-test'
    and created_at > '2020-08-01 08:00:00'
    and is_qa=TRUE;
"""

# Run the query through the RDS helper; the result is presumably a pandas DataFrame
# (later cells call .drop / .iterrows on it) — verify against RDSAccessUtils.
df = rds.extract_from_database(query)

In [None]:
# Remove the row whose index label is 0.
# NOTE(review): the reason this row is excluded is not recorded here — confirm.
# Re-running this cell without re-running the query raises KeyError, because the
# label is already gone.
df = df.drop(index=0)

In [None]:
# First left-image URL; it is not referenced again in the visible cells, so it is
# presumably kept only to eyeball the URL layout.
url = df['left_image_url'].iloc[0]

# Derive `date` from the URL: take the '/'-separated segment at index 6 and strip
# its first five characters (presumably a 'date='-style prefix — TODO confirm).
df['date'] = df['left_image_url'].apply(lambda image_url: image_url.split('/')[6][5:])


<h1> Publish basic stats </h1>

In [None]:
# how many wounds per fish are there?


In [None]:
# Flatten the left-crop annotations into column-oriented records, one entry per
# annotated item.
# NOTE(review): items are read from the 'lice' key although they are treated as
# wounds here — presumably the annotation schema reuses that key; confirm.
wound_data = defaultdict(list)
for annotation, image_url, capture_date in zip(df['annotation'], df['left_image_url'], df['date']):
    left_crop = annotation.get('leftCrop')
    if not left_crop:
        # Rows with a missing or empty left crop contribute no records.
        continue
    for wound in left_crop['lice']:
        # The label text decides both categories via substring checks.
        is_full_fish = 'full' in wound['label']
        is_winter_wound = 'winter' in wound['label']
        wound_data['fish_type'].append('full' if is_full_fish else 'partial')
        wound_data['wound_type'].append('winter' if is_winter_wound else 'other')
        wound_data['width'].append(wound['width'])
        wound_data['height'].append(wound['height'])
        wound_data['url'].append(image_url)
        wound_data['date'].append(capture_date)

In [None]:
# One row per annotated wound. The column set is pinned explicitly so that the frame
# still exposes every expected column when no wounds were collected (an empty
# `wound_data` would otherwise yield a frame without columns and break the
# `wound_df.wound_type` / `wound_df.width` lookups in later cells).
wound_df = pd.DataFrame(
    wound_data,
    columns=['fish_type', 'wound_type', 'width', 'height', 'url', 'date'],
)

In [None]:
# Per-date aggregates of winter wounds at several minimum-width thresholds.
#   sample_sizes[i]             -> number of rows of `df` for dates[i]
#   mean_counts_by_width[w][i]  -> winter wounds wider than w, divided by that row count
#   pcts_by_width[w][i]         -> distinct image URLs with such a wound, divided by that
#                                  row count (a fraction; the plotting cell scales it by 100)
# NOTE(review): the URL-based fraction assumes each `df` row has its own
# left_image_url — confirm.
dates = sorted(list(df.date.unique()))
widths = np.arange(0, 100, 20)
winter_mask = wound_df.wound_type == 'winter'
mean_counts_by_width = defaultdict(list)
pcts_by_width = defaultdict(list)
sample_sizes = []
for date in dates:
    # The day's row count and its winter wounds do not depend on the width
    # threshold, so compute them once per date instead of once per width.
    day_sample_size = df[df.date == date].shape[0]
    sample_sizes.append(day_sample_size)
    day_winter_wounds = wound_df[(wound_df.date == date) & winter_mask]
    for width in widths:
        wide_wounds = day_winter_wounds[day_winter_wounds.width > width]
        mean_count = wide_wounds.shape[0] / day_sample_size
        mean_counts_by_width[width].append(mean_count)
        pct = len(wide_wounds.url.unique()) / day_sample_size
        pcts_by_width[width].append(pct)
    

In [None]:
# Three stacked panels sharing the same date axis: daily sample size, mean winter
# wounds per sample, and percentage of samples with at least one winter wound.
fig, axes = plt.subplots(3, 1, figsize=(10, 13))

# The sample size does not depend on the width threshold, so it is drawn once; the
# label gives the legend() call on this panel an entry to show.
axes[0].plot(dates, sample_sizes, label='sample size')

# Only the "width > 0" threshold is plotted; add values from `widths` to compare more.
for width in [0]:
    threshold_label = 'width > {} pixels'.format(width)
    axes[1].plot(dates, mean_counts_by_width[width], label=threshold_label)
    # pcts_by_width holds fractions; scale to percent for display.
    axes[2].plot(dates, 100 * np.array(pcts_by_width[width]), label=threshold_label)

axes[0].set_ylim([0, 120])
axes[0].grid()
axes[0].legend()
axes[0].set_title('Daily Sample Size')

axes[1].set_ylim([0, 0.5])
axes[1].grid()
axes[1].legend()
axes[1].set_title('Daily Winter Wounds per fish')

axes[2].set_ylim([0, 25])
axes[2].grid()
axes[2].legend()
axes[2].set_xlabel('Date')
axes[2].set_ylabel('% of fish with winter wounds')

axes[2].set_title('Daily Percentage (%) of fish with winter wounds')

plt.show()

In [None]:
# Fraction of recorded wounds that were annotated on a partial (not full) fish.
# The category lives in the `fish_type` column; `wound_df` has no `type` column.
total_wounds = wound_df.shape[0]
# Report NaN instead of raising ZeroDivisionError when no wounds were recorded.
wound_df[wound_df.fish_type == 'partial'].shape[0] / total_wounds if total_wounds else float('nan')

In [None]:
# Display the per-date row counts of `df`, in the same order as `dates`.
sample_sizes

In [None]:
# Display the per-date winter-wound counts divided by the day's sample size, keyed by
# the minimum-width threshold.
mean_counts_by_width

In [None]:
# Histogram of wound widths over every recorded wound (winter and other alike).
# Widths are labelled in pixels, matching the 'width > N pixels' legend text used
# for the daily plots.
hist_fig, hist_ax = plt.subplots(figsize=(20, 10))
hist_ax.hist(wound_df.width.values, bins=100)
hist_ax.set_xlabel('Wound width (pixels)')
hist_ax.set_ylabel('Number of wounds')
hist_ax.set_title('Distribution of wound widths')
hist_ax.grid()
plt.show()

In [None]:
# Write the annotation frame `df` to CSV at a fixed absolute path.
# NOTE(review): the file name says "wound data" but it is `df`, not `wound_df`, that
# gets written — confirm this is intended.
qa_csv_path = '/root/data/alok/biomass_estimation/playground/qa_wound_data.csv'
df.to_csv(qa_csv_path)