In [None]:
%load_ext autoreload
%autoreload 2

from research.utils.data_access_utils import S3AccessUtils
from research.weight_estimation.keypoint_utils.body_parts import BodyParts



In [None]:
from collections import defaultdict
from copy import copy, deepcopy
import json, math, os, random
import numpy as np
import pandas as pd

from scipy.spatial import Delaunay

from research.weight_estimation.keypoint_utils.optics import pixel2world
from research.weight_estimation.akpd_utils.akpd_scorer import generate_confidence_score
from research.weight_estimation.keypoint_utils.body_parts import BodyParts
from research.utils.data_access_utils import S3AccessUtils, RDSAccessUtils

# Core body parts used in GTSF and the weight estimation model, as returned by
# BodyParts().get_core_body_parts().
# NOTE(review): a later cell imports BODY_PARTS from
# research.weight_estimation.old.data_loader, which rebinds this name -- confirm
# which definition downstream cells are expected to see.
BODY_PARTS = BodyParts().get_core_body_parts()


# Class for extracting and pre-processing GTSF data
class GTSFDataset(object):
    """Extract GTSF records from the research database and pre-process them.

    Construction runs the whole pipeline: raw rows for the requested capture
    window are queried, restricted to one species, de-duplicated in favour of
    QA'ed annotations, and augmented with world keypoints, k-factor, AKPD
    scores and (optionally) template-matching body keypoints. The prepared
    frame is available through ``get_prepared_dataset``.

    Args:
        start_date: lower bound substituted into the ``captured_at between`` clause.
        end_date: upper bound substituted into the ``captured_at between`` clause.
        akpd_scorer_url: URL of the AKPD scorer model file handed to ``load_model``.
        species: value compared against the lower-cased ``data['species']`` field.
        add_template_matching_keypoints: when True, also append 'BODY' keypoints
            derived from the template-matching parquet file.
    """

    def __init__(self, start_date, end_date, akpd_scorer_url, species='salmon', add_template_matching_keypoints=False):
        # read the credentials through a context manager so the handle is closed
        with open(os.environ['AWS_CREDENTIALS']) as credentials_file:
            aws_credentials = json.load(credentials_file)
        self.s3_access_utils = S3AccessUtils('/root/data', aws_credentials)
        self.df = self.generate_raw_df(start_date, end_date)
        self.prepare_df(akpd_scorer_url, species, add_template_matching_keypoints)

    # generate raw GTSF dataframe from database
    @staticmethod
    def generate_raw_df(start_date, end_date):
        """Query fish metadata joined with keypoint annotations for the date window.

        NOTE: the dates are interpolated directly into the SQL text, so only
        trusted values should be passed.

        Returns:
            The DataFrame produced by ``RDSAccessUtils.extract_from_database``.
        """
        with open(os.environ['PROD_RESEARCH_SQL_CREDENTIALS']) as credentials_file:
            rds_access_utils = RDSAccessUtils(json.load(credentials_file))
        query = """
            select * from research.fish_metadata a left join keypoint_annotations b
            on a.left_url = b.left_image_url 
            where b.keypoints -> 'leftCrop' is not null
            and b.keypoints -> 'rightCrop' is not null
            and b.captured_at between '{0}' and '{1}';
        """.format(start_date, end_date)
        df = rds_access_utils.extract_from_database(query)
        print('Raw dataframe loaded!')
        return df

    @staticmethod
    def get_world_keypoints(row):
        """Convert a row's left/right crop keypoints to world coordinates via ``pixel2world``."""
        return pixel2world(row.keypoints['leftCrop'], row.keypoints['rightCrop'], row.camera_metadata)

    # add AKPD score, spatial information, k-factor, and template matching body-keypoints (if applicable)
    def prepare_df(self, akpd_scorer_url, species, add_template_matching_keypoints):
        """Filter ``self.df`` to one species and add the derived columns in place."""
        # use QA'ed entries, and only use Cogito entries when QA data is unavailable
        self.df = self.df[self.df.data.apply(lambda x: x['species'].lower()) == species].copy(deep=True)
        qa_df = self.df[self.df.is_qa == True]
        cogito_df = self.df[(self.df.is_qa != True) & ~(self.df.left_image_url.isin(qa_df.left_image_url))]
        self.df = pd.concat([qa_df, cogito_df], axis=0)
        print('Dataset preparation beginning...')

        # add 3D spatial information
        self.df['world_keypoints'] = self.df.apply(self.get_world_keypoints, axis=1)
#         self.df['median_depth'] = self.df.world_keypoints.apply(lambda x: np.median([wkp[1] for wkp in x.values()]))
        print('3D spatial information added!')

        # add k-factor
        self.df['k_factor'] = 1e5 * self.df.weight / self.df.data.apply(lambda x: x['lengthMms']**3).astype(float)

        # add AKPD scores and convert world keypoints to matrix form
        self.add_akpd_scores(akpd_scorer_url)
        if add_template_matching_keypoints:
            self.add_template_matching_keypoints()

    @staticmethod
    def in_hull(p, hull):
        """Return a boolean array marking which points of ``p`` fall inside the
        Delaunay triangulation built from the points in ``hull``."""
        hull = Delaunay(hull)
        # find_simplex yields -1 for points outside the triangulation
        return hull.find_simplex(p) >= 0

    # generate SIFT based template matching keypoints
    def add_template_matching_keypoints(self):
        """Append a 'BODY' keypoint entry built from template-matching matches.

        Rows are inner-joined with the template-matching results on
        ``left_image_url``; the original keypoints are preserved in
        ``old_keypoints`` and only rows with more than 500 in-hull body points
        are kept.
        """
        print('Adding template matching body keypoints...')

        # load data
        gen = self.s3_access_utils.get_matching_s3_keys(
            'aquabyte-research',
            prefix='template-matching/2019-12-05T02:50:57',
            suffixes=['.parquet']
        )

        keys = list(gen)
        f = self.s3_access_utils.download_from_s3('aquabyte-research', keys[0])
        pdf = pd.read_parquet(f)
        # np.float / np.int were aliases of the builtins and were removed in
        # NumPy 1.24; the builtin types give the same dtypes
        pdf['homography'] = pdf.homography_and_matches.apply(lambda x: np.array(x[0].tolist(), dtype=float))
        pdf['matches'] = pdf.homography_and_matches.apply(
            lambda x: np.array(x[1].tolist(), dtype=int) if len(x) > 1 else None
        )

        # merge with existing dataframe
        self.df = pd.merge(self.df, pdf[['left_image_url', 'homography', 'matches']], how='inner', on='left_image_url')

        # rows without a 2-D match matrix cannot be sliced column-wise below and
        # could never pass the final body-point threshold, so drop them up front
        has_matches = self.df.matches.apply(lambda m: m is not None and np.ndim(m) == 2).astype(bool)
        self.df = self.df[has_matches].copy()

        # generate list of modified keypoints
        modified_keypoints_list = []
        for count, (idx, row) in enumerate(self.df.iterrows()):
            if count % 100 == 0:
                print(count)
            X_keypoints = np.array([[item['xFrame'], item['yFrame']] for item in row.keypoints['leftCrop']])
            X_body = np.array(row.matches)
            # keep only matches whose left-frame coordinates lie within the keypoint hull
            is_valid = self.in_hull(X_body[:, :2], X_keypoints)
            X_body = X_body[is_valid]

            # deepcopy so the row's original keypoints are not mutated by the appends
            keypoints = deepcopy(row.keypoints)
            left_keypoints, right_keypoints = keypoints['leftCrop'], keypoints['rightCrop']
            left_item = {'keypointType': 'BODY', 'xFrame': X_body[:, 0], 'yFrame': X_body[:, 1]}
            right_item = {'keypointType': 'BODY', 'xFrame': X_body[:, 2], 'yFrame': X_body[:, 3]}

            left_keypoints.append(left_item)
            right_keypoints.append(right_item)
            modified_keypoints = {'leftCrop': left_keypoints, 'rightCrop': right_keypoints}
            modified_keypoints_list.append(modified_keypoints)

        # add modified keypoints information to dataframe
        self.df['old_keypoints'] = self.df.keypoints
        self.df['keypoints'] = modified_keypoints_list
        self.df = self.df[self.df.keypoints.apply(lambda x: x['leftCrop'][-1]['xFrame'].shape[0]) > 500]

    # generate AKPD scores
    def add_akpd_scores(self, akpd_scorer_url):
        """Score every row with the AKPD scorer network and store it in ``akpd_score``."""
        # imported here so the method works even if the notebook cell that
        # imports load_model has not been run yet
        from keras.models import load_model

        print('Adding AKPD scores...')
        # load neural network weights
        akpd_scorer_path, _, _ = self.s3_access_utils.download_from_url(akpd_scorer_url)
        akpd_scorer_network = load_model(akpd_scorer_path)

        akpd_scores = []
        for idx, row in self.df.iterrows():
            input_sample = {
                'keypoints': row.keypoints,
                'cm': row.camera_metadata,
                'stereo_pair_id': row.id,
                'single_point_inference': True
            }
            akpd_score = generate_confidence_score(input_sample, akpd_scorer_network)
            akpd_scores.append(akpd_score)
        self.df['akpd_score'] = akpd_scores

    # return fully pre-processed GTSF dataset for downstream training
    def get_prepared_dataset(self):
        """Return the prepared DataFrame (the same object held in ``self.df``)."""
        return self.df


def main():
    """Build the GTSF dataset for 2019-06-01 .. 2019-06-10 and print its shape."""
    scorer_url = 'https://aquabyte-models.s3-us-west-1.amazonaws.com/keypoint-detection-scorer/akpd_scorer_model_TF.h5'
    prepared_df = GTSFDataset('2019-06-01', '2019-06-10', scorer_url).get_prepared_dataset()
    print(prepared_df.shape)




In [None]:
# Load every row of stereo_frame_pairs from the database configured by SQL_CREDENTIALS.
# start_date / end_date are kept defined in case other cells rely on them; the
# query itself is unfiltered.
start_date, end_date = '2019-03-01', '2020-01-01'
with open(os.environ['SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))
# The query has no placeholders, so the former .format(start_date, end_date)
# call was a no-op and has been dropped.
query = """
    select * from stereo_frame_pairs;
"""
df = rds_access_utils.extract_from_database(query)

In [None]:
# Show only the rows whose ground-truth metadata lists the species as 'trout'.
species_per_row = df.ground_truth_metadata.apply(lambda raw: json.loads(raw)['data'].get('species'))
df[species_per_row == 'trout']

In [None]:
from keras.models import load_model

In [None]:
# Prepare the GTSF dataset restricted to trout for 2019-03-01 .. 2020-02-10.
akpd_scorer_url = 'https://aquabyte-models.s3-us-west-1.amazonaws.com/keypoint-detection-scorer/akpd_scorer_model_TF.h5'
gtsf_dataset = GTSFDataset(
    start_date='2019-03-01',
    end_date='2020-02-10',
    akpd_scorer_url=akpd_scorer_url,
    species='trout',
)
df = gtsf_dataset.get_prepared_dataset()
# df = df[(df.captured_at < '2019-09-20') & (df.median_depth < 1.0) & (df.akpd_score > 0.5)]

In [None]:
from research.weight_estimation.old.weight_estimator_old import NormalizedStabilityTransform, Network
from research.weight_estimation.old.data_loader import KeypointsDataset, NormalizeCentered2D, ToTensor, BODY_PARTS
from research.weight_estimation.keypoint_utils.optics import pixel2world
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils


In [None]:
# Run the trained weight-estimation network over every row of df and collect
# the scaled scalar outputs in weight_predictions.
normalize_centered_2D_transform = NormalizeCentered2D()
normalized_stability_transform = NormalizedStabilityTransform()
to_tensor_transform = ToTensor()

# read the credentials through a context manager so the handle is closed
with open(os.environ['AWS_CREDENTIALS']) as credentials_file:
    s3_access_utils = S3AccessUtils('/root/data', json.load(credentials_file))
model_url = 'https://aquabyte-models.s3-us-west-1.amazonaws.com/biomass/trained_models/2019-11-08T00-13-09/nn_epoch_798.pb'
model_f, _, _ = s3_access_utils.download_from_url(model_url)
network = torch.load(model_f)
# NOTE(review): the network is used in whatever train/eval mode it was saved
# in -- confirm whether network.eval() should be called before inference.

weight_predictions = []
# inference only: skip building the autograd graph for each forward pass
with torch.no_grad():
    for count, (idx, row) in enumerate(df.iterrows()):
        if count % 1000 == 0:
            print(count)

        input_sample = {
            'keypoints': row.keypoints,
            'cm': row.camera_metadata,
            'stereo_pair_id': 0,
            'single_point_inference': True
        }
        # apply the transforms in the same order as before, calling them directly
        normalized_centered_2D_kps = normalize_centered_2D_transform(input_sample)
        normalized_stability_kps = normalized_stability_transform(normalized_centered_2D_kps)
        tensorized_kps = to_tensor_transform(normalized_stability_kps)
        weight_prediction = network(tensorized_kps['kp_input']).item() * 1e4
        weight_predictions.append(weight_prediction)
    



In [None]:
# Display the list of per-row weight predictions.
weight_predictions

In [None]:
from collections import defaultdict, namedtuple
import datetime as dt
import json
import os
import time
import numpy as np
import pandas as pd
from research.utils.data_access_utils import S3AccessUtils, RDSAccessUtils
from research.utils.datetime_utils import add_days, get_dates_in_range
from research.weight_estimation.population_metrics import PopulationMetricsEstimator

import warnings
warnings.filterwarnings("ignore")

# Shared S3 / data-warehouse clients and output locations used by the functions below.
# The credential files are read through context managers so their handles are closed.
with open(os.environ['AWS_CREDENTIALS']) as _aws_credentials_file:
    S3 = S3AccessUtils('/root/data', json.load(_aws_credentials_file))
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as _sql_credentials_file:
    RDS = RDSAccessUtils(json.load(_sql_credentials_file))
OUTPUT_DIR = '/root/data/recommendations'
UPLOAD_BUCKET = 'aquabyte-images-adhoc'
UPLOAD_KEY_BASE = 'alok/filter_recommendations'


class NoDataException(Exception):
    """Raised when a filter or query leaves no biomass data to work with."""


# Immutable record describing one candidate sampling filter: an hour window plus
# k-factor and AKPD-score lower bounds.
SamplingFilter = namedtuple(
    'SamplingFilter',
    ['start_hour', 'end_hour', 'kf_cutoff', 'akpd_score_cutoff'],
)


def generate_filter_mask(df, sampling_filter):
    """Build the boolean row mask selecting biomass computations that pass a sampling filter.

    A row passes when its hour lies in the filter's hour window and both its
    estimated k-factor and AKPD score are at least the filter's cutoffs. When
    ``start_hour < end_hour`` the window is the inclusive range between them;
    otherwise (including equal hours) a row passes if it is at or after the
    start hour OR at or before the end hour.
    """
    start, end = sampling_filter.start_hour, sampling_filter.end_hour
    at_or_after_start = df.hour >= start
    at_or_before_end = df.hour <= end

    if start < end:
        in_window = at_or_after_start & at_or_before_end
    else:
        in_window = at_or_after_start | at_or_before_end

    passes_kf = df.estimated_k_factor >= sampling_filter.kf_cutoff
    passes_akpd = df.akpd_score >= sampling_filter.akpd_score_cutoff
    return in_window & passes_kf & passes_akpd


def generate_pme(df, sampling_filter):
    """Create a population metrics estimator from the rows that pass a sampling filter.

    Args:
        - df: data-frame of raw biomass computations
        - sampling_filter: SamplingFilter instance representing filters to apply

    Returns:
        - PopulationMetricsEstimator instance built from (date, weight, k-factor) tuples

    Raises:
        - NoDataException: if no rows pass the filter
    """
    selected = df[generate_filter_mask(df, sampling_filter)]

    # one (date, estimated weight, estimated k-factor) tuple per surviving row
    biomass_computations = list(zip(
        selected.date.values,
        selected['estimated_weight_g'].values,
        selected.estimated_k_factor.values,
    ))

    if biomass_computations:
        return PopulationMetricsEstimator(biomass_computations)
    raise NoDataException('No data found for given filter!')


def _not_none_mean(x):
    """Returns mean of all non-None values in list."""
    return np.mean([i for i in x if i is not None])


def generate_metrics_for_pme(pme, dates):
    """Return the mean 'biomass_kpi' over ``dates`` for a PopulationMetricsEstimator.

    Smart metrics are generated for each date; dates whose metrics carry no
    'biomass_kpi' (or a None value) are left out of the average.
    """
    kpis = [pme.generate_smart_metrics_on_date(date).get('biomass_kpi') for date in dates]
    return _not_none_mean(kpis)


def find_optimal_filter(df, sampling_filters):
    """Finds optimal filter given data-frame of raw biomass computations and different sampling filters.

    Every filter is scored by the mean biomass KPI over all dates between the
    earliest and latest date in ``df``; filters that leave no data get a KPI of
    None and sort last.

    Args:
        - df: DataFrame of raw biomass computations from data warehouse
        - sampling_filters: list of SamplingFilter instances to iterate over
    Returns:
        - best_sampling_filter: SamplingFilter instance corresponding to the one that maximizes biomass KPI
          (its fields are converted to float)
    """
    # The evaluation dates depend only on df, so compute them once instead of
    # once per filter. They are materialised into a list so they can be reused
    # safely, and skipped for an empty frame (every filter then raises
    # NoDataException before the dates are needed).
    unique_dates = sorted(df.date.unique().tolist())
    dates = list(get_dates_in_range(unique_dates[0], unique_dates[-1])) if unique_dates else []

    analysis_data = defaultdict(list)
    for sampling_filter in sampling_filters:
        print('Start hour: {}, End hour: {}, KF cutoff: {}'.format(
            sampling_filter.start_hour, sampling_filter.end_hour, sampling_filter.kf_cutoff
        ))
        try:
            pme = generate_pme(df, sampling_filter)
            mean_kpi = generate_metrics_for_pme(pme, dates)
        except NoDataException as err:
            print(str(err))
            mean_kpi = None

        # add to data
        analysis_data['mean_kpi'].append(mean_kpi)
        analysis_data['start_hour'].append(sampling_filter.start_hour)
        analysis_data['end_hour'].append(sampling_filter.end_hour)
        analysis_data['kf_cutoff'].append(sampling_filter.kf_cutoff)
        analysis_data['akpd_score_cutoff'].append(sampling_filter.akpd_score_cutoff)

    analysis_df = pd.DataFrame(analysis_data)
    # highest KPI first; missing KPIs are placed at the end by sort_values
    best_row = analysis_df.sort_values('mean_kpi', ascending=False).iloc[0]

    best_sampling_filter = SamplingFilter(
        start_hour=float(best_row.start_hour),
        end_hour=float(best_row.end_hour),
        kf_cutoff=float(best_row.kf_cutoff),
        akpd_score_cutoff=float(best_row.akpd_score_cutoff)
    )
    return best_sampling_filter


def generate_sampling_filters(start_hours, end_hours, kf_cutoffs, akpd_score_cutoff=0.99):
    """Return one SamplingFilter per (start hour, end hour, k-factor cutoff) combination.

    Combinations are produced with start hour varying slowest and k-factor
    cutoff varying fastest; every filter shares the same ``akpd_score_cutoff``.
    """
    return [
        SamplingFilter(
            start_hour=start_hour,
            end_hour=end_hour,
            kf_cutoff=kf_cutoff,
            akpd_score_cutoff=akpd_score_cutoff,
        )
        for start_hour in start_hours
        for end_hour in end_hours
        for kf_cutoff in kf_cutoffs
    ]


def perform_coarse_grid_search(df, max_kf=1.3, akpd_score_cutoff=0.99):
    """Perform a coarse but broad grid search to determine best sampling filter.

    The grid covers every whole hour observed in ``df`` (as both start and end
    hour) and k-factor cutoffs in steps of 0.05 from the smallest observed
    k-factor (rounded down to the 0.05 grid) up to, but excluding, ``max_kf``.

    Args:
        - df: DataFrame of raw biomass computations from data-warehouse
        - max_kf: Maximum k-factor value to go up to during grid search
        - akpd_score_cutoff: AKPD score cutoff applied by every candidate filter
    Returns:
        - best_coarse_filter: SamplingFilter instance corresponding to best coarse-search filter
    """
    min_hour, max_hour = int(df.hour.min()), int(df.hour.max())
    # np.arange excludes its stop value; add one so the latest observed hour is
    # part of the grid (and the grid is not empty when all data shares one hour)
    hours = np.arange(min_hour, max_hour + 1, 1)
    min_kf_cutoff = .05 * int(df.estimated_k_factor.min() / .05)
    kf_cutoffs = np.arange(min_kf_cutoff, max_kf, 0.05)
    sampling_filters = generate_sampling_filters(hours, hours, kf_cutoffs, akpd_score_cutoff=akpd_score_cutoff)
    best_coarse_filter = find_optimal_filter(df, sampling_filters)
    return best_coarse_filter


def perform_fine_grid_search(df, best_coarse_filter):
    """Perform a fine, local grid search around provided sampling filter to determine best sampling filter.

    Start and end hours are searched in steps of 1 over the half-open range
    [hour - 1, hour + 1) clipped to [0, 24], and k-factor cutoffs in steps of
    0.005 over [kf - 0.1, kf + 0.1). The AKPD score cutoff of the coarse filter
    is carried over unchanged.

    Args:
        - df: DataFrame of raw biomass computations from data-warehouse
        - best_coarse_filter: SamplingFilter instance around which the local grid is built
    Returns:
        - best_fine_filter: SamplingFilter instance corresponding to best fine-search filter
    """
    lo_start_hr, hi_start_hr = max(best_coarse_filter.start_hour - 1, 0), min(best_coarse_filter.start_hour + 1, 24)
    lo_end_hr, hi_end_hr = max(best_coarse_filter.end_hour - 1, 0), min(best_coarse_filter.end_hour + 1, 24)
    lo_kf, hi_kf = best_coarse_filter.kf_cutoff - 0.1, best_coarse_filter.kf_cutoff + 0.1
    start_hours = np.arange(lo_start_hr, hi_start_hr, 1)
    end_hours = np.arange(lo_end_hr, hi_end_hr, 1)
    kf_cutoffs = np.arange(lo_kf, hi_kf, 0.005)
    # reuse the coarse filter's AKPD cutoff rather than silently falling back to
    # generate_sampling_filters' default
    sampling_filters = generate_sampling_filters(
        start_hours,
        end_hours,
        kf_cutoffs,
        akpd_score_cutoff=best_coarse_filter.akpd_score_cutoff,
    )
    best_fine_filter = find_optimal_filter(df, sampling_filters)
    return best_fine_filter


def generate_global_optimum_filter(pen_id, start_date, end_date, akpd_score_cutoff=0.99):
    """Determine the best sampling filter for a pen over a date range.

    Biomass data is extracted once, a coarse grid search picks a starting
    filter, and a fine grid search around it produces the final answer.

    Args:
        - pen_id: pen whose biomass computations are extracted
        - start_date: start of the extraction window
        - end_date: end of the extraction window
        - akpd_score_cutoff: AKPD score cutoff passed to the data extraction
    Returns:
        - SamplingFilter instance returned by the fine grid search
    """
    df = extract_biomass_data(pen_id, start_date, end_date, akpd_score_cutoff)

    print('Performing coarse grid search...')
    coarse = perform_coarse_grid_search(df)
    coarse_summary = (
        'Coarse grid search complete with best start hour of {}, '
        'best end hour of {}, best kf cutoff of {}'
    ).format(coarse.start_hour, coarse.end_hour, coarse.kf_cutoff)
    print(coarse_summary)

    print('Perform fine grid search...')
    return perform_fine_grid_search(df, coarse)


def get_active_pen_ids():
    """Return the sorted IDs of all rows in customer.pens with is_active=TRUE."""
    active_pens = RDS.extract_from_database('SELECT id FROM customer.pens WHERE is_active=TRUE;')
    return sorted(active_pens.id.values.tolist())


def get_historical_date_range(pen_id, curr_date, lookback_days):
    """Return the (start, end) dates of the lookback window ending at ``curr_date``.

    The window starts ``lookback_days`` before ``curr_date``. The pen must have
    at least one date in that window with biomass computations at an AKPD score
    of 0.99 or above.

    Raises:
        NoDataException: if the query returns no rows for the window.
    """

    # TODO @alok: weight KPI for each date by sample size

    window_end = curr_date
    window_start = add_days(curr_date, -lookback_days)
    query = """
        SELECT *
        FROM
            (SELECT CAST(captured_at as DATE) as date, COUNT(estimated_weight_g)
            FROM prod.biomass_computations
            WHERE pen_id={}
            AND akpd_score >= 0.99
            GROUP BY date
            ORDER BY date DESC) AS COUNT_BY_DATE
        WHERE date >= '{}'
        AND date <= '{}';
    """.format(pen_id, window_start, window_end)
    print(query)
    counts_by_date = RDS.extract_from_database(query)
    if counts_by_date.shape[0] == 0:
        raise NoDataException('No data present was found in last two weeks for this pen!')
    return window_start, window_end


def _add_date_hour_columns(df):
    """Adds date and hour columns to DataFrame of biomass computations"""
    df.index = list(range(df.shape[0]))
    df = df.sort_values('captured_at').copy(deep=True)
    df.index = pd.to_datetime(df.captured_at)
    dates = df.index.date.astype(str)
    df['date'] = dates
    df['hour'] = df.index.hour
    return df


def extract_biomass_data(pen_id, start_date, end_date, akpd_score_cutoff):
    """Fetch raw biomass computations for a pen, date range, and minimum AKPD score.

    Only rows with a positive estimated weight are requested. Repeated column
    names are collapsed and ``date`` / ``hour`` columns are added before the
    frame is returned.
    """
    query_template = """
        SELECT * FROM
        prod.biomass_computations bc
        WHERE bc.pen_id={}
        AND bc.akpd_score >= {}
        AND bc.captured_at BETWEEN '{}' and '{}'
        AND bc.estimated_weight_g > 0.0
    """
    raw = RDS.extract_from_database(
        query_template.format(pen_id, akpd_score_cutoff, start_date, end_date)
    )
    # drop columns whose name repeats an earlier column's name
    deduplicated = raw.loc[:, ~raw.columns.duplicated()]
    return _add_date_hour_columns(deduplicated)


def main(pen_ids=None, lookback_days=14):
    """Continuously generate and upload daily sampling-filter recommendations.

    Starting from yesterday, each not-yet-processed date is handled in turn:
    the best filter is searched for every pen, the results are written to
    ``OUTPUT_DIR/recommendations_<date>.json`` and uploaded to
    ``UPLOAD_BUCKET``. Once caught up with today, the loop sleeps an hour at a
    time. This function never returns.

    Debug leftovers from the previous version were removed: the hard-coded
    ``curr_date = '2020-07-12'`` override and a bare ``raise`` (which threw
    ``RuntimeError`` after the first pen and made the upload unreachable).

    Args:
        pen_ids: iterable of pen IDs to optimise. Defaults to ``[86]``, the pen
            this script was pinned to; pass ``get_active_pen_ids()`` to cover
            every active pen.
        lookback_days: number of days of history used for each optimisation.
    """
    if pen_ids is None:
        pen_ids = [86]

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    curr_date = add_days(dt.datetime.strftime(dt.datetime.now(), '%Y-%m-%d'), -1)
    while True:
        today = dt.datetime.strftime(dt.datetime.now(), '%Y-%m-%d')
        # dates are 'YYYY-MM-DD' strings, so string comparison orders them correctly
        if curr_date >= today:
            print('Now sleeping for one hour...')
            time.sleep(3600)
            continue

        f = os.path.join(OUTPUT_DIR, 'recommendations_{}.json'.format(curr_date))
        recommendations = {}
        for pen_id in pen_ids:
            print('Optimizing filters for Pen ID: {}'.format(pen_id))

            # get date range corresponding to the lookback window
            try:
                start_date, end_date = get_historical_date_range(pen_id, curr_date, lookback_days)
            except NoDataException as err:
                # pens without recent data are skipped rather than aborting the run
                print(str(err))
                continue

            # get best overall start hour, end hour, and k-factor cutoff
            best_global_filter = generate_global_optimum_filter(pen_id, start_date, end_date)
            recommendations[pen_id] = dict(
                best_start_hr=best_global_filter.start_hour,
                best_end_hr=best_global_filter.end_hour,
                best_kf_cutoff=best_global_filter.kf_cutoff
            )

            print(f'Best Start Hour: {best_global_filter.start_hour}')
            print(f'Best End Hour: {best_global_filter.end_hour}')
            print(f'Best KF Cutoff: {best_global_filter.kf_cutoff}')

        # write once per date (even if empty) so the upload always has a file,
        # and close the handle before uploading
        with open(f, 'w') as output_file:
            json.dump(recommendations, output_file)

        # upload to s3
        print('Uploading recommendations for {} to S3...'.format(curr_date))
        key = os.path.join(UPLOAD_KEY_BASE, os.path.basename(f))
        S3.s3_client.upload_file(f, UPLOAD_BUCKET, key)
        curr_date = add_days(curr_date, 1)



In [None]:
# Filter for the manual check in the following cells: start hour 0, end hour 24,
# k-factor cutoff 1.17, AKPD score cutoff 0.99.
sampling_filter = SamplingFilter(start_hour=0, end_hour=24, kf_cutoff=1.17, akpd_score_cutoff=0.99)

In [None]:
# Raw biomass computations for pen 86 between 2020-07-01 and 2020-07-15 with an
# AKPD score cutoff of 0.99.
df = extract_biomass_data(86, '2020-07-01', '2020-07-15', 0.99)

In [None]:
# Population metrics estimator built from the rows of df that pass sampling_filter.
pme = generate_pme(df, sampling_filter)

In [None]:
# Smart metrics for 2020-07-12, requested with bucket_size=1000.
sm = pme.generate_smart_metrics_on_date('2020-07-12', bucket_size=1000)

In [None]:
# Display the smart metrics computed above.
sm

In [None]:
# Collect the count and average k-factor of every smart-distribution bucket,
# keyed by a '<bucket>-<bucket + 1>' label, and total the counts.
w_dist = {}
kf_breakdown = {}
count = 0
for bucket, bucket_stats in sm['smart_distribution'].items():
    key = '{}-{}'.format(str(bucket), str(float(bucket)+1))
    bucket_count = bucket_stats['count']
    w_dist[key] = bucket_count
    kf_breakdown[key] = bucket_stats['avgKFactor']
    count += bucket_count

In [None]:
# Convert the per-bucket counts into percentages of the total count.
w_dist = {label: 100 * float(bucket_count) / count for label, bucket_count in w_dist.items()}

In [None]:
# Display the per-bucket percentage distribution.
w_dist


In [None]:
# Display the average k-factor recorded for each bucket.
kf_breakdown