In [None]:
import pandas as pd
from research.utils.data_access_utils import RDSAccessUtils



In [None]:
from collections import defaultdict, namedtuple
from typing import List
import datetime as dt
import json
import os
import time
import numpy as np
import pandas as pd
from research.utils.data_access_utils import S3AccessUtils, RDSAccessUtils
from research.utils.datetime_utils import add_days, get_dates_in_range
from population_metrics.population_metrics_base import generate_pm_base, PopulationMetricsBase
from population_metrics.confidence_metrics import compute_biomass_kpi, generate_distribution_consistency
from population_metrics.raw_metrics import get_raw_sample_size


import warnings
warnings.filterwarnings("ignore")

def _load_json_from_env_path(env_var):
    """Return the parsed JSON file whose path is stored in environment variable ``env_var``.

    The file is opened in a ``with`` block so the handle is closed as soon as parsing finishes.
    Raises KeyError if the environment variable is not set.
    """
    with open(os.environ[env_var]) as credentials_file:
        return json.load(credentials_file)


# Data-access clients, configured from credential files referenced by environment variables.
S3 = S3AccessUtils('/root/data', _load_json_from_env_path('AWS_CREDENTIALS'))
RDS = RDSAccessUtils(_load_json_from_env_path('DATA_WAREHOUSE_SQL_CREDENTIALS'))

# Local directory for recommendation output, and the S3 bucket / key prefix for uploads.
OUTPUT_DIR = '/root/data/recommendations'
UPLOAD_BUCKET = 'aquabyte-images-adhoc'
UPLOAD_KEY_BASE = 'alok/filter_recommendations'


class NoDataException(Exception):
    """Signals that a database query for a pen's recent biomass data returned no rows."""


# Immutable parameter set for one candidate sampling filter: an hour window (start_hour, end_hour)
# plus minimum k-factor and minimum AKPD score cutoffs.
SamplingFilter = namedtuple(
    'SamplingFilter',
    ['start_hour', 'end_hour', 'kf_cutoff', 'akpd_score_cutoff'],
)


def generate_filter_mask(df: pd.DataFrame, sampling_filter: SamplingFilter):
    """Build the boolean row mask selecting the biomass computations that pass a sampling filter.

    Args:
        df: data-frame of raw biomass computations. The columns read here are ``hour``,
            ``estimated_k_factor`` and ``akpd_score``.
        sampling_filter: SamplingFilter instance holding the hour window and cutoffs to apply.
    Returns:
        mask: boolean Pandas series, True for rows that fall inside the sampling filter.
    """
    first_hour, last_hour = sampling_filter.start_hour, sampling_filter.end_hour

    # both hour bounds are inclusive
    at_or_after_start = df.hour >= first_hour
    at_or_before_end = df.hour <= last_hour
    if first_hour < last_hour:
        in_hour_window = at_or_after_start & at_or_before_end
    else:
        # window wraps past midnight (or start == end): accept either side of the wrap
        in_hour_window = at_or_after_start | at_or_before_end

    passes_kf = df.estimated_k_factor >= sampling_filter.kf_cutoff
    passes_akpd = df.akpd_score >= sampling_filter.akpd_score_cutoff
    return in_hour_window & passes_kf & passes_akpd


def gen_pm_base(df: pd.DataFrame, sampling_filter: SamplingFilter) -> PopulationMetricsBase:
    """Build a PopulationMetricsBase from the rows of ``df`` that pass ``sampling_filter``.

    Each selected row contributes a (date, estimated_weight_g, estimated_k_factor) tuple.
    Returns None when no row passes the filter.
    """
    selected = generate_filter_mask(df, sampling_filter)

    # one (date, weight, k-factor) tuple per surviving biomass computation
    biomass_computations = list(zip(df[selected].date.values,
                                    df.loc[selected, 'estimated_weight_g'].values,
                                    df[selected].estimated_k_factor.values))

    if not biomass_computations:
        return None
    return generate_pm_base(biomass_computations)


def generate_metrics_for_pm_base(pm_base: PopulationMetricsBase, dates: List[str]) -> float:
    """Compute the sample-size weighted mean biomass KPI over the given dates.

    Args:
        pm_base: PopulationMetricsBase instance to evaluate.
        dates: dates for which the raw sample size and biomass KPI are looked up.
    Returns:
        Mean of the per-date KPIs weighted by per-date raw sample size. Only dates that have
        both a KPI and a sample size take part, in the numerator AND the denominator. NaN is
        returned when no date contributes any weight.
    """

    kpis, sample_sizes = [], []
    for date in dates:
        sample_sizes.append(get_raw_sample_size(pm_base, date))
        kpis.append(compute_biomass_kpi(pm_base, date))

    # A missing KPI becomes NaN; a missing sample size simply carries no weight.
    kpi_values = np.array([np.nan if k is None else k for k in kpis], dtype=float)
    weights = np.array([0.0 if s is None else s for s in sample_sizes], dtype=float)

    # Restrict BOTH sums to the same dates. Summing every sample size in the denominator while
    # skipping missing KPIs in the numerator would bias the weighted mean downwards.
    usable = ~np.isnan(kpi_values) & ~np.isnan(weights)
    total_weight = weights[usable].sum()
    if total_weight == 0:
        return np.nan
    return np.sum(kpi_values[usable] * weights[usable]) / total_weight


def find_optimal_filter(df: pd.DataFrame, sampling_filters: List[SamplingFilter]) -> SamplingFilter:
    """Evaluate every candidate sampling filter on ``df`` and return the one with the highest mean KPI.

    Args:
        df: data-frame of raw biomass computations (must carry a ``date`` column).
        sampling_filters: candidate SamplingFilter instances to score.
    Returns:
        SamplingFilter whose mean KPI is highest; all of its fields are converted to float.
        Filters that select no data get a mean KPI of None and sort last.
    Raises:
        ValueError: if ``sampling_filters`` is empty.
    """
    if not sampling_filters:
        raise ValueError('At least one sampling filter is required to find an optimal filter.')

    # The date range depends only on df, so it is computed once (on first use) rather than
    # once per filter. It stays lazy so that a df in which no filter selects data is never indexed.
    dates = None

    analysis_data = defaultdict(list)
    for sampling_filter in sampling_filters:
        print('Start hour: {}, End hour: {}, KF cutoff: {}'.format(
            sampling_filter.start_hour, sampling_filter.end_hour, sampling_filter.kf_cutoff
        ))
        pm_base = gen_pm_base(df, sampling_filter)

        if pm_base:
            if dates is None:
                unique_dates = sorted(df.date.unique().tolist())
                dates = get_dates_in_range(unique_dates[0], unique_dates[-1])
            mean_kpi = generate_metrics_for_pm_base(pm_base, dates)
        else:
            mean_kpi = None

        # add to data
        analysis_data['mean_kpi'].append(mean_kpi)
        analysis_data['start_hour'].append(sampling_filter.start_hour)
        analysis_data['end_hour'].append(sampling_filter.end_hour)
        analysis_data['kf_cutoff'].append(sampling_filter.kf_cutoff)
        analysis_data['akpd_score_cutoff'].append(sampling_filter.akpd_score_cutoff)

    # highest mean KPI first; missing values are placed last by sort_values
    analysis_df = pd.DataFrame(analysis_data)
    best_sampling_filter_params = analysis_df.sort_values('mean_kpi', ascending=False).iloc[0]

    best_sampling_filter = SamplingFilter(
        start_hour=float(best_sampling_filter_params.start_hour),
        end_hour=float(best_sampling_filter_params.end_hour),
        kf_cutoff=float(best_sampling_filter_params.kf_cutoff),
        akpd_score_cutoff=float(best_sampling_filter_params.akpd_score_cutoff)
    )
    return best_sampling_filter


def generate_sampling_filters(start_hours: List[int], end_hours: List[int],
                              kf_cutoffs: List[float], akpd_score_cutoff: float = 0.99) -> List[SamplingFilter]:
    """Return one SamplingFilter per (start hour, end hour, k-factor cutoff) combination.

    Combinations are produced with start hours varying slowest and k-factor cutoffs varying
    fastest; every filter shares the same ``akpd_score_cutoff``.
    """
    return [
        SamplingFilter(
            start_hour=first_hour,
            end_hour=last_hour,
            kf_cutoff=min_kf,
            akpd_score_cutoff=akpd_score_cutoff
        )
        for first_hour in start_hours
        for last_hour in end_hours
        for min_kf in kf_cutoffs
    ]


def perform_coarse_grid_search(df: pd.DataFrame, max_kf: float = 1.5) -> SamplingFilter:
    """Run a coarse, broad grid search over k-factor cutoffs and return the best sampling filter.

    The hour window is fixed to 0-24; only the k-factor cutoff is varied, in steps of 0.05.
    Args:
        - df: DataFrame of raw biomass computations from data-warehouse
        - max_kf: Upper end (exclusive) of the k-factor cutoffs to try
    Returns:
        - best_coarse_filter: SamplingFilter instance corresponding to best coarse-search filter
    """
    kf_step = 0.05
    # start from the smallest observed k-factor, truncated onto the step grid
    smallest_kf = df.estimated_k_factor.min()
    grid_floor = kf_step * int(smallest_kf / kf_step)
    kf_grid = list(np.arange(grid_floor, max_kf, kf_step))

    candidates = generate_sampling_filters([0], [24], kf_grid)
    return find_optimal_filter(df, candidates)


def perform_fine_grid_search(df: pd.DataFrame, best_coarse_filter: SamplingFilter) -> SamplingFilter:
    """Perform a fine, local grid search around provided sampling filter to determine best sampling filter.

    The search window is the coarse filter's start/end hour +/- 1 hour (clamped to [0, 24]) and
    its k-factor cutoff +/- 0.1 in steps of 0.005. Both ends of every window are searched.
    Args:
        - df: DataFrame of raw biomass computations from data-warehouse
        - best_coarse_filter: SamplingFilter instance to centre the local search on
    Returns:
        - best_fine_filter: SamplingFilter instance corresponding to best fine-search filter
    """
    hour_step, kf_step = 1, 0.005

    lo_start_hr, hi_start_hr = max(best_coarse_filter.start_hour - 1, 0), min(best_coarse_filter.start_hour + 1, 24)
    lo_end_hr, hi_end_hr = max(best_coarse_filter.end_hour - 1, 0), min(best_coarse_filter.end_hour + 1, 24)
    lo_kf, hi_kf = best_coarse_filter.kf_cutoff - 0.1, best_coarse_filter.kf_cutoff + 0.1

    # np.arange excludes its stop value, so the stop is pushed past the upper bound; otherwise
    # the +1 hour neighbour (and the upper k-factor bound) would never be evaluated and the
    # "local" search would be one-sided.
    start_hours = list(np.arange(lo_start_hr, hi_start_hr + hour_step, hour_step))
    end_hours = list(np.arange(lo_end_hr, hi_end_hr + hour_step, hour_step))
    # half a step of slack keeps the endpoint in deterministically despite float rounding
    kf_cutoffs = list(np.arange(lo_kf, hi_kf + kf_step / 2, kf_step))

    sampling_filters = generate_sampling_filters(start_hours, end_hours, kf_cutoffs)
    best_fine_filter = find_optimal_filter(df, sampling_filters)
    return best_fine_filter


def generate_global_optimum_filter(df: pd.DataFrame):
    """Find the best sampling filter for ``df`` with a coarse grid search followed by a fine one.

    Args:
        df: data-frame of raw biomass computations to optimize the filter on.
    Returns:
        SamplingFilter produced by the fine grid search around the best coarse filter.
    """
    # stage 1: broad search to locate the right neighbourhood
    print('Performing coarse grid search...')
    best_coarse_filter = perform_coarse_grid_search(df)
    print(f'Coarse grid search complete with best start hour of {best_coarse_filter.start_hour}, '
          f'best end hour of {best_coarse_filter.end_hour}, best kf cutoff of {best_coarse_filter.kf_cutoff}')

    # stage 2: refine locally around the coarse optimum
    print('Perform fine grid search...')
    return perform_fine_grid_search(df, best_coarse_filter)


def get_active_pen_ids():
    """Return the sorted list of IDs of all rows in customer.pens flagged as active."""

    query = 'SELECT id FROM customer.pens WHERE is_active=TRUE;'
    active_pens = RDS.extract_from_database(query)
    return sorted(active_pens.id.values.tolist())


def get_historical_date_range(pen_id, curr_date, lookback_days):
    """Return the date window covering the last ``lookback_days`` days, provided the pen has data in it.

    Args:
        pen_id: ID of the pen; interpolated into the SQL query.
        curr_date: last date of the window (callers in this file pass 'YYYY-MM-DD' strings).
        lookback_days: number of days to look back from ``curr_date``.
    Returns:
        (window_start, curr_date) where window_start is ``add_days(curr_date, -lookback_days)``.
    Raises:
        NoDataException: if the pen has no biomass computations with akpd_score >= 0.99
            inside the window.
    """

    # TODO @alok: weigh KPI for each date by sample size

    window_start, window_end = add_days(curr_date, -lookback_days), curr_date
    query = """
        SELECT *
        FROM
            (SELECT CAST(captured_at as DATE) as date, COUNT(estimated_weight_g)
            FROM prod.biomass_computations
            WHERE pen_id={}
            AND akpd_score >= 0.99
            GROUP BY date
            ORDER BY date DESC) AS COUNT_BY_DATE
        WHERE date >= '{}'
        AND date <= '{}';
    """.format(pen_id, window_start, window_end)
    print(query)
    tdf = RDS.extract_from_database(query)
    if not tdf.shape[0]:
        # report the actual window instead of a hard-coded "two weeks"
        raise NoDataException(
            'No data was found for pen {} between {} and {} (lookback of {} days)!'.format(
                pen_id, window_start, window_end, lookback_days
            )
        )
    return window_start, window_end


def _add_date_hour_columns(df):
    """Adds date and hour columns to DataFrame of biomass computations"""
    df.index = list(range(df.shape[0]))
    df = df.sort_values('captured_at').copy(deep=True)
    df.index = pd.to_datetime(df.captured_at)
    dates = df.index.date.astype(str)
    df['date'] = dates
    df['hour'] = df.index.hour
    return df


def extract_biomass_data(pen_id, start_date, end_date, akpd_score_cutoff):
    """Fetch raw biomass computations for a pen and prepare them for filtering.

    Rows are restricted to the given pen, to ``akpd_score >= akpd_score_cutoff``, to
    ``captured_at`` between ``start_date`` and ``end_date``, and to positive estimated weights.
    Duplicated column names are dropped (first occurrence kept) and ``date`` / ``hour`` columns
    are added.
    NOTE(review): if end_date is a bare date string, the BETWEEN upper bound is presumably
    midnight at the start of that day — confirm whether the last day should be included.
    """

    query = """
        SELECT * FROM
        prod.biomass_computations bc
        WHERE bc.pen_id={}
        AND bc.akpd_score >= {}
        AND bc.captured_at BETWEEN '{}' and '{}'
        AND bc.estimated_weight_g > 0.0
    """.format(pen_id, akpd_score_cutoff, start_date, end_date)

    raw = RDS.extract_from_database(query)
    is_repeated_column = raw.columns.duplicated()
    unique_columns = raw.loc[:, ~is_repeated_column]
    return _add_date_hour_columns(unique_columns)




In [None]:
# Daily filter-recommendation loop: for each date from curr_date onwards, and for each pen,
# optimize the sampling filter on the preceding 14 days of data and record the result.

# Make sure the local output directory exists.
if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

# Pens to optimize (hard-coded to a single pen here).
pen_ids = [56]

# Maps date string -> {pen_id: dict of best start hour / end hour / k-factor cutoff}.
recommendations_by_date = {}

#     curr_date = dt.datetime.strftime(dt.datetime.utcnow(), '%Y-%m-%d')
curr_date = '2020-03-01'
# NOTE: this loop has no exit condition. Once curr_date reaches tomorrow's UTC date it only
# sleeps an hour at a time, so the cell must be interrupted manually.
while True:
    # Per-date output path; unused while the json.dump below stays commented out.
    f = os.path.join(OUTPUT_DIR, 'recommendations_{}.json'.format(curr_date))
    # Recomputed on every pass because the wall-clock date can change while the loop runs.
    tomorrow = add_days(dt.datetime.strftime(dt.datetime.utcnow(), '%Y-%m-%d'), 1)
    # String comparison; presumably both values are 'YYYY-MM-DD' so it orders chronologically
    # (assumes add_days returns that format — TODO confirm).
    if curr_date < tomorrow:
        recommendations = {}
        for pen_id in pen_ids:

            print('Optimizing filters for Pen ID: {}'.format(pen_id))

            # get date range corresponding to last two weeks
            try:
                start_date, end_date = get_historical_date_range(pen_id, curr_date, 14)
            except NoDataException as err:
                # no data for this pen in the window: report it and move on to the next pen
                print(str(err))
                continue

            # get best overall start hour, end hour, and k-factor cutoff
            df = extract_biomass_data(pen_id, start_date, end_date, 0.99)
            best_global_filter = generate_global_optimum_filter(df)
            recommendations[pen_id] = dict(
                best_start_hr=best_global_filter.start_hour,
                best_end_hr=best_global_filter.end_hour,
                best_kf_cutoff=best_global_filter.kf_cutoff
            )

            print(f'Best Start Hour: {best_global_filter.start_hour}')
            print(f'Best End Hour: {best_global_filter.end_hour}')
            print(f'Best KF Cutoff: {best_global_filter.kf_cutoff}')

            # Stored only after a pen succeeds; a date where every pen lacks data gets no entry.
            recommendations_by_date[curr_date] = recommendations
#                 json.dump(recommendations, open(f, 'w'))

        # advance to the next date
        curr_date = add_days(curr_date, 1)
    else:
        print('Now sleeping for one hour...')
        time.sleep(3600)


In [None]:
from population_metrics.population_metrics_base import generate_pm_base
from population_metrics.growth_rate import compute_local_growth_rate
from population_metrics.raw_metrics import generate_raw_average_weight, generate_raw_average_kf, get_raw_sample_size
from population_metrics.smart_metrics import generate_smart_avg_weight, generate_smart_standard_deviation, \
     generate_smart_distribution, generate_smart_avg_kf, get_smart_sample_size, \
     get_smart_growth_rate
from population_metrics.confidence_metrics import generate_distribution_consistency, generate_trend_stability, \
     compute_biomass_kpi
from research.utils.datetime_utils import add_days, get_dates_in_range

In [None]:
from research.utils.datetime_utils import add_days

# Compute raw and smart population metrics per date for pen 56 using a fixed filter:
# hours 7-15, no effective k-factor cutoff (0.0), AKPD score >= 0.99.
# Results are collected in results_by_date_fh.

pen_id = 56
start_date = '2020-03-01'
end_date = '2020-09-25'
# Data is extracted for the date range padded by 7 days on both sides; metrics are computed
# only for the unpadded range.
dates_to_include = get_dates_in_range(add_days(start_date, -7), add_days(end_date, 7))
dates_to_compute = get_dates_in_range(start_date, end_date)

print('Extracting Data...')
df = extract_biomass_data(pen_id, dates_to_include[0], dates_to_include[-1], 0.99)
print('Data extracted!')

# Maps date -> {'raw_data': {...}, 'smart_data': {...}}.
results_by_date_fh = {}

for date in dates_to_compute:
    print('On date: {}'.format(date))

#     sampling_filter_dict = recommendations_by_date[date][pen_id]
    sampling_filter = SamplingFilter(
        start_hour=7,
        end_hour=15,
        kf_cutoff=0.0,
        akpd_score_cutoff=0.99
    )
    # NOTE: rebinds the notebook-level start_date / end_date to a +/- 4 day window around date.
    start_date, end_date = add_days(date, -4), add_days(date, 4)
    tdf = df[(df.date >= start_date) & (df.date <= end_date)].copy(deep=True)
    # gen_pm_base returns None when no row in the window passes the filter.
    # NOTE(review): the metric functions below are then called with None — confirm they handle it.
    pm_base = gen_pm_base(tdf, sampling_filter)

    smart_average_weight = generate_smart_avg_weight(pm_base, date)
    smart_standard_deviation = generate_smart_standard_deviation(pm_base, date)
    # Coefficient of variation is only defined when both inputs are present and non-zero.
    if smart_average_weight and smart_standard_deviation:
        coefficient_of_variation = smart_standard_deviation / smart_average_weight
    else:
        coefficient_of_variation = None

    raw_data = dict(
        avgWeight=generate_raw_average_weight(pm_base, date),
        avgKFactor=generate_raw_average_kf(pm_base, date),
        numFish=get_raw_sample_size(pm_base, date)
    )

    smart_data = dict(
        avgWeight=smart_average_weight,
        avgKFactor=generate_smart_avg_kf(pm_base, date),
        numFish=get_smart_sample_size(pm_base, date),
        weightDistribution=generate_smart_distribution(pm_base, date),
        standardDeviation=smart_standard_deviation,
        coefficientOfVariation=coefficient_of_variation,
        growthRate=compute_local_growth_rate(pm_base, date),
        growthRateForSmartAvg=get_smart_growth_rate(pm_base, date),
        distributionConsistency=generate_distribution_consistency(pm_base, date),
        trendStability=generate_trend_stability(pm_base, date),
        biomassKPI=compute_biomass_kpi(pm_base, date)
    )
    
    results_by_date_fh[date] = {}
    results_by_date_fh[date]['raw_data'] = raw_data
    results_by_date_fh[date]['smart_data'] = smart_data
    print(date, smart_data['avgWeight'])

In [None]:
from research.utils.datetime_utils import add_days

# Compute raw and smart population metrics per date for pen 56 using a fixed filter:
# hours 0-24 (all day), no effective k-factor cutoff (0.0), AKPD score >= 0.99.
# Results are collected in results_by_date_u.

pen_id = 56
start_date = '2020-03-01'
end_date = '2020-09-25'
# Data is extracted for the date range padded by 7 days on both sides; metrics are computed
# only for the unpadded range.
dates_to_include = get_dates_in_range(add_days(start_date, -7), add_days(end_date, 7))
dates_to_compute = get_dates_in_range(start_date, end_date)

print('Extracting Data...')
df = extract_biomass_data(pen_id, dates_to_include[0], dates_to_include[-1], 0.99)
print('Data extracted!')

# Maps date -> {'raw_data': {...}, 'smart_data': {...}}.
results_by_date_u = {}

for date in dates_to_compute:
    print('On date: {}'.format(date))

#     sampling_filter_dict = recommendations_by_date[date][pen_id]
    sampling_filter = SamplingFilter(
        start_hour=0,
        end_hour=24,
        kf_cutoff=0.0,
        akpd_score_cutoff=0.99
    )
    # NOTE: rebinds the notebook-level start_date / end_date to a +/- 4 day window around date.
    start_date, end_date = add_days(date, -4), add_days(date, 4)
    tdf = df[(df.date >= start_date) & (df.date <= end_date)].copy(deep=True)
    # gen_pm_base returns None when no row in the window passes the filter.
    # NOTE(review): the metric functions below are then called with None — confirm they handle it.
    pm_base = gen_pm_base(tdf, sampling_filter)

    smart_average_weight = generate_smart_avg_weight(pm_base, date)
    smart_standard_deviation = generate_smart_standard_deviation(pm_base, date)
    # Coefficient of variation is only defined when both inputs are present and non-zero.
    if smart_average_weight and smart_standard_deviation:
        coefficient_of_variation = smart_standard_deviation / smart_average_weight
    else:
        coefficient_of_variation = None

    raw_data = dict(
        avgWeight=generate_raw_average_weight(pm_base, date),
        avgKFactor=generate_raw_average_kf(pm_base, date),
        numFish=get_raw_sample_size(pm_base, date)
    )

    smart_data = dict(
        avgWeight=smart_average_weight,
        avgKFactor=generate_smart_avg_kf(pm_base, date),
        numFish=get_smart_sample_size(pm_base, date),
        weightDistribution=generate_smart_distribution(pm_base, date),
        standardDeviation=smart_standard_deviation,
        coefficientOfVariation=coefficient_of_variation,
        growthRate=compute_local_growth_rate(pm_base, date),
        growthRateForSmartAvg=get_smart_growth_rate(pm_base, date),
        distributionConsistency=generate_distribution_consistency(pm_base, date),
        trendStability=generate_trend_stability(pm_base, date),
        biomassKPI=compute_biomass_kpi(pm_base, date)
    )
    
    results_by_date_u[date] = {}
    results_by_date_u[date]['raw_data'] = raw_data
    results_by_date_u[date]['smart_data'] = smart_data
    print(date, smart_data['avgWeight'])

In [None]:
from research.utils.datetime_utils import add_days

# Compute raw and smart population metrics per date for pen 56 using a fixed filter:
# hours 0-24 (all day), k-factor cutoff 1.085, AKPD score >= 0.99.
# Results are collected in results_by_date_ws.

pen_id = 56
start_date = '2020-03-01'
end_date = '2020-09-25'
# Data is extracted for the date range padded by 7 days on both sides; metrics are computed
# only for the unpadded range.
dates_to_include = get_dates_in_range(add_days(start_date, -7), add_days(end_date, 7))
dates_to_compute = get_dates_in_range(start_date, end_date)

print('Extracting Data...')
df = extract_biomass_data(pen_id, dates_to_include[0], dates_to_include[-1], 0.99)
print('Data extracted!')

# Maps date -> {'raw_data': {...}, 'smart_data': {...}}.
results_by_date_ws = {}

for date in dates_to_compute:
    print('On date: {}'.format(date))

#     sampling_filter_dict = recommendations_by_date[date][pen_id]
    sampling_filter = SamplingFilter(
        start_hour=0,
        end_hour=24,
        kf_cutoff=1.085,
        akpd_score_cutoff=0.99
    )
    # NOTE: rebinds the notebook-level start_date / end_date to a +/- 4 day window around date.
    start_date, end_date = add_days(date, -4), add_days(date, 4)
    tdf = df[(df.date >= start_date) & (df.date <= end_date)].copy(deep=True)
    # gen_pm_base returns None when no row in the window passes the filter.
    # NOTE(review): the metric functions below are then called with None — confirm they handle it.
    pm_base = gen_pm_base(tdf, sampling_filter)

    smart_average_weight = generate_smart_avg_weight(pm_base, date)
    smart_standard_deviation = generate_smart_standard_deviation(pm_base, date)
    # Coefficient of variation is only defined when both inputs are present and non-zero.
    if smart_average_weight and smart_standard_deviation:
        coefficient_of_variation = smart_standard_deviation / smart_average_weight
    else:
        coefficient_of_variation = None

    raw_data = dict(
        avgWeight=generate_raw_average_weight(pm_base, date),
        avgKFactor=generate_raw_average_kf(pm_base, date),
        numFish=get_raw_sample_size(pm_base, date)
    )

    smart_data = dict(
        avgWeight=smart_average_weight,
        avgKFactor=generate_smart_avg_kf(pm_base, date),
        numFish=get_smart_sample_size(pm_base, date),
        weightDistribution=generate_smart_distribution(pm_base, date),
        standardDeviation=smart_standard_deviation,
        coefficientOfVariation=coefficient_of_variation,
        growthRate=compute_local_growth_rate(pm_base, date),
        growthRateForSmartAvg=get_smart_growth_rate(pm_base, date),
        distributionConsistency=generate_distribution_consistency(pm_base, date),
        trendStability=generate_trend_stability(pm_base, date),
        biomassKPI=compute_biomass_kpi(pm_base, date)
    )
    
    results_by_date_ws[date] = {}
    results_by_date_ws[date]['raw_data'] = raw_data
    results_by_date_ws[date]['smart_data'] = smart_data
    print(date, smart_data['avgWeight'])

In [None]:
# Display the SamplingFilter currently bound in the notebook (set by whichever cell ran last).
sampling_filter

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Plot smart average weight over time for the different filter configurations, together with
# the production-system average weights.
# NOTE(review): results_by_date is not defined in any cell shown above, and prod_sys_ws is
# built in a later cell — this cell presumably relies on prior kernel state / out-of-order
# execution; confirm before running top to bottom.
dates = sorted(list(results_by_date.keys()))
plt.figure(figsize=(20, 10))
plt.grid()
# One series per result set, each ordered by its own sorted date keys.
avg_weights = [results_by_date[date]['smart_data']['avgWeight'] for date in sorted(results_by_date.keys())]
avg_weights_fh = [results_by_date_fh[date]['smart_data']['avgWeight'] for date in sorted(results_by_date_fh.keys())]
avg_weights_u = [results_by_date_u[date]['smart_data']['avgWeight'] for date in sorted(results_by_date_u.keys())]
avg_weights_ws = [results_by_date_ws[date]['smart_data']['avgWeight'] for date in sorted(results_by_date_ws.keys())]
# All series are drawn against the dates of results_by_date, so the other result sets must
# cover exactly the same dates for the x/y lengths to match.
plt.plot(dates, avg_weights, color='blue')
# plt.plot(dates, avg_weights_fh, color='red')
# plt.plot(dates, avg_weights_u, color='green')
plt.plot(dates, prod_sys_ws, color='purple')
plt.plot(dates, avg_weights_ws, color='red')
plt.show()

In [None]:
# Compare the smart average weight on 2020-09-22 between results_by_date and the hours 7-15
# run (results_by_date_fh); the last value is the relative difference of a with respect to b.
# NOTE(review): results_by_date is not defined in any cell shown above.
a = results_by_date['2020-09-22']['smart_data']['avgWeight']
b = results_by_date_fh['2020-09-22']['smart_data']['avgWeight']
a, b, (a - b)/ b

In [None]:
# Load the production-system biomass data; later cells read its 'Date' and 'Avg weight' columns.
pdf = pd.read_csv('/root/data/alok/biomass_estimation/playground/biomass_prod_system_data.csv')

In [None]:
# Collect the production-system average weight for every date in `dates`, in order.
# Dates with no matching row in pdf get None so the list stays aligned with `dates`.
prod_sys_ws = []
for date in dates:
    # DataFrame.ix was removed in pandas 1.0; .loc is the label/boolean-mask equivalent.
    matching_weights = pdf.loc[pdf.Date == date, 'Avg weight']
    prod_sys_w = matching_weights.iloc[0] if not matching_weights.empty else None
    prod_sys_ws.append(prod_sys_w)