In [None]:
import json
import os
from collections import defaultdict

import numpy as np
import pandas as pd

from research.utils.data_access_utils import S3AccessUtils
from research.utils.data_generation_utils import extract_biomass_data
from research.utils.datetime_utils import get_dates_in_range
from research.utils.image_utils import Picture
from research.weight_estimation.population_metrics import PopulationMetricsEstimator

In [None]:
def generate_filter_mask(df, start_date, end_date, start_hour, end_hour, kf_cutoff):
    """Build a boolean row mask for the date, hour and K-factor filters.

    A row passes when all three conditions hold:

    * ``date`` lies in ``[start_date, end_date]`` (both ends inclusive),
    * ``hour`` lies inside the hour window (see below),
    * ``estimated_k_factor`` is at least ``kf_cutoff``.

    When ``start_hour < end_hour`` the window is
    ``start_hour <= hour <= end_hour``. Otherwise it is
    ``hour >= start_hour or hour <= end_hour``, i.e. it wraps around the end
    of the day; with ``start_hour == end_hour`` that condition holds for every
    hour value.

    Args:
        df: frame with ``date``, ``hour`` and ``estimated_k_factor`` columns.
        start_date: lower date bound (inclusive).
        end_date: upper date bound (inclusive).
        start_hour: first hour of the window.
        end_hour: last hour of the window.
        kf_cutoff: minimum ``estimated_k_factor`` to keep.

    Returns:
        Boolean mask with one entry per row of ``df``.
    """
    dates, hours = df.date, df.hour
    in_date_range = (dates >= start_date) & (dates <= end_date)

    at_or_after_start = hours >= start_hour
    at_or_before_end = hours <= end_hour
    if start_hour < end_hour:
        # window contained in a single day
        in_hour_window = at_or_after_start & at_or_before_end
    else:
        # window wraps around the end of the day
        in_hour_window = at_or_after_start | at_or_before_end

    meets_kf_cutoff = df.estimated_k_factor >= kf_cutoff
    return in_date_range & in_hour_window & meets_kf_cutoff


def generate_pme(df, start_date, end_date, start_hour, end_hour, kf_cutoff, akpd_score_cutoff=0.99):
    """Create a PopulationMetricsEstimator from the rows of ``df`` that pass the filter.

    The filter is the one produced by ``generate_filter_mask``. Each surviving
    row contributes a ``(date, estimated_weight_g, estimated_k_factor)`` tuple.

    Args:
        df: frame with ``date``, ``hour``, ``estimated_weight_g`` and
            ``estimated_k_factor`` columns.
        start_date, end_date: inclusive date bounds of the filter.
        start_hour, end_hour: hour window of the filter.
        kf_cutoff: minimum ``estimated_k_factor`` to keep.
        akpd_score_cutoff: accepted for interface compatibility; this function
            does not use it.

    Returns:
        A ``PopulationMetricsEstimator``, or ``None`` when no row passes the filter.
    """
    # apply the filter once and read every column from the filtered frame
    selected = df[generate_filter_mask(df, start_date, end_date, start_hour, end_hour, kf_cutoff)]

    biomass_computations = list(zip(selected.date.values,
                                    selected['estimated_weight_g'].values,
                                    selected.estimated_k_factor.values))

    if not biomass_computations:
        return None
    return PopulationMetricsEstimator(biomass_computations)

def not_none_mean(x):
    """Return the mean of the entries of ``x`` that are not ``None``.

    Args:
        x: iterable of numbers and/or ``None`` values.

    Returns:
        The arithmetic mean of the non-``None`` entries, or NaN when there are
        none (empty input or all ``None``).
    """
    present = [value for value in x if value is not None]
    if not present:
        # np.mean([]) also yields NaN but emits a "Mean of empty slice"
        # RuntimeWarning; return NaN directly so callers that hit this case
        # repeatedly (e.g. a grid search) are not flooded with warnings.
        return np.nan
    return np.mean(present)
    

def generate_metrics_for_filter(df, start_date, end_date, start_hour, end_hour, kf_cutoff):
    """Score one (hour window, K-factor cutoff) filter over a date range.

    Builds an estimator with ``generate_pme`` and asks it for smart metrics on
    every date returned by ``get_dates_in_range(start_date, end_date)``.

    Returns:
        Tuple ``(mean_dc, mean_kpi, final_smart_average)``:

        * ``mean_dc`` - mean of the non-``None`` ``distribution_consistency`` values,
        * ``mean_kpi`` - mean of the non-``None`` ``biomass_kpi`` values,
        * ``final_smart_average`` - ``smart_average_weight`` reported for the
          last date (``None`` if that key is absent for the last date).

        All three are ``None`` when no estimator could be built.
    """
    pme = generate_pme(df, start_date, end_date, start_hour, end_hour, kf_cutoff)
    dates = get_dates_in_range(start_date, end_date)
    if not pme:
        return None, None, None

    # one (kpi, distribution consistency, smart average) triple per date
    per_date = [
        (metrics.get('biomass_kpi'),
         metrics.get('distribution_consistency'),
         metrics.get('smart_average_weight'))
        for metrics in (pme.generate_smart_metrics_on_date(date) for date in dates)
    ]

    mean_kpi = not_none_mean([kpi for kpi, _, _ in per_date])
    mean_dc = not_none_mean([dc for _, dc, _ in per_date])
    final_smart_average = per_date[-1][2]
    return mean_dc, mean_kpi, final_smart_average


def generate_optimized_filters(df, start_date, end_date, start_hours, end_hours, kf_cutoffs):
    """Evaluate every (start_hour, end_hour, kf_cutoff) combination of the given grids.

    Each combination is scored with ``generate_metrics_for_filter``. The
    current ``start_hour`` and ``end_hour`` are printed once per hour window
    as progress output.

    Args:
        df: biomass data frame passed through to ``generate_metrics_for_filter``.
        start_date, end_date: date range passed through unchanged.
        start_hours: iterable of window start hours to try.
        end_hours: iterable of window end hours to try.
        kf_cutoffs: iterable of K-factor cutoffs to try.

    Returns:
        DataFrame with one row per combination and the columns ``mean_kpi``,
        ``mean_dc``, ``smart_avg``, ``start_hour``, ``end_hour``, ``kf_cutoff``.
    """
    columns = defaultdict(list)
    for start_hour in start_hours:
        for end_hour in end_hours:
            print(start_hour, end_hour)
            for kf_cutoff in kf_cutoffs:
                mean_dc, mean_kpi, final_smart_avg = generate_metrics_for_filter(
                    df, start_date, end_date, start_hour, end_hour, kf_cutoff)

                # record the scores together with the filter that produced them
                row = {
                    'mean_kpi': mean_kpi,
                    'mean_dc': mean_dc,
                    'smart_avg': final_smart_avg,
                    'start_hour': start_hour,
                    'end_hour': end_hour,
                    'kf_cutoff': kf_cutoff,
                }
                for column, value in row.items():
                    columns[column].append(value)

    return pd.DataFrame(columns)


def _select_best_filter(analysis_df):
    """Return ``(start_hour, end_hour, kf_cutoff)`` of the row with the highest ``mean_kpi``."""
    if analysis_df.empty or analysis_df['mean_kpi'].isna().all():
        raise ValueError('Grid search produced no filter with a valid mean_kpi')
    # NaN scores sort last by default, so the first row is the best valid one
    best_row = analysis_df.sort_values('mean_kpi', ascending=False).iloc[0]
    # Selecting a row can upcast the integer hour columns to float; cast back
    # so the hours stay integral for the next search and for the caller.
    return int(best_row.start_hour), int(best_row.end_hour), float(best_row.kf_cutoff)


def generate_global_optimum_filter(pen_id, start_date, end_date, akpd_score_cutoff=0.99):
    """Grid-search the hour window and K-factor cutoff that maximise the mean KPI.

    The search runs in two passes over ``generate_optimized_filters``:

    1. a coarse pass over every start/end hour in 0..23 and K-factor cutoffs
       in steps of 0.05 up to (but excluding) 1.3;
    2. a fine pass around the coarse optimum: hours ``best - 1 .. best + 1``
       (clamped to 0..23) and K-factor cutoffs in steps of 0.005.

    Args:
        pen_id: pen identifier passed to ``extract_biomass_data``.
        start_date: first date of the range.
        end_date: last date of the range.
        akpd_score_cutoff: passed to ``extract_biomass_data`` as ``min_akpd_score``.

    Returns:
        Tuple ``(best_start_hour, best_end_hour, best_kf_cutoff)``; the hours
        are ``int`` and the cutoff is ``float``.

    Raises:
        ValueError: if the extracted data has no usable ``estimated_k_factor``
            values, or if no filter in a grid yields a valid ``mean_kpi``.
    """
    last_hour = 23
    coarse_kf_step, fine_kf_step = 0.05, 0.005
    max_kf = 1.3

    # generate df
    print('Extract raw biomass data...')
    df = extract_biomass_data(pen_id, start_date, end_date, min_akpd_score=akpd_score_cutoff)
    print('Extraction Complete!')

    min_kf = df.estimated_k_factor.min()
    if pd.isna(min_kf):
        raise ValueError(f'No biomass data with a K-factor for pen {pen_id} '
                         f'between {start_date} and {end_date}')

    # perform coarse grid search
    print('Performing Coarse Grid Search...')
    start_hours = np.arange(0, last_hour + 1, 1)
    end_hours = np.arange(0, last_hour + 1, 1)
    # round the smallest observed K-factor towards zero onto the coarse grid
    min_kf_cutoff = coarse_kf_step * int(min_kf / coarse_kf_step)
    kf_cutoffs = np.arange(min_kf_cutoff, max_kf, coarse_kf_step)

    # get best values from coarse grid search
    analysis_df = generate_optimized_filters(df, start_date, end_date, start_hours, end_hours, kf_cutoffs)
    best_start_hour, best_end_hour, best_kf_cutoff = _select_best_filter(analysis_df)
    print(f'Coarse grid search complete with best start hour of {best_start_hour}, '
          f'best end hour of {best_end_hour}, best kf cutoff of {best_kf_cutoff}')

    # perform fine grid search in local neighborhood of best values above
    lo_start_hr, hi_start_hr = max(best_start_hour - 1, 0), min(best_start_hour + 1, last_hour)
    lo_end_hr, hi_end_hr = max(best_end_hour - 1, 0), min(best_end_hour + 1, last_hour)
    # NOTE(review): the K-factor neighbourhood is asymmetric (-0.1 / +0.01).
    # The +0.01 may be an epsilon to include the coarse optimum itself, or a
    # typo for +0.1 - confirm the intended range. Kept as is.
    lo_kf, hi_kf = best_kf_cutoff - 0.1, best_kf_cutoff + 0.01

    # np.arange excludes its stop value, so add 1 to keep the upper neighbour hour
    start_hours = np.arange(lo_start_hr, hi_start_hr + 1, 1)
    end_hours = np.arange(lo_end_hr, hi_end_hr + 1, 1)
    kf_cutoffs = np.arange(lo_kf, hi_kf, fine_kf_step)
    analysis_df = generate_optimized_filters(df, start_date, end_date, start_hours, end_hours, kf_cutoffs)
    return _select_best_filter(analysis_df)


def main(pen_id=1, start_date='2020-06-15', end_date='2020-06-29'):
    """Run the optimum-filter search for one pen and print the result.

    Args:
        pen_id: pen to analyse; defaults to pen 1.
        start_date: first date of the analysis range (``'YYYY-MM-DD'`` string).
        end_date: last date of the analysis range (``'YYYY-MM-DD'`` string).
    """
    best_start_hour, best_end_hour, best_kf_cutoff = generate_global_optimum_filter(pen_id, start_date, end_date)
    print(f'Best Start Hour: {best_start_hour}')
    print(f'Best End Hour: {best_end_hour}')
    print(f'Best KF Cutoff: {best_kf_cutoff}')

In [None]:
# Run the filter search for the pen and date range configured in main() and print the best filter.
main()

In [None]:
# Ad-hoc evaluation of a single filter for pen 108: an hour window that wraps
# around the end of the day (hour >= 17 or hour <= 14) and a K-factor cutoff of 1.25.
pen_id = 108
start_date = '2020-05-07'
end_date = '2020-05-17'
df = extract_biomass_data(pen_id, start_date, end_date, min_akpd_score=0.99)

start_hour, end_hour, kf_cutoff = 17, 14, 1.25
pme = generate_pme(df, start_date, end_date, start_hour, end_hour, kf_cutoff)
dates = get_dates_in_range(start_date, end_date)

# Always build the per-date lists, one entry per date. generate_pme returns
# None when no row passes the filter; in that case every entry is None, so
# later cells that read these lists still run instead of raising NameError.
kpis, dcs, smart_avgs = [], [], []
for date in dates:
    metrics = pme.generate_smart_metrics_on_date(date) if pme else {}
    kpis.append(metrics.get('biomass_kpi'))
    dcs.append(metrics.get('distribution_consistency'))
    smart_avgs.append(metrics.get('smart_average_weight'))

In [None]:
# Tabulate the smart average weight collected for each date in the range.
pd.DataFrame(dict(date=dates, smart_average=smart_avgs))

In [None]:
# Relative difference of 5387 against a baseline of 5544 (about -2.8%).
# NOTE(review): the origin of these two numbers is not shown in this notebook -
# presumably weights in grams; confirm and replace with named values.
(5387-5544)/5544

In [None]:
# Read the AWS credentials from the JSON file named by the AWS_CREDENTIALS
# environment variable. The context manager closes the file handle, which the
# bare open() call left open.
with open(os.environ['AWS_CREDENTIALS']) as credentials_file:
    aws_credentials = json.load(credentials_file)

s3_access_utils = S3AccessUtils('/root/data', aws_credentials)