In [None]:
from collections import defaultdict
import numpy as np
from filter_optimization.filter_optimization_task import generate_sampling_filters, extract_biomass_data, \
    NoDataException, SamplingFilter, generate_pme, generate_filter_mask, get_dates_in_range, \
    generate_metrics_for_pme, _not_none_mean
from matplotlib import pyplot as plt
import pandas as pd



In [None]:
# --- Data selection: pen and date window passed to extract_biomass_data ---
pen_id = 37
start_date = '2020-06-08'
end_date = '2020-06-22'
akpd_score_cutoff = 0.99

# Raw biomass rows for the pen/date window, restricted by the AKPD score cutoff.
df = extract_biomass_data(pen_id, start_date, end_date, akpd_score_cutoff)

# --- Candidate filter grid: one hour window, swept over KF cutoffs ---
start_hours = [0]
end_hours = [24]
# np.arange with a fractional step decides whether the stop value is included
# based on floating-point rounding, so the grid length is not obvious from the
# call. np.linspace states it explicitly: 41 evenly spaced cutoffs from 0.9 to
# 1.1 inclusive (step 0.005).
kf_cutoffs = np.linspace(0.9, 1.1, 41)
sampling_filters = generate_sampling_filters(start_hours, end_hours, kf_cutoffs)

In [None]:
def generate_metrics_for_pme(pme, dates):
    """Compute the sample-size-weighted mean biomass KPI over a set of dates.

    NOTE: this notebook-local definition shadows the function of the same name
    imported from ``filter_optimization.filter_optimization_task``.

    Args:
        - pme: object exposing ``generate_smart_metrics_on_date(date)``, which
          returns a dict-like with the keys ``biomass_kpi``,
          ``raw_sample_size`` and ``smart_average_weight`` (any may be missing
          or None).
        - dates: iterable of dates to evaluate, in chronological order.
    Returns:
        - (mean_kpi, weight): ``mean_kpi`` is sum(kpi * raw_sample_size) over
          dates that have a KPI, divided by the total raw sample size over all
          dates; it is NaN when the total sample size is zero. ``weight`` is
          the ``smart_average_weight`` reported for the last date. Both are
          None when ``dates`` is empty.
    """

    kpis, weights, sample_sizes = [], [], []
    for date in dates:
        metrics = pme.generate_smart_metrics_on_date(date)
        kpis.append(metrics.get('biomass_kpi'))
        sample_sizes.append(metrics.get('raw_sample_size'))
        weights.append(metrics.get('smart_average_weight'))

    # Nothing to aggregate (and no "last" weight to report).
    if not weights:
        return None, None

    # Missing KPIs become NaN so nansum skips them; missing sample sizes become
    # 0 so the arrays stay numeric (a None here used to raise TypeError).
    kpi_values = np.array([k if k else np.nan for k in kpis], dtype=float)
    size_values = np.array([s if s else 0 for s in sample_sizes], dtype=float)

    # NOTE(review): the denominator counts samples from every date, including
    # dates without a KPI, which pulls the mean towards zero -- confirm intended.
    total_samples = size_values.sum()
    if total_samples > 0:
        mean_kpi = np.nansum(kpi_values * size_values) / total_samples
    else:
        mean_kpi = np.nan

    # The final smart average is the value reported for the last date.
    weight = weights[-1]
    return mean_kpi, weight

In [None]:
def find_optimal_filter(df, sampling_filters, return_best_filter=False):
    """Evaluates every sampling filter against the raw biomass data.

    Args:
        - df: DataFrame of raw biomass computations from data warehouse; must
          have a ``date`` column
        - sampling_filters: list of SamplingFilter instances to iterate over
        - return_best_filter: when True, also return the SamplingFilter with
          the highest mean KPI (defaults to False, which returns only the
          DataFrame)
    Returns:
        - analysis_df: DataFrame with one row per sampling filter and the
          columns mean_kpi, weight, start_hour, end_hour, kf_cutoff and
          akpd_score_cutoff; mean_kpi and weight are None for filters that
          produced no data
        - best_sampling_filter (only when ``return_best_filter`` is True):
          SamplingFilter rebuilt from the row with the highest mean_kpi, or
          None if no filter produced a mean_kpi
    """
    analysis_data = defaultdict(list)

    # The evaluation dates depend only on df, so they are resolved once, the
    # first time a filter actually yields a usable estimator.
    dates = None

    for sampling_filter in sampling_filters:
        print('Start hour: {}, End hour: {}, KF cutoff: {}'.format(
            sampling_filter.start_hour, sampling_filter.end_hour, sampling_filter.kf_cutoff
        ))
        try:
            pme = generate_pme(df, sampling_filter)
        except NoDataException as err:
            # A filter that leaves no data is recorded with empty metrics
            # rather than aborting the whole sweep.
            print(str(err))
            pme = None

        if pme:
            if dates is None:
                unique_dates = sorted(df.date.unique().tolist())
                # list() so the range can be reused across filters even if the
                # helper returns a one-shot iterator.
                dates = list(get_dates_in_range(unique_dates[0], unique_dates[-1]))
            mean_kpi, weight = generate_metrics_for_pme(pme, dates)
        else:
            mean_kpi, weight = None, None

        # add to data
        analysis_data['mean_kpi'].append(mean_kpi)
        analysis_data['weight'].append(weight)
        analysis_data['start_hour'].append(sampling_filter.start_hour)
        analysis_data['end_hour'].append(sampling_filter.end_hour)
        analysis_data['kf_cutoff'].append(sampling_filter.kf_cutoff)
        analysis_data['akpd_score_cutoff'].append(sampling_filter.akpd_score_cutoff)

    analysis_df = pd.DataFrame(analysis_data)
    if not return_best_filter:
        return analysis_df

    # Pick the filter with the highest mean KPI, ignoring filters that had no
    # data; with no scored rows there is no best filter to report.
    best_sampling_filter = None
    if 'mean_kpi' in analysis_df.columns:
        scored_df = analysis_df.dropna(subset=['mean_kpi'])
        if not scored_df.empty:
            best_sampling_filter_params = scored_df.sort_values('mean_kpi', ascending=False).iloc[0]
            best_sampling_filter = SamplingFilter(
                start_hour=float(best_sampling_filter_params.start_hour),
                end_hour=float(best_sampling_filter_params.end_hour),
                kf_cutoff=float(best_sampling_filter_params.kf_cutoff),
                akpd_score_cutoff=float(best_sampling_filter_params.akpd_score_cutoff)
            )
    return analysis_df, best_sampling_filter

In [None]:
# Evaluate every candidate filter; the result has one row per sampling filter.
analysis_df = find_optimal_filter(df, sampling_filters)

In [None]:
# Plot mean KPI against the KF cutoff; labels and a title let the figure stand
# on its own when the notebook is skimmed.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(analysis_df.kf_cutoff, analysis_df.mean_kpi)
ax.set_xlabel('KF cutoff')
ax.set_ylabel('Mean biomass KPI')
ax.set_title('Mean biomass KPI vs. KF cutoff')
ax.grid()
plt.show()

In [None]:
# Rank the evaluated filters from highest to lowest mean KPI.
analysis_df.sort_values('mean_kpi', ascending=False)