In [None]:
from typing import List
from collections import defaultdict
import numpy as np
from filter_optimization.filter_optimization_task import generate_sampling_filters, extract_biomass_data, \
    NoDataException, SamplingFilter, generate_pm_base, PopulationMetricsBase, generate_filter_mask, get_dates_in_range, \
    find_optimal_filter, gen_pm_base
from population_metrics.population_metrics_base import generate_pm_base, PopulationMetricsBase
from population_metrics.confidence_metrics import compute_biomass_kpi, generate_distribution_consistency
from population_metrics.raw_metrics import get_raw_sample_size

import pandas as pd

In [None]:
from typing import List, Tuple, Union
import numpy as np
from population_metrics.population_metrics_base import PopulationMetricsBase, ValidationError
from population_metrics.growth_rate import generate_regression_input, compute_growth_rate
from population_metrics.raw_metrics import get_raw_weight_values, get_raw_sample_size
from research.utils.datetime_utils import add_days, get_dates_in_range


"""
This module contains helper functions for computing daily-level features that represent confidence metrics:
trend stability, distribution consistency, and the overall biomass KPI. Ask Alok for more context on how
these are mathematically formulated.
"""


def _not_none_mean(x):
    """Return the arithmetic mean of the entries of ``x`` that are not ``None``."""
    present_values = []
    for value in x:
        # Only skip genuine ``None`` placeholders; zeros and other falsy numbers are kept.
        if value is not None:
            present_values.append(value)
    return np.mean(present_values)


def compute_distribution_consistency(raw_weights: List[float], historical_weights: List[float]) -> float:
    """
    Computes distribution consistency using a qq-plot approach for two arbitrary lists.

    The historical sample is shifted so that its mean matches the raw sample's mean, the integer
    percentiles of both samples are computed, and the root-mean-square difference between them is
    mapped to a score. A score of 1.0 means the mean-aligned percentiles are identical; the score
    decreases linearly as the percentile RMS difference grows.

    ``None`` entries in either input are ignored. Both inputs must contain at least one non-``None``
    value.

    Args:
        raw_weights: weights of the sample being evaluated.
        historical_weights: weights of the reference sample.

    Returns:
        The distribution consistency score.
    """
    # Drop missing values once, up front. Building the arrays directly from lists containing None
    # would produce object arrays on which the arithmetic and percentile calls below fail.
    raw = np.array([w for w in raw_weights if w is not None], dtype=float)
    historical = np.array([w for w in historical_weights if w is not None], dtype=float)
    mean_adjustment = np.mean(raw) - np.mean(historical)

    # compute qq-plot based metric: percentiles 0..99 are evaluated, and the slice [1:99] keeps
    # percentiles 1..98 so the extreme ends of both samples do not enter the comparison
    percentiles = list(range(100))
    x = np.percentile(historical + mean_adjustment, percentiles)
    y = np.percentile(raw, percentiles)
    rms_difference = np.mean((y[1:99] - x[1:99]) ** 2) ** 0.5

    # NOTE(review): the 10.0 and 10000.0 scale factors are kept as-is; presumably they are tuned to
    # the unit of the weights -- confirm before reusing with other inputs.
    distribution_consistency = 1.0 - 10.0 * (rms_difference / 10000.0)
    return distribution_consistency


def get_raw_and_historical_weights(pm_base: PopulationMetricsBase, date: str, window: int) -> Tuple[List, List]:
    """
    Collects the recent ("raw") and historical weight samples used for distribution consistency.

    Raw weights are gathered over the dates from ``date - 3 days`` to ``date``; historical weights
    over the dates from ``date - window days`` to ``date - 4 days``. Whether the end points are
    included is decided by ``get_dates_in_range``.

    Raises ValidationError when either of the two collections ends up empty.
    """

    def _weights_between(first_date, last_date):
        # Concatenate the per-day raw weight values over the requested span.
        collected = []
        for day in get_dates_in_range(first_date, last_date):
            collected.extend(get_raw_weight_values(pm_base, day))
        return collected

    # recent sample: the three days leading up to the input date, through the input date
    raw_weights = _weights_between(add_days(date, -3), date)

    # reference sample: from `window` days back up to four days back
    window_start = add_days(date, -window)
    window_end = add_days(date, -4)
    historical_weights = _weights_between(window_start, window_end)

    if not (raw_weights and historical_weights):
        raise ValidationError('Insufficient data to compute distribution consistency!')

    return raw_weights, historical_weights


def generate_distribution_consistency(pm_base: PopulationMetricsBase, date: str, window: int = 7) \
                                      -> Union[float, None]:
    """
    Generates the distribution consistency for the given date, i.e. how closely the recent raw weight
    distribution matches the distribution of the preceding historical window.

    Returns None (after printing the validation message) when there is not enough data for either
    of the two samples.
    """
    try:
        weight_samples = get_raw_and_historical_weights(pm_base, date, window)
    except ValidationError as err:
        # Missing data is an expected situation: report it and signal "no value" to the caller.
        print(str(err))
        return None

    raw_weights, historical_weights = weight_samples
    return compute_distribution_consistency(raw_weights, historical_weights)

def compute_biomass_kpi(pm_base: PopulationMetricsBase, date: str) -> Union[float, None]:
    """
    Computes the biomass KPI for the given PopulationMetricsBase instance and date.

    The KPI combines the raw sample size with the distribution consistency and is normalised so
    that a sample size of 500 with a distribution consistency of 0.9 yields exactly 1.0.
    Returns None when either input is missing or zero.
    """
    raw_sample_size = get_raw_sample_size(pm_base, date)
    distribution_consistency = generate_distribution_consistency(pm_base, date)

    if raw_sample_size and distribution_consistency:
        # NOTE(review): the even power means a negative distribution consistency contributes the
        # same as its absolute value -- confirm this is intended.
        kpi_numerator = np.log(raw_sample_size * distribution_consistency**20)
        kpi_denominator = np.log(500 * 0.9**20)
        return kpi_numerator / kpi_denominator

    return None

In [None]:
# Parameters of this analysis run: the pen to analyse, the date range, and the AKPD score cutoff
# passed to the data extraction.
pen_id = 125
start_date, end_date = '2020-08-10', '2020-08-29'
akpd_score_cutoff = 0.99

In [None]:
# Load the raw biomass data for the configured pen, date range and AKPD score cutoff.
df = extract_biomass_data(pen_id, start_date, end_date, akpd_score_cutoff)

# Candidate filters: one hour window (start 0, end 24) combined with KF cutoffs stepping by 0.005
# from 1.0 towards 1.5 (np.arange nominally excludes the stop value).
start_hours, end_hours = [0], [24]
kf_cutoffs = np.arange(1.0, 1.5, 0.005)
sampling_filters = generate_sampling_filters(start_hours, end_hours, kf_cutoffs)

In [None]:
def generate_metrics_for_pm_base(pm_base: PopulationMetricsBase, dates: List[str]) -> float:
    """
    Generates the sample-size weighted mean biomass KPI for a PopulationMetricsBase instance.

    For every date the raw sample size and the biomass KPI are computed. Only dates on which both
    values are available enter the weighted mean, so the weights in the numerator and the
    denominator always cover the same set of days.

    Args:
        pm_base: population metrics instance to evaluate.
        dates: dates to consider.

    Returns:
        The weighted mean KPI, or NaN when no date has both a KPI and a positive total sample size.
    """

    kpis, sample_sizes = [], []
    for date in dates:
        sample_sizes.append(get_raw_sample_size(pm_base, date))
        kpis.append(compute_biomass_kpi(pm_base, date))

    # Missing values become NaN. Test for None explicitly so a legitimate 0.0 is kept as a value.
    kpis = np.array([np.nan if k is None else k for k in kpis], dtype=float)
    sample_sizes = np.array([np.nan if s is None else s for s in sample_sizes], dtype=float)

    # Normalise by the sample sizes of the days that actually contribute a KPI. Dividing by the
    # sum over all days would let days without a KPI dilute the weighted mean.
    valid = ~(np.isnan(kpis) | np.isnan(sample_sizes))
    total_weight = np.sum(sample_sizes[valid])
    if not total_weight > 0:
        return np.nan

    mean_kpi = np.sum(kpis[valid] * sample_sizes[valid]) / total_weight
    return mean_kpi


In [None]:
def find_optimal_filter(df: pd.DataFrame, sampling_filters: List[SamplingFilter]) -> pd.DataFrame:
    """
    Evaluates every sampling filter on a data-frame of raw biomass computations.

    For each filter a PopulationMetricsBase is generated and its mean biomass KPI is computed over
    the full date range present in ``df``. Filters for which no PopulationMetricsBase could be
    generated get a mean KPI of None.

    Args:
        df: raw biomass computations; must have a ``date`` column.
        sampling_filters: candidate filters to evaluate.

    Returns:
        A data-frame with one row per filter and the columns ``mean_kpi``, ``start_hour``,
        ``end_hour``, ``kf_cutoff`` and ``akpd_score_cutoff``. The best filter is the row with the
        highest ``mean_kpi``, e.g. ``result.sort_values('mean_kpi', ascending=False).iloc[0]``.
        The frame is empty when ``sampling_filters`` is empty.
    """

    analysis_data = defaultdict(list)
    for sampling_filter in sampling_filters:
        print('Start hour: {}, End hour: {}, KF cutoff: {}'.format(
            sampling_filter.start_hour, sampling_filter.end_hour, sampling_filter.kf_cutoff
        ))
        pm_base = gen_pm_base(df, sampling_filter)

        if pm_base:
            # evaluate over every calendar date between the first and last date present in df
            unique_dates = sorted(df.date.unique().tolist())
            dates = get_dates_in_range(unique_dates[0], unique_dates[-1])
            mean_kpi = generate_metrics_for_pm_base(pm_base, dates)
        else:
            mean_kpi = None

        # add to data
        analysis_data['mean_kpi'].append(mean_kpi)
        analysis_data['start_hour'].append(sampling_filter.start_hour)
        analysis_data['end_hour'].append(sampling_filter.end_hour)
        analysis_data['kf_cutoff'].append(sampling_filter.kf_cutoff)
        analysis_data['akpd_score_cutoff'].append(sampling_filter.akpd_score_cutoff)

    # Callers consume the per-filter results frame, so that is what is returned.
    return pd.DataFrame(analysis_data)

In [None]:
# Evaluate every candidate sampling filter; the result holds one row per filter with its mean KPI.
# NOTE(review): an earlier variant unpacked `analysis_df, best_sampling_filter` here, but
# find_optimal_filter as defined in this notebook returns only the per-filter results frame.
analysis_df = find_optimal_filter(df, sampling_filters)

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Plot the sample-size weighted KPI of each evaluated filter against its KF cutoff.
plt.figure(figsize=(20, 10))
plt.plot(analysis_df['kf_cutoff'], analysis_df['mean_kpi'])
plt.xlabel('KF Cutoff')
plt.ylabel('Sample size weighted KPI')
plt.grid()
plt.show()

In [None]:
# Plot the sample-size weighted KPI of each evaluated filter against its KF cutoff.
# NOTE(review): this cell repeats the plotting cell directly above it.
plt.figure(figsize=(20, 10))
plt.plot(analysis_df['kf_cutoff'], analysis_df['mean_kpi'])
plt.xlabel('KF Cutoff')
plt.ylabel('Sample size weighted KPI')
plt.grid()
plt.show()