In [None]:
from collections import defaultdict
import json
import os
import math
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from research.utils.data_access_utils import RDSAccessUtils
from sklearn.linear_model import LinearRegression
from research.utils.datetime_utils import day_difference, add_days
from research.utils.datetime_utils import get_dates_in_range

# strftime/strptime pattern for 'YYYY-MM-DD' date strings.
DATE_FORMAT = '%Y-%m-%d'


class PopulationMetricsEstimator(object):
    """
    Population Metrics Estimator is a class to help generate biomass population-level metrics for a given
    set of historical biomass computations. The most important ones here are the biomass smart average,
    smart distribution, and KPI. See this document for further information:
    https://aquabyte.atlassian.net/wiki/spaces/Research/pages/361562423/2020-06-02+New+Smart+Average+and+Smart+Distribution
    ...
    Attributes
    ----------
    biomass_computations : list
        List of tuples, where each tuple contains: date, estimated_weight_g, and estimated_k_factor
    bcs_by_date : dict
        Dict mapping date with list of corresponding individual biomass computations
    unique_dates_nr : list
        List of day numbers (with first date in the input data corresponding to zero)
    unique_dates: list
        List of ordered unique dates
    average_weights: list
        List of raw daily average weights for each date in unique_dates
    sample_sizes: list
        List of raw daily sample sizes for each date in unique_dates
    """

    def __init__(self, biomass_computations, biomass_computations_new):
        """Stores the raw inputs, initialises the daily aggregates, and populates them.
        Parameters
        ----------
        biomass_computations: list
            List of (date, estimated_weight_g, estimated_k_factor) tuples; may be unordered.
        biomass_computations_new: list
            Stored as-is on the instance; it is not read by any method of this class.
        """
        # raw inputs
        self.biomass_computations_new = biomass_computations_new
        self.biomass_computations = biomass_computations

        # daily aggregates, filled in by prepare_data()
        self.unique_dates, self.unique_dates_nr = [], []
        self.average_weights, self.sample_sizes = [], []
        self.bcs_by_date = defaultdict(list)

        self.prepare_data()

    def prepare_data(self):
        """Orders the biomass computations by date and builds the daily aggregates.
        The tuples handed to the constructor may arrive in any order. A date-sorted copy
        replaces self.biomass_computations (the caller's list is left untouched), after
        which the daily level aggregations are generated and stored.
        """
        ordered_by_date = sorted(self.biomass_computations, key=lambda record: record[0])
        self.biomass_computations = ordered_by_date
        self.generate_daily_values()

    def generate_daily_values(self):
        """Generates daily level aggregations of biomass computations.
        This includes: list of unique dates, list of daily raw average weights,
        list of daily sample sizes, and list of day numbers
        """

        weights_for_date = []
        curr_date = self.biomass_computations[0][0]
        for date, weight, kf in self.biomass_computations:
            self.bcs_by_date[date].append((weight, kf))
            if date != curr_date:
                self.unique_dates.append(curr_date)
                self.average_weights.append(np.mean(weights_for_date))
                self.sample_sizes.append(len(weights_for_date))
                weights_for_date = [weight]
                curr_date = date
            else:
                weights_for_date.append(weight)

        self.unique_dates.append(curr_date)
        self.average_weights.append(np.mean(weights_for_date))
        self.sample_sizes.append(len(weights_for_date))
        self.unique_dates_nr = [day_difference(date, self.unique_dates[0]) for date in self.unique_dates]

    def generate_raw_daily_metrics_on_date(self, date):
        """Returns the raw average weight and raw sample size on the given date
        Parameters
        ----------
        date: str
            Date is in the form 'YYYY-MM-DD'
        Returns
        -------
        average_weight: float, sample_size: int
        """
        if date in self.unique_dates:
            idx = self.unique_dates.index(date)
            return self.average_weights[idx], self.sample_sizes[idx]
        return None, None

    def generate_raw_weights_kfs_on_date(self, date):
        """Returns all individual raw weights and k-factors on date.
        Parameters
        ----------
        date: str
            Date is in the form 'YYYY-MM-DD'
        Returns
        -------
        weights: list, kfs: list
        """
        if len(self.bcs_by_date[date]) > 0:
            weights, kfs = [list(l) for l in list(zip(*self.bcs_by_date[date]))]
            return weights, kfs
        return [], []

    def get_start_end_idx(self, start_date, end_date):
        """Returns indices of self.unique_dates corresponding to start and end dates.
        Parameters
        ----------
        start_date: str
            Start date is in the form 'YYYY-MM-DD'
        end_date: str
            End date is in the form 'YYYY-MM-DD'
        Returns
        -------
        start_idx: int
        end_idx: int
        """

        if start_date > self.unique_dates[-1] or end_date < self.unique_dates[0]:
            return -1, -1
        start_idx = [idx for idx, date in enumerate(self.unique_dates) if date >= start_date][0]
        end_idx = [idx for idx, date in enumerate(self.unique_dates) if date <= end_date][-1] + 1
        return start_idx, end_idx

    def compute_growth_rate(self, date, start_date, end_date, decay=0.1):
        """Fits a weighted exponential growth curve to the daily average weights in a date range.
        A linear regression of log(daily average weight) against the day offset from `date` is
        fitted, with each day weighted by its sample size times exp(-decay * |day offset|), so
        days closer to the present date count more.
        Parameters
        ----------
        date: str
            date is in the form 'YYYY-MM-DD'. This is considered 'present date'.
        start_date: str
        end_date: str
        decay: float
            This is the exponential weight decay factor to apply to dates not equal to present date.
            For example, a 2 day difference would result in a weighting of exp(-decay * 2).
        Returns
        -------
        growth_rate: float
            Slope of the fitted line in log-weight space.
        error_magnitude_pct: float
            Weighted root-mean-square of the relative deviation between the raw average weights
            and the fitted exponential curve, using the same weights as the fit.
            Both values are None when no date in the range has data or when fewer than four
            daily data points fall inside the range.
        """

        if not any(day in self.unique_dates for day in get_dates_in_range(start_date, end_date)):
            return None, None

        lo, hi = self.get_start_end_idx(start_date, end_date)
        day_offsets = np.array([day_difference(d, date) for d in self.unique_dates[lo:hi]])
        X = day_offsets.reshape(-1, 1)
        log_weights = np.log(np.array(self.average_weights[lo:hi]))
        counts = np.array(self.sample_sizes[lo:hi])

        # too few daily points for a meaningful fit
        if X.shape[0] < 4:
            return None, None

        sample_weights = counts * np.exp(-decay * np.abs(day_offsets))
        model = LinearRegression().fit(X, log_weights, sample_weight=sample_weights)
        growth_rate = model.coef_[0]
        fitted_log_weights = model.predict(X)

        # relative error is measured in weight space, not log space
        relative_error = (np.exp(log_weights) - np.exp(fitted_log_weights)) / np.exp(fitted_log_weights)
        error_magnitude_pct = np.average(relative_error**2, weights=sample_weights)**0.5
        return growth_rate, error_magnitude_pct

    def compute_local_growth_rate(self, date, incorporate_future, window=7):
        """Returns local growth rate given date, window size, and whether or not future should be incorporated.
        Without future data the fit window is [date - window, date]. With future data the
        window ends at the latest available date whose day offset from `date` (as given by
        day_difference) is at most window // 2, and starts `window` days before that end date.
        Parameters
        ----------
        date: str
        incorporate_future: bool
        window: int
        Returns
        -------
        growth_rate: float
        error_magnitude_pct: float
            More details on this variable in the doc-string of compute_growth_rate(...)
            Both values are None when incorporate_future is set and no available date falls
            within [date - window, date + window // 2], or when compute_growth_rate(...)
            itself returns None.
        """

        if incorporate_future:
            look_ahead = window // 2
            start, end = add_days(date, -window), add_days(date, look_ahead)
            # Build the candidate range once instead of once per unique date.
            candidate_dates = set(get_dates_in_range(start, end))
            if not any(d in candidate_dates for d in self.unique_dates):
                return None, None
            # Day offsets are only needed on this branch.
            day_diffs = np.array([day_difference(d, date) for d in self.unique_dates])
            end_idx = np.where(day_diffs <= look_ahead)[0][-1]
            end_date = self.unique_dates[end_idx]
            start_date = add_days(end_date, -window)
        else:
            start_date, end_date = add_days(date, -window), date
        return self.compute_growth_rate(date, start_date, end_date)

    def generate_historical_weights(self, date, window=7):
        """Collects the individual weights recorded over the look-back period before date.
        The period runs from `window` days before date up to the day before date, as
        enumerated by get_dates_in_range(...).
        Parameters
        ----------
        date: str
        window: int
            Number of days to look back (seven by default).
        Returns
        -------
        historical_weights: list
            Individual weights in date order; empty when the period holds no data.
        """
        period_start = add_days(date, -window)
        period_end = add_days(date, -1)

        historical_weights = []
        for past_date in get_dates_in_range(period_start, period_end):
            # only the weights are needed here; the k-factors are discarded
            historical_weights += self.generate_raw_weights_kfs_on_date(past_date)[0]
        return historical_weights

    def generate_distribution_consistency(self, date, window=7):
        """Scores how closely today's weight distribution matches the recent historical one.
        The score is 1.0 when the compared quantiles coincide and decreases linearly with
        the root-mean-square gap between them.
        Parameters
        ----------
        date: str
        window: int
            Describes the look-back-period in days. The consistency metric is measured
            between today's weight distribution and that of the last `window` days not
            including today.
        Returns
        -------
        distribution_consistency: float
            None when either today or the look-back period has no weights.
        """

        todays_weights, _ = self.generate_raw_weights_kfs_on_date(date)
        past_weights = self.generate_historical_weights(date, window=window)
        if not (todays_weights and past_weights):
            return None
        todays_weights = np.array(todays_weights)
        past_weights = np.array(past_weights)

        # shift the historical weights so both samples share the same mean;
        # this controls for fish growth in the comparison
        mean_shift = np.mean(todays_weights) - np.mean(past_weights)

        # qq-plot based metric: compare the two samples percentile by percentile
        percentiles = list(range(100))
        past_quantiles = np.percentile(past_weights + mean_shift, percentiles)
        todays_quantiles = np.percentile(todays_weights, percentiles)

        # only percentiles 1 through 98 enter the score
        quantile_gap = np.abs(todays_quantiles[1:99] - past_quantiles[1:99])
        rms_gap = np.mean(quantile_gap ** 2) ** 0.5
        return 1.0 - 10.0 * (rms_gap / 10000.0)
    
    def generate_smart_weight_on_date(self, date):
        return 3

    def generate_smart_metrics_on_date(self, date, max_day_difference=3, bucket_size=100, incorporate_future=True,
                                       apply_growth_rate=True):
        """
        Generates smart metrics for a given date.
        The smart average pools the daily averages of all dates within max_day_difference
        days of `date` (looking ahead only when incorporate_future is set), weighted by
        sample size. When a trustworthy local growth rate is available, each day's weights
        are first scaled by exp(-day_offset * growth_rate).
        Parameters
        ----------
        date: str
        max_day_difference: int
            This is the maximum number of days beyond which we cannot incorporate information into the smart
            average calculation
        bucket_size: int
            This is the bucket size to use for the smart weight distribution representation
        incorporate_future: bool
            Should we incorporate future data relative to date or not?
        apply_growth_rate: bool
            Should we enable growth rate application or not?
        Returns
        -------
        metrics: dict
            Dict contains raw and smart metrics: raw_average_weight, raw_average_kf, raw_sample_size,
            smart_average_weight, smart_average_kf, smart_distribution, smart_sample_size,
            adj_weights, kfs, growth_rate, error_magnitude_pct and distribution_consistency.
            An empty dict is returned when no available date falls inside the pooling window.
        Raises
        ------
        AssertionError
            If the pooled individual weights disagree with the daily aggregates.
        """

        # Determine the pooling window first and bail out before any per-date work
        # when it holds no data.
        look_ahead = max_day_difference if incorporate_future else 0
        start_date, end_date = add_days(date, -max_day_difference), add_days(date, look_ahead)
        # Build the window once instead of once per unique date.
        window_dates = set(get_dates_in_range(start_date, end_date))
        if not any(d in window_dates for d in self.unique_dates):
            return {}
        start_idx, end_idx = self.get_start_end_idx(start_date, end_date)
        dates_in_window = self.unique_dates[start_idx:end_idx]

        # compute metrics on this date
        distribution_consistency = self.generate_distribution_consistency(date, window=7)
        raw_average_weight, raw_sample_size = self.generate_raw_daily_metrics_on_date(date)
        _, raw_kfs = self.generate_raw_weights_kfs_on_date(date)
        raw_average_kf = np.mean([kf for kf in raw_kfs if kf is not None])

        # compute local growth rate
        growth_rate, error_magnitude_pct = self.compute_local_growth_rate(date, incorporate_future)

        # Only apply the growth rate when it exists, the date itself has samples, and
        # both the fit error and the rate magnitude are below 0.02.
        if growth_rate and apply_growth_rate and raw_sample_size and error_magnitude_pct < 0.02 and \
                abs(growth_rate) < 0.02:
            growth_rate_for_smart_avg = growth_rate
        else:
            growth_rate_for_smart_avg = 0.0

        # compute smart average
        x = np.array([day_difference(d, date) for d in dates_in_window])
        y = np.array(self.average_weights[start_idx:end_idx])
        n = np.array(self.sample_sizes[start_idx:end_idx])
        sample_size = int(np.sum(n))
        smart_average = np.sum(np.exp(-x * growth_rate_for_smart_avg) * y * n) / sample_size

        # compute smart distribution
        adj_weights, kfs = [], []
        # The loop variable is deliberately not called `date`, so the parameter is not shadowed.
        for date_idx, window_date in enumerate(dates_in_window):
            weights_for_date, kfs_for_date = self.generate_raw_weights_kfs_on_date(window_date)
            adj_weights_for_date = np.array(weights_for_date) * np.exp(-x[date_idx] * growth_rate_for_smart_avg)
            adj_weights.extend(adj_weights_for_date)
            kfs.extend(kfs_for_date)

        # perform consistency checks
        assert len(adj_weights) == np.sum(n), 'Inconsistent sample sizes!'
        assert math.isclose(smart_average, np.mean(adj_weights), rel_tol=1e-5), 'Inconsistent smart average numbers!'

        adj_weights, kfs = np.array(adj_weights), np.array(kfs)
        smart_distribution = dict()

        # Buckets are keyed by their lower edge in kg; weights are compared in grams.
        # NOTE(review): np.arange excludes its stop value and each bucket's upper edge is
        # exclusive, so a maximum weight that lands exactly on a bucket boundary appears to
        # fall in no bucket; rounding edges to 2 decimals also assumes bucket_size is a
        # multiple of 10 g. Confirm whether either matters for consumers before changing.
        bucket_size_kg = 1e-3 * bucket_size
        buckets = [round(edge, 2) for edge in np.arange(0.0, 1e-3 * np.max(adj_weights), bucket_size_kg)]
        for b in buckets:
            low, high = 1e3 * b, 1e3 * (b + bucket_size_kg)
            in_bucket = (adj_weights >= low) & (adj_weights < high)
            count = adj_weights[in_bucket].shape[0]
            # falsy k-factors (None or 0) are treated as missing
            kfs_for_bucket = [kf if kf else np.nan for kf in kfs[in_bucket]]
            mean_kf = np.mean(kfs_for_bucket) if count > 0 else np.nan
            smart_distribution[str(b)] = {
                'count': count,
                'avgKFactor': None if np.isnan(mean_kf) else mean_kf
            }

        metrics = dict(
            raw_average_weight=raw_average_weight,
            raw_average_kf=raw_average_kf,
            raw_sample_size=raw_sample_size,
            smart_average_weight=smart_average,
            smart_average_kf=np.mean([kf for kf in kfs if kf is not None]),
            smart_distribution=smart_distribution,
            smart_sample_size=sample_size,
            adj_weights=adj_weights,
            kfs=kfs,
            growth_rate=growth_rate,
            error_magnitude_pct=error_magnitude_pct,
            distribution_consistency=distribution_consistency
        )

        return metrics

In [None]:
# Driver cell: prints the smart average weight for every date spanned by the data.
# NOTE: `biomass_computations`, `biomass_computations_new` and `df` are created by the
# data-loading cell further down, so that cell has to be executed before this one.
pme = PopulationMetricsEstimator(biomass_computations, biomass_computations_new)
# df is sorted by captured_at in the data-loading cell, so the first and last rows bound the date span
start_date, end_date = df.date.iloc[0], df.date.iloc[-1]
dates_in_range = get_dates_in_range(start_date, end_date)

for curr_date in dates_in_range:
    # generate raw daily metrics
    # (these two values are not used below)
    raw_average_weight, raw_sample_size = pme.generate_raw_daily_metrics_on_date(curr_date)

    # generate smart daily metrics
    output = pme.generate_smart_metrics_on_date(
        curr_date,
        incorporate_future=True,
        apply_growth_rate=True)
    
    # generate_smart_weight_on_date currently returns a constant placeholder value
    weight = pme.generate_smart_weight_on_date(curr_date)
    
    # an empty dict comes back for dates without usable data; skip those
    if 'smart_average_weight' in output:
        print(curr_date, output['smart_average_weight'], weight)
        #print(curr_date, output['raw_average_weight'], output['raw_sample_size'], output['smart_average_weight'], output['growth_rate'])


In [None]:
# Data-loading cell: pulls the biomass computations for one pen and reshapes them into
# the tuple lists consumed by PopulationMetricsEstimator.

# Read the credentials through a context manager so the file handle is closed
# right away instead of being left open.
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))

# All rows for pen 23 with a positive AKPD score and a positive estimated weight.
query = """
    SELECT * FROM
    prod.biomass_computations bc
    WHERE bc.pen_id=23
    and bc.akpd_score > 0.0
    and bc.estimated_weight_g > 0.0;
"""
df = rds_access_utils.extract_from_database(query)

# Order chronologically and derive the calendar date / hour of each capture.
df = df.sort_values('captured_at').copy(deep=True)
df.index = pd.to_datetime(df.captured_at)
dates = df.index.date.astype(str)
df['date'] = dates
df['hour'] = df.index.hour

# Fall back to a k-factor of 0.0 when the column is absent from the query result.
if 'estimated_k_factor' not in df.columns.tolist():
    df['estimated_k_factor'] = 0.0

# Build the (date | captured_at, weight, k-factor) tuples column-wise; this yields the
# same tuples as a per-row iterrows() loop without constructing a Series for every row.
biomass_computations = list(zip(df['date'], df['estimated_weight_g'], df['estimated_k_factor']))
biomass_computations_new = list(zip(df['captured_at'], df['estimated_weight_g'], df['estimated_k_factor']))

In [None]:
# Scratch cell: defines a fixed reference time and the start of a look-back window before it.
lookback_days = 7

date_time_str = '2020-02-20 00:00'
current_time = datetime.strptime(date_time_str, '%Y-%m-%d %H:%M')
# start of the window: lookback_days before current_time
last_time = current_time - timedelta(days = lookback_days)
# display the first (captured_at, weight, k-factor) tuple for inspection
biomass_computations_new[0]