In [None]:
from collections import namedtuple
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from research_lib.utils.data_access_utils import S3AccessUtils, RDSAccessUtils
from population_metrics.smart_metrics import generate_smart_avg_weight, compute_local_growth_rate, \
    generate_smart_individual_values
from report_generation.report_generator import gen_pm_base
from report_generation.report_generator import generate_ts_data, SamplingFilter


<h1> Load data and generate AKPD scores / weight estimates </h1>

In [None]:
s3 = S3AccessUtils('/root/data')

# CSV exports for pen 61, listed in the order in which they are concatenated.
csv_paths = [
    '/root/data/alok/biomass_estimation/playground/pen=61/data_dump_1.csv',
    '/root/data/alok/biomass_estimation/playground/pen=61/biomass.csv-61-06-from-2019-10-25-to-2019-11-01.csv',
    '/root/data/alok/biomass_estimation/playground/pen=61/biomass.csv-61-07-from-2019-11-01-to-2019-11-08.csv',
    '/root/data/alok/biomass_estimation/playground/pen=61/biomass.csv-61-08-from-2019-11-08-to-2019-11-15.csv',
    '/root/data/alok/biomass_estimation/playground/pen=61/biomass.csv-61-09-from-2019-11-15-to-2019-11-22.csv',
    '/root/data/alok/biomass_estimation/playground/pen=61/biomass.csv-61-10-from-2019-11-22-to-2019-11-29.csv',
    '/root/data/alok/biomass_estimation/playground/pen=61/biomass.csv-61-11-from-2019-11-29-to-2019-12-06.csv',
    '/root/data/alok/biomass_estimation/playground/pen=61/biomass.csv-61-12-from-2019-12-06-to-2019-12-13.csv',
    '/root/data/alok/biomass_estimation/playground/pen=61/biomass.csv-61-13-from-2019-12-13-to-2019-12-20.csv',
    '/root/data/alok/biomass_estimation/playground/pen=61/biomass.csv-61-14-from-2019-12-20-to-2019-12-27.csv',
]

# Read every file and stack the rows into a single frame (original row indices are kept).
df = pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths])




In [None]:
# Derive the columns used by the rest of the notebook from the raw CSV columns.
# NOTE(review): presumably these are the column names gen_pm_base expects - confirm.
df['estimated_weight_g'] = df['weight']
# Constant placeholder: every row gets a k-factor of 0.0.
df['estimated_k_factor'] = 0.0
# Index rows by capture timestamp so that hour and date can be read off the index.
df.index = pd.to_datetime(df.captured_at)
df['hour'] = df.index.hour
# Calendar date of each capture, stored as a string.
df['date'] = df.index.date.astype(str)

In [None]:
# Keep only the rows that carry a weight estimate.
df = df[df.estimated_weight_g.notnull()]

In [None]:
# NOTE(review): this rebinds SamplingFilter, shadowing the class imported from
# report_generation.report_generator in the first cell - confirm the field sets match.
SamplingFilter = namedtuple('SamplingFilter', 'start_hour end_hour kf_cutoff akpd_score_cutoff')

# Filter covering the whole day (hours 0-24) with no k-factor cutoff.
sampling_filter = SamplingFilter(
    start_hour=0,
    end_hour=24,
    kf_cutoff=0.0,
    akpd_score_cutoff=0.95
)

# Rows with akpd_score <= 0.01 are dropped before the frame is handed to gen_pm_base;
# the 0.95 cutoff above is passed separately inside the filter.
pm_base = gen_pm_base(df[df.akpd_score > 0.01], sampling_filter)

In [None]:
# NOTE(review): 2020-02-11 is later than the last dated CSV loaded above (ends 2019-12-27);
# unless data_dump_1.csv covers it, this prints a validation message and yields None - confirm.
avg = generate_smart_avg_weight(pm_base, '2020-02-11')

In [None]:
# Local daily growth rate on 2019-09-13. On a fresh top-to-bottom run this resolves to the
# function imported from population_metrics.smart_metrics; a later cell redefines the name.
compute_local_growth_rate(pm_base, '2019-09-13')

In [None]:
# Smart average weight on 2019-09-13, using the function's default window and threshold.
generate_smart_avg_weight(pm_base, '2019-09-13')

In [None]:
# Individual weights around 2019-12-05; the second return value is discarded.
# Positional arguments, per the signature pasted further down in this notebook:
# max_day_difference=3, incorporate_future=True, apply_growth_rate=True,
# trend_stability_threshold=0.9.
weights, _ = generate_smart_individual_values(pm_base, '2019-12-05', 3, True, True, 0.9)

In [None]:
# Scale every weight by 1.0108**-6 (about 0.9376).
# NOTE(review): presumably backs out six days of growth at 1.08% per day - confirm the constants.
# NOTE(review): not idempotent - re-running this cell applies the factor again.
weights = weights * (1.0108**-6)

In [None]:
# Mean of the rescaled individual weights.
np.mean(weights)

In [None]:
# Dump every individual weight, one per line (output grows with the sample size).
for w in weights:
    print(w)

In [None]:
def get_distribution(weights, bucket_cutoffs):
    """
    Compute the percentage of weights that fall into each consecutive bucket.

    Args:
        weights: array-like of individual weights, in the same unit as bucket_cutoffs
        bucket_cutoffs: sequence of bucket edges; each pair of consecutive edges
                        (low, high) defines the half-open bucket [low, high)
    Returns:
        dict mapping a '<low>-<high>' label (edges multiplied by 1e-3 and rounded to one
        decimal) to the percentage, rounded to one decimal, of the bucketed weights that
        fall into that bucket. Weights outside every bucket are ignored. When no weight
        falls into any bucket, every percentage is 0.0.
    """
    # accept plain lists as well as numpy arrays
    weights = np.asarray(weights)

    counts = {}
    for low, high in zip(bucket_cutoffs, bucket_cutoffs[1:]):
        bucket = f'{round(1e-3 * low, 1)}-{round(1e-3 * high, 1)}'
        counts[bucket] = int(np.count_nonzero((weights >= low) & (weights < high)))

    total = sum(counts.values())
    if total == 0:
        # nothing was bucketed: report zeros instead of dividing by zero
        return {bucket: 0.0 for bucket in counts}

    return {bucket: round(100 * bucket_count / total, 1) for bucket, bucket_count in counts.items()}

In [None]:
# Bucket edges run from 0 to 4800 in steps of 200, so the buckets cover [0, 4800);
# weights of 4800 or more are left out of the distribution.
dist = get_distribution(weights, np.arange(0, 5000, 200))

In [None]:
# Bar chart of the bucketed weight distribution (percentage of samples per bucket).
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(list(dist.keys()), list(dist.values()))
ax.set_xlabel('Weight Bucket (kg)')
ax.set_ylabel('Frequency (%)')
ax.set_title('Weight Distribution')
# Bucket labels are long, so draw them vertically.
plt.setp(ax.get_xticklabels(), rotation=90)
ax.grid()
plt.show()



In [None]:
# Show the bucket label -> percentage mapping that was plotted above.
dist

In [None]:
# Smart average weight on 2019-12-03, using the function's default window and threshold.
generate_smart_avg_weight(pm_base, '2019-12-03')

In [None]:
# Pull biomass computations for pen 61 / group '61' captured between 2020-02-05 and 2020-02-15.
rds = RDSAccessUtils()
query = """
    select * from prod.biomass_computations
    where pen_id=61
    and group_id='61'
    and captured_at between '2020-02-05' and '2020-02-15';
"""

# NOTE(review): this overwrites the CSV-based `df` built earlier in the notebook.
df = rds.extract_from_database(query)

In [None]:
# Order rows chronologically, then index them by capture timestamp.
df = df.sort_values('captured_at', ascending=True)
df.index = pd.to_datetime(df.captured_at)
df['hour'] = df.index.hour
# Calendar date of each capture, stored as a string.
df['date'] = df.index.date.astype(str)

In [None]:
# Same whole-day filter as before, but with the AKPD score cutoff lowered to 0.01.
# SamplingFilter is the namedtuple defined in an earlier cell if that cell has been run.
sampling_filter = SamplingFilter(
    start_hour=0,
    end_hour=24,
    kf_cutoff=0.0,
    akpd_score_cutoff=0.01
)

# Rebuild pm_base from the database rows, dropping rows with akpd_score <= 0.01 first.
pm_base = gen_pm_base(df[df.akpd_score > 0.01], sampling_filter)

In [None]:
# Local daily growth rate on 2020-02-11 for the database-backed pm_base.
compute_local_growth_rate(pm_base, '2020-02-11')

In [None]:
# Individual weights around 2020-02-11, with the same positional arguments as the earlier call
# (3, True, True, 0.9); this replaces the earlier `weights`.
weights, _ = generate_smart_individual_values(pm_base, '2020-02-11', 3, True, True, 0.9)

In [None]:
# Bucket edges run from 0 to 11800 in steps of 200, so the buckets cover [0, 11800);
# weights of 11800 or more are left out of the distribution.
dist = get_distribution(weights, np.arange(0, 12000, 200))

In [None]:
# Bar chart of the bucketed weight distribution (percentage of samples per bucket).
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(list(dist.keys()), list(dist.values()))
ax.set_xlabel('Weight Bucket (kg)')
ax.set_ylabel('Frequency (%)')
ax.set_title('Weight Distribution')
# Bucket labels are long, so draw them vertically.
plt.setp(ax.get_xticklabels(), rotation=90)
ax.grid()
plt.show()



In [None]:
# Show the bucket label -> percentage mapping that was plotted above.
dist

In [None]:
import numpy as np
from typing import List, Tuple, Union
from sklearn.linear_model import LinearRegression
from population_metrics.population_metrics_base import generate_pm_base, PopulationMetricsBase, ValidationError
from research.utils.datetime_utils import add_days, day_difference, get_dates_in_range


"""
This module contains functions for computing daily local growth rate. Ask Alok for more 
context behind mathematical formulation.
"""


def compute_growth_rate(X: np.ndarray, y: np.ndarray, n: np.ndarray,
                        decay: float = 0.1) -> Tuple[float, float]:
    """
    Fits a weighted linear regression of log-weight on day number and reports its slope.

    Args:
        X: numpy array representing day number (i.e. day difference between date and today)
        y: numpy array representing log of average weights
        n: numpy array representing raw daily sample sizes
        decay: exponential decay factor applied to data for dates not equal to today
    Returns:
        growth_rate: slope of the fit, i.e. the daily (exponential) growth rate
        error_magnitude_pct: weighted RMS of the relative errors between actual and fitted
                             weights (important for computing trend stability)
    """

    # each day counts in proportion to its sample size, discounted by its distance from day 0
    day_offsets = np.abs(X.squeeze())
    regression_weights = np.multiply(n, np.exp(-decay * day_offsets))

    model = LinearRegression()
    model.fit(X, y, sample_weight=regression_weights)

    # compare actual and fitted values in weight space rather than log space
    fitted_weights = np.exp(model.predict(X))
    relative_errors = (np.exp(y) - fitted_weights) / fitted_weights
    error_magnitude_pct = np.average(relative_errors**2, weights=regression_weights)**0.5

    return float(model.coef_[0]), float(error_magnitude_pct)


def generate_regression_input(pm_base: PopulationMetricsBase, date: str,
                              incorporate_future: bool, window: int = 7,
                              min_days_required: int = 4) -> Tuple:
    """
    Builds the regression inputs for the growth rate fit, validating data availability.

    Args:
        pm_base: PopulationMetricsBase instance
        date: the date the growth rate is computed for
        incorporate_future: if True, the window may end up to window // 2 days after date
        window: number of days between the start and the end of the window
        min_days_required: minimum number of dates with data required inside the window
    Returns:
        X: column vector of day differences between each date in the window and date
        y: log of the average weights for those dates
        n: sample sizes for those dates
    Raises:
        ValidationError: if fewer than min_days_required dates with data fall in the window
    """

    known_dates = pm_base.unique_dates

    # the window ends on `date` unless later dates with data are allowed and available
    end = date
    if incorporate_future:
        latest_allowed = add_days(date, window // 2)
        future_candidates = sorted(d for d in get_dates_in_range(date, latest_allowed) if d in known_dates)
        if future_candidates:
            end = future_candidates[-1]

    start = add_days(end, -window)

    included_dates = sorted(d for d in get_dates_in_range(start, end) if d in known_dates)
    if len(included_dates) < min_days_required:
        raise ValidationError('Insufficient data found for computing growth rate!')

    # slice the per-date series between the first and last included dates
    first_idx = known_dates.index(included_dates[0])
    stop_idx = known_dates.index(included_dates[-1]) + 1
    window_slice = slice(first_idx, stop_idx)

    X = np.array([day_difference(d, date) for d in known_dates[window_slice]]).reshape(-1, 1)
    y = np.log(np.array(pm_base.average_weights[window_slice]))
    n = np.array(pm_base.sample_sizes[window_slice])

    return X, y, n


def compute_local_growth_rate(pm_base: PopulationMetricsBase, date: str, incorporate_future: bool = True) \
        -> Union[float, None]:
    """
    Computes local growth rate on given date.

    Args:
        pm_base: PopulationMetricsBase instance
        date: the date to compute the growth rate for
        incorporate_future: should dates after the given date be allowed in the regression window?
    Returns:
        the daily growth rate, or None (after printing the validation message) when there is
        not enough data in the window to run the regression
    """
    try:
        X, y, n = generate_regression_input(pm_base, date, incorporate_future)
    except ValidationError as err:
        print(str(err))
        return None
    # only the slope is needed here; the fit error is discarded
    growth_rate, _ = compute_growth_rate(X, y, n)
    return growth_rate

In [None]:
from typing import Dict, List, Tuple, Union
import numpy as np
from population_metrics.population_metrics_base import PopulationMetricsBase, ValidationError
from population_metrics.raw_metrics import get_raw_sample_size, get_raw_weight_values, get_raw_kf_values
from population_metrics.growth_rate import compute_local_growth_rate
from population_metrics.confidence_metrics import generate_trend_stability, get_raw_and_historical_weights
from research.utils.datetime_utils import add_days, day_difference, get_dates_in_range

"""
This module contains functions for computing daily level smart features - for example, smart growth rate,
smart distribution, smart average, smart k-factor, smart sample-size, and smart standard deviation. 
Ask Alok for more context behind mathematical formulation.
"""


def get_included_dates(pm_base: PopulationMetricsBase, date: str,
                       max_day_difference: int, incorporate_future: bool) -> List:
    """
    Returns the sorted dates with data that lie within max_day_difference days of date.
    The window always reaches back max_day_difference days; it reaches forward by the same
    amount only when incorporate_future is True. Raises ValidationError if no date with
    data falls inside the window.
    """

    days_ahead = max_day_difference if incorporate_future else 0
    window_start = add_days(date, -max_day_difference)
    window_end = add_days(date, days_ahead)

    included_dates = sorted(d for d in get_dates_in_range(window_start, window_end)
                            if d in pm_base.unique_dates)
    if len(included_dates) == 0:
        raise ValidationError('No raw biomass data found in window!')
    return included_dates


def get_smart_growth_rate(pm_base: PopulationMetricsBase, date: str,
                          incorporate_future: bool = True, apply_growth_rate: bool = True,
                          trend_stability_threshold: float = 0.9) -> float:
    """
    Get local growth rate adjustment to use for smart average computation.

    Args:
        pm_base: PopulationMetricsBase instance
        date: the date to compute the adjustment for
        incorporate_future: should future data be incorporated?
        apply_growth_rate: if False, no adjustment is made and 0.0 is returned
        trend_stability_threshold: the trend stability must exceed this value for the
                                   local growth rate to be used
    Returns:
        the local growth rate if it could be computed, the date has raw samples, and the
        trend is stable enough; otherwise 0.0 (i.e. no adjustment)
    """
    raw_sample_size = get_raw_sample_size(pm_base, date)
    if not apply_growth_rate:
        return 0.0

    try:
        growth_rate = compute_local_growth_rate(pm_base, date, incorporate_future=incorporate_future)
        trend_stability = generate_trend_stability(pm_base, date, incorporate_future=incorporate_future)
    except ValidationError as err:
        print(str(err))
        return 0.0

    # compute_local_growth_rate returns None when there is not enough data for the regression;
    # never hand that back to callers, since they multiply the rate by a day difference
    if growth_rate is None:
        return 0.0

    if raw_sample_size and trend_stability and trend_stability > trend_stability_threshold:
        return growth_rate
    return 0.0


def generate_smart_individual_values(pm_base: PopulationMetricsBase, date: str, max_day_difference: int,
                                     incorporate_future: bool, apply_growth_rate: bool,
                                     trend_stability_threshold: float) -> Tuple:
    """
    Collects the individual weight and k-factor values from every date in the window
    around the given date, rescaling the weights with the smart growth rate.
    Args:
        pm_base: PopulationMetricsBase instance
        date: the date to compute smart individual values for
        max_day_difference: maximum day difference between a window date and the given date
        incorporate_future: whether dates after the given date may be part of the window
        apply_growth_rate: whether weights are adjusted with the local growth rate
        trend_stability_threshold: minimum trend stability required before the growth rate
                                   adjustment is applied (only relevant if apply_growth_rate is True)
    Returns:
        adj_weights: numpy array of growth rate adjusted individual weights in window
        kfs: numpy array of individual k-factor values in window
    """

    # raises ValidationError when the window holds no data at all
    window_dates = get_included_dates(pm_base, date, max_day_difference, incorporate_future)

    # rate used to rescale weights observed on other days (0.0 means no adjustment)
    growth_rate = get_smart_growth_rate(pm_base, date, incorporate_future=incorporate_future,
                                        apply_growth_rate=apply_growth_rate,
                                        trend_stability_threshold=trend_stability_threshold)

    adj_weights, kfs = [], []
    for window_date in window_dates:
        # scale this date's weights by exp(-day_diff * growth_rate)
        raw_weights = np.array(get_raw_weight_values(pm_base, window_date))
        offset_days = day_difference(window_date, date)
        adj_weights.extend(raw_weights * np.exp(-offset_days * growth_rate))

        # k-factors are taken as-is
        kfs.extend(get_raw_kf_values(pm_base, window_date))

    return np.array(adj_weights), np.array(kfs)


def generate_smart_avg_weight(pm_base: PopulationMetricsBase, date: str, max_day_difference: int = 3,
                              incorporate_future: bool = True, apply_growth_rate: bool = True,
                              trend_stability_threshold: float = 0.9) -> Union[float, None]:
    """
    Generates smart average weight on given date: the mean of the growth rate adjusted
    individual weights in the window. Prints the message and returns None on ValidationError.
    """
    try:
        adj_weights, _ = generate_smart_individual_values(pm_base, date, max_day_difference, incorporate_future,
                                                          apply_growth_rate, trend_stability_threshold)
    except ValidationError as err:
        print(str(err))
        return None
    else:
        smart_avg_weight = np.mean(adj_weights)
        return float(smart_avg_weight)


def generate_smart_distribution(pm_base: PopulationMetricsBase, date: str, max_day_difference: int = 3,
                                incorporate_future: bool = True, apply_growth_rate=True,
                                trend_stability_threshold: float = 0.9, bucket_size: int = 100) -> Union[Dict, None]:
    """
    Generates smart distribution on given date.

    Args:
        pm_base: PopulationMetricsBase instance
        date: the date to compute the distribution for
        max_day_difference: maximum day difference of dates in the window
        incorporate_future: should future data be incorporated?
        apply_growth_rate: should a growth rate adjustment be applied?
        trend_stability_threshold: minimum trend stability required for the adjustment
        bucket_size: bucket width, in the same unit as the individual weights; must be positive
    Returns:
        dict keyed by the lower bucket edge multiplied by 1e-3 (rounded to 2 decimals, as a
        string); each value holds 'count' (number of weights in [low, low + bucket_size)) and
        'avgKFactor' (mean of the available k-factors in the bucket, or None if there are none).
        An empty dict is returned when the window contains no weights; None is returned
        (after printing the message) on ValidationError.
    Raises:
        ValueError: if bucket_size is not positive
    """
    if bucket_size <= 0:
        raise ValueError('bucket_size must be positive!')

    try:
        adj_weights, kfs = generate_smart_individual_values(pm_base, date, max_day_difference, incorporate_future,
                                                            apply_growth_rate, trend_stability_threshold)
    except ValidationError as err:
        print(str(err))
        return None

    smart_distribution = dict()
    if len(adj_weights) == 0:
        # nothing to bucket (np.max would fail on an empty array)
        return smart_distribution

    # convert None (and other falsy) values to nan so they can be skipped per bucket
    kfs = np.array([val if val else np.nan for val in kfs], dtype=float)

    # Build bucket edges from integer multiples of bucket_size so that neighbouring buckets
    # share exact edges, and make sure the bucket that holds the maximum weight exists even
    # when that weight sits exactly on an edge.
    num_buckets = int(np.floor(np.max(adj_weights) / bucket_size)) + 1
    for bucket_idx in range(num_buckets):
        low = bucket_idx * bucket_size
        high = low + bucket_size
        mask = (adj_weights >= low) & (adj_weights < high)

        # average only the k-factors that are present; one missing value must not void the bucket
        kfs_for_bucket = kfs[mask]
        kfs_for_bucket = kfs_for_bucket[~np.isnan(kfs_for_bucket)]

        # keys are rounded to 2 decimals, so bucket sizes below 10 produce colliding keys
        smart_distribution[str(round(1e-3 * low, 2))] = {
            'count': int(np.count_nonzero(mask)),
            'avgKFactor': float(np.mean(kfs_for_bucket)) if kfs_for_bucket.size else None
        }

    return smart_distribution


def generate_smart_avg_kf(pm_base: PopulationMetricsBase, date: str, max_day_difference: int = 3,
                          incorporate_future: bool = True, apply_growth_rate: bool = True,
                          trend_stability_threshold: float = 0.9) -> Union[float, None]:
    """
    Generates smart average k-factor on given date: the mean of the truthy individual
    k-factor values in the window, or None when that mean is nan. Prints the message and
    returns None on ValidationError.
    """
    try:
        _, kfs = generate_smart_individual_values(pm_base, date, max_day_difference,
                                                  incorporate_future, apply_growth_rate,
                                                  trend_stability_threshold)
    except ValidationError as err:
        print(str(err))
        return None

    # drop missing (falsy) k-factor values before averaging
    present_kfs = [kf for kf in kfs if kf]
    mean_kf = np.mean(present_kfs)
    if np.isnan(mean_kf):
        return None
    return float(mean_kf)


def generate_smart_standard_deviation(pm_base: PopulationMetricsBase, date: str, max_day_difference: int = 3,
                                      incorporate_future: bool = True, apply_growth_rate: bool = True,
                                      trend_stability_threshold: float = 0.9) -> Union[float, None]:
    """
    Generates smart standard deviation on given date: np.std of the growth rate adjusted
    individual weights in the window. Prints the message and returns None on ValidationError.
    """
    try:
        adj_weights, _ = generate_smart_individual_values(pm_base, date, max_day_difference, incorporate_future,
                                                          apply_growth_rate, trend_stability_threshold)
    except ValidationError as err:
        print(str(err))
        return None
    else:
        smart_std = np.std(adj_weights)
        return float(smart_std)


def get_smart_sample_size(pm_base: PopulationMetricsBase, date: str, max_day_difference: int = 3,
                          incorporate_future: bool = True, apply_growth_rate: bool = True,
                          trend_stability_threshold: float = 0.9) -> int:
    """
    Generates smart sample size on given date: the number of individual weights in the
    window. Prints the message and returns 0 on ValidationError.
    """
    try:
        adj_weights, _ = generate_smart_individual_values(pm_base, date, max_day_difference, incorporate_future,
                                                          apply_growth_rate, trend_stability_threshold)
    except ValidationError as err:
        print(str(err))
        return 0
    else:
        smart_sample_size = len(adj_weights)
        return smart_sample_size