In [None]:
import json
import os
from matplotlib import pyplot as plt
import numpy as np
from PIL import Image
from research.utils.data_access_utils import S3AccessUtils, RDSAccessUtils
from filter_optimization.filter_optimization_task import _add_date_hour_columns
import matplotlib.mlab as mlab
from scipy.stats import norm

<h1> Load Data </h1>

In [None]:
# Read the data-warehouse credentials (JSON file whose path is given by the
# DATA_WAREHOUSE_SQL_CREDENTIALS environment variable) and close the file as
# soon as it has been parsed, instead of leaving the handle to the garbage collector.
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
    sql_credentials = json.load(credentials_file)
rds = RDSAccessUtils(sql_credentials)

In [None]:
# Fetch every column of prod.biomass_computations for pen 56 with
# akpd_score >= 0.9 and captured_at inside the date window below.
# NOTE(review): if captured_at is a timestamp, `between ... and '2020-08-30'`
# stops at 2020-08-30 00:00:00 and so excludes almost all of Aug 30 - confirm.
query = """
    select * from prod.biomass_computations
    where pen_id=56
    and captured_at between '2020-08-21' and '2020-08-30'
    and akpd_score >= 0.9
"""

# Run the query through the RDS helper; the result is used as a DataFrame below.
df = rds.extract_from_database(query)

In [None]:
# Rebind df to the output of the date/hour helper, then build the boolean row
# masks that the following cells combine.
# NOTE(review): df is overwritten with a function of itself, so re-running this
# cell re-applies the helper - confirm the helper is safe to apply twice.
df = _add_date_hour_columns(df)
hour_mask = (df.hour >= 7) & (df.hour <= 15)  # hours 7..15, both ends inclusive
akpd_mask = (df.akpd_score > 0.99)  # stricter than the >= 0.9 cutoff in the SQL query
kf_mask = (df.estimated_k_factor > 1.135)

In [None]:
# Mean estimated weight over rows passing both the hour and the AKPD masks.
df.loc[hour_mask & akpd_mask, 'estimated_weight_g'].mean()

In [None]:
# Mean estimated weight over rows passing the hour mask only.
df.loc[hour_mask, 'estimated_weight_g'].mean()

In [None]:
# Mean estimated weight over rows passing both the k-factor and the AKPD masks.
df.loc[kf_mask & akpd_mask, 'estimated_weight_g'].mean()

In [None]:
# Mean estimated weight over rows passing the AKPD mask only.
df.loc[akpd_mask, 'estimated_weight_g'].mean()

<h1> Histogram of weights below minimum acceptable weight </h1>

In [None]:
# Overlay the weights selected by hour_mask (red) on the histogram of all weights.
akpd_mask = (df.akpd_score > 0.99)

plt.figure(figsize=(20, 10))

# Draw the full distribution first and keep its bin edges. Passing bins=100 to
# each call separately would let every histogram pick its own edges, giving bars
# of different widths whose counts cannot be compared.
all_counts, weight_bin_edges, all_patches = plt.hist(
    df.estimated_weight_g, bins=100, label='all samples')
plt.hist(df[hour_mask].estimated_weight_g, bins=weight_bin_edges, color='red',
         label='hour_mask samples')

# NOTE(review): 1120 is presumably the "minimum acceptable weight" named in the
# section header - confirm and promote it to a named constant.
plt.axvline(1120, color='red', linestyle='--', label='1120 g threshold')
plt.xlabel('estimated_weight_g')
plt.ylabel('sample count')
plt.legend()
plt.grid()
plt.show()

<h1> Curve fitting on part of distribution </h1>

In [None]:
import argparse
from collections import defaultdict
import json
import os
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.dates as mdates
import numpy as np
import pandas as pd

from filter_optimization.filter_optimization_task import NoDataException, SamplingFilter, generate_filter_mask, \
     extract_biomass_data
from population_metrics.population_metrics_base import generate_pm_base, PopulationMetricsBase
from population_metrics.growth_rate import compute_local_growth_rate
from population_metrics.raw_metrics import get_raw_kf_values, generate_raw_average_weight, get_raw_sample_size
from population_metrics.smart_metrics import generate_smart_avg_weight, generate_smart_individual_values, \
     generate_smart_distribution, generate_smart_avg_kf, get_smart_sample_size, get_smart_growth_rate, \
     generate_smart_standard_deviation
from population_metrics.confidence_metrics import generate_trend_stability, generate_distribution_consistency, \
     compute_biomass_kpi, get_raw_and_historical_weights
from research.utils.datetime_utils import get_dates_in_range
from research.utils.data_access_utils import S3AccessUtils, RDSAccessUtils

In [None]:
# Read the data-warehouse credentials (JSON file whose path is given by the
# DATA_WAREHOUSE_SQL_CREDENTIALS environment variable) and close the file as
# soon as it has been parsed, instead of leaving the handle to the garbage collector.
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
    sql_credentials = json.load(credentials_file)
rds = RDSAccessUtils(sql_credentials)


def gen_pm_base(df: pd.DataFrame, sampling_filter: SamplingFilter) -> PopulationMetricsBase:
    """
    Build a PopulationMetricsBase from the rows of ``df`` that pass ``sampling_filter``.

    Args:
        df: biomass computations data-frame; the ``date``, ``estimated_weight_g``
            and ``estimated_k_factor`` columns are read here.
        sampling_filter: filter handed to ``generate_filter_mask`` to obtain the row mask.

    Returns:
        Whatever ``generate_pm_base`` produces for the list of
        (date, estimated weight, estimated k-factor) tuples of the selected rows.

    Raises:
        NoDataException: if no row passes the filter.
    """

    keep_rows = generate_filter_mask(df, sampling_filter)
    selected = df[keep_rows]

    # one (date, weight, k-factor) tuple per selected row
    dates = selected.date.values
    weights_g = selected.estimated_weight_g.values
    k_factors = selected.estimated_k_factor.values
    records = list(zip(dates, weights_g, k_factors))

    # nothing to build an estimator from
    if len(records) == 0:
        raise NoDataException('No data found for given filter!')

    return generate_pm_base(records)


def generate_ts_data(df: pd.DataFrame, sampling_filter: SamplingFilter) -> defaultdict:
    """
    Compute per-date time series of raw, growth-rate, confidence and smart metrics.

    A PopulationMetricsBase is built from ``df`` and ``sampling_filter`` via
    ``gen_pm_base``; every date returned by ``get_dates_in_range`` between its
    first and last unique date is then evaluated with each metric function.

    Returns:
        defaultdict(list) with a ``'date'`` entry plus one list per metric,
        each metric list holding one value per date.
    """

    pm_base = gen_pm_base(df, sampling_filter)
    first_date = pm_base.unique_dates[0]
    last_date = pm_base.unique_dates[-1]
    dates = get_dates_in_range(first_date, last_date)

    # (output key, metric function) pairs, in the order they are evaluated and stored
    metric_table = (
        # raw metrics
        ('raw_average_weight', generate_raw_average_weight),
        ('raw_sample_size', get_raw_sample_size),
        # growth rate metrics
        ('growth_rate', compute_local_growth_rate),
        # confidence metrics
        ('distribution_consistency', generate_distribution_consistency),
        ('kpi', compute_biomass_kpi),
        # smart metrics
        ('smart_average_weight', generate_smart_avg_weight),
        ('smart_average_kf', generate_smart_avg_kf),
        ('smart_sample_size', get_smart_sample_size),
        ('smart_growth_rate', get_smart_growth_rate),
    )

    ts_data = defaultdict(list)
    ts_data['date'].extend(dates)
    for day in dates:
        for key, compute_metric in metric_table:
            ts_data[key].append(compute_metric(pm_base, day))

    return ts_data




In [None]:
# Pen and date window analysed in the curve-fitting section.
pen_id = 60
start_date = '2020-08-15'
end_date = '2020-08-25'

# Sampling filter settings; its AKPD cutoff is also passed to the extraction call.
sampling_filter = SamplingFilter(
    start_hour=7,
    end_hour=15,
    kf_cutoff=1.135,
    akpd_score_cutoff=0.95,
)

# NOTE: this rebinds df, replacing the pen 56 frame loaded in the first section.
df = extract_biomass_data(pen_id, start_date, end_date, sampling_filter.akpd_score_cutoff)

In [None]:
# Build the population-metrics base from the filtered biomass data
# (gen_pm_base raises NoDataException when the filter selects no rows).
pm_base = gen_pm_base(df, sampling_filter)

In [None]:
# Smart individual values from pm_base for 2020-08-24; the second return value is discarded.
# NOTE(review): the meaning of the positional arguments (3, True, True, 0.9) is
# set by generate_smart_individual_values - confirm against its signature.
round_weights, _ = generate_smart_individual_values(pm_base, '2020-08-24', 3, True, True, 0.9)
# Scale every weight by 0.83.
# NOTE(review): presumably a round-weight to gutted-weight conversion, going by
# the variable name - confirm and promote 0.83 to a named constant.
weights = 0.83 * round_weights

In [None]:
# Mean of the 0.83-scaled weights, before any truncation to the fitting window.
np.mean(weights)

In [None]:
from scipy.stats import truncnorm, norm

In [None]:
import numpy as np
from scipy.stats import truncnorm
from scipy.optimize import fmin_slsqp

import matplotlib.pyplot as plt


def func(p, r, xa, xb):
    """
    Objective for the fit: negative log-likelihood of the sample ``r`` under a
    truncated normal with parameters ``p`` (as accepted by ``truncnorm.nnlf``).

    ``xa`` and ``xb`` are not used; they are accepted so this function has the
    same signature as ``constraint``.
    """
    neg_log_likelihood = truncnorm.nnlf(p, r)
    return neg_log_likelihood


def constraint(p, r, xa, xb):
    """
    Equality-constraint residuals for the fit.

    ``p`` unpacks to ``(a, b, loc, scale)``. The two residuals are the distance
    of ``a*scale + loc`` from ``xa`` and of ``b*scale + loc`` from ``xb``; both
    are zero when the parameters place the truncation bounds exactly at
    ``xa`` and ``xb``. ``r`` is not used.
    """
    lower_std, upper_std, loc, scale = p
    lower_bound = lower_std * scale + loc
    upper_bound = upper_std * scale + loc
    return np.array([lower_bound - xa, upper_bound - xb])


# Fit a truncated normal to the part of `weights` that lies inside (lo, hi),
# then plot the corresponding untruncated normal over the full histogram.

# Select the window of weights the fit is based on (bounds are exclusive).

lo, hi = 2500, 6000
mask = (weights > lo) & (weights < hi)
vals = weights[mask]

# Standardise the selected weights with their own mean/std; xa, xb are the
# window bounds expressed in the same standardised units.
u, sigma = np.mean(vals), np.std(vals)
xa, xb = (lo - u) / sigma, (hi - u) / sigma
r = (vals - np.mean(vals)) / np.std(vals)

# Start from a standard normal (loc 0, scale 1) ...
loc_guess = 0
scale_guess = 1

# ... with a, b chosen so the initial guess already satisfies the constraint.
a_guess = (xa - loc_guess)/scale_guess
b_guess = (xb - loc_guess)/scale_guess
p0 = [a_guess, b_guess, loc_guess, scale_guess]

# Minimise func (the truncnorm negative log-likelihood) over (a, b, loc, scale),
# with `constraint` supplied as the equality constraints.
par = fmin_slsqp(func, p0, f_eqcons=constraint, args=(r, xa, xb),
                 iprint=True, iter=1000)

fig, ax = plt.subplots(1, 1, figsize=(20, 10))
# Map the fitted loc/scale from standardised units back to weight units.
cf_mean, cf_std = sigma * par[2] + u, sigma * par[3]
x = np.linspace(0, 10000, 1000)
# NOTE(review): the 0.83 factor scales the plotted pdf so it no longer integrates
# to 1; `weights` was already multiplied by 0.83 when it was built - confirm this
# second factor is intended.
ax.plot(x, 0.83 * norm.pdf(x, cf_mean, cf_std), 'k--', lw=1, alpha=1.0, label='norm fit')
# The histogram shows all weights, not only the `vals` window used for the fit.
ax.hist(weights, bins=15, density=True, histtype='stepfilled', alpha=0.3)
ax.legend(shadow=True)
plt.grid(True)
plt.show()


In [None]:
# Read the AWS credentials and close the file as soon as it has been parsed.
with open(os.environ['AWS_CREDENTIALS']) as credentials_file:
    s3 = S3AccessUtils('/root/data', json.load(credentials_file))
cohort_name = 'vikane_pen_id_60_2020-08-05_2020-08-30'

# URL segments are always separated by '/', so join them explicitly;
# os.path.join would use the platform's path separator instead.
s3_dir = '/'.join([
    'https://aquabyte-images-adhoc.s3-eu-west-1.amazonaws.com/alok/production_datasets',
    cohort_name
])

# Download the cohort's ground-truth metadata and parse it; only the first
# element returned by download_from_url is used (it is opened as a file path).
ground_truth_metadata_url = '/'.join([s3_dir, 'ground_truth_metadata.json'])
ground_truth_f, _, _ = s3.download_from_url(ground_truth_metadata_url)
with open(ground_truth_f) as metadata_file:
    ground_truth_metadata = json.load(metadata_file)

In [None]:
# Percentage of the fitted normal (cf_mean, cf_std) falling in each 1000-wide
# weight bucket, keyed by '<low>-<high>' and rounded to two decimals.
# NOTE(review): np.arange(0, 10000, 1000) ends at 9000, so the last bucket is
# 8000-9000 and mass above 9000 is not reported - confirm that is intended.
bucket_cutoffs = np.arange(0, 10000, 1000)
pred_distribution = {}
for low_weight, high_weight in zip(bucket_cutoffs, bucket_cutoffs[1:]):
    bucket = '{}-{}'.format(low_weight, high_weight)
    # probability mass between the two cutoffs = difference of the CDF values
    pct = norm.cdf(high_weight, cf_mean, cf_std) - norm.cdf(low_weight, cf_mean, cf_std)
    pred_distribution[bucket] = round(100 * pct, 2)
    

In [None]:
# Display the predicted per-bucket percentages from the fitted normal.
pred_distribution

In [None]:
# Display the downloaded ground-truth metadata for comparison with the prediction.
ground_truth_metadata

In [None]:
# Relative difference between the fitted mean and 3515.
# NOTE(review): 3515 is presumably the ground-truth mean weight for this
# cohort - confirm and replace with a named constant or a lookup.
(cf_mean - 3515) / 3515

In [None]:

# Plot the fitted truncated normal (parameters `par`) against the histogram of
# the standardised sample `r`.
# NOTE: this rebinds the global `x` (previously the 0-10000 weight grid) to a
# grid in standardised units.
xmin = -2
xmax = 4
x = np.linspace(xmin, xmax, 1000)

fig, ax = plt.subplots(1, 1)
ax.plot(x, truncnorm.pdf(x, *par),
        'k--', lw=1, alpha=1.0, label='truncnorm fit')
ax.hist(r, bins=15, density=True, histtype='stepfilled', alpha=0.3)
ax.legend(shadow=True)
plt.xlim(xmin, xmax)
plt.grid(True)

plt.show()

In [None]:
# Display the fitted mean, expressed in weight units.
cf_mean

In [None]:
# Fitted normal density scaled by the number of rows in df, evaluated on a grid
# of weights. The grid is built here explicitly: by this point the global `x`
# holds a standardised grid on [-2, 4], which does not match cf_mean / cf_std
# (both in weight units).
weight_grid = np.linspace(0, 10000, 1000)
df.shape[0] * norm.pdf(weight_grid, cf_mean, cf_std)