In [None]:
import json
import os
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
from filter_optimization.filter_optimization_task import extract_biomass_data, _add_date_hour_columns
from research.weight_estimation.keypoint_utils.optics import euclidean_distance, pixel2world, depth_from_disp, convert_to_world_point
from report_generation.report_generator import generate_ts_data, SamplingFilter, gen_pm_base
from research.utils.datetime_utils import add_days
from population_metrics.smart_metrics import generate_smart_avg_weight, generate_smart_individual_values, ValidationError

from population_metrics.population_metrics_base import PopulationMetricsBase, ValidationError
from population_metrics.raw_metrics import get_raw_sample_size, get_raw_weight_values, get_raw_kf_values
from population_metrics.growth_rate import compute_local_growth_rate
from population_metrics.confidence_metrics import generate_trend_stability, get_raw_and_historical_weights

from research.utils.data_access_utils import S3AccessUtils, RDSAccessUtils
from research.utils.datetime_utils import add_days, day_difference, get_dates_in_range

from scipy import stats
import statsmodels.api as sm

plt.rcParams['font.size'] = 18

In [None]:
# Read the AWS credentials JSON named by the AWS_CREDENTIALS environment variable inside a
# context manager so the file handle is closed deterministically (the previous
# `json.load(open(...))` left it open until garbage collection).
with open(os.environ['AWS_CREDENTIALS']) as credentials_file:
    aws_credentials = json.load(credentials_file)

s3 = S3AccessUtils('/root/data', aws_credentials)

In [None]:
# Cohorts available to this notebook. Each name is used both as the S3 sub-directory to download
# from and as the key into the `dfs` / `gt_metadatas` dicts filled by the download loop.
# NOTE(review): names appear to follow <site>_pen_id_<id>_<start date>_<end date> — confirm.
cohort_names = [
    'seglberget_pen_id_66_2020-05-13_2020-06-13',
    'bolaks_pen_id_88_2020-02-10_2020-03-10',
    'langoy_pen_id_108_2020-05-07_2020-05-17',
    'tittelsnes_pen_id_37_2020-05-23_2020-06-24',
    'aplavika_pen_id_95_2020-06-26_2020-07-26',
    'kjeppevikholmen_pen_id_5_2019-06-05_2019-07-02',
    'silda_pen_id_86_2020-06-19_2020-07-19',
    'vikane_pen_id_60_2020-08-05_2020-08-30',
    'eldviktaren_pen_id_164_2020-09-06_2020-10-06',
    'habranden_pen_id_100_2020-08-10_2020-08-31'
]

In [None]:
ROOT_DIR = '/root/data/alok/biomass_estimation/playground'
batch_name = 'simulation'

# Per-cohort inputs, keyed by cohort name:
#   dfs          -> annotation dataframe after _add_date_hour_columns
#   gt_metadatas -> parsed ground_truth_metadata.json
dfs, gt_metadatas = {}, {}
for cohort_name in cohort_names:
    s3_dir = os.path.join(
        'https://aquabyte-images-adhoc.s3-eu-west-1.amazonaws.com/alok/production_datasets',
        cohort_name
    )

    # Ground-truth metadata is downloaded to an explicit local path under ROOT_DIR.
    ground_truth_metadata_url = os.path.join(s3_dir, 'ground_truth_metadata.json')
    ground_truth_key_base = os.path.join(batch_name, cohort_name, 'ground_truth_metadata.json')
    ground_truth_f = os.path.join(ROOT_DIR, ground_truth_key_base)
    s3.download_from_url(ground_truth_metadata_url, custom_location=ground_truth_f)
    # Use a context manager: the previous `json.load(open(...))` leaked one file handle per cohort.
    with open(ground_truth_f) as metadata_file:
        gt_metadata = json.load(metadata_file)
    gt_metadatas[cohort_name] = gt_metadata

    # Annotation CSV: the local path is the first element of the tuple returned by the helper.
    data_url = os.path.join(s3_dir, 'annotation_dataset.csv')
    data_f, _, _ = s3.download_from_url(data_url)
    df = pd.read_csv(data_f)
    df = _add_date_hour_columns(df)
    dfs[cohort_name] = df
    
    

In [None]:
from typing import Dict, List, Tuple, Union

def get_included_dates(pm_base: PopulationMetricsBase, date: str,
                       max_day_difference: int, incorporate_future: bool) -> List:
    """
    Return the sorted dates around `date` that are present in `pm_base.unique_dates`.

    The candidate window starts `max_day_difference` days before `date`. It extends
    `max_day_difference` days past `date` when `incorporate_future` is True and stops at
    `date` otherwise.

    Raises:
        ValidationError: if none of the candidate dates is in `pm_base.unique_dates`.
    """

    days_forward = max_day_difference if incorporate_future else 0
    window_start = add_days(date, -max_day_difference)
    window_end = add_days(date, days_forward)

    available_dates = []
    for candidate in get_dates_in_range(window_start, window_end):
        if candidate in pm_base.unique_dates:
            available_dates.append(candidate)
    available_dates.sort()

    if len(available_dates) == 0:
        raise ValidationError('No raw biomass data found in window!')
    return available_dates

def get_smart_growth_rate(pm_base: PopulationMetricsBase, date: str,
                          incorporate_future: bool = True, apply_growth_rate: bool = True,
                          trend_stability_threshold: float = 0.9) -> float:
    """
    Pick the growth rate used to adjust weights for the smart average on `date`.

    The locally computed growth rate is returned only when all of the following hold:
    `apply_growth_rate` is True, the raw sample size for `date` is truthy, and the trend
    stability is truthy and strictly above `trend_stability_threshold`. In every other case,
    including a ValidationError from either computation (which is printed, not re-raised),
    the result is 0.0.
    """

    # The sample size is looked up first, regardless of apply_growth_rate.
    sample_size = get_raw_sample_size(pm_base, date)

    if not apply_growth_rate:
        return 0.0

    try:
        local_rate = compute_local_growth_rate(pm_base, date, incorporate_future=incorporate_future)
        stability = generate_trend_stability(pm_base, date, incorporate_future=incorporate_future)
        if sample_size and stability and stability > trend_stability_threshold:
            return local_rate
    except ValidationError as err:
        print(str(err))

    return 0.0

def generate_smart_individual_values(pm_base: PopulationMetricsBase, date: str, max_day_difference: int,
                                     incorporate_future: bool, apply_growth_rate: bool,
                                     trend_stability_threshold: float) -> np.ndarray:
    """
    Generate growth-rate-adjusted individual weights for the window around the given date.

    NOTE: this notebook-local definition shadows the `generate_smart_individual_values`
    imported from `population_metrics.smart_metrics` in the notebook's import cell. It returns
    only the adjusted weights; the earlier docstring and `Tuple` annotation advertised
    k-factors as well, but they were collected and then discarded.

    Args:
        pm_base: PopulationMetricsBase instance
        date: the date to compute smart individual values for
        max_day_difference: what is the maximum day difference of dates in the window?
        incorporate_future: should future data be incorporated?
        apply_growth_rate: should we apply a growth rate adjustment?
        trend_stability_threshold: if apply_growth_rate is True, what minimum trend_stability_threshold
                                   should we mandate for growth rate adjustment?
    Returns:
        1-D numpy array of the individual weights from every included date, each multiplied by
        exp(-day_diff * growth_rate), where day_diff = day_difference(included_date, date).
    Raises:
        ValidationError: propagated from get_included_dates when the window holds no data.
    """

    # validate data
    included_dates = get_included_dates(pm_base, date, max_day_difference, incorporate_future)

    # compute local growth rate to use for smart average (0.0 means "no adjustment")
    growth_rate_for_smart_metrics = get_smart_growth_rate(pm_base, date, incorporate_future=incorporate_future,
                                                          apply_growth_rate=apply_growth_rate,
                                                          trend_stability_threshold=trend_stability_threshold)

    # collect the adjusted weights of every date in the window
    adj_weights = []
    for d in included_dates:
        weights_for_date = get_raw_weight_values(pm_base, d)
        day_diff = day_difference(d, date)
        adj_weights_for_date = np.array(weights_for_date) * np.exp(-day_diff * growth_rate_for_smart_metrics)
        adj_weights.extend(adj_weights_for_date)

    return np.array(adj_weights)

def generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate,
                                   max_day_diff, days_post_feeding, final_days_post_feeding):
    """
    Return the individual weights for the date `days_post_feeding` days after the cohort's
    last feeding date.

    The target date is `gt_metadata['last_feeding_date']` shifted by `days_post_feeding`.
    Weights come from `generate_smart_individual_values` with future data incorporated and a
    trend-stability threshold of 0.9. `start_hour`, `end_hour` and `final_days_post_feeding`
    are accepted but not used by this function.
    """
    target_date = add_days(gt_metadata['last_feeding_date'], days_post_feeding)
    return generate_smart_individual_values(
        pm_base, target_date, max_day_diff, True, apply_growth_rate, 0.9
    )

In [None]:
# https://stackoverflow.com/questions/21100716/fast-arbitrary-distribution-random-sampling-inverse-transform-sampling

from scipy.interpolate import interp1d

def inverse_sample_decorator(dist):
    """
    Turn a density function into a random sampler via inverse-transform sampling.

    `dist(x, **kwargs)` is evaluated on an evenly spaced grid; its running sum, shifted to
    start at 0 and scaled to end at 1, acts as a numerical CDF. Uniform draws from the global
    numpy RNG are then mapped back onto the grid by linear interpolation.

    The returned sampler has the signature
    `wrapper(pnts, x_min=0, x_max=20000, n=1e5, **kwargs)`, where `pnts` is the number of
    samples to draw, `[x_min, x_max]` is the grid range and `n` the number of grid points.
    """

    def wrapper(pnts, x_min=0, x_max=20000, n=1e5, **kwargs):
        grid = np.linspace(x_min, x_max, int(n))

        # Numerical CDF: cumulative sum of the density, normalised onto [0, 1].
        cdf = np.cumsum(dist(grid, **kwargs))
        cdf -= cdf.min()
        inverse_cdf = interp1d(cdf / cdf.max(), grid)

        uniform_draws = np.random.random(pnts)
        return inverse_cdf(uniform_draws)

    return wrapper

In [None]:
def get_length_from_weight(weight):
    """
    Convert a weight to a length with a cube-root law: weight ** (1/3) / 23.6068.

    NOTE(review): the units of `weight` and of the result are not stated in this notebook —
    confirm before reusing the constant elsewhere.
    """
    cube_root = weight ** (1/3)
    return cube_root / 23.6068

def get_simulated_weights(degrees, density, pdf, max_iter):
    """
    Simulate which fish a camera records and return the weights of the recorded fish.

    Each iteration places `int(total_length * density)` fish on a 1-D strip: weights are drawn
    from `pdf` through `inverse_sample_decorator` (using that sampler's default grid),
    locations uniformly on [0, total_length) and depths uniformly on [0, params_depth).
    Fish are then visited from the smallest depth to the largest. A fish is recorded when it
    lies entirely inside the band visible at its depth and the occlusion test against the
    previously collected segments does not flag it.

    Iterations repeat until at least 3000 weights are collected or `max_iter` iterations have
    run, so the result can hold fewer or more than 3000 values.

    Args:
        degrees: field of view in degrees (converted to radians below).
        density: fish per unit of strip length.
        pdf: density function handed to `inverse_sample_decorator`.
        max_iter: maximum number of iterations.
    Returns:
        list of the recorded weights (column 3 of the recorded rows).

    Randomness comes from the global numpy RNG, so results are only reproducible if the
    caller seeds it.
    """
    sample_pdf = inverse_sample_decorator(pdf)
    
    fov = degrees * np.pi / 180
    # Fixed scene geometry: maximum depth, camera position on the strip, strip length.
    params_depth = 2
    camera_location = 5
    total_length = 10

    all_weights = []

    num_samples = int(total_length * density)
    
    count = 0
    
    while(len(all_weights) < 3000 and count < max_iter):
        count = count + 1
#         if count % 5000 == 0:
#             print(count)
            
        sampled_weights = sample_pdf(num_samples)

        x = []

        # One row per fish: [location, length, depth, weight].
        for weight in sampled_weights:
            location = np.random.uniform(0, total_length)
            length = get_length_from_weight(weight)
            depth = np.random.uniform(0, params_depth)

            x.append([location, length, depth, weight])

        a = np.array(x)
        # Sort rows by depth (column 2), smallest depth first.
        b = a[np.argsort(a[:, 2])]

        # Rows collected so far in this iteration; each starts empty per iteration.
        all_segments = []
        curr_segments = []  # never read or written again in this function
        curr_depth = 0

        for row in b:
            curr_depth = row[2]

            # Half-width of the visible band at this depth, centred on the camera.
            band = np.tan(fov / 2) * curr_depth

            lower_bound = camera_location - band
            upper_bound = camera_location + band
            
            # Fish that are not entirely inside the band are never recorded. They are still
            # added to all_segments when their start or their end falls inside the band.
            if not ((row[0] > lower_bound) and (row[0] + row[1] < upper_bound)):
                if (row[0] > lower_bound) and (row[0] < upper_bound):
                    all_segments.append(row)
                elif ((row[0] + row[1]) > lower_bound) and ((row[0] + row[1]) < upper_bound):
                    all_segments.append(row)
                continue

            is_occluded = False

            for seg in all_segments:
                # NOTE(review): both adjusted bounds are built from the current row's own
                # endpoints scaled by curr_depth / seg[2]; the earlier segment's position
                # (seg[0], seg[1]) is never read. If the intent was to project the earlier
                # segment onto the current depth, this looks like a bug — confirm.
                # NOTE(review): seg[2] == 0 would divide by zero here — confirm this cannot
                # occur with the uniform depth draw above.
                lower_adj_segment = camera_location + (row[0] - camera_location) * curr_depth / seg[2]
                upper_adj_segment = camera_location + ((row[0] + row[1]) - camera_location) * curr_depth / seg[2]

                if not ((row[0] + row[1]) < lower_adj_segment or row[0] > upper_adj_segment):
                    is_occluded = True

            if not is_occluded:
                all_weights.append(row[3])
                
            # Every fully-in-band fish joins all_segments, whether or not it was recorded.
            all_segments.append(row)

    return all_weights


In [None]:
'''
This implements a Gram-Charlier expansion of the normal distribution where the first 2 moments coincide with those of the normal distribution but skew and kurtosis can deviate from it.
'''
# https://www.statsmodels.org/devel/generated/statsmodels.sandbox.distributions.extras.pdf_mvsk.html#statsmodels.sandbox.distributions.extras.pdf_mvsk
# Not used: https://github.com/gregversteeg/gaussianize
# Not used: https://stats.stackexchange.com/questions/20445/how-to-transform-data-to-normality
# Inspiration: https://stats.stackexchange.com/questions/43482/transformation-to-increase-kurtosis-and-skewness-of-normal-r-v

from statsmodels.sandbox.distributions.extras import pdf_mvsk

def simulate_with_params(mean_factor, sd_factor, density, max_iter, weights=None, fov=55):
    """
    Run the camera simulation for one (mean_factor, sd_factor) hypothesis.

    The hypothesised underlying distribution is a Gram-Charlier expanded normal
    (`pdf_mvsk`) with mean `mean(weights) / mean_factor`, standard deviation
    `std(weights) / sd_factor`, skew 0 and kurtosis argument 1.

    Args:
        mean_factor: divisor applied to the observed mean.
        sd_factor: divisor applied to the observed standard deviation.
        density: fish density forwarded to `get_simulated_weights`.
        max_iter: iteration cap forwarded to `get_simulated_weights`.
        weights: observed weights the hypothesis is derived from. When omitted, the
            notebook-level `original_weights` is used, which keeps existing four-argument
            calls working exactly as before.
        fov: field of view in degrees forwarded to `get_simulated_weights` (previously a
            hard-coded 55).
    Returns:
        list of simulated recorded weights from `get_simulated_weights`.
    """
    print('Simulation: mean_factor: %0.2f, sd_factor: %0.2f, density: %i, max_iter: %i' % (mean_factor, sd_factor, density, max_iter))

    if weights is None:
        # Backward-compatible fallback to the notebook global set by the sweep cells.
        weights = original_weights

    estimated_initial_weight = np.mean(weights) / mean_factor
    estimated_initial_sd = np.std(weights) / sd_factor

    # pdf_mvsk takes [mean, variance, skew, kurtosis], hence the squared SD.
    new_pdf = pdf_mvsk([estimated_initial_weight, estimated_initial_sd ** 2, 0, 1])

    simulated_weights = get_simulated_weights(fov, density, new_pdf, max_iter)

    return simulated_weights

In [None]:
def get_error(simulated_weights, original_weights, verbose):
    """
    Compare a simulated weight sample against the observed one.

    Returns a 4-tuple:
        weight_error:   percentage difference of the simulated mean relative to the original mean
        sd_error:       percentage difference of the simulated SD relative to the original SD
        max_dist_error: largest absolute per-bucket difference in share, in percentage points
        avg_dist_error: mean absolute per-bucket difference in share, in percentage points

    Buckets are 1000 wide from 0 to 10000; a value v belongs to a bucket when
    lower < v <= upper. With `verbose` set, summary statistics and per-bucket differences
    are printed, and overlaid histograms plus a grouped bar chart are drawn.
    """
    sim_mean, sim_sd = np.mean(simulated_weights), np.std(simulated_weights)
    orig_mean, orig_sd = np.mean(original_weights), np.std(original_weights)

    if verbose:
        print(sim_mean, sim_sd, len(simulated_weights))
        print(orig_mean, orig_sd)

    weight_error = (sim_mean - orig_mean) / orig_mean * 100
    sd_error = (sim_sd - orig_sd) / orig_sd * 100

    if verbose:
        plt.hist(simulated_weights, alpha = 0.5, color = 'blue', density = True)
        plt.hist(original_weights, alpha = 0.5, color = 'red', density = True)

    edges = list(range(0, 10001, 1000))
    bar_positions = np.array(edges[:-1])

    orig_arr = np.array(original_weights)
    sim_arr = np.array(simulated_weights)

    orig_shares, sim_shares, abs_diffs = [], [], []

    for lower, upper in zip(edges[:-1], edges[1:]):
        in_orig = (orig_arr > lower) & (orig_arr <= upper)
        in_sim = (sim_arr > lower) & (sim_arr <= upper)

        orig_share = np.sum(in_orig) / len(in_orig)
        sim_share = np.sum(in_sim) / len(in_sim)
        orig_shares.append(orig_share)
        sim_shares.append(sim_share)

        abs_diffs.append(np.abs(100 * (orig_share - sim_share)))

        if verbose:
            print('%i: %0.2f%%' % (lower, 100 * (orig_share - sim_share)))

    worst_diff = np.max(abs_diffs)
    mean_diff = np.mean(abs_diffs)

    if verbose:
        print(worst_diff)
        print(mean_diff)

        plt.figure(figsize=(20, 10))
        plt.bar(bar_positions - 150, orig_shares, color = 'red', width = 150, label = 'Original')
        plt.bar(bar_positions, sim_shares, color = 'blue', width = 150, label = 'Dedup')

    return weight_error, sd_error, worst_diff, mean_diff

In [None]:
'''
Scratchwork
'''

In [None]:
# np.mean(weights), np.std(original_weights)
# print(estimated_initial_weight, estimated_initial_sd)

In [None]:
# sample_pdf = inverse_sample_decorator(new_pdf)

# plt.hist(sample_pdf(10000), density = True, bins = 50)

# vec = np.arange(0, 10000, 1)
# example_pdf = new_pdf(vec)

# plt.plot(vec, example_pdf, lw = 4, color = 'green')

In [None]:
'''
1. Calculate empirical SD, mean
2. Initialize Gram-Charlier distribution to empirical SD * 1.06, empirical mean * 0.965, density to 5
3. Run gradient search to find minimum distribution deviation via simulation
4. Resulting distribution is the ground truth distribution
'''

In [None]:
# Sweep configuration for the Vikane cohort.
cohort_name = 'vikane_pen_id_60_2020-08-05_2020-08-30'
start_hour = 5
end_hour = 15
apply_growth_rate = True
max_day_diff = 3
final_days_post_feeding = 1
max_final_days_post_feeding = 1
loss_factor = 0.17

# NOTE(review): `days_post_feeding` is passed to generate_raw_individual_values below but was
# never assigned anywhere in this notebook, so a fresh-kernel run raised NameError. It is
# assumed here to equal `final_days_post_feeding` — confirm the intended value.
days_post_feeding = final_days_post_feeding

gt_metadata = gt_metadatas[cohort_name]

sampling_filter = SamplingFilter(
    start_hour=start_hour,
    end_hour=end_hour,
    kf_cutoff=0.0,
    akpd_score_cutoff=0.01
)
df = dfs[cohort_name]
final_date_post_feeding = add_days(gt_metadata['last_feeding_date'], final_days_post_feeding)
# Keep rows dated from 14 days before final_date_post_feeding through that date (inclusive).
tdf = df[(df.date <= final_date_post_feeding) & (df.date >= add_days(final_date_post_feeding, -14))]
pm_base = gen_pm_base(tdf, sampling_filter)

# simulate_with_params reads this notebook-level name when no weights are passed explicitly.
original_weights = generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)

# One entry per simulated grid point, appended in sweep order (all lists stay index-aligned).
mean_factors = []
sd_factors = []

sim_avg_weights = []
sim_sds = []
sim_len = []
sim_raw_weights = []

weight_errors = []
sd_errors = []
max_dist_errors = []
avg_dist_errors = []

density = 5
max_iter = 10000

# Grid search over the (mean_factor, sd_factor) hypotheses.
for mean_factor in np.arange(0.95, .98, .005):
    for sd_factor in np.arange(1, 1.08, .01):
        simulated_weights = simulate_with_params(mean_factor, sd_factor, density, max_iter)
        weight_error, sd_error, max_dist_error, avg_dist_error = get_error(simulated_weights, original_weights, False)
        print('weight_error: %0.2f, sd_error: %0.2f, max_dist_error: %0.2f, avg_dist_error: %0.2f' % (weight_error, sd_error, max_dist_error, avg_dist_error))

        mean_factors.append(mean_factor)
        sd_factors.append(sd_factor)

        sim_avg_weights.append(np.mean(simulated_weights))
        sim_raw_weights.append(simulated_weights)
        sim_sds.append(np.std(simulated_weights))
        sim_len.append(len(simulated_weights))

        weight_errors.append(weight_error)
        sd_errors.append(sd_error)
        max_dist_errors.append(max_dist_error)
        avg_dist_errors.append(avg_dist_error)

In [None]:
# Collect the sweep results into one row per grid point, then reshape every error metric into
# a table indexed by mean factor (rows) and SD factor (columns).
result_columns = ['mean_factors', 'sd_factors', 'weight_errors', 'sd_errors', 'max_dist_errors', 'avg_dist_errors']
output = pd.DataFrame(
    np.column_stack((mean_factors, sd_factors, weight_errors, sd_errors, max_dist_errors, avg_dist_errors)),
    columns = result_columns
)

pivot1, pivot2, pivot3, pivot4 = (
    output.pivot(index = 'mean_factors', columns = 'sd_factors', values = metric)
    for metric in result_columns[2:]
)

# Combined score: average of the four absolute error tables.
pivot_all = 0.25 * (pivot1.abs() + pivot2.abs() + pivot3.abs() + pivot4.abs())

# Average of each 2x2 neighbourhood of grid points (one row and one column smaller).
score_grid = pivot_all.values
pivot_interp = 0.25 * (score_grid[1:, 1:] + score_grid[:-1, 1:] + score_grid[1:, :-1] + score_grid[:-1, :-1])

print(pivot_all)

# Heatmaps of the combined score and of its 2x2-averaged version.
for heatmap in (pivot_all, pivot_interp):
    plt.figure(figsize=(5, 5))
    plt.imshow(heatmap, cmap='hot', interpolation='nearest')
    plt.colorbar()

# Mean combined score per SD factor (axis 0) and per mean factor (axis 1).
print(np.mean(pivot_all, 0))
print(np.mean(pivot_all, 1))

In [None]:
# Vikane Pen 5 Review

selected_mean_factor = .96
selected_sd_factor = 1.02

exact_mean_factor = .9625
exact_sd_factor = 1.015

# Find the selected grid point in the recorded sweep itself. The previous code rebuilt the grid
# with an SD step of .02 while the sweep used .01, so the computed position pointed at a
# different (mean_factor, sd_factor) run in sim_raw_weights / sim_avg_weights / sim_sds.
matching_indices = [
    position
    for position, (swept_mean, swept_sd) in enumerate(zip(mean_factors, sd_factors))
    if np.abs(swept_mean - selected_mean_factor) < .001 and np.abs(swept_sd - selected_sd_factor) < .001
]
if not matching_indices:
    raise ValueError(
        'No simulated grid point matches mean_factor=%s, sd_factor=%s'
        % (selected_mean_factor, selected_sd_factor)
    )
index = matching_indices[-1]

# Overlay the selected simulation (red) and the observed weights (blue) on shared bins.
count, bins, _ = plt.hist(sim_raw_weights[index], alpha = 0.5, density = True, color = 'red', bins = 30)
plt.hist(original_weights, alpha = 0.5, density = True, color = 'blue', bins = bins)

estimated_initial_weight = np.mean(original_weights) / exact_mean_factor
estimated_initial_sd = np.std(original_weights) / exact_sd_factor

# Ground-truth mean / SD entered by hand for this cohort.
gt_mean = 4234.9465174614315
gt_sd = 820.6951915326154

print('Sim', sim_avg_weights[index], sim_sds[index])
print('Orig', np.mean(original_weights), np.std(original_weights))
print('Estimate', estimated_initial_weight, estimated_initial_sd)
print('GT', gt_mean, gt_sd)

print()

# Absolute and relative error of the corrected estimate and of the raw mean versus ground truth.
print('Estimate', '%0.2f, %0.2f%%' % (estimated_initial_weight - gt_mean, 100 * (estimated_initial_weight - gt_mean) / gt_mean))
print('Orig', '%0.2f, %0.2f%%' % (np.mean(original_weights) - gt_mean, 100 * (np.mean(original_weights) - gt_mean) / gt_mean))

print()

buckets = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
x_buckets = np.array(buckets[:-1])
# Ground-truth share of fish per 1000-wide bucket, entered by hand.
gt_pcts = [
    0.0,
    0.012362459546925567,
    0.05272923408845739,
    0.31016181229773465,
    0.4608414239482201,
    0.14977346278317152,
    0.013398058252427184,
    0.0006040992448759439,
    0.00012944983818770226,
    0.0
]

d1 = np.array(original_weights)

# Evaluate three fitted densities on an integer grid and integrate them per bucket.
sim_vec = np.arange(start=0, stop = 10000, step = 1)

# Density fitted to the observed weights.
orig_pdf = pdf_mvsk([np.mean(original_weights), np.std(original_weights) ** 2, 0, 1])
orig_dist = orig_pdf(sim_vec)

# Density implied by the corrected (estimated initial) mean and SD.
sim_pdf = pdf_mvsk([estimated_initial_weight, estimated_initial_sd ** 2, 0, 1])
sim_dist = sim_pdf(sim_vec)

# Density using the midpoint of the observed and corrected mean / SD.
avg_pdf = pdf_mvsk([0.5 * (estimated_initial_weight + np.mean(original_weights)), (0.5 * (estimated_initial_sd + np.std(original_weights))) ** 2, 0, 1])
avg_dist = avg_pdf(sim_vec)

pcts1 = []
pcts2 = []
pcts3 = []

for i in range(len(buckets) - 1):
    # Grid points belonging to bucket i: lower < v <= upper.
    in_bucket = (sim_vec > buckets[i]) & (sim_vec <= buckets[i + 1])

    pct1 = np.sum(orig_dist[in_bucket]) / np.sum(orig_dist)
    pcts1.append(pct1)

    pct2 = np.sum(sim_dist[in_bucket]) / np.sum(sim_dist)
    pcts2.append(pct2)

    pct3 = np.sum(avg_dist[in_bucket]) / np.sum(avg_dist)
    pcts3.append(pct3)

    # Difference to ground truth, in percentage points: original vs corrected vs midpoint.
    gt_pct = gt_pcts[i]
    print('%i: %0.2f%% vs %0.2f%% vs %0.2f%%' % (buckets[i], 100 * (pct1 - gt_pct), 100 * (pct2 - gt_pct), 100 * (pct3 - gt_pct)))

plt.figure(figsize=(20, 10))
plt.bar(x_buckets - 150, pcts1, color = 'red', width = 150, label = 'Original')
plt.bar(x_buckets, gt_pcts, color = 'green', width = 150, label = 'GT')
plt.bar(x_buckets + 150, pcts2, color = 'blue', width = 150, label = 'Sim')
plt.bar(x_buckets + 300, pcts3, color = 'purple', width = 150, label = 'Orig + Sim Avg')
plt.legend()

In [None]:
# selected_mean_factor = .97
# selected_sd_factor = 1.04

# count = 0
# index = None

# for mean_factor in np.arange(0.95, .98, .005):
#     for sd_factor in np.arange(1, 1.08, .02):
#         if np.abs(mean_factor - selected_mean_factor) < .001 and np.abs(sd_factor - selected_sd_factor) < .001:
#             index = count
#         count = count + 1
        
# print(sim_avg_weights[index], sim_sds[index])
# print(np.mean(original_weights), np.std(original_weights))

# count, bins, _ = plt.hist(sim_raw_weights[index], alpha = 0.5, density = True, color = 'red', bins = 30)
# plt.hist(original_weights, alpha = 0.5, density = True, color = 'blue', bins = bins)

# estimated_initial_weight = np.mean(original_weights) / selected_mean_factor
# estimated_initial_sd = np.std(original_weights) / selected_sd_factor

# print(estimated_initial_weight, estimated_initial_sd)
# print(estimated_initial_weight - 4234.9465174614315)
# print(np.mean(original_weights) - 4234.9465174614315)

In [None]:
# Sweep configuration for the Aplavika cohort.
cohort_name = 'aplavika_pen_id_95_2020-06-26_2020-07-26'
start_hour = 6
end_hour = 16
apply_growth_rate = True
max_day_diff = 3
final_days_post_feeding = 1
max_final_days_post_feeding = 1
loss_factor = 0.1753

# NOTE(review): `days_post_feeding` is passed to generate_raw_individual_values below but was
# never assigned anywhere in this notebook, so a fresh-kernel run raised NameError. It is
# assumed here to equal `final_days_post_feeding` — confirm the intended value.
days_post_feeding = final_days_post_feeding

gt_metadata = gt_metadatas[cohort_name]

sampling_filter = SamplingFilter(
    start_hour=start_hour,
    end_hour=end_hour,
    kf_cutoff=0.0,
    akpd_score_cutoff=0.01
)
df = dfs[cohort_name]
final_date_post_feeding = add_days(gt_metadata['last_feeding_date'], final_days_post_feeding)
# Keep rows dated from 14 days before final_date_post_feeding through that date (inclusive).
tdf = df[(df.date <= final_date_post_feeding) & (df.date >= add_days(final_date_post_feeding, -14))]
pm_base = gen_pm_base(tdf, sampling_filter)

# simulate_with_params reads this notebook-level name when no weights are passed explicitly.
original_weights = generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)

# One entry per simulated grid point, appended in sweep order (all lists stay index-aligned).
mean_factors = []
sd_factors = []

sim_avg_weights = []
sim_sds = []
sim_len = []
sim_raw_weights = []

weight_errors = []
sd_errors = []
max_dist_errors = []
avg_dist_errors = []

density = 5
max_iter = 10000

# Grid search over the (mean_factor, sd_factor) hypotheses.
for mean_factor in np.arange(0.95, .98, .01):
    for sd_factor in np.arange(1, 1.08, .01):
        simulated_weights = simulate_with_params(mean_factor, sd_factor, density, max_iter)
        weight_error, sd_error, max_dist_error, avg_dist_error = get_error(simulated_weights, original_weights, False)
        print('weight_error: %0.2f, sd_error: %0.2f, max_dist_error: %0.2f, avg_dist_error: %0.2f' % (weight_error, sd_error, max_dist_error, avg_dist_error))

        mean_factors.append(mean_factor)
        sd_factors.append(sd_factor)

        sim_avg_weights.append(np.mean(simulated_weights))
        sim_raw_weights.append(simulated_weights)
        sim_sds.append(np.std(simulated_weights))
        sim_len.append(len(simulated_weights))

        weight_errors.append(weight_error)
        sd_errors.append(sd_error)
        max_dist_errors.append(max_dist_error)
        avg_dist_errors.append(avg_dist_error)

In [None]:
# Collect the sweep results into one row per grid point, then reshape every error metric into
# a table indexed by mean factor (rows) and SD factor (columns).
result_columns = ['mean_factors', 'sd_factors', 'weight_errors', 'sd_errors', 'max_dist_errors', 'avg_dist_errors']
a_output = pd.DataFrame(
    np.column_stack((mean_factors, sd_factors, weight_errors, sd_errors, max_dist_errors, avg_dist_errors)),
    columns = result_columns
)

a_pivot1, a_pivot2, a_pivot3, a_pivot4 = (
    a_output.pivot(index = 'mean_factors', columns = 'sd_factors', values = metric)
    for metric in result_columns[2:]
)

# Combined score: average of the four absolute error tables.
a_pivot_all = 0.25 * (a_pivot1.abs() + a_pivot2.abs() + a_pivot3.abs() + a_pivot4.abs())

# Average of each 2x2 neighbourhood of grid points (one row and one column smaller).
a_score_grid = a_pivot_all.values
a_pivot_interp = 0.25 * (a_score_grid[1:, 1:] + a_score_grid[:-1, 1:] + a_score_grid[1:, :-1] + a_score_grid[:-1, :-1])

print(a_pivot_all)

# Heatmaps of the combined score and of its 2x2-averaged version.
for heatmap in (a_pivot_all, a_pivot_interp):
    plt.figure(figsize=(5, 5))
    plt.imshow(heatmap, cmap='hot', interpolation='nearest')
    plt.colorbar()

# Mean combined score per SD factor (axis 0) and per mean factor (axis 1).
print(np.mean(a_pivot_all, 0))
print(np.mean(a_pivot_all, 1))

In [None]:
# Aplavika Review

selected_mean_factor = .96
selected_sd_factor = 1.02

exact_mean_factor = .965
exact_sd_factor = 1.015

# Find the selected grid point in the recorded sweep itself. The previous code rebuilt the grid
# with an SD step of .02 while the sweep used .01, so the computed position pointed at a
# different (mean_factor, sd_factor) run in sim_raw_weights / sim_avg_weights / sim_sds.
matching_indices = [
    position
    for position, (swept_mean, swept_sd) in enumerate(zip(mean_factors, sd_factors))
    if np.abs(swept_mean - selected_mean_factor) < .001 and np.abs(swept_sd - selected_sd_factor) < .001
]
if not matching_indices:
    raise ValueError(
        'No simulated grid point matches mean_factor=%s, sd_factor=%s'
        % (selected_mean_factor, selected_sd_factor)
    )
index = matching_indices[-1]

# Overlay the selected simulation (red) and the observed weights (blue) on shared bins.
count, bins, _ = plt.hist(sim_raw_weights[index], alpha = 0.5, density = True, color = 'red', bins = 30)
plt.hist(original_weights, alpha = 0.5, density = True, color = 'blue', bins = bins)

loss_factor = 0.1753

estimated_initial_weight = np.mean(original_weights) / exact_mean_factor
estimated_initial_sd = np.std(original_weights) / exact_sd_factor

# Hand-entered reference weight of 4944, divided by (1 - loss_factor) for comparison with
# the un-scaled estimates printed below.
gt_mean = 4944 / (1 - loss_factor)

print('Sim', sim_avg_weights[index], sim_sds[index])
print('Orig', np.mean(original_weights), np.std(original_weights))
print('Estimate', estimated_initial_weight, estimated_initial_sd)
print('GT', gt_mean)

print()

# Absolute and relative error of the corrected estimate and of the raw mean versus ground truth.
print('Estimate', '%0.2f, %0.2f%%' % (estimated_initial_weight - gt_mean, 100 * (estimated_initial_weight - gt_mean) / gt_mean))
print('Orig', '%0.2f, %0.2f%%' % (np.mean(original_weights) - gt_mean, 100 * (np.mean(original_weights) - gt_mean) / gt_mean))

print()

buckets = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
x_buckets = np.array(buckets[:-1])
# Ground-truth share of fish per 1000-wide bucket, entered by hand.
gt_pcts = [
    0.0,
    0.0,
    0.0036,
    0.1060,
    0.3990,
    0.3576,
    .1147,
    .0180,
    .0011,
    0.0
]

# Observed weights scaled by (1 - loss_factor); bucketed empirically as pcts0 below.
d1 = np.array(original_weights) * (1 - loss_factor)

# Evaluate three fitted densities on an integer grid and integrate them per bucket.
sim_vec = np.arange(start=0, stop = 10000, step = 1)

# Density fitted to the scaled observed weights.
orig_pdf = pdf_mvsk([np.mean(original_weights * (1 - loss_factor)), np.std(original_weights * (1 - loss_factor)) ** 2, 0, 1])
orig_dist = orig_pdf(sim_vec)

# Density implied by the corrected (estimated initial) mean and SD, scaled for the loss.
# NOTE(review): the mean is scaled by (1 - loss_factor) but the SD only by
# sqrt(1 - loss_factor), whereas orig_pdf above scales the SD by (1 - loss_factor).
# Scaling a variable by c scales its SD by c, so this looks inconsistent — confirm intent.
sim_pdf = pdf_mvsk([estimated_initial_weight * (1 - loss_factor), (estimated_initial_sd  * np.sqrt(1 - loss_factor)) ** 2, 0, 1])
sim_dist = sim_pdf(sim_vec)

# Density using the midpoint of the observed and corrected mean / SD (same SD caveat applies).
avg_pdf = pdf_mvsk([0.5 * (estimated_initial_weight + np.mean(original_weights)) * (1 - loss_factor), (0.5 * (estimated_initial_sd  * np.sqrt(1 - loss_factor) + np.std(original_weights * (1 - loss_factor)))) ** 2, 0, 1])
avg_dist = avg_pdf(sim_vec)

pcts0 = []
pcts1 = []
pcts2 = []
pcts3 = []

for i in range(len(buckets) - 1):
    # Empirical share of the scaled observed weights in bucket i: lower < v <= upper.
    mask0 = (d1 > buckets[i]) & (d1 <= buckets[i + 1])
    pct0 = np.sum(mask0) / len(mask0)
    pcts0.append(pct0)

    # Grid points belonging to bucket i.
    in_bucket = (sim_vec > buckets[i]) & (sim_vec <= buckets[i + 1])

    pct1 = np.sum(orig_dist[in_bucket]) / np.sum(orig_dist)
    pcts1.append(pct1)

    pct2 = np.sum(sim_dist[in_bucket]) / np.sum(sim_dist)
    pcts2.append(pct2)

    pct3 = np.sum(avg_dist[in_bucket]) / np.sum(avg_dist)
    pcts3.append(pct3)

    # Difference to ground truth, in percentage points:
    # empirical vs fitted original vs corrected vs midpoint.
    gt_pct = gt_pcts[i]
    print('%i: %0.2f%% vs %0.2f%% vs %0.2f%% vs %0.2f%%' % (buckets[i], 100 * (pct0 - gt_pct), 100 * (pct1 - gt_pct), 100 * (pct2 - gt_pct), 100 * (pct3 - gt_pct)))

plt.figure(figsize=(20, 10))
plt.bar(x_buckets - 150, pcts1, color = 'red', width = 150, label = 'Original')
plt.bar(x_buckets, gt_pcts, color = 'green', width = 150, label = 'GT')
plt.bar(x_buckets + 150, pcts2, color = 'blue', width = 150, label = 'Sim')
plt.bar(x_buckets + 300, pcts3, color = 'purple', width = 150, label = 'Orig + Sim Avg')
plt.legend()

In [None]:
# Sweep configuration for the Tittelsnes cohort.
cohort_name = 'tittelsnes_pen_id_37_2020-05-23_2020-06-24'
start_hour = 6
end_hour = 16
apply_growth_rate = True
max_day_diff = 3
final_days_post_feeding = 1
max_final_days_post_feeding = 1
loss_factor = 0.16

# NOTE(review): `days_post_feeding` is passed to generate_raw_individual_values below but was
# never assigned anywhere in this notebook, so a fresh-kernel run raised NameError. It is
# assumed here to equal `final_days_post_feeding` — confirm the intended value.
days_post_feeding = final_days_post_feeding

gt_metadata = gt_metadatas[cohort_name]

sampling_filter = SamplingFilter(
    start_hour=start_hour,
    end_hour=end_hour,
    kf_cutoff=0.0,
    akpd_score_cutoff=0.01
)
df = dfs[cohort_name]
final_date_post_feeding = add_days(gt_metadata['last_feeding_date'], final_days_post_feeding)
# Keep rows dated from 14 days before final_date_post_feeding through that date (inclusive).
tdf = df[(df.date <= final_date_post_feeding) & (df.date >= add_days(final_date_post_feeding, -14))]
pm_base = gen_pm_base(tdf, sampling_filter)

# simulate_with_params reads this notebook-level name when no weights are passed explicitly.
original_weights = generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)

# One entry per simulated grid point, appended in sweep order (all lists stay index-aligned).
mean_factors = []
sd_factors = []

sim_avg_weights = []
sim_sds = []
sim_len = []
sim_raw_weights = []

weight_errors = []
sd_errors = []
max_dist_errors = []
avg_dist_errors = []

density = 5
max_iter = 10000

# Grid search over the (mean_factor, sd_factor) hypotheses.
for mean_factor in np.arange(0.95, .98, .01):
    for sd_factor in np.arange(1, 1.08, .01):
        simulated_weights = simulate_with_params(mean_factor, sd_factor, density, max_iter)
        weight_error, sd_error, max_dist_error, avg_dist_error = get_error(simulated_weights, original_weights, False)
        print('weight_error: %0.2f, sd_error: %0.2f, max_dist_error: %0.2f, avg_dist_error: %0.2f' % (weight_error, sd_error, max_dist_error, avg_dist_error))

        mean_factors.append(mean_factor)
        sd_factors.append(sd_factor)

        sim_avg_weights.append(np.mean(simulated_weights))
        sim_raw_weights.append(simulated_weights)
        sim_sds.append(np.std(simulated_weights))
        sim_len.append(len(simulated_weights))

        weight_errors.append(weight_error)
        sd_errors.append(sd_error)
        max_dist_errors.append(max_dist_error)
        avg_dist_errors.append(avg_dist_error)

In [None]:
# Collect the sweep results into one row per grid point, then reshape every error metric into
# a table indexed by mean factor (rows) and SD factor (columns).
result_columns = ['mean_factors', 'sd_factors', 'weight_errors', 'sd_errors', 'max_dist_errors', 'avg_dist_errors']
a_output = pd.DataFrame(
    np.column_stack((mean_factors, sd_factors, weight_errors, sd_errors, max_dist_errors, avg_dist_errors)),
    columns = result_columns
)

a_pivot1, a_pivot2, a_pivot3, a_pivot4 = (
    a_output.pivot(index = 'mean_factors', columns = 'sd_factors', values = metric)
    for metric in result_columns[2:]
)

# Combined score: average of the four absolute error tables.
a_pivot_all = 0.25 * (a_pivot1.abs() + a_pivot2.abs() + a_pivot3.abs() + a_pivot4.abs())

# Average of each 2x2 neighbourhood of grid points (one row and one column smaller).
a_score_grid = a_pivot_all.values
a_pivot_interp = 0.25 * (a_score_grid[1:, 1:] + a_score_grid[:-1, 1:] + a_score_grid[1:, :-1] + a_score_grid[:-1, :-1])

# Only the signed weight-error table (a_pivot1) is printed and drawn in this cell. The other
# tables (a_pivot2, a_pivot3, a_pivot4, a_pivot_all, a_pivot_interp) are computed but not
# displayed; swap the name below to inspect one of them.
print(a_pivot1)
plt.figure(figsize=(5, 5))
plt.imshow(a_pivot1, cmap='hot', interpolation='nearest')
plt.colorbar()

# Mean combined score per SD factor (axis 0) and per mean factor (axis 1).
print(np.mean(a_pivot_all, 0))
print(np.mean(a_pivot_all, 1))

In [None]:
# Aplavika Review
#
# Compares one run of the grid search with the camera-derived weights
# (original_weights) and with the ground-truth (GT) figures entered below:
# first on the mean weight, then on the share of fish per 1000-wide bucket.

selected_mean_factor = .95
selected_sd_factor = 1.08

exact_mean_factor = .95
exact_sd_factor = 1.08

# Find the selected run in the factor lists recorded alongside the simulations
# rather than re-typing the grid here: a float-step np.arange has an unreliable
# length, and a re-typed grid silently selects the wrong run as soon as it
# differs from the grid that was actually simulated.
matches = [
    i for i, (mf, sf) in enumerate(zip(mean_factors, sd_factors))
    if abs(mf - selected_mean_factor) < .001 and abs(sf - selected_sd_factor) < .001
]
if not matches:
    raise ValueError(
        'No simulation recorded for mean_factor=%s, sd_factor=%s'
        % (selected_mean_factor, selected_sd_factor)
    )
index = matches[-1]  # if the tolerance admits several runs, keep the last one

# Overlay the simulated and the observed weight histograms on shared bins.
_, bins, _ = plt.hist(sim_raw_weights[index], alpha = 0.5, density = True, color = 'red', bins = 30)
plt.hist(original_weights, alpha = 0.5, density = True, color = 'blue', bins = bins)

loss_factor = 0.16

# Undo the selected factors to estimate the moments before the modelled effect.
estimated_initial_weight = np.mean(original_weights) / exact_mean_factor
estimated_initial_sd = np.std(original_weights) / exact_sd_factor

# GT figure divided by (1 - loss_factor) so that it is on the same scale as the
# unadjusted weights it is compared with below.
gt_mean = 3894.77 / (1 - loss_factor)

# Mean of the back-calculated estimate and the simulated mean ('New' below).
blended_mean = 0.5 * (estimated_initial_weight + sim_avg_weights[index])

print('Sim', sim_avg_weights[index], sim_sds[index])
print('Orig', np.mean(original_weights), np.std(original_weights))
print('Estimate', estimated_initial_weight, estimated_initial_sd)
print('GT', gt_mean)

print()

# Absolute and relative deviation from the GT mean.
print('Estimate', '%0.2f, %0.2f%%' % (estimated_initial_weight - gt_mean, 100 * (estimated_initial_weight - gt_mean) / gt_mean))
print('Orig', '%0.2f, %0.2f%%' % (np.mean(original_weights) - gt_mean, 100 * (np.mean(original_weights) - gt_mean) / gt_mean))
print('New', '%0.2f, %0.2f%%' % (blended_mean - gt_mean, 100 * (blended_mean - gt_mean) / gt_mean))


print()

buckets = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
x_buckets = np.array(buckets[:-1])

# GT share of fish per bucket.
# (Alternative kept from an earlier revision:
#  0.0, 0.0365, .2114, .3232, .2538, .1254, .0399, .0087, .0010, 0.0001)
gt_pcts = [
    0.0,
    0.0174,
    .1711,
    .3285,
    .2777,
    .1459,
    .0477,
    .0104,
    .0013,
    0.0001
]
assert len(gt_pcts) == len(buckets) - 1, 'gt_pcts needs one entry per bucket'

# Loss-adjusted observed weights. Converting to an ndarray first keeps the
# scaling valid even if original_weights is a plain Python list.
d1 = np.array(original_weights) * (1 - loss_factor)

# Evaluation grid for the fitted densities: one point per unit from 0 to 9999.
sim_vec = np.arange(start=0, stop = 10000, step = 1)

# pdf_mvsk is defined outside this cell; presumably it takes
# [mean, variance, skew, kurtosis] - TODO confirm.
orig_pdf = pdf_mvsk([np.mean(d1), np.std(d1) ** 2, 0, 1])
orig_dist = orig_pdf(sim_vec)

# NOTE(review): the sd is scaled by sqrt(1 - loss_factor) here and in avg_pdf,
# whereas the means (and the sd inside orig_pdf) are scaled by
# (1 - loss_factor) - confirm this is intended.
sim_pdf = pdf_mvsk([estimated_initial_weight * (1 - loss_factor), (estimated_initial_sd  * np.sqrt(1 - loss_factor)) ** 2, 0, 1])
sim_dist = sim_pdf(sim_vec)

avg_pdf = pdf_mvsk([blended_mean * (1 - loss_factor), (0.5 * (estimated_initial_sd  * np.sqrt(1 - loss_factor) + sim_sds[index] * np.sqrt(1 - loss_factor))) ** 2, 0, 1])
avg_dist = avg_pdf(sim_vec)

# Normalising constants for the three gridded densities (loop-invariant).
orig_total = np.sum(orig_dist)
sim_total = np.sum(sim_dist)
avg_total = np.sum(avg_dist)

pcts0 = []  # raw share of loss-adjusted observed weights per bucket
pcts1 = []  # share under the density fitted to the observed weights
pcts2 = []  # share under the density built from the back-calculated estimate
pcts3 = []  # share under the blended (estimate + simulation) density

for i in range(len(buckets) - 1):
    lower, upper = buckets[i], buckets[i + 1]

    mask0 = (d1 > lower) & (d1 <= upper)
    pct0 = np.sum(mask0) / len(mask0)
    pcts0.append(pct0)

    # The same grid selection serves all three densities.
    in_bucket = (sim_vec > lower) & (sim_vec <= upper)

    pct1 = np.sum(orig_dist[in_bucket]) / orig_total
    pcts1.append(pct1)

    pct2 = np.sum(sim_dist[in_bucket]) / sim_total
    pcts2.append(pct2)

    pct3 = np.sum(avg_dist[in_bucket]) / avg_total
    pcts3.append(pct3)

    # Deviation from GT in percentage points: raw, fitted, estimate, blended.
    gt_pct = gt_pcts[i]
    print('%i: %0.2f%% vs %0.2f%% vs %0.2f%% vs %0.2f%%' % (buckets[i], 100 * (pct0 - gt_pct), 100 * (pct1 - gt_pct), 100 * (pct2 - gt_pct), 100 * (pct3 - gt_pct)))

# NOTE(review): the 'Original' bars show the fitted shares (pcts1); the raw
# shares (pcts0) are printed above but not plotted.
plt.figure(figsize=(20, 10))
plt.bar(x_buckets - 150, pcts1, color = 'red', width = 150, label = 'Original')
plt.bar(x_buckets, gt_pcts, color = 'green', width = 150, label = 'GT')
plt.bar(x_buckets + 150, pcts2, color = 'blue', width = 150, label = 'Sim')
plt.bar(x_buckets + 300, pcts3, color = 'purple', width = 150, label = 'Orig + Sim Avg')
plt.legend()

In [None]:
# Eldviktaren: grid search over (mean_factor, sd_factor).
# (A Langoy configuration used to be kept here as commented-out code:
#  start_hour = 6, end_hour = 12, every other setting as below.)

cohort_name = 'eldviktaren_pen_id_164_2020-09-06_2020-10-06'
start_hour = 7
end_hour = 15
apply_growth_rate = True
max_day_diff = 3
final_days_post_feeding = 1
max_final_days_post_feeding = 1
loss_factor = 0.16

gt_metadata = gt_metadatas[cohort_name]

sampling_filter = SamplingFilter(
    start_hour=start_hour,
    end_hour=end_hour,
    kf_cutoff=0.0,
    akpd_score_cutoff=0.01
)
df = dfs[cohort_name]

# Keep the 14 days that end final_days_post_feeding days after the last feeding date.
final_date_post_feeding = add_days(gt_metadata['last_feeding_date'], final_days_post_feeding)
tdf = df[(df.date <= final_date_post_feeding) & (df.date >= add_days(final_date_post_feeding, -14))]
pm_base = gen_pm_base(tdf, sampling_filter)

# NOTE(review): days_post_feeding is not assigned in this cell, so the call
# relies on a value left over from an earlier cell - confirm which value is meant.
original_weights = generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)

# Per-run records, appended in grid order; later cells index into them.
mean_factors = []
sd_factors = []

sim_avg_weights = []
sim_sds = []
sim_len = []
sim_raw_weights = []

weight_errors = []
sd_errors = []
max_dist_errors = []
avg_dist_errors = []

density = 5
max_iter = 10000

# Explicit grids with inclusive end points. The earlier float-step
# np.arange(0.95, .98, .01) / np.arange(1, 1.08, .01) only reached 0.98 and 1.08
# through floating-point rounding of the length computation; np.linspace states
# the same 4 x 9 grid without relying on that.
mean_factor_grid = np.linspace(0.95, 0.98, 4)  # 0.95, 0.96, 0.97, 0.98
sd_factor_grid = np.linspace(1.0, 1.08, 9)     # 1.00, 1.01, ..., 1.08

for mean_factor in mean_factor_grid:
    for sd_factor in sd_factor_grid:
        simulated_weights = simulate_with_params(mean_factor, sd_factor, density, max_iter)
        weight_error, sd_error, max_dist_error, avg_dist_error = get_error(simulated_weights, original_weights, False)
        print('weight_error: %0.2f, sd_error: %0.2f, max_dist_error: %0.2f, avg_dist_error: %0.2f' % (weight_error, sd_error, max_dist_error, avg_dist_error))

        mean_factors.append(mean_factor)
        sd_factors.append(sd_factor)

        sim_avg_weights.append(np.mean(simulated_weights))
        sim_raw_weights.append(simulated_weights)
        sim_sds.append(np.std(simulated_weights))
        sim_len.append(len(simulated_weights))

        weight_errors.append(weight_error)
        sd_errors.append(sd_error)
        max_dist_errors.append(max_dist_error)
        avg_dist_errors.append(avg_dist_error)

In [None]:
# Collect the grid-search results into one frame: one row per
# (mean_factor, sd_factor) pair, in the order the simulations were run.
a_columns = ['mean_factors', 'sd_factors', 'weight_errors', 'sd_errors', 'max_dist_errors', 'avg_dist_errors']
a_output = pd.DataFrame(
    np.column_stack((mean_factors, sd_factors, weight_errors, sd_errors, max_dist_errors, avg_dist_errors)),
    columns=a_columns,
)

# One error surface per metric: rows are mean factors, columns are sd factors.
a_pivot1, a_pivot2, a_pivot3, a_pivot4 = [
    a_output.pivot(index='mean_factors', columns='sd_factors', values=metric)
    for metric in a_columns[2:]
]

# Combined score: unweighted mean of the four absolute error surfaces.
a_pivot_all = (a_pivot1.abs() + a_pivot2.abs() + a_pivot3.abs() + a_pivot4.abs()) * 0.25

# Average of each 2x2 neighbourhood of the combined surface
# (one row and one column smaller than a_pivot_all).
a_all_values = a_pivot_all.values
a_pivot_interp = 0.25 * (
    a_all_values[1:, 1:] + a_all_values[:-1, 1:] + a_all_values[1:, :-1] + a_all_values[:-1, :-1]
)

print(a_pivot_all)

# Heat maps of the combined surface and of its 2x2-averaged version.
plt.figure(figsize=(5, 5))
plt.imshow(a_pivot_all, cmap='hot', interpolation='nearest')
plt.colorbar()

plt.figure(figsize=(5, 5))
plt.imshow(a_pivot_interp, cmap='hot', interpolation='nearest')
plt.colorbar()

# Marginal means of the combined score: per sd factor, then per mean factor.
print(a_pivot_all.mean(axis=0))
print(a_pivot_all.mean(axis=1))

In [None]:
# Eldviktaren Review
#
# Compares one run of the grid search with the camera-derived weights
# (original_weights) and with the ground-truth (GT) figures entered below:
# first on the mean weight, then on the share of fish per 1000-wide bucket.

selected_mean_factor = .98
selected_sd_factor = 1.01

exact_mean_factor = .98
exact_sd_factor = 1.01

# Find the selected run in the factor lists recorded alongside the simulations
# rather than re-typing the grid here: a float-step np.arange has an unreliable
# length (0.98 is only reached through rounding), and a re-typed grid silently
# selects the wrong run as soon as it differs from the simulated grid.
matches = [
    i for i, (mf, sf) in enumerate(zip(mean_factors, sd_factors))
    if abs(mf - selected_mean_factor) < .001 and abs(sf - selected_sd_factor) < .001
]
if not matches:
    raise ValueError(
        'No simulation recorded for mean_factor=%s, sd_factor=%s'
        % (selected_mean_factor, selected_sd_factor)
    )
index = matches[-1]  # if the tolerance admits several runs, keep the last one

# Overlay the simulated and the observed weight histograms on shared bins.
_, bins, _ = plt.hist(sim_raw_weights[index], alpha = 0.5, density = True, color = 'red', bins = 30)
plt.hist(original_weights, alpha = 0.5, density = True, color = 'blue', bins = bins)

# NOTE(review): the grid-search cell for this cohort sets loss_factor = 0.16;
# confirm which of the two values applies.
loss_factor = 0.18

# Undo the selected factors to estimate the moments before the modelled effect.
estimated_initial_weight = np.mean(original_weights) / exact_mean_factor
estimated_initial_sd = np.std(original_weights) / exact_sd_factor

# GT figure divided by (1 - loss_factor) so that it is on the same scale as the
# unadjusted weights it is compared with below.
gt_mean = 3365.32 / (1 - loss_factor)

# Mean of the back-calculated estimate and the simulated mean ('New' below).
blended_mean = 0.5 * (estimated_initial_weight + sim_avg_weights[index])

print('Sim', sim_avg_weights[index], sim_sds[index])
print('Orig', np.mean(original_weights), np.std(original_weights))
print('Estimate', estimated_initial_weight, estimated_initial_sd)
print('GT', gt_mean)

print()

# Absolute and relative deviation from the GT mean.
print('Estimate', '%0.2f, %0.2f%%' % (estimated_initial_weight - gt_mean, 100 * (estimated_initial_weight - gt_mean) / gt_mean))
print('Orig', '%0.2f, %0.2f%%' % (np.mean(original_weights) - gt_mean, 100 * (np.mean(original_weights) - gt_mean) / gt_mean))
print('New', '%0.2f, %0.2f%%' % (blended_mean - gt_mean, 100 * (blended_mean - gt_mean) / gt_mean))


print()

buckets = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
x_buckets = np.array(buckets[:-1])

# GT share of fish per bucket.
# (Alternative kept from an earlier revision:
#  0.0, 0.0365, .2114, .3232, .2538, .1254, .0399, .0087, .0010, 0.0001)
gt_pcts = [
    0.0,
    0.0062,
    .2281,
    .6490,
    .1143,
    .0023,
    .0001,
    0,
    0,
    0
]
assert len(gt_pcts) == len(buckets) - 1, 'gt_pcts needs one entry per bucket'

# Loss-adjusted observed weights. Converting to an ndarray first keeps the
# scaling valid even if original_weights is a plain Python list.
d1 = np.array(original_weights) * (1 - loss_factor)

# Evaluation grid for the fitted densities: one point per unit from 0 to 9999.
sim_vec = np.arange(start=0, stop = 10000, step = 1)

# pdf_mvsk is defined outside this cell; presumably it takes
# [mean, variance, skew, kurtosis] - TODO confirm.
orig_pdf = pdf_mvsk([np.mean(d1), np.std(d1) ** 2, 0, 1])
orig_dist = orig_pdf(sim_vec)

# NOTE(review): the sd is scaled by sqrt(1 - loss_factor) here and in avg_pdf,
# whereas the means (and the sd inside orig_pdf) are scaled by
# (1 - loss_factor) - confirm this is intended.
sim_pdf = pdf_mvsk([estimated_initial_weight * (1 - loss_factor), (estimated_initial_sd  * np.sqrt(1 - loss_factor)) ** 2, 0, 1])
sim_dist = sim_pdf(sim_vec)

avg_pdf = pdf_mvsk([blended_mean * (1 - loss_factor), (0.5 * (estimated_initial_sd  * np.sqrt(1 - loss_factor) + sim_sds[index] * np.sqrt(1 - loss_factor))) ** 2, 0, 1])
avg_dist = avg_pdf(sim_vec)

# Normalising constants for the three gridded densities (loop-invariant).
orig_total = np.sum(orig_dist)
sim_total = np.sum(sim_dist)
avg_total = np.sum(avg_dist)

pcts0 = []  # raw share of loss-adjusted observed weights per bucket
pcts1 = []  # share under the density fitted to the observed weights
pcts2 = []  # share under the density built from the back-calculated estimate
pcts3 = []  # share under the blended (estimate + simulation) density

for i in range(len(buckets) - 1):
    lower, upper = buckets[i], buckets[i + 1]

    mask0 = (d1 > lower) & (d1 <= upper)
    pct0 = np.sum(mask0) / len(mask0)
    pcts0.append(pct0)

    # The same grid selection serves all three densities.
    in_bucket = (sim_vec > lower) & (sim_vec <= upper)

    pct1 = np.sum(orig_dist[in_bucket]) / orig_total
    pcts1.append(pct1)

    pct2 = np.sum(sim_dist[in_bucket]) / sim_total
    pcts2.append(pct2)

    pct3 = np.sum(avg_dist[in_bucket]) / avg_total
    pcts3.append(pct3)

    # Deviation from GT in percentage points: raw, fitted, estimate, blended.
    gt_pct = gt_pcts[i]
    print('%i: %0.2f%% vs %0.2f%% vs %0.2f%% vs %0.2f%%' % (buckets[i], 100 * (pct0 - gt_pct), 100 * (pct1 - gt_pct), 100 * (pct2 - gt_pct), 100 * (pct3 - gt_pct)))

# NOTE(review): the 'Original' bars show the fitted shares (pcts1); the raw
# shares (pcts0) are printed above but not plotted.
plt.figure(figsize=(20, 10))
plt.bar(x_buckets - 150, pcts1, color = 'red', width = 150, label = 'Original')
plt.bar(x_buckets, gt_pcts, color = 'green', width = 150, label = 'GT')
plt.bar(x_buckets + 150, pcts2, color = 'blue', width = 150, label = 'Sim')
plt.bar(x_buckets + 300, pcts3, color = 'purple', width = 150, label = 'Orig + Sim Avg')
plt.legend()

In [None]:
# Langoy: grid search over (mean_factor, sd_factor).

cohort_name = 'langoy_pen_id_108_2020-05-07_2020-05-17'
start_hour = 6
end_hour = 12
apply_growth_rate = True
max_day_diff = 3
final_days_post_feeding = 1
max_final_days_post_feeding = 1
loss_factor = 0.16

gt_metadata = gt_metadatas[cohort_name]

sampling_filter = SamplingFilter(
    start_hour=start_hour,
    end_hour=end_hour,
    kf_cutoff=0.0,
    akpd_score_cutoff=0.01
)
df = dfs[cohort_name]

# Keep the 14 days that end final_days_post_feeding days after the last feeding date.
final_date_post_feeding = add_days(gt_metadata['last_feeding_date'], final_days_post_feeding)
tdf = df[(df.date <= final_date_post_feeding) & (df.date >= add_days(final_date_post_feeding, -14))]
pm_base = gen_pm_base(tdf, sampling_filter)

# NOTE(review): days_post_feeding is not assigned in this cell, so the call
# relies on a value left over from an earlier cell - confirm which value is meant.
original_weights = generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)

# Per-run records, appended in grid order; later cells index into them.
mean_factors = []
sd_factors = []

sim_avg_weights = []
sim_sds = []
sim_len = []
sim_raw_weights = []

weight_errors = []
sd_errors = []
max_dist_errors = []
avg_dist_errors = []

density = 5
max_iter = 10000

# Explicit grids with inclusive end points. The earlier float-step
# np.arange(0.95, .98, .01) / np.arange(1, 1.08, .01) only reached 0.98 and 1.08
# through floating-point rounding of the length computation; np.linspace states
# the same 4 x 9 grid without relying on that.
mean_factor_grid = np.linspace(0.95, 0.98, 4)  # 0.95, 0.96, 0.97, 0.98
sd_factor_grid = np.linspace(1.0, 1.08, 9)     # 1.00, 1.01, ..., 1.08

for mean_factor in mean_factor_grid:
    for sd_factor in sd_factor_grid:
        simulated_weights = simulate_with_params(mean_factor, sd_factor, density, max_iter)
        weight_error, sd_error, max_dist_error, avg_dist_error = get_error(simulated_weights, original_weights, False)
        print('weight_error: %0.2f, sd_error: %0.2f, max_dist_error: %0.2f, avg_dist_error: %0.2f' % (weight_error, sd_error, max_dist_error, avg_dist_error))

        mean_factors.append(mean_factor)
        sd_factors.append(sd_factor)

        sim_avg_weights.append(np.mean(simulated_weights))
        sim_raw_weights.append(simulated_weights)
        sim_sds.append(np.std(simulated_weights))
        sim_len.append(len(simulated_weights))

        weight_errors.append(weight_error)
        sd_errors.append(sd_error)
        max_dist_errors.append(max_dist_error)
        avg_dist_errors.append(avg_dist_error)

In [None]:
# Collect the grid-search results into one frame: one row per
# (mean_factor, sd_factor) pair, in the order the simulations were run.
a_columns = ['mean_factors', 'sd_factors', 'weight_errors', 'sd_errors', 'max_dist_errors', 'avg_dist_errors']
a_output = pd.DataFrame(
    np.column_stack((mean_factors, sd_factors, weight_errors, sd_errors, max_dist_errors, avg_dist_errors)),
    columns=a_columns,
)

# One error surface per metric: rows are mean factors, columns are sd factors.
a_pivot1, a_pivot2, a_pivot3, a_pivot4 = [
    a_output.pivot(index='mean_factors', columns='sd_factors', values=metric)
    for metric in a_columns[2:]
]

# Combined score: unweighted mean of the four absolute error surfaces.
a_pivot_all = (a_pivot1.abs() + a_pivot2.abs() + a_pivot3.abs() + a_pivot4.abs()) * 0.25

# Average of each 2x2 neighbourhood of the combined surface
# (one row and one column smaller than a_pivot_all).
a_all_values = a_pivot_all.values
a_pivot_interp = 0.25 * (
    a_all_values[1:, 1:] + a_all_values[:-1, 1:] + a_all_values[1:, :-1] + a_all_values[:-1, :-1]
)

print(a_pivot_all)

# Heat maps of the combined surface and of its 2x2-averaged version.
plt.figure(figsize=(5, 5))
plt.imshow(a_pivot_all, cmap='hot', interpolation='nearest')
plt.colorbar()

plt.figure(figsize=(5, 5))
plt.imshow(a_pivot_interp, cmap='hot', interpolation='nearest')
plt.colorbar()

# Marginal means of the combined score: per sd factor, then per mean factor.
print(a_pivot_all.mean(axis=0))
print(a_pivot_all.mean(axis=1))

In [None]:
# Langoy Review
#
# Compares one run of the grid search with the camera-derived weights
# (original_weights) and with the ground-truth (GT) figures entered below:
# first on the mean weight, then on the share of fish per 1000-wide bucket.

selected_mean_factor = .96
selected_sd_factor = 1.07

exact_mean_factor = .96
exact_sd_factor = 1.07

# Find the selected run in the factor lists recorded alongside the simulations
# rather than re-typing the grid here: a float-step np.arange has an unreliable
# length, and a re-typed grid silently selects the wrong run as soon as it
# differs from the grid that was actually simulated.
matches = [
    i for i, (mf, sf) in enumerate(zip(mean_factors, sd_factors))
    if abs(mf - selected_mean_factor) < .001 and abs(sf - selected_sd_factor) < .001
]
if not matches:
    raise ValueError(
        'No simulation recorded for mean_factor=%s, sd_factor=%s'
        % (selected_mean_factor, selected_sd_factor)
    )
index = matches[-1]  # if the tolerance admits several runs, keep the last one

# Overlay the simulated and the observed weight histograms on shared bins.
_, bins, _ = plt.hist(sim_raw_weights[index], alpha = 0.5, density = True, color = 'red', bins = 30)
plt.hist(original_weights, alpha = 0.5, density = True, color = 'blue', bins = bins)

loss_factor = 0.16

# Undo the selected factors to estimate the moments before the modelled effect.
estimated_initial_weight = np.mean(original_weights) / exact_mean_factor
estimated_initial_sd = np.std(original_weights) / exact_sd_factor

# GT figure divided by (1 - loss_factor) so that it is on the same scale as the
# unadjusted weights it is compared with below.
gt_mean = 4628.51 / (1 - loss_factor)

# Mean of the back-calculated estimate and the simulated mean ('New' below).
blended_mean = 0.5 * (estimated_initial_weight + sim_avg_weights[index])

print('Sim', sim_avg_weights[index], sim_sds[index])
print('Orig', np.mean(original_weights), np.std(original_weights))
print('Estimate', estimated_initial_weight, estimated_initial_sd)
print('GT', gt_mean)

print()

# Absolute and relative deviation from the GT mean.
print('Estimate', '%0.2f, %0.2f%%' % (estimated_initial_weight - gt_mean, 100 * (estimated_initial_weight - gt_mean) / gt_mean))
print('Orig', '%0.2f, %0.2f%%' % (np.mean(original_weights) - gt_mean, 100 * (np.mean(original_weights) - gt_mean) / gt_mean))
print('New', '%0.2f, %0.2f%%' % (blended_mean - gt_mean, 100 * (blended_mean - gt_mean) / gt_mean))


print()

buckets = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
x_buckets = np.array(buckets[:-1])

# GT share of fish per bucket.
# (Alternative kept from an earlier revision:
#  0.0, 0.0365, .2114, .3232, .2538, .1254, .0399, .0087, .0010, 0.0001)
gt_pcts = [
    0.0,
    0.0104,
    0.0953,
    .2740,
    .2716,
    .1702,
    .1014,
    .0511,
    .0229,
    .0031
]
assert len(gt_pcts) == len(buckets) - 1, 'gt_pcts needs one entry per bucket'

# Loss-adjusted observed weights. Converting to an ndarray first keeps the
# scaling valid even if original_weights is a plain Python list.
d1 = np.array(original_weights) * (1 - loss_factor)

# Evaluation grid for the fitted densities: one point per unit from 0 to 9999.
sim_vec = np.arange(start=0, stop = 10000, step = 1)

# pdf_mvsk is defined outside this cell; presumably it takes
# [mean, variance, skew, kurtosis] - TODO confirm.
orig_pdf = pdf_mvsk([np.mean(d1), np.std(d1) ** 2, 0, 1])
orig_dist = orig_pdf(sim_vec)

# NOTE(review): the sd is scaled by sqrt(1 - loss_factor) here and in avg_pdf,
# whereas the means (and the sd inside orig_pdf) are scaled by
# (1 - loss_factor) - confirm this is intended.
sim_pdf = pdf_mvsk([estimated_initial_weight * (1 - loss_factor), (estimated_initial_sd  * np.sqrt(1 - loss_factor)) ** 2, 0, 1])
sim_dist = sim_pdf(sim_vec)

avg_pdf = pdf_mvsk([blended_mean * (1 - loss_factor), (0.5 * (estimated_initial_sd  * np.sqrt(1 - loss_factor) + sim_sds[index] * np.sqrt(1 - loss_factor))) ** 2, 0, 1])
avg_dist = avg_pdf(sim_vec)

# Normalising constants for the three gridded densities (loop-invariant).
orig_total = np.sum(orig_dist)
sim_total = np.sum(sim_dist)
avg_total = np.sum(avg_dist)

pcts0 = []  # raw share of loss-adjusted observed weights per bucket
pcts1 = []  # share under the density fitted to the observed weights
pcts2 = []  # share under the density built from the back-calculated estimate
pcts3 = []  # share under the blended (estimate + simulation) density

for i in range(len(buckets) - 1):
    lower, upper = buckets[i], buckets[i + 1]

    mask0 = (d1 > lower) & (d1 <= upper)
    pct0 = np.sum(mask0) / len(mask0)
    pcts0.append(pct0)

    # The same grid selection serves all three densities.
    in_bucket = (sim_vec > lower) & (sim_vec <= upper)

    pct1 = np.sum(orig_dist[in_bucket]) / orig_total
    pcts1.append(pct1)

    pct2 = np.sum(sim_dist[in_bucket]) / sim_total
    pcts2.append(pct2)

    pct3 = np.sum(avg_dist[in_bucket]) / avg_total
    pcts3.append(pct3)

    # Deviation from GT in percentage points: raw, fitted, estimate, blended.
    gt_pct = gt_pcts[i]
    print('%i: %0.2f%% vs %0.2f%% vs %0.2f%% vs %0.2f%%' % (buckets[i], 100 * (pct0 - gt_pct), 100 * (pct1 - gt_pct), 100 * (pct2 - gt_pct), 100 * (pct3 - gt_pct)))

# NOTE(review): the 'Original' bars show the fitted shares (pcts1); the raw
# shares (pcts0) are printed above but not plotted.
plt.figure(figsize=(20, 10))
plt.bar(x_buckets - 150, pcts1, color = 'red', width = 150, label = 'Original')
plt.bar(x_buckets, gt_pcts, color = 'green', width = 150, label = 'GT')
plt.bar(x_buckets + 150, pcts2, color = 'blue', width = 150, label = 'Sim')
plt.bar(x_buckets + 300, pcts3, color = 'purple', width = 150, label = 'Orig + Sim Avg')
plt.legend()

In [None]:
# Bolaks: grid search over (mean_factor, sd_factor).

cohort_name = 'bolaks_pen_id_88_2020-02-10_2020-03-10'
start_hour = 7
end_hour = 15
apply_growth_rate = True
max_day_diff = 3
final_days_post_feeding = 1
max_final_days_post_feeding = 1
loss_factor = 0.17

gt_metadata = gt_metadatas[cohort_name]

sampling_filter = SamplingFilter(
    start_hour=start_hour,
    end_hour=end_hour,
    kf_cutoff=0.0,
    akpd_score_cutoff=0.01
)
df = dfs[cohort_name]

# Keep the 14 days that end final_days_post_feeding days after the last feeding date.
final_date_post_feeding = add_days(gt_metadata['last_feeding_date'], final_days_post_feeding)
tdf = df[(df.date <= final_date_post_feeding) & (df.date >= add_days(final_date_post_feeding, -14))]
pm_base = gen_pm_base(tdf, sampling_filter)

# NOTE(review): days_post_feeding is not assigned in this cell, so the call
# relies on a value left over from an earlier cell - confirm which value is meant.
original_weights = generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)

# Per-run records, appended in grid order; later cells index into them.
mean_factors = []
sd_factors = []

sim_avg_weights = []
sim_sds = []
sim_len = []
sim_raw_weights = []

weight_errors = []
sd_errors = []
max_dist_errors = []
avg_dist_errors = []

density = 5
max_iter = 10000

# Explicit grids with inclusive end points. The earlier float-step
# np.arange(0.95, .98, .01) / np.arange(1, 1.08, .01) only reached 0.98 and 1.08
# through floating-point rounding of the length computation; np.linspace states
# the same 4 x 9 grid without relying on that.
mean_factor_grid = np.linspace(0.95, 0.98, 4)  # 0.95, 0.96, 0.97, 0.98
sd_factor_grid = np.linspace(1.0, 1.08, 9)     # 1.00, 1.01, ..., 1.08

for mean_factor in mean_factor_grid:
    for sd_factor in sd_factor_grid:
        simulated_weights = simulate_with_params(mean_factor, sd_factor, density, max_iter)
        weight_error, sd_error, max_dist_error, avg_dist_error = get_error(simulated_weights, original_weights, False)
        print('weight_error: %0.2f, sd_error: %0.2f, max_dist_error: %0.2f, avg_dist_error: %0.2f' % (weight_error, sd_error, max_dist_error, avg_dist_error))

        mean_factors.append(mean_factor)
        sd_factors.append(sd_factor)

        sim_avg_weights.append(np.mean(simulated_weights))
        sim_raw_weights.append(simulated_weights)
        sim_sds.append(np.std(simulated_weights))
        sim_len.append(len(simulated_weights))

        weight_errors.append(weight_error)
        sd_errors.append(sd_error)
        max_dist_errors.append(max_dist_error)
        avg_dist_errors.append(avg_dist_error)

In [None]:
# Probit check of a ground-truth bucket distribution: the inverse normal CDF of
# the cumulative bucket shares is plotted against the bucket positions and a
# straight line is fitted through the points.
# Exactly one gt_pcts list is active; the alternatives stay as comments.

# Vikane
# gt_pcts = [0.0, 0.012362459546925567, 0.05272923408845739, 0.31016181229773465, 0.4608414239482201,
#            0.14977346278317152, 0.013398058252427184, 0.0006040992448759439, 0.00012944983818770226, 0.0]

# Aplavika
# gt_pcts = [0.0, 0.0, 0.0036, 0.1060, 0.3990, 0.3576, .1147, .0180, .0011, 0.0]

# Tittelsnes
# gt_pcts = [0.0, 0.0365, .2114, .3232, .2538, .1254, .0399, .0087, .0010, 0.0001]

# Eldviktaren
gt_pcts = [0.0, 0.0062, .2281, .6490, .1143, .0023, .0001, 0, 0, 0]

a = np.cumsum(gt_pcts)
# Keep only buckets whose cumulative share is strictly between 0 and 0.999:
# a share of 0 has a probit of -inf, and the saturated upper tail is dropped.
mask = (a > 0) & (a < 0.999)

# NOTE(review): x_buckets is not defined in this cell; the review cells set it
# to the lower bucket edges, while `a` accumulates each bucket's share up to and
# including that bucket - confirm that pairing the two is intended.
bucket_positions = x_buckets[mask]
probit_values = stats.norm.ppf(a[mask])

plt.figure(figsize=(10, 10))

plt.scatter(bucket_positions, probit_values)

# Ordinary least squares with an intercept column added to the positions.
X = sm.add_constant(bucket_positions)
model = sm.OLS(probit_values, X)
results = model.fit()

plt.plot(bucket_positions, results.predict(X), color = 'red')


In [None]:
# Display the inverse normal CDF of the cumulative GT shares selected by `mask`.
stats.norm.ppf(a[mask])

In [None]:
# Display which buckets have a cumulative GT share strictly between 0 and 0.999.
mask