In [None]:
from typing import List
from collections import defaultdict
import numpy as np
from filter_optimization.filter_optimization_task import generate_sampling_filters, extract_biomass_data, \
    NoDataException, SamplingFilter, generate_pm_base, PopulationMetricsBase, generate_filter_mask, get_dates_in_range, \
    find_optimal_filter, gen_pm_base
from population_metrics.population_metrics_base import generate_pm_base, PopulationMetricsBase
from population_metrics.confidence_metrics import compute_biomass_kpi, generate_distribution_consistency
from population_metrics.smart_metrics import generate_smart_avg_weight, generate_smart_individual_values
from population_metrics.raw_metrics import get_raw_sample_size

import pandas as pd

In [None]:
# Analysis inputs: all four values are passed to extract_biomass_data in the next cell.
pen_id = 167
start_date = '2020-10-18'
end_date = '2020-10-25'
akpd_score_cutoff = 0.99

In [None]:
# Load the biomass data-frame for the configured pen, date window and AKPD score cutoff.
df = extract_biomass_data(pen_id, start_date, end_date, akpd_score_cutoff)
# Candidate filter parameters: one start/end hour pair and a sweep of KF cutoffs.
start_hours = [0]
end_hours = [24]
# NOTE(review): np.arange with a non-integer step is subject to floating-point rounding,
# so the cutoffs are only approximately multiples of 0.005 — consider np.linspace.
kf_cutoffs = np.arange(0.8, 1.5, 0.005)
# For a quick single-cutoff run, use this instead of the sweep:
# kf_cutoffs = [1.25]
sampling_filters = generate_sampling_filters(start_hours, end_hours, kf_cutoffs)

In [None]:
def generate_metrics_for_pm_base(pm_base: PopulationMetricsBase, dates: List[str]) -> tuple:
    """Computes the sample-size-weighted mean KPI and the per-date smart average weights.

    The KPI used for each date is ``generate_distribution_consistency``.

    Args:
        pm_base: PopulationMetricsBase instance the metrics are computed from.
        dates: dates to evaluate; each one is passed unchanged to the metric helpers.

    Returns:
        A ``(mean_kpi, weights)`` tuple. ``mean_kpi`` is the per-date KPI averaged with
        the raw sample sizes as weights; ``weights`` is the list of smart average weights,
        one per entry of ``dates`` and in the same order.
    """

    kpis, sample_sizes, weights = [], [], []
    for date in dates:
        sample_sizes.append(get_raw_sample_size(pm_base, date))
        kpis.append(generate_distribution_consistency(pm_base, date))
        weights.append(generate_smart_avg_weight(pm_base, date))

    # Falsy entries (None or 0) are mapped to NaN so that nansum skips them.
    kpis = np.array([k if k else np.nan for k in kpis])
    sample_sizes = np.array([s if s else np.nan for s in sample_sizes])

    # NOTE: the denominator sums every valid sample size, including dates whose KPI is
    # missing, so such dates pull the weighted mean towards zero.
    mean_kpi = np.nansum(kpis * sample_sizes) / np.nansum(sample_sizes)
    return mean_kpi, weights


In [None]:
def find_optimal_filter(df: pd.DataFrame, sampling_filters: List[SamplingFilter]) -> pd.DataFrame:
    """Evaluates every sampling filter and returns the per-filter metrics as a data-frame.

    This notebook-local definition shadows the ``find_optimal_filter`` imported from
    ``filter_optimization.filter_optimization_task``. It returns the full analysis table
    rather than a single best filter.

    Args:
        df: data-frame of raw biomass computations; must have a ``date`` column.
        sampling_filters: candidate filters to evaluate.

    Returns:
        Data-frame with one row per sampling filter and the columns ``mean_kpi``,
        ``start_hour``, ``end_hour``, ``kf_cutoff``, ``final_weight`` (smart average weight
        on the last date) and ``akpd_score_cutoff``. ``mean_kpi`` and ``final_weight`` are
        ``None`` for filters where ``gen_pm_base`` returns nothing. Sort by ``mean_kpi``
        descending to find the best filter.
    """

    analysis_data = defaultdict(list)
    for sampling_filter in sampling_filters:
        print('Start hour: {}, End hour: {}, KF cutoff: {}'.format(
            sampling_filter.start_hour, sampling_filter.end_hour, sampling_filter.kf_cutoff
        ))
        pm_base = gen_pm_base(df, sampling_filter)

        # Reset the per-filter results on every iteration: without this, a filter with no
        # pm_base either raised NameError (first iteration) or silently recorded the
        # previous filter's final weight.
        mean_kpi, final_weight = None, None
        if pm_base:
            unique_dates = sorted(df.date.unique().tolist())
            dates = get_dates_in_range(unique_dates[0], unique_dates[-1])
            mean_kpi, weights = generate_metrics_for_pm_base(pm_base, dates)
            final_weight = weights[-1] if weights else None

        # add to data
        analysis_data['mean_kpi'].append(mean_kpi)
        analysis_data['start_hour'].append(sampling_filter.start_hour)
        analysis_data['end_hour'].append(sampling_filter.end_hour)
        analysis_data['kf_cutoff'].append(sampling_filter.kf_cutoff)
        analysis_data['final_weight'].append(final_weight)
        analysis_data['akpd_score_cutoff'].append(sampling_filter.akpd_score_cutoff)

    return pd.DataFrame(analysis_data)

In [None]:
# The notebook-local find_optimal_filter returns only the per-filter analysis data-frame.
analysis_df = find_optimal_filter(df, sampling_filters)

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Sample-size-weighted KPI as a function of the KF cutoff.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(analysis_df['kf_cutoff'], analysis_df['mean_kpi'])

ax.set_xlabel('KF Cutoff')
ax.set_ylabel('Sample size weighted KPI')
ax.grid()
plt.show()

In [None]:
# Smart average weight on the last date as a function of the KF cutoff.
plt.figure(figsize=(10, 5))
plt.plot(analysis_df.kf_cutoff, analysis_df.final_weight)
plt.xlabel('KF Cutoff')
# This panel plots final_weight, so it must not reuse the KPI label of the previous plot.
plt.ylabel('Final smart average weight')
plt.grid()
plt.show()

In [None]:
# NOTE(review): this sorted frame is neither assigned nor the cell's last expression,
# so it is not displayed and not kept.
analysis_df.sort_values('mean_kpi', ascending=False)

# Individual smart values for two filters that differ only in kf_cutoff (0.0 vs 1.3).
# NOTE(review): '2020-08-30' lies outside the start_date/end_date window configured at the
# top of the notebook, and the positional arguments (3, True, True, 0.9) are undocumented
# here — confirm both against generate_smart_individual_values.
pm_base = gen_pm_base(df, SamplingFilter(start_hour=0, end_hour=24, kf_cutoff=0.0, akpd_score_cutoff=0.99))
weights, kfs = generate_smart_individual_values(pm_base, '2020-08-30', 3, True, True, 0.9)
pm_base = gen_pm_base(df, SamplingFilter(start_hour=0, end_hour=24, kf_cutoff=1.3, akpd_score_cutoff=0.99))
weights_2, kfs_2 = generate_smart_individual_values(pm_base, '2020-08-30', 3, True, True, 0.9)

In [None]:
# Histogram of the individual smart weights from the kf_cutoff=0.0 run
# (weights_2, the kf_cutoff=1.3 run, is not drawn).
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(weights, bins=50)
ax.grid()
plt.show()

In [None]:
# NOTE(review): unexplained scratch arithmetic (evaluates to roughly 0.0166); the meaning
# of 5794, 0.84 and 4949 is not recorded in the notebook — name them or drop the cell.
1 - 5794*.84 / 4949

In [None]:
# `mask` flags rows with hour >= 19 or hour <= 3; the histogram shows the estimated
# K factor of every row that is NOT flagged.
mask = (df.hour <= 3) | (df.hour >= 19)
k_factors_unmasked = df.loc[~mask, 'estimated_k_factor'].values
plt.figure(figsize=(20, 10))
plt.hist(k_factors_unmasked, bins=20, color='red', alpha=0.7)
plt.show()

In [None]:
# Filter with start_hour=4, end_hour=18 and kf_cutoff=0.0; pm_base is rebuilt from df with
# it, replacing the pm_base created in earlier cells.
sampling_filter = SamplingFilter(start_hour=4, end_hour=18, kf_cutoff=0.0, akpd_score_cutoff=0.99)
pm_base = gen_pm_base(df, sampling_filter)

In [None]:
# Smart average weight for a single date using the current pm_base.
# NOTE(review): '2020-08-26' is outside the start_date/end_date window set at the top of
# the notebook — confirm the intended date.
generate_smart_avg_weight(pm_base, '2020-08-26')

In [None]:
import random
import numpy as np

In [None]:
# Synthetic samples: normally distributed lengths, speeds as a noisy multiple of length,
# and independently drawn depths (1000 depths versus 10000 lengths/speeds).
lengths = np.random.normal(loc=60, scale=30, size=10000)
speed_ratios = np.random.normal(loc=1.0, scale=0.2, size=10000)
speeds = lengths * speed_ratios
depths = np.random.normal(loc=120, scale=50, size=1000)

In [None]:
# Resample the lengths: each length is repeated int(100 * p) times, where the factor p
# is computed from the sample's depth, length and speed and clamped to [0.1, 1].
adj_lengths = []
# zip stops at the shortest array, so the number of samples follows the data instead of
# a hard-coded range(1000) that had to be kept in sync with the size of `depths` by hand.
for length, speed, depth in zip(lengths, speeds, depths):
    p = max(min((1.04 * depth - length) / speed, 1), 0.1)
    adj_lengths.extend([length] * int(100 * p))

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Setup
rng = np.random.RandomState(0)  # Seed RNG for replicability
n = 100  # Number of samples to draw

# Generate data
# Sample 1: the first n simulated lengths. x must hold exactly n values because
# plt.scatter needs x and y of equal size.
x = lengths[:n]
y = rng.standard_t(df=5, size=n)  # Sample 2: Y ~ t(5)

# Quantile-quantile plot: with equal sample sizes, the sorted values are the matching
# empirical quantiles of the two samples.
plt.figure()
plt.scatter(np.sort(x), np.sort(y))
plt.xlabel('X')
plt.ylabel('Y')
plt.show()
plt.close()

In [None]:
# Distribution of the raw simulated lengths.
plt.hist(lengths)

In [None]:
# Distribution of the resampled (adjusted) lengths.
plt.hist(adj_lengths)

In [None]:
import numpy as np
import pylab

In [None]:
# Two-sample QQ plot: raw simulated lengths (test1) against the adjusted lengths (test2).
# np.array copies its input, so the in-place sorts below leave the originals untouched.
test1 = np.array(lengths)
test2 = np.array(adj_lengths)

# Empirical quantile levels of each sorted sample
test1.sort()
quantile_levels1 = np.arange(len(test1), dtype=float) / len(test1)

test2.sort()
quantile_levels2 = np.arange(len(test2), dtype=float) / len(test2)

# Evaluate both samples at the quantile levels of the SMALLER one: its sorted values are
# used as-is and the larger sample is linearly interpolated onto those levels. The choice
# is made from the actual sizes, because adj_lengths is normally the larger sample here.
if len(test2) <= len(test1):
    quantile_levels = quantile_levels2
    quantiles2 = test2
    quantiles1 = np.interp(quantile_levels, quantile_levels1, test1)
else:
    quantile_levels = quantile_levels1
    quantiles1 = test1
    quantiles2 = np.interp(quantile_levels, quantile_levels2, test2)

# Plot the quantiles to create the qq plot
pylab.plot(quantiles1, quantiles2)
pylab.xlabel('lengths quantiles')
pylab.ylabel('adj_lengths quantiles')

# Add a y = x reference line spanning the range of both samples
maxval = max(test1[-1], test2[-1])
minval = min(test1[0], test2[0])
pylab.plot([minval, maxval], [minval, maxval], 'k-')

In [None]:
# NOTE(review): stray scratch cell — the bare literal only echoes 4 and can be removed.
4