In [None]:
from collections import defaultdict
import json
import os
import numpy as np
import pandas as pd
from datetime import datetime
from research.utils.data_access_utils import RDSAccessUtils
from sklearn.linear_model import LinearRegression
from research.utils.datetime_utils import day_difference, add_days
from research.utils.datetime_utils import get_dates_in_range

import matplotlib.pyplot as plt
from scipy.stats import norm
from research.weight_estimation.keypoint_utils.optics import euclidean_distance, pixel2world, depth_from_disp, convert_to_world_point

# strptime/strftime pattern for 'YYYY-MM-DD' calendar dates.
DATE_FORMAT = '%Y-%m-%d'

# The DATA_WAREHOUSE_SQL_CREDENTIALS environment variable names a JSON file
# holding the warehouse credentials. Reading it inside a context manager
# closes the file handle instead of leaking it for the life of the kernel.
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))

In [None]:
%%javascript
// Replace the output area's `_should_scroll` hook with one that always
// returns false, regardless of the line count it is given.
// Presumably this stops long outputs from collapsing into a scroll box —
// TODO confirm against the notebook front-end version in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
class PopulationMetricsEstimator(object):
    """Turn per-fish biomass computations into daily and smoothed population metrics.

    ``biomass_computations`` is an iterable of ``(date, weight, k_factor, depth)``
    tuples. Dates must be mutually orderable and hashable (the notebook passes
    'YYYY-MM-DD' strings). Weights are presumably grams, given the 1e-3 scaling
    used to build kg buckets in ``generate_smart_metrics_on_date`` — TODO confirm.

    On construction the records are sorted by date and summarised into parallel
    lists: ``unique_dates``, ``average_weights``, ``sample_sizes`` and
    ``unique_dates_nr`` (day offset of each date from the first one).
    """

    def __init__(self, biomass_computations, use_kernel_weight_algo = False, kernel_details=(50, 100, 0.1)):
        """Store the inputs and pre-compute the per-day summaries.

        Args:
            biomass_computations: iterable of (date, weight, k_factor, depth).
            use_kernel_weight_algo: when True, daily averages are weighted by a
                depth kernel density instead of being a plain mean.
            kernel_details: (number of weight-histogram bins, number of points
                on the depth grid, kernel bandwidth).
        """
        self.biomass_computations = biomass_computations
        self.use_kernel_weight_algo = use_kernel_weight_algo
        self.kernel_bins = kernel_details[0]
        self.kernel_split = kernel_details[1]
        self.kernel_sd = kernel_details[2]
        self.bcs_by_date = defaultdict(list)   # date -> [(weight, k_factor), ...]
        self.unique_dates_nr = []
        self.unique_dates = []
        self.average_weights = []
        self.sample_sizes = []
        self.prepare_data()

    def prepare_data(self):
        """Sort the records by date (first tuple element) and build the daily summaries."""
        self.biomass_computations = sorted(self.biomass_computations, key=lambda x: x[0])
        self.generate_daily_values()

    @staticmethod
    def _mean_ignoring_none(values):
        """Mean of the non-None entries, or NaN (without a numpy warning) if there are none."""
        present = [value for value in values if value is not None]
        return np.mean(present) if present else np.nan

    def get_average_weight_on_date(self, weights, depths):
        """Return the average weight for one day's samples.

        With ``use_kernel_weight_algo`` off this is ``np.mean(weights)``.
        Otherwise the weights are split into ``kernel_bins`` histogram bins; for
        each bin a Gaussian kernel density of the bin's depths is evaluated on
        ``kernel_split`` grid points spanning [0, 2], and the bin's mean weight
        is weighted by the peak of that density.
        """
        if not self.use_kernel_weight_algo:
            return np.mean(weights)

        samples = pd.DataFrame(list(zip(weights, depths)), columns=['weight', 'depth'])

        # np.histogram yields the same edges plt.hist would, without drawing a
        # figure as a side effect of a pure computation.
        counts, edges = np.histogram(weights, bins=self.kernel_bins)
        depth_grid = np.linspace(0, 2, self.kernel_split)

        bin_mean_weights = []
        bin_peak_densities = []
        last_bin = len(counts) - 1
        for index in range(len(counts)):
            lower, upper = edges[index], edges[index + 1]
            if index == last_bin:
                # The final histogram bin is closed on the right; a strict '<'
                # here would silently drop the heaviest sample of the day.
                in_bin = (samples['weight'] >= lower) & (samples['weight'] <= upper)
            else:
                in_bin = (samples['weight'] >= lower) & (samples['weight'] < upper)

            bin_depths = samples[in_bin]['depth'].values
            if len(bin_depths) == 0:
                # An empty bin contributes nothing (zero weight, zero density).
                bin_mean_weights.append(0.0)
                bin_peak_densities.append(0.0)
                continue

            density = sum(norm.pdf((depth_grid - xi) / self.kernel_sd) for xi in bin_depths)
            bin_mean_weights.append(np.mean(samples[in_bin]['weight']))
            bin_peak_densities.append(np.max(density))

        bin_peak_densities = np.array(bin_peak_densities)
        bin_mean_weights = np.nan_to_num(np.array(bin_mean_weights))
        total_density = np.sum(bin_peak_densities)
        if not total_density > 0:
            # No usable kernel mass (e.g. every depth far outside the grid):
            # fall back to the plain mean rather than dividing by zero.
            return np.mean(weights)

        return np.sum(bin_peak_densities * bin_mean_weights) / total_density

    def _close_day(self, date, weights_for_date, depths_for_date):
        """Append one finished day's date, average weight and sample size to the summaries."""
        self.unique_dates.append(date)
        self.average_weights.append(self.get_average_weight_on_date(weights_for_date, depths_for_date))
        self.sample_sizes.append(len(weights_for_date))

    def generate_daily_values(self):
        """Walk the date-sorted records once, filling ``bcs_by_date`` and the per-day lists.

        Does nothing when there are no records (previously an IndexError).
        """
        if len(self.biomass_computations) == 0:
            return

        weights_for_date, depths_for_date = [], []
        curr_date = self.biomass_computations[0][0]
        for date, weight, kf, depth in self.biomass_computations:
            self.bcs_by_date[date].append((weight, kf))
            if date != curr_date:
                # Records are sorted, so a new date means the previous day is complete.
                self._close_day(curr_date, weights_for_date, depths_for_date)
                weights_for_date, depths_for_date = [], []
                curr_date = date
            weights_for_date.append(weight)
            depths_for_date.append(depth)

        self._close_day(curr_date, weights_for_date, depths_for_date)
        self.unique_dates_nr = [day_difference(date, self.unique_dates[0]) for date in self.unique_dates]

    def generate_raw_daily_metrics_on_date(self, date):
        """Return ``(average_weight, sample_size)`` for ``date``, or ``(None, None)`` if absent."""
        if date in self.unique_dates:
            idx = self.unique_dates.index(date)
            return self.average_weights[idx], self.sample_sizes[idx]
        return None, None

    def generate_raw_weights_kfs_on_date(self, date):
        """Return ``(weights, k_factors)`` lists for ``date``; two empty lists if there is no data.

        Uses ``.get`` so that probing a missing date does not insert an empty
        entry into the ``bcs_by_date`` defaultdict.
        """
        records = self.bcs_by_date.get(date)
        if not records:
            return [], []
        weights, kfs = [list(column) for column in zip(*records)]
        return weights, kfs

    def get_start_end_idx(self, start_date, end_date):
        """Return the half-open index range of ``unique_dates`` inside [start_date, end_date].

        Returns ``(-1, -1)`` when there are no dates at all or the window lies
        entirely before/after the data.
        """
        if not self.unique_dates:
            return -1, -1
        if start_date > self.unique_dates[-1] or end_date < self.unique_dates[0]:
            return -1, -1
        start_idx = [idx for idx, date in enumerate(self.unique_dates) if date >= start_date][0]
        end_idx = [idx for idx, date in enumerate(self.unique_dates) if date <= end_date][-1] + 1
        return start_idx, end_idx

    def compute_growth_rate(self, date, start_date, end_date, decay=0.1):
        """Fit a weighted log-linear growth model over [start_date, end_date].

        Regresses log(daily average weight) on the day offset from ``date``.
        Each day is weighted by its sample size times ``exp(-decay * |offset|)``.

        Returns:
            ``(growth_rate, error_magnitude_pct)`` — the fitted slope and the
            weighted RMS relative error of the fit — or ``(None, None)`` when
            the window holds no data or fewer than 4 days of data.
        """
        known_dates = set(self.unique_dates)
        if not any(d in known_dates for d in get_dates_in_range(start_date, end_date)):
            return None, None
        start_idx, end_idx = self.get_start_end_idx(start_date, end_date)
        X = np.array([day_difference(d, date) for d in self.unique_dates[start_idx:end_idx]]).reshape(-1, 1)
        y = np.log(np.array(self.average_weights[start_idx:end_idx]))
        n = np.array(self.sample_sizes[start_idx:end_idx])

        if X.shape[0] < 4:
            return None, None

        sample_weights = np.multiply(n, np.exp(-decay * np.abs(X.squeeze())))
        reg = LinearRegression().fit(X, y, sample_weight=sample_weights)
        growth_rate = reg.coef_[0]
        y_pred = reg.predict(X)

        error_magnitude_pct = np.average(((np.exp(y) - np.exp(y_pred)) / np.exp(y_pred))**2,
                                         weights=sample_weights)**0.5
        return growth_rate, error_magnitude_pct

    def compute_local_growth_rate(self, date, incorporate_future, window=7):
        """Growth rate around ``date`` over a ``window``-day span.

        With ``incorporate_future`` the span ends at the latest data date that
        is at most ``window // 2`` days after ``date``; otherwise it ends at
        ``date`` itself. Returns whatever ``compute_growth_rate`` returns, or
        ``(None, None)`` when no data falls in the candidate span.
        """
        if incorporate_future:
            start, end = add_days(date, -window), add_days(date, window // 2)
            candidate_dates = set(get_dates_in_range(start, end))
            if not any(d in candidate_dates for d in self.unique_dates):
                return None, None
            day_diffs = np.array([day_difference(d, date) for d in self.unique_dates])
            end_idx = np.where(day_diffs <= window // 2)[0][-1]
            end_date = self.unique_dates[end_idx]
            start_date = add_days(end_date, -window)
        else:
            start_date, end_date = add_days(date, -window), date
        growth_rate, error_magnitude_pct = self.compute_growth_rate(date, start_date, end_date)
        return growth_rate, error_magnitude_pct

    def generate_historical_weights(self, date, window=7):
        """Collect all raw weights from the ``window`` days before ``date`` (``date`` excluded)."""
        first_day, yesterday = add_days(date, -window), add_days(date, -1)
        historical_weights = []
        for curr_date in get_dates_in_range(first_day, yesterday):
            weights, _ = self.generate_raw_weights_kfs_on_date(curr_date)
            historical_weights.extend(weights)

        return historical_weights

    def generate_distribution_consistency(self, date, window=7):
        """Score how closely the weight distribution on ``date`` matches the preceding window.

        The historical weights are shifted so their mean matches the day's
        mean, then percentiles 1..98 of both samples are compared. The score is
        ``1 - 10 * rms_percentile_difference / 10000``. Returns ``None`` when
        either the day or the history has no samples.
        """
        raw_weights, _ = self.generate_raw_weights_kfs_on_date(date)
        historical_weights = self.generate_historical_weights(date, window=window)
        if not raw_weights or not historical_weights:
            return None
        raw_weights = np.array(raw_weights)
        historical_weights = np.array(historical_weights)
        mean_adjustment = np.mean(raw_weights) - np.mean(historical_weights)
        percentiles = list(range(100))
        x = np.percentile(historical_weights + mean_adjustment, percentiles)
        y = np.percentile(raw_weights, percentiles)
        distribution_confidence = 1.0 - 10.0 * (np.mean(np.abs(y[1:99] - x[1:99]) ** 2) ** 0.5 / 10000.0)
        return distribution_confidence

    def generate_smart_metrics_on_date(self, date, max_day_difference=3, bucket_size=100, incorporate_future=True,
                                       apply_growth_rate=True):
        """Compute raw and growth-adjusted ("smart") metrics for ``date``.

        Samples from up to ``max_day_difference`` days before ``date`` (and
        after it, when ``incorporate_future``) are pooled. If a trustworthy
        local growth rate is available, each day's weights are projected onto
        ``date`` before pooling.

        Args:
            date: the day to report on.
            max_day_difference: half-width of the pooling window in days.
            bucket_size: histogram bucket width, in the same unit as the weights.
            incorporate_future: whether days after ``date`` may be used.
            apply_growth_rate: whether to apply the growth adjustment at all.

        Returns:
            A dict of metrics, or ``{}`` when no data falls inside the window.
        """
        # compute metrics on this date
        distribution_consistency = self.generate_distribution_consistency(date, window=7)
        raw_average_weight, raw_sample_size = self.generate_raw_daily_metrics_on_date(date)
        _, raw_kfs = self.generate_raw_weights_kfs_on_date(date)
        raw_average_kf = self._mean_ignoring_none(raw_kfs)

        # compute local growth rate
        growth_rate, error_magnitude_pct = self.compute_local_growth_rate(date, incorporate_future)

        # compute smart average
        look_ahead = max_day_difference if incorporate_future else 0
        start_date, end_date = add_days(date, -max_day_difference), add_days(date, look_ahead)
        window_dates = set(get_dates_in_range(start_date, end_date))
        if not any(d in window_dates for d in self.unique_dates):
            return {}
        start_idx, end_idx = self.get_start_end_idx(start_date, end_date)

        # Only trust the fitted growth rate when the fit is tight and the rate is small.
        if growth_rate and apply_growth_rate and raw_sample_size and error_magnitude_pct < 0.02 and \
                abs(growth_rate) < 0.02:
            growth_rate_for_smart_avg = growth_rate
        else:
            growth_rate_for_smart_avg = 0.0

        days_in_window = self.unique_dates[start_idx:end_idx]
        x = np.array([day_difference(d, date) for d in days_in_window])
        y = np.array(self.average_weights[start_idx:end_idx])
        n = np.array(self.sample_sizes[start_idx:end_idx])
        sample_size = int(np.sum(n))
        smart_average = np.sum(np.exp(-x * growth_rate_for_smart_avg) * y * n) / sample_size

        # compute smart distribution
        adj_weights, kfs = [], []
        for day_idx, day in enumerate(days_in_window):
            weights_for_date, kfs_for_date = self.generate_raw_weights_kfs_on_date(day)
            # Same exp(-rate * offset) projection as smart_average above. The
            # previous '+' sign pushed samples away from `date`, so the
            # distribution's mean disagreed with the reported smart average.
            adj_weights_for_date = np.array(weights_for_date) * np.exp(-growth_rate_for_smart_avg * x[day_idx])
            adj_weights.extend(adj_weights_for_date)
            kfs.extend(kfs_for_date)

        assert len(adj_weights) == np.sum(n), 'Inconsistent sample sizes!'

        adj_weights, kfs = np.array(adj_weights), np.array(kfs)
        smart_distribution = dict()

        bucket_size_kg = 1e-3 * bucket_size
        # floor(max / bucket_size) + 1 buckets, so a maximum that lands exactly
        # on a bucket boundary still gets a bucket (np.arange's exclusive stop
        # used to leave it uncounted).
        n_buckets = int(np.max(adj_weights) // bucket_size) + 1
        buckets = [round(i * bucket_size_kg, 2) for i in range(n_buckets)]
        for b in buckets:
            low, high = 1e3 * b, 1e3 * (b + bucket_size_kg)
            in_bucket = (adj_weights >= low) & (adj_weights < high)
            count = adj_weights[in_bucket].shape[0]
            # Falsy k-factors (None or 0) are treated as missing here.
            kfs_for_bucket = [kf if kf else np.nan for kf in kfs[in_bucket]]
            mean_kf = np.mean(kfs_for_bucket) if count > 0 else np.nan
            smart_distribution[str(b)] = {
                'count': count,
                'avgKFactor': None if np.isnan(mean_kf) else mean_kf
            }

        metrics = dict(
            raw_average_weight=raw_average_weight,
            raw_average_kf=raw_average_kf,
            raw_sample_size=raw_sample_size,
            smart_average_weight=smart_average,
            smart_average_kf=self._mean_ignoring_none(kfs),
            smart_distribution=smart_distribution,
            smart_sample_size=sample_size,
            adj_weights=adj_weights,
            kfs=kfs,
            growth_rate=growth_rate,
            error_magnitude_pct=error_magnitude_pct,
            distribution_consistency=distribution_consistency
        )

        return metrics

In [None]:
# In-memory cache of query results, starting out empty.
queryCache = dict()

In [None]:
# Pen and capture window to analyse.
pen_id = 60
start_date = '2020-04-25'
end_date = '2020-05-25'
# Alternative configuration:
# pen_id = 66
# start_date = '2020-05-20'
# end_date = '2020-06-10'

# Lower bound applied to biomass_computations.akpd_score in the query below.
# It used to exist only as a commented-out assignment, so formatting the query
# raised NameError on a fresh kernel. The value 0.99 is taken from that
# commented-out line — TODO confirm it is the intended threshold for this pen.
akpd_filter = 0.99

query = """
    SELECT * FROM (
      (SELECT * FROM prod.crop_annotation cas
      INNER JOIN prod.annotation_state pas on pas.id=cas.annotation_state_id
      WHERE cas.service_id = (SELECT ID FROM prod.service where name='BATI')
      AND cas.annotation_state_id = 3
      AND cas.pen_id=%i) a
    RIGHT JOIN 
      (SELECT left_crop_url, estimated_weight_g, akpd_score FROM prod.biomass_computations
      WHERE prod.biomass_computations.captured_at >= '%s'
      AND prod.biomass_computations.captured_at <= '%s'
      AND prod.biomass_computations.akpd_score > %0.4f) bc 
    ON 
      (a.left_crop_url=bc.left_crop_url)
    ) x
    WHERE x.captured_at >= '%s'
    AND x.captured_at <= '%s'
    AND x.pen_id = %i
    AND x.group_id = '%i';
""" % (pen_id, start_date, end_date, akpd_filter, start_date, end_date, pen_id, pen_id)

# The formatted SQL text is the cache key, so re-running this cell with the
# same parameters reuses the stored frame instead of querying again.
if query in queryCache:
    df = queryCache[query].copy()
else:
    df = rds_access_utils.extract_from_database(query)

    # Per-row depth: median of component [1] over all world keypoints returned
    # by pixel2world. Presumably index 1 is the camera-to-fish axis — TODO confirm.
    depths = []
    for idx, row in df.iterrows():
        ann, cm = row.annotation, row.camera_metadata
        wkps = pixel2world(ann['leftCrop'], ann['rightCrop'], cm)
        depths.append(np.median([wkp[1] for wkp in wkps.values()]))
    df['depth'] = depths

    # Index by capture time and derive the calendar date (as a string) and hour.
    df = df.sort_values('captured_at').copy(deep=True)
    df.index = pd.to_datetime(df.captured_at)
    df['date'] = df.index.date.astype(str)
    df['hour'] = df.index.hour

    # Downstream code expects a k-factor column; default it to 0.0 when the
    # query did not return one.
    if 'estimated_k_factor' not in df.columns.tolist():
        df['estimated_k_factor'] = 0.0

    queryCache[query] = df.copy()

In [None]:
# One (date, weight, k-factor, depth) tuple per row of df, in row order.
biomass_computations = list(zip(
    df['date'],
    df['estimated_weight_g'],
    df['estimated_k_factor'],
    df['depth'],
))

# Passed through to the estimator as-is; the kernel algorithm itself is switched off here.
kernel_details = (50, 100, 0.1)
pme = PopulationMetricsEstimator(
    biomass_computations,
    use_kernel_weight_algo=False,
    kernel_details=kernel_details,
)



In [None]:
# Build `a`, a 2-D count matrix indexed by (whole hours since the first
# capture, weight bucket of width 1000 in estimated_weight_g units), and
# collect one distribution-consistency value per distinct date.
#df.head()

# startDate, startHour = datetime.strptime(df.ix[0]['date'], '%Y-%m-%d'), df.ix[0]['hour']
# endDate, endHour = datetime.strptime(df.ix[-1]['date'], '%Y-%m-%d'), df.ix[-1]['hour']

# df is indexed by capture timestamp, so these are the first/last index values.
startDate = df.index[0]
endDate = df.index[-1]

# Highest bucket index needed along the weight axis.
maxWeight = max(df['estimated_weight_g'])
maxWeightInt = int(maxWeight / 1000)

# Whole hours between the first and last index value.
diff = endDate - startDate
days, seconds = diff.days, diff.seconds
hours = int((days * 24 + seconds // 3600) / 1)

a = np.zeros((hours + 1, maxWeightInt + 1))
print(a.shape)

# NOTE(review): `count` is never updated or read in this cell.
count = 0

dateStrings = []
dates = []
distConfs = []

for idx, row in df.iterrows():
    # First row seen for a date: record the date and its consistency score.
    if row.date not in dateStrings:
        dateStrings.append(row.date)
        dates.append(datetime.strptime(row.date, '%Y-%m-%d'))
        
        distConf = pme.generate_distribution_consistency(row.date)
        
        # Scores below .1 are replaced by the previous date's entry; None (and
        # 0) fall through to the else branch and are stored unchanged.
        if distConf and distConf < .1:
            distConfs.append(distConfs[-1])
        else:
            distConfs.append(distConf)

    # Row position: whole hours elapsed since the first capture. `hours` is
    # overwritten on every iteration, so after the loop it holds the offset of
    # the last row processed.
    diff = idx - startDate
    days, seconds = diff.days, diff.seconds
    hours = int((days * 24 + seconds // 3600) / 1)
    
    # Column position: weight bucket of width 1000.
    weight = row['estimated_weight_g']
    weightInt = int(weight / 1000)
    
    a[hours, weightInt] = a[hours, weightInt] + 1

In [None]:
# import pysal as ps

# coefs = []

# window = 24
# skip = 24

# for i in np.arange(window, hours, skip):
#     b = a[(i - window):i,2:6]
#     w = ps.lib.weights.lat2W(b.shape[0], b.shape[1])
#     mi = ps.explore.esda.Moran(b, w)
#     coefs.append(mi.I)
    
# fig, axes = plt.subplots(2, 1, figsize=(15, 20))
# axes[0].plot(dates[1:], coefs)

In [None]:
# NOTE(review): `ps` is not imported anywhere in the visible notebook (the only
# `import pysal as ps` is commented out), so this cell raises NameError on a
# fresh kernel — TODO restore the import.
# Builds lattice weights with the same row/column counts as `a`, then runs
# Moran_Local on `a` with those weights.
w = ps.lib.weights.lat2W(a.shape[0], a.shape[1])
mi = ps.explore.esda.Moran_Local(a, w)

In [None]:
# Collapse `a` to one value per row: the mean across its second axis.
# Alternative that was tried: f = np.log(1 + np.mean(a, 1))
f = a.mean(axis=1)

# Single-column lattice with one cell per entry of f, then Moran_Local on f.
# NOTE(review): `ps` is not imported in the visible notebook — TODO confirm.
w = ps.lib.weights.lat2W(len(f), 1)
mi = ps.explore.esda.Moran_Local(f, w)

plt.plot(mi.Is)

In [None]:
# Line plot of f against its index.
plt.plot(f)

In [None]:
# Scatter of f (x axis) against mi.Is (y axis).
plt.scatter(f, mi.Is)

In [None]:
# Compare a per-window "sampling representativity" coefficient with the
# per-date distribution consistency, and plot both.
window = 24
skip = 24

coefs = []

for i in np.arange(window, hours, skip):
    data = f[(i - window):i]

    if np.sum(data) < 1:
        # Too little data in this window: carry the previous coefficient forward.
        # NOTE(review): raises IndexError if the very first window is this sparse.
        coefs.append(coefs[-1])
        continue

    # 10th-to-90th percentile spread relative to the mean. Kept in its own name:
    # assigning it to `a` replaced the 2-D count matrix with a scalar and broke
    # every later use of `a.shape`.
    spread = (np.percentile(data, 90) - np.percentile(data, 10)) / (np.mean(data))

    coefs.append(1 - spread / 5)
    #coefs.append(np.percentile(data, 95) / np.mean(data))

# Regress the coefficients on distribution consistency to report R^2 in the title.
# NOTE(review): `sm` is not imported in the visible notebook (presumably
# statsmodels.api) — TODO confirm and add the import.
# NOTE(review): coefs has one entry per 24-hour window while dates/distConfs
# have one per date present in df; the lengths only match when every day in
# the range has data — TODO confirm.
Y = coefs
X = distConfs[1:]
X = sm.add_constant(X)
model = sm.OLS(Y,X)
results = model.fit()

fig, axes = plt.subplots(2, 1, figsize=(15, 20))
axes[0].bar(dates[1:], coefs)
axes[0].set_xlabel('Date')
axes[0].set_ylabel('Sampling Representativity')
axes[0].set_title('Pen %i Sampling Representativity' % (pen_id, ))

axes[1].bar(dates[1:], distConfs[1:])
axes[1].set_xlabel('Date')
axes[1].set_ylabel('Distribution Consistency')
axes[1].set_title('Pen %i Distribution Consistency (%0.2f)' % (pen_id, results.rsquared, ))
axes[1].set_ylim((.8, 1))

In [None]:
# Reshape the Moran_Local statistics to the shape of `a`, then average along
# the second axis to get one value per row.
# NOTE(review): this only works when `mi` was computed from the 2-D `a` and `a`
# is still that matrix; `mi` is also assigned from the 1-D `f` elsewhere in the
# notebook, so the result depends on execution order — TODO verify.
c = mi.Is.reshape(a.shape)
d = np.mean(c, 1)

In [None]:
# Line plot of d against its index.
plt.plot(d)

In [None]:
# Standard deviation of d over consecutive 24-entry windows, plotted per date.
window = 24
skip = 24

coefs = []

for i in np.arange(window, hours, skip):
    coefs.append(np.std(d[(i - window):i]))

# Two axes are created but only the first is drawn on.
fig, axes = plt.subplots(2, 1, figsize=(15, 20))
axes[0].plot(dates[1:], coefs)
axes[0].set_xlabel('Date')
axes[0].set_ylabel('Sampling Representativity')
# Format the pen number from pen_id, as the other representativity plot does,
# so the title stays correct when a different pen is analysed.
axes[0].set_title('Pen %i Sampling Representativity' % (pen_id, ))


In [None]:
def get_df(pen_id=None, start_date=None, end_date=None, akpd_filter=None):
    """Load annotated biomass computations for one pen and capture window.

    Every argument is optional. An omitted argument falls back to the
    notebook-level variable of the same name, so a bare ``get_df()`` keeps
    reading the globals as before. ``akpd_filter`` additionally falls back to
    0.99 when no such global exists; that value comes from a commented-out
    setting in the notebook — TODO confirm it is the intended default.

    Results are cached in ``queryCache`` under the formatted SQL text.

    Args:
        pen_id: integer pen identifier (also used for the group_id filter).
        start_date: inclusive lower bound on captured_at, e.g. '2020-04-25'.
        end_date: inclusive upper bound on captured_at.
        akpd_filter: exclusive lower bound on akpd_score.

    Returns:
        A DataFrame sorted by and indexed on capture time, with added
        ``depth``, ``date`` (string), ``hour`` and, if it was missing,
        ``estimated_k_factor`` (0.0) columns.
    """
    notebook_globals = globals()
    if pen_id is None:
        pen_id = notebook_globals['pen_id']
    if start_date is None:
        start_date = notebook_globals['start_date']
    if end_date is None:
        end_date = notebook_globals['end_date']
    if akpd_filter is None:
        # Previously an undefined global, which raised NameError on a fresh kernel.
        akpd_filter = notebook_globals.get('akpd_filter', 0.99)

    query = """
        SELECT * FROM (
          (SELECT * FROM prod.crop_annotation cas
          INNER JOIN prod.annotation_state pas on pas.id=cas.annotation_state_id
          WHERE cas.service_id = (SELECT ID FROM prod.service where name='BATI')
          AND cas.annotation_state_id = 3
          AND cas.pen_id=%i) a
        RIGHT JOIN 
          (SELECT left_crop_url, estimated_weight_g, akpd_score FROM prod.biomass_computations
          WHERE prod.biomass_computations.captured_at >= '%s'
          AND prod.biomass_computations.captured_at <= '%s'
          AND prod.biomass_computations.akpd_score > %0.4f) bc 
        ON 
          (a.left_crop_url=bc.left_crop_url)
        ) x
        WHERE x.captured_at >= '%s'
        AND x.captured_at <= '%s'
        AND x.pen_id = %i
        AND x.group_id = '%i';
    """ % (pen_id, start_date, end_date, akpd_filter, start_date, end_date, pen_id, pen_id)

    if query in queryCache:
        return queryCache[query].copy()

    df = rds_access_utils.extract_from_database(query)

    # Per-row depth: median of component [1] over the world keypoints.
    # Presumably index 1 is the camera-to-fish axis — TODO confirm.
    depths = []
    for idx, row in df.iterrows():
        ann, cm = row.annotation, row.camera_metadata
        wkps = pixel2world(ann['leftCrop'], ann['rightCrop'], cm)
        depths.append(np.median([wkp[1] for wkp in wkps.values()]))
    df['depth'] = depths

    df = df.sort_values('captured_at').copy(deep=True)
    df.index = pd.to_datetime(df.captured_at)
    df['date'] = df.index.date.astype(str)
    df['hour'] = df.index.hour

    if 'estimated_k_factor' not in df.columns.tolist():
        df['estimated_k_factor'] = 0.0

    queryCache[query] = df.copy()

    return df

In [None]:
# Reference pen / date-range configurations, one dict per case.
# Keys are string field names: the previous bare identifiers were evaluated as
# variables, producing {60: 60, ...} (or NameError when they were undefined).
canonical = [
    {
        'pen_id': 60,
        'start_date': '2020-04-25',
        'end_date': '2020-05-25'
    }
]