In [24]:
import json
import datetime as dt
import os
import numpy as np
import pandas as pd
from research.utils.data_access_utils import RDSAccessUtils
from research.utils.datetime_utils import add_days
from research.weight_estimation.population_metrics import PopulationMetricsEstimator

In [31]:
class DataGenerator(object):
    """Fetch biomass computations for a pen and prepare them for analysis.

    Typical use is ``query_from_db`` -> ``preprocess_df`` -> ``get_df``.
    ``self.df`` is ``None`` until ``query_from_db`` has been called.
    """

    def __init__(self):
        """Create the database accessor from the credentials JSON file.

        The path to the credentials file is read from the
        ``DATA_WAREHOUSE_SQL_CREDENTIALS`` environment variable.

        Raises:
            KeyError: if the environment variable is not set.
        """
        # Open the file in a context manager so the handle is closed as soon
        # as the JSON is parsed instead of being left for the garbage collector.
        with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
            credentials = json.load(credentials_file)
        self.rds_access_utils = RDSAccessUtils(credentials)
        self.df = None

    def query_from_db(self, pen_id, start_date=None, end_date=None, min_akpd_score=0.99):
        """Query ``prod.biomass_computations`` and store the result in ``self.df``.

        Args:
            pen_id: pen identifier, interpolated into the WHERE clause.
            start_date: lower bound for ``captured_at`` as a 'YYYY-MM-DD'
                string. When falsy it is derived from ``end_date`` via
                ``add_days(end_date, -30 * 6)``.
            end_date: upper bound for ``captured_at`` as a 'YYYY-MM-DD'
                string. When falsy, today's date is used.
            min_akpd_score: minimum ``akpd_score`` a row must have.

        Only rows with ``estimated_weight_g > 0.0`` are selected.
        """
        if not end_date:
            end_date = dt.datetime.strftime(dt.datetime.now(), '%Y-%m-%d')
        if not start_date:
            # presumably yields the date 180 days before end_date -- TODO confirm
            # against research.utils.datetime_utils.add_days
            start_date = add_days(end_date, -30 * 6)
        # NOTE(review): the values are interpolated straight into the SQL text.
        # That is acceptable for trusted notebook parameters; switch to bound
        # parameters if this is ever fed external input.
        # NOTE(review): if captured_at is a timestamp, a bare-date upper bound
        # presumably compares as midnight and would drop rows captured later
        # on end_date -- confirm.
        query = """
            SELECT * FROM
            prod.biomass_computations bc
            WHERE bc.pen_id={}
            AND bc.akpd_score >= {}
            AND bc.captured_at between '{}' and '{}'
            AND bc.estimated_weight_g > 0.0
        """.format(pen_id, min_akpd_score, start_date, end_date)

        print('Executing query...')
        print(query)
        self.df = self.rds_access_utils.extract_from_database(query)
        print('Query complete!')
        # Keep only the first occurrence of any repeated column label.
        self.df = self.df.loc[:, ~self.df.columns.duplicated()]
        self.df.rename(columns={'estimated_weight_g': 'estimated_weight_g_0'}, inplace=True)


    def preprocess_df(self):
        """Sort ``self.df`` by capture time and add the derived columns.

        After this call the frame is indexed by ``captured_at`` parsed as
        datetimes and has three extra columns: ``date`` ('YYYY-MM-DD'
        string), ``hour`` (hour of capture) and ``estimated_k_factor``
        (constant 0.0). ``query_from_db`` must have been called first.
        """
        # Start from a plain positional index. This also discards an index
        # named 'captured_at' left by an earlier call, which would otherwise
        # clash with the column of the same name when sorting.
        self.df.index = list(range(self.df.shape[0]))
        self.df = self.df.sort_values('captured_at').copy(deep=True)
        self.df.index = pd.to_datetime(self.df.captured_at)
        dates = self.df.index.date.astype(str)
        self.df['date'] = dates
        # Placeholder value: this class never computes a k-factor.
        self.df['estimated_k_factor'] = 0.0
        self.df['hour'] = self.df.index.hour


    # generate default data-frame to use on start-up
    def get_df(self):
        """Return the current data frame (``None`` before any query)."""
        return self.df

    
def generate_pme(df, start_date, end_date, start_hour, end_hour):
    """Build a PopulationMetricsEstimator from rows inside a date/hour window.

    Args:
        df: frame with ``date``, ``hour``, ``estimated_weight_g_0`` and
            ``estimated_k_factor`` columns.
        start_date, end_date: inclusive bounds compared against ``df.date``.
        start_hour, end_hour: inclusive bounds compared against ``df.hour``.
            When ``start_hour`` is not below ``end_hour`` the window wraps
            around midnight (hours at/after start OR at/before end).

    Returns:
        A ``PopulationMetricsEstimator`` over (date, weight, k-factor)
        tuples, or ``None`` when no row falls inside the window.
    """
    in_date_range = (df.date >= start_date) & (df.date <= end_date)

    at_or_after_start = df.hour >= start_hour
    at_or_before_end = df.hour <= end_hour
    if start_hour < end_hour:
        in_hour_range = at_or_after_start & at_or_before_end
    else:
        # Window wraps past midnight, e.g. 22 -> 2.
        in_hour_range = at_or_after_start | at_or_before_end

    selected = df[in_date_range & in_hour_range]
    records = list(zip(selected['date'].values,
                       selected['estimated_weight_g_0'].values,
                       selected['estimated_k_factor'].values))
    if not records:
        return None
    return PopulationMetricsEstimator(records)


In [32]:
# Analysis parameters: which pen to load and the capture window to keep.
pen_id = 56
# Date bounds as 'YYYY-MM-DD' strings.
start_date, end_date = '2020-04-20', '2020-05-10'
# Hour-of-day bounds; 0..24 keeps every hour.
start_hour, end_hour = 0, 24

In [33]:
# Load the pen's biomass computations for the configured window, then add the
# date/hour/k-factor columns. The order matters: preprocess_df and get_df
# operate on the frame that query_from_db stores on the generator.
dg = DataGenerator()
dg.query_from_db(pen_id, start_date=start_date, end_date=end_date)
dg.preprocess_df()
df = dg.get_df()

Executing query...

            SELECT * FROM
            prod.biomass_computations bc
            WHERE bc.pen_id=56
            AND bc.akpd_score >= 0.99
            AND bc.captured_at between '2020-04-20' and '2020-05-10'
            AND bc.estimated_weight_g > 0.0
        
Query complete!


In [34]:
# Build the estimator from the rows inside the date/hour window.
# generate_pme returns None when no row matches, so the cells that follow
# assume the query returned data for this window.
pme = generate_pme(df, start_date, end_date, start_hour, end_hour)


In [35]:
# Compute the metrics for one day inside the queried window.
# NOTE(review): the date is hard-coded; keep it within start_date..end_date.
metrics = pme.generate_smart_metrics_on_date('2020-04-22')

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [36]:
# Show the resulting dict (raw and smart averages, sample size, distribution).
metrics

{'raw_average_weight': 2290.5669089855064,
 'raw_average_kf': 0.0,
 'raw_sample_size': 2983,
 'smart_average_weight': 2275.9499052756346,
 'smart_average_kf': 0.0,
 'smart_distribution': {'0.0': {'count': 0, 'avgKFactor': None},
  '0.1': {'count': 0, 'avgKFactor': None},
  '0.2': {'count': 0, 'avgKFactor': None},
  '0.3': {'count': 0, 'avgKFactor': None},
  '0.4': {'count': 0, 'avgKFactor': None},
  '0.5': {'count': 0, 'avgKFactor': None},
  '0.6': {'count': 0, 'avgKFactor': None},
  '0.7': {'count': 1, 'avgKFactor': 0.0},
  '0.8': {'count': 0, 'avgKFactor': None},
  '0.9': {'count': 5, 'avgKFactor': 0.0},
  '1.0': {'count': 5, 'avgKFactor': 0.0},
  '1.1': {'count': 18, 'avgKFactor': 0.0},
  '1.2': {'count': 45, 'avgKFactor': 0.0},
  '1.3': {'count': 88, 'avgKFactor': 0.0},
  '1.4': {'count': 166, 'avgKFactor': 0.0},
  '1.5': {'count': 248, 'avgKFactor': 0.0},
  '1.6': {'count': 315, 'avgKFactor': 0.0},
  '1.7': {'count': 410, 'avgKFactor': 0.0},
  '1.8': {'count': 499, 'avgKFactor': 0