In [None]:
import json
import datetime as dt
import os
import numpy as np
import pandas as pd
from research.utils.data_access_utils import RDSAccessUtils
from research.utils.datetime_utils import add_days
from research.weight_estimation.population_metrics import PopulationMetricsEstimator

In [None]:
class DataGenerator(object):
    """Loads biomass computations for one pen and prepares them for analysis.

    Intended call order: ``query_from_db`` -> ``preprocess_df`` -> ``get_df``.
    """

    def __init__(self):
        """Create the database accessor from a JSON credentials file.

        The file path is read from the ``DATA_WAREHOUSE_SQL_CREDENTIALS``
        environment variable.

        Raises:
            KeyError: if the environment variable is not set.
        """
        # Use a context manager so the credentials file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
            credentials = json.load(credentials_file)
        self.rds_access_utils = RDSAccessUtils(credentials)
        # Filled in by query_from_db(); stays None until then.
        self.df = None

    def query_from_db(self, pen_id, start_date=None, end_date=None, min_akpd_score=0.99):
        """Query ``prod.biomass_computations`` for a pen and store the rows in ``self.df``.

        Args:
            pen_id: value matched against ``bc.pen_id``.
            start_date: lower bound for ``captured_at`` as a ``'YYYY-MM-DD'``
                string. Defaults to ``add_days(end_date, -180)``.
            end_date: upper bound for ``captured_at`` as a ``'YYYY-MM-DD'``
                string. Defaults to today's date.
            min_akpd_score: rows with a lower ``akpd_score`` are excluded.

        After the query, duplicated column names are dropped (the first
        occurrence is kept) and ``estimated_weight_g`` is renamed to
        ``estimated_weight_g_0``.
        """
        if not end_date:
            end_date = dt.datetime.now().strftime('%Y-%m-%d')
        if not start_date:
            start_date = add_days(end_date, -30 * 6)
        # NOTE: the arguments are interpolated straight into the SQL text, so
        # only pass trusted values.
        # NOTE(review): with a date-only end_date, BETWEEN presumably stops at
        # midnight at the start of that day -- confirm whether captures made
        # during end_date are meant to be included.
        query = """
            SELECT * FROM
            prod.biomass_computations bc
            WHERE bc.pen_id={}
            AND bc.akpd_score >= {}
            AND bc.captured_at between '{}' and '{}'
            AND bc.estimated_weight_g > 0.0
        """.format(pen_id, min_akpd_score, start_date, end_date)

        print('Executing query...')
        print(query)
        self.df = self.rds_access_utils.extract_from_database(query)
        print('Query complete!')
        # Keep only the first occurrence of each column name.
        unique_columns = self.df.loc[:, ~self.df.columns.duplicated()]
        # Assign the renamed frame instead of renaming a slice in place.
        self.df = unique_columns.rename(columns={'estimated_weight_g': 'estimated_weight_g_0'})

    def preprocess_df(self):
        """Sort ``self.df`` by capture time and add the derived columns.

        The frame is re-indexed by ``captured_at`` (converted with
        ``pd.to_datetime``) and gains three columns:

        * ``date``: the index date as a string,
        * ``estimated_k_factor``: constant ``0.0``,
        * ``hour``: hour of day taken from the index.

        Requires ``query_from_db`` to have populated ``self.df`` first.
        """
        # reset_index returns a new frame, so the previously stored frame is
        # not mutated and no extra deep copy is needed.
        frame = self.df.reset_index(drop=True).sort_values('captured_at')
        frame.index = pd.to_datetime(frame['captured_at'])
        frame['date'] = frame.index.date.astype(str)
        frame['estimated_k_factor'] = 0.0
        frame['hour'] = frame.index.hour
        self.df = frame

    def get_df(self):
        """Return the current data frame (``None`` until ``query_from_db`` has run)."""
        return self.df

    
def generate_pme(df, start_date, end_date, start_hour, end_hour):
    """Build a PopulationMetricsEstimator from the rows inside a date/hour window.

    Args:
        df: frame with ``date``, ``hour``, ``estimated_weight_g_0`` and
            ``estimated_k_factor`` columns.
        start_date: first date to keep (inclusive), compared against ``df.date``.
        end_date: last date to keep (inclusive).
        start_hour: first hour of day to keep (inclusive).
        end_hour: last hour of day to keep (inclusive).

    Returns:
        A ``PopulationMetricsEstimator`` built from
        ``(date, estimated_weight_g_0, estimated_k_factor)`` tuples, or
        ``None`` when no row falls inside the window.
    """
    in_date_window = (df.date >= start_date) & (df.date <= end_date)

    at_or_after_start = df.hour >= start_hour
    at_or_before_end = df.hour <= end_hour
    if start_hour < end_hour:
        in_hour_window = at_or_after_start & at_or_before_end
    else:
        # start_hour is not before end_hour: keep rows on either side of the
        # bounds rather than between them.
        in_hour_window = at_or_after_start | at_or_before_end

    selected = df[in_date_window & in_hour_window]
    biomass_computations = list(zip(selected.date.values,
                                    selected['estimated_weight_g_0'].values,
                                    selected.estimated_k_factor.values))
    if not biomass_computations:
        return None
    return PopulationMetricsEstimator(biomass_computations)


In [None]:
# Pen to analyse and the capture-date window used for the query and filtering.
pen_id = 56
start_date, end_date = '2020-04-20', '2020-05-10'
# Hour-of-day bounds for the estimator filter; both ends are inclusive.
start_hour, end_hour = 0, 24

In [None]:
# Fetch the pen's biomass computations for the configured window, then add the
# derived date/hour columns before exposing the frame to later cells.
dg = DataGenerator()
dg.query_from_db(pen_id=pen_id, start_date=start_date, end_date=end_date)
dg.preprocess_df()
df = dg.get_df()

In [None]:
# Build the estimator over the configured date and hour window.
pme = generate_pme(
    df=df,
    start_date=start_date,
    end_date=end_date,
    start_hour=start_hour,
    end_hour=end_hour,
)


In [None]:
# Compute the smart metrics for one date inside the configured window.
# NOTE: generate_pme returns None when no rows match the filters, in which
# case this call fails with AttributeError.
metrics = pme.generate_smart_metrics_on_date('2020-04-22')

In [None]:
# Show the computed metrics as the cell output.
metrics