In [21]:
from utils.dataframes import *
import pandas as pd
import numpy as np

# Step 1: Expand every bill to one row per entry of its `billing_interval`
# list, then split the result into one frame per `source` value.
# `date` is the parsed interval entry used later as a join key; `kwh_left` and
# `kwh_used` start as 0.0 placeholders and are overwritten in later steps.
explode = dict(
    (source, bills.rename(columns={'id': 'dim_bills_id'}))
    for source, bills in (
        dim_bills
        .explode('billing_interval')
        .assign(
            date=lambda exploded: pd.to_datetime(exploded['billing_interval']),
            kwh_left=0.0,
            kwh_used=0.0,
        )
        .groupby('source')
    )
)


# Step 2: Attach the datetime and meter dimension tables to the meter readings.
# Both joins are left joins, so every reading is kept even without a dimension
# match. After sorting, the pre-sort row index is exposed as `flat_id`
# (assumes the merged frame's index is unnamed, so `reset_index` emits a
# column called 'index' — TODO confirm).
flat_df = (
    meter_usage
    .assign(
        timestamp=lambda usage: pd.to_datetime(
            usage['interval_end_datetime'],
            format='%m/%d/%Y %I:%M:%S %p',
        )
    )
    .merge(dim_datetimes, how='left', on='timestamp', suffixes=('', '_dat'))
    .merge(dim_meters, how='left', on='meter_id', suffixes=('', '_met'))
    .sort_values(by=['account_number', 'id'])
    .reset_index()
    .rename(columns={'index': 'flat_id'})
)

# Step 3: Pair each reading with the bill rows of the same account and date,
# once for the 'CMP' source and once for the 'Ampion' source. Inner joins drop
# readings that have no matching bill row for that source.
matched_c = pd.merge(flat_df, explode['CMP'], how='inner', on=['account_number', 'date'])
matched_a = pd.merge(flat_df, explode['Ampion'], how='inner', on=['account_number', 'date'])

# Step 4: Allocate metered kWh against each Ampion bill
# Look up the CMP bill row (if any) that matched the same reading. Where one
# exists, its bill id, service charge and taxes take precedence over the
# Ampion values; otherwise the Ampion values are kept.
kwh_used_a = matched_a.merge(matched_c[['flat_id', 'dim_bills_id', 'service_charge', 'taxes']], on = 'flat_id', how = 'left', suffixes = ('', '_cmp'))
kwh_used_a['ratio_bill_id']  = kwh_used_a['dim_bills_id_cmp'].combine_first(kwh_used_a['dim_bills_id'])
kwh_used_a['service_charge'] = kwh_used_a['service_charge_cmp'].combine_first(kwh_used_a['service_charge'])
kwh_used_a['taxes']          = kwh_used_a['taxes_cmp'].combine_first(kwh_used_a['taxes'])
kwh_used_a = kwh_used_a.drop(kwh_used_a.filter(regex = '_cmp$').columns, axis = 1) # Drop temporary `_cmp` columns for subsequent `pd.concat()`

# Walk each bill's readings in row order and consume its `kwh_delivered`.
group = kwh_used_a.groupby(['source', 'invoice_number', 'account_number', 'kwh_delivered'], observed = True)
_delivered = group['kwh_delivered'].transform('first')
_consumed  = group['kwh'].cumsum()                       # running total *including* the current reading
kwh_used_a['kwh_left']   = (_delivered - _consumed).clip(lower = 0)
# A reading may use whatever was still available *before* it was metered.
# Capping by `kwh_left` (the balance after the reading) under-allocates: the
# reading that crosses the delivered total would get 0 instead of the partial
# remainder, and a large early reading would be capped too low.
_available = (_delivered - (_consumed - kwh_used_a['kwh'])).clip(lower = 0)
kwh_used_a['kwh_used']   = np.minimum(kwh_used_a['kwh'], _available)
kwh_used_a['kwh_unused'] = kwh_used_a['kwh'] - kwh_used_a['kwh_used']

# Step 5: Allocate kWh against each CMP bill
# For readings that also appear in the Ampion allocation, only the kWh left
# unused there (`kwh_unused`) is carried over; readings without an Ampion row
# keep their full metered `kwh`.
kwh_used_c = matched_c.merge(kwh_used_a[['flat_id', 'kwh_unused']], on = 'flat_id', how = 'left')
kwh_used_c['ratio_bill_id'] = kwh_used_c['dim_bills_id']
kwh_used_c['kwh']           = kwh_used_c['kwh_unused'].combine_first(kwh_used_c['kwh'])

# Walk each bill's readings in row order and consume its `kwh_delivered`.
group = kwh_used_c.groupby(['source', 'invoice_number', 'account_number', 'kwh_delivered'], observed = True)
_delivered = group['kwh_delivered'].transform('first')
_consumed  = group['kwh'].cumsum()                       # running total *including* the current reading
kwh_used_c['kwh_left']   = (_delivered - _consumed).clip(lower = 0)
# A reading may use whatever was still available *before* it was metered.
# Capping by `kwh_left` (the balance after the reading) under-allocates: the
# reading that crosses the delivered total would get 0 instead of the partial
# remainder, and a large early reading would be capped too low.
_available = (_delivered - (_consumed - kwh_used_c['kwh'])).clip(lower = 0)
kwh_used_c['kwh_used']   = np.minimum(kwh_used_c['kwh'], _available)
kwh_used_c['kwh_unused'] = kwh_used_c['kwh'] - kwh_used_c['kwh_used']

# Debug aid: show both column sets, one per line, to confirm the two frames
# line up before they are concatenated.
print('\n'.join(str(columns) for columns in (kwh_used_a.columns, kwh_used_c.columns)))
# # Step 6: Combine CMP and Ampion data
# int_df = pd.concat([kwh_used_c, kwh_used_a])[lambda x: x['kwh_used'] > 0]

# # Step 7: Calculate the kWh usage ratio
# int_df['kwh_ratio'] = int_df['kwh_used'] / int_df.groupby(['ratio_bill_id'])['kwh_used'].transform('sum')

# # Step 8: Merge with flat data and sort
# df = flat_df.merge(int_df, on  = 'flat_id', how = 'left', suffixes = ('', '_int')) \
#             .sort_values(by = ['account_number', 'id'])

# # Step 9: Compute cost metrics and keys
# df['dim_datetimes_id'] = df['id']
# df['dim_meters_id']    = df['id_met']
# df['kwh']              = df['kwh_used'].combine_first(df['kwh'])
# df['delivery_cost']    = df['kwh_used']       * df['delivery_rate']
# df['service_cost']     = df['service_charge'] * df['kwh_ratio']
# df['supply_cost']      = df['kwh_used']       * df['supply_rate']
# df['tax_cost']         = df['taxes']          * df['kwh_ratio']
# df['total_cost']       = df.filter(regex = '_cost$').sum(axis = 1)

Index(['flat_id', 'service_point_id', 'meter_id', 'interval_end_datetime',
       'meter_channel', 'kwh', 'account_number', 'timestamp', 'id',
       'increment', 'hour', 'date', 'week', 'week_start', 'month',
       'month_name', 'month_start', 'quarter', 'year', 'period', 'id_met',
       'service_point_id_met', 'account_number_met', 'street', 'label',
       'operational_area', 'dim_bills_id', 'invoice_number', 'supplier',
       'kwh_delivered', 'service_charge', 'taxes', 'delivery_rate',
       'supply_rate', 'source', 'billing_interval', 'kwh_left', 'kwh_used',
       'ratio_bill_id', 'kwh_unused'],
      dtype='object')
Index(['flat_id', 'service_point_id', 'meter_id', 'interval_end_datetime',
       'meter_channel', 'kwh', 'account_number', 'timestamp', 'id',
       'increment', 'hour', 'date', 'week', 'week_start', 'month',
       'month_name', 'month_start', 'quarter', 'year', 'period', 'id_met',
       'service_point_id_met', 'account_number_met', 'street', 'label',
       '