In [1]:
from utils.dataframes import *
import pandas as pd
import numpy as np

# Challenge: It seems like CMP still charges for delivery and taxes in cases where the electricity is fully supplied by Ampion. Therefore, only `kwh_delivered` from CMP is immediately useful for the waterfall, and a different strategy for `service_charge` and `taxes` needs to be employed based on the timeframes of the bills (e.g. across this bill's dates, you paid this much)

# Step 1: Expand each bill to one row per entry of `billing_interval`, parse that entry into `date`,
#         and keep one frame per bill source (bill `id` renamed so it cannot collide after merging)
explode = {
    bill_source: source_bills.rename(columns = {'id': 'dim_bills_id'})
    for bill_source, source_bills in (
        dim_bills
        .explode('billing_interval')
        .assign(date = lambda bills: pd.to_datetime(bills['billing_interval']))
        .groupby('source')
    )
}

# Step 2: Merge with meter usage and dimension tables
flat_df = (
    meter_usage
    .assign(timestamp = lambda usage: pd.to_datetime(usage['interval_end_datetime'], format = '%m/%d/%Y %I:%M:%S %p'))
    .merge(dim_datetimes, on = 'timestamp', how = 'left', suffixes = ('', '_dat'))
    .merge(dim_meters,    on = 'meter_id',  how = 'left', suffixes = ('', '_met'))
    .sort_values(by = ['account_number', 'id'])
    .reset_index()                            # the pre-reset index becomes a column named 'index' ...
    .rename(columns = {'index': 'flat_id'})   # ... which is kept as the row key `flat_id`
)

# Step 3: Keep only the interval rows that fall on a billed date for the same account, per bill source
matched_c = pd.merge(flat_df, explode['CMP'],    how = 'inner', on = ['account_number', 'date'])
matched_a = pd.merge(flat_df, explode['Ampion'], how = 'inner', on = ['account_number', 'date'])

In [23]:
def process_matched_df(df, contains_unused = None):
    """
    Allocate each bill's `kwh_delivered` across its interval rows, in row order (the "waterfall").

    Rows are grouped by ('source', 'invoice_number', 'account_number', 'kwh_delivered'). Within a
    group, each row consumes up to its own `kwh` from whatever is still available of the group's
    `kwh_delivered`; the part that cannot be covered is reported as `kwh_unused`.

    Parameters
    ----------
    df : pandas.DataFrame
        Interval rows matched to bills. Columns read here: 'source', 'invoice_number',
        'account_number', 'kwh_delivered', 'kwh' and 'dim_bills_id'; additionally 'flat_id',
        'service_charge' and 'taxes' when `contains_unused` is given. The caller's frame is
        not modified.
    contains_unused : pandas.DataFrame, optional
        Result of an earlier call to this function (columns 'flat_id', 'kwh_unused',
        'ratio_bill_id', 'service_charge', 'taxes' are used). Where a row of `df` has a match on
        'flat_id', its `kwh` is replaced by the earlier `kwh_unused`, and `ratio_bill_id`,
        `service_charge` and `taxes` are taken from the earlier result; unmatched rows keep their
        own values (with `ratio_bill_id` falling back to `dim_bills_id`).

    Returns
    -------
    pandas.DataFrame
        A new frame with the added columns:
        `kwh_left`      - delivered kWh still available after the row (never below 0),
        `kwh_used`      - part of the row's `kwh` covered by the bill,
        `kwh_unused`    - part of the row's `kwh` not covered (`kwh - kwh_used`),
        `ratio_bill_id` - bill id selected as described above,
        plus `service_charge_cmp` / `taxes_cmp` when `contains_unused` is given.

    Notes
    -----
    NOTE(review): pandas `groupby` drops NaN keys by default, so rows with a missing group key
    presumably end up with NaN results - confirm whether such rows can occur.
    """
    # Work on a copy: the original added columns to the caller's frame, leaking state between cells.
    df = df.copy()

    # Placeholders only fix the position of these columns ahead of the merged-in ones;
    # the real values are computed further down.
    df['kwh_left'] = 0.0
    df['kwh_used'] = 0.0

    # If processing `matched_a`, merge 'kwh_unused'
    if contains_unused is not None:
        ratio_fields = ['flat_id', 'kwh_unused', 'ratio_bill_id', 'service_charge', 'taxes']
        df = df.merge(contains_unused[ratio_fields], on = 'flat_id', how = 'left', suffixes = ('', '_cmp'))

        # Prefer the values carried over from the earlier pass; fall back to this frame's own values
        df['kwh']            = df['kwh_unused'].combine_first(df['kwh'])
        df['ratio_bill_id']  = df['ratio_bill_id'].combine_first(df['dim_bills_id'])
        df['service_charge'] = df['service_charge_cmp'].combine_first(df['service_charge'])
        df['taxes']          = df['taxes_cmp'].combine_first(df['taxes'])

    else:
        df['ratio_bill_id'] = df['dim_bills_id']

    # Calculate 'kwh_left', 'kwh_used', and 'kwh_unused'
    group = df.groupby(['source', 'invoice_number', 'account_number', 'kwh_delivered'], observed = True)
    delivered      = group['kwh_delivered'].transform('first')
    consumed_after = group['kwh'].cumsum()              # demand up to and including this row
    consumed_prior = consumed_after - df['kwh']         # demand strictly before this row

    df['kwh_left'] = (delivered - consumed_after).clip(lower = 0)

    # Cap usage by what was available BEFORE the row. Capping by `kwh_left` (the balance AFTER
    # the row) under-allocates whenever that balance is smaller than the row's own kwh,
    # e.g. 10 delivered with rows of 6 and 6 would yield 4 and 0 instead of 6 and 4.
    available        = (delivered - consumed_prior).clip(lower = 0)
    df['kwh_used']   = np.minimum(df['kwh'], available)
    df['kwh_unused'] = df['kwh'] - df['kwh_used']

    return df

# Run the waterfall twice: CMP bills first, then Ampion bills fed with the CMP result
# (the second call picks up each row's `kwh_unused` from the first).
kwh_used_c = process_matched_df(matched_c)
kwh_used_a = process_matched_df(matched_a, contains_unused = kwh_used_c)

# Stack both passes; the original row labels of each frame are kept as-is.
int_df = pd.concat([kwh_used_c, kwh_used_a])

# Each row's share of the total `kwh_used` on its invoice
int_df['kwh_ratio'] = int_df['kwh_used'].div(
    int_df.groupby(['invoice_number'])['kwh_used'].transform('sum')
)

int_df


Unnamed: 0,flat_id,service_point_id,meter_id,interval_end_datetime,meter_channel,kwh,account_number,timestamp,id,increment,...,supply_rate,source,billing_interval,kwh_left,kwh_used,kwh_unused,ratio_bill_id,service_charge_cmp,taxes_cmp,kwh_ratio
0,103104,2300822246,L108605388,9/17/2021 12:00:00 AM,10,0.250,30010320353,2021-09-17 00:00:00,33021,0,...,,CMP,2021-09-17,1486.750,0.250,0.0,26,,,0.000170
1,103105,2300822246,L108605388,9/17/2021 12:15:00 AM,10,0.277,30010320353,2021-09-17 00:15:00,33022,15,...,,CMP,2021-09-17,1486.473,0.277,0.0,26,,,0.000188
2,103106,2300822246,L108605388,9/17/2021 12:30:00 AM,10,0.994,30010320353,2021-09-17 00:30:00,33023,30,...,,CMP,2021-09-17,1485.479,0.994,0.0,26,,,0.000675
3,103107,2300822246,L108605388,9/17/2021 12:45:00 AM,10,0.243,30010320353,2021-09-17 00:45:00,33024,45,...,,CMP,2021-09-17,1485.236,0.243,0.0,26,,,0.000165
4,103108,2300822246,L108605388,9/17/2021 1:00:00 AM,10,0.284,30010320353,2021-09-17 01:00:00,33025,0,...,,CMP,2021-09-17,1484.952,0.284,0.0,26,,,0.000193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158010,481697,2300588897,L108607371,8/10/2023 7:00:00 PM,10,5.209,35012790198,2023-08-10 19:00:00,99521,0,...,0.206902,Ampion,2023-08-10,1010.955,5.209,0.0,175,39.65,23.34,0.000525
158011,481698,2300588897,L108607371,8/10/2023 8:00:00 PM,10,3.515,35012790198,2023-08-10 20:00:00,99525,0,...,0.206902,Ampion,2023-08-10,1007.440,3.515,0.0,175,39.65,23.34,0.000354
158012,481699,2300588897,L108607371,8/10/2023 9:00:00 PM,10,3.568,35012790198,2023-08-10 21:00:00,99529,0,...,0.206902,Ampion,2023-08-10,1003.872,3.568,0.0,175,39.65,23.34,0.000360
158013,481700,2300588897,L108607371,8/10/2023 10:00:00 PM,10,3.357,35012790198,2023-08-10 22:00:00,99533,0,...,0.206902,Ampion,2023-08-10,1000.515,3.357,0.0,175,39.65,23.34,0.000338


In [3]:
# TODO: `kwh_ratio` still needs to be designed.
# Idea: when merging `contains_unused`, carry `service_charge_{suffix}`, `taxes_{suffix}`, and `invoice_number_{suffix}` over as separate columns, then use those columns to calculate `kwh_ratio`.

# # Step 6: Compute cost metrics
# df = pd.DataFrame(index = int_df.index)
# df['dim_datetimes_id']  = int_df['id']
# df['dim_meters_id']     = int_df['id_met']
# df['dim_bills_id']      = int_df['dim_bills_id']
# df['account_number']    = int_df['account_number']
# df['kwh']               = int_df['kwh']
# df['delivery_cost']     = int_df['kwh_used']       * int_df['delivery_rate']
# df['service_cost']      = int_df['service_charge'] * int_df['kwh_ratio']
# df['supply_cost']       = int_df['kwh_used']       * int_df['supply_rate']
# df['tax_cost']          = int_df['taxes']          * int_df['kwh_ratio']
# df['total_cost']        = df.filter(regex = '_cost$').sum(axis = 1)