In [1]:
from utils.dataframes import *
import pandas as pd
import numpy as np

# Challenge: It seems like CMP still charges for delivery and taxes in cases where the electricity is fully supplied by Ampion. Therefore, only `kwh_delivered` from CMP is immediately useful for the waterfall, and a different strategy for `service_charge` and `taxes` needs to be employed based on the timeframes of the bills (e.g. across this bill's dates, you paid this much)

explode = {s: df.rename(columns = {'id': 'dim_bills_id'}) 
              for s, df in dim_bills.explode('billing_interval')
                                    .assign(date = lambda x: pd.to_datetime(x['billing_interval']))
                                    .groupby('source')}

# Step 2: Merge with meter usage and dimension tables
flat_df = meter_usage.assign(timestamp = lambda df: pd.to_datetime(df['interval_end_datetime'], format = '%m/%d/%Y %I:%M:%S %p')) \
                     .merge(dim_datetimes,     on = 'timestamp', how = 'left', suffixes = ('', '_dat')) \
                     .merge(dim_meters,        on = 'meter_id',  how = 'left', suffixes = ('', '_met')) \
                     .reset_index().rename(columns = {'index' : 'flat_id'})

bill_fields = ['account_number', 'date']
matched_c = flat_df.merge(explode['CMP'],    on = bill_fields, how = 'inner')
matched_a = flat_df.merge(explode['Ampion'], on = bill_fields, how = 'inner')

# int_df = pd.concat([matched_c, matched_a]) \
#            .apply(lambda col: col.fillna(0) if col.dtype.kind in 'biufc' else col) \
#            .drop_duplicates()

# # Step 3: Calculate total kWh recorded
# int_df['total_recorded_kwh'] = int_df.groupby(['invoice_number', 'kwh_delivered'])['kwh'].transform('sum')
# int_df['kwh_ratio']          = int_df['kwh'] / int_df['total_recorded_kwh']

In [2]:
# # Sort the DataFrame
# sort_df = int_df.sort_values(by = ['source', 'invoice_number', 'timestamp'])
# sort_df.reset_index(drop = True, inplace = True)
                
# group_df = sort_df.groupby(['source', 'invoice_number', 'kwh_delivered'])

# # Initialize new columns
# sort_df['kwh_left'] = 0.0
# sort_df['kwh_used'] = 0.0

# # Calculate for CMP
# c_mask = sort_df['source'] == 'CMP'
# sort_df.loc[c_mask, 'kwh_left'] = (group_df['kwh_delivered'].transform('first') - 
#                                    group_df['kwh'].cumsum().iloc[::-1]).clip(lower = 0)
# sort_df.loc[c_mask, 'kwh_used'] = np.minimum(sort_df['kwh'], sort_df['kwh_left'])

# # Calculate kwh_remaining for CMP
# sort_df['kwh_remaining'] = sort_df['kwh'] - sort_df['kwh_used']

# # Update kwh_remaining in int_df using the map
# int_df['kwh_remaining'] = sort_df['kwh_remaining']

# # Calculate for Ampion
# a_mask = sort_df['source'] == 'Ampion'
# sort_df.loc[a_mask, 'kwh_left'] = (group_df['kwh_delivered'].transform('first') - 
#                                    group_df['kwh_remaining'].cumsum()).clip(lower = 0)
# sort_df.loc[a_mask, 'kwh_used'] = np.minimum(sort_df['kwh_remaining'], sort_df['kwh_left'])

# # Final cleanup
# int_df['kwh_used'] = sort_df['kwh_used']
# int_df['kwh_used'] = int_df['kwh_used'].fillna(0).astype(float)

# int_df

In [3]:
# Step 1: Waterfall for CMP (matched_c)

# Initialize columns
matched_c['kwh_left'] = 0.0
matched_c['kwh_used'] = 0.0

# Group by necessary fields
group_c = matched_c.groupby(['source', 'invoice_number', 'account_number', 'kwh_delivered'], observed = True)

# Calculate 'kwh_left' and 'kwh_used' for CMP
matched_c['kwh_left']   = (group_c['kwh_delivered'].transform('first') - 
                           group_c['kwh'].cumsum()).clip(lower=0)
matched_c['kwh_used']   = np.minimum(matched_c['kwh'], matched_c['kwh_left'])
matched_c['kwh_unused'] = matched_c['kwh'] - matched_c['kwh_used']

# Step 2: Waterfall for Ampion (matched_a) using 'kwh_remaining' from CMP

# Initialize columns
# Merging matched_c's 'kwh_unused' onto matched_a
matched_a = matched_a.merge(matched_c[['flat_id', 'kwh_unused']], on = 'flat_id', how = 'left')

matched_a['kwh_left'] = 0.0
matched_a['kwh_used'] = 0.0
matched_a['kwh']      = matched_a['kwh_unused'].combine_first(matched_a['kwh'])

# Group by necessary fields
group_a = matched_a.groupby(['source', 'invoice_number', 'account_number', 'kwh_delivered'], observed = True)

# Calculate 'kwh_left' and 'kwh_used' for Ampion
matched_a['kwh_left']   = (group_a['kwh_delivered'].transform('first') - 
                           group_a['kwh'].cumsum()).clip(lower = 0)
matched_a['kwh_used']   = np.minimum(matched_a['kwh'], matched_a['kwh_left'])
matched_a['kwh_unused'] = matched_a['kwh'] - matched_a['kwh_used']

# Combine the results
int_df = pd.concat([matched_c, matched_a])

int_df

# Cumulative sum is subtracting even though there isn't technically a `kwh` value to subtract


Unnamed: 0,flat_id,service_point_id,meter_id,interval_end_datetime,meter_channel,kwh,account_number,timestamp,id,increment,...,kwh_delivered,service_charge,taxes,delivery_rate,supply_rate,source,billing_interval,kwh_left,kwh_used,kwh_unused
0,0,2300822246,L108605388,10/1/2022 12:00:00 AM,10,0.594,30010320353,2022-10-01 00:00:00,69401,0,...,0,25.67,1.41,,,CMP,2022-10-01,0.000,0.000,0.594
1,1,2300822246,L108605388,10/1/2022 12:15:00 AM,10,0.101,30010320353,2022-10-01 00:15:00,69402,15,...,0,25.67,1.41,,,CMP,2022-10-01,0.000,0.000,0.101
2,2,2300822246,L108605388,10/1/2022 12:30:00 AM,10,0.104,30010320353,2022-10-01 00:30:00,69403,30,...,0,25.67,1.41,,,CMP,2022-10-01,0.000,0.000,0.104
3,3,2300822246,L108605388,10/1/2022 12:45:00 AM,10,0.106,30010320353,2022-10-01 00:45:00,69404,45,...,0,25.67,1.41,,,CMP,2022-10-01,0.000,0.000,0.106
4,4,2300822246,L108605388,10/1/2022 1:00:00 AM,10,0.099,30010320353,2022-10-01 01:00:00,69405,0,...,0,25.67,1.41,,,CMP,2022-10-01,0.000,0.000,0.099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158010,491681,2300588897,L108607371,9/30/2022 7:00:00 PM,10,4.385,35012790198,2022-09-30 19:00:00,69381,0,...,1250,0.00,0.00,0.0,0.166968,Ampion,2022-09-30,6.992,4.385,0.000
158011,491682,2300588897,L108607371,9/30/2022 8:00:00 PM,10,3.004,35012790198,2022-09-30 20:00:00,69385,0,...,1250,0.00,0.00,0.0,0.166968,Ampion,2022-09-30,3.988,3.004,0.000
158012,491683,2300588897,L108607371,9/30/2022 9:00:00 PM,10,2.799,35012790198,2022-09-30 21:00:00,69389,0,...,1250,0.00,0.00,0.0,0.166968,Ampion,2022-09-30,1.189,1.189,1.610
158013,491684,2300588897,L108607371,9/30/2022 10:00:00 PM,10,3.152,35012790198,2022-09-30 22:00:00,69393,0,...,1250,0.00,0.00,0.0,0.166968,Ampion,2022-09-30,0.000,0.000,3.152


In [4]:
sources = ['Ampion', 'CMP']

print(dim_bills[dim_bills['source'].isin(sources)]['kwh_delivered'].sum(),
int_df[int_df['source'].isin(sources)]['kwh_used'].sum())

275719 261629.958


In [5]:
group_and_join = ['invoice_number', 'source', 'account_number']

df_test = (dim_bills.groupby(group_and_join, observed=True)
                    .agg({'kwh_delivered': 'sum'})
                    .merge(int_df.groupby(group_and_join, observed=True)
                                 .agg({'kwh_used': 'sum'}), how = 'left', on=group_and_join))

# Calculate the absolute difference
df_test['difference'] = np.abs(df_test['kwh_delivered'] - df_test['kwh_used'])

# Sort by the 'difference' column in descending order to see the largest discrepancies first
df_test = df_test.sort_values(by='difference', ascending=False)

df_test.to_csv('differences.csv')
df_test


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,kwh_delivered,kwh_used,difference
invoice_number,source,account_number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022120000512891,Ampion,30010320353,3668,1735.504,1932.496
2023100000830629,Ampion,30010320353,3152,1720.440,1431.560
2022120000512891,Ampion,30010601281,2528,1438.100,1089.900
2023100000830629,Ampion,35012790198,2092,1095.184,996.816
2023080000758144,Ampion,30010601281,2352,1518.717,833.283
...,...,...,...,...,...
705001871140,CMP,30010320361,0,0.000,0.000
711001621300,CMP,35012787756,0,0.000,0.000
704001827865,CMP,35012787756,0,0.000,0.000
704001827864,CMP,35012787137,0,0.000,0.000


In [6]:
df = int_df

df_print = df[(df['invoice_number'] == '2023080000758144') & (df['account_number'] == '30010894035')]

df_print.to_csv('int_df.csv')

In [7]:
# # Step 6: Compute cost metrics
# df = pd.DataFrame(index = int_df.index)
# df['dim_datetimes_id']  = int_df['id']
# df['dim_meters_id']     = int_df['id_met']
# df['dim_bills_id']      = int_df['dim_bills_id']
# df['account_number']    = int_df['account_number']
# df['kwh']               = int_df['kwh']
# df['delivery_cost']     = int_df['kwh_used']       * int_df['delivery_rate']
# df['service_cost']      = int_df['service_charge'] * int_df['kwh_ratio']
# df['supply_cost']       = int_df['kwh_used']       * int_df['supply_rate']
# df['tax_cost']          = int_df['taxes']          * int_df['kwh_ratio']
# df['total_cost']        = df.filter(regex = '_cost$').sum(axis = 1)