### Revisions
1. use merge_asof to match the forecast date to the dates in the rate table
2. filter the meter aq so it’s in between the min aq and max aq in the rate table
3. (Optional) Graphs to show linear

In [145]:
import pandas as pd

#retrieve csv
def read_csv(path):
    return pd.read_csv(path)

def combine_df():

read_csv("./data/rate.csv", "./data/forecast.csv", "./data/meter.csv")

# Merge the meters and rates table
df_mr = pd.merge(meters, rates, on='exit_zone')

# Convert dates to datetime
df_mr['date'] = pd.to_datetime(df_mr['date'])
forecast['date'] = pd.to_datetime(forecast['date'])

# Sort values by date for asof merge
df_mr = df_mr.sort_values(by='date')
forecast = forecast.sort_values(by='date')

# Merge meters-rates with forecast
df_mrf = pd.merge_asof(df_mr, forecast, by='meter_id', on='date', direction='nearest')

# Filter meter aq to be between min and max aq
filtered_aq = df_mrf[(df_mrf['aq_kwh'] >= df_mrf['aq_min_kwh']) & (df_mrf['aq_kwh'] <= df_mrf['aq_max_kwh'])].copy()

# Calculate the daily charge
filtered_aq.loc[:, 'daily_charge'] = filtered_aq['rate_p_per_kwh'] * filtered_aq['kwh']

# Calculate the total cost and forecast
cost_total = filtered_aq.groupby('meter_id')['daily_charge'].sum() * 0.01
forecast_total = filtered_aq.groupby('meter_id')['kwh'].sum()

# Create one table, reset the index as it's registering meter_id as the index due to previous groupby
df_total = pd.merge(forecast_total, cost_total, on='meter_id').reset_index()

# Making it pretty now
columns = {
    'meter_id': 'Meter ID', 
    'kwh': 'Total Estimated Consumption (kWh)', 
    'daily_charge': 'Total Cost (£)'
}
df_result = df_total.rename(columns=columns)

# Display the result
print(df_result.to_string(index=False))

 Meter ID  Total Estimated Consumption (kWh)  Total Cost (£)
 14676236                         369.887399        1.756720
 34509937                         930.498627        4.397839
 50264822                        3458.297730       13.067652
 88357331                        5140.547491       20.442827


In [146]:
def gen_meter_list(m, ez, min=0, max=5000):
    id = np.random.randint(1,1e7, size=m) # generate random meters
    exit_zones = np.random.choice(ez, size=m) # select random exit zones
    aq = np.random.uniform(min,max, size=m)
    
    df_meters = pd.DataFrame({
        'meter_id': id,
        "exit_zone": exit_zones,
        'annual_quantity': aq
    })
    return df_meters

ez = df_rate['exit_zone'].unique() # this is just to get only unique exit zones in existing data.
meter = gen_meter_list(len(ez),ez)

print(meter)

    meter_id exit_zone  annual_quantity
0    2425729       NE2        89.347130
1    4074511       SC1      1419.543651
2    8747581       SE1      3504.641361
3    4468081       WA2      1668.947985
4    1720539       NT3      2126.481269
5    5049518       NT2      1187.166190
6    4926763        LC      3161.644228
7    4157549       WM1       484.181825
8    6236037       NO1      1373.444702
9    8704734       WM1      3309.060651
10   2645253       SW3      4402.561350
11   4217700       NW1      2205.204611
12   5798066       EM2      3371.800237
13    140264       EM2      1660.113572
14   5878164       SW2      3332.227278
15   2970933       WM1      3379.493289
16    176677       SW2      2604.365594
17   7846468       EA1      3269.762320
18    344836       NO2      4951.324763
19   5192070       NE1      2777.686493
20   3074739       WM3       914.399190
21   1935193        LC      2280.288758
22   6069573       SE2      2673.741546
23   9598548       NT3      4317.049636


In [147]:
def gen_consumption_list(meter_list, start_date, periods):
    # make sure start date is in the correct format using Panda Datetime stamp
    if isinstance(start_date, str):
        start_date = pd.Timestamp(start_date)
    else:
        raise TypeError("Start Date must be a string and a valid date YYYY-MM-DD format.")

    # generate forecast periods using duration by Day
    dates = pd.date_range(start=start_date, periods=periods, freq='D')

    # print(dates)

    # get generated meter list (meter_id, dates, kwh)
    meters = np.array([meter_list['meter_id']]).flatten()
    dates = np.tile(dates, len(meters))
    consumption = np.random.uniform(0,5000,size=(len(meter_list),periods)).flatten() # might need to consider what the aq_min and aq_max ranges are based on the date (?)
    
    # print(meters, len(dates), len(consumption))

    # get consumption data
    df_consumption = pd.DataFrame({
        'meter_id': np.repeat(meters,periods),
        'date': dates,
        'kwh': consumption
    })
    
    return df_consumption
    
consumption = gen_consumption_list(df_meter, '2024-01-01', 30)

print (consumption)

     meter_id       date          kwh
0    14676236 2024-01-01  4713.375696
1    14676236 2024-01-02  3125.688978
2    14676236 2024-01-03  3127.332996
3    14676236 2024-01-04  1775.576609
4    14676236 2024-01-05  1838.227506
..        ...        ...          ...
115  88357331 2024-01-26  4421.523917
116  88357331 2024-01-27  3696.202010
117  88357331 2024-01-28  2735.338160
118  88357331 2024-01-29  3913.232885
119  88357331 2024-01-30   723.888616

[120 rows x 3 columns]


In [148]:
# Function to calculate transportation cost table & Benchmark
"""
I - Meter list (meter, exit zone, aq columns), consumption table (meter_id, date, kwh)
O - Transportation cost table (meter_id, total consumption, total cost)
C - valid meter list, valid consumption forecast, valid rate table
E - empty dataframes, non-unique meters, missing data (i.e. NaN), inconsistent data consumption not in meter, neg values in rates or consumption
"""

def format_value(x):
    if isinstance(x, (int, float)):
        return f"{x:.2f}"
    return x

In [149]:
def calc_cost(meter, consumption):
    # check edge cases and constraints
        # is there any missing data (i.e. NaN)?
        # is it a valid consumption table?
        # is the data frame empty?
        # are all the meters unique in the list?
        # is there any missing data (i.e. NaN)?
    # thinking about ways to check if data consumption is inconsistent
    
    # combine all data from params
    meters_and_consumption = pd.merge(meter, consumption, on='meter_id')

    # need rate_p_per_kwh
    meters_and_consumption['rate_p_per_kwh'] = meters_and_consumption['annual_quantity'] / meters_and_consumption['kwh']
    
    # calc the daily charge
    meters_and_consumption['daily_charge'] = meters_and_consumption['rate_p_per_kwh'] * meters_and_consumption['kwh']

    # calc total cost and total consumption per meter
    cost_total = meters_and_consumption.groupby('meter_id')['daily_charge'].sum() * 0.01
    consumption_total = meters_and_consumption.groupby('meter_id')['kwh'].sum()

    # create the data frame
    df_total = pd.merge(consumption_total, cost_total, on='meter_id').reset_index()
    df_total = df_total.map(format_value)

    # making it pretty
    columns = {
        'meter_id': 'Meter ID', 
        'kwh': 'Total Estimated Consumption (kWh)', 
        'daily_charge': 'Total Cost (£)'
    }

    df_result = df_total.rename(columns=columns)
    
    # return transportation_cost
    return df_result

m = gen_meter_list(len(ez),ez)
c = gen_consumption_list(m, '2024-01-01', 30)

# see the magic
calc_cost(m, c)

Unnamed: 0,Meter ID,Total Estimated Consumption (kWh),Total Cost (£)
0,222375.0,66797.63,529.86
1,337074.0,67092.28,265.77
2,656977.0,68322.41,1224.87
3,813796.0,83898.34,1269.17
4,864032.0,74071.56,387.53
5,949973.0,78253.26,1095.8
6,1126986.0,86554.2,1387.08
7,1318902.0,79165.03,876.41
8,1475524.0,73831.64,1130.61
9,1709202.0,82032.3,1107.56


In [152]:
import timeit
import pandas as pd
import numpy as np
from memory_profiler import memory_usage

# creating large sample data for testing
def gen_meter_list (size, seed):
    np.random.seed(seed)
    id = np.arange(1, size + 1)
    aq = np.random.randint(1000,5000,size)
    return pd.DataFrame({
        'meter_id': id,
        'annual_quantity': aq
    })

def gen_consumption_list(meter_df, start_date, days):
    meter_ids = meter_df['meter_id'].values
    date_range = pd.date_range(start=start_date, periods=days)
    consumption_data = {
        'meter_id': np.random.choice(meter_ids, size=days * len(meter_ids)),
        'date': np.tile(date_range, len(meter_ids)),
        'kwh': np.random.randint(10, 100, size=days * len(meter_ids))
    }
    return pd.DataFrame(consumption_data)

def benchmark(fn, size, seed=42, days=30):
    m = gen_meter_list(size, seed)
    # print (m)
    c = gen_consumption_list(m, '2024-01-01', days)

    # execution time
    exe_time = timeit.timeit(lambda: fn(m,c), number=10)
    avg_time = exe_time / 10

    # memory usage
    mem = memory_usage((calc_cost, (m, c)), max_iterations=1)
    avg_mem = max(mem) - min(mem)

    return avg_time, avg_mem

# testing the test
def benchmark_test(fn):
    sizes = [100, 1000, 10000, 100000, 500000, 1000000]
    results = []
    
    for size in sizes:
        runtime, mem_usage = benchmark(fn, size)
        results.append((size, runtime))
        print(f"Size: {size}, Avg Runtime: {runtime:.2f} seconds, Memory Usage: {mem_usage:.2f} MiB")

benchmark_test(calc_cost)

Size: 100, Avg Runtime: 0.00 seconds, Memory Usage: 26.27 MiB
Size: 1000, Avg Runtime: 0.00 seconds, Memory Usage: 0.02 MiB
Size: 10000, Avg Runtime: 0.05 seconds, Memory Usage: 11.64 MiB
Size: 100000, Avg Runtime: 0.49 seconds, Memory Usage: 38.56 MiB
Size: 500000, Avg Runtime: 2.54 seconds, Memory Usage: 77.09 MiB
Size: 1000000, Avg Runtime: 5.11 seconds, Memory Usage: 437.34 MiB


In [161]:
import dask.dataframe as dd

def calc_cost_dask(meter, consumption):
    meter_dask = dd.from_pandas(meter, npartitions=4)
    consumption_dask = dd.from_pandas(consumption, npartitions=4)

    meters_and_consumption = dd.merge(meter_dask, consumption_dask, on='meter_id')

    meters_and_consumption['rate_p_per_kwh'] = meters_and_consumption['annual_quantity'] / meters_and_consumption['kwh']
    meters_and_consumption['daily_charge'] = meters_and_consumption['rate_p_per_kwh'] * meters_and_consumption['kwh']

    grouped = meters_and_consumption.groupby('meter_id').agg({
        'daily_charge': 'sum',
        'kwh': 'sum'
    }).compute()

    grouped['daily_charge'] *= 0.01

    grouped = grouped.reset_index()
    grouped.columns = ['Meter ID', 'Total Cost (£)', 'Total Estimated Consumption (kWh)']

    return grouped.map(format_value)

In [162]:
# Creating graphs to show linear execution time and benchmarking
import matplotlib.pyplot as plot

# Run benchmark for Dask function
results = benchmark_test(calc_cost_dask)

# Plotting the results
sizes, runtimes, mem_usages = zip(*results)

plot.figure(figsize=(12, 6))

plot.subplot(1, 2, 1)
plot.plot(sizes, runtimes, marker='o')
plot.title('Runtime vs Data Size')
plot.xlabel('Data Size')
plot.ylabel('Runtime (seconds)')
plot.xscale('log')
plot.yscale('log')

plot.subplot(1, 2, 2)
plot.plot(sizes, mem_usages, marker='o')
plot.title('Memory Usage vs Data Size')
plot.xlabel('Data Size')
plot.ylabel('Memory Usage (MiB)')
plot.xscale('log')
plot.yscale('log')

plot.tight_layout()
plot.show()

Size: 100, Avg Runtime: 0.04 seconds, Memory Usage: 0.09 MiB
Size: 1000, Avg Runtime: 0.05 seconds, Memory Usage: 0.00 MiB
Size: 10000, Avg Runtime: 0.08 seconds, Memory Usage: 0.83 MiB
Size: 100000, Avg Runtime: 0.50 seconds, Memory Usage: 12.39 MiB
Size: 500000, Avg Runtime: 2.47 seconds, Memory Usage: 441.45 MiB
Size: 1000000, Avg Runtime: 5.47 seconds, Memory Usage: 2608.09 MiB


TypeError: zip() argument after * must be an iterable, not NoneType