### Revisions
1. use merge_asof to match the forecast date to the dates in the rate table
2. filter the meter aq so it’s in between the min aq and max aq in the rate table
3. (Optional) Graphs to show linear

In [148]:
import pandas as pd

#retrieve csv
def read_csv(path):
    return pd.read_csv(path)

def combine_df(rates, meters, forecast):
    # Merge the meters and rates table
    df_mr = pd.merge(meters, rates, on='exit_zone')
    
    # Convert dates to datetime
    df_mr['date'] = pd.to_datetime(df_mr['date'])
    forecast['date'] = pd.to_datetime(forecast['date'])
    
    # Sort values by date for asof merge
    df_mr = df_mr.sort_values(by='date')
    forecast = forecast.sort_values(by='date')
    
    # Merge meters-rates with forecast
    df_mrf = pd.merge_asof(df_mr, forecast, by='meter_id', on='date', direction='nearest')

    return df_mrf


def format_value(x):
    if isinstance(x, (int, float)):
        return f"{x:.2f}"
    return x

def filtered_df(df):
    filtered_aq = df[(df['aq_kwh'] >= df['aq_min_kwh']) & (df['aq_kwh'] <= df['aq_max_kwh'])].copy()

    # Calculate the daily charge
    filtered_aq.loc[:, 'daily_charge'] = filtered_aq['rate_p_per_kwh'] * filtered_aq['kwh']
    
    # Calculate the total cost and forecast
    cost_total = filtered_aq.groupby('meter_id')['daily_charge'].sum() * 0.01
    forecast_total = filtered_aq.groupby('meter_id')['kwh'].sum()
    
    # Create one table, reset the index as it's registering meter_id as the index due to previous groupby
    df_total = pd.merge(forecast_total, cost_total, on='meter_id').reset_index()
    
    # Making it pretty now
    columns = {
        'meter_id': 'Meter ID', 
        'kwh': 'Total Estimated Consumption (kWh)', 
        'daily_charge': 'Total Cost (£)'
    }
    df_total['kwh'] = df_total['kwh'].map(format_value)
    df_total['daily_charge'] = df_total['daily_charge'].map(format_value)
    
    df_result = df_total.rename(columns=columns)
    
    # Display the result
    return df_result


rates = read_csv("./data/rate.csv")
meters = read_csv("./data/meter.csv")
forecast = read_csv("./data/forecast.csv")

df = combine_df(rates, meters, forecast)
filtered_df(df)

Unnamed: 0,Meter ID,Total Estimated Consumption (kWh),Total Cost (£)
0,14676236,369.89,1.76
1,34509937,930.5,4.4
2,50264822,3458.3,13.07
3,88357331,5140.55,20.44


In [152]:
def gen_meter_list(m, ez=None, seed=None, min_aq=0, max_aq=5000):
    if seed is not None:
        np.random.seed(seed)
    
    meter_ids = np.random.randint(1, int(1e7), size=m)
    exit_zones = np.random.choice(ez, size=m) if ez is not None else None
    annual_quantity = np.random.uniform(min_aq, max_aq, size=m)

    data = {
        'meter_id': meter_ids,
        "exit_zone": exit_zones,
        'aq_kwh': annual_quantity
    }

    if exit_zones is not None:
        data['exit_zone'] = exit_zones

    df_meters = pd.DataFrame(data)
    return df_meters

In [153]:
def gen_consumption_list(meter_list, start_date, periods):
    # make sure start date is in the correct format using Panda Datetime stamp
    if isinstance(start_date, str):
        start_date = pd.Timestamp(start_date)
    else:
        raise TypeError("Start Date must be a string and a valid date YYYY-MM-DD format.")

    # generate forecast periods using duration by Day
    dates = pd.date_range(start=start_date, periods=periods, freq='D')
    # get generated meter list (meter_id, dates, kwh)
    meters = meter_list['meter_id'].values    
    # print(meters, len(dates), len(consumption))

    # get consumption data
    df_consumption = pd.DataFrame({
        'meter_id': np.random.choice(meters, size=periods * len(meters)),
        'date': np.tile(dates, len(meters)),
        'kwh': np.random.randint(10, 100, size=periods * len(meters))
    })
    
    return df_consumption

In [154]:
ez = rates['exit_zone'].unique() # this is just to get only unique exit zones in existing data.
m = gen_meter_list(len(ez),ez)
c = gen_consumption_list(m, '2024-01-01', 30)

def calc_cost(m,c):
    gen_df = combine_df(rates,m,c)
    return filtered_df(gen_df)

calc_cost(m,c)

Unnamed: 0,Meter ID,Total Estimated Consumption (kWh),Total Cost (£)
0,361003,218.0,1.03
1,663031,570.0,2.23
2,1265029,286.0,1.87
3,1506055,640.0,2.44
4,1711419,356.0,1.31
5,1845262,340.0,2.0
6,1884349,718.0,2.77
7,1987361,860.0,3.03
8,2393285,702.0,2.14
9,2393780,310.0,1.92


In [156]:
import timeit
import pandas as pd
import numpy as np
from memory_profiler import memory_usage

def benchmark(fn, size, seed=42, days=30):
    m = gen_meter_list(size, ez, seed)
    c = gen_consumption_list(m, '2024-01-01', days)

    # execution time
    exe_time = timeit.timeit(lambda: fn(m,c), number=10)
    avg_time = exe_time / 10

    # memory usage
    mem = memory_usage((calc_cost, (m, c)), max_iterations=1)
    avg_mem = max(mem) - min(mem)

    return avg_time, avg_mem

# testing the test
def benchmark_test(fn):
    sizes = [100, 1000, 10000, 100000, 500000, 1000000]
    results = []
    
    for size in sizes:
        runtime, mem_usage = benchmark(fn, size)
        results.append((size, runtime))
        print(f"Size: {size}, Avg Runtime: {runtime:.2f} seconds, Memory Usage: {mem_usage:.2f} MiB")

benchmark_test(calc_cost)

Size: 100, Avg Runtime: 0.01 seconds, Memory Usage: 0.00 MiB
Size: 1000, Avg Runtime: 0.02 seconds, Memory Usage: 0.02 MiB
Size: 10000, Avg Runtime: 0.11 seconds, Memory Usage: 25.64 MiB
Size: 100000, Avg Runtime: 1.28 seconds, Memory Usage: 8.80 MiB
Size: 500000, Avg Runtime: 8.96 seconds, Memory Usage: 46.77 MiB
Size: 1000000, Avg Runtime: 19.37 seconds, Memory Usage: 1002.69 MiB


In [158]:
import dask.dataframe as dd

def calc_cost_dask(meter, consumption):
    meter_dask = dd.from_pandas(meter, npartitions=4)
    consumption_dask = dd.from_pandas(consumption, npartitions=4)

    meters_and_consumption = dd.merge(meter_dask, consumption_dask, on='meter_id')

    meters_and_consumption['rate_p_per_kwh'] = meters_and_consumption['aq_kwh'] / meters_and_consumption['kwh']
    meters_and_consumption['daily_charge'] = meters_and_consumption['rate_p_per_kwh'] * meters_and_consumption['kwh']

    grouped = meters_and_consumption.groupby('meter_id').agg({
        'daily_charge': 'sum',
        'kwh': 'sum'
    }).compute()

    grouped['daily_charge'] *= 0.01

    grouped = grouped.reset_index()
    grouped.columns = ['Meter ID', 'Total Cost (£)', 'Total Estimated Consumption (kWh)']

    return grouped.map(format_value)

# Run benchmark for Dask function
benchmark_test(calc_cost_dask)

Size: 100, Avg Runtime: 0.05 seconds, Memory Usage: 0.00 MiB
Size: 1000, Avg Runtime: 0.05 seconds, Memory Usage: 0.58 MiB
Size: 10000, Avg Runtime: 0.08 seconds, Memory Usage: 8.83 MiB
Size: 100000, Avg Runtime: 0.46 seconds, Memory Usage: 29.52 MiB
Size: 500000, Avg Runtime: 2.33 seconds, Memory Usage: 1148.61 MiB
Size: 1000000, Avg Runtime: 5.00 seconds, Memory Usage: 4607.17 MiB


In [161]:
import matplotlib.pyplot as pt

def plot_benchmark_results(sizes, runtimes, mem_usages):
    fig, ax1 = plt.subplots()

    color = 'tab:blue'
    ax1.set_xlabel('Dataset Size')
    ax1.set_ylabel('Average Runtime (s)', color=color)
    ax1.plot(sizes, runtimes, color=color)
    ax1.tick_params(axis='y', labelcolor=color)

    ax2 = ax1.twinx()
    color = 'tab:red'
    ax2.set_ylabel('Memory Usage (MiB)', color=color)
    ax2.plot(sizes, mem_usages, color=color)
    ax2.tick_params(axis='y', labelcolor=color)

    fig.tight_layout()
    plt.title('Benchmark Results')
    plt.show()

# Run the benchmark test
sizes, runtimes, mem_usages = benchmark_test(calc_cost)

# Plot the results
plot_benchmark_results(sizes, runtimes, mem_usages)

Size: 100, Avg Runtime: 0.01 seconds, Memory Usage: 0.44 MiB
Size: 1000, Avg Runtime: 0.02 seconds, Memory Usage: 4.66 MiB
Size: 10000, Avg Runtime: 0.15 seconds, Memory Usage: 0.00 MiB


KeyboardInterrupt: 