In [8]:
import numpy as np
import matplotlib.pyplot as pyplot
import pandas as pd
import os
from datetime import datetime, timedelta
import random

In [9]:
# step 1: get or simulate some data


In [10]:

OUTPUT_DIR = "synthetic_synergy_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ------------------------------------------------------------
# PARAMETERS
# ------------------------------------------------------------
N_CUSTOMERS = 50_000      # change to 1_170_000 when needed
N_SMART_METERS = 12_000   # subset for interval data
N_YEARS = 3

np.random.seed(42)


# ============================================================
# A. BILLING SYSTEM DATA
# ============================================================

def simulate_billing_system_data():
    tariffs = ["A1", "L1", "L3", "R1", "R3", "M1", "K1"]
    customer_types = ["residential", "small_business", "medium_business", "large_business", "home_business"]

    df = pd.DataFrame({
        "customer_id": np.arange(1, N_CUSTOMERS + 1),
        "postcode": np.random.randint(6000, 6999, N_CUSTOMERS),
        "suburb_code": np.random.randint(1, 500, N_CUSTOMERS),
        "tariff": np.random.choice(tariffs, N_CUSTOMERS),
        "customer_type": np.random.choice(customer_types, N_CUSTOMERS),
        "monthly_consumption_kwh": np.random.gamma(shape=2.0, scale=200, size=N_CUSTOMERS),  # realistic heavy-tail
        "bill_amount": np.random.uniform(80, 600, N_CUSTOMERS),
        "payment_history_score": np.random.uniform(0, 1, N_CUSTOMERS),
        "concession_status": np.random.choice([0, 1], N_CUSTOMERS, p=[0.85, 0.15]),
        "connection_date": pd.date_range("2010-01-01", periods=N_CUSTOMERS).to_series().sample(N_CUSTOMERS).values,
    })

    df["customer_tenure_years"] = (pd.Timestamp("2025-01-01") - pd.to_datetime(df["connection_date"])).dt.days / 365

    df.to_csv(f"{OUTPUT_DIR}/billing_system_data.csv", index=False)
    print("Saved billing_system_data.csv")
    return df



# ============================================================
# B. ADVANCED METER INTERVAL DATA (15/30 MIN)
# ============================================================

def simulate_interval_data(customer_ids):
    interval_records = []

    for cid in customer_ids:
        for day in range(0, 7):  # one week simulation, scale if needed
            dt_start = datetime(2024, 1, 1) + timedelta(days=day)
            for i in range(48):  # 30-min intervals
                timestamp = dt_start + timedelta(minutes=30 * i)
                kwh = max(0, np.random.normal(0.5, 0.3))  # realistic low usage per interval
                solar = max(0, np.random.normal(0.3, 0.2)) if 10 <= timestamp.hour <= 15 else 0

                interval_records.append([cid, timestamp, kwh, solar])

    df = pd.DataFrame(interval_records,
                      columns=["customer_id", "timestamp", "kwh", "solar_export_kwh"])

    df.to_csv(f"{OUTPUT_DIR}/advanced_interval_data.csv", index=False)
    print("Saved advanced_interval_data.csv")
    return df



# ============================================================
# C. REGULATORY REPORTING DATA (ANNUAL)
# ============================================================

def simulate_regulatory_reporting():
    data = []
    for year in [2022, 2023, 2024]:
        data.append({
            "year": year,
            "total_customers": N_CUSTOMERS,
            "residential_customers": int(N_CUSTOMERS * 0.8),
            "business_customers": int(N_CUSTOMERS * 0.2),
            "complaints": np.random.randint(5000, 15000),
            "affordability_index": np.random.uniform(0.2, 0.8),
            "disconnections": np.random.randint(2000, 5000),
            "payment_difficulty_cases": np.random.randint(5000, 15000),
            "life_support_customers": np.random.randint(500, 1500),
        })

    df = pd.DataFrame(data)
    df.to_csv(f"{OUTPUT_DIR}/regulatory_reporting.csv", index=False)
    print("Saved regulatory_reporting.csv")
    return df



# ============================================================
# D. TECHNOLOGY ADOPTION DATA (Solar, EV, Batteries)
# ============================================================

def simulate_technology_adoption(customer_ids):
    df = pd.DataFrame({
        "customer_id": customer_ids,
        "solar_capacity_kw": np.random.choice([0, 3, 6, 10], size=len(customer_ids), p=[0.6, 0.2, 0.15, 0.05]),
        "battery_capacity_kwh": np.random.choice([0, 5, 10, 13], size=len(customer_ids), p=[0.85, 0.1, 0.03, 0.02]),
        "ev_plan_enrolled": np.random.choice([0, 1], len(customer_ids), p=[0.92, 0.08]),
        "midday_saver_enrolled": np.random.choice([0, 1], len(customer_ids), p=[0.7, 0.3]),
        "smart_meter_status": np.random.choice(["yes", "no"], len(customer_ids), p=[0.3, 0.7]),
    })

    df.to_csv(f"{OUTPUT_DIR}/technology_adoption.csv", index=False)
    print("Saved technology_adoption.csv")
    return df



# ============================================================
# E. WESTERN POWER NETWORK DATA (substation / feeder)
# ============================================================

def simulate_network_load_profiles():
    substations = 25
    records = []

    for s in range(substations):
        for day in range(365):
            for i in range(48):  # 30-min intervals
                load = np.random.normal(10_000, 2_000)  # kW load at substation
                records.append([s, day, i, max(0, load)])

    df = pd.DataFrame(records,
                      columns=["substation_id", "day_of_year", "interval_30min", "load_kw"])

    df.to_csv(f"{OUTPUT_DIR}/network_load_profiles.csv", index=False)
    print("Saved network_load_profiles.csv")
    return df



# ============================================================
# RUN EVERYTHING
# ============================================================

if __name__ == "__main__":
    billing_df = simulate_billing_system_data()

    smart_meter_ids = np.random.choice(billing_df["customer_id"], N_SMART_METERS, replace=False)
    interval_df = simulate_interval_data(smart_meter_ids)

    regulatory_df = simulate_regulatory_reporting()
    tech_df = simulate_technology_adoption(billing_df["customer_id"])
    network_df = simulate_network_load_profiles()

    print("All synthetic data generated in:", OUTPUT_DIR)


Saved billing_system_data.csv
Saved advanced_interval_data.csv
Saved regulatory_reporting.csv
Saved technology_adoption.csv
Saved network_load_profiles.csv
All synthetic data generated in: synthetic_synergy_data


In [11]:
billing_df.head()

Unnamed: 0,customer_id,postcode,suburb_code,tariff,customer_type,monthly_consumption_kwh,bill_amount,payment_history_score,concession_status,connection_date,customer_tenure_years
0,1,6102,100,A1,large_business,214.540771,439.460982,0.213238,1,2014-09-10,10.317808
1,2,6435,384,L1,small_business,299.118885,559.032124,0.077685,0,2094-10-04,-69.80274
2,3,6860,455,M1,residential,665.443131,439.120229,0.697628,0,2053-07-30,-28.594521
3,4,6270,144,L3,medium_business,187.480672,422.690873,0.987886,0,2075-08-01,-50.613699
4,5,6106,192,K1,medium_business,44.408474,178.543948,0.546124,0,2046-07-19,-21.558904


In [13]:
# using the data from the actual synergy system to fix the simulaition parameters