In [None]:
%pip install pdfplumber pandas tabulate

Collecting tabulate
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0


In [3]:
%pip install pdfplumber transformers accelerate pandas

Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting accelerate
  Downloading accelerate-1.11.0-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Using cached pyyaml-6.0.3-cp310-cp310-win_amd64.whl.metadata (2.4 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp310-cp310-win_amd64.whl.metadata (41 kB)
Collecting requests (from transformers)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.6.2-cp38-abi3-win_amd64.whl.me


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pdfplumber, re, json, subprocess, pandas as pd
from pathlib import Path

# ---------- CONFIG ----------
PDF_PATH = "National Grid Tariff-New York.pdf"
MODEL = "mistral"   # or try "phi3" if smaller CPU model is preferred
OUTPUT_JSON = "tariff_rates_from_ollama.json"

# ---------- FUNCTION TO QUERY OLLAMA ----------
def query_ollama(prompt, model=MODEL):
    """Send a prompt to the local Ollama model and return the response."""
    process = subprocess.Popen(["ollama", "run", model],
                               stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    output, _ = process.communicate(prompt.encode("utf-8"))
    return output.decode("utf-8")

# ---------- STEP 1: Extract SC Sections from the PDF ----------
sections = {}
with pdfplumber.open(PDF_PATH) as pdf:
    text = "\n".join(page.extract_text() or "" for page in pdf.pages)

for match in re.finditer(r"SERVICE\s+CLASSIFICATION\s+NO\.?\s*(\d+[A-Z\-]*)", text, re.IGNORECASE):
    sc_id = match.group(1).strip()
    start = match.start()
    next_match = re.search(r"SERVICE\s+CLASSIFICATION\s+NO\.?\s*(\d+[A-Z\-]*)", text[start+10:], re.IGNORECASE)
    end = start + 10 + next_match.start() if next_match else len(text)
    sections[f"SC-{sc_id}"] = text[start:end]

print(f"üìÑ Found {len(sections)} service classifications in PDF.")

# ---------- STEP 2: Define the Prompt ----------
def make_prompt(sc, content):
    return f"""
You are an expert in electric utility tariff extraction.

Read the text below for {sc} and extract *only* rate information in JSON format.

Required keys:
{{
  "Service Classification": "{sc}",
  "Effective Date": "",
  "Previous Effective Date": "",
  "Basic Service Charge ($/month)": "",
  "Monthly Minimum Charge ($)": "",
  "Energy Rates ($/kWh)": {{
      "On Peak": "",
      "Off Peak": "",
      "Super Peak": "",
      "All Hours": ""
  }},
  "Demand / Distribution Rates ($/kW)": {{
      "Distribution": "",
      "Delivery": "",
      "As-Used On Peak": "",
      "As-Used Super Peak": ""
  }},
  "Reactive Demand ($/RkVA)": "",
  "Notes": ""
}}

Text:
{content}

Respond with **valid JSON only**, no explanations.
"""

# ---------- STEP 3: Loop Through and Query Ollama ----------
results = []
for sc, content in sections.items():
    print(f"‚öôÔ∏è Extracting rates for {sc} ...")
    prompt = make_prompt(sc, content)
    response = query_ollama(prompt)

    # Extract JSON portion safely
    json_part = response[response.find("{"):response.rfind("}")+1]
    try:
        data = json.loads(json_part)
        results.append(data)
        print(f"‚úÖ Parsed {sc} successfully.")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not parse JSON for {sc}: {e}")
        print("Raw output snippet:", response[:400])

# ---------- STEP 4: Save and Convert ----------
Path(OUTPUT_JSON).write_text(json.dumps(results, indent=2))
print(f"\nüíæ Saved structured data to {OUTPUT_JSON}")

df = pd.json_normalize(results)
excel_path = "Tariff_Rates_from_Ollama.xlsx"
df.to_excel(excel_path, index=False)
print(f"üìä Data exported to {excel_path}")


üìÑ Found 23 service classifications in PDF.
‚öôÔ∏è Extracting rates for SC-12 ...


FileNotFoundError: [WinError 2] The system cannot find the file specified

In [1]:
import pdfplumber
import re
import pandas as pd

pdf_path = "National Grid Tariff-New York.pdf"
output_excel = "SC1_SC1C_Tariff_Rates.xlsx"

# ---------- 1. Extract SC text ----------
def extract_sc_text(pdf_path, sc_no):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text() or ""
            if re.search(rf"SERVICE\s+CLASSIFICATION\s+NO\.?\s*{sc_no}\b", page_text, re.I):
                text += page_text + "\n"
    return text

sc1_text = extract_sc_text(pdf_path, "1(?!-C)")   # SC-1 only
sc1c_text = extract_sc_text(pdf_path, "1-?C")     # SC-1-C

# ---------- 2. Define regex patterns ----------
patterns = {
    "Effective Date": r"Effective\s*(?:Date|on|from)[:\-]?\s*([A-Za-z]+\s*\d{1,2},?\s*\d{4})",
    "Basic Service Charge": r"Basic\s+Service\s+Charge[^$\d]*(\$?\d+\.\d{2})",
    "Monthly Minimum Charge": r"Monthly\s+Minimum\s+Charge[^$\d]*(\$?\d+\.\d{2})",
    "Per kWh All Hours": r"Per\s*kWh[^$\d]*(\$?\d+\.\d{4})",
    "On Peak": r"On\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Off Peak": r"Off\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Super Peak": r"Super\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Distribution Delivery": r"Distribution\s*(?:Delivery|Charge)[^$\d]*(\$?\d+\.\d{4})",
}

def extract_values(text, sc_name):
    values = {key: None for key in patterns}
    for key, pat in patterns.items():
        match = re.search(pat, text, re.I)
        if match:
            values[key] = match.group(1)
    return {
        "Service_Classification": sc_name,
        "Effective_Date": values["Effective Date"],
        "Basic_Service_Charge_($/month)": values["Basic Service Charge"],
        "Monthly_Minimum_Charge_($)": values["Monthly Minimum Charge"],
        "Energy_Rate_All_Hours_($/kWh)": values["Per kWh All Hours"],
        "On_Peak_($/kWh)": values["On Peak"],
        "Off_Peak_($/kWh)": values["Off Peak"],
        "Super_Peak_($/kWh)": values["Super Peak"],
        "Distribution_Delivery_($/kWh)": values["Distribution Delivery"],
    }

# ---------- 3. Build DataFrame ----------
records = [
    extract_values(sc1_text, "SC-1"),
    extract_values(sc1c_text, "SC-1-C")
]

df = pd.DataFrame(records)

# ---------- 4. Save to Excel ----------
df.to_excel(output_excel, index=False)
print(f"‚úÖ Tariff data saved to Excel file: {output_excel}")


‚úÖ Tariff data saved to Excel file: SC1_SC1C_Tariff_Rates.xlsx


In [2]:
import pdfplumber
import re
import pandas as pd

pdf_path = "National Grid Tariff-New York.pdf"
output_excel = "SC2_SC2D_Tariff_Rates.xlsx"

# ---------- 1. Extract SC text ----------
def extract_sc_text(pdf_path, sc_no):
    """Extracts text for a given service classification"""
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text() or ""
            if re.search(rf"SERVICE\s+CLASSIFICATION\s+NO\.?\s*{sc_no}\b", page_text, re.I):
                text += page_text + "\n"
    return text

sc2_text = extract_sc_text(pdf_path, "2(?!D)")   # SC-2 only
sc2d_text = extract_sc_text(pdf_path, "2D")      # SC-2D section

# ---------- 2. Define regex patterns ----------
patterns = {
    "Effective Date": r"Effective\s*(?:Date|on|from)[:\-]?\s*([A-Za-z]+\s*\d{1,2},?\s*\d{4})",
    "Basic Service Charge": r"Basic\s+Service\s+Charge[^$\d]*(\$?\d+\.\d{2})",
    "Monthly Minimum Charge": r"Monthly\s+Minimum\s+Charge[^$\d]*(\$?\d+\.\d{2})",
    "Per kWh All Hours": r"Per\s*kWh[^$\d]*(\$?\d+\.\d{4})",
    "On Peak": r"On\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Off Peak": r"Off\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Super Peak": r"Super\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Distribution Delivery": r"Distribution\s*(?:Delivery|Charge)[^$\d]*(\$?\d+\.\d{4})",
    "Demand Charge": r"(?:Demand|per\s*kW)[^$\d]*(\$?\d+\.\d{2})"
}

# ---------- 3. Extract values ----------
def extract_values(text, sc_name):
    values = {key: None for key in patterns}
    for key, pat in patterns.items():
        match = re.search(pat, text, re.I)
        if match:
            values[key] = match.group(1)
    return {
        "Service_Classification": sc_name,
        "Effective_Date": values["Effective Date"],
        "Basic_Service_Charge_($/month)": values["Basic Service Charge"],
        "Monthly_Minimum_Charge_($)": values["Monthly Minimum Charge"],
        "Energy_Rate_All_Hours_($/kWh)": values["Per kWh All Hours"],
        "On_Peak_($/kWh)": values["On Peak"],
        "Off_Peak_($/kWh)": values["Off Peak"],
        "Super_Peak_($/kWh)": values["Super Peak"],
        "Distribution_Delivery_($/kWh_or_kW)": values["Distribution Delivery"],
        "Demand_Charge_($/kW)": values["Demand Charge"]
    }

# ---------- 4. Build DataFrame ----------
records = [
    extract_values(sc2_text, "SC-2"),
    extract_values(sc2d_text, "SC-2D")
]

df = pd.DataFrame(records)

# ---------- 5. Save to Excel ----------
df.to_excel(output_excel, index=False)
print(f"‚úÖ Tariff data saved to Excel: {output_excel}")


‚úÖ Tariff data saved to Excel: SC2_SC2D_Tariff_Rates.xlsx


In [3]:
import pdfplumber
import re
import pandas as pd

pdf_path = "National Grid Tariff-New York.pdf"
output_excel = "SC3_SC3A_Tariff_Rates_ByVoltage.xlsx"

# ---------- 1. Extract SC text ----------
def extract_sc_text(pdf_path, sc_no):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text() or ""
            if re.search(rf"SERVICE\s+CLASSIFICATION\s+NO\.?\s*{sc_no}\b", page_text, re.I):
                text += page_text + "\n"
    return text

sc3_text = extract_sc_text(pdf_path, "3(?!A)")   # SC-3
sc3a_text = extract_sc_text(pdf_path, "3A")      # SC-3A

# ---------- 2. Regex patterns ----------
patterns = {
    "Effective Date": r"Effective\s*(?:Date|on|from)[:\-]?\s*([A-Za-z]+\s*\d{1,2},?\s*\d{4})",
    "Customer Charge": r"(?:Customer|Basic)\s+Charge[^$\d]*(\$?\d+\.\d{2,4})",
    "On Peak": r"On\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Off Peak": r"Off\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Super Peak": r"Super\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Distribution": r"Distribution\s*(?:Delivery|Charge)[^$\d]*(\$?\d+\.\d{2,4})",
    "Reactive Demand": r"Reactive\s*Demand[^$\d]*(\$?\d+\.\d{2,4})",
    "Demand": r"per\s*kW[^$\d]*(\$?\d+\.\d{2,4})"
}

voltage_patterns = {
    "Secondary": r"(?i)(?:Secondary\s*\(<2\.?2\s*kV\)|Secondary)",
    "Primary": r"(?i)(?:Primary\s*\(2\.?2\s*[-‚Äì]?\s*15\s*kV\)|Primary)",
    "Subtransmission": r"(?i)(?:Subtransmission\s*\(22\s*[-‚Äì]?\s*50\s*kV\)|Subtransmission)",
    "Transmission": r"(?i)(?:Transmission\s*\(?>\s*60\s*kV\)|Transmission)"
}

# ---------- 3. Function to extract values by voltage ----------
def extract_by_voltage(text, sc_name):
    effective_match = re.search(patterns["Effective Date"], text)
    effective_date = effective_match.group(1) if effective_match else None

    rows = []
    for voltage_label, v_pat in voltage_patterns.items():
        section_matches = re.findall(v_pat + r"([^A-Z]{0,500})", text)  # capture nearby lines
        for section in section_matches:
            entry = {
                "Service_Classification": sc_name,
                "Voltage_Level": voltage_label,
                "Effective_Date": effective_date,
                "Customer_Charge_($/month)": None,
                "Distribution_Delivery_($/kW)": None,
                "On_Peak_($/kWh)": None,
                "Off_Peak_($/kWh)": None,
                "Super_Peak_($/kWh)": None,
                "Demand_Charge_($/kW)": None,
                "Reactive_Demand_Charge_($/RkVA)": None
            }
            # apply smaller regex searches in the captured block
            for key, pat in patterns.items():
                match = re.search(pat, section)
                if match:
                    if "On Peak" in key: entry["On_Peak_($/kWh)"] = match.group(1)
                    elif "Off Peak" in key: entry["Off_Peak_($/kWh)"] = match.group(1)
                    elif "Super Peak" in key: entry["Super_Peak_($/kWh)"] = match.group(1)
                    elif "Distribution" in key: entry["Distribution_Delivery_($/kW)"] = match.group(1)
                    elif "Demand" == key: entry["Demand_Charge_($/kW)"] = match.group(1)
                    elif "Reactive" in key: entry["Reactive_Demand_Charge_($/RkVA)"] = match.group(1)
                    elif "Customer" in key: entry["Customer_Charge_($/month)"] = match.group(1)
            rows.append(entry)
    return rows

# ---------- 4. Extract all ----------
sc3_rows = extract_by_voltage(sc3_text, "SC-3")
sc3a_rows = extract_by_voltage(sc3a_text, "SC-3A")

df = pd.DataFrame(sc3_rows + sc3a_rows)

# ---------- 5. Clean up duplicates ----------
df = df.drop_duplicates(subset=["Service_Classification", "Voltage_Level"]).reset_index(drop=True)

# ---------- 6. Save ----------
df.to_excel(output_excel, index=False)
print(f"‚úÖ Tariff data (by voltage) saved to Excel: {output_excel}")


‚úÖ Tariff data (by voltage) saved to Excel: SC3_SC3A_Tariff_Rates_ByVoltage.xlsx


In [4]:
import pdfplumber
import re
import pandas as pd

pdf_path = "National Grid Tariff-New York.pdf"
output_excel = "SC3_SC3A_Tariff_Rates.xlsx"

# ---------- 1. Extract SC text ----------
def extract_sc_text(pdf_path, sc_no):
    """Extracts text for a given service classification"""
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text() or ""
            if re.search(rf"SERVICE\s+CLASSIFICATION\s+NO\.?\s*{sc_no}\b", page_text, re.I):
                text += page_text + "\n"
    return text

sc3_text = extract_sc_text(pdf_path, "3(?!A)")   # SC-3 only
sc3a_text = extract_sc_text(pdf_path, "3A")      # SC-3A section

# ---------- 2. Define regex patterns ----------
patterns = {
    "Effective Date": r"Effective\s*(?:Date|on|from)[:\-]?\s*([A-Za-z]+\s*\d{1,2},?\s*\d{4})",
    "Customer Charge": r"(?:Customer|Basic)\s+Charge[^$\d]*(\$?\d+\.\d{2})",
    "Distribution Delivery": r"Distribution\s*(?:Delivery|Charge)[^$\d]*(\$?\d+\.\d{2,4})",
    "On Peak": r"On\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Off Peak": r"Off\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Super Peak": r"Super\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Reactive Demand": r"Reactive\s*Demand[^$\d]*(\$?\d+\.\d{2,4})",
    "Demand Charge": r"per\s*kW[^$\d]*(\$?\d+\.\d{2,4})"
}

# ---------- 3. Extraction function ----------
def extract_values(text, sc_name):
    values = {key: None for key in patterns}
    for key, pat in patterns.items():
        match = re.search(pat, text, re.I)
        if match:
            values[key] = match.group(1)
    return {
        "Service_Classification": sc_name,
        "Effective_Date": values["Effective Date"],
        "Customer_Charge_($/month)": values["Customer Charge"],
        "Distribution_Delivery_($/kWh_or_kW)": values["Distribution Delivery"],
        "On_Peak_($/kWh)": values["On Peak"],
        "Off_Peak_($/kWh)": values["Off Peak"],
        "Super_Peak_($/kWh)": values["Super Peak"],
        "Demand_Charge_($/kW)": values["Demand Charge"],
        "Reactive_Demand_Charge_($/RkVA)": values["Reactive Demand"]
    }

# ---------- 4. Build DataFrame ----------
records = [
    extract_values(sc3_text, "SC-3"),
    extract_values(sc3a_text, "SC-3A")
]

df = pd.DataFrame(records)

# ---------- 5. Save to Excel ----------
df.to_excel(output_excel, index=False)
print(f"‚úÖ Tariff data saved to Excel: {output_excel}")


‚úÖ Tariff data saved to Excel: SC3_SC3A_Tariff_Rates.xlsx


In [2]:
%pip install pandas

Collecting pandas
  Using cached pandas-2.3.3-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.22.4 (from pandas)
  Using cached numpy-2.2.6-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.3.3-cp310-cp310-win_amd64.whl (11.3 MB)
Using cached numpy-2.2.6-cp310-cp310-win_amd64.whl (12.9 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, numpy, pandas

   ---------------------------------------- 0/4 [pytz]
   ---------------------------------------- 0/4 [pytz]
   ---------- ----------------------------- 1/4 [tzdata]
   ---------- ----------------------------- 1/4 [tzdata]
   ---------- ----------------------------- 1/4 [tzdata]
   ------------


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
import json, re, pandas as pd
with open("D:\\utility-billing-ai\\data\\processed\\raw_extracted_tarif.json", "r", encoding="utf-8") as f:
    raw = json.load(f)

pages = raw["pages"]

def extract_effective_date(text):
    m = re.search(r"Effective Date:\s*([A-Za-z0-9,\/ ]+)", text)
    return m.group(1).strip() if m else None

def dollars(line):
    return [float(x) for x in re.findall(r"\$([0-9]+\.[0-9]+)", line)]


In [21]:
def parse_SC1(text, page):
    rec = {
        "scheme": "SC1",
        "page": page,
        "effective_date": extract_effective_date(text),
        "basic_service_charge": None,
        "per_kWh": None
    }

    for ln in text.splitlines():
        ln = ln.strip()
        if ln.startswith("Basic Service Charge"):
            vals = dollars(ln)
            if vals: rec["basic_service_charge"] = vals[0]

        if ln.startswith("Per kWh"):
            vals = dollars(ln)
            if vals: rec["per_kWh"] = vals[0]

    return rec


In [22]:
def parse_SC1C(text, page):
    rec = {
        "scheme": "SC1C",
        "page": page,
        "effective_date": extract_effective_date(text),
        "basic_service_charge": None,
        "per_kWh": None
    }

    for ln in text.splitlines():
        ln = ln.strip()

        if "Basic Service Charge" in ln:
            vals = dollars(ln)
            if vals: rec["basic_service_charge"] = vals[0]

        if ln.startswith("Per kWh"):
            vals = dollars(ln)
            if vals: rec["per_kWh"] = vals[0]

    return rec


In [23]:
def parse_SC2_non_demand(text, page):
    rec = {
        "scheme": "SC2_NonDemand",
        "page": page,
        "effective_date": extract_effective_date(text),
        "basic_service_charge": None,
        "basic_special_O": None,
        "per_kWh": None
    }

    for ln in text.splitlines():
        ln = ln.strip()

        if ln.startswith("Basic Service Charge: $25.00"):
            vals = dollars(ln)
            if vals: rec["basic_service_charge"] = vals[0]

        if "Special Provision O" in ln:
            vals = dollars(ln)
            if vals: rec["basic_special_O"] = vals[0]

        if ln.lower().startswith("per kwh"):
            vals = dollars(ln)
            if vals: rec["per_kWh"] = vals[0]

    return rec


In [24]:
def parse_SC2_demand(text, page):
    rec = {
        "scheme": "SC2_Demand",
        "page": page,
        "effective_date": extract_effective_date(text),
        "basic_service_charge": None,
        "basic_special_P": None,
        "distribution_per_kW": None,
    }

    for ln in text.splitlines():
        ln = ln.strip()

        if ln.startswith("Basic Service Charge"):
            vals = dollars(ln)
            if vals: rec["basic_service_charge"] = vals[0]

        if "Special Provision P" in ln:
            vals = dollars(ln)
            if vals: rec["basic_special_P"] = vals[0]

        if "per kW" in ln and "Distribution" in ln:
            vals = dollars(ln)
            if vals: rec["distribution_per_kW"] = vals[0]

    return rec


In [25]:
def parse_SC3(text, page):
    rec = {
        "scheme": "SC3",
        "page": page,
        "effective_date": extract_effective_date(text),
        "row_DistributionDelivery": [],
        "row_DistributionDelivery_SP": [],
        "row_MinDemandCharge": [],
        "row_AddlDemandCharge": []
    }

    for ln in text.splitlines():
        ln = ln.strip()

        # Distribution Delivery
        if ln.startswith("Distribution Delivery") and "Special Provision" not in ln:
            rec["row_DistributionDelivery"] = dollars(ln)

        # Distribution Delivery (Special Provision L)
        if "Special Provision L" in ln:
            rec["row_DistributionDelivery_SP"] = dollars(ln)

        # Minimum Demand Charges
        if ln.startswith("Minimum Demand Charges"):
            rec["row_MinDemandCharge"] = dollars(ln)

        # Additional Demand Charges
        if ln.startswith("Additional Demand Charges"):
            rec["row_AddlDemandCharge"] = dollars(ln)

    return rec


In [26]:
def parse_SC3A(text, page):
    rec = {
        "scheme": "SC3A",
        "page": page,
        "effective_date": extract_effective_date(text),
        "distribution_delivery": [],
        "demand_per_kW": []
    }

    for ln in text.splitlines():
        ln = ln.strip()

        if ln.startswith("Distribution Delivery") and "per kW" not in ln:
            rec["distribution_delivery"] = dollars(ln)

        if "Charges; per kW" in ln or "per kW:" in ln:
            rec["demand_per_kW"] = dollars(ln)

    return rec


In [27]:
SC1_rows=[]
SC1C_rows=[]
SC2_non_rows=[]
SC2_dem_rows=[]
SC3_rows=[]
SC3A_rows=[]

for p in pages:
    txt = p["text"]
    pg = p["page_number"]

    if "SERVICE CLASSIFICATION NO. 1" in txt and "1-C" not in txt:
        SC1_rows.append(parse_SC1(txt, pg))

    if "SERVICE CLASSIFICATION NO. 1-C" in txt:
        SC1C_rows.append(parse_SC1C(txt, pg))

    if "NON-DEMAND SERVICE" in txt and "SERVICE CLASSIFICATION NO. 2" in txt:
        SC2_non_rows.append(parse_SC2_non_demand(txt, pg))

    if "METERED DEMAND SERVICE" in txt:
        SC2_dem_rows.append(parse_SC2_demand(txt, pg))

    if "SERVICE CLASSIFICATION NO. 3" in txt and "3A" not in txt:
        SC3_rows.append(parse_SC3(txt, pg))

    if "SERVICE CLASSIFICATION NO. 3A" in txt:
        SC3A_rows.append(parse_SC3A(txt, pg))

df_SC1  = pd.DataFrame(SC1_rows)
df_SC1C = pd.DataFrame(SC1C_rows)
df_SC2N = pd.DataFrame(SC2_non_rows)
df_SC2D = pd.DataFrame(SC2_dem_rows)
df_SC3  = pd.DataFrame(SC3_rows)
df_SC3A = pd.DataFrame(SC3A_rows)


In [28]:
def parse_SC2D(text, page):
    rec = {
        "scheme": "SC2D",
        "page": page,
        "effective_date": extract_effective_date(text),

        # initialize all tiers as None
        "distribution_tier_1": None, "distribution_tier_2": None,
        "distribution_tier_3": None, "distribution_tier_4": None,

        "on_peak_tier_1": None, "on_peak_tier_2": None,
        "on_peak_tier_3": None, "on_peak_tier_4": None,

        "off_peak_tier_1": None, "off_peak_tier_2": None,
        "off_peak_tier_3": None, "off_peak_tier_4": None,

        "super_peak_tier_1": None, "super_peak_tier_2": None,
        "super_peak_tier_3": None, "super_peak_tier_4": None,
    }

    for ln in text.splitlines():
        ln = ln.strip()

        # ------------------------------
        # Distribution (per kW)
        # ------------------------------
        if ln.startswith("Distribution (per kW):"):
            vals = dollars(ln)
            for i, v in enumerate(vals[:4], start=1):
                rec[f"distribution_tier_{i}"] = v

        # ------------------------------
        # On Peak (per kWh)
        # ------------------------------
        if ln.startswith("On Peak") and "per kWh" in ln:
            vals = dollars(ln)
            for i, v in enumerate(vals[:4], start=1):
                rec[f"on_peak_tier_{i}"] = v

        # ------------------------------
        # Off Peak (per kWh)
        # ------------------------------
        if ln.startswith("Off Peak") and "per kWh" in ln:
            vals = dollars(ln)
            for i, v in enumerate(vals[:4], start=1):
                rec[f"off_peak_tier_{i}"] = v

        # ------------------------------
        # Super Peak (per kWh)
        # ------------------------------
        if ln.startswith("Super Peak") and "per kWh" in ln:
            vals = dollars(ln)
            for i, v in enumerate(vals[:4], start=1):
                rec[f"super_peak_tier_{i}"] = v

    return rec


In [29]:
SC2D_rows = []

for p in pages:
    txt = p["text"]
    pg = p["page_number"]

    if "SERVICE CLASSIFICATION NO. 2-D" in txt:
        SC2D_rows.append(parse_SC2D(txt, pg))

df_SC2D = pd.DataFrame(SC2D_rows)


In [30]:
def parse_SC2D_blocks(text, page):
    eff = extract_effective_date(text)

    blocks = {
        "distribution": [None, None, None, None],
        "on_peak": [None, None, None, None],
        "off_peak": [None, None, None, None],
        "super_peak": [None, None, None, None],
    }

    for ln in text.splitlines():
        ln = ln.strip()

        if ln.startswith("Distribution (per kW):"):
            vals = dollars(ln)
            for i, v in enumerate(vals[:4]):
                blocks["distribution"][i] = v

        if ln.startswith("On Peak") and "per kWh" in ln:
            vals = dollars(ln)
            for i, v in enumerate(vals[:4]):
                blocks["on_peak"][i] = v

        if ln.startswith("Off Peak") and "per kWh" in ln:
            vals = dollars(ln)
            for i, v in enumerate(vals[:4]):
                blocks["off_peak"][i] = v

        if ln.startswith("Super Peak") and "per kWh" in ln:
            vals = dollars(ln)
            for i, v in enumerate(vals[:4]):
                blocks["super_peak"][i] = v

    return {
        "scheme": "SC2D",
        "page": page,
        "effective_date": eff,
        **{
            f"dist_t{i+1}": blocks["distribution"][i] for i in range(4)
        },
        **{
            f"on_t{i+1}": blocks["on_peak"][i] for i in range(4)
        },
        **{
            f"off_t{i+1}": blocks["off_peak"][i] for i in range(4)
        },
        **{
            f"super_t{i+1}": blocks["super_peak"][i] for i in range(4)
        }
    }


In [31]:
sc2d_clean = []

for p in pages:
    if "SERVICE CLASSIFICATION NO. 2-D" in p["text"]:
        sc2d_clean.append(parse_SC2D_blocks(p["text"], p["page_number"]))

df_SC2D = pd.DataFrame(sc2d_clean)
df_SC2D


Unnamed: 0,scheme,page,effective_date,dist_t1,dist_t2,dist_t3,dist_t4,on_t1,on_t2,on_t3,on_t4,off_t1,off_t2,off_t3,off_t4,super_t1,super_t2,super_t3,super_t4
0,SC2D,339,10/01/2025,0.0,4.25,8.5,12.74,0.0742,0.05565,0.0371,0.01855,0.0371,0.02783,0.01855,0.00928,0.11131,0.08348,0.05565,0.02783


In [32]:
df_SC2D_dist = df_SC2D[[
    "scheme","page","effective_date",
    "dist_t1","dist_t2","dist_t3","dist_t4"
]]


In [33]:
df_SC2D_on = df_SC2D[[
    "scheme","page","effective_date",
    "on_t1","on_t2","on_t3","on_t4"
]]


In [34]:
df_SC2D_off = df_SC2D[[
    "scheme","page","effective_date",
    "off_t1","off_t2","off_t3","off_t4"
]]


In [35]:
df_SC2D_super = df_SC2D[[
    "scheme","page","effective_date",
    "super_t1","super_t2","super_t3","super_t4"
]]


In [36]:
def parse_SC1(text, page):
    rec = {
        "scheme": "SC1",
        "page": page,
        "effective_date": extract_effective_date(text),
        "basic_service_charge": None,
        "per_kWh": None,
        "monthly_minimum": None
    }

    for ln in text.splitlines():
        ln = ln.strip()

        # Basic Service Charge $19.00
        if re.search(r"basic\s+service\s+charge", ln, re.I):
            vals = dollars(ln)
            if vals:
                rec["basic_service_charge"] = vals[0]

        # Per kWh $0.08889
        if re.search(r"\bper\s*kwh\b", ln, re.I):
            vals = dollars(ln)
            if vals:
                rec["per_kWh"] = vals[0]

        # MONTHLY MINIMUM CHARGE: $19.00
        if "MONTHLY MINIMUM CHARGE" in ln.upper():
            vals = dollars(ln)
            if vals:
                rec["monthly_minimum"] = vals[0]

    return rec


In [37]:
for p in pages:
    if "SERVICE CLASSIFICATION NO. 1-C" in p["text"]:
        print("PAGE:", p["page_number"])
        print(p["text"])
        break


PAGE: 543
Status: EFFECTIVE
Received: 03/31/2009 Effective Date: 04/27/2009
PSC NO: 220 ELECTRICITY LEAF: 358
NIAGARA MOHAWK POWER CORPORATION REVISION: 0
INITIAL EFFECTIVE DATE: APRIL 27, 2009 SUPERSEDING REVISION:
SERVICE CLASSIFICATION NO. 1-C
RESIDENTIAL AND FARM SERVICE - OPTIONAL LARGE TIME OF USE RATE
APPLICABLE TO USE OF SERVICE FOR:
Single or three phase residential purposes at the option of customers who would otherwise be served under Service
Classification No. 1 of this Schedule. For use to 1) an individual residence, a flat or apartment in a multiple family
dwelling; 2) residential purposes in a multiple occupancy building where not more than two individual flats, apartments
or divided living spaces are available; 3) residential purposes in a roominghouse where not more than four rooms are
available for rent; 4) farm service when supplied through the farm residence meter; 5) single or three phase service to
any premise owned or leased by any not-for-profit corporation asso

In [38]:
for p in pages:
    if "NON-DEMAND SERVICE" in p["text"]:
        print("PAGE:", p["page_number"])
        print(p["text"])
        break


PAGE: 556
Status: EFFECTIVE
Received: 08/27/2025 Effective Date: 09/01/2025
PSC NO: 220 ELECTRICITY LEAF: 370
NIAGARA MOHAWK POWER CORPORATION REVISION: 26
INITIAL EFFECTIVE DATE: SEPTEMBER 1, 2025 SUPERSEDING REVISION: 25
STAMPS: Issued in Compliance with Order in Case 24-E-0322, dated August 14, 2025.
SERVICE CLASSIFICATION NO. 2
SMALL GENERAL SERVICE
APPLICABLE TO USE OF SERVICE FOR:
All purposes required by customer on the premises for which no other service classification is specifically provided and
where such entire requirements are delivered at one point and singly metered at the delivery voltage (except as
provided in Special Provision C). A customer once served under this service classification shall remain on this service
classification until the monthly measured demand exceeds 100 kW for twelve consecutive months following the initial
term of service, except as provided in Special Provision I, whereupon service may be taken under another appropriate
service classification. 

In [39]:
for p in pages:
    if "METERED DEMAND SERVICE" in p["text"]:
        print("PAGE:", p["page_number"])
        print(p["text"])
        break


PAGE: 557
Status: EFFECTIVE
Received: 09/10/2025 Effective Date: 10/01/2025
PSC NO: 220 ELECTRICITY LEAF: 371
NIAGARA MOHAWK POWER CORPORATION REVISION: 36
INITIAL EFFECTIVE DATE: OCTOBER 1, 2025 SUPERSEDING REVISION: 35
STAMPS: Issued in Compliance with Order in Case 24-E-0364, dated June 12, 2025.
SERVICE CLASSIFICATION NO. 2 (Continued)
STANDARD TARIFF CHARGES FOR METERED DEMAND SERVICE:
Distribution Delivery Rates and Charges for all Load Zones:
Basic Service Charge $65.00
Basic Service Charge
Special Provision P $90.10
Distribution Delivery Charges,
per kW: $16.99
Company supplied Electricity Supply Service Charges, per kWh:
Company supplied Electricity Supply Service charges shall be set according to the market price of electricity
determined in accordance with Rule 46.1, Electricity Supply Cost. Electricity Supply Cost Customers subject
to Special Provision P will be billed for Electricity Supply Service in accordance with Rule 46.1.3.
MONTHLY MINIMUM CHARGE: $81.99
MONTHLY MINI

In [40]:
for p in pages:
    if "SERVICE CLASSIFICATION NO. 3" in p["text"] and "3A" not in p["text"]:
        print("PAGE:", p["page_number"])
        print(p["text"])
        break


PAGE: 340
Status: EFFECTIVE
Received: 09/02/2025 Effective Date: 10/01/2025
PSC NO: 220 ELECTRICITY LEAF: 234.5
NIAGARA MOHAWK POWER CORPORATION REVISION: 1
INITIAL EFFECTIVE DATE: OCTOBER 1, 2025 SUPERSEDING REVISION: 0
STAMP: Issued in Compliance with Orders in Case 22-E-0236, dated October 17, 2024 and in Case 24-E-0322, dated
August 14, 2025.
GENERAL INFORMATION
48. COMMERCIAL ELECTRIC VEHICLE CHARGING PROGRAMS (CONTINUED)
48.2 Commercial Electric Vehicle Phase-In Rates (‚ÄúEV Phase-In Rates‚Äù) (continued)
48.2.11 Monthly Rates (continued):
SERVICE CLASSIFICATION NO. 3 (delivery voltage 0-2.2 kV)
Delivery Charges Tier 1 Tier 2 Tier 3 Tier 4
Distribution (per kW): $0.00 $3.57 $7.14 $10.71
On Peak (per kWh): $0.04805 $0.03604 $0.02403 $0.01201
Off Peak (per kWh): $0.02403 $0.01802 $0.01201 $0.00601
Super Peak (per kWh): $0.07208 $0.05406 $0.03604 $0.01802
SERVICE CLASSIFICATION NO. 3 (delivery voltage 2.2-15 kV)
Delivery Charges Tier 1 Tier 2 Tier 3 Tier 4
Distribution (per kW): $0.

In [41]:
for p in pages:
    if "SERVICE CLASSIFICATION NO. 3A" in p["text"]:
        print("PAGE:", p["page_number"])
        print(p["text"])
        break


PAGE: 341
Status: EFFECTIVE
Received: 09/02/2025 Effective Date: 10/01/2025
PSC NO: 220 ELECTRICITY LEAF: 234.6
NIAGARA MOHAWK POWER CORPORATION REVISION: 1
INITIAL EFFECTIVE DATE: OCTOBER 1, 2025 SUPERSEDING REVISION: 0
STAMP: Issued in Compliance with Orders in Case 22-E-0236, dated October 17, 2024 and in Case 24-E-0322, dated
August 14, 2025.
GENERAL INFORMATION
48. COMMERCIAL ELECTRIC VEHICLE CHARGING PROGRAMS (CONTINUED)
48.2 Commercial Electric Vehicle Phase-In Rates (‚ÄúEV Phase-In Rates‚Äù) (continued)
48.2.11 Monthly Rates (continued):
SERVICE CLASSIFICATION NO. 3A (delivery voltage 0-2.2 kV)
Delivery Charges Tier 1 Tier 2 Tier 3 Tier 4
Distribution (per kW): $0.00 $3.52 $7.04 $10.55
On Peak (per kWh): $0.04033 $0.03025 $0.02017 $0.01008
Off Peak (per kWh): $0.02017 $0.01512 $0.01008 $0.00504
Super Peak (per kWh): $0.06050 $0.04537 $0.03025 $0.01512
SERVICE CLASSIFICATION NO. 3A (delivery voltage 2.2-15 kV)
Delivery Charges Tier 1 Tier 2 Tier 3 Tier 4
Distribution (per kW): $

In [42]:
for p in pages:
    if "SERVICE CLASSIFICATION NO. 2-D" in p["text"]:
        print("PAGE:", p["page_number"])
        print(p["text"])
        break


PAGE: 339
Status: EFFECTIVE
Received: 09/02/2025 Effective Date: 10/01/2025
PSC NO: 220 ELECTRICITY LEAF: 234.4
NIAGARA MOHAWK POWER CORPORATION REVISION: 1
INITIAL EFFECTIVE DATE: OCTOBER 1, 2025 SUPERSEDING REVISION: 0
STAMP: Issued in Compliance with Orders in Case 22-E-0236, dated October 17, 2024 and in Case 24-E-0322, dated
August 14, 2025.
GENERAL INFORMATION
48. COMMERCIAL ELECTRIC VEHICLE CHARGING PROGRAMS (CONTINUED)
48.2 Commercial Electric Vehicle Phase-In Rates (‚ÄúEV Phase-In Rates‚Äù) (continued)
48.2.11 Monthly Rates:
Customers will be billed for delivery based on their applicable Load Factor Tier Level in the
billing period, based on the rates specified below. Distribution (per kW) charges are the
customer‚Äôs maximum demand at the premise, as calculated in accordance with the parent service
classification‚Äôs Determination of Demand. Distribution (per kWh) charges will be based on the
customer‚Äôs total kWh usage at the customer‚Äôs premise for the applicable TOU peri

In [43]:
import re, json, pandas as pd

def dollars(line):
    return [float(x) for x in re.findall(r"\$([0-9]+\.[0-9]+)", line)]

def extract_effective_date(text):
    m = re.search(r"Effective Date:\s*([A-Za-z0-9/ ,]+)", text, re.I)
    return m.group(1).strip() if m else None


In [44]:
def parse_SC1(text, page):
    rec = {
        "scheme": "SC1",
        "page": page,
        "effective_date": extract_effective_date(text),
        "basic_service_charge": None,
        "per_kWh": None,
        "monthly_minimum": None
    }

    for ln in text.splitlines():
        ln = ln.strip()

        if re.search(r"basic service charge", ln, re.I):
            v = dollars(ln)
            if v: rec["basic_service_charge"] = v[0]

        if re.search(r"\bper\s*kwh\b", ln, re.I):
            v = dollars(ln)
            if v: rec["per_kWh"] = v[0]

        if "MONTHLY MINIMUM CHARGE" in ln.upper():
            v = dollars(ln)
            if v: rec["monthly_minimum"] = v[0]

    return rec


In [45]:
def parse_SC1C(text, page):
    rec = {
        "scheme": "SC1C",
        "page": page,
        "effective_date": extract_effective_date(text),
        "basic_service_charge": None,
        "per_kWh": None
    }

    for ln in text.splitlines():
        ln = ln.strip()

        if re.search(r"basic service charge", ln, re.I):
            v = dollars(ln)
            if v: rec["basic_service_charge"] = v[0]

        if re.search(r"\bper\s*kwh\b", ln, re.I):
            v = dollars(ln)
            if v: rec["per_kWh"] = v[0]

    return rec


In [46]:
def parse_SC2_non_demand(text, page):
    rec = {
        "scheme": "SC2_NonDemand",
        "page": page,
        "effective_date": extract_effective_date(text),
        "monthly_minimum": None,
        "monthly_minimum_SP_O": None
    }

    for ln in text.splitlines():
        ln = ln.strip()

        if "MONTHLY MINIMUM CHARGE:" in ln.upper() and "O" not in ln:
            v = dollars(ln)
            if v: rec["monthly_minimum"] = v[0]

        if "Special Provision O" in ln:
            v = dollars(ln)
            if v: rec["monthly_minimum_SP_O"] = v[0]

    return rec


In [47]:
def parse_SC2_demand(text, page):
    rec = {
        "scheme": "SC2_Demand",
        "page": page,
        "effective_date": extract_effective_date(text),
        "basic_service_charge": None,
        "basic_SP_P": None,
        "distribution_per_kW": None,
        "monthly_minimum": None,
        "monthly_minimum_SP_P": None
    }

    for ln in text.splitlines():
        ln = ln.strip()

        if ln.startswith("Basic Service Charge $"):
            v = dollars(ln)
            if v: rec["basic_service_charge"] = v[0]

        if "Special Provision P" in ln and "Basic" in ln:
            v = dollars(ln)
            if v: rec["basic_SP_P"] = v[0]

        if "per kW" in ln and "Distribution" in ln:
            v = dollars(ln)
            if v: rec["distribution_per_kW"] = v[0]

        if "MONTHLY MINIMUM CHARGE:" in ln and "P" not in ln:
            v = dollars(ln)
            if v: rec["monthly_minimum"] = v[0]

        if "MONTHLY MINIMUM CHARGE:" in ln and "P" in ln:
            v = dollars(ln)
            if v: rec["monthly_minimum_SP_P"] = v[0]

    return rec


In [48]:
def parse_SC2D(text, page):
    rec = {
        "scheme": "SC2D",
        "page": page,
        "effective_date": extract_effective_date(text)
    }

    for block in ["dist","on","off","super"]:
        for t in range(1,5):
            rec[f"{block}_t{t}"] = None

    for ln in text.splitlines():
        ln = ln.strip()

        if ln.startswith("Distribution (per kW):"):
            vals = dollars(ln)
            for i,v in enumerate(vals[:4]):
                rec[f"dist_t{i+1}"]=v

        if ln.startswith("On Peak"):
            vals = dollars(ln)
            for i,v in enumerate(vals[:4]):
                rec[f"on_t{i+1}"]=v

        if ln.startswith("Off Peak"):
            vals = dollars(ln)
            for i,v in enumerate(vals[:4]):
                rec[f"off_t{i+1}"]=v

        if ln.startswith("Super Peak"):
            vals = dollars(ln)
            for i,v in enumerate(vals[:4]):
                rec[f"super_t{i+1}"]=v

    return rec


In [49]:
def parse_SC3(text, page):
    rec = {"scheme":"SC3","page":page,"effective_date":extract_effective_date(text)}
    return parse_SC2D(text, page) | rec


In [50]:
def parse_SC3A(text, page):
    rec = {"scheme":"SC3A","page":page,"effective_date":extract_effective_date(text)}
    return parse_SC2D(text, page) | rec


In [51]:
SC1=[]; SC1C=[]; SC2N=[]; SC2Dem=[]; SC2D=[]; SC3=[]; SC3A=[]

for p in pages:
    txt = p["text"]; pg = p["page_number"]

    if "SERVICE CLASSIFICATION NO. 1" in txt and "1-C" not in txt:
        SC1.append(parse_SC1(txt,pg))

    if "SERVICE CLASSIFICATION NO. 1-C" in txt:
        SC1C.append(parse_SC1C(txt,pg))

    if "NON-DEMAND SERVICE" in txt:
        SC2N.append(parse_SC2_non_demand(txt,pg))

    if "METERED DEMAND SERVICE" in txt:
        SC2Dem.append(parse_SC2_demand(txt,pg))

    if "SERVICE CLASSIFICATION NO. 2-D" in txt:
        SC2D.append(parse_SC2D(txt,pg))

    if "SERVICE CLASSIFICATION NO. 3A" in txt:
        SC3A.append(parse_SC3A(txt,pg))

    if "SERVICE CLASSIFICATION NO. 3" in txt and "3A" not in txt:
        SC3.append(parse_SC3(txt,pg))


In [52]:
df_SC1 = pd.DataFrame(SC1)
df_SC1C = pd.DataFrame(SC1C)
df_SC2N = pd.DataFrame(SC2N)
df_SC2Dem = pd.DataFrame(SC2Dem)
df_SC2D = pd.DataFrame(SC2D)
df_SC3 = pd.DataFrame(SC3)
df_SC3A = pd.DataFrame(SC3A)


In [57]:
%pip install openpyxl


Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl

   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   ---------------------------------------- 2/2 [openpyxl]

Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [59]:
import json, re
import pandas as pd

# -------------------------------------------------------------------
# Load JSON
# -------------------------------------------------------------------
with open("D:\\utility-billing-ai\\data\\processed\\raw_extracted_tarif.json","r",encoding="utf-8") as f:
    raw=json.load(f)

pages=raw["pages"]

# -------------------------------------------------------------------
# Helper functions
# -------------------------------------------------------------------
def dollars(line):
    return [float(x) for x in re.findall(r"\$([0-9]+\.[0-9]+)", line)]

def extract_effective_date(text):
    m = re.search(r"Effective Date:\s*([A-Za-z0-9/ ,]+)", text, re.I)
    return m.group(1).strip() if m else None

# -------------------------------------------------------------------
# Parsers for each Service Classification
# -------------------------------------------------------------------

# SC1
def parse_SC1(text, page):
    rec={"scheme":"SC1","page":page,"effective_date":extract_effective_date(text),
         "basic_service_charge":None,"per_kWh":None,"monthly_minimum":None}
    for ln in text.splitlines():
        ln=ln.strip()
        if re.search(r"basic service charge", ln, re.I):
            v=dollars(ln)
            if v: rec["basic_service_charge"]=v[0]
        if re.search(r"\bper\s*kwh\b", ln, re.I):
            v=dollars(ln)
            if v: rec["per_kWh"]=v[0]
        if "MONTHLY MINIMUM CHARGE" in ln.upper():
            v=dollars(ln)
            if v: rec["monthly_minimum"]=v[0]
    return rec

# SC1-C
def parse_SC1C(text, page):
    rec={"scheme":"SC1C","page":page,"effective_date":extract_effective_date(text),
         "basic_service_charge":None,"per_kWh":None}
    for ln in text.splitlines():
        ln=ln.strip()
        if re.search(r"basic service charge", ln, re.I):
            v=dollars(ln)
            if v: rec["basic_service_charge"]=v[0]
        if re.search(r"\bper\s*kwh\b", ln, re.I):
            v=dollars(ln)
            if v: rec["per_kWh"]=v[0]
    return rec

# SC2 Non-Demand
def parse_SC2_non_demand(text,page):
    rec={"scheme":"SC2_NonDemand","page":page,"effective_date":extract_effective_date(text),
         "monthly_minimum":None,"monthly_minimum_SP_O":None}
    for ln in text.splitlines():
        ln=ln.strip()
        if "MONTHLY MINIMUM CHARGE:" in ln.upper() and "O" not in ln:
            v=dollars(ln)
            if v: rec["monthly_minimum"]=v[0]
        if "Special Provision O" in ln:
            v=dollars(ln)
            if v: rec["monthly_minimum_SP_O"]=v[0]
    return rec

# SC2 Demand
def parse_SC2_demand(text,page):
    rec={"scheme":"SC2_Demand","page":page,"effective_date":extract_effective_date(text),
         "basic_service_charge":None,"basic_SP_P":None,"distribution_per_kW":None,
         "monthly_minimum":None,"monthly_minimum_SP_P":None}
    for ln in text.splitlines():
        ln=ln.strip()
        if ln.startswith("Basic Service Charge"):
            v=dollars(ln)
            if v: rec["basic_service_charge"]=v[0]
        if "Special Provision P" in ln and "Basic" in ln:
            v=dollars(ln)
            if v: rec["basic_SP_P"]=v[0]
        if "per kW" in ln and "Distribution" in ln:
            v=dollars(ln)
            if v: rec["distribution_per_kW"]=v[0]
        if "MONTHLY MINIMUM CHARGE:" in ln and "P" not in ln:
            v=dollars(ln)
            if v: rec["monthly_minimum"]=v[0]
        if "MONTHLY MINIMUM CHARGE:" in ln and "P" in ln:
            v=dollars(ln)
            if v: rec["monthly_minimum_SP_P"]=v[0]
    return rec

# SC2D (EV Tiers)
def parse_SC2D(text,page):
    rec={"scheme":"SC2D","page":page,"effective_date":extract_effective_date(text)}
    for prefix in ["dist","on","off","super"]:
        for t in range(1,5):
            rec[f"{prefix}_t{t}"]=None
    for ln in text.splitlines():
        ln=ln.strip()
        if ln.startswith("Distribution (per kW):"):
            vals=dollars(ln)
            for i,v in enumerate(vals[:4]):
                rec[f"dist_t{i+1}"]=v
        if ln.startswith("On Peak") and "per kWh" in ln:
            vals=dollars(ln)
            for i,v in enumerate(vals[:4]):
                rec[f"on_t{i+1}"]=v
        if ln.startswith("Off Peak") and "per kWh" in ln:
            vals=dollars(ln)
            for i,v in enumerate(vals[:4]):
                rec[f"off_t{i+1}"]=v
        if ln.startswith("Super Peak") and "per kWh" in ln:
            vals=dollars(ln)
            for i,v in enumerate(vals[:4]):
                rec[f"super_t{i+1}"]=v
    return rec

# SC3 EV Tiers
def parse_SC3(text,page):
    base=parse_SC2D(text,page)
    base["scheme"]="SC3"
    return base

# SC3-A EV Tiers
def parse_SC3A(text,page):
    base=parse_SC2D(text,page)
    base["scheme"]="SC3A"
    return base

# -------------------------------------------------------------------
# MAIN EXTRACTION LOOP
# -------------------------------------------------------------------
SC1=[]; SC1C=[]; SC2N=[]; SC2Dem=[]; SC2D_rows=[]; SC3_rows=[]; SC3A_rows=[]

for p in pages:
    txt=p["text"]; pg=p["page_number"]

    if "SERVICE CLASSIFICATION NO. 1" in txt and "1-C" not in txt:
        SC1.append(parse_SC1(txt,pg))

    if "SERVICE CLASSIFICATION NO. 1-C" in txt:
        SC1C.append(parse_SC1C(txt,pg))

    if "NON-DEMAND SERVICE" in txt:
        SC2N.append(parse_SC2_non_demand(txt,pg))

    if "METERED DEMAND SERVICE" in txt:
        SC2Dem.append(parse_SC2_demand(txt,pg))

    if "SERVICE CLASSIFICATION NO. 2-D" in txt:
        SC2D_rows.append(parse_SC2D(txt,pg))

    if "SERVICE CLASSIFICATION NO. 3A" in txt:
        SC3A_rows.append(parse_SC3A(txt,pg))

    if "SERVICE CLASSIFICATION NO. 3" in txt and "3A" not in txt:
        SC3_rows.append(parse_SC3(txt,pg))

# -------------------------------------------------------------------
# Create DataFrames
# -------------------------------------------------------------------
df_SC1=pd.DataFrame(SC1)
df_SC1C=pd.DataFrame(SC1C)
df_SC2N=pd.DataFrame(SC2N)
df_SC2Dem=pd.DataFrame(SC2Dem)
df_SC2D=pd.DataFrame(SC2D_rows)
df_SC3=pd.DataFrame(SC3_rows)
df_SC3A=pd.DataFrame(SC3A_rows)

# -------------------------------------------------------------------
# Export to Excel
# -------------------------------------------------------------------
path="D:\\utility-billing-ai\\src\\agents\\tariff_analysis\\All_Service_Classifications.xlsx"
with pd.ExcelWriter(path, engine='openpyxl') as writer:
    df_SC1.to_excel(writer, sheet_name="SC1", index=False)
    df_SC1C.to_excel(writer, sheet_name="SC1C", index=False)
    df_SC2N.to_excel(writer, sheet_name="SC2_NonDemand", index=False)
    df_SC2Dem.to_excel(writer, sheet_name="SC2_Demand", index=False)
    df_SC2D.to_excel(writer, sheet_name="SC2D", index=False)
    df_SC3.to_excel(writer, sheet_name="SC3", index=False)
    df_SC3A.to_excel(writer, sheet_name="SC3A", index=False)

path


'D:\\utility-billing-ai\\src\\agents\\tariff_analysis\\All_Service_Classifications.xlsx'

In [61]:
import json, re
import pandas as pd
from pathlib import Path

# ============================================================
# Load JSON
# ============================================================

json_path = Path("D:\\utility-billing-ai\\data\\processed\\raw_extracted_tarif.json")

with open(json_path, "r", encoding="utf-8") as f:
    raw = json.load(f)

pages = raw["pages"]

# ============================================================
# Helper functions
# ============================================================

def dollars(line):
    return [float(x) for x in re.findall(r"\$([0-9]+\.[0-9]+)", line)]

def extract_effective_date(text):
    m = re.search(r"Effective Date:\s*([A-Za-z0-9/ ,]+)", text, re.I)
    return m.group(1).strip() if m else None

def has_any_values(record):
    """Return True if the record has at least one numeric value."""
    for k, v in record.items():
        if isinstance(v, (int, float)) and pd.notna(v):
            return True
    return False

# ============================================================
# SC1 Parser
# ============================================================

def parse_SC1(text, page):
    rec={
        "scheme":"SC1",
        "page":page,
        "effective_date":extract_effective_date(text),
        "basic_service_charge":None,
        "per_kWh":None,
        "monthly_minimum":None
    }

    for ln in text.splitlines():
        ln = ln.strip()

        if re.search(r"basic service charge", ln, re.I):
            v = dollars(ln)
            if v: rec["basic_service_charge"] = v[0]

        if re.search(r"\bper\s*kwh\b", ln, re.I):
            v = dollars(ln)
            if v: rec["per_kWh"] = v[0]

        if "MONTHLY MINIMUM CHARGE" in ln.upper():
            v = dollars(ln)
            if v: rec["monthly_minimum"] = v[0]

    return rec if has_any_values(rec) else None

# ============================================================
# SC1-C Parser
# ============================================================

def parse_SC1C(text, page):
    rec={
        "scheme":"SC1C",
        "page":page,
        "effective_date":extract_effective_date(text),
        "basic_service_charge":None,
        "per_kWh":None
    }

    for ln in text.splitlines():
        ln = ln.strip()

        if re.search(r"basic service charge", ln, re.I):
            v = dollars(ln)
            if v: rec["basic_service_charge"] = v[0]

        if re.search(r"\bper\s*kwh\b", ln, re.I):
            v = dollars(ln)
            if v: rec["per_kWh"] = v[0]

    return rec if has_any_values(rec) else None

# ============================================================
# SC2 Non-Demand Parser
# ============================================================

def parse_SC2_non_demand(text, page):
    rec={
        "scheme":"SC2_NonDemand",
        "page":page,
        "effective_date":extract_effective_date(text),
        "monthly_minimum":None,
        "monthly_minimum_SP_O":None
    }

    for ln in text.splitlines():
        ln = ln.strip()

        if "MONTHLY MINIMUM CHARGE:" in ln.upper() and "O" not in ln:
            v = dollars(ln)
            if v: rec["monthly_minimum"] = v[0]

        if "Special Provision O" in ln:
            v = dollars(ln)
            if v: rec["monthly_minimum_SP_O"] = v[0]

    return rec if has_any_values(rec) else None

# ============================================================
# SC2 Demand Parser
# ============================================================

def parse_SC2_demand(text, page):
    rec={
        "scheme":"SC2_Demand",
        "page":page,
        "effective_date":extract_effective_date(text),
        "basic_service_charge":None,
        "basic_SP_P":None,
        "distribution_per_kW":None,
        "monthly_minimum":None,
        "monthly_minimum_SP_P":None
    }

    for ln in text.splitlines():
        ln = ln.strip()

        if ln.startswith("Basic Service Charge"):
            v = dollars(ln)
            if v: rec["basic_service_charge"] = v[0]

        if "Special Provision P" in ln and "Basic" in ln:
            v = dollars(ln)
            if v: rec["basic_SP_P"] = v[0]

        if "Distribution" in ln and "per kW" in ln:
            v = dollars(ln)
            if v: rec["distribution_per_kW"] = v[0]

        if "MONTHLY MINIMUM CHARGE:" in ln and "P" not in ln:
            v = dollars(ln)
            if v: rec["monthly_minimum"] = v[0]

        if "MONTHLY MINIMUM CHARGE:" in ln and "P" in ln:
            v = dollars(ln)
            if v: rec["monthly_minimum_SP_P"] = v[0]

    return rec if has_any_values(rec) else None

# ============================================================
# SC2-D / SC3 / SC3A Multi-tier EV Parser
# ============================================================

def parse_SC3_SC3A(text, page, scheme_name):
    effective = extract_effective_date(text)
    rows = []

    temp = None

    for ln in text.splitlines():
        ln = ln.strip()

        # Match voltage range
        m = re.search(r"delivery voltage\s*([0-9\.\- kV‚Äì]+)", ln, re.I)
        if m:
            # Save old row
            if temp and has_any_values(temp):
                rows.append(temp)

            # Start new block
            temp = {
                "scheme": scheme_name,
                "page": page,
                "effective_date": effective,
                "voltage_range": m.group(1).strip(),
                "dist_t1": None, "dist_t2": None, "dist_t3": None, "dist_t4": None,
                "on_t1": None, "on_t2": None, "on_t3": None, "on_t4": None,
                "off_t1": None, "off_t2": None, "off_t3": None, "off_t4": None,
                "super_t1": None, "super_t2": None, "super_t3": None, "super_t4": None
            }
            continue

        if temp is None:
            continue

        # Extract tiered rates
        if ln.startswith("Distribution (per kW):"):
            vals = dollars(ln)
            for i, v in enumerate(vals[:4]):
                temp[f"dist_t{i+1}"] = v

        if ln.startswith("On Peak"):
            vals = dollars(ln)
            for i, v in enumerate(vals[:4]):
                temp[f"on_t{i+1}"] = v

        if ln.startswith("Off Peak"):
            vals = dollars(ln)
            for i, v in enumerate(vals[:4]):
                temp[f"off_t{i+1}"] = v

        if ln.startswith("Super Peak"):
            vals = dollars(ln)
            for i, v in enumerate(vals[:4]):
                temp[f"super_t{i+1}"] = v

    # Append last row
    if temp and has_any_values(temp):
        rows.append(temp)

    return rows

# ============================================================
# MAIN LOOP
# ============================================================

SC1=[]; SC1C=[]; SC2N=[]; SC2Dem=[]; SC2D_rows=[]; SC3_rows=[]; SC3A_rows=[]

for p in pages:
    txt = p["text"]
    pg = p["page_number"]

    # SC1
    if "SERVICE CLASSIFICATION NO. 1" in txt and "1-C" not in txt:
        row = parse_SC1(txt, pg)
        if row: SC1.append(row)

    # SC1-C
    if "SERVICE CLASSIFICATION NO. 1-C" in txt:
        row = parse_SC1C(txt, pg)
        if row: SC1C.append(row)

    # SC2 Non-Demand
    if "NON-DEMAND SERVICE" in txt:
        row = parse_SC2_non_demand(txt, pg)
        if row: SC2N.append(row)

    # SC2 Demand
    if "METERED DEMAND SERVICE" in txt:
        row = parse_SC2_demand(txt, pg)
        if row: SC2Dem.append(row)

    # SC2-D
    if "SERVICE CLASSIFICATION NO. 2-D" in txt:
        rows = parse_SC3_SC3A(txt, pg, "SC2D")
        SC2D_rows.extend(rows)

    # SC3
    if "SERVICE CLASSIFICATION NO. 3" in txt and "3A" not in txt:
        rows = parse_SC3_SC3A(txt, pg, "SC3")
        SC3_rows.extend(rows)

    # SC3A
    if "SERVICE CLASSIFICATION NO. 3A" in txt:
        rows = parse_SC3_SC3A(txt, pg, "SC3A")
        SC3A_rows.extend(rows)

# ============================================================
# Create DataFrames
# ============================================================

df_SC1   = pd.DataFrame(SC1)
df_SC1C  = pd.DataFrame(SC1C)
df_SC2N  = pd.DataFrame(SC2N)
df_SC2Dem= pd.DataFrame(SC2Dem)
df_SC2D  = pd.DataFrame(SC2D_rows)
df_SC3   = pd.DataFrame(SC3_rows)
df_SC3A  = pd.DataFrame(SC3A_rows)

# ============================================================
# Export to Excel (xlsxwriter)
# ============================================================

output_path = "D:\\utility-billing-ai\\src\\agents\\tariff_analysis\\All_Service_Classifications.xlsx"

with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    df_SC1.to_excel(writer, "SC1", index=False)
    df_SC1C.to_excel(writer, "SC1C", index=False)
    df_SC2N.to_excel(writer, "SC2_NonDemand", index=False)
    df_SC2Dem.to_excel(writer, "SC2_Demand", index=False)
    df_SC2D.to_excel(writer, "SC2D", index=False)
    df_SC3.to_excel(writer, "SC3", index=False)
    df_SC3A.to_excel(writer, "SC3A", index=False)

print("Saved:", output_path)


Saved: D:\utility-billing-ai\src\agents\tariff_analysis\All_Service_Classifications.xlsx


  df_SC1.to_excel(writer, "SC1", index=False)
  df_SC1C.to_excel(writer, "SC1C", index=False)
  df_SC2N.to_excel(writer, "SC2_NonDemand", index=False)
  df_SC2Dem.to_excel(writer, "SC2_Demand", index=False)
  df_SC2D.to_excel(writer, "SC2D", index=False)
  df_SC3.to_excel(writer, "SC3", index=False)
  df_SC3A.to_excel(writer, "SC3A", index=False)


In [63]:
import json, re
import pandas as pd

# ============================================================
# Load JSON
# ============================================================

with open(r"D:\utility-billing-ai\data\processed\raw_extracted_tarif.json", "r", encoding="utf-8") as f:
    raw = json.load(f)

pages = raw["pages"]

# ============================================================
# Helper functions
# ============================================================

COLS = [
    "service_class", "page", "effective_date",
    "section_head", "rate_type",
    "voltage_level",
    "rate",
    "tier1","tier2","tier3","tier4",
    "unit",
    "raw_text"
]

def make_row(**kwargs):
    row = {c: None for c in COLS}
    row.update(kwargs)
    return row

def dollars(s):
    return [float(x.replace(",", "")) for x in re.findall(r"\$([0-9,]+\.[0-9]+)", s)]

def extract_eff(text):
    m = re.search(r"Effective Date:\s*([A-Za-z0-9/ ,]+)", text, re.I)
    return m.group(1).strip() if m else None

def section_tag(lines):
    sec = None
    for ln in lines:
        u = ln.upper()
        if "MONTHLY RATE" in u:
            sec = "MONTHLY RATE"
        if "STANDARD TARIFF" in u:
            sec = "STANDARD TARIFF CHARGES"
        if "CHARGES:" in u:
            sec = "CHARGES"
    return sec


# ============================================================
# PARSER: SC1
# ============================================================

def parse_SC1(text, page):
    eff = extract_eff(text)
    lines = text.splitlines()
    sec = section_tag(lines)
    rows = []

    for ln in lines:
        s = ln.strip()
        if not s:
            continue

        if "basic service charge" in s.lower():
            vals = dollars(s)
            if vals:
                v = vals[0]
                rows.append(make_row(
                    service_class="SC1", page=page, effective_date=eff,
                    section_head=sec, rate_type="Basic Service Charge",
                    rate=v, tier1=v, unit="per month", raw_text=s
                ))

        if re.search(r"per\s*kwh", s, re.I):
            vals = dollars(s)
            if vals:
                v = vals[0]
                rows.append(make_row(
                    service_class="SC1", page=page, effective_date=eff,
                    section_head=sec, rate_type="Per kWh",
                    rate=v, tier1=v, unit="per kWh", raw_text=s
                ))

        if "MONTHLY MINIMUM CHARGE" in s.upper():
            vals = dollars(s)
            if vals:
                v = vals[0]
                rows.append(make_row(
                    service_class="SC1", page=page, effective_date=eff,
                    section_head="Monthly Minimum",
                    rate_type="Monthly Minimum", rate=v,
                    tier1=v, unit="per month", raw_text=s
                ))

    return rows


# ============================================================
# PARSER: SC1C
# ============================================================

def parse_SC1C(text, page):
    eff = extract_eff(text)
    lines = text.splitlines()
    sec = section_tag(lines)
    rows = []

    for ln in lines:
        s = ln.strip()
        if not s:
            continue

        if "basic service charge" in s.lower():
            vals = dollars(s)
            if vals:
                v = vals[0]
                rows.append(make_row(
                    service_class="SC1C", page=page, effective_date=eff,
                    section_head=sec, rate_type="Basic Service Charge",
                    rate=v, tier1=v, unit="per month", raw_text=s
                ))

        if "per kwh" in s.lower():
            vals = dollars(s)
            if vals:
                v = vals[0]
                rows.append(make_row(
                    service_class="SC1C", page=page, effective_date=eff,
                    section_head=sec, rate_type="Per kWh",
                    rate=v, tier1=v, unit="per kWh", raw_text=s
                ))

    return rows


# ============================================================
# PARSER: SC2 ‚Äî Non Demand
# ============================================================

def parse_SC2_non(text, page):
    eff = extract_eff(text)
    lines = text.splitlines()
    sec = section_tag(lines)
    rows = []

    for ln in lines:
        s = ln.strip()
        if "MONTHLY MINIMUM CHARGE:" in s.upper() and "O" not in s.upper():
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC2", page=page, effective_date=eff,
                    section_head=sec or "SC2 Non-Demand",
                    rate_type="Monthly Minimum",
                    rate=vals[0], tier1=vals[0], unit="per month",
                    raw_text=s
                ))

        if "SPECIAL PROVISION O" in s.upper():
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC2", page=page, effective_date=eff,
                    section_head=sec or "SC2 Non-Demand",
                    rate_type="Monthly Minimum ‚Äì Special Provision O",
                    rate=vals[0], tier1=vals[0], unit="per month",
                    raw_text=s
                ))

    return rows


# ============================================================
# PARSER: SC2 ‚Äî Demand
# ============================================================

def parse_SC2_dem(text, page):
    eff = extract_eff(text)
    lines = text.splitlines()
    sec = section_tag(lines)
    rows = []

    for ln in lines:
        s = ln.strip()

        if s.startswith("Basic Service Charge"):
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC2", page=page, effective_date=eff,
                    section_head=sec, rate_type="Basic Service Charge",
                    rate=vals[0], tier1=vals[0], unit="per month", raw_text=s
                ))

        if "Special Provision P" in s and "Basic" in s:
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC2", page=page, effective_date=eff,
                    section_head=sec, rate_type="Basic Service Charge ‚Äì SP P",
                    rate=vals[0], tier1=vals[0], unit="per month", raw_text=s
                ))

        if "per kW" in s and "Distribution" in s:
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC2", page=page, effective_date=eff,
                    section_head=sec, rate_type="Distribution per kW",
                    rate=vals[0], tier1=vals[0], unit="per kW", raw_text=s
                ))

        if "MONTHLY MINIMUM CHARGE:" in s and "P" not in s:
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC2", page=page, effective_date=eff,
                    section_head=sec, rate_type="Monthly Minimum",
                    rate=vals[0], tier1=vals[0], unit="per month", raw_text=s
                ))

        if "MONTHLY MINIMUM CHARGE:" in s and "P" in s:
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC2", page=page, effective_date=eff,
                    section_head=sec, rate_type="Monthly Minimum ‚Äì SP P",
                    rate=vals[0], tier1=vals[0], unit="per month", raw_text=s
                ))

    return rows


# ============================================================
# PARSER: SC3 / SC3A ‚Äî FULL NON-EV MONTHLY RATE TABLES
# ============================================================

def parse_SC3_full(text, page, service_class):
    """
    Extracts SC3 or SC3A monthly tables (non-EV):
    ‚Ä¢ Distribution Delivery
    ‚Ä¢ Distribution Delivery ‚Äì Special Provisions
    ‚Ä¢ Minimum Demand Charges
    ‚Ä¢ Additional Demand Charges
    ‚Ä¢ On-Peak / Off-Peak / Super-Peak (SC3A only)
    """
    eff = extract_eff(text)
    lines = text.splitlines()
    sec = section_tag(lines)
    rows = []

    current_voltage = None

    for ln in lines:
        s = ln.strip()
        if not s:
            continue

        # Voltage Line: starts with e.g. "0-2.2 kV" or "2.2-15 kV"
        if re.match(r"^[0-9\.\-]+\s*kV", s):
            current_voltage = s
            continue

        # ‚ñº Distribution Delivery
        if s.lower().startswith("distribution delivery") and "special" not in s.lower():
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class=service_class, page=page, effective_date=eff,
                    section_head=sec or "Monthly Rate",
                    rate_type="Distribution Delivery",
                    voltage_level=current_voltage,
                    tier1=vals[0] if len(vals)>0 else None,
                    tier2=vals[1] if len(vals)>1 else None,
                    tier3=vals[2] if len(vals)>2 else None,
                    tier4=vals[3] if len(vals)>3 else None,
                    unit="per month",
                    raw_text=s
                ))
            continue

        # ‚ñº Distribution Delivery ‚Äì Special Provision (L / N / etc)
        if "special provision" in s.lower():
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class=service_class, page=page, effective_date=eff,
                    section_head=sec or "Monthly Rate",
                    rate_type="Distribution Delivery ‚Äì Special Provision",
                    voltage_level=current_voltage,
                    tier1=vals[0] if len(vals)>0 else None,
                    tier2=vals[1] if len(vals)>1 else None,
                    tier3=vals[2] if len(vals)>2 else None,
                    tier4=vals[3] if len(vals)>3 else None,
                    unit="per month",
                    raw_text=s
                ))
            continue

        # ‚ñº Minimum Demand Charges
        if s.lower().startswith("minimum demand charges"):
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class=service_class, page=page, effective_date=eff,
                    section_head="Minimum Demand Charges",
                    rate_type="Minimum Demand",
                    voltage_level=current_voltage,
                    tier1=vals[0] if len(vals)>0 else None,
                    tier2=vals[1] if len(vals)>1 else None,
                    tier3=vals[2] if len(vals)>2 else None,
                    tier4=vals[3] if len(vals)>3 else None,
                    unit="per month",
                    raw_text=s
                ))
            continue

        # ‚ñº Additional Demand Charges
        if "additional demand charges" in s.lower():
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class=service_class, page=page, effective_date=eff,
                    section_head="Additional Demand Charges",
                    rate_type="Additional Demand (per kW)",
                    voltage_level=current_voltage,
                    tier1=vals[0] if len(vals)>0 else None,
                    tier2=vals[1] if len(vals)>1 else None,
                    tier3=vals[2] if len(vals)>2 else None,
                    tier4=vals[3] if len(vals)>3 else None,
                    unit="per kW",
                    raw_text=s
                ))
            continue

        # ‚ñº SC3A ONLY ‚Äî On Peak / Off Peak / Super Peak
        if service_class == "SC3A":
            if s.lower().startswith("on peak") or s.lower().startswith("off peak") or s.lower().startswith("super peak"):
                vals = dollars(s)
                rt = s.split(":")[0]
                if vals:
                    rows.append(make_row(
                        service_class=service_class, page=page, effective_date=eff,
                        section_head=sec or "Rate Periods",
                        rate_type=rt,
                        voltage_level=current_voltage,
                        tier1=vals[0] if len(vals)>0 else None,
                        tier2=vals[1] if len(vals)>1 else None,
                        tier3=vals[2] if len(vals)>2 else None,
                        tier4=vals[3] if len(vals)>3 else None,
                        unit="per kW",
                        raw_text=s
                    ))
                continue

    return rows


# ============================================================
# Main extraction
# ============================================================

all_SC1 = []
all_SC1C = []
all_SC2N = []
all_SC2D = []
all_SC3 = []
all_SC3A = []

for p in pages:
    txt = p["text"]
    pg = p["page_number"]

    # SC1
    if "SERVICE CLASSIFICATION NO. 1" in txt and "1-C" not in txt:
        all_SC1.extend(parse_SC1(txt, pg))

    # SC1C
    if "SERVICE CLASSIFICATION NO. 1-C" in txt:
        all_SC1C.extend(parse_SC1C(txt, pg))

    # SC2 Non-Demand
    if "NON-DEMAND SERVICE" in txt:
        all_SC2N.extend(parse_SC2_non(txt, pg))

    # SC2 Demand
    if "METERED DEMAND SERVICE" in txt:
        all_SC2D.extend(parse_SC2_dem(txt, pg))

    # SC3 (NON-EV version)
    if "SERVICE CLASSIFICATION NO. 3" in txt and "3A" not in txt:
        all_SC3.extend(parse_SC3_full(txt, pg, "SC3"))

    # SC3A (NON-EV version)
    if "SERVICE CLASSIFICATION NO. 3A" in txt:
        all_SC3A.extend(parse_SC3_full(txt, pg, "SC3A"))


# ============================================================
# Create DataFrames
# ============================================================

df_SC1 = pd.DataFrame(all_SC1, columns=COLS)
df_SC1C = pd.DataFrame(all_SC1C, columns=COLS)
df_SC2N = pd.DataFrame(all_SC2N, columns=COLS)
df_SC2D = pd.DataFrame(all_SC2D, columns=COLS)
df_SC3 = pd.DataFrame(all_SC3, columns=COLS)
df_SC3A = pd.DataFrame(all_SC3A, columns=COLS)


# Clean empty rows
def clean_df(df):
    num = ["rate","tier1","tier2","tier3","tier4"]
    if df.empty:
        return df
    return df.dropna(subset=num, how="all").reset_index(drop=True)

df_SC1 = clean_df(df_SC1)
df_SC1C = clean_df(df_SC1C)
df_SC2N = clean_df(df_SC2N)
df_SC2D = clean_df(df_SC2D)
df_SC3 = clean_df(df_SC3)
df_SC3A = clean_df(df_SC3A)


# ============================================================
# Export
# ============================================================

out = r"D:\utility-billing-ai\src\agents\tariff_analysis\All_Service_Classifications.xlsx"

with pd.ExcelWriter(out, engine="openpyxl") as writer:
    df_SC1.to_excel(writer, sheet_name="SC1", index=False)
    df_SC1C.to_excel(writer, sheet_name="SC1C", index=False)
    df_SC2N.to_excel(writer, sheet_name="SC2_NonDemand", index=False)
    df_SC2D.to_excel(writer, sheet_name="SC2_Demand", index=False)
    df_SC3.to_excel(writer, sheet_name="SC3", index=False)
    df_SC3A.to_excel(writer, sheet_name="SC3A", index=False)

print("Saved:", out)


Saved: D:\utility-billing-ai\src\agents\tariff_analysis\All_Service_Classifications.xlsx


In [64]:
import json, re
import pandas as pd

# ============================================================
# Load JSON
# ============================================================

with open(r"D:\utility-billing-ai\data\processed\raw_extracted_tarif.json",
          "r", encoding="utf-8") as f:
    raw = json.load(f)

pages = raw["pages"]

# ============================================================
# Helper Functions
# ============================================================

COLS = [
    "service_class", "page", "effective_date",
    "section_head", "rate_type",
    "unit",
    "up_to_2_2_kV",
    "kV_2_2_to_15",
    "kV_22_to_50",
    "over_60_kV",
    "raw_text"
]

def make_row(**kwargs):
    row = {c: None for c in COLS}
    row.update(kwargs)
    return row

def dollars(s):
    """Extracts dollar amounts, handling commas."""
    return [float(x.replace(",", "")) for x in re.findall(r"\$([0-9,]+\.[0-9]+)", s)]

def extract_eff(text):
    m = re.search(r"Effective Date:\s*([A-Za-z0-9/ ,]+)", text, re.I)
    return m.group(1).strip() if m else None

def section_tag(lines):
    """Detect high-level section labels."""
    sec = None
    for ln in lines:
        u = ln.upper()
        if "STANDARD TARIFF" in u:
            sec = "STANDARD TARIFF"
        if "MONTHLY RATE" in u:
            sec = "MONTHLY RATE"
        if "CHARGES" in u:
            sec = "CHARGES"
    return sec

# ============================================================
# SC1
# ============================================================

def parse_SC1(text, page):
    eff = extract_eff(text)
    lines = text.splitlines()
    sec = section_tag(lines)
    rows = []

    for s in lines:
        s = s.strip()
        if not s:
            continue

        # Basic Service Charge
        if "basic service charge" in s.lower():
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC1", page=page, effective_date=eff,
                    section_head=sec, rate_type="Basic Service Charge",
                    unit="per month",
                    up_to_2_2_kV=vals[0], raw_text=s
                ))

        # Per kWh
        if "per kwh" in s.lower():
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC1", page=page, effective_date=eff,
                    section_head=sec, rate_type="Energy per kWh",
                    unit="per kWh",
                    up_to_2_2_kV=vals[0], raw_text=s
                ))

        # Monthly Minimum
        if "MONTHLY MINIMUM CHARGE" in s.upper():
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC1", page=page, effective_date=eff,
                    section_head="Monthly Minimum", rate_type="Minimum",
                    unit="per month",
                    up_to_2_2_kV=vals[0], raw_text=s
                ))
    return rows

# ============================================================
# SC1-C
# ============================================================

def parse_SC1C(text, page):
    eff = extract_eff(text)
    lines = text.splitlines()
    sec = section_tag(lines)
    rows = []

    for s in lines:
        s = s.strip()
        if not s:
            continue

        if "basic service charge" in s.lower():
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC1C", page=page, effective_date=eff,
                    section_head=sec, rate_type="Basic Service Charge",
                    unit="per month",
                    up_to_2_2_kV=vals[0], raw_text=s
                ))

        if "per kwh" in s.lower():
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC1C", page=page, effective_date=eff,
                    section_head=sec, rate_type="Energy per kWh",
                    unit="per kWh",
                    up_to_2_2_kV=vals[0], raw_text=s
                ))

    return rows

# ============================================================
# SC2 Non-Demand
# ============================================================

def parse_SC2_non(text, page):
    eff = extract_eff(text)
    lines = text.splitlines()
    sec = section_tag(lines)
    rows = []

    for s in lines:
        s = s.strip()
        if not s:
            continue

        if "MONTHLY MINIMUM CHARGE:" in s.upper() and "O" not in s.upper():
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC2", page=page, effective_date=eff,
                    section_head=sec, rate_type="Monthly Minimum",
                    unit="per month",
                    up_to_2_2_kV=vals[0], raw_text=s
                ))

        if "SPECIAL PROVISION O" in s.upper():
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC2", page=page, effective_date=eff,
                    section_head=sec, rate_type="Monthly Minimum ‚Äì SP O",
                    unit="per month",
                    up_to_2_2_kV=vals[0], raw_text=s
                ))
    return rows

# ============================================================
# SC2 Demand
# ============================================================

def parse_SC2_dem(text, page):
    eff = extract_eff(text)
    lines = text.splitlines()
    sec = section_tag(lines)
    rows = []

    for s in lines:
        s = s.strip()
        if not s:
            continue

        # Basic
        if s.startswith("Basic Service Charge"):
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC2", page=page, effective_date=eff,
                    section_head=sec, rate_type="Basic Service Charge",
                    unit="per month",
                    up_to_2_2_kV=vals[0], raw_text=s
                ))

        # SP P
        if "Special Provision P" in s and "Basic" in s:
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC2", page=page, effective_date=eff,
                    section_head=sec, rate_type="Basic ‚Äì SP P",
                    unit="per month",
                    up_to_2_2_kV=vals[0], raw_text=s
                ))

        # Distribution per kW
        if "per kW" in s and "Distribution" in s:
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC2", page=page, effective_date=eff,
                    section_head=sec, rate_type="Demand Charge per kW",
                    unit="per kW",
                    up_to_2_2_kV=vals[0], raw_text=s
                ))

        # Monthly Minimum
        if "MONTHLY MINIMUM CHARGE:" in s and "P" not in s:
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC2", page=page, effective_date=eff,
                    section_head=sec, rate_type="Monthly Minimum",
                    unit="per month",
                    up_to_2_2_kV=vals[0], raw_text=s
                ))

        if "MONTHLY MINIMUM CHARGE:" in s and "P" in s:
            vals = dollars(s)
            if vals:
                rows.append(make_row(
                    service_class="SC2", page=page, effective_date=eff,
                    section_head=sec, rate_type="Monthly Minimum ‚Äì SP P",
                    unit="per month",
                    up_to_2_2_kV=vals[0], raw_text=s
                ))

    return rows


# ============================================================
# SC3 / SC3A ‚Äî NON-EV MONTHLY RATE TABLES
# ============================================================

def parse_SC3_full(text, page, service_class):
    eff = extract_eff(text)
    lines = text.splitlines()
    sec = section_tag(lines)
    rows = []

    vcols = ["up_to_2_2_kV", "kV_2_2_to_15", "kV_22_to_50", "over_60_kV"]

    for s in lines:
        s = s.strip()
        if not s:
            continue

        # Skip header lines ("Up to 2.2 kV  2.2-15 kV etc")
        if re.search(r"\b2\.2\b", s) and "kV" in s:
            continue

        # Distribution Delivery (customer charge)
        if s.startswith("Distribution Delivery") and "per kW" not in s:
            vals = dollars(s)
            if vals:
                rec = make_row(
                    service_class=service_class, page=page, effective_date=eff,
                    section_head=sec, rate_type="Distribution Delivery",
                    unit="per month", raw_text=s
                )
                for i, v in enumerate(vals):
                    rec[vcols[i]] = v
                rows.append(rec)
            continue

        # Demand Charges (per kW)
        if "per kW" in s and ("Distribution Delivery" in s or "Charges" in s):
            vals = dollars(s)
            if vals:
                rec = make_row(
                    service_class=service_class, page=page, effective_date=eff,
                    section_head="Plus Demand Charges",
                    rate_type="Demand Charge",
                    unit="per kW",
                    raw_text=s
                )
                for i, v in enumerate(vals):
                    rec[vcols[i]] = v
                rows.append(rec)
            continue

        # SC3A time-of-day rates
        if service_class == "SC3A":
            if s.startswith(("On Peak", "Off Peak", "Super Peak")):
                vals = dollars(s)
                if vals:
                    rec = make_row(
                        service_class=service_class, page=page, effective_date=eff,
                        section_head="CHARGES", rate_type=s.split(":")[0],
                        unit="per kWh", raw_text=s
                    )
                    for i, v in enumerate(vals):
                        rec[vcols[i]] = v
                    rows.append(rec)
                continue

    return rows

# ============================================================
# Main Loop
# ============================================================

rows_SC1 = []
rows_SC1C = []
rows_SC2N = []
rows_SC2D = []
rows_SC3 = []
rows_SC3A = []

for p in pages:
    txt = p["text"]
    pg = p["page_number"]

    if "SERVICE CLASSIFICATION NO. 1" in txt and "1-C" not in txt:
        rows_SC1.extend(parse_SC1(txt, pg))

    if "SERVICE CLASSIFICATION NO. 1-C" in txt:
        rows_SC1C.extend(parse_SC1C(txt, pg))

    if "NON-DEMAND SERVICE" in txt:
        rows_SC2N.extend(parse_SC2_non(txt, pg))

    if "METERED DEMAND SERVICE" in txt:
        rows_SC2D.extend(parse_SC2_dem(txt, pg))

    if "SERVICE CLASSIFICATION NO. 3" in txt and "3A" not in txt:
        rows_SC3.extend(parse_SC3_full(txt, pg, "SC3"))

    if "SERVICE CLASSIFICATION NO. 3A" in txt:
        rows_SC3A.extend(parse_SC3_full(txt, pg, "SC3A"))

# ============================================================
# DataFrames
# ============================================================

def clean_df(df):
    if df.empty:
        return df
    value_cols = ["up_to_2_2_kV", "kV_2_2_to_15", "kV_22_to_50", "over_60_kV"]
    return df.dropna(subset=value_cols, how="all").reset_index(drop=True)

df_SC1 = clean_df(pd.DataFrame(rows_SC1, columns=COLS))
df_SC1C = clean_df(pd.DataFrame(rows_SC1C, columns=COLS))
df_SC2N = clean_df(pd.DataFrame(rows_SC2N, columns=COLS))
df_SC2D = clean_df(pd.DataFrame(rows_SC2D, columns=COLS))
df_SC3 = clean_df(pd.DataFrame(rows_SC3, columns=COLS))
df_SC3A = clean_df(pd.DataFrame(rows_SC3A, columns=COLS))

# ============================================================
# Export
# ============================================================

out = r"D:\utility-billing-ai\src\agents\tariff_analysis\All_Service_Classifications.xlsx"

with pd.ExcelWriter(out, engine="openpyxl") as writer:
    df_SC1.to_excel(writer, sheet_name="SC1", index=False)
    df_SC1C.to_excel(writer, sheet_name="SC1C", index=False)
    df_SC2N.to_excel(writer, sheet_name="SC2_NonDemand", index=False)
    df_SC2D.to_excel(writer, sheet_name="SC2_Demand", index=False)
    df_SC3.to_excel(writer, sheet_name="SC3", index=False)
    df_SC3A.to_excel(writer, sheet_name="SC3A", index=False)

print("Saved:", out)


Saved: D:\utility-billing-ai\src\agents\tariff_analysis\All_Service_Classifications.xlsx


In [65]:
import json, re
import pandas as pd

# ============================================================
# Load JSON
# ============================================================

with open(r"D:\utility-billing-ai\data\processed\raw_extracted_tarif.json",
          "r", encoding="utf-8") as f:
    raw = json.load(f)

pages = raw["pages"]

# ============================================================
# Helper functions / schema
# ============================================================

COLS = [
    "service_class", "page", "effective_date",
    "section_head",

    # core bill-comparison columns
    "basic_service_charge",      # $/month
    "distribution_per_kwh",      # delivery $/kWh
    "energy_per_kwh",            # if you later split supply vs delivery
    "demand_charge_per_kw",      # generic per-kW charge
    "minimum_charge",            # $/month

    # SC3 / SC3A ‚Äì voltage-based customer/demand charges
    "cust_up_to_2_2_kV",
    "cust_2_2_to_15_kV",
    "cust_22_50_kV",
    "cust_over_60_kV",

    "dem_up_to_2_2_kV",
    "dem_2_2_to_15_kV",
    "dem_22_50_kV",
    "dem_over_60_kV",

    # SC3A ‚Äì TOU rates by voltage
    "on_peak_up_to_2_2_kV",
    "on_peak_2_2_to_15_kV",
    "on_peak_22_50_kV",
    "on_peak_over_60_kV",

    "off_peak_up_to_2_2_kV",
    "off_peak_2_2_to_15_kV",
    "off_peak_22_50_kV",
    "off_peak_over_60_kV",

    "super_peak_up_to_2_2_kV",
    "super_peak_2_2_to_15_kV",
    "super_peak_22_50_kV",
    "super_peak_over_60_kV",

    "raw_text"   # small snippet for debugging
]

def make_blank_row(service_class, page, effective_date, section_head=None):
    row = {c: None for c in COLS}
    row["service_class"] = service_class
    row["page"] = page
    row["effective_date"] = effective_date
    row["section_head"] = section_head
    return row

def dollars(s: str):
    """Extract all $X.XX (with or without thousands comma)."""
    return [float(x.replace(",", "")) for x in re.findall(r"\$([0-9,]+\.[0-9]+)", s)]

def extract_eff(text: str):
    m = re.search(r"Effective Date:\s*([A-Za-z0-9/ ,]+)", text, re.I)
    return m.group(1).strip() if m else None

def detect_section(lines):
    sec = None
    for ln in lines:
        u = ln.upper()
        if "STANDARD TARIFF CHARGES" in u:
            sec = "STANDARD TARIFF CHARGES"
        elif "MONTHLY RATE" in u:
            sec = "MONTHLY RATE"
        elif "CHARGES FOR METERED DEMAND SERVICE" in u:
            sec = "METERED DEMAND SERVICE"
        elif "CHARGES:" in u and sec is None:
            sec = "CHARGES"
    return sec

# ============================================================
# Parsers
# ============================================================

# ---------- SC1 ----------
def parse_SC1(text, page):
    eff = extract_eff(text)
    lines = text.splitlines()
    sec = detect_section(lines)
    row = make_blank_row("SC1", page, eff, sec)
    raw_bits = []

    for ln in lines:
        s = ln.strip()
        if not s:
            continue
        u = s.upper()

        # Basic Service Charge
        if "BASIC SERVICE CHARGE" in u:
            vals = dollars(s)
            if vals:
                row["basic_service_charge"] = vals[0]
                raw_bits.append(s)

        # Per kWh (treat as distribution_per_kwh for comparison)
        if re.search(r"\bPER\s*KWH\b", u):
            vals = dollars(s)
            if vals:
                row["distribution_per_kwh"] = vals[0]
                raw_bits.append(s)

        # Monthly Minimum
        if "MONTHLY MINIMUM CHARGE" in u:
            vals = dollars(s)
            if vals:
                row["minimum_charge"] = vals[0]
                raw_bits.append(s)

    row["raw_text"] = " | ".join(raw_bits)[:500] if raw_bits else None

    # only keep if we actually found something useful
    useful_cols = ["basic_service_charge", "distribution_per_kwh", "minimum_charge"]
    if any(row[c] is not None for c in useful_cols):
        return [row]
    return []


# ---------- SC1-C ----------
def parse_SC1C(text, page):
    eff = extract_eff(text)
    lines = text.splitlines()
    sec = detect_section(lines)
    row = make_blank_row("SC1C", page, eff, sec)
    raw_bits = []

    for ln in lines:
        s = ln.strip()
        if not s:
            continue
        u = s.upper()

        if "BASIC SERVICE CHARGE" in u:
            vals = dollars(s)
            if vals:
                row["basic_service_charge"] = vals[0]
                raw_bits.append(s)

        if re.search(r"\bPER\s*KWH\b", u):
            vals = dollars(s)
            if vals:
                row["distribution_per_kwh"] = vals[0]
                raw_bits.append(s)

        if "MONTHLY MINIMUM CHARGE" in u:
            vals = dollars(s)
            if vals:
                row["minimum_charge"] = vals[0]
                raw_bits.append(s)

    row["raw_text"] = " | ".join(raw_bits)[:500] if raw_bits else None
    useful_cols = ["basic_service_charge", "distribution_per_kwh", "minimum_charge"]
    return [row] if any(row[c] is not None for c in useful_cols) else []


# ---------- SC2 ‚Äì Non-Demand ----------
def parse_SC2_non_demand(text, page):
    eff = extract_eff(text)
    lines = text.splitlines()
    sec = detect_section(lines)
    row = make_blank_row("SC2_NonDemand", page, eff, sec)
    raw_bits = []

    for ln in lines:
        s = ln.strip()
        if not s:
            continue
        u = s.upper()

        if "BASIC SERVICE CHARGE" in u:
            vals = dollars(s)
            if vals:
                row["basic_service_charge"] = vals[0]
                raw_bits.append(s)

        # Distribution per kWh (delivery)
        if "PER KWH" in u and "SUPPLY" not in u:
            vals = dollars(s)
            if vals:
                row["distribution_per_kwh"] = vals[0]
                raw_bits.append(s)

        if "MONTHLY MINIMUM CHARGE:" in u and "SPECIAL PROVISION O" not in u:
            vals = dollars(s)
            if vals:
                row["minimum_charge"] = vals[0]
                raw_bits.append(s)

        if "SPECIAL PROVISION O" in u and "MONTHLY MINIMUM CHARGE" in u:
            vals = dollars(s)
            if vals and row.get("minimum_charge") is None:
                # store SP O into minimum_charge if base empty
                row["minimum_charge"] = vals[0]
                raw_bits.append(s)

    row["raw_text"] = " | ".join(raw_bits)[:500] if raw_bits else None
    useful_cols = ["basic_service_charge", "distribution_per_kwh", "minimum_charge"]
    return [row] if any(row[c] is not None for c in useful_cols) else []


# ---------- SC2 ‚Äì Demand ----------
def parse_SC2_demand(text, page):
    eff = extract_eff(text)
    lines = text.splitlines()
    sec = detect_section(lines)
    row = make_blank_row("SC2_Demand", page, eff, sec)
    raw_bits = []

    for ln in lines:
        s = ln.strip()
        if not s:
            continue
        u = s.upper()

        if s.startswith("Basic Service Charge"):
            vals = dollars(s)
            if vals:
                row["basic_service_charge"] = vals[0]
                raw_bits.append(s)

        if "BASIC SERVICE CHARGE" in u and "SPECIAL PROVISION P" in u:
            vals = dollars(s)
            if vals and row.get("basic_service_charge") is None:
                row["basic_service_charge"] = vals[0]
                raw_bits.append(s)

        # Distribution Delivery Charges, per kW:
        if "PER KW" in u and "DISTRIBUTION" in u:
            vals = dollars(s)
            if vals:
                row["demand_charge_per_kw"] = vals[0]
                raw_bits.append(s)

        if "MONTHLY MINIMUM CHARGE:" in u and "P" not in u:
            vals = dollars(s)
            if vals:
                row["minimum_charge"] = vals[0]
                raw_bits.append(s)

        if "MONTHLY MINIMUM CHARGE:" in u and "P" in u:
            vals = dollars(s)
            if vals and row.get("minimum_charge") is None:
                row["minimum_charge"] = vals[0]
                raw_bits.append(s)

    row["raw_text"] = " | ".join(raw_bits)[:500] if raw_bits else None
    useful_cols = ["basic_service_charge", "demand_charge_per_kw", "minimum_charge"]
    return [row] if any(row[c] is not None for c in useful_cols) else []


# ---------- SC3 / SC3A ‚Äì non-EV, voltage-based ----------
def parse_SC3_like(text, page, service_class):
    eff = extract_eff(text)
    lines = text.splitlines()
    sec = detect_section(lines)
    row = make_blank_row(service_class, page, eff, sec)
    raw_bits = []

    # voltage-based columns
    cust_cols = ["cust_up_to_2_2_kV", "cust_2_2_to_15_kV",
                 "cust_22_50_kV", "cust_over_60_kV"]
    dem_cols  = ["dem_up_to_2_2_kV", "dem_2_2_to_15_kV",
                 "dem_22_50_kV", "dem_over_60_kV"]
    on_cols   = ["on_peak_up_to_2_2_kV", "on_peak_2_2_to_15_kV",
                 "on_peak_22_50_kV", "on_peak_over_60_kV"]
    off_cols  = ["off_peak_up_to_2_2_kV", "off_peak_2_2_to_15_kV",
                 "off_peak_22_50_kV", "off_peak_over_60_kV"]
    sup_cols  = ["super_peak_up_to_2_2_kV", "super_peak_2_2_to_15_kV",
                 "super_peak_22_50_kV", "super_peak_over_60_kV"]

    for ln in lines:
        s = ln.strip()
        if not s:
            continue
        u = s.upper()

        # Skip pure header line with "Up to 2.2 kV 2.2-15 kV ..."
        if "UP TO 2.2 KV" in u and "OVER 60 KV" in u:
            continue

        # Customer charges: Distribution Delivery row with multiple $ but no "per kW"
        if s.startswith("Distribution Delivery") and "PER KW" not in u:
            vals = dollars(s)
            if vals:
                for col, val in zip(cust_cols, vals):
                    row[col] = val
                raw_bits.append(s)
            continue

        # Demand Charges per kW
        if ("PER KW" in u) and ("DISTRIBUTION" in u or "CHARGES" in u):
            vals = dollars(s)
            if vals:
                for col, val in zip(dem_cols, vals):
                    row[col] = val
                # also generic demand_charge_per_kw for comparison
                row["demand_charge_per_kw"] = vals[0]
                raw_bits.append(s)
            continue

        # SC3A ONLY ‚Äì TOU rates (On/Off/Super Peak)
        if service_class == "SC3A":
            if u.startswith("ON PEAK") and "PER KWH" in u:
                vals = dollars(s)
                if vals:
                    for col, val in zip(on_cols, vals):
                        row[col] = val
                    raw_bits.append(s)
                continue

            if u.startswith("OFF PEAK") and "PER KWH" in u:
                vals = dollars(s)
                if vals:
                    for col, val in zip(off_cols, vals):
                        row[col] = val
                    raw_bits.append(s)
                continue

            if u.startswith("SUPER PEAK") and "PER KWH" in u:
                vals = dollars(s)
                if vals:
                    for col, val in zip(sup_cols, vals):
                        row[col] = val
                    raw_bits.append(s)
                continue

    row["raw_text"] = " | ".join(raw_bits)[:500] if raw_bits else None

    useful_cols = (
        cust_cols + dem_cols + on_cols + off_cols + sup_cols +
        ["demand_charge_per_kw"]
    )
    if any(row[c] is not None for c in useful_cols):
        return [row]
    return []


# ============================================================
# MAIN EXTRACTION ‚Äì build ONE master table
# ============================================================

rows_all = []

for p in pages:
    txt = p["text"]
    pg = p["page_number"]

    if "SERVICE CLASSIFICATION NO. 1" in txt and "1-C" not in txt:
        rows_all.extend(parse_SC1(txt, pg))

    if "SERVICE CLASSIFICATION NO. 1-C" in txt:
        rows_all.extend(parse_SC1C(txt, pg))

    if "NON-DEMAND SERVICE" in txt:
        rows_all.extend(parse_SC2_non_demand(txt, pg))

    if "METERED DEMAND SERVICE" in txt:
        rows_all.extend(parse_SC2_demand(txt, pg))

    if "SERVICE CLASSIFICATION NO. 3A" in txt:
        rows_all.extend(parse_SC3_like(txt, pg, "SC3A"))

    # SC3 (must come after 3A check so we don't double count)
    if "SERVICE CLASSIFICATION NO. 3" in txt and "3A" not in txt:
        rows_all.extend(parse_SC3_like(txt, pg, "SC3"))

# Build DataFrame
df_all = pd.DataFrame(rows_all, columns=COLS)

# optional: drop rows that are completely empty for numeric values
numeric_cols = [
    "basic_service_charge", "distribution_per_kwh", "energy_per_kwh",
    "demand_charge_per_kw", "minimum_charge",
    "cust_up_to_2_2_kV", "cust_2_2_to_15_kV", "cust_22_50_kV", "cust_over_60_kV",
    "dem_up_to_2_2_kV", "dem_2_2_to_15_kV", "dem_22_50_kV", "dem_over_60_kV",
    "on_peak_up_to_2_2_kV", "on_peak_2_2_to_15_kV", "on_peak_22_50_kV", "on_peak_over_60_kV",
    "off_peak_up_to_2_2_kV", "off_peak_2_2_to_15_kV", "off_peak_22_50_kV", "off_peak_over_60_kV",
    "super_peak_up_to_2_2_kV", "super_peak_2_2_to_15_kV",
    "super_peak_22_50_kV", "super_peak_over_60_kV"
]
df_all = df_all.dropna(subset=numeric_cols, how="all").reset_index(drop=True)

# ============================================================
# Export to a SINGLE Excel sheet
# ============================================================

out = r"D:\utility-billing-ai\src\agents\tariff_analysis\Tariff_Master_Bill_Comparison.xlsx"

with pd.ExcelWriter(out, engine="openpyxl") as writer:
    df_all.to_excel(writer, sheet_name="Tariffs", index=False)

print("Saved:", out)


Saved: D:\utility-billing-ai\src\agents\tariff_analysis\Tariff_Master_Bill_Comparison.xlsx
