In [None]:
%pip install pdfplumber pandas tabulate

Collecting tabulate
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0


In [3]:
%pip install pdfplumber transformers accelerate pandas

Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting accelerate
  Downloading accelerate-1.11.0-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Using cached pyyaml-6.0.3-cp310-cp310-win_amd64.whl.metadata (2.4 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp310-cp310-win_amd64.whl.metadata (41 kB)
Collecting requests (from transformers)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.6.2-cp38-abi3-win_amd64.whl.me


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pdfplumber, re, json, subprocess, pandas as pd
from pathlib import Path

# ---------- CONFIG ----------
# Relative paths below are resolved against the kernel's working directory.
PDF_PATH = "National Grid Tariff-New York.pdf"
MODEL = "mistral"   # or try "phi3" if smaller CPU model is preferred
OUTPUT_JSON = "tariff_rates_from_ollama.json"

# ---------- FUNCTION TO QUERY OLLAMA ----------
def query_ollama(prompt, model=MODEL):
    """Send a prompt to the local Ollama model and return the response.

    Runs ``ollama run <model>`` as a child process, writes the UTF-8 encoded
    prompt to its stdin and returns everything it printed on stdout.

    Args:
        prompt: Text sent to the model on standard input.
        model: Model name passed to ``ollama run``.

    Returns:
        The process's stdout decoded as UTF-8 (undecodable bytes are replaced).

    Raises:
        FileNotFoundError: The ``ollama`` executable could not be started.
        RuntimeError: ``ollama`` exited with a non-zero status; the message
            carries its stderr so the cause is visible in the notebook.
    """
    try:
        completed = subprocess.run(
            ["ollama", "run", model],
            input=prompt.encode("utf-8"),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except FileNotFoundError as exc:
        # Same exception type as before, but with an actionable message instead
        # of the bare OS error.
        raise FileNotFoundError(
            "Could not start 'ollama': the executable was not found on PATH. "
            "Install Ollama and make sure the kernel's PATH includes it."
        ) from exc

    if completed.returncode != 0:
        # Returning empty output here would only resurface later as a confusing
        # JSON parse failure, so fail at the source instead.
        stderr_text = completed.stderr.decode("utf-8", errors="replace").strip()
        raise RuntimeError(
            f"'ollama run {model}' exited with status {completed.returncode}: {stderr_text}"
        )

    return completed.stdout.decode("utf-8", errors="replace")

# ---------- STEP 1: Extract SC Sections from the PDF ----------
sections = {}
with pdfplumber.open(PDF_PATH) as pdf:
    text = "\n".join(page.extract_text() or "" for page in pdf.pages)

# Locate every "SERVICE CLASSIFICATION NO. <id>" heading once, then slice the
# text between consecutive headings.
sc_heading = re.compile(r"SERVICE\s+CLASSIFICATION\s+NO\.?\s*(\d+[A-Z\-]*)", re.IGNORECASE)
headings = list(sc_heading.finditer(text))

for idx, match in enumerate(headings):
    # Upper-case the id so "1-c" and "1-C" land in the same section.
    sc_id = match.group(1).strip().upper()
    start = match.start()
    end = headings[idx + 1].start() if idx + 1 < len(headings) else len(text)
    key = f"SC-{sc_id}"
    # The same heading can occur many times; append each chunk instead of
    # overwriting, otherwise only the last chunk would be kept.
    sections[key] = sections.get(key, "") + text[start:end]

print(f"üìÑ Found {len(sections)} service classifications in PDF.")

# ---------- STEP 2: Define the Prompt ----------
def make_prompt(sc, content):
    """Build the extraction prompt for one service classification.

    Args:
        sc: Service-classification label; inserted into the instructions and
            into the "Service Classification" value of the JSON template.
        content: Tariff text placed under the "Text:" marker.

    Returns:
        The prompt string: instructions, the JSON template of required keys,
        the text, and a final request for JSON-only output.
    """
    prompt_text = f"""
You are an expert in electric utility tariff extraction.

Read the text below for {sc} and extract *only* rate information in JSON format.

Required keys:
{{
  "Service Classification": "{sc}",
  "Effective Date": "",
  "Previous Effective Date": "",
  "Basic Service Charge ($/month)": "",
  "Monthly Minimum Charge ($)": "",
  "Energy Rates ($/kWh)": {{
      "On Peak": "",
      "Off Peak": "",
      "Super Peak": "",
      "All Hours": ""
  }},
  "Demand / Distribution Rates ($/kW)": {{
      "Distribution": "",
      "Delivery": "",
      "As-Used On Peak": "",
      "As-Used Super Peak": ""
  }},
  "Reactive Demand ($/RkVA)": "",
  "Notes": ""
}}

Text:
{content}

Respond with **valid JSON only**, no explanations.
"""
    return prompt_text

# ---------- STEP 3: Loop Through and Query Ollama ----------
results = []
for sc, content in sections.items():
    print(f"‚öôÔ∏è Extracting rates for {sc} ...")
    prompt = make_prompt(sc, content)
    response = query_ollama(prompt)

    # Keep only the span from the first "{" through the last "}" of the reply;
    # anything the model printed around it is discarded.
    json_start = response.find("{")
    json_stop = response.rfind("}") + 1
    json_part = response[json_start:json_stop]
    try:
        data = json.loads(json_part)
        results.append(data)
        print(f"‚úÖ Parsed {sc} successfully.")
    except Exception as e:
        # A reply that cannot be parsed is reported and skipped; the loop continues.
        print(f"‚ö†Ô∏è Could not parse JSON for {sc}: {e}")
        print("Raw output snippet:", response[:400])

# ---------- STEP 4: Save and Convert ----------
# Write the parsed records as indented JSON.
json_payload = json.dumps(results, indent=2)
Path(OUTPUT_JSON).write_text(json_payload)
print(f"\nüíæ Saved structured data to {OUTPUT_JSON}")

# Flatten the nested rate dictionaries into columns and export to Excel.
excel_path = "Tariff_Rates_from_Ollama.xlsx"
df = pd.json_normalize(results)
df.to_excel(excel_path, index=False)
print(f"üìä Data exported to {excel_path}")


üìÑ Found 23 service classifications in PDF.
‚öôÔ∏è Extracting rates for SC-12 ...


FileNotFoundError: [WinError 2] The system cannot find the file specified

In [1]:
import pdfplumber
import re
import pandas as pd

pdf_path = "National Grid Tariff-New York.pdf"
output_excel = "SC1_SC1C_Tariff_Rates.xlsx"

# ---------- 1. Extract SC text ----------
def extract_sc_text(pdf_path, sc_no):
    """Collect the text of every PDF page whose text contains the SC heading.

    Args:
        pdf_path: Path of the PDF opened with pdfplumber.
        sc_no: Regex fragment for the classification number; it is interpolated
            after "SERVICE CLASSIFICATION NO." and followed by a word boundary.

    Returns:
        The matching pages' text, each followed by a newline, concatenated in
        page order ("" when no page matches).
    """
    heading_regex = rf"SERVICE\s+CLASSIFICATION\s+NO\.?\s*{sc_no}\b"
    matching_pages = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() may yield nothing for a page; treat that as empty text.
            page_text = page.extract_text() or ""
            if re.search(heading_regex, page_text, re.I):
                matching_pages.append(page_text + "\n")
    return "".join(matching_pages)

# The second argument is a regex fragment that extract_sc_text interpolates into
# its heading pattern. "1(?!-C)" rejects a "1" that is directly followed by "-C";
# "1-?C" accepts the suffix with or without the hyphen.
sc1_text = extract_sc_text(pdf_path, "1(?!-C)")   # SC-1 only
sc1c_text = extract_sc_text(pdf_path, "1-?C")     # SC-1-C

# ---------- 2. Define regex patterns ----------
# Label -> regex; each regex captures the first amount that follows the label.
patterns = {
    "Effective Date": r"Effective\s*(?:Date|on|from)[:\-]?\s*([A-Za-z]+\s*\d{1,2},?\s*\d{4})",
    "Basic Service Charge": r"Basic\s+Service\s+Charge[^$\d]*(\$?\d+\.\d{2})",
    "Monthly Minimum Charge": r"Monthly\s+Minimum\s+Charge[^$\d]*(\$?\d+\.\d{2})",
    "Per kWh All Hours": r"Per\s*kWh[^$\d]*(\$?\d+\.\d{4})",
    "On Peak": r"On\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Off Peak": r"Off\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Super Peak": r"Super\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Distribution Delivery": r"Distribution\s*(?:Delivery|Charge)[^$\d]*(\$?\d+\.\d{4})",
}

def extract_values(text, sc_name):
    """Run every regex in ``patterns`` over ``text`` and return one output row.

    Args:
        text: Text to search (case-insensitively).
        sc_name: Value stored under "Service_Classification".

    Returns:
        A dict with one entry per output column; a column is None when its
        regex found nothing, otherwise the first captured string.
    """
    found = {}
    for label, regex in patterns.items():
        hit = re.search(regex, text, re.I)
        found[label] = hit.group(1) if hit else None

    # Output column -> label in ``patterns``, in the order the columns are emitted.
    column_sources = (
        ("Effective_Date", "Effective Date"),
        ("Basic_Service_Charge_($/month)", "Basic Service Charge"),
        ("Monthly_Minimum_Charge_($)", "Monthly Minimum Charge"),
        ("Energy_Rate_All_Hours_($/kWh)", "Per kWh All Hours"),
        ("On_Peak_($/kWh)", "On Peak"),
        ("Off_Peak_($/kWh)", "Off Peak"),
        ("Super_Peak_($/kWh)", "Super Peak"),
        ("Distribution_Delivery_($/kWh)", "Distribution Delivery"),
    )
    row = {"Service_Classification": sc_name}
    for column, label in column_sources:
        row[column] = found[label]
    return row

# ---------- 3. Build DataFrame ----------
# One row per classification, in the order SC-1, SC-1-C.
records = [
    extract_values(section_text, label)
    for section_text, label in ((sc1_text, "SC-1"), (sc1c_text, "SC-1-C"))
]

df = pd.DataFrame(records)

# ---------- 4. Save to Excel ----------
df.to_excel(output_excel, index=False)
print(f"‚úÖ Tariff data saved to Excel file: {output_excel}")


‚úÖ Tariff data saved to Excel file: SC1_SC1C_Tariff_Rates.xlsx


In [2]:
import pdfplumber
import re
import pandas as pd

pdf_path = "National Grid Tariff-New York.pdf"
output_excel = "SC2_SC2D_Tariff_Rates.xlsx"

# ---------- 1. Extract SC text ----------
def extract_sc_text(pdf_path, sc_no):
    """Extracts text for a given service classification.

    Every page whose text contains "SERVICE CLASSIFICATION NO. <sc_no>"
    (case-insensitive; ``sc_no`` is used as a regex fragment followed by a word
    boundary) contributes its full text plus a trailing newline.
    """
    heading_regex = rf"SERVICE\s+CLASSIFICATION\s+NO\.?\s*{sc_no}\b"
    kept = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text() or ""  # no extracted text -> ""
            if not re.search(heading_regex, page_text, re.I):
                continue
            kept.append(page_text + "\n")
    return "".join(kept)

# The fragments are interpolated into extract_sc_text's heading regex. The "D"
# suffix is accepted with or without a hyphen ("2D" / "2-D"), and the plain
# SC-2 fragment rejects both forms, so a hyphenated "NO. 2-D" heading is no
# longer collected as SC-2.
sc2_text = extract_sc_text(pdf_path, "2(?!-?D)")   # SC-2 only
sc2d_text = extract_sc_text(pdf_path, "2-?D")      # SC-2D section

# ---------- 2. Define regex patterns ----------
# Label -> regex; each regex captures the first amount that follows the label.
patterns = {
    "Effective Date": r"Effective\s*(?:Date|on|from)[:\-]?\s*([A-Za-z]+\s*\d{1,2},?\s*\d{4})",
    "Basic Service Charge": r"Basic\s+Service\s+Charge[^$\d]*(\$?\d+\.\d{2})",
    "Monthly Minimum Charge": r"Monthly\s+Minimum\s+Charge[^$\d]*(\$?\d+\.\d{2})",
    "Per kWh All Hours": r"Per\s*kWh[^$\d]*(\$?\d+\.\d{4})",
    "On Peak": r"On\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Off Peak": r"Off\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Super Peak": r"Super\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Distribution Delivery": r"Distribution\s*(?:Delivery|Charge)[^$\d]*(\$?\d+\.\d{4})",
    # \b stops "per kW" from matching the start of "per kWh" (which reported an
    # energy rate as the demand charge); {2,4} keeps up to four decimals
    # instead of cutting the amount after two.
    "Demand Charge": r"(?:Demand|per\s*kW\b)[^$\d]*(\$?\d+\.\d{2,4})"
}

# ---------- 3. Extract values ----------
def extract_values(text, sc_name):
    """Search ``text`` with every regex in ``patterns`` and return one output row.

    Args:
        text: Text to search (case-insensitively).
        sc_name: Value stored under "Service_Classification".

    Returns:
        A dict with one entry per output column; a column is None when its
        regex found nothing, otherwise the first captured string.
    """
    values = {key: None for key in patterns}
    for key, pat in patterns.items():
        match = re.search(pat, text, re.I)
        if match:
            values[key] = match.group(1)
    return {
        "Service_Classification": sc_name,
        "Effective_Date": values["Effective Date"],
        "Basic_Service_Charge_($/month)": values["Basic Service Charge"],
        "Monthly_Minimum_Charge_($)": values["Monthly Minimum Charge"],
        "Energy_Rate_All_Hours_($/kWh)": values["Per kWh All Hours"],
        "On_Peak_($/kWh)": values["On Peak"],
        "Off_Peak_($/kWh)": values["Off Peak"],
        "Super_Peak_($/kWh)": values["Super Peak"],
        "Distribution_Delivery_($/kWh_or_kW)": values["Distribution Delivery"],
        "Demand_Charge_($/kW)": values["Demand Charge"]
    }

# ---------- 4. Build DataFrame ----------
# One row per classification, in the order SC-2, SC-2D.
records = [
    extract_values(section_text, label)
    for section_text, label in ((sc2_text, "SC-2"), (sc2d_text, "SC-2D"))
]

df = pd.DataFrame(records)

# ---------- 5. Save to Excel ----------
df.to_excel(output_excel, index=False)
print(f"‚úÖ Tariff data saved to Excel: {output_excel}")


‚úÖ Tariff data saved to Excel: SC2_SC2D_Tariff_Rates.xlsx


In [3]:
import pdfplumber
import re
import pandas as pd

pdf_path = "National Grid Tariff-New York.pdf"
output_excel = "SC3_SC3A_Tariff_Rates_ByVoltage.xlsx"

# ---------- 1. Extract SC text ----------
def extract_sc_text(pdf_path, sc_no):
    """Return the concatenated text of all pages that carry the SC heading.

    ``sc_no`` is a regex fragment placed after "SERVICE CLASSIFICATION NO." and
    followed by a word boundary; matching is case-insensitive. Each matching
    page's text is followed by a newline.
    """
    heading_regex = rf"SERVICE\s+CLASSIFICATION\s+NO\.?\s*{sc_no}\b"
    collected = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text() or ""  # no extracted text -> ""
            if re.search(heading_regex, page_text, re.I):
                collected.append(page_text + "\n")
    return "".join(collected)

# The fragments are interpolated into extract_sc_text's heading regex. The "A"
# suffix is accepted with or without a hyphen ("3A" / "3-A"), and the plain
# SC-3 fragment rejects both forms, so a hyphenated "NO. 3-A" heading is no
# longer collected as SC-3.
sc3_text = extract_sc_text(pdf_path, "3(?!-?A)")   # SC-3
sc3a_text = extract_sc_text(pdf_path, "3-?A")      # SC-3A

# ---------- 2. Regex patterns ----------
# Label -> regex; each regex captures the first amount that follows the label.
patterns = {
    "Effective Date": r"Effective\s*(?:Date|on|from)[:\-]?\s*([A-Za-z]+\s*\d{1,2},?\s*\d{4})",
    "Customer Charge": r"(?:Customer|Basic)\s+Charge[^$\d]*(\$?\d+\.\d{2,4})",
    "On Peak": r"On\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Off Peak": r"Off\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Super Peak": r"Super\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Distribution": r"Distribution\s*(?:Delivery|Charge)[^$\d]*(\$?\d+\.\d{2,4})",
    "Reactive Demand": r"Reactive\s*Demand[^$\d]*(\$?\d+\.\d{2,4})",
    # \b stops "per kW" from matching the start of "per kWh".
    "Demand": r"per\s*kW\b[^$\d]*(\$?\d+\.\d{2,4})"
}

# Voltage label -> regex for the label, with or without its kV range.
# The leading \b keeps "Transmission" from matching inside "Subtransmission".
# The dash class is written with an escape (hyphen or U+2013 en dash) so it does
# not depend on how this file's bytes are decoded.
voltage_patterns = {
    "Secondary": r"(?i)\b(?:Secondary\s*\(<2\.?2\s*kV\)|Secondary)",
    "Primary": r"(?i)\b(?:Primary\s*\(2\.?2\s*[-\u2013]?\s*15\s*kV\)|Primary)",
    "Subtransmission": r"(?i)\b(?:Subtransmission\s*\(22\s*[-\u2013]?\s*50\s*kV\)|Subtransmission)",
    "Transmission": r"(?i)\b(?:Transmission\s*\(?>\s*60\s*kV\)|Transmission)"
}

# ---------- 3. Function to extract values by voltage ----------
def extract_by_voltage(text, sc_name, window=500):
    """Extract one row of rates for every voltage-label occurrence in ``text``.

    For each label in ``voltage_patterns`` and each place it occurs, the
    ``window`` characters that follow the label are searched
    (case-insensitively) with the rate regexes in ``patterns``.

    The previous implementation captured the following text with
    ``[^A-Z]{0,500}`` under an inline ``(?i)`` flag, which excludes every
    letter, so no labelled rate could ever be found and all rate columns were
    always None.

    Args:
        text: Text of one service classification.
        sc_name: Value stored under "Service_Classification".
        window: Number of characters after a voltage label to search.

    Returns:
        A list of row dicts (possibly empty); a column is None when its regex
        found nothing inside the window.

    Note:
        The window is purely positional: if another voltage label starts inside
        it, amounts belonging to that label can be picked up.
    """
    # Pattern label -> output column ("Effective Date" is handled separately).
    column_for = {
        "Customer Charge": "Customer_Charge_($/month)",
        "On Peak": "On_Peak_($/kWh)",
        "Off Peak": "Off_Peak_($/kWh)",
        "Super Peak": "Super_Peak_($/kWh)",
        "Distribution": "Distribution_Delivery_($/kW)",
        "Reactive Demand": "Reactive_Demand_Charge_($/RkVA)",
        "Demand": "Demand_Charge_($/kW)",
    }

    # Case-insensitive, like the other cells, so an upper-case
    # "EFFECTIVE DATE: APRIL 27, 2009" line is recognised too.
    effective_match = re.search(patterns["Effective Date"], text, re.I)
    effective_date = effective_match.group(1) if effective_match else None

    rows = []
    for voltage_label, v_pat in voltage_patterns.items():
        for label_match in re.finditer(v_pat, text, re.I):
            section = text[label_match.end():label_match.end() + window]
            entry = {
                "Service_Classification": sc_name,
                "Voltage_Level": voltage_label,
                "Effective_Date": effective_date,
                "Customer_Charge_($/month)": None,
                "Distribution_Delivery_($/kW)": None,
                "On_Peak_($/kWh)": None,
                "Off_Peak_($/kWh)": None,
                "Super_Peak_($/kWh)": None,
                "Demand_Charge_($/kW)": None,
                "Reactive_Demand_Charge_($/RkVA)": None
            }
            for key, column in column_for.items():
                match = re.search(patterns[key], section, re.I)
                if match:
                    entry[column] = match.group(1)
            rows.append(entry)
    return rows

# ---------- 4. Extract all ----------
sc3_rows = extract_by_voltage(sc3_text, "SC-3")
sc3a_rows = extract_by_voltage(sc3a_text, "SC-3A")

# Build the frame, then keep only the first row for each
# (classification, voltage level) pair and renumber the index.
df = pd.DataFrame(sc3_rows + sc3a_rows)
df = (
    df
    .drop_duplicates(subset=["Service_Classification", "Voltage_Level"])
    .reset_index(drop=True)
)

# ---------- 6. Save ----------
df.to_excel(output_excel, index=False)
print(f"‚úÖ Tariff data (by voltage) saved to Excel: {output_excel}")


‚úÖ Tariff data (by voltage) saved to Excel: SC3_SC3A_Tariff_Rates_ByVoltage.xlsx


In [4]:
import pdfplumber
import re
import pandas as pd

pdf_path = "National Grid Tariff-New York.pdf"
output_excel = "SC3_SC3A_Tariff_Rates.xlsx"

# ---------- 1. Extract SC text ----------
def extract_sc_text(pdf_path, sc_no):
    """Extracts text for a given service classification.

    A page is kept when its text contains "SERVICE CLASSIFICATION NO." followed
    by the regex fragment ``sc_no`` and a word boundary (case-insensitive).
    Kept pages are joined in page order, each followed by a newline.
    """
    heading_regex = rf"SERVICE\s+CLASSIFICATION\s+NO\.?\s*{sc_no}\b"
    page_chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text() or ""  # no extracted text -> ""
            if re.search(heading_regex, page_text, re.I):
                page_chunks.append(page_text + "\n")
    return "".join(page_chunks)

# The fragments are interpolated into extract_sc_text's heading regex. The "A"
# suffix is accepted with or without a hyphen ("3A" / "3-A"), and the plain
# SC-3 fragment rejects both forms, so a hyphenated "NO. 3-A" heading is no
# longer collected as SC-3.
sc3_text = extract_sc_text(pdf_path, "3(?!-?A)")   # SC-3 only
sc3a_text = extract_sc_text(pdf_path, "3-?A")      # SC-3A section

# ---------- 2. Define regex patterns ----------
# Label -> regex; each regex captures the first amount that follows the label.
patterns = {
    "Effective Date": r"Effective\s*(?:Date|on|from)[:\-]?\s*([A-Za-z]+\s*\d{1,2},?\s*\d{4})",
    "Customer Charge": r"(?:Customer|Basic)\s+Charge[^$\d]*(\$?\d+\.\d{2})",
    "Distribution Delivery": r"Distribution\s*(?:Delivery|Charge)[^$\d]*(\$?\d+\.\d{2,4})",
    "On Peak": r"On\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Off Peak": r"Off\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Super Peak": r"Super\s*Peak[^$\d]*(\$?\d+\.\d{4})",
    "Reactive Demand": r"Reactive\s*Demand[^$\d]*(\$?\d+\.\d{2,4})",
    # \b stops "per kW" from matching the start of "per kWh", which reported an
    # energy rate as the demand charge.
    "Demand Charge": r"per\s*kW\b[^$\d]*(\$?\d+\.\d{2,4})"
}

# ---------- 3. Extraction function ----------
def extract_values(text, sc_name):
    """Search ``text`` with every regex in ``patterns`` and return one output row.

    Args:
        text: Text to search (case-insensitively).
        sc_name: Value stored under "Service_Classification".

    Returns:
        A dict with one entry per output column; a column is None when its
        regex found nothing, otherwise the first captured string.
    """
    values = {key: None for key in patterns}
    for key, pat in patterns.items():
        match = re.search(pat, text, re.I)
        if match:
            values[key] = match.group(1)
    return {
        "Service_Classification": sc_name,
        "Effective_Date": values["Effective Date"],
        "Customer_Charge_($/month)": values["Customer Charge"],
        "Distribution_Delivery_($/kWh_or_kW)": values["Distribution Delivery"],
        "On_Peak_($/kWh)": values["On Peak"],
        "Off_Peak_($/kWh)": values["Off Peak"],
        "Super_Peak_($/kWh)": values["Super Peak"],
        "Demand_Charge_($/kW)": values["Demand Charge"],
        "Reactive_Demand_Charge_($/RkVA)": values["Reactive Demand"]
    }

# ---------- 4. Build DataFrame ----------
# One row per classification, in the order SC-3, SC-3A.
records = [
    extract_values(section_text, label)
    for section_text, label in ((sc3_text, "SC-3"), (sc3a_text, "SC-3A"))
]

df = pd.DataFrame(records)

# ---------- 5. Save to Excel ----------
df.to_excel(output_excel, index=False)
print(f"‚úÖ Tariff data saved to Excel: {output_excel}")


‚úÖ Tariff data saved to Excel: SC3_SC3A_Tariff_Rates.xlsx


In [2]:
%pip install pandas

Collecting pandas
  Using cached pandas-2.3.3-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.22.4 (from pandas)
  Using cached numpy-2.2.6-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.3.3-cp310-cp310-win_amd64.whl (11.3 MB)
Using cached numpy-2.2.6-cp310-cp310-win_amd64.whl (12.9 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, numpy, pandas

   ---------------------------------------- 0/4 [pytz]
   ---------------------------------------- 0/4 [pytz]
   ---------- ----------------------------- 1/4 [tzdata]
   ---------- ----------------------------- 1/4 [tzdata]
   ---------- ----------------------------- 1/4 [tzdata]
   ------------


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
# Imported here so this cell does not depend on another cell having run first.
import json

# Load JSON
with open("D:\\utility-billing-ai\\data\\processed\\raw_extracted_tarif.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Detect structure safely
if isinstance(data, list):
    pages = data
elif isinstance(data, dict):
    # Try common keys
    for key in ["pages", "data", "items"]:
        if key in data:
            pages = data[key]
            break
    else:
        pages = list(data.values())  # fallback
else:
    # Without this branch `pages` would stay unbound (or keep a stale value
    # from an earlier run) and the loop below would fail or mislead.
    raise TypeError(
        f"Unsupported JSON top-level type {type(data).__name__}; expected a list or a dict"
    )

# Display first 5 pages
for i, page in enumerate(pages[:5]):
    print(f"---- PAGE {page.get('page_number', i)} ----")
    print(page.get("text", "")[:800], "\n")


---- PAGE 1 ----
Status: EFFECTIVE
Received: 03/31/2009 Effective Date: 04/27/2009
PSC NO: 220 ELECTRICITY LEAF: 1
NIAGARA MOHAWK POWER CORPORATION REVISION: 0
INITIAL EFFECTIVE DATE: APRIL 27, 2009 SUPERSEDING REVISION:
P.S.C. No. 220 Electricity
SUPERSEDING P.S.C. No. 207
NIAGARA MOHAWK POWER CORPORATION
d/b/a NATIONAL GRID
SCHEDULE
FOR
ELECTRIC SERVICE
APPLICABLE
IN
ALL TERRITORY SERVED BY THIS COMPANY
P.S.C. No. 220 Schedule for Electric Service ("P.S.C. No. 220 Electricity Tariff") supersedes and replaces former
P.S.C. No. 207 Schedule for Electric Service ("P.S.C. No. 207 Electricity Tariff") effective April 27, 2009 ("Effective
Date"). As of the Effective Date, all references to P.S.C. No. 207 Electricity Tariff in agreements existing as of the
Effective Date shall be construed as references to P.S. 

---- PAGE 2 ----
Status: EFFECTIVE
Received: 08/27/2025 Effective Date: 09/01/2025
PSC NO: 220 ELECTRICITY LEAF: 2
NIAGARA MOHAWK POWER CORPORATION REVISION: 18
INITIAL EFFECTIVE D

In [9]:
# Show the top-level JSON type and, for a dict, its keys.
print(type(data))
if isinstance(data, dict):
    print(data.keys())
else:
    print("Not a dict")


<class 'dict'>
dict_keys(['pages'])


In [11]:
import json

with open("D:\\utility-billing-ai\\data\\processed\\raw_extracted_tarif.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# The payload is a dict whose "pages" entry holds the list of page records.
pages = data["pages"]   # your actual pages list

# Preview: label and first 800 characters of each of the first 5 pages.
for i, page in enumerate(pages[:5]):
    page_label = page.get("page_number", i)  # falls back to the loop index
    print("---- PAGE", page_label, "----")
    preview = page["text"][:800]
    print(preview, "\n")


---- PAGE 1 ----
Status: EFFECTIVE
Received: 03/31/2009 Effective Date: 04/27/2009
PSC NO: 220 ELECTRICITY LEAF: 1
NIAGARA MOHAWK POWER CORPORATION REVISION: 0
INITIAL EFFECTIVE DATE: APRIL 27, 2009 SUPERSEDING REVISION:
P.S.C. No. 220 Electricity
SUPERSEDING P.S.C. No. 207
NIAGARA MOHAWK POWER CORPORATION
d/b/a NATIONAL GRID
SCHEDULE
FOR
ELECTRIC SERVICE
APPLICABLE
IN
ALL TERRITORY SERVED BY THIS COMPANY
P.S.C. No. 220 Schedule for Electric Service ("P.S.C. No. 220 Electricity Tariff") supersedes and replaces former
P.S.C. No. 207 Schedule for Electric Service ("P.S.C. No. 207 Electricity Tariff") effective April 27, 2009 ("Effective
Date"). As of the Effective Date, all references to P.S.C. No. 207 Electricity Tariff in agreements existing as of the
Effective Date shall be construed as references to P.S. 

---- PAGE 2 ----
Status: EFFECTIVE
Received: 08/27/2025 Effective Date: 09/01/2025
PSC NO: 220 ELECTRICITY LEAF: 2
NIAGARA MOHAWK POWER CORPORATION REVISION: 18
INITIAL EFFECTIVE D

In [17]:
def extract_effective_date(text):
    """Return the date written after an "Effective Date:" label, or None.

    Three formats are tried in this order, each against the whole text and
    case-insensitively: MM/DD/YYYY, "Month DD, YYYY", then YYYY-MM-DD. The
    first format that matches anywhere wins.
    """
    date_regexes = (
        r"Effective Date:\s*([0-9]{2}/[0-9]{2}/[0-9]{4})",
        r"Effective Date:\s*([A-Za-z]+\s+\d{1,2},\s*\d{4})",
        r"Effective Date:\s*([0-9]{4}-[0-9]{2}-[0-9]{2})",
    )
    # Lazy generator: later formats are only searched when earlier ones miss.
    hits = (re.search(regex, text, flags=re.I) for regex in date_regexes)
    return next((hit.group(1) for hit in hits if hit), None)


In [18]:
def classify_page(text):
    """Map a page's text to one of SC1, SC1C, SC2, SC2D, SC3, SC3A, or None.

    The text is upper-cased and checked, in this order, for the headings
    "SERVICE CLASSIFICATION NO. 1-C", "... 1", "... 2-D", "... 2", "... 3-A",
    "... 3". A plain number is only accepted when the matching sub-class string
    ("1-C", "2-D", "3-A") does not occur anywhere on the page.

    A plain number must not be followed by another digit. The earlier substring
    test classified "NO. 10", "NO. 12", ... as SC1 (and likewise for 2 and 3).
    Any run of whitespace, including a line break, is accepted between the
    heading words.

    Args:
        text: Page text.

    Returns:
        The class label, or None when no supported heading is present.
    """
    import re  # local import: keeps the cell runnable without relying on other cells

    t = text.upper()
    heading = r"SERVICE\s+CLASSIFICATION\s+NO\.\s*"

    for number, suffix in (("1", "C"), ("2", "D"), ("3", "A")):
        sub_class = f"{number}-{suffix}"
        if re.search(heading + re.escape(sub_class), t):
            return f"SC{number}{suffix}"
        # (?!\d) rejects multi-digit classifications that merely start with `number`.
        if re.search(heading + number + r"(?!\d)", t) and sub_class not in t:
            return f"SC{number}"

    return None

# Annotate every page record in place with its class label and effective date.
for page in pages:
    page.update(
        service_class=classify_page(page["text"]),
        effective_date=extract_effective_date(page["text"]),
    )


In [13]:
# One empty bucket per supported service classification.
sc_groups = {name: [] for name in ("SC1", "SC1C", "SC2", "SC2D", "SC3", "SC3A")}

# Pages whose label is not one of the buckets (including None) are skipped.
for page in pages:
    sc = page["service_class"]
    if sc not in sc_groups:
        continue
    sc_groups[sc].append(page)


In [14]:
def extract_fields(text):
    """Collect every match of each rate/voltage regex in ``text``.

    All searches are case-insensitive and use ``re.findall``, so each value is
    a list: plain strings for regexes with one capture group, tuples for those
    with two ("basic_charge", "energy_rates", "voltage_ranges").

    Returns:
        A dict keyed by field name, in the order listed below.
    """
    field_regexes = (
        ("basic_charge", r"(Basic|Customer).*?(\$?\d+\.\d+)"),
        ("min_charge", r"Minimum.*?(\$?\d+\.\d+)"),
        ("energy_rates", r"(\d+\.\d+)\s*(¬¢|\$)?/?kWh"),
        ("on_peak", r"On[- ]Peak.*?(\$?\d+\.\d+)"),
        ("off_peak", r"Off[- ]Peak.*?(\$?\d+\.\d+)"),
        ("super_peak", r"Super[- ]Peak.*?(\$?\d+\.\d+)"),
        ("delivery_charge", r"Delivery.*?(\$?\d+\.\d+)"),
        ("demand_charge", r"Demand.*?(\$?\d+\.\d+)"),
        ("voltage_ranges", r"(\d+)\s*-\s*(\d+)\s*volts"),
        ("kv_voltage", r"(\d+\.?\d*)\s*kV"),
    )
    return {
        name: re.findall(regex, text, flags=re.I)
        for name, regex in field_regexes
    }


In [15]:
# One DataFrame per classification; each row is the extract_fields() result for
# a page plus that page's number.
dfs = {}

for sc, sc_pages in sc_groups.items():
    rows = []
    for page in sc_pages:
        extracted = {**extract_fields(page["text"]), "page": page["page_number"]}
        rows.append(extracted)

    dfs[sc] = pd.DataFrame(rows)


In [16]:
# Print a banner and the first rows of each per-classification frame.
for sc in dfs.keys():
    banner = f"\n===== {sc} ====="
    print(banner)
    print(dfs[sc].head())



===== SC1 =====
         basic_charge min_charge energy_rates on_peak off_peak super_peak  \
0                  []         []           []      []       []         []   
1                  []         []           []      []       []         []   
2  [(customer, 48.2)]         []           []      []       []         []   
3                  []         []           []      []       []         []   
4  [(Customer, 46.3)]         []           []      []       []         []   

  delivery_charge demand_charge voltage_ranges kv_voltage  page  
0        [263.39]            []             []         []     3  
1              []            []             []         []   155  
2              []            []             []         []   223  
3              []            []             []         []   318  
4              []            []             []         []   325  

===== SC1C =====
        basic_charge min_charge energy_rates on_peak off_peak super_peak  \
0                 []         [