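# Data Management

Set up the project data folders and inspect each raw Excel workbook in `data/raw/`: its sheets, the detected header row, the column names, and a short preview of the rows.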

### Imports

In [5]:
import sys
from pathlib import Path
import re
import csv

import pandas as pd
from IPython.display import display, Markdown

### Define Paths

In [6]:
CWD = Path.cwd()
# When the kernel is started inside a "notebooks" folder, the project root is
# its parent; otherwise the working directory itself is used as the root.
ROOT = CWD if CWD.name.lower() != "notebooks" else CWD.parent

DATA = ROOT.joinpath("data")
RAW, PROCESSED = DATA.joinpath("raw"), DATA.joinpath("processed")

# Create both data folders up front; exist_ok makes re-running the cell safe.
PROCESSED.mkdir(parents=True, exist_ok=True)
RAW.mkdir(parents=True, exist_ok=True)

# Echo the interpreter version and the resolved locations for the record.
print("Python:", sys.version.split(maxsplit=1)[0])
print("CWD:", CWD)
print("ROOT:", ROOT)
print("RAW:", RAW)
print("PROCESSED:", PROCESSED)

Python: 3.11.9
CWD: C:\Users\danci\Interconnection-Queue-Intelligence\Interconnection-Queue-Intelligence\notebooks
ROOT: C:\Users\danci\Interconnection-Queue-Intelligence\Interconnection-Queue-Intelligence
RAW: C:\Users\danci\Interconnection-Queue-Intelligence\Interconnection-Queue-Intelligence\data\raw
PROCESSED: C:\Users\danci\Interconnection-Queue-Intelligence\Interconnection-Queue-Intelligence\data\processed


### Helpers

In [7]:
def looks_like_header_cell(x) -> bool:
    """Return True when a cell value reads like a column label rather than data.

    A value counts as header-like when its stripped text form is non-empty,
    is not a plain number, and contains at least one ASCII letter.

    Parameters
    ----------
    x : Any
        A single cell value (string, number, timestamp, missing marker, ...).

    Returns
    -------
    bool
        False for missing values, blank text and numeric text; otherwise
        True only if the text contains an ASCII letter.
    """
    # Treat every scalar missing marker (None, NaN, pd.NA, pd.NaT) as
    # "not a header"; str(pd.NaT) is "NaT", which would otherwise pass the
    # letter test below. The is_scalar guard keeps pd.isna from returning an
    # array when a list-like slips in.
    if x is None or (pd.api.types.is_scalar(x) and pd.isna(x)):
        return False
    s = str(x).strip()
    if s == "":
        return False
    # Numeric text is data, including scientific notation such as "1e5",
    # whose exponent letter would otherwise look like header text.
    if re.fullmatch(r"[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?", s):
        return False
    return bool(re.search(r"[A-Za-z]", s))


def detect_header_row(preview_df: pd.DataFrame, min_nonnull: int = 4) -> int:
    """Pick the positional index of the row that most resembles a header.

    Each row with at least ``min_nonnull`` non-null cells is scored as
    ``filled + 2 * header_like - 1.5 * repeats``, where ``header_like`` counts
    cells accepted by ``looks_like_header_cell`` and ``repeats`` counts cells
    whose stripped, lower-cased text duplicates another cell in the row.

    Parameters
    ----------
    preview_df : pd.DataFrame
        Rows to scan, addressed by position.
    min_nonnull : int, default 4
        Rows with fewer non-null cells are skipped.

    Returns
    -------
    int
        Position of the best-scoring row (the earliest one on ties), or 0
        when no row qualifies.
    """

    def _scored_rows():
        # Yield (score, position) for every row dense enough to be a header.
        for position in range(len(preview_df)):
            values = preview_df.iloc[position].dropna().tolist()
            filled = len(values)
            if filled < min_nonnull:
                continue

            header_like = sum(1 for value in values if looks_like_header_cell(value))
            distinct = {str(value).strip().lower() for value in values}
            repeats = filled - len(distinct)

            yield filled + 2 * header_like - 1.5 * repeats, position

    # max() returns the first item among equal scores, matching a strict
    # "greater than" scan; the default covers the no-candidate case.
    _, winner = max(
        _scored_rows(),
        key=lambda scored: scored[0],
        default=(float("-inf"), 0),
    )
    return winner


def inspect_excel(path: Path, max_scan_rows: int = 60, head_rows: int = 5):
    """Display a sheet-by-sheet summary of an Excel workbook.

    For every sheet this shows the detected header row, the column names
    (excluding columns whose label starts with ``Unnamed:``) and the first
    ``head_rows`` rows. Problems are reported in the output instead of being
    raised, so one unreadable sheet does not stop the inspection.

    Parameters
    ----------
    path : Path or str
        Location of the workbook.
    max_scan_rows : int, default 60
        Number of leading rows read without a header to search for the
        header row.
    head_rows : int, default 5
        Number of rows shown in each sheet preview.

    Returns
    -------
    None
        Results are rendered with ``display`` / ``print`` only.
    """
    path = Path(path)  # also accept a plain string path
    display(Markdown(f"**File:** `{path.name}`"))
    if not path.exists():
        display(Markdown("**Missing file** (not found in `data/raw/`)."))
        return

    try:
        xls = pd.ExcelFile(path)
    except Exception as e:
        display(Markdown(f"**Failed to open Excel:** `{e}`"))
        return

    # Use the workbook handle as a context manager so it is closed when the
    # inspection ends, and reuse it for every read instead of re-opening the
    # file from its path twice per sheet.
    with xls:
        sheets = xls.sheet_names
        display(Markdown(f"**Sheets:** {len(sheets)}"))
        display(pd.DataFrame({"sheet": sheets}))

        for sheet in sheets:
            display(Markdown(f"#### Sheet: `{sheet}`"))

            # First pass: raw rows with no header, used only to locate it.
            try:
                preview = pd.read_excel(xls, sheet_name=sheet, header=None, nrows=max_scan_rows)
            except Exception as e:
                display(Markdown(f"**Preview read failed:** `{e}`"))
                continue

            header_row = detect_header_row(preview)
            print(f"Detected header row (0-based in preview): {header_row}")

            # Second pass: full read using the detected header row.
            try:
                df = pd.read_excel(xls, sheet_name=sheet, header=header_row)
            except Exception as e:
                display(Markdown(f"**Header-based read failed:** `{e}`"))
                continue

            # Drop columns whose label starts with "Unnamed:".
            df = df.loc[:, ~df.columns.to_series().astype(str).str.match(r"^Unnamed:")].copy()

            cols = [str(c) for c in df.columns.tolist()]
            display(Markdown(f"Columns: **{len(cols)}**"))
            display(pd.DataFrame({"column": cols}))

            display(Markdown(f"Preview: `head({head_rows})`"))
            display(df.head(head_rows))

### List Raw Files

In [8]:
# Regular files in the raw folder, in sorted order; names starting with "~$"
# (temporary lock files left by an open Office document) are skipped.
raw_files = sorted(
    entry
    for entry in RAW.iterdir()
    if entry.is_file() and not entry.name.startswith("~$")
)
display(pd.DataFrame({"file": [entry.name for entry in raw_files]}))

Unnamed: 0,file
0,cluster-15-interconnection-requests.xlsx
1,PreliminaryCluster14ProjectListasofMay20-2021....
2,publicqueuereport.xlsx


## Inspect Public Queue

In [9]:
PUBLIC_QUEUE_FILE = "publicqueuereport.xlsx"

# Locate the workbook in the raw-data folder and summarise each of its sheets.
path = RAW.joinpath(PUBLIC_QUEUE_FILE)
inspect_excel(path)

**File:** `publicqueuereport.xlsx`

**Sheets:** 3

Unnamed: 0,sheet
0,Grid GenerationQueue
1,Completed Generation Projects
2,Withdrawn Generation Projects


#### Sheet: `Grid GenerationQueue`

Detected header row (0-based in preview): 3


Columns: **33**

Unnamed: 0,column
0,Project Name
1,Queue Position
2,Interconnection Request\nReceive Date
3,Queue Date
4,Application Status
5,Study\nProcess
6,Type-1
7,Type-2
8,Type-3
9,Fuel-1


Preview: `head(5)`

Unnamed: 0,Project Name,Queue Position,Interconnection Request\nReceive Date,Queue Date,Application Status,Study\nProcess,Type-1,Type-2,Type-3,Fuel-1,...,PTO Study Region,Station or Transmission Line,Proposed\nOn-line Date\n(as filed with IR),Current\nOn-line Date,Suspension Status,Feasibility Study or Supplemental Review,System Impact Study or \nPhase I Cluster Study,Facilities Study (FAS) or \nPhase II Cluster Study,Optional Study\n(OS),Interconnection Agreement \nStatus
0,MONTEZUMA (HIGH WINDS III),22,2003-11-18 00:00:00,2003-11-18 08:00:00,ACTIVE,AMEND 39,Wind Turbine,Storage,,Wind Turbine,...,Northern,Birds Landing 230 kV,2005-06-30 07:00:00,2024-04-01 07:00:00,,,Complete,Complete,,Executed
1,TULE WIND,32,2004-05-12 00:00:00,2004-05-24 07:00:00,ACTIVE,Serial LGIP,Wind Turbine,Storage,,Wind Turbine,...,SDGE,Boulevard East Substation 138 kV,2007-09-01 07:00:00,2030-10-31 07:00:00,,Waived,Complete,Complete,,Executed
2,MIDWAY PEAKING,54,2005-01-12 00:00:00,2005-01-12 08:00:00,ACTIVE,Serial LGIP,Gas Turbine,Storage,,Natural Gas,...,Fresno,Panoche Substation,2008-06-01 07:00:00,2027-06-30 07:00:00,,Waived,Complete,Re-Study,,Executed
3,FRESNO COGENERATION EXPANSION PROJECT,61,2005-03-28 00:00:00,2005-03-30 08:00:00,ACTIVE,AMEND 39,Steam Turbine,Storage,,Natural Gas,...,Fresno,Helm-Kerman 70 kV Line,2006-05-31 07:00:00,2023-02-28 08:00:00,,,Complete,Complete,,Executed
4,LAKE ELSINORE ADVANCED PUMPED STORAGE PROJECT,72,2005-04-26 00:00:00,2005-06-21 07:00:00,ACTIVE,Serial LGIP,Storage,,,Pumped-Storage hydro,...,SDGE,Proposed Lee Lake Substation 500 kV,2008-12-31 08:00:00,2028-12-31 08:00:00,,Waived,Complete,Re-Study,,Executed


#### Sheet: `Completed Generation Projects`

Detected header row (0-based in preview): 3


Columns: **32**

Unnamed: 0,column
0,Project Name
1,Queue Position
2,Interconnection Request\nReceive Date
3,Queue Date
4,Application Status
5,Study\nProcess
6,Type-1
7,Type-2
8,Type-3
9,Fuel-1


Preview: `head(5)`

Unnamed: 0,Project Name,Queue Position,Interconnection Request\nReceive Date,Queue Date,Application Status,Study\nProcess,Type-1,Type-2,Type-3,Fuel-1,...,Utility,PTO Study Region,Station or Transmission Line,Proposed\nOn-line Date\n(as filed with IR),Actual\nOn-line Date,Feasibility Study or Supplemental Review,System Impact Study or \nPhase I Cluster Study,Facilities Study (FAS) or \nPhase II Cluster Study,Optional Study\n(OS),Interconnection Agreement \nStatus
0,OTAY MESA GENERATING PROJECT,1A,1999-11-01,1999-11-01 08:00:00,COMPLETED,Pre- Amend. 39,Combined Cycle,,,Natural Gas,...,SDGE,,Otay Mesa Switchyard 230 kV,2002-03-01 08:00:00,2009-10-02 07:00:00,,Complete,Complete,,Executed
1,GATEWAY GENERATING FACILITY (FKA CONTRA COSTA ...,2,1999-08-10,2000-02-03 08:00:00,COMPLETED,Pre- Amend. 39,Combined Cycle,,,Natural Gas,...,PGAE,,Contra Costa Power Plant 230 kV bus,2007-11-28 08:00:00,2009-01-06 08:00:00,,Complete,Complete,,Executed
2,CPV SENTINEL (FKA INTERGEN OCOTILLO),3,2000-04-21,2000-06-14 07:00:00,COMPLETED,Serial LGIP,Gas Turbine,Storage,,Natural Gas,...,SCE,Eastern,Devers Substation 230kV Bus,2004-01-01 08:00:00,2013-06-01 07:00:00,Waived,Re-Study,Re-Study,Complete,Executed
3,PALOMAR ENERGY PROJECT,4,2000-08-08,2000-08-08 07:00:00,COMPLETED,Pre- Amend. 39,Combined Cycle,,,Natural Gas,...,SDGE,,Palomar Energy Switchyard 230 kV,2001-06-01 07:00:00,2005-10-14 07:00:00,,Complete,Complete,,Executed
4,NRG EL SEGUNDO - TOT041,7,2000-08-16,2000-10-06 07:00:00,COMPLETED,Pre- Amend. 39,Combined Cycle,,,Natural Gas,...,SCE,Metro,El Segundo 220 kV Bus,2009-08-01 07:00:00,2013-07-10 07:00:00,,Complete,Complete,Complete,Executed


#### Sheet: `Withdrawn Generation Projects`

Detected header row (0-based in preview): 3


Columns: **31**

Unnamed: 0,column
0,Project Name - Confidential
1,Queue Position
2,Interconnection Request\nReceive Date
3,Queue Date
4,Application Status
5,Withdrawn Date
6,Study\nProcess
7,Type-1
8,Type-2
9,Type-3


Preview: `head(5)`

Unnamed: 0,Project Name - Confidential,Queue Position,Interconnection Request\nReceive Date,Queue Date,Application Status,Withdrawn Date,Study\nProcess,Type-1,Type-2,Type-3,...,Utility,Station or Transmission Line,Proposed\nOn-line Date\n(as filed with IR),Current\nOn-line Date,Feasibility Study or Supplemental Review,System Impact Study or \nPhase I Cluster Study,Facilities Study (FAS) or \nPhase II Cluster Study,Optional Study\n(OS),Interconnection Agreement \nStatus,Reason for Withdrawal
0,ENCINA GENERATING PROJECT (PH. 1 AND 2),5,2000-08-09,2000-08-09 07:00:00,WITHDRAWN,NaT,Pre- Amend. 39,Combined Cycle,,,...,SDGE,Encina Power Plant Switchyard,2003-06-30 07:00:00,2008-06-01 07:00:00,,Complete,,,,
1,TESLA POWER PLANT,6,2007-08-24,2000-08-23 07:00:00,WITHDRAWN,2011-06-16 17:56:36,Serial LGIP,Combined Cycle,,,...,PGAE,Tesla Sub 230kV Bus E,2008-06-01 07:00:00,2014-11-30 08:00:00,Waived,Complete,Complete,Complete,,
2,SAN DIEGO COMMUNITY POWER GENERATING STATION,8,2000-11-28,2000-11-28 08:00:00,WITHDRAWN,NaT,Pre- Amend. 39,Combined Cycle,,,...,SDGE,Sycamore Canyon Substation,2004-06-01 07:00:00,2010-12-31 08:00:00,,Complete,Re-Study,,In Progress,
3,MORRO BAY MODERNIZATION PROJECT,9,2000-12-01,2000-12-01 08:00:00,WITHDRAWN,NaT,Pre- Amend. 39,Combined Cycle,,,...,PGAE,Morro Bay Substation,2008-01-01 08:00:00,2008-01-01 08:00:00,,Complete,Complete,,Executed,
4,AVENAL ENERGY PROJECT,10,2001-05-02,2001-05-02 07:00:00,WITHDRAWN,NaT,Pre- Amend. 39,Combined Cycle,,,...,PGAE,Gates Substation (Arco - Gates 230 kV line),2009-01-01 08:00:00,2009-07-01 07:00:00,,Complete,Complete,,Executed,


## Inspect Cluster 15

In [11]:
CLUSTER15_FILE = "cluster-15-interconnection-requests.xlsx"

# Locate the workbook in the raw-data folder and summarise each of its sheets.
path = RAW.joinpath(CLUSTER15_FILE)
inspect_excel(path)

**File:** `cluster-15-interconnection-requests.xlsx`

**Sheets:** 2

Unnamed: 0,sheet
0,Cluster 15
1,Withdrawn


#### Sheet: `Cluster 15 `

Detected header row (0-based in preview): 0


Columns: **20**

Unnamed: 0,column
0,Queue Number
1,Project Number
2,Project Name
3,Generation/Fuel 1
4,NET MW 1
5,Generation/Fuel 2
6,NET MW 2
7,Generation/Fuel 3
8,NET MW 3
9,NET MW POI


Preview: `head(5)`

Unnamed: 0,Queue Number,Project Number,Project Name,Generation/Fuel 1,NET MW 1,Generation/Fuel 2,NET MW 2,Generation/Fuel 3,NET MW 3,NET MW POI,PROJECT COUNTY,Project State,Study Area,PTO,POI,Voltage kV,Requested COD,Queue Date,Application Date,Service Type
0,2207,54516,Alisa Solar Energy Complex 2,Photovoltaic/Solar,500.0,Storage/Battery,500.0,,,500.0,Yuma,AZ,SAN DIEGO,SDGE,NORTH GILA - HOODOO WASH (SDGE Portion Only),525,2030-06-01,2025-02-12,2024-11-18,Energy Only Requested
1,2328,54934,Amanece,Photovoltaic/Solar,418.992798,Storage/Battery,416.545013,,,400.0,Stanislaus,CA,PG&E FRESNO,PGAE,QUINTO SW STA- FINK SW STA 230 kV,230,2029-07-31,2025-02-12,2024-11-21,Full Capacity Deliverability Status Requested
2,2322,55045,Ambar Energy Storage,Storage/Battery,504.9,,,,,500.01,San Bernardino,CA,SCE METRO,SCE,LUGO 500 kV,500,2030-06-01,2025-02-12,2024-11-21,Full Capacity Deliverability Status Requested
3,2244,54963,Annapurna,Storage/Battery,257.0,,,,,250.0,Merced County,CA,PG&E FRESNO,PGAE,QUINTO SW STA 230 kV,230,2028-06-01,2025-02-12,2024-11-20,Full Capacity Deliverability Status Requested
4,2204,54897,Antlia,Storage/Battery,204.859,,,,,199.0,Monterey,CA,PG&E GBA,PGAE,MOSS LANDING PP 115 kV,115,2031-12-01,2025-02-12,2024-11-19,Full Capacity Deliverability Status Requested


#### Sheet: `Withdrawn`

Detected header row (0-based in preview): 0


Columns: **21**

Unnamed: 0,column
0,Queue Number
1,Project Number
2,Project Name
3,Generation/Fuel 1
4,NET MW 1
5,Generation/Fuel 2
6,NET MW 2
7,Generation/Fuel 3
8,NET MW 3
9,NET MW POI


Preview: `head(5)`

Unnamed: 0,Queue Number,Project Number,Project Name,Generation/Fuel 1,NET MW 1,Generation/Fuel 2,NET MW 2,Generation/Fuel 3,NET MW 3,NET MW POI,...,Project State,Study Area,PTO,POI,Voltage kV,Requested COD,Queue Date,Application Date,Withdrawal Date,Service Type
0,2229,54899,Clay Flats,Storage/Battery,437.08,,,,,425.0,...,CA,PG&E FRESNO,LSPC,MANNING 500 kV,500,2030-10-01,2025-02-12,2024-11-22,2025-04-23,Energy Only Requested
1,2202,55018,Gibson,Storage/Battery,154.598,,,,,150.0,...,CA,PG&E FRESNO,PGAE,MERCY SPRINGS SW STA 70 kV,70,2028-04-14,2025-02-12,2024-11-15,2025-04-24,Energy Only Requested
2,2283,54729,Amargosa SEZ,Photovoltaic/Solar,510.35,Storage/Battery,508.19,,,500.0,...,NV,SCE EOP,GLW,BEATTY 230 kV,230,2030-12-01,2025-02-12,2024-11-18,2025-04-25,Merchant- Full Capacity Deliverability Status ...
3,2228,54946,Grapevine 2,Storage/Battery,505.1,,,,,500.0,...,NV,SCE EOP,SCE,ELDORADO 230 kV,230,2028-06-01,2025-02-12,2024-11-18,2025-04-25,Merchant- Full Capacity Deliverability Status ...
4,2290,54668,Ashton Energy Center,Photovoltaic/Solar,1026.67,Storage/Battery,1022.47,,,1000.0,...,NV,SCE EOP,GLW,VALLEY SWITCH 230 kV,230,2032-12-01,2025-02-12,2024-11-18,2025-04-25,Merchant- Full Capacity Deliverability Status ...


## Inspect Cluster 14

In [12]:
CLUSTER14_FILE = "PreliminaryCluster14ProjectListasofMay20-2021.xlsx"

# Locate the workbook in the raw-data folder and summarise each of its sheets.
path = RAW.joinpath(CLUSTER14_FILE)
inspect_excel(path)

**File:** `PreliminaryCluster14ProjectListasofMay20-2021.xlsx`

**Sheets:** 2

Unnamed: 0,sheet
0,Notes
1,C14 Prelim Cluster Overview


#### Sheet: `Notes`

Detected header row (0-based in preview): 0


Columns: **1**

Unnamed: 0,column
0,This information is provided as general inform...


Preview: `head(5)`

Unnamed: 0,This information is provided as general information of Interconnection Requests being processed in the ISO Generator Interconnection Queue for Cluster 14.
0,
1,The information provided is based on informati...
2,
3,The use of the information herein is solely at...


#### Sheet: `C14 Prelim Cluster Overview`

Detected header row (0-based in preview): 0


Columns: **12**

Unnamed: 0,column
0,PTO
1,Affected PTO
2,Area
3,Total MW @ POI
4,Technology\n#1
5,MW-T1
6,Technology\n#2
7,MW-T2
8,Technology\n#3
9,MW-T3


Preview: `head(5)`

Unnamed: 0,PTO,Affected PTO,Area,Total MW @ POI,Technology\n#1,MW-T1,Technology\n#2,MW-T2,Technology\n#3,MW-T3,POI,Voltage\n(kV)
0,DCRT,SCE,Eastern,500.0,Battery,516.7638,Solar PV,516.7638,,,Cielo Azul Substation,500.0
1,DCRT,SCE,Eastern,350.0,Battery,350.0,Solar PV,350.0,,,Delaney Substation,500.0
2,DCRT,SCE,Eastern,2000.0,Battery,2000.0,,,,,Cielo Azul Substation,500.0
3,DCRT,SCE,Eastern,350.0,Solar PV,357.53,,,,,Delaney-Colorado River,500.0
4,DCRT,SCE,Eastern,700.0,Battery,718.81,Solar PV,718.81,,,Delaney-Colorado River,500.0
