# Clean Public Queue Report

### Imports

In [101]:
import sys
from pathlib import Path
import re

import pandas as pd
import numpy as np
from IPython.display import display, Markdown

### Define Paths

In [102]:
# Locate the project root: when the kernel is started from a folder named
# "notebooks" (case-insensitive) the root is its parent, otherwise it is the
# current working directory itself.
CWD = Path.cwd()
if CWD.name.lower() == "notebooks":
    ROOT = CWD.parent
else:
    ROOT = CWD

# Data folders used by the rest of the notebook.
DATA = ROOT.joinpath("data")
RAW = DATA.joinpath("raw")
PROCESSED = DATA.joinpath("processed")

# Create both folders (and any missing parents); no error if they already exist.
for _data_dir in (RAW, PROCESSED):
    _data_dir.mkdir(parents=True, exist_ok=True)

# Echo the interpreter version and resolved folders for the reader.
print("Python:", sys.version.split()[0])
print("RAW:", RAW)
print("PROCESSED:", PROCESSED)

Python: 3.11.9
RAW: C:\Users\danci\Interconnection-Queue-Intelligence\data\raw
PROCESSED: C:\Users\danci\Interconnection-Queue-Intelligence\data\processed


### Select Public Queue File

In [112]:
# Extensions that can be opened as a workbook further down; a CSV export or a
# notes file that merely has "public" and "queue" in its name must not win.
EXCEL_SUFFIXES = {".xlsx", ".xlsm", ".xls", ".xlsb", ".ods"}

# All regular files in RAW, sorted, skipping "~$" files (names Excel uses for
# its lock files while a workbook is open).
raw_files = sorted(
    p for p in RAW.iterdir()
    if p.is_file() and not p.name.startswith("~$")
)

# A candidate has both "public" and "queue" in its (case-insensitive) name
# and carries a workbook extension.
candidates = [
    p for p in raw_files
    if "public" in p.name.lower()
    and "queue" in p.name.lower()
    and p.suffix.lower() in EXCEL_SUFFIXES
]
public_path = candidates[0] if candidates else None

print("Detected:", public_path.name if public_path else None)
if public_path is None:
    raise FileNotFoundError("Could not find public queue file. Rename it to include 'public' and 'queue'.")

# Make an ambiguous choice visible instead of silently taking the first match.
if len(candidates) > 1:
    print(
        "Note: several matching workbooks found; using the first in sorted order. Ignored:",
        [p.name for p in candidates[1:]],
    )

Detected: publicqueuereport.xlsx


### Helpers

In [119]:
def normalize_cols(cols):
    """
    Normalize column names so mapping keys don't break on newlines/extra spaces.

    Each name is converted to ``str`` (``None`` becomes ""), every run of
    whitespace (including newlines) is collapsed to a single space, and the
    result is stripped.

    The returned names are guaranteed unique: a repeated name gets a ``.1``,
    ``.2``, ... suffix, and a suffix is skipped if the suffixed name is
    already taken (e.g. ``["a", "a", "a.1"]`` -> ``["a", "a.1", "a.1.1"]``).

    Parameters
    ----------
    cols : iterable
        Raw column labels.

    Returns
    -------
    list of str
        Normalized, unique names in the same order as ``cols``.
    """
    # \s+ already matches "\n", so one substitution covers newlines too.
    cleaned = [
        re.sub(r"\s+", " ", "" if c is None else str(c)).strip()
        for c in cols
    ]

    taken = set()        # every name emitted so far
    last_suffix = {}     # base name -> highest numeric suffix handed out
    unique = []
    for base in cleaned:
        name = base
        if name in taken:
            n = last_suffix.get(base, 0)
            # Advance until the suffixed name is free; this is what prevents
            # a generated "a.1" from clashing with a real column "a.1".
            while name in taken:
                n += 1
                name = f"{base}.{n}"
            last_suffix[base] = n
        taken.add(name)
        unique.append(name)
    return unique


def clean_text(x):
    """Return ``x`` as a whitespace-stripped string; missing values (per ``pd.isna``) become ""."""
    return "" if pd.isna(x) else str(x).strip()


def coerce_float(x):
    """
    Convert a single cell to a number.

    Missing values and blank strings yield NaN. Otherwise the value is
    stringified, stripped, has its commas removed, and is handed to
    ``pd.to_numeric`` with ``errors="coerce"`` (so unparseable text is NaN too).
    """
    missing = np.nan
    if pd.isna(x):
        return missing
    digits = str(x).strip().replace(",", "")
    return pd.to_numeric(digits, errors="coerce") if digits else missing


def parse_dt(series):
    """
    Parse datetimes with ``pd.to_datetime(..., errors="coerce")``.

    Accepts a Series or a scalar; whatever ``pd.to_datetime`` returns for that
    input is passed straight back. Unparseable values become NaT.
    """
    parsed = pd.to_datetime(series, errors="coerce")
    return parsed


def normalize_tech(x):
    """
    Map a free-text technology/fuel label onto a coarse bucket.

    The cleaned text is matched case-insensitively by substring, in this
    order: storage keywords, then solar keywords, then "wind". Missing or
    blank input returns "". Text that matches no bucket is returned cleaned
    (stripped) but otherwise unchanged, original casing included.
    """
    cleaned = clean_text(x)
    lowered = cleaned.lower()
    if not lowered:
        return ""

    # Order matters: the first bucket with a matching keyword wins.
    buckets = (
        ("storage", ("battery", "storage", "bess")),
        ("solar", ("solar", "pv", "photovoltaic")),
        ("wind", ("wind",)),
    )
    for label, keywords in buckets:
        for keyword in keywords:
            if keyword in lowered:
                return label
    return cleaned


def looks_like_header_cell(x) -> bool:
    """
    Heuristic: does this cell look like a column header?

    False for ``None``, float NaN, blank text, and plain signed integers or
    decimals. Otherwise True exactly when the stripped text contains at least
    one ASCII letter.
    """
    is_missing = x is None or (isinstance(x, float) and pd.isna(x))
    if is_missing:
        return False

    text = str(x).strip()
    # Blank cells and pure numbers are never headers.
    if not text or re.fullmatch(r"[-+]?\d+(\.\d+)?", text):
        return False

    # Anything containing a letter is accepted.
    return re.search(r"[A-Za-z]", text) is not None


def detect_header_row(preview_df: pd.DataFrame, min_nonnull: int = 5) -> int:
    """
    Pick the most header-like row of a ``header=None`` preview.

    Rows with fewer than ``min_nonnull`` non-null cells are ignored. Every
    other row is scored as

        non-null count + 2 * header-ish cells - 1.5 * duplicated cells

    where duplicates are counted on the stripped, lower-cased text. The
    positional index of the highest-scoring row is returned; on a tie the
    earliest row wins, and 0 is returned when no row qualifies.
    """
    winner = 0
    top_score = float("-inf")

    for pos in range(preview_df.shape[0]):
        cells = preview_df.iloc[pos].dropna().tolist()
        n_cells = len(cells)
        if n_cells < min_nonnull:
            continue

        n_headerish = sum(1 for cell in cells if looks_like_header_cell(cell))
        distinct = {str(cell).strip().lower() for cell in cells}
        n_repeats = n_cells - len(distinct)

        row_score = n_cells + 2 * n_headerish - 1.5 * n_repeats
        # Strict ">" keeps the first row among equal scores.
        if row_score > top_score:
            winner, top_score = pos, row_score

    return winner


def read_sheet_with_detected_header(path, sheet_name, scan_rows=120):
    """
    Load one worksheet whose header is not on the first row.

    Steps:
      1. Read the first ``scan_rows`` rows with ``header=None``.
      2. Let ``detect_header_row`` choose the header row from that preview.
      3. Re-read the sheet using that row as the header.
      4. Normalize the column names and drop columns whose name starts with
         "Unnamed:".

    Returns
    -------
    (DataFrame, int)
        The loaded frame (a copy) and the detected header row index.
    """
    # Pass 1: headerless peek at the top of the sheet.
    head_block = pd.read_excel(path, sheet_name=sheet_name, header=None, nrows=scan_rows)
    header_row = detect_header_row(head_block)

    # Pass 2: full read with the detected header.
    sheet = pd.read_excel(path, sheet_name=sheet_name, header=header_row)
    sheet.columns = normalize_cols(sheet.columns)

    # Select by a plain list of labels so no boolean-mask index alignment is involved.
    named = [col for col in sheet.columns if not str(col).startswith("Unnamed:")]
    return sheet.loc[:, named].copy(), header_row

### Load + Verify Sheets

In [120]:
# Worksheets this notebook reads, in the order: active, completed, withdrawn
# (later cells index this list positionally).
SHEETS = [
    "Grid GenerationQueue",
    "Completed Generation Projects",
    "Withdrawn Generation Projects",
]

# Open the workbook only long enough to list its sheets. The context manager
# closes the file handle afterwards, so the workbook is not left open (and
# locked on Windows) for the rest of the kernel session.
with pd.ExcelFile(public_path) as xls:
    sheet_names = list(xls.sheet_names)

display(Markdown(f"**Workbook:** `{public_path.name}`"))
display(pd.DataFrame({"sheet_in_file": sheet_names}))

# Fail early, naming every expected sheet that is absent.
missing = [s for s in SHEETS if s not in sheet_names]
if missing:
    raise ValueError(f"Expected sheets not found: {missing}")

**Workbook:** `publicqueuereport.xlsx`

Unnamed: 0,sheet_in_file
0,Grid GenerationQueue
1,Completed Generation Projects
2,Withdrawn Generation Projects


## Active Sheet

### Load Active

In [121]:
# Read the first sheet listed in SHEETS (the active queue) with automatic
# header-row detection, then show its shape, first rows and column names.
active_sheet_name = SHEETS[0]
active_raw, active_header_row = read_sheet_with_detected_header(public_path, active_sheet_name)

print("Detected header row:", active_header_row)
display(
    Markdown(f"**Shape:** {active_raw.shape}"),
    active_raw.head(3),
    pd.DataFrame({"column": list(active_raw.columns)}),
)

Detected header row: 3


**Shape:** (335, 33)

Unnamed: 0,Project Name,Queue Position,Interconnection Request Receive Date,Queue Date,Application Status,Study Process,Type-1,Type-2,Type-3,Fuel-1,...,PTO Study Region,Station or Transmission Line,Proposed On-line Date (as filed with IR),Current On-line Date,Suspension Status,Feasibility Study or Supplemental Review,System Impact Study or Phase I Cluster Study,Facilities Study (FAS) or Phase II Cluster Study,Optional Study (OS),Interconnection Agreement Status
0,MONTEZUMA (HIGH WINDS III),22,2003-11-18 00:00:00,2003-11-18 08:00:00,ACTIVE,AMEND 39,Wind Turbine,Storage,,Wind Turbine,...,Northern,Birds Landing 230 kV,2005-06-30 07:00:00,2024-04-01 07:00:00,,,Complete,Complete,,Executed
1,TULE WIND,32,2004-05-12 00:00:00,2004-05-24 07:00:00,ACTIVE,Serial LGIP,Wind Turbine,Storage,,Wind Turbine,...,SDGE,Boulevard East Substation 138 kV,2007-09-01 07:00:00,2030-10-31 07:00:00,,Waived,Complete,Complete,,Executed
2,MIDWAY PEAKING,54,2005-01-12 00:00:00,2005-01-12 08:00:00,ACTIVE,Serial LGIP,Gas Turbine,Storage,,Natural Gas,...,Fresno,Panoche Substation,2008-06-01 07:00:00,2027-06-30 07:00:00,,Waived,Complete,Re-Study,,Executed


Unnamed: 0,column
0,Project Name
1,Queue Position
2,Interconnection Request Receive Date
3,Queue Date
4,Application Status
5,Study Process
6,Type-1
7,Type-2
8,Type-3
9,Fuel-1


### Standardize Active

In [122]:
# Source header (after normalization) -> standardized column name for the active sheet.
MAP_ACTIVE = {
    "Project Name": "project_name",
    "Queue Position": "queue_id",
    "Interconnection Request Receive Date": "date_received",
    "Queue Date": "date_queue",
    "Application Status": "status_raw",
    "Study Process": "study_process",
    "Fuel-1": "tech_1",
    "Fuel-2": "tech_2",
    "Fuel-3": "tech_3",
    "MW-1": "mw_1",
    "MW-2": "mw_2",
    "MW-3": "mw_3",
    "Net MWs to Grid": "mw_poi",
    "County": "county",
    "State": "state",
    "Utility": "pto",
    "PTO Study Region": "study_area",
    "Station or Transmission Line": "poi",
    "Proposed On-line Date (as filed with IR)": "date_cod_requested",
    "Current On-line Date": "date_cod_current_or_actual",
}

# Only rename headers that actually occur in this sheet, then tag every row
# with its provenance.
rename = {src: dst for src, dst in MAP_ACTIVE.items() if src in active_raw.columns}
active = active_raw.rename(columns=rename).assign(
    source="public_queue",
    sheet_outcome="active",
)

# Target column order; entries missing from this sheet are dropped from the list.
keep = [
    "source", "sheet_outcome",
    "queue_id", "project_name",
    "pto", "study_area", "poi",
    "county", "state",
    "tech_1", "mw_1", "tech_2", "mw_2", "tech_3", "mw_3",
    "mw_poi",
    "date_received", "date_queue", "date_cod_requested", "date_cod_current_or_actual",
    "status_raw", "study_process",
]
keep = [col for col in keep if col in active.columns]
active = active.loc[:, keep].copy()

display(active.head(3))

Unnamed: 0,source,sheet_outcome,queue_id,project_name,pto,study_area,poi,county,state,tech_1,...,mw_2,tech_3,mw_3,mw_poi,date_received,date_queue,date_cod_requested,date_cod_current_or_actual,status_raw,study_process
0,public_queue,active,22,MONTEZUMA (HIGH WINDS III),PGAE,Northern,Birds Landing 230 kV,SOLANO,CA,Wind Turbine,...,38.0,,,38.0,2003-11-18 00:00:00,2003-11-18 08:00:00,2005-06-30 07:00:00,2024-04-01 07:00:00,ACTIVE,AMEND 39
1,public_queue,active,32,TULE WIND,SDGE,SDGE,Boulevard East Substation 138 kV,SAN DIEGO,CA,Wind Turbine,...,131.6,,,193.8,2004-05-12 00:00:00,2004-05-24 07:00:00,2007-09-01 07:00:00,2030-10-31 07:00:00,ACTIVE,Serial LGIP
2,public_queue,active,54,MIDWAY PEAKING,PGAE,Fresno,Panoche Substation,FRESNO,CA,Natural Gas,...,119.9,,,119.9,2005-01-12 00:00:00,2005-01-12 08:00:00,2008-06-01 07:00:00,2027-06-30 07:00:00,ACTIVE,Serial LGIP


### Clean Types Active

In [123]:
# Normalize dtypes on the standardized active frame. Every step checks that
# the column exists, so a sheet lacking an optional column does not fail.

# Text fields: missing -> "", everything else -> stripped string.
for c in ["queue_id","project_name","pto","study_area","poi","county","state","status_raw","study_process"]:
    if c in active.columns:
        active[c] = active[c].apply(clean_text)

# Technology labels: collapse to storage / solar / wind where a keyword matches.
for t in ["tech_1","tech_2","tech_3"]:
    if t in active.columns:
        active[t] = active[t].apply(normalize_tech)

# Capacities: numeric, with blanks and unparseable text as NaN.
for m in ["mw_1","mw_2","mw_3","mw_poi"]:
    if m in active.columns:
        active[m] = active[m].apply(coerce_float)

# Dates: datetime64, with unparseable values as NaT.
for d in ["date_received","date_queue","date_cod_requested","date_cod_current_or_actual"]:
    if d in active.columns:
        active[d] = parse_dt(active[d])

# mw_poi fallback if missing: sum of whichever MW slots this sheet has
# (previously the fallback was skipped unless all three slots were present).
if "mw_poi" not in active.columns:
    active["mw_poi"] = np.nan
mw_slots = [c for c in ["mw_1","mw_2","mw_3"] if c in active.columns]
if mw_slots:
    # min_count=1 keeps the row sum NaN when every slot is NaN (instead of 0.0).
    fallback = active[mw_slots].sum(axis=1, min_count=1)
    active["mw_poi"] = active["mw_poi"].where(active["mw_poi"].notna(), fallback)

# Show only the MW columns that exist, so this preview cannot raise KeyError.
display(active[["mw_poi"] + mw_slots].head(5))
print("Active mw_poi missing share:", active["mw_poi"].isna().mean())

Unnamed: 0,mw_poi,mw_1,mw_2,mw_3
0,38.0,38.0,38.0,
1,193.8,127.6,131.6,
2,119.9,119.9,119.9,
3,73.27,73.269997,18.5,
4,500.0,500.0,,


Active mw_poi missing share: 0.023880597014925373


## Completed Sheet

### Load Completed

In [125]:
# Read the second sheet listed in SHEETS (completed projects) with automatic
# header-row detection, then show its shape, first rows and column names.
completed_sheet_name = SHEETS[1]
completed_raw, completed_header_row = read_sheet_with_detected_header(public_path, completed_sheet_name)

print("Detected header row:", completed_header_row)
display(
    Markdown(f"**Shape:** {completed_raw.shape}"),
    completed_raw.head(3),
    pd.DataFrame({"column": list(completed_raw.columns)}),
)

Detected header row: 3


**Shape:** (237, 32)

Unnamed: 0,Project Name,Queue Position,Interconnection Request Receive Date,Queue Date,Application Status,Study Process,Type-1,Type-2,Type-3,Fuel-1,...,Utility,PTO Study Region,Station or Transmission Line,Proposed On-line Date (as filed with IR),Actual On-line Date,Feasibility Study or Supplemental Review,System Impact Study or Phase I Cluster Study,Facilities Study (FAS) or Phase II Cluster Study,Optional Study (OS),Interconnection Agreement Status
0,OTAY MESA GENERATING PROJECT,1A,1999-11-01,1999-11-01 08:00:00,COMPLETED,Pre- Amend. 39,Combined Cycle,,,Natural Gas,...,SDGE,,Otay Mesa Switchyard 230 kV,2002-03-01 08:00:00,2009-10-02 07:00:00,,Complete,Complete,,Executed
1,GATEWAY GENERATING FACILITY (FKA CONTRA COSTA ...,2,1999-08-10,2000-02-03 08:00:00,COMPLETED,Pre- Amend. 39,Combined Cycle,,,Natural Gas,...,PGAE,,Contra Costa Power Plant 230 kV bus,2007-11-28 08:00:00,2009-01-06 08:00:00,,Complete,Complete,,Executed
2,CPV SENTINEL (FKA INTERGEN OCOTILLO),3,2000-04-21,2000-06-14 07:00:00,COMPLETED,Serial LGIP,Gas Turbine,Storage,,Natural Gas,...,SCE,Eastern,Devers Substation 230kV Bus,2004-01-01 08:00:00,2013-06-01 07:00:00,Waived,Re-Study,Re-Study,Complete,Executed


Unnamed: 0,column
0,Project Name
1,Queue Position
2,Interconnection Request Receive Date
3,Queue Date
4,Application Status
5,Study Process
6,Type-1
7,Type-2
8,Type-3
9,Fuel-1


### Standardize Completed

In [126]:
# Source header (after normalization) -> standardized column name for the
# completed sheet. Here "Actual On-line Date" feeds date_cod_current_or_actual.
MAP_COMPLETED = {
    "Project Name": "project_name",
    "Queue Position": "queue_id",
    "Interconnection Request Receive Date": "date_received",
    "Queue Date": "date_queue",
    "Application Status": "status_raw",
    "Study Process": "study_process",
    "Fuel-1": "tech_1",
    "Fuel-2": "tech_2",
    "Fuel-3": "tech_3",
    "MW-1": "mw_1",
    "MW-2": "mw_2",
    "MW-3": "mw_3",
    "Net MWs to Grid": "mw_poi",
    "County": "county",
    "State": "state",
    "Utility": "pto",
    "PTO Study Region": "study_area",
    "Station or Transmission Line": "poi",
    "Proposed On-line Date (as filed with IR)": "date_cod_requested",
    "Actual On-line Date": "date_cod_current_or_actual",
}

# Only rename headers that actually occur in this sheet, then tag every row
# with its provenance.
rename = {src: dst for src, dst in MAP_COMPLETED.items() if src in completed_raw.columns}
completed = completed_raw.rename(columns=rename).assign(
    source="public_queue",
    sheet_outcome="completed",
)

# Target column order; entries missing from this sheet are dropped from the list.
keep = [
    "source", "sheet_outcome",
    "queue_id", "project_name",
    "pto", "study_area", "poi",
    "county", "state",
    "tech_1", "mw_1", "tech_2", "mw_2", "tech_3", "mw_3",
    "mw_poi",
    "date_received", "date_queue", "date_cod_requested", "date_cod_current_or_actual",
    "status_raw", "study_process",
]
keep = [col for col in keep if col in completed.columns]
completed = completed.loc[:, keep].copy()

display(completed.head(3))

Unnamed: 0,source,sheet_outcome,queue_id,project_name,pto,study_area,poi,county,state,tech_1,...,mw_2,tech_3,mw_3,mw_poi,date_received,date_queue,date_cod_requested,date_cod_current_or_actual,status_raw,study_process
0,public_queue,completed,1A,OTAY MESA GENERATING PROJECT,SDGE,,Otay Mesa Switchyard 230 kV,SAN DIEGO,CA,Natural Gas,...,,,,550.0,1999-11-01,1999-11-01 08:00:00,2002-03-01 08:00:00,2009-10-02 07:00:00,COMPLETED,Pre- Amend. 39
1,public_queue,completed,2,GATEWAY GENERATING FACILITY (FKA CONTRA COSTA ...,PGAE,,Contra Costa Power Plant 230 kV bus,CONTRA COSTA,CA,Natural Gas,...,,,,590.0,1999-08-10,2000-02-03 08:00:00,2007-11-28 08:00:00,2009-01-06 08:00:00,COMPLETED,Pre- Amend. 39
2,public_queue,completed,3,CPV SENTINEL (FKA INTERGEN OCOTILLO),SCE,Eastern,Devers Substation 230kV Bus,RIVERSIDE,CA,Natural Gas,...,16.0,,,850.0,2000-04-21,2000-06-14 07:00:00,2004-01-01 08:00:00,2013-06-01 07:00:00,COMPLETED,Serial LGIP


### Clean Types Completed

In [127]:
# Normalize dtypes on the standardized completed frame. Every step checks that
# the column exists, so a sheet lacking an optional column does not fail.

# Text fields: missing -> "", everything else -> stripped string.
for c in ["queue_id","project_name","pto","study_area","poi","county","state","status_raw","study_process"]:
    if c in completed.columns:
        completed[c] = completed[c].apply(clean_text)

# Technology labels: collapse to storage / solar / wind where a keyword matches.
for t in ["tech_1","tech_2","tech_3"]:
    if t in completed.columns:
        completed[t] = completed[t].apply(normalize_tech)

# Capacities: numeric, with blanks and unparseable text as NaN.
for m in ["mw_1","mw_2","mw_3","mw_poi"]:
    if m in completed.columns:
        completed[m] = completed[m].apply(coerce_float)

# Dates: datetime64, with unparseable values as NaT.
for d in ["date_received","date_queue","date_cod_requested","date_cod_current_or_actual"]:
    if d in completed.columns:
        completed[d] = parse_dt(completed[d])

# mw_poi fallback if missing: sum of whichever MW slots this sheet has
# (previously the fallback was skipped unless all three slots were present).
if "mw_poi" not in completed.columns:
    completed["mw_poi"] = np.nan
mw_slots = [c for c in ["mw_1","mw_2","mw_3"] if c in completed.columns]
if mw_slots:
    # min_count=1 keeps the row sum NaN when every slot is NaN (instead of 0.0).
    fallback = completed[mw_slots].sum(axis=1, min_count=1)
    completed["mw_poi"] = completed["mw_poi"].where(completed["mw_poi"].notna(), fallback)

# Show only the MW columns that exist, so this preview cannot raise KeyError.
display(completed[["mw_poi"] + mw_slots].head(5))
print("Completed mw_poi missing share:", completed["mw_poi"].isna().mean())

Unnamed: 0,mw_poi,mw_1,mw_2,mw_3
0,550.0,550.0,,
1,590.0,590.0,,
2,850.0,850.0,16.0,
3,521.0,521.0,,
4,570.0,570.0,,


Completed mw_poi missing share: 0.008438818565400843


## Withdrawn Sheet

### Load Withdrawn

In [128]:
# Read the third sheet listed in SHEETS (withdrawn projects) with automatic
# header-row detection, then show its shape, first rows and column names.
withdrawn_sheet_name = SHEETS[2]
withdrawn_raw, withdrawn_header_row = read_sheet_with_detected_header(public_path, withdrawn_sheet_name)

print("Detected header row:", withdrawn_header_row)
display(
    Markdown(f"**Shape:** {withdrawn_raw.shape}"),
    withdrawn_raw.head(3),
    pd.DataFrame({"column": list(withdrawn_raw.columns)}),
)

Detected header row: 3


**Shape:** (1718, 31)

Unnamed: 0,Project Name - Confidential,Queue Position,Interconnection Request Receive Date,Queue Date,Application Status,Withdrawn Date,Study Process,Type-1,Type-2,Type-3,...,Utility,Station or Transmission Line,Proposed On-line Date (as filed with IR),Current On-line Date,Feasibility Study or Supplemental Review,System Impact Study or Phase I Cluster Study,Facilities Study (FAS) or Phase II Cluster Study,Optional Study (OS),Interconnection Agreement Status,Reason for Withdrawal
0,ENCINA GENERATING PROJECT (PH. 1 AND 2),5,2000-08-09,2000-08-09 07:00:00,WITHDRAWN,NaT,Pre- Amend. 39,Combined Cycle,,,...,SDGE,Encina Power Plant Switchyard,2003-06-30 07:00:00,2008-06-01 07:00:00,,Complete,,,,
1,TESLA POWER PLANT,6,2007-08-24,2000-08-23 07:00:00,WITHDRAWN,2011-06-16 17:56:36,Serial LGIP,Combined Cycle,,,...,PGAE,Tesla Sub 230kV Bus E,2008-06-01 07:00:00,2014-11-30 08:00:00,Waived,Complete,Complete,Complete,,
2,SAN DIEGO COMMUNITY POWER GENERATING STATION,8,2000-11-28,2000-11-28 08:00:00,WITHDRAWN,NaT,Pre- Amend. 39,Combined Cycle,,,...,SDGE,Sycamore Canyon Substation,2004-06-01 07:00:00,2010-12-31 08:00:00,,Complete,Re-Study,,In Progress,


Unnamed: 0,column
0,Project Name - Confidential
1,Queue Position
2,Interconnection Request Receive Date
3,Queue Date
4,Application Status
5,Withdrawn Date
6,Study Process
7,Type-1
8,Type-2
9,Type-3


### Standardize Withdrawn

In [130]:
# Source header (after normalization) -> standardized column name for the
# withdrawn sheet. Two project-name spellings are listed; the first one found
# in the sheet is used (see the rename loop below).
MAP_WITHDRAWN = {
    "Project Name - Confidential": "project_name",
    "Project Name": "project_name",
    "Queue Position": "queue_id",
    "Interconnection Request Receive Date": "date_received",
    "Queue Date": "date_queue",
    "Withdrawn Date": "date_withdrawn",
    "Application Status": "status_raw",
    "Study Process": "study_process",
    "Fuel-1": "tech_1",
    "Fuel-2": "tech_2",
    "Fuel-3": "tech_3",
    "MW-1": "mw_1",
    "MW-2": "mw_2",
    "MW-3": "mw_3",
    "Net MWs to Grid": "mw_poi",
    "County": "county",
    "State": "state",
    "Utility": "pto",
    "PTO Study Region": "study_area",  # may be absent in withdrawn
    "Station or Transmission Line": "poi",
    "Proposed On-line Date (as filed with IR)": "date_cod_requested",
    "Current On-line Date": "date_cod_current_or_actual",
    "Reason for Withdrawal": "withdraw_reason",
}

# Build the rename map from headers actually present, letting each target name
# be claimed once. Without this, a sheet containing both project-name headers
# would end up with two "project_name" columns, and the per-column cleaning
# later on would receive a DataFrame instead of a Series.
rename = {}
claimed_targets = set()
for src, dst in MAP_WITHDRAWN.items():
    if src in withdrawn_raw.columns and dst not in claimed_targets:
        rename[src] = dst
        claimed_targets.add(dst)
withdrawn = withdrawn_raw.rename(columns=rename).copy()

# Provenance tags.
withdrawn["source"] = "public_queue"
withdrawn["sheet_outcome"] = "withdrawn"

# Target column order; entries missing from this sheet are dropped from the list.
keep = [
    "source","sheet_outcome",
    "queue_id","project_name",
    "pto","study_area","poi",
    "county","state",
    "tech_1","mw_1","tech_2","mw_2","tech_3","mw_3",
    "mw_poi",
    "date_received","date_queue","date_cod_requested","date_cod_current_or_actual","date_withdrawn",
    "status_raw","study_process","withdraw_reason"
]
keep = [c for c in keep if c in withdrawn.columns]
withdrawn = withdrawn[keep].copy()

display(withdrawn.head(3))

Unnamed: 0,source,sheet_outcome,queue_id,project_name,pto,poi,county,state,tech_1,mw_1,...,mw_3,mw_poi,date_received,date_queue,date_cod_requested,date_cod_current_or_actual,date_withdrawn,status_raw,study_process,withdraw_reason
0,public_queue,withdrawn,5,ENCINA GENERATING PROJECT (PH. 1 AND 2),SDGE,Encina Power Plant Switchyard,SAN DIEGO,CA,Natural Gas,900.0,...,,900.0,2000-08-09,2000-08-09 07:00:00,2003-06-30 07:00:00,2008-06-01 07:00:00,NaT,WITHDRAWN,Pre- Amend. 39,
1,public_queue,withdrawn,6,TESLA POWER PLANT,PGAE,Tesla Sub 230kV Bus E,SAN JOAQUIN,CA,Natural Gas,1156.0,...,,1156.0,2007-08-24,2000-08-23 07:00:00,2008-06-01 07:00:00,2014-11-30 08:00:00,2011-06-16 17:56:36,WITHDRAWN,Serial LGIP,
2,public_queue,withdrawn,8,SAN DIEGO COMMUNITY POWER GENERATING STATION,SDGE,Sycamore Canyon Substation,SAN DIEGO,CA,Natural Gas,750.0,...,,750.0,2000-11-28,2000-11-28 08:00:00,2004-06-01 07:00:00,2010-12-31 08:00:00,NaT,WITHDRAWN,Pre- Amend. 39,


### Clean Types Withdrawn

In [132]:
# Normalize dtypes on the standardized withdrawn frame. Every step checks that
# the column exists, so a sheet lacking an optional column does not fail.

# Text fields: missing -> "", everything else -> stripped string.
for c in ["queue_id","project_name","pto","study_area","poi","county","state","status_raw","study_process","withdraw_reason"]:
    if c in withdrawn.columns:
        withdrawn[c] = withdrawn[c].apply(clean_text)

# Technology labels: collapse to storage / solar / wind where a keyword matches.
for t in ["tech_1","tech_2","tech_3"]:
    if t in withdrawn.columns:
        withdrawn[t] = withdrawn[t].apply(normalize_tech)

# Capacities: numeric, with blanks and unparseable text as NaN.
for m in ["mw_1","mw_2","mw_3","mw_poi"]:
    if m in withdrawn.columns:
        withdrawn[m] = withdrawn[m].apply(coerce_float)

# Dates: datetime64, with unparseable values as NaT.
for d in ["date_received","date_queue","date_cod_requested","date_cod_current_or_actual","date_withdrawn"]:
    if d in withdrawn.columns:
        withdrawn[d] = parse_dt(withdrawn[d])

# mw_poi fallback if missing: sum of whichever MW slots this sheet has
# (previously the fallback was skipped unless all three slots were present).
if "mw_poi" not in withdrawn.columns:
    withdrawn["mw_poi"] = np.nan
mw_slots = [c for c in ["mw_1","mw_2","mw_3"] if c in withdrawn.columns]
if mw_slots:
    # min_count=1 keeps the row sum NaN when every slot is NaN (instead of 0.0).
    fallback = withdrawn[mw_slots].sum(axis=1, min_count=1)
    withdrawn["mw_poi"] = withdrawn["mw_poi"].where(withdrawn["mw_poi"].notna(), fallback)

# Show only the MW columns that exist, so this preview cannot raise KeyError.
display(withdrawn[["mw_poi"] + mw_slots].head(5))
print("Withdrawn mw_poi missing share:", withdrawn["mw_poi"].isna().mean())

Unnamed: 0,mw_poi,mw_1,mw_2,mw_3
0,900.0,900.0,,
1,1156.0,1156.0,,
2,750.0,750.0,,
3,1200.0,1200.0,,
4,620.0,620.0,,


Withdrawn mw_poi missing share: 0.0011641443538998836


## Combine & Save

In [145]:
# Stack the three standardized frames; columns that exist on only some sheets
# are kept, and pandas fills them with NaN/NaT for the other sheets.
public_all = pd.concat([active, completed, withdrawn], ignore_index=True)

# Text columns were cleaned to use "" for missing values, but a text column
# that is absent from a sheet (e.g. study_area on withdrawn, withdraw_reason on
# active/completed) comes out of concat as NaN. Re-apply the "" convention so a
# check such as `public_all["study_area"] == ""` treats every sheet alike.
text_cols = [
    "queue_id","project_name","pto","study_area","poi","county","state",
    "status_raw","study_process","withdraw_reason",
]
for c in text_cols:
    if c in public_all.columns:
        public_all[c] = public_all[c].where(public_all[c].notna(), "")

display(Markdown("**Combined shape:**"))
display(pd.DataFrame({
    "part": ["active","completed","withdrawn","ALL"],
    "rows": [len(active), len(completed), len(withdrawn), len(public_all)]
}))
display(public_all.head(50))

**Combined shape:**

Unnamed: 0,part,rows
0,active,335
1,completed,237
2,withdrawn,1718
3,ALL,2290


Unnamed: 0,source,sheet_outcome,queue_id,project_name,pto,study_area,poi,county,state,tech_1,...,mw_3,mw_poi,date_received,date_queue,date_cod_requested,date_cod_current_or_actual,status_raw,study_process,date_withdrawn,withdraw_reason
0,public_queue,active,22,MONTEZUMA (HIGH WINDS III),PGAE,Northern,Birds Landing 230 kV,SOLANO,CA,wind,...,,38.0,2003-11-18,2003-11-18 08:00:00,2005-06-30 07:00:00,2024-04-01 07:00:00,ACTIVE,AMEND 39,NaT,
1,public_queue,active,32,TULE WIND,SDGE,SDGE,Boulevard East Substation 138 kV,SAN DIEGO,CA,wind,...,,193.8,2004-05-12,2004-05-24 07:00:00,2007-09-01 07:00:00,2030-10-31 07:00:00,ACTIVE,Serial LGIP,NaT,
2,public_queue,active,54,MIDWAY PEAKING,PGAE,Fresno,Panoche Substation,FRESNO,CA,Natural Gas,...,,119.9,2005-01-12,2005-01-12 08:00:00,2008-06-01 07:00:00,2027-06-30 07:00:00,ACTIVE,Serial LGIP,NaT,
3,public_queue,active,61,FRESNO COGENERATION EXPANSION PROJECT,PGAE,Fresno,Helm-Kerman 70 kV Line,FRESNO,CA,Natural Gas,...,,73.27,2005-03-28,2005-03-30 08:00:00,2006-05-31 07:00:00,2023-02-28 08:00:00,ACTIVE,AMEND 39,NaT,
4,public_queue,active,72,LAKE ELSINORE ADVANCED PUMPED STORAGE PROJECT,SDGE,SDGE,Proposed Lee Lake Substation 500 kV,RIVERSIDE,CA,storage,...,,500.0,2005-04-26,2005-06-21 07:00:00,2008-12-31 08:00:00,2028-12-31 08:00:00,ACTIVE,Serial LGIP,NaT,
5,public_queue,active,81,BOTTLE ROCK POWER,PGAE,Kern,Geysers #17-Fulton 230 kV Line,LAKE,CA,Geothermal,...,,52.01,2005-09-13,2005-09-13 07:00:00,2006-09-01 07:00:00,2027-11-30 08:00:00,ACTIVE,Serial LGIP,NaT,
6,public_queue,active,96,CPC WEST,SCE,Northern,Tehachapi Conceptual Substation #1,KERN,CA,wind,...,150.0,600.0,2006-02-15,2006-03-01 08:00:00,2009-12-31 08:00:00,2025-06-05 07:00:00,ACTIVE,Serial LGIP,NaT,
7,public_queue,active,100,WINDSTAR I ALTERNATE,SCE,Northern,Vincent Substation 230kV,KERN,CA,wind,...,,120.0,2006-04-05,2006-04-05 07:00:00,2007-12-31 08:00:00,2030-10-15 07:00:00,ACTIVE,Serial LGIP,NaT,
8,public_queue,active,124,SILVER RIDGE MOUNT SIGNAL,SDGE,,Imperial Valley Substation 230 kV,IMPERIAL,CA,solar,...,,600.0,2006-08-22,2006-08-22 07:00:00,2011-03-01 08:00:00,2027-06-01 07:00:00,ACTIVE,Serial LGIP,NaT,
9,public_queue,active,138,SANDSTORM WIND POWER,SCE,Eastern,Devers Substation 220 kV,RIVERSIDE,CA,wind,...,,150.0,2006-10-23,2006-10-23 07:00:00,2008-12-31 08:00:00,2030-02-01 08:00:00,ACTIVE,Serial LGIP,NaT,


### Save Processed CSV

In [141]:
# Write the combined table to the processed-data folder (no index column)
# and report where it went.
out_path = PROCESSED.joinpath("public_queue_clean_combined.csv")
public_all.to_csv(out_path, index=False)

saved_note = "**Saved:** `" + str(out_path) + "`"
display(Markdown(saved_note))

**Saved:** `C:\Users\danci\Interconnection-Queue-Intelligence\data\processed\public_queue_clean_combined.csv`