# Clean Cluster 15

### Imports

In [17]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from IPython.display import display, Markdown

### Define Paths 

In [18]:
CWD = Path.cwd()
ROOT = CWD.parent if CWD.name.lower() == "notebooks" else CWD

DATA = ROOT / "data"
RAW = DATA / "raw"
PROCESSED = DATA / "processed"

RAW.mkdir(parents=True, exist_ok=True)
PROCESSED.mkdir(parents=True, exist_ok=True)

print("Python:", sys.version.split()[0])
print("RAW:", RAW)
print("PROCESSED:", PROCESSED)

Python: 3.11.9
RAW: C:\Users\danci\Interconnection-Queue-Intelligence\Interconnection-Queue-Intelligence\data\raw
PROCESSED: C:\Users\danci\Interconnection-Queue-Intelligence\Interconnection-Queue-Intelligence\data\processed


### Select Cluster 15 File

In [19]:
raw_files = sorted([p for p in RAW.iterdir() if p.is_file() and not p.name.startswith("~$")])

candidates = [p for p in raw_files if ("cluster" in p.name.lower() and "15" in p.name.lower())]
cluster15_path = candidates[0] if candidates else None

print("Detected:", cluster15_path.name if cluster15_path else None)
if cluster15_path is None:
    raise FileNotFoundError("Could not find Cluster 15 file. Make sure filename includes 'cluster' and '15'.")

Detected: cluster-15-interconnection-requests.xlsx


### Helpers

In [20]:
def coerce_float(x):
    if pd.isna(x): return np.nan
    s = str(x).strip().replace(",", "")
    if s == "": return np.nan
    return pd.to_numeric(s, errors="coerce")

def clean_text(x):
    if pd.isna(x): return ""
    return str(x).strip()

def parse_dt(series):
    return pd.to_datetime(series, errors="coerce")

def normalize_tech(x):
    s = clean_text(x).lower()
    if s == "": return ""
    if any(k in s for k in ["battery", "storage", "bess"]): return "storage"
    if any(k in s for k in ["solar", "pv", "photovoltaic"]): return "solar"
    if "wind" in s: return "wind"
    return clean_text(x)

### Load Sheets

In [21]:
xls = pd.ExcelFile(cluster15_path)
display(Markdown(f"**Workbook:** `{cluster15_path.name}`"))
display(pd.DataFrame({"sheet_in_file": xls.sheet_names}))

sheet_active = xls.sheet_names[0]
sheet_withdrawn = xls.sheet_names[1]

print("Using active sheet:", sheet_active)
print("Using withdrawn sheet:", sheet_withdrawn)

df15_active_raw = pd.read_excel(cluster15_path, sheet_name=sheet_active)
df15_withdrawn_raw = pd.read_excel(cluster15_path, sheet_name=sheet_withdrawn)

display(pd.DataFrame({
    "sheet": ["active", "withdrawn"],
    "rows": [len(df15_active_raw), len(df15_withdrawn_raw)],
    "cols": [df15_active_raw.shape[1], df15_withdrawn_raw.shape[1]],
}))

**Workbook:** `cluster-15-interconnection-requests.xlsx`

Unnamed: 0,sheet_in_file
0,Cluster 15
1,Withdrawn


Using active sheet: Cluster 15 
Using withdrawn sheet: Withdrawn


Unnamed: 0,sheet,rows,cols
0,active,108,20
1,withdrawn,62,21


## Standardize Column Names

In [22]:
display(Markdown("**Active columns:**"))
display(pd.DataFrame({"column": df15_active_raw.columns.astype(str).tolist()}))

display(Markdown("**Withdrawn columns:**"))
display(pd.DataFrame({"column": df15_withdrawn_raw.columns.astype(str).tolist()}))

display(Markdown("**Active preview:**"))
display(df15_active_raw.head(3))

display(Markdown("**Withdrawn preview:**"))
display(df15_withdrawn_raw.head(3))

**Active columns:**

Unnamed: 0,column
0,Queue Number
1,Project Number
2,Project Name
3,Generation/Fuel 1
4,NET MW 1
5,Generation/Fuel 2
6,NET MW 2
7,Generation/Fuel 3
8,NET MW 3
9,NET MW POI


**Withdrawn columns:**

Unnamed: 0,column
0,Queue Number
1,Project Number
2,Project Name
3,Generation/Fuel 1
4,NET MW 1
5,Generation/Fuel 2
6,NET MW 2
7,Generation/Fuel 3
8,NET MW 3
9,NET MW POI


**Active preview:**

Unnamed: 0,Queue Number,Project Number,Project Name,Generation/Fuel 1,NET MW 1,Generation/Fuel 2,NET MW 2,Generation/Fuel 3,NET MW 3,NET MW POI,PROJECT COUNTY,Project State,Study Area,PTO,POI,Voltage kV,Requested COD,Queue Date,Application Date,Service Type
0,2207,54516,Alisa Solar Energy Complex 2,Photovoltaic/Solar,500.0,Storage/Battery,500.0,,,500.0,Yuma,AZ,SAN DIEGO,SDGE,NORTH GILA - HOODOO WASH (SDGE Portion Only),525,2030-06-01,2025-02-12,2024-11-18,Energy Only Requested
1,2328,54934,Amanece,Photovoltaic/Solar,418.992798,Storage/Battery,416.545013,,,400.0,Stanislaus,CA,PG&E FRESNO,PGAE,QUINTO SW STA- FINK SW STA 230 kV,230,2029-07-31,2025-02-12,2024-11-21,Full Capacity Deliverability Status Requested
2,2322,55045,Ambar Energy Storage,Storage/Battery,504.9,,,,,500.01,San Bernardino,CA,SCE METRO,SCE,LUGO 500 kV,500,2030-06-01,2025-02-12,2024-11-21,Full Capacity Deliverability Status Requested


**Withdrawn preview:**

Unnamed: 0,Queue Number,Project Number,Project Name,Generation/Fuel 1,NET MW 1,Generation/Fuel 2,NET MW 2,Generation/Fuel 3,NET MW 3,NET MW POI,...,Project State,Study Area,PTO,POI,Voltage kV,Requested COD,Queue Date,Application Date,Withdrawal Date,Service Type
0,2229,54899,Clay Flats,Storage/Battery,437.08,,,,,425.0,...,CA,PG&E FRESNO,LSPC,MANNING 500 kV,500,2030-10-01,2025-02-12,2024-11-22,2025-04-23,Energy Only Requested
1,2202,55018,Gibson,Storage/Battery,154.598,,,,,150.0,...,CA,PG&E FRESNO,PGAE,MERCY SPRINGS SW STA 70 kV,70,2028-04-14,2025-02-12,2024-11-15,2025-04-24,Energy Only Requested
2,2283,54729,Amargosa SEZ,Photovoltaic/Solar,510.35,Storage/Battery,508.19,,,500.0,...,NV,SCE EOP,GLW,BEATTY 230 kV,230,2030-12-01,2025-02-12,2024-11-18,2025-04-25,Merchant- Full Capacity Deliverability Status ...


### Rename Mapping

In [23]:
RENAME_15 = {
    "Queue Number": "queue_id",
    "Project Number": "project_number",
    "Project Name": "project_name",

    "Generation/Fuel 1": "tech_1",
    "NET MW 1": "mw_1",
    "Generation/Fuel 2": "tech_2",
    "NET MW 2": "mw_2",
    "Generation/Fuel 3": "tech_3",
    "NET MW 3": "mw_3",

    "NET MW POI": "mw_poi",

    "PROJECT COUNTY": "county",
    "Project State": "state",

    "Study Area": "study_area",
    "PTO": "pto",
    "POI": "poi",
    "Voltage kV": "voltage_kv",

    "Requested COD": "date_cod_requested",
    "Queue Date": "date_queue",
    "Application Date": "date_application",

    "Service Type": "service_type",
}

RENAME_15_WITHDRAWN_OPTIONAL = {
    "Withdrawn Date": "date_withdrawn",
    "Withdrawal Date": "date_withdrawn",
    "Reason for Withdrawal": "withdraw_reason",
    "Withdrawal Reason": "withdraw_reason",
}

### Standardize Function

In [24]:
def standardize_cluster15(df_raw: pd.DataFrame, outcome: str):
    rename = {k: v for k, v in RENAME_15.items() if k in df_raw.columns}

    if outcome == "withdrawn":
        rename.update({k: v for k, v in RENAME_15_WITHDRAWN_OPTIONAL.items() if k in df_raw.columns})

    df = df_raw.rename(columns=rename).copy()

    # Identity
    df["source"] = "cluster15"
    df["sheet_outcome"] = outcome

    # placeholder for schema compatibility (clusters don't have public-style status labels)
    df["status_raw"] = ""

    # Clean strings
    for c in ["queue_id", "project_number", "project_name", "county", "state", "study_area", "pto", "poi", "service_type", "withdraw_reason", "status_raw"]:
        if c in df.columns:
            df[c] = df[c].apply(clean_text)

    # Normalize tech slots
    for t in ["tech_1", "tech_2", "tech_3"]:
        if t in df.columns:
            df[t] = df[t].apply(normalize_tech)

    # Coerce numeric
    for m in ["mw_1", "mw_2", "mw_3", "mw_poi", "voltage_kv"]:
        if m in df.columns:
            df[m] = df[m].apply(coerce_float)

    # Parse dates
    for d in ["date_queue", "date_application", "date_cod_requested", "date_withdrawn"]:
        if d in df.columns:
            df[d] = parse_dt(df[d])

    # Canonical ordering
    CANON_COLS = [
        "source", "sheet_outcome",
        "queue_id", "project_number", "project_name",
        "pto", "study_area", "poi", "voltage_kv",
        "county", "state",
        "tech_1", "mw_1", "tech_2", "mw_2", "tech_3", "mw_3",
        "mw_poi",
        "date_queue", "date_application", "date_cod_requested", "date_withdrawn",
        "service_type",
        "withdraw_reason",
        "status_raw",
    ]
    CANON_COLS = [c for c in CANON_COLS if c in df.columns]
    return df[CANON_COLS].copy()

### Standardize Both Sheets

In [29]:
df15_active = standardize_cluster15(df15_active_raw, outcome="active")
df15_withdrawn = standardize_cluster15(df15_withdrawn_raw, outcome="withdrawn")

df15_all = pd.concat([df15_active, df15_withdrawn], ignore_index=True)

display(pd.DataFrame({
    "outcome": ["active", "withdrawn", "ALL"],
    "rows": [len(df15_active), len(df15_withdrawn), len(df15_all)],
    "cols": [df15_active.shape[1], df15_withdrawn.shape[1], df15_all.shape[1]],
}))

display(df15_all.head(3))

Unnamed: 0,outcome,rows,cols
0,active,108,22
1,withdrawn,62,23
2,ALL,170,24


Unnamed: 0,source,sheet_outcome,queue_id,project_number,project_name,pto,study_area,poi,voltage_kv,county,...,mw_2,tech_3,mw_3,mw_poi,date_application,date_cod_requested,service_type,status_raw,date_queue,date_withdrawn
0,cluster15,active,2207,54516,Alisa Solar Energy Complex 2,SDGE,SAN DIEGO,NORTH GILA - HOODOO WASH (SDGE Portion Only),525,Yuma,...,500.0,,,500.0,2024-11-18,2030-06-01,Energy Only Requested,,NaT,NaT
1,cluster15,active,2328,54934,Amanece,PGAE,PG&E FRESNO,QUINTO SW STA- FINK SW STA 230 kV,230,Stanislaus,...,416.545013,,,400.0,2024-11-21,2029-07-31,Full Capacity Deliverability Status Requested,,NaT,NaT
2,cluster15,active,2322,55045,Ambar Energy Storage,SCE,SCE METRO,LUGO 500 kV,500,San Bernardino,...,,,,500.01,2024-11-21,2030-06-01,Full Capacity Deliverability Status Requested,,NaT,NaT


### Overlap Check

In [26]:
if "queue_id" in df15_active.columns and "queue_id" in df15_withdrawn.columns:
    a = set(df15_active["queue_id"].dropna().unique())
    w = set(df15_withdrawn["queue_id"].dropna().unique())
    overlap = sorted(list(a.intersection(w)))

    print("Active unique queue_id:", len(a))
    print("Withdrawn unique queue_id:", len(w))
    print("Overlap queue_id:", len(overlap))

    if overlap:
        display(Markdown("**Example overlaps (first 20):**"))
        display(pd.DataFrame({"queue_id_overlap": overlap[:20]}))
else:
    print("queue_id missing in one of the sheets; cannot compute overlap cleanly.")

Active unique queue_id: 108
Withdrawn unique queue_id: 62
Overlap queue_id: 0


### QA Checks

In [27]:
fields = ["mw_poi", "voltage_kv", "date_queue", "date_application", "date_cod_requested", "date_withdrawn", "pto", "study_area", "poi"]
qa = []
for f in fields:
    if f in df15_all.columns:
        qa.append((f, df15_all[f].isna().mean(), df15_all[f].nunique(dropna=True)))
display(pd.DataFrame(qa, columns=["field", "share_missing", "n_unique"]))

if "mw_poi" in df15_all.columns:
    display(Markdown("**MW POI by outcome:**"))
    display(df15_all.groupby("sheet_outcome")["mw_poi"].describe())

if "tech_1" in df15_all.columns:
    display(Markdown("**Tech slot 1 by outcome:**"))
    display(df15_all.groupby("sheet_outcome")["tech_1"].value_counts(dropna=False).to_frame("count"))

Unnamed: 0,field,share_missing,n_unique
0,mw_poi,0.0,60
1,voltage_kv,0.0,8
2,date_queue,0.635294,1
3,date_application,0.364706,18
4,date_cod_requested,0.0,65
5,date_withdrawn,0.635294,26
6,pto,0.0,8
7,study_area,0.0,10
8,poi,0.0,93


**MW POI by outcome:**

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sheet_outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
active,108.0,344.862963,254.690732,12.5,187.5,300.0,460.0,1150.0
withdrawn,62.0,351.088871,250.519886,0.0,153.0,325.0,500.0,1150.0


**Tech slot 1 by outcome:**

Unnamed: 0_level_0,Unnamed: 1_level_0,count
sheet_outcome,tech_1,Unnamed: 2_level_1
active,storage,55
active,solar,50
active,wind,3
withdrawn,storage,41
withdrawn,solar,20
withdrawn,Biomass/Biofuel,1


### Save Outputs

In [32]:
out_combined = PROCESSED / "cluster15_clean_combined.csv"
df15_all.to_csv(out_combined, index=False)

out_active = PROCESSED / "cluster15_clean_active.csv"
out_withdrawn = PROCESSED / "cluster15_clean_withdrawn.csv"
df15_active.to_csv(out_active, index=False)
df15_withdrawn.to_csv(out_withdrawn, index=False)

display(df15_all.head(50))

Unnamed: 0,source,sheet_outcome,queue_id,project_number,project_name,pto,study_area,poi,voltage_kv,county,...,mw_2,tech_3,mw_3,mw_poi,date_application,date_cod_requested,service_type,status_raw,date_queue,date_withdrawn
0,cluster15,active,2207,54516,Alisa Solar Energy Complex 2,SDGE,SAN DIEGO,NORTH GILA - HOODOO WASH (SDGE Portion Only),525,Yuma,...,500.0,,,500.0,2024-11-18,2030-06-01,Energy Only Requested,,NaT,NaT
1,cluster15,active,2328,54934,Amanece,PGAE,PG&E FRESNO,QUINTO SW STA- FINK SW STA 230 kV,230,Stanislaus,...,416.545013,,,400.0,2024-11-21,2029-07-31,Full Capacity Deliverability Status Requested,,NaT,NaT
2,cluster15,active,2322,55045,Ambar Energy Storage,SCE,SCE METRO,LUGO 500 kV,500,San Bernardino,...,,,,500.01,2024-11-21,2030-06-01,Full Capacity Deliverability Status Requested,,NaT,NaT
3,cluster15,active,2244,54963,Annapurna,PGAE,PG&E FRESNO,QUINTO SW STA 230 kV,230,Merced County,...,,,,250.0,2024-11-20,2028-06-01,Full Capacity Deliverability Status Requested,,NaT,NaT
4,cluster15,active,2204,54897,Antlia,PGAE,PG&E GBA,MOSS LANDING PP 115 kV,115,Monterey,...,,,,199.0,2024-11-19,2031-12-01,Full Capacity Deliverability Status Requested,,NaT,NaT
5,cluster15,active,2212,54730,Argenta,SCE,SCE NOL,PISGAH 230 kV,230,San Bernardino,...,,,,150.01,2024-11-20,2031-05-01,Energy Only Requested,,NaT,NaT
6,cluster15,active,2219,54767,Argos Solar,SCE,SCE NOL,CALCITE 230 kV,230,San Bernadino,...,,,,350.12,2024-11-15,2030-06-01,Energy Only Requested,,NaT,NaT
7,cluster15,active,2349,54509,Aviation,SCE,SCE METRO,LUGO 500 kV,500,San Bernardino,...,600.0,,,600.0,2024-11-18,2028-08-01,Full Capacity Deliverability Status Requested,,NaT,NaT
8,cluster15,active,2305,54761,Bajada,SCE,SCE EASTERN,RED BLUFF 230 kV,230,Riverside,...,,,,350.0,2024-11-15,2030-06-01,Energy Only Requested,,NaT,NaT
9,cluster15,active,2333,55013,Belterra Energy Storage,PGAE,PG&E GBA,BELLOTA 230 kV,230,Calaveras,...,,,,500.0,2024-11-19,2028-04-01,Full Capacity Deliverability Status Requested,,NaT,NaT
