# Clean Cluster 14

### Imports

In [1]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from IPython.display import display, Markdown

### Define Paths

In [2]:
CWD = Path.cwd()
ROOT = CWD.parent if CWD.name.lower() == "notebooks" else CWD

DATA = ROOT / "data"
RAW = DATA / "raw"
PROCESSED = DATA / "processed"

RAW.mkdir(parents=True, exist_ok=True)
PROCESSED.mkdir(parents=True, exist_ok=True)

print("Python:", sys.version.split()[0])
print("RAW:", RAW)
print("PROCESSED:", PROCESSED)

Python: 3.11.9
RAW: C:\Users\danci\Interconnection-Queue-Intelligence\Interconnection-Queue-Intelligence-1\data\raw
PROCESSED: C:\Users\danci\Interconnection-Queue-Intelligence\Interconnection-Queue-Intelligence-1\data\processed


### Select Cluster 14 File

In [3]:
raw_files = sorted([p for p in RAW.iterdir() if p.is_file() and not p.name.startswith("~$")])

candidates = [p for p in raw_files if ("cluster" in p.name.lower() and "14" in p.name.lower())]
cluster14_path = candidates[0] if candidates else None

print("Detected:", cluster14_path.name if cluster14_path else None)
if cluster14_path is None:
    raise FileNotFoundError("Could not find Cluster 14 file. Make sure filename includes 'cluster' and '14'.")

Detected: PreliminaryCluster14ProjectListasofMay20-2021.xlsx


### Helpers

In [4]:
def coerce_float(x):
    if pd.isna(x): return np.nan
    s = str(x).strip().replace(",", "")
    if s == "": return np.nan
    return pd.to_numeric(s, errors="coerce")

def clean_text(x):
    if pd.isna(x): return ""
    return str(x).strip()

def normalize_tech(x):
    s = clean_text(x).lower()
    if s == "": return ""
    if any(k in s for k in ["battery", "storage", "bess"]): return "storage"
    if any(k in s for k in ["solar", "pv", "photovoltaic"]): return "solar"
    if "wind" in s: return "wind"
    return clean_text(x)

### Load Sheets

In [5]:
xls = pd.ExcelFile(cluster14_path)
display(Markdown(f"**Workbook:** `{cluster14_path.name}`"))
display(pd.DataFrame({"sheet_in_file": xls.sheet_names}))

target_sheet = "C14 Prelim Cluster Overview"
if target_sheet not in xls.sheet_names:
    raise ValueError(f"Expected sheet not found: {target_sheet}")

df14_raw = pd.read_excel(cluster14_path, sheet_name=target_sheet)

display(Markdown(f"**Raw shape:** {df14_raw.shape}"))
display(df14_raw.head(5))
display(pd.DataFrame({"column": df14_raw.columns.astype(str).tolist()}))

**Workbook:** `PreliminaryCluster14ProjectListasofMay20-2021.xlsx`

Unnamed: 0,sheet_in_file
0,Notes
1,C14 Prelim Cluster Overview


**Raw shape:** (363, 12)

Unnamed: 0,PTO,Affected PTO,Area,Total MW @ POI,Technology\n#1,MW-T1,Technology\n#2,MW-T2,Technology\n#3,MW-T3,POI,Voltage\n(kV)
0,DCRT,SCE,Eastern,500.0,Battery,516.7638,Solar PV,516.7638,,,Cielo Azul Substation,500.0
1,DCRT,SCE,Eastern,350.0,Battery,350.0,Solar PV,350.0,,,Delaney Substation,500.0
2,DCRT,SCE,Eastern,2000.0,Battery,2000.0,,,,,Cielo Azul Substation,500.0
3,DCRT,SCE,Eastern,350.0,Solar PV,357.53,,,,,Delaney-Colorado River,500.0
4,DCRT,SCE,Eastern,700.0,Battery,718.81,Solar PV,718.81,,,Delaney-Colorado River,500.0


Unnamed: 0,column
0,PTO
1,Affected PTO
2,Area
3,Total MW @ POI
4,Technology\n#1
5,MW-T1
6,Technology\n#2
7,MW-T2
8,Technology\n#3
9,MW-T3


### Rename + Keep Columns

In [6]:
RENAME_14 = {
    "PTO": "pto",
    "Affected PTO": "affected_pto",  
    "Area": "study_area",
    "Total MW @ POI": "mw_poi",
    "Technology\n#1": "tech_1",
    "MW-T1": "mw_1",
    "Technology\n#2": "tech_2",
    "MW-T2": "mw_2",
    "Technology\n#3": "tech_3",
    "MW-T3": "mw_3",
    "POI": "poi",
}

present = {k: v for k, v in RENAME_14.items() if k in df14_raw.columns}
missing = [k for k in RENAME_14.keys() if k not in df14_raw.columns]

df14 = df14_raw.rename(columns=present).copy()
display(pd.DataFrame({"missing": missing}) if missing else pd.DataFrame({"missing": ["(none)"]}))

Unnamed: 0,missing
0,(none)


### Standardize to Canonical

In [7]:
# Identity
df14["source"] = "cluster14"
df14["sheet_outcome"] = "intake_snapshot"
df14["status_raw"] = ""  # for schema compatibility later

# Clean strings
for c in ["pto", "affected_pto", "study_area", "poi", "status_raw"]:
    if c in df14.columns:
        df14[c] = df14[c].apply(clean_text)

# Normalize tech
for t in ["tech_1", "tech_2", "tech_3"]:
    if t in df14.columns:
        df14[t] = df14[t].apply(normalize_tech)

# Coerce numeric
for m in ["mw_poi", "mw_1", "mw_2", "mw_3", "voltage_kv"]:
    if m in df14.columns:
        df14[m] = df14[m].apply(coerce_float)

# Canonical-ish column order
CANON_COLS = [
    "source", "sheet_outcome",
    "pto", "affected_pto", "study_area", "poi", "voltage_kv",
    "tech_1", "mw_1", "tech_2", "mw_2", "tech_3", "mw_3",
    "mw_poi",
    "status_raw",
]
CANON_COLS = [c for c in CANON_COLS if c in df14.columns]
df14 = df14[CANON_COLS].copy()

display(Markdown(f"**Standardized shape:** {df14.shape}"))
display(df14.head(5))

**Standardized shape:** (363, 14)

Unnamed: 0,source,sheet_outcome,pto,affected_pto,study_area,poi,tech_1,mw_1,tech_2,mw_2,tech_3,mw_3,mw_poi,status_raw
0,cluster14,intake_snapshot,DCRT,SCE,Eastern,Cielo Azul Substation,storage,516.7638,solar,516.7638,,,500.0,
1,cluster14,intake_snapshot,DCRT,SCE,Eastern,Delaney Substation,storage,350.0,solar,350.0,,,350.0,
2,cluster14,intake_snapshot,DCRT,SCE,Eastern,Cielo Azul Substation,storage,2000.0,,,,,2000.0,
3,cluster14,intake_snapshot,DCRT,SCE,Eastern,Delaney-Colorado River,solar,357.53,,,,,350.0,
4,cluster14,intake_snapshot,DCRT,SCE,Eastern,Delaney-Colorado River,storage,718.81,solar,718.81,,,700.0,


### QA Checks

In [8]:
# Basic missingness and distribution checks
fields = ["mw_poi", "voltage_kv", "pto", "study_area", "poi", "tech_1"]
qa = []
for f in fields:
    if f in df14.columns:
        qa.append((f, df14[f].isna().mean(), df14[f].nunique(dropna=True)))
display(pd.DataFrame(qa, columns=["field", "share_missing", "n_unique"]))

if "mw_poi" in df14.columns:
    display(Markdown("**MW POI summary:**"))
    display(df14["mw_poi"].describe().to_frame().T)

if "tech_1" in df14.columns:
    display(Markdown("**Tech slot 1 counts:**"))
    display(df14["tech_1"].value_counts(dropna=False).to_frame("count"))

Unnamed: 0,field,share_missing,n_unique
0,mw_poi,0.0,78
1,pto,0.0,6
2,study_area,0.0,11
3,poi,0.0,237
4,tech_1,0.0,5


**MW POI summary:**

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mw_poi,363.0,291.998033,263.516511,0.0,125.0,210.0,400.0,2000.0


**Tech slot 1 counts:**

Unnamed: 0_level_0,count
tech_1,Unnamed: 1_level_1
storage,336
solar,15
wind,9
Steam Turbine,2
Steam Turbine Geothermal,1


### Save Output

In [9]:
out_path = PROCESSED / "cluster14_clean.csv"
df14.to_csv(out_path, index=False)

display(Markdown(f"**Saved:** `{out_path}`"))
display(df14.head(5))

**Saved:** `C:\Users\danci\Interconnection-Queue-Intelligence\Interconnection-Queue-Intelligence-1\data\processed\cluster14_clean.csv`

Unnamed: 0,source,sheet_outcome,pto,affected_pto,study_area,poi,tech_1,mw_1,tech_2,mw_2,tech_3,mw_3,mw_poi,status_raw
0,cluster14,intake_snapshot,DCRT,SCE,Eastern,Cielo Azul Substation,storage,516.7638,solar,516.7638,,,500.0,
1,cluster14,intake_snapshot,DCRT,SCE,Eastern,Delaney Substation,storage,350.0,solar,350.0,,,350.0,
2,cluster14,intake_snapshot,DCRT,SCE,Eastern,Cielo Azul Substation,storage,2000.0,,,,,2000.0,
3,cluster14,intake_snapshot,DCRT,SCE,Eastern,Delaney-Colorado River,solar,357.53,,,,,350.0,
4,cluster14,intake_snapshot,DCRT,SCE,Eastern,Delaney-Colorado River,storage,718.81,solar,718.81,,,700.0,
