Purpose:
This notebook is run after "01_import_CPA_boundaries.ipynb"
It assumes there is a GitItDoneAnalysis GDB with a "cpa_prj" feature class showing community planning areas in San Diego (output from first notebook).

This notebook has the following steps:
- Use pandas to import the Get It Done complaints from the city of San Diego, pulling the last month of data
- Filter them to drainage-related data
- Do some minor address cleanup
- Check for any outlying data points outside San Diego or rows with missing coordinates
- Standardize address and coordinate cleanup status message
- Create a feature class from the cleaned up data, then reproject it to match the cpa_proj
- Do a spatial join - match the complaints to which CPA they lie in, so analysis can be performed by CPA

In [None]:
import os
import pandas as pd
import arcpy

arcpy.env.overwriteOutput = True

PROJECT_ROOT = r"C:\Users\kris_\OneDrive - Kris Manske\Documents\Classes\BootcampGIS\Wildfire repositories on AWS\GetItDone"


RAW_DIR = os.path.join(PROJECT_ROOT, "data_raw")
WORK_DIR = os.path.join(PROJECT_ROOT, "data_working")
GDB_PATH = os.path.join(WORK_DIR, "GetItDoneAnalysis.gdb")

# Inputs
GETITDONE_CSV = os.path.join(RAW_DIR, "get_it_done_requests_open_datasd.csv")  # your file name
CPA_FC = os.path.join(GDB_PATH, "cpa_prj")  # from Notebook 1

# Outputs
DRAINAGE_CSV = os.path.join(WORK_DIR, "gid_drainage_clean.csv")
DRAINAGE_POINTS_FC = os.path.join(GDB_PATH, "gid_drainage_points_clean")
DRAINAGE_BY_CPA_FC = os.path.join(GDB_PATH, "gid_drainage_by_cpa")

# Optional: set workspace for easier listing
arcpy.env.workspace = GDB_PATH

for p in [GETITDONE_CSV, CPA_FC]:
    if not os.path.exists(p) and not arcpy.Exists(p):
        raise FileNotFoundError(f"Missing required input: {p}")

print("CSV:", GETITDONE_CSV)
print("CPA:", CPA_FC)
print("GDB:", GDB_PATH)


In [3]:
df = pd.read_csv(GETITDONE_CSV)

print("Rows:", len(df))
print("Columns:", list(df.columns))
df.head(3)


Rows: 87693
Columns: ['service_request_id', 'service_request_parent_id', 'sap_notification_number', 'date_requested', 'case_age_days', 'case_record_type', 'service_name', 'service_name_detail', 'date_closed', 'status', 'lat', 'lng', 'street_address', 'zipcode', 'council_district', 'comm_plan_code', 'comm_plan_name', 'park_name', 'case_origin', 'referred', 'iamfloc', 'floc', 'public_description']


Unnamed: 0,service_request_id,service_request_parent_id,sap_notification_number,date_requested,case_age_days,case_record_type,service_name,service_name_detail,date_closed,status,...,zipcode,council_district,comm_plan_code,comm_plan_name,park_name,case_origin,referred,iamfloc,floc,public_description
0,100763,,40300010000.0,2016-08-20T14:46:00,3435,TSW,Street Sweeping,,,In Process,...,,9.0,56.0,Mid-City:City Heights,,Web,,SS-014304,SS-001240,A) The storm drain channel south of 5135 Univ...
1,100777,,40300010000.0,2016-08-20T15:48:00,3435,TSW,Sidewalk Repair Issue,SIDEWALK MINOR REHAB CONTRACT,,In Process,...,,9.0,59.0,Mid-City:Normal Heights,,Web,,SS-000917-SE1,SS-000917,Curb in rubble
2,100985,,40300010000.0,2016-08-22T10:04:00,3433,TSW,Stormwater,DRAIN HEADWALL,,In Process,...,,6.0,15.0,Mira Mesa,,Phone,,HW01082,SS-019619,HILLSIDE ERODING - POSSIBLE BROKEN DRAIN


In [25]:

# Parse request date
df["date_requested_dt"] = pd.to_datetime(df["date_requested"], errors="coerce")

# Cutoff = same calendar day last month
cutoff = pd.Timestamp.today().normalize() - pd.DateOffset(months=1)

print("Cutoff date (same day last month):", cutoff.date())

# Filter to records on/after cutoff
df_recent = df[df["date_requested_dt"] >= cutoff].copy()

print("Recent rows:", len(df_recent))
print(
    "Date range:",
    df_recent["date_requested_dt"].min(),
    "to",
    df_recent["date_requested_dt"].max()
)

Cutoff date (same day last month): 2025-12-17
Recent rows: 14687
Date range: 2025-12-17 02:14:00 to 2026-01-15 22:54:00


In [4]:
# Lowercase helper columns
df["service_name_lc"] = df["service_name"].astype(str).str.lower()
df["service_detail_lc"] = df["service_name_detail"].astype(str).str.lower()

# Start broad, refine later
drainage_mask = (
    df["service_name_lc"].str.contains("storm", na=False) |
    df["service_name_lc"].str.contains("drain", na=False) |
    df["service_detail_lc"].str.contains("storm", na=False) |
    df["service_detail_lc"].str.contains("drain", na=False) |
    df["service_detail_lc"].str.contains("flood", na=False)
)

dr = df[drainage_mask].copy()
print("Drainage candidate rows:", len(dr))

# Peek at the most common categories to tighten the filter
dr["service_name"].value_counts().head(15)


Drainage candidate rows: 3358


service_name
Stormwater                     2276
Stormwater Code Enforcement     962
ROW Maintenance                 117
Parks Issue                       3
Name: count, dtype: int64

In [5]:
# Ensure numeric
dr["lat_num"] = pd.to_numeric(dr["lat"], errors="coerce")
dr["lng_num"] = pd.to_numeric(dr["lng"], errors="coerce")

dr["qa_missing_coords"] = dr["lat_num"].isna() | dr["lng_num"].isna()
dr["qa_missing_address"] = dr["street_address"].isna() | (dr["street_address"].astype(str).str.strip() == "")

# Basic plausible range check (San Diego-ish bounding box)
# (This is a quick sanity check, not a precise boundary test)
dr["qa_coords_out_of_range"] = ~(
    dr["lat_num"].between(32.5, 33.2, inclusive="both") &
    dr["lng_num"].between(-117.4, -116.8, inclusive="both")
)

print("Missing coords:", int(dr["qa_missing_coords"].sum()))
print("Missing address:", int(dr["qa_missing_address"].sum()))
print("Coords out of range:", int(dr["qa_coords_out_of_range"].sum()))


Missing coords: 27
Missing address: 0
Coords out of range: 30


In [17]:
def qa_status(row):
    if row["qa_missing_coords"]:
        return "MISSING_COORDS"
    if row["qa_coords_out_of_range"]:
        return "OUT_OF_RANGE"
    if row["qa_missing_address"]:
        return "MISSING_ADDRESS"
    return "OK"

dr["QA_STATUS"] = dr.apply(qa_status, axis=1)

# quick check
dr["QA_STATUS"].value_counts(dropna=False)


QA_STATUS
OK                3328
MISSING_COORDS      27
OUT_OF_RANGE         3
Name: count, dtype: int64

In [18]:
dr["street_address_clean"] = (
    dr["street_address"]
    .astype(str)
    .str.upper()
    .str.replace(r"\.", "", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)


In [19]:
# Keep only what we need (add/remove fields as desired)
keep_cols = [
    "service_request_id", "service_name", "service_name_detail",
    "public_description", "date_requested", "status",
    "street_address", "street_address_clean",
    "lat_num", "lng_num",
    "qa_missing_coords", "qa_missing_address", "qa_coords_out_of_range", "qa_status",
]

# Some files may have different ID column names; adjust if needed after you inspect columns
existing = [c for c in keep_cols if c in dr.columns]
out = dr[existing].copy()

# Rename coords to lat/lon fields ArcPy can read easily
out = out.rename(columns={"lat_num": "lat", "lng_num": "lon"})

out.to_csv(DRAINAGE_CSV, index=False)
print("Wrote:", DRAINAGE_CSV, "rows:", len(out))


Wrote: C:\Users\kris_\OneDrive - Kris Manske\Documents\Classes\BootcampGIS\Wildfire repositories on AWS\GetItDone\data_working\gid_drainage_clean.csv rows: 3358


---------------------------------------------------
START OF SECTION I REMOVED FROM THE NEW 02 NOTEBOOK
---------------------------------------------------

In [20]:
# Make an XY Event Layer from the CSV
xy_layer = "gid_drainage_xy"

# Spatial reference for incoming coords (WGS84)
wgs84 = arcpy.SpatialReference(4326)

# Create event layer
arcpy.management.MakeXYEventLayer(
    table=DRAINAGE_CSV,
    in_x_field="lon",
    in_y_field="lat",
    out_layer=xy_layer,
    spatial_reference=wgs84
)

# Copy to feature class in GDB
if arcpy.Exists(DRAINAGE_POINTS_FC):
    arcpy.management.Delete(DRAINAGE_POINTS_FC)

arcpy.management.CopyFeatures(xy_layer, DRAINAGE_POINTS_FC)

print("Created points FC:", DRAINAGE_POINTS_FC)
print("Point count:", arcpy.management.GetCount(DRAINAGE_POINTS_FC)[0])


Created points FC: C:\Users\kris_\OneDrive - Kris Manske\Documents\Classes\BootcampGIS\Wildfire repositories on AWS\GetItDone\data_working\GetItDoneAnalysis.gdb\gid_drainage_points_clean
Point count: 3358


In [21]:
# Make an XY Event Layer from the CSV
xy_layer = "gid_drainage_xy"

# Spatial reference for incoming coords (lat/lon) = WGS84
wgs84 = arcpy.SpatialReference(4326)

arcpy.management.MakeXYEventLayer(
    table=DRAINAGE_CSV,
    in_x_field="lon",
    in_y_field="lat",
    out_layer=xy_layer,
    spatial_reference=wgs84
)

# Copy to feature class in GDB (still WGS84)
tmp_points_wgs84 = os.path.join(GDB_PATH, "gid_drainage_points_wgs84")
if arcpy.Exists(tmp_points_wgs84):
    arcpy.management.Delete(tmp_points_wgs84)

arcpy.management.CopyFeatures(xy_layer, tmp_points_wgs84)

print("Created WGS84 points:", tmp_points_wgs84)
print("Count:", arcpy.management.GetCount(tmp_points_wgs84)[0])

# Project to StatePlane CA VI (Feet) to match CPA
TARGET_SR = arcpy.SpatialReference(2229)

if arcpy.Exists(DRAINAGE_POINTS_FC):
    arcpy.management.Delete(DRAINAGE_POINTS_FC)

arcpy.management.Project(
    in_dataset=tmp_points_wgs84,
    out_dataset=DRAINAGE_POINTS_FC,
    out_coor_system=TARGET_SR
)

# ---- CLEANUP: remove intermediate WGS84 feature class ----
arcpy.management.Delete(tmp_points_wgs84)
print("Deleted temporary WGS84 feature class")

print("Projected points FC:", DRAINAGE_POINTS_FC)
print("Projected SR:", arcpy.Describe(DRAINAGE_POINTS_FC).spatialReference.name)

Created WGS84 points: C:\Users\kris_\OneDrive - Kris Manske\Documents\Classes\BootcampGIS\Wildfire repositories on AWS\GetItDone\data_working\GetItDoneAnalysis.gdb\gid_drainage_points_wgs84
Count: 3358
Deleted temporary WGS84 feature class
Projected points FC: C:\Users\kris_\OneDrive - Kris Manske\Documents\Classes\BootcampGIS\Wildfire repositories on AWS\GetItDone\data_working\GetItDoneAnalysis.gdb\gid_drainage_points_clean
Projected SR: NAD_1983_StatePlane_California_V_FIPS_0405_Feet


In [22]:
# Sanity check to make sure both of the layers used in the analysis are in the same projection before doing the join.
print(arcpy.Describe(CPA_FC).spatialReference.name)
print(arcpy.Describe(DRAINAGE_POINTS_FC).spatialReference.name)

NAD_1983_StatePlane_California_V_FIPS_0405_Feet
NAD_1983_StatePlane_California_V_FIPS_0405_Feet


In [23]:
if arcpy.Exists(DRAINAGE_BY_CPA_FC):
    arcpy.management.Delete(DRAINAGE_BY_CPA_FC)

arcpy.analysis.SpatialJoin(
    target_features=CPA_FC,            # polygons
    join_features=DRAINAGE_POINTS_FC,  # points
    out_feature_class=DRAINAGE_BY_CPA_FC,
    join_operation="JOIN_ONE_TO_ONE",
    join_type="KEEP_ALL",
    match_option="INTERSECT"
)

print("Created CPA summary FC:", DRAINAGE_BY_CPA_FC)


Created CPA summary FC: C:\Users\kris_\OneDrive - Kris Manske\Documents\Classes\BootcampGIS\Wildfire repositories on AWS\GetItDone\data_working\GetItDoneAnalysis.gdb\gid_drainage_by_cpa


In [24]:
fc = DRAINAGE_BY_CPA_FC

# Rename Join_Count → COMPLAINT_CNT
fields = [f.name for f in arcpy.ListFields(fc)]
if "Join_Count" in fields and "COMPLAINT_CNT" not in fields:
    arcpy.management.AlterField(fc, "Join_Count", "COMPLAINT_CNT", "COMPLAINT_CNT")

print("Aggregated CPA schema standardized")

Aggregated CPA schema standardized
