In [1]:
import os
import pandas as pd

# === Configuration ===
OUTPUT_DIR = "UnmappedMarkersPerCounty"

# Text forms (after str(), strip() and lower()) that count as a set flag.
# Covers bool True, the numbers 1 / 1.0 and their textual spellings.
TRUE_STRINGS = ("true", "1", "1.0")


def without_hmdb_ref(frame):
    """Return the rows of ``frame`` whose ``ref:hmdb`` value is NaN or blank.

    A value is blank when it is NaN, or when its string form is empty after
    stripping surrounding whitespace.

    Raises:
        KeyError: if ``frame`` has no ``ref:hmdb`` column.
    """
    ref = frame["ref:hmdb"]
    return frame[ref.isna() | (ref.astype(str).str.strip() == "")]


def flag_is_true(frame, column):
    """Return a boolean mask (aligned to ``frame``) of rows where ``column`` is set.

    The column is compared through its text form so that bool ``True``,
    numeric ``1`` and strings such as ``"true"`` / ``" TRUE "`` all count as
    set, while NaN / None / ``False`` / anything else do not.

    If ``column`` does not exist the mask is all-False. The previous
    ``frame.get(column) == True`` collapsed to a scalar in that case, and with
    both flag columns absent the final ``~`` produced ``-1`` and a ``KeyError``.
    """
    if column not in frame.columns:
        return pd.Series(False, index=frame.index, dtype=bool)
    as_text = frame[column].astype(str).str.strip().str.lower()
    return as_text.isin(TRUE_STRINGS)


def drop_missing_or_private(frame):
    """Return ``frame`` without rows flagged ``isMissing`` or ``isPrivate``."""
    excluded = flag_is_true(frame, "isMissing") | flag_is_true(frame, "isPrivate")
    return frame[~excluded]


def county_filename(county):
    """Build the CSV file name for a county: spaces -> ``_``, ``/`` -> ``-``."""
    safe_name = str(county).replace(" ", "_").replace("/", "-")
    return f"{safe_name}.csv"


# Runs when this cell is executed in a notebook or as a script (``__name__`` is
# "__main__" in both cases). It is skipped when the code is imported as a
# module, so the helpers above can be reused without touching the disk.
if __name__ == "__main__":
    # === Create output directory if missing ===
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # === Load data ===
    df = pd.read_csv("data.csv", low_memory=False)

    # === Base filter: ref:hmdb is empty/blank/NaN ===
    base_filtered = without_hmdb_ref(df)

    # === Remove if isMissing is set OR isPrivate is set ===
    filtered = drop_missing_or_private(base_filtered)

    # groupby() drops rows whose key is NaN; report them instead of losing
    # them silently.
    no_county = int(filtered["addr:county"].isna().sum())
    if no_county:
        print(f"Note: {no_county} rows have no addr:county and are not exported")

    # === Group by county and export ===
    for county, group in filtered.groupby("addr:county"):
        outfile = os.path.join(OUTPUT_DIR, county_filename(county))

        group.to_csv(outfile, index=False)  # overwrite is default behavior
        print(f"Saved: {outfile} ({len(group)} rows)")

Saved: UnmappedMarkersPerCounty/Anderson.csv (62 rows)
Saved: UnmappedMarkersPerCounty/Andrews.csv (2 rows)
Saved: UnmappedMarkersPerCounty/Angelina.csv (33 rows)
Saved: UnmappedMarkersPerCounty/Aransas.csv (1 rows)
Saved: UnmappedMarkersPerCounty/Archer.csv (16 rows)
Saved: UnmappedMarkersPerCounty/Armstrong.csv (6 rows)
Saved: UnmappedMarkersPerCounty/Atascosa.csv (9 rows)
Saved: UnmappedMarkersPerCounty/Austin.csv (41 rows)
Saved: UnmappedMarkersPerCounty/Bailey.csv (2 rows)
Saved: UnmappedMarkersPerCounty/Bandera.csv (8 rows)
Saved: UnmappedMarkersPerCounty/Bastrop.csv (48 rows)
Saved: UnmappedMarkersPerCounty/Baylor.csv (3 rows)
Saved: UnmappedMarkersPerCounty/Bee.csv (11 rows)
Saved: UnmappedMarkersPerCounty/Bell.csv (118 rows)
Saved: UnmappedMarkersPerCounty/Bexar.csv (92 rows)
Saved: UnmappedMarkersPerCounty/Blanco.csv (4 rows)
Saved: UnmappedMarkersPerCounty/Bosque.csv (32 rows)
Saved: UnmappedMarkersPerCounty/Bowie.csv (40 rows)
Saved: UnmappedMarkersPerCounty/Brazoria.csv (2

In [5]:
# Rows of `filtered` per county, in groupby (sorted-key) order.
# (Append .sort_values(ascending=False) for a ranked overview instead.)
counts = filtered.groupby("addr:county").size()

# Display only the counties that have at most one row.
counts.loc[counts.le(1)]

addr:county
Aransas       1
Briscoe       1
Castro        1
Coke          1
Cottle        1
Crockett      1
Dallam        1
Donley        1
Floyd         1
Foard         1
Hansford      1
Hartley       1
Hemphill      1
Jeff Davis    1
King          1
Lipscomb      1
Martin        1
Moore         1
Ochiltree     1
Schleicher    1
Yoakum        1
dtype: int64