In [2]:
!pip install geopandas

Collecting geopandas
  Downloading geopandas-1.1.1-py3-none-any.whl.metadata (2.3 kB)
Collecting pyogrio>=0.7.2 (from geopandas)
  Downloading pyogrio-0.11.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (5.3 kB)
Collecting pyproj>=3.5.0 (from geopandas)
  Downloading pyproj-3.7.2-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (31 kB)
Collecting shapely>=2.0.0 (from geopandas)
  Downloading shapely-2.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading geopandas-1.1.1-py3-none-any.whl (338 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m338.4/338.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pyogrio-0.11.1-cp311-cp311-manylinux_2_28_x86_64.whl (27.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.7/27.7 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading pyproj-3.7.2-cp311-cp311-manylinux_2_28_x86_64.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━

In [5]:
import os, io, zipfile, requests
import geopandas as gpd
import pandas as pd
from pathlib import Path

# === 路径 ===
RAW_DIR   = Path("data/raw")
CLEAN_DIR = Path("data/clean")
RAW_DIR.mkdir(parents=True, exist_ok=True)
CLEAN_DIR.mkdir(parents=True, exist_ok=True)

# ONS LSOA 2011（E+W，Super Generalised Clipped v4，Shapefile下载）
# 来源（data.gov.uk → ArcGIS Hub 直链）：
LSOA_ZIP_URL = ("https://open-geography-portalx-ons.hub.arcgis.com/api/download/v1/items/"
                "f23b8af6504640558a5100dfcd19a7ee/shapefile?layers=0")
LSOA_ZIP     = RAW_DIR / "lsoa_2011_ew_bsc_v4.zip"
LSOA_DIR     = RAW_DIR / "lsoa_2011_ew_bsc_v4"

# 你已有的 IMD 表（File_1 - IMD2019）
IMD_XLSX     = "data/raw/dluhc_imd_2019_tables.xlsx"

# 可选：已有伦敦边界（若 LSOA 属性里无 LAD 代码则会用它空问筛选）
BORO_GPKG    = "data/interim/gla_boroughs_2024_27700.gpkg"

# 输出
OUT_GPKG  = CLEAN_DIR / "imd2019_london_lsoa_27700.gpkg"
OUT_LAYER = "imd2019_lsoa_27700"

# === 1) 下载并解压 LSOA 2011（E+W） ===
if not LSOA_ZIP.exists():
    print("Downloading LSOA 2011 (BSC v4) ...")
    r = requests.get(LSOA_ZIP_URL, timeout=120)
    r.raise_for_status()
    LSOA_ZIP.write_bytes(r.content)

if not LSOA_DIR.exists():
    LSOA_DIR.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(LSOA_ZIP, "r") as z:
        z.extractall(LSOA_DIR)

# 找 shapefile
shps = list(LSOA_DIR.rglob("*.shp"))
assert len(shps) > 0, "未在压缩包中找到 .shp"
lsoa = gpd.read_file(shps[0])

# 统一到 BNG(27700)
if lsoa.crs is None or lsoa.crs.to_epsg() != 27700:
    lsoa = lsoa.to_crs(27700)

# 自动识别关键列
low = {c.lower(): c for c in lsoa.columns}
lsoa_cd = (low.get("lsoa11cd")
           or next(c for c in lsoa.columns if "lsoa" in c.lower() and "cd" in c.lower()))
lad_cd  = (low.get("lad11cd") or low.get("ladcd") or low.get("laua")
           or next((c for c in lsoa.columns if "lad" in c.lower() and "cd" in c.lower()), None))

# === 2) 仅保留伦敦范围（优先用 LAD 代码以 E090* 过滤；否则用空间裁切）===
if lad_cd is not None and lsoa[lad_cd].astype(str).str.startswith("E090").any():
    lsoa_ldn = lsoa[lsoa[lad_cd].astype(str).str.startswith("E090")].copy()
else:
    # 退路：用你已有的 borough 边界空间筛选
    print("LAD 列缺失或不含 E090，改用空间筛选（需要 borough 边界） …")
    boro = gpd.read_file(BORO_GPKG)
    if boro.crs is None or boro.crs.to_epsg() != 27700:
        boro = boro.to_crs(27700)
    # 溶解成一个伦敦面
    london = boro.dissolve().geometry.iloc[0]
    lsoa_ldn = lsoa[lsoa.intersects(london)].copy()

# 只保留必需字段
keep = [lsoa_cd] + ([lad_cd] if lad_cd else [])
lsoa_ldn = lsoa_ldn[keep + ["geometry"]].rename(columns={lsoa_cd: "lsoa11cd"})

# === 3) 读 IMD2019，并入（按 LSOA11CD） ===
imd = pd.read_excel(IMD_XLSX, sheet_name="IMD2019", dtype=str)
imap = {c.lower(): c for c in imd.columns}
lsoa_c = imap.get("lsoa code (2011)") or next(c for c in imd.columns if "lsoa" in c.lower() and "2011" in c.lower())
rank_c = imap.get("index of multiple deprivation (imd) rank") or next(c for c in imd.columns if "imd" in c.lower() and "rank" in c.lower())
deci_c = imap.get("index of multiple deprivation (imd) decile") or next(c for c in imd.columns if "imd" in c.lower() and "decile" in c.lower())

imd_tidy = (imd[[lsoa_c, rank_c, deci_c]]
            .rename(columns={lsoa_c:"lsoa11cd", rank_c:"imd2019_rank", deci_c:"imd2019_decile"}))
imd_tidy["imd2019_rank"]   = pd.to_numeric(imd_tidy["imd2019_rank"], errors="coerce").astype("Int64")
imd_tidy["imd2019_decile"] = pd.to_numeric(imd_tidy["imd2019_decile"], errors="coerce").astype("Int64")

g = lsoa_ldn.merge(imd_tidy, on="lsoa11cd", how="left")
print("IMD decile 缺失（伦敦 LSOA）：", g["imd2019_decile"].isna().sum(), "/", len(g))

# === 4) 写出 GPKG ===
g.to_file(OUT_GPKG, layer=OUT_LAYER, driver="GPKG")
print(f"写出完成：{OUT_GPKG} (layer={OUT_LAYER}) | CRS={g.crs} | features={len(g)}")

import matplotlib.pyplot as plt
from pathlib import Path

g = gpd.read_file(
    "data/clean/imd2019_london_lsoa_27700.gpkg",
    layer="imd2019_lsoa_27700", engine="pyogrio"
)

g["imd2019_decile"] = pd.to_numeric(g["imd2019_decile"], errors="coerce")
g["decile_f"] = g["imd2019_decile"].astype("float64")

ax = g.plot(
    column="decile_f",
    legend=True,
    figsize=(8, 8),
    edgecolor="white",
    linewidth=0.15,
    # cmap="viridis",  # 需要就解开
)

ax.set_title("IMD 2019 Decile (1=poorest, 10=the least poor)")
ax.set_axis_off()

# --- 保存 ---
out_dir = Path("outputs/figs")
out_dir.mkdir(parents=True, exist_ok=True)
fig = ax.figure  # 关键：从 Axes 取 Figure
fig.tight_layout()
fig.savefig(out_dir / "london_imd.png", dpi=300, bbox_inches="tight", facecolor="white")
fig.savefig(out_dir / "london_imd.pdf", bbox_inches="tight", facecolor="white")
plt.close(fig)

print("Saved to:", out_dir.resolve())








LAD 列缺失或不含 E090，改用空间筛选（需要 borough 边界） …
IMD decile 缺失（伦敦 LSOA）： 0 / 4968
写出完成：data/clean/imd2019_london_lsoa_27700.gpkg (layer=imd2019_lsoa_27700) | CRS=EPSG:27700 | features=4968
Saved to: /home/jovyan/work/outputs/figs
