In [2]:
%pip -q install geopandas pyogrio shapely pyproj pandas openpyxl


Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path
import pandas as pd, geopandas as gpd, pyogrio, datetime, textwrap, os
from shapely.validation import make_valid


# EPSG code of the British National Grid, the target CRS for every output layer.
BNG = 27700

# All project folders are resolved relative to the current working directory.
PROJECT_ROOT = Path.cwd()
RAW = PROJECT_ROOT / "data" / "raw"
INTERIM = PROJECT_ROOT / "data" / "interim"
DOCS = PROJECT_ROOT / "docs"
LOGS = PROJECT_ROOT / "logs"

# Output folders are created up front; RAW is only read from, so it is not created here.
for p in (INTERIM, DOCS, LOGS):
    p.mkdir(parents=True, exist_ok=True)

def read_any(path, layer=None):
    """Read a vector dataset with geopandas using the pyogrio engine.

    Parameters
    ----------
    path : str or pathlib.Path
        Dataset to open.
    layer : str or int, optional
        Layer name or index for multi-layer sources. ``None`` (or an empty
        string) leaves the layer choice to the engine.

    Returns
    -------
    The object returned by ``gpd.read_file``.
    """
    if layer is None or layer == "":
        return gpd.read_file(path, engine="pyogrio")
    # The layer is passed as a keyword argument: a "path|layer=name" suffix is a
    # QGIS-style URI, not a path the pyogrio engine can open. Testing against
    # None (not truthiness) also lets layer index 0 through.
    return gpd.read_file(path, layer=layer, engine="pyogrio")

def to_27700(gdf):
    """Return ``gdf`` in the CRS identified by ``BNG``, reprojecting if needed.

    Raises
    ------
    ValueError
        If the layer carries no CRS at all, since it cannot be reprojected.
    """
    crs = gdf.crs
    if crs is None:
        raise ValueError("Layer has no CRS defined.")

    # Compare by EPSG code when one is available; otherwise compare the CRS object itself.
    current = crs.to_epsg() or crs
    if current != BNG:
        return gdf.to_crs(BNG)
    return gdf

def make_valid_if_poly(gdf):
    """Repair geometries when the layer contains any polygon features.

    If at least one row is a ``Polygon`` or ``MultiPolygon``, a copy of the
    frame is returned with ``make_valid`` applied to every non-missing
    geometry. Otherwise the input frame is returned unchanged (same object).
    The input frame is never modified.
    """
    if not gdf.geom_type.isin(["Polygon", "MultiPolygon"]).any():
        return gdf

    repaired = gdf.copy()
    # Missing geometries are passed through untouched: make_valid cannot
    # process None, so a single null row would otherwise abort the whole repair.
    repaired["geometry"] = repaired.geometry.apply(
        lambda geom: geom if geom is None else make_valid(geom)
    )
    return repaired

def write_gpkg(gdf, out_path, layer):
    """Write ``gdf`` to ``out_path`` as GeoPackage layer ``layer`` via pyogrio."""
    options = {"layer": layer, "driver": "GPKG", "engine": "pyogrio"}
    gdf.to_file(out_path, **options)

def log_append(md):
    """Append ``md`` plus a trailing newline to ``LOGS/cleaning_log.md`` (UTF-8).

    The file is opened in append mode, so it is created on first use and
    earlier entries are kept.
    """
    log_path = LOGS / "cleaning_log.md"
    # The context manager closes (and flushes) the handle deterministically
    # instead of leaving that to garbage collection.
    with log_path.open("a", encoding="utf-8") as handle:
        handle.write(md + "\n")

def find_first(pattern):
    """Return the first path under ``RAW`` (searched recursively) matching ``pattern``.

    Returns ``None`` when nothing matches. "First" is whatever ``rglob``
    yields first; no ordering is applied.
    """
    for match in RAW.rglob(pattern):
        return match
    return None


In [3]:
# Locate the borough boundary shapefile under the raw data folder.
shp = find_first("London_Borough_Excluding_MHW.shp")
assert shp, "London_Borough_Excluding_MHW.shp not found under data/raw/**"

# Repair polygon geometries first, then bring the layer onto EPSG:27700.
gdf = make_valid_if_poly(read_any(shp))
gdf = to_27700(gdf)

# Persist the cleaned layer and report how many features were written.
out = INTERIM / "gla_boroughs_2024_27700.gpkg"
write_gpkg(gdf, out, layer="boroughs")
print("Boroughs:", len(gdf), "→", out)


Boroughs: 33 → /home/jovyan/work/data/interim/gla_boroughs_2024_27700.gpkg


In [4]:
# Source GeoJSON files and their GeoPackage destinations.
sta_src = RAW / "tfl_elizabeth_line_stations_2022.geojson"
rte_src = RAW / "tfl_elizabeth_line_route_2022.geojson"
sta_out = INTERIM / "tfl_elizabeth_line_stations_2022_27700.gpkg"
rte_out = INTERIM / "tfl_elizabeth_line_route_2022_27700.gpkg"

# Read each layer and reproject it to EPSG:27700.
g_sta = to_27700(read_any(sta_src))
g_rte = to_27700(read_any(rte_src))

# Write one GeoPackage per layer, then report the feature counts.
write_gpkg(g_sta, sta_out, "stations")
write_gpkg(g_rte, rte_out, "route")
print("Stations:", len(g_sta), "Route:", len(g_rte))


Stations: 41 Route: 10


In [5]:
%pip -q install geopandas pyogrio shapely pandas openpyxl

from pathlib import Path
import geopandas as gpd
from shapely.validation import make_valid

ROOT = Path.cwd()
RAW, INTERIM = ROOT/"data/raw", ROOT/"data/interim"
INTERIM.mkdir(parents=True, exist_ok=True)

# Locate the PTAL contours TAB file; fail with a readable message instead of
# the bare StopIteration that next() raises when nothing matches.
tab = next(RAW.rglob("*PTAL*Contours*.TAB"), None)
if tab is None:
    raise FileNotFoundError(f"No file matching '*PTAL*Contours*.TAB' found under {RAW}")
g = gpd.read_file(tab, engine="pyogrio")

# A layer without any CRS cannot be reprojected; stop rather than write an
# unlabelled layer into a file whose name claims EPSG:27700.
if g.crs is None:
    raise ValueError("Layer has no CRS defined.")
if g.crs.to_epsg() != 27700:
    g = g.to_crs(27700)

if g.geom_type.isin(["Polygon","MultiPolygon"]).any():
    g = g.copy()
    # Missing geometries are passed through; make_valid cannot process None.
    g["geometry"] = g.geometry.apply(lambda geom: geom if geom is None else make_valid(geom))

# Normalise the PTAL attribute name to "ptal". Only one column is renamed, and
# only when "ptal" does not already exist, so the frame never ends up with two
# columns both called "ptal".
if "ptal" not in g.columns:
    ptal_like = [c for c in g.columns if c.lower() in {"ptal","ptal_value","level"}]
    if ptal_like:
        g = g.rename(columns={ptal_like[0]: "ptal"})

out_poly = INTERIM/"gla_ptal_contours_2015_27700.gpkg"
g.to_file(out_poly, layer="ptal_contours", driver="GPKG", engine="pyogrio")
print("Saved:", out_poly, "| n =", len(g))


Note: you may need to restart the kernel to use updated packages.
Saved: /home/jovyan/work/data/interim/gla_ptal_contours_2015_27700.gpkg | n = 9


In [1]:
from pathlib import Path
import pandas as pd, geopandas as gpd
from shapely.geometry import Point

ROOT = Path.cwd()
RAW, INTERIM = ROOT/"data/raw", ROOT/"data/interim"
INTERIM.mkdir(parents=True, exist_ok=True)

# 1) Read the PTAL grid workbook (no dtype forcing)
# The "*.xls*" pattern also matches the "~$..." lock file Excel creates while a
# workbook is open, so those are skipped. Taking the smallest path makes the
# choice reproducible when several workbooks match.
xlsx = min(
    (p for p in RAW.rglob("*PTAL*Grid*Values*.xls*") if not p.name.startswith("~$")),
    default=None,
)
if xlsx is None:
    raise FileNotFoundError(f"No workbook matching '*PTAL*Grid*Values*.xls*' found under {RAW}")
df = pd.read_excel(xlsx)  # let pandas infer; we'll clean below

# 2) Robust column picking
def pick(cols, candidates):
    """Return the first column label whose normalised form equals a candidate.

    Each label is compared after ``str()``, ``strip()`` and ``lower()``, so
    non-string labels (e.g. integer headers) are tolerated rather than raising.
    Candidates are compared as given and are tried in order, so earlier
    candidates take priority over later ones.

    Parameters
    ----------
    cols : iterable
        Column labels to search.
    candidates : iterable of str
        Wanted names, already lower-case and stripped.

    Returns
    -------
    The original (un-normalised) matching label, or ``None`` if nothing matches.
    """
    # Normalise once up front instead of once per candidate.
    normalised = [(label, str(label).strip().lower()) for label in cols]
    for want in candidates:
        for label, key in normalised:
            if key == want:
                return label
    return None

xcol = pick(df.columns, ["x","easting","eastings"])
ycol = pick(df.columns, ["y","northing","northings"])
aicol = pick(df.columns, ["ai2015","ai_2015","ai"])
pcol = pick(df.columns, ["ptal2015","ptal_2015","ptal"])

assert all([xcol,ycol,aicol,pcol]), f"Columns not found: {xcol,ycol,aicol,pcol}"

df = df[[xcol,ycol,aicol,pcol]].rename(columns={
    xcol:"easting", ycol:"northing", aicol:"ai_2015", pcol:"ptal_2015_raw"
})

# 3) Coerce numeric coords + AI; keep PTAL band as string
df["easting"]  = pd.to_numeric(df["easting"], errors="coerce")
df["northing"] = pd.to_numeric(df["northing"], errors="coerce")
df["ai_2015"]  = pd.to_numeric(df["ai_2015"], errors="coerce")

# Normalise the band labels. Two details matter here:
#  - a value that arrives as a float (e.g. 2.0) would stringify to "2.0" and
#    miss the lookup below, so a trailing ".0" is removed;
#  - astype(str) turns a missing cell into the literal text "nan", so missing
#    cells are restored to missing after the string clean-up.
band = (
    df["ptal_2015_raw"].astype(str).str.strip().str.lower()
    .str.replace(r"\.0$", "", regex=True)
    .where(df["ptal_2015_raw"].notna())
)

# Map lettered bands to an ordinal 0–10
ptal_map = {"0":0, "1":1, "1a":2, "1b":3, "2":4, "3":5, "4":6, "5":7, "6":8, "6a":9, "6b":10}
df["ptal_2015_band"] = band
df["ptal_2015_num"]  = band.map(ptal_map)  # numeric convenience column; unmapped or missing bands stay NaN

# 4) Drop rows missing coordinates
df = df.dropna(subset=["easting","northing"])

# 5) Save CSV
csv_out = INTERIM/"gla_ptal_grid_2015.csv"
df[["easting","northing","ai_2015","ptal_2015_band","ptal_2015_num"]].to_csv(csv_out, index=False)
print("CSV →", csv_out, "| rows:", len(df))

# 6) Save as point layer (EPSG:27700)
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df["easting"], df["northing"]),
    crs=27700
)
gpkg_out = INTERIM/"gla_ptal_grid_2015_27700.gpkg"
# engine="pyogrio" matches every other GeoPackage write in this notebook.
gdf[["ai_2015","ptal_2015_band","ptal_2015_num","geometry"]].to_file(
    gpkg_out, layer="ptal_grid", driver="GPKG", engine="pyogrio"
)
print("GPKG →", gpkg_out, "| n:", len(gdf))


CSV → /home/jovyan/work/data/interim/gla_ptal_grid_2015.csv | rows: 159451
GPKG → /home/jovyan/work/data/interim/gla_ptal_grid_2015_27700.gpkg | n: 159451


In [8]:
from pathlib import Path
import geopandas as gpd, pyogrio
from shapely.validation import make_valid
import numpy as np

ROOT = Path.cwd()
RAW, INTERIM = ROOT/"data/raw", ROOT/"data/interim"
INTERIM.mkdir(parents=True, exist_ok=True)

# 1) Read
caz_src = RAW / "gla_caz_boundary_2023.gpkg"
try:
    g = gpd.read_file(caz_src, engine="pyogrio")
except Exception:
    # Fallback intended for multi-layer files: read the first listed layer.
    # The layer is passed as a keyword argument; a "path|layer=name" suffix is
    # a QGIS-style URI, not a path the pyogrio engine can open.
    names = [row[0] for row in pyogrio.list_layers(caz_src).tolist()]
    g = gpd.read_file(caz_src, layer=names[0], engine="pyogrio")

# 2) CRS check (this dataset is effectively BNG; if its WKT is non-standard,
#    just stamp the EPSG code on it)
minx, miny, maxx, maxy = g.total_bounds
epsg = g.crs.to_epsg() if g.crs else None
# Heuristic: coordinate magnitudes above 1000 cannot be degrees, so they are
# taken to be projected units.
looks_metric = max(abs(maxx), abs(maxy)) > 1000

if (g.crs is None or epsg is None) and looks_metric:
    g = g.set_crs(27700, allow_override=True)  # fix the label only, no reprojection
elif epsg != 27700:
    g = g.to_crs(27700)                        # reproject only when the CRS really differs

# 3) Repair geometries + dissolve into a single CAZ polygon
# Missing geometries are passed through; make_valid cannot process None.
g["geometry"] = g.geometry.apply(lambda geom: geom if geom is None else make_valid(geom))
caz = g.dissolve().reset_index(drop=True).assign(name="CAZ")


# 4) Write out
out = INTERIM / "gla_caz_boundary_2023_27700.gpkg"
caz.to_file(out, layer="caz", driver="GPKG", engine="pyogrio")
print(f"Saved: {out} | n={len(caz)} | crs={caz.crs} | bounds={caz.total_bounds}")


Saved: /home/jovyan/work/data/interim/gla_caz_boundary_2023_27700.gpkg | n=1 | crs=EPSG:27700 | bounds=[525850.70678212 176752.17918521 534348.77357172 184194.47686008]


In [9]:
from shapely.validation import make_valid
import geopandas as gpd
from pathlib import Path

BNG = 27700

# Define the project paths here as well, so this cell does not depend on an
# earlier cell having been run in the same kernel.
ROOT = Path.cwd()
RAW, INTERIM = ROOT/"data/raw", ROOT/"data/interim"
INTERIM.mkdir(parents=True, exist_ok=True)

tc_src = RAW / "gla_town_centre_boundaries_2023.gpkg"   # original source file name
g = gpd.read_file(tc_src, engine="pyogrio")

# CRS handling:
#  - no CRS, or a custom WKT for which to_epsg() returns None: if the coordinate
#    magnitudes look projected, stamp EPSG:27700 (label only, no reprojection);
#  - a recognised EPSG other than 27700: reproject, so the "_27700" output
#    name is true.
tc_epsg = g.crs.to_epsg() if g.crs is not None else None
if tc_epsg is None:
    minx, miny, maxx, maxy = g.total_bounds
    if max(abs(maxx), abs(maxy)) > 1000:
        g = g.set_crs(BNG, allow_override=True)
    # NOTE(review): when the magnitudes do not look projected, the layer is
    # left with its unknown CRS and written as-is — confirm this is intended.
elif tc_epsg != BNG:
    g = g.to_crs(BNG)

# Repair geometries + drop missing/empty ones.
# Missing geometries are removed before the repair because make_valid cannot
# process None; empties are removed afterwards.
g = g[g.geometry.notna()].copy()
g["geometry"] = g.geometry.apply(make_valid)
g = g[g.geometry.notna() & ~g.geometry.is_empty].copy()

# Save the per-feature version (keeps each town centre's attributes)
out = INTERIM / "gla_town_centre_boundaries_2023_27700.gpkg"
g.to_file(out, layer="town_centres", driver="GPKG", engine="pyogrio")
print(f"Saved: {out} | n={len(g)} | crs={g.crs}")

# Optional: build a single union polygon (for quick coverage checks)
tc_union = g.dissolve().reset_index(drop=True)
tc_union["name"] = "Town centres (union)"
tc_union.to_file(INTERIM/"gla_town_centre_union_2023_27700.gpkg",
                 layer="tc_union", driver="GPKG", engine="pyogrio")


Saved: /home/jovyan/work/data/interim/gla_town_centre_boundaries_2023_27700.gpkg | n=209 | crs=EPSG:27700
