# Calculating Köppen-Geiger climatic zone area shares for Ethnologue language polygons

In [1]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import xarray as xr
import rioxarray
import geopandas as gpd

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.patches import Patch
import matplotlib.patches as mpatches
from matplotlib.font_manager import FontProperties

import mapclassify

from rapidfuzz import process, fuzz

from shapely.geometry import Point

import rasterio
from rasterio.plot import show
from rasterio.mask import mask
from rasterstats import zonal_stats

In [2]:
# Set base project path
# The project root defaults to the original author's Dropbox folder. Set the
# MEASURES_WORK_DIR environment variable to run this notebook on another
# machine without editing the cell.
DEFAULT_BASE_PATH = "C:/Users/juami/Dropbox/RAships/2-Folklore-Nathan-Project/EA-Maps-Nathan-project/Measures_work"
base_path = Path(os.environ.get("MEASURES_WORK_DIR", DEFAULT_BASE_PATH))

# Set file paths (both are shapefiles located under the project root)
poscol_path = base_path / "data" / "raw" / "ethnologue" / "ancestral_characteristics_database_language_level" / "Ethnologue_16_shapefile" / "langa_no_overlap_biggest_clean.shp"
KG_path = base_path / "maps" / "raw" / "KG_climatic_zones" / "Koppen_geiger_shp" / "Koppen_geiger_climzones.shp"

In [3]:
# Load the two polygon layers (Ethnologue language areas and Koppen-Geiger
# zones) and reproject each one to EPSG:6933 right after reading, so both
# layers share the CRS used for the area calculations further down.
ethnologue = gpd.read_file(poscol_path).to_crs(epsg=6933)
KGzones = gpd.read_file(KG_path).to_crs(epsg=6933)

# Confirm that both layers ended up in the same CRS.
for layer in (ethnologue, KGzones):
    print(layer.crs)

EPSG:6933
EPSG:6933


In [4]:
# List the attribute columns of each layer (Ethnologue first, then KG zones).
print(ethnologue.columns, KGzones.columns, sep="\n")

Index(['FID_langa', 'ID', 'ID_ISO_A3', 'ID_ISO_A2', 'ID_FIPS', 'NAM_LABEL',
       'NAME_PROP', 'NAME2', 'NAM_ANSI', 'CNT', 'C1', 'POP', 'LMP_POP1', 'G',
       'LMP_CLASS', 'FAMILYPROP', 'FAMILY', 'LMP_C1', 'CODE', 'ID_lang',
       'Shape_Leng', 'Shape_Area', 'Area', 'geometry'],
      dtype='object')
Index(['Id', 'gridcode', 'Shape_Leng', 'Shape_Area', 'geometry'], dtype='object')


In [10]:
# Clean the Koppen-Geiger polygons and dissolve them into one (multi)polygon
# per climate class ('gridcode'). Produces:
#   KGzones       - cleaned, reprojected input features
#   KGzones_diss  - one row per gridcode, with area/perimeter recomputed
# `make_valid` is imported here and is reused by the overlay cell below.
from shapely import make_valid
from shapely.validation import explain_validity  # only needed for the optional inspection below

# --- 0) (Important) Make sure CRS is correct BEFORE reprojecting
# NOTE: if the loading cell above has already run, this prints the CRS the
# layer was reprojected to there, not the CRS stored in the shapefile.
print("KGzones original CRS:", KGzones.crs)
# If KGzones.crs is None but you *know* the source CRS, set it here:
# KGzones = KGzones.set_crs("EPSG:4326")  # example, change if needed

# --- 1) Reproject (only after CRS is correct)
KGzones = KGzones.to_crs(6933)

# --- 2) Drop null/empty and obviously bad bounds (NaN/Inf)
KGzones = KGzones[KGzones.geometry.notnull()].copy()
KGzones = KGzones[~KGzones.geometry.is_empty].copy()

b = KGzones.bounds  # minx, miny, maxx, maxy
finite_mask = np.isfinite(b).all(axis=1)
KGzones = KGzones[finite_mask].copy()

# --- 3) Force 2D (just in case there are Z/Ms)
# Only a missing `force_2d` is tolerated. Errors raised while converting the
# geometries are no longer swallowed, so real data problems stay visible.
try:
    from shapely import force_2d  # shapely>=2.0
except ImportError:
    force_2d = None  # not available: skip this step
if force_2d is not None:
    KGzones["geometry"] = KGzones.geometry.apply(force_2d)

# --- 4) Fix invalid geometries
# First attempt: make_valid
KGzones["geometry"] = KGzones.geometry.apply(make_valid)

# Second attempt (only where still invalid): buffer(0)
invalid_mask = ~KGzones.is_valid
if invalid_mask.any():
    KGzones.loc[invalid_mask, "geometry"] = KGzones.loc[invalid_mask, "geometry"].buffer(0)

# Drop anything still invalid or empty after fixes, and say so when it
# happens: every dropped feature is zone area missing from the shares.
n_before_drop = len(KGzones)
KGzones = KGzones[KGzones.geometry.notnull()].copy()
KGzones = KGzones[~KGzones.geometry.is_empty].copy()
KGzones = KGzones[KGzones.is_valid].copy()
n_dropped = n_before_drop - len(KGzones)
if n_dropped:
    print(f"Dropped {n_dropped} KG features that were still null/empty/invalid after repair")

# (Optional) Inspect what was wrong, if you’re curious (run this before the
# drop above; afterwards no invalid rows are left to explain):
# bad = KGzones.loc[~KGzones.is_valid, "geometry"].apply(explain_validity)
# print(bad.value_counts().head())

# --- 5) Dissolve by 'gridcode'
# If your GeoPandas supports aggfunc in dissolve:
KGzones_diss = KGzones.dissolve(
    by="gridcode",
    as_index=False,
    aggfunc={
        "Id": "first",
        "Shape_Area": "sum",
        "Shape_Leng": "sum",
    },
)

# --- 6) Recompute area/length from geometry (more reliable)
# Values are in CRS units; the km2 conversion below assumes metres.
KGzones_diss["area_m2"] = KGzones_diss.geometry.area
KGzones_diss["area_km2"] = KGzones_diss["area_m2"] / 1e6
KGzones_diss["perimeter_m"] = KGzones_diss.geometry.length

print(f"{len(KGzones)} cleaned input features -> {len(KGzones_diss)} dissolved features")
print(KGzones_diss.columns)


KGzones original CRS: EPSG:6933
20996 cleaned input features -> 30 dissolved features
Index(['gridcode', 'geometry', 'Id', 'Shape_Area', 'Shape_Leng', 'area_m2',
       'area_km2', 'perimeter_m'],
      dtype='object')


In [11]:
# Compute, for every Ethnologue polygon ID, the share of its area that falls
# in each dissolved Koppen-Geiger zone, and attach the shares to the
# Ethnologue attributes as `share_gc_<gridcode>` columns (result: `eth_out`).

# Repair invalid geometries (note: this updates `ethnologue` itself)
ethnologue['geometry'] = ethnologue.geometry.apply(make_valid)

eth = ethnologue.copy()
kg  = KGzones_diss.copy()

# Drop rows with missing or empty geometry from both working copies
eth = eth[eth.geometry.notnull() & ~eth.geometry.is_empty].copy()
kg  = kg[kg.geometry.notnull() & ~kg.geometry.is_empty].copy()

# --- 1) Precompute Ethnologue polygon areas (denominator)
eth['area_m2'] = eth.geometry.area

# --- 2) Overlay (intersection) to get Ethnologue × KG overlaps
over = gpd.overlay(eth[['ID', 'geometry']], kg[['gridcode', 'geometry']],
                   how='intersection', keep_geom_type=False)

# Drop empties just in case
over = over[over.geometry.notnull() & ~over.geometry.is_empty].copy()

# --- 3) Overlap area per (ID, gridcode)
over['overlap_m2'] = over.geometry.area
g = (over.groupby(['ID', 'gridcode'], as_index=False)['overlap_m2']
         .sum())

# --- 4) Convert to shares by dividing by each Ethnologue ID’s total area
# The overlaps above are summed per ID, so the denominator is summed per ID
# too. With unique IDs this equals the polygon's own area. With repeated IDs
# it keeps numerator and denominator consistent, and it avoids the error
# Series.map raises when the lookup index has duplicate labels.
eth_area = eth.groupby('ID')['area_m2'].sum()
g['eth_area_m2'] = g['ID'].map(eth_area)
g['share'] = np.where(g['eth_area_m2'] > 0, g['overlap_m2'] / g['eth_area_m2'], 0.0)

# --- 5) Pivot to wide (one column per gridcode)
# Ensure all gridcodes appear as columns even if missing for some IDs, and
# store 0 (not NaN) for ID × gridcode pairs that do not overlap at all.
all_codes = np.sort(kg['gridcode'].unique())
shares_wide = (g.pivot(index='ID', columns='gridcode', values='share')
                 .reindex(columns=all_codes)
                 .fillna(0.0))

# Optional: make friendly column names like share_gc_<code>
shares_wide.columns = [f"share_gc_{int(c) if float(c).is_integer() else str(c)}"
                       for c in shares_wide.columns]

# --- 6) Merge back to Ethnologue
eth_out = eth.drop(columns=['area_m2']).merge(shares_wide, on='ID', how='left')

# Fill NaN shares (IDs with zero overlap with KG zones) with 0
share_cols = [c for c in eth_out.columns if c.startswith('share_gc_')]
eth_out[share_cols] = eth_out[share_cols].fillna(0.0)

print(eth_out.head())

   FID_langa       ID ID_ISO_A3 ID_ISO_A2 ID_FIPS         NAM_LABEL  \
0          1  RUS-RUS   rus-RUS    rus-ru  rus-RS           Russian   
1          2  ENG-USA   eng-USA    eng-us  eng-US           English   
2          3  POR-BRA   por-BRA    por-br  por-BR        Portuguese   
3          4  ENG-AUS   eng-AUS    eng-au  eng-AS           English   
4          5  CMN-CHN   cmn-CHN    cmn-cn  cmn-CH  Mandarin Chinese   

           NAME_PROP              NAME2           NAM_ANSI       CNT  ...  \
0            Russian            RUSSIAN            Russian    Europe  ...   
1            English            ENGLISH            English  Americas  ...   
2         Portuguese         PORTUGUESE         Portuguese  Americas  ...   
3            English            ENGLISH            English   Pacific  ...   
4  Chinese, Mandarin  CHINESE, MANDARIN  Chinese, Mandarin      Asia  ...   

  share_gc_21 share_gc_22  share_gc_23 share_gc_24 share_gc_25 share_gc_26  \
0    0.002891    0.059185     0.

In [12]:
# --- Reduce the merged table to the identifier plus every share_gc_* column
cols_to_keep = ['ID']
cols_to_keep.extend(col for col in eth_out.columns if col.startswith('share_gc_'))
eth_shares = eth_out[cols_to_keep].copy()

# --- Write the shares table to CSV (without the row index)
kg_shares_csv = base_path / "maps" / "raw" / "KG_climatic_zones" / "ethnologue_KGshares.csv"
eth_shares.to_csv(kg_shares_csv, index=False)