<a href="https://colab.research.google.com/github/lionelfragniere/SSD_EORE/blob/master/SSD_EORE_DRAFT_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# Mount Google Drive into the Colab filesystem so project files are reachable by path.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [50]:
# Install geospatial stack
!pip -q install geopandas pyogrio shapely rtree pyproj rasterio rioxarray xarray pyarrow statsmodels

import os, json, hashlib, textwrap
from datetime import date
import pandas as pd
import geopandas as gpd
import numpy as np
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then give PyDrive2 the application-default
# credentials and build a GoogleDrive client from them.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds `drive`, which the first cell imported as the
# google.colab `drive` module used for drive.mount().
drive = GoogleDrive(gauth)

# Project root on the mounted Drive, and the dated input snapshot to read from.
BASE = "/content/drive/MyDrive/SSD_EORE"
SNAPSHOT = "2025-10-08"  # update as needed

# Folder layout, keyed by role. Inputs (and boundaries) live under the snapshot date.
PATHS = dict(
    inputs=f"{BASE}/data/inputs/{SNAPSHOT}",
    processed=f"{BASE}/data/processed",
    outputs=f"{BASE}/data/outputs",
    config=f"{BASE}/config",
    boundaries=f"{BASE}/data/inputs/{SNAPSHOT}/boundaries",
)

# Create every folder that does not exist yet; existing folders are left untouched.
for p in PATHS.values():
    os.makedirs(p, exist_ok=True)

In [51]:
def sha256sum(path, block_size=65536):
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is opened in binary mode and consumed in pieces of at most
    ``block_size`` bytes, so the whole file is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(path, 'rb') as stream:
        while True:
            piece = stream.read(block_size)
            if not piece:  # an empty read means end of file
                break
            digest.update(piece)
    return digest.hexdigest()

# Provenance manifest: one dict per registered data file.
# Re-running this cell resets the manifest to empty.
manifest_rows = []

def register_file(dataset_id, title, provider, path, boundary_version=None,
                  license_str="open", processing="raw", snapshot_date=None):
    """Record provenance for one data file in the module-level ``manifest_rows``.

    Registering the same ``(dataset_id, path)`` pair again replaces the earlier
    row instead of appending a duplicate, so re-running a notebook cell does
    not inflate the manifest.

    Parameters
    ----------
    dataset_id, title, provider : str
        Identifying metadata stored verbatim in the row.
    path : str
        Location of the file. Its SHA-256 is stored under ``"sha256"``.
    boundary_version : str, optional
        Stored as ``""`` when not given.
    license_str, processing : str
        Stored under ``"license"`` and ``"processing"``.
    snapshot_date : optional
        Value stored (as ``str``) under ``"snapshot_date"``. Defaults to
        today's date, which was the only behaviour before this parameter existed.

    Returns
    -------
    None

    Warns
    -----
    UserWarning
        If ``path`` is not an existing regular file. The row is still recorded,
        with an empty ``"sha256"``.
    """
    # Local import: keeps this cell runnable without editing the imports cell.
    import warnings

    if os.path.isfile(path):
        digest = sha256sum(path)
    else:
        # Best-effort: keep the row, but make the missing hash visible.
        warnings.warn(
            f"register_file: {path!r} is not an existing file; sha256 left empty",
            stacklevel=2,
        )
        digest = ""

    row = {
        "dataset_id": dataset_id,
        "title": title,
        "provider": provider,
        "path": path,
        "snapshot_date": str(snapshot_date) if snapshot_date else str(date.today()),
        "license": license_str,
        "processing": processing,
        "boundary_version": boundary_version if boundary_version else "",
        "sha256": digest,
    }

    # Replace in place (never rebind the list) so other references stay valid.
    for idx, existing in enumerate(manifest_rows):
        if existing.get("dataset_id") == dataset_id and existing.get("path") == path:
            manifest_rows[idx] = row
            break
    else:
        manifest_rows.append(row)


In [55]:
# Load the admin 0–3 boundary layer, normalise its column names, stamp
# version metadata, and make sure the geometries are in EPSG:4326.
BOUNDARY_FILE = f"{PATHS['boundaries']}/ssd_admin0_3.gpkg"  # same folder as before, via the PATHS entry
boundary_version = "COD-AB v20230901"

adm = gpd.read_file(BOUNDARY_FILE, layer=None)  # auto detect layer if single-layer gpkg

# Make sure fields exist; rename if needed
# NOTE(review): the sample output shows country_iso3 == "SS" (taken from
# ADM0_PCODE), which is not a 3-letter ISO code — confirm the intended value.
rename_map = {
    'ADM1_PCODE':'adm1_pcode','ADM1_EN':'adm1_name',
    'ADM2_PCODE':'adm2_pcode','ADM2_EN':'adm2_name',
    'ADM3_PCODE':'adm3_pcode','ADM3_EN':'adm3_name',
    'ADM0_PCODE':'country_iso3'
}
# Rename only source columns that are present and whose target name is still free.
adm = adm.rename(columns={
    old: new for old, new in rename_map.items()
    if old in adm.columns and new not in adm.columns
})

needed = ['country_iso3','adm1_pcode','adm1_name','adm2_pcode','adm2_name','adm3_pcode','adm3_name','geometry']
missing = [c for c in needed if c not in adm.columns]
if missing:
    raise ValueError(f"Missing fields in boundaries: {missing}")

adm["boundary_version"] = boundary_version
# NOTE(review): this stamps the run date, not SNAPSHOT — confirm which is intended.
adm["snapshot_date"] = str(date.today())

# set_crs only labels the data and raises if a different CRS is already set;
# a file that already carries a CRS must be reprojected instead.
if adm.crs is None:
    adm = adm.set_crs(4326)
else:
    adm = adm.to_crs(4326)

register_file("boundaries_admin3", "South Sudan admin 0–3", "OCHA COD via HDX", BOUNDARY_FILE, boundary_version)
adm.head(5)


Unnamed: 0,Shape_Leng,Shape_Area,adm3_name,adm3_pcode,adm2_name,adm2_pcode,adm1_name,adm1_pcode,ADM0_EN,country_iso3,date,validOn,validTo,AREA_SQKM,geometry,boundary_version,snapshot_date
0,3.676016,0.864956,Abyei Region,SS000101,Abyei Region,SS0001,Abyei Region,SS00,South Sudan,SS,2023-08-29,2023-08-29,NaT,10495.845837,"POLYGON ((28.03855 9.3439, 28.03606 9.34387, 2...",COD-AB v20230901,2025-10-08
1,1.096062,0.054905,Bungu,SS010101,Juba,SS0101,Central Equatoria,SS01,South Sudan,SS,2023-08-29,2023-08-29,NaT,673.61921,"POLYGON ((31.20592 4.81821, 31.21309 4.81718, ...",COD-AB v20230901,2025-10-08
2,1.791721,0.149619,Dolo,SS010102,Juba,SS0101,Central Equatoria,SS01,South Sudan,SS,2023-08-29,2023-08-29,NaT,1835.038343,"POLYGON ((31.26447 5.13648, 31.28455 5.12549, ...",COD-AB v20230901,2025-10-08
3,0.76249,0.029053,Ganji,SS010103,Juba,SS0101,Central Equatoria,SS01,South Sudan,SS,2023-08-29,2023-08-29,NaT,356.52944,"POLYGON ((31.21002 4.62885, 31.2182 4.62271, 3...",COD-AB v20230901,2025-10-08
4,1.232994,0.051823,Gondokoro,SS010104,Juba,SS0101,Central Equatoria,SS01,South Sudan,SS,2023-08-29,2023-08-29,NaT,635.698071,"POLYGON ((31.7016 5.00761, 31.7164 4.99368, 31...",COD-AB v20230901,2025-10-08


In [60]:
# Load all ACLED CSV exports, attach each event to its admin3 polygon, and
# aggregate to monthly event and fatality counts per admin3 unit.
acled_dir = os.path.join(PATHS["inputs"], "acled")
# os.listdir returns names in arbitrary order; sort so the concatenation order
# and the manifest entries are the same on every run.
acled_files = sorted(os.path.join(acled_dir, f) for f in os.listdir(acled_dir) if f.endswith(".csv"))
if not acled_files:
    # pd.concat on an empty sequence fails with an unhelpful message.
    raise FileNotFoundError(f"No ACLED .csv files found in {acled_dir}")
df_acled = pd.concat((pd.read_csv(f) for f in acled_files), ignore_index=True)

# Parse dates
df_acled["event_date"] = pd.to_datetime(df_acled["event_date"],
               format='%d/%m/%Y %H:%M:%S')
# Convert to GeoDataFrame and spatially join to admin3
acled_gdf = gpd.GeoDataFrame(df_acled, geometry=gpd.points_from_xy(df_acled.longitude, df_acled.latitude), crs=4326)
acled_join = gpd.sjoin(acled_gdf, adm[["adm3_pcode","geometry"]], how="left", predicate="within")

# Events that fall in no polygon keep a missing adm3_pcode after the left join,
# and groupby drops missing keys by default — report them instead of losing them silently.
n_unmatched = int(acled_join["adm3_pcode"].isna().sum())
if n_unmatched:
    print(f"Note: {n_unmatched} ACLED events matched no admin3 polygon and are excluded from the monthly totals")

acled_join["year"] = acled_join["event_date"].dt.year
acled_join["month"] = acled_join["event_date"].dt.month

acled_m = (acled_join
           .groupby(["adm3_pcode","year","month"], as_index=False)
           .agg(acled_events=("event_date","count"),
                acled_fatalities=("fatalities","sum")))

# Register every input file, not only the first, so the manifest hashes cover all inputs.
for acled_path in acled_files:
    register_file("acled_events", "ACLED events monthly admin3", "ACLED", acled_path, boundary_version)
acled_m.head(5)


Unnamed: 0,adm3_pcode,year,month,acled_events,acled_fatalities
0,SS010105,2016,7,2,1
1,SS010106,2024,11,1,0
2,SS010107,2022,3,1,0
3,SS010108,2020,4,1,0
4,SS010108,2020,8,1,0
