# **1 Define variables**

## **1.1 Define variable names and file paths**

In [None]:
# Shared directory prefixes for the input NetCDF files.
_PROJECT_ROOT = "/data3/interns/NRT_CO2_Emission_Map_Project"
_HAOHU_DIR = f"{_PROJECT_ROOT}/HaoHu_work"
_XCO2_DIR = f"{_HAOHU_DIR}/XCO2_resample"
_ERA5_DIR = f"{_HAOHU_DIR}/ERA5_resample"
_MINGJUAN_DIR = f"{_PROJECT_ROOT}/MingjuanZhang_work"
_PINYI_DIR = f"{_PROJECT_ROOT}/PinyiLu_work"

# ERA5 short names; each one maps to "<name>_daily_0p1deg.nc".
_ERA5_NAMES = (
    "t2m", "d2m", "u10", "v10", "msl", "sp", "skt",
    "tp", "e", "ssr", "str", "tcw", "blh",
)

# Feature key -> file-name stem of the "*_global_0.1degree_2019_2025_ns.nc" files.
_ANCILLARY_STEMS = {
    "population": "Population",
    "elevation": "SRTM_elevation",
    "landuse": "Landuse",
    "aspect": "SRTM_aspect",
    "ndvi": "NDVI",
    "gpp": "GPP",
    "lai": "LAI",
    "ntl": "VIIRS_NTL",
    "evi": "EVI",
    "slope": "SRTM_slope",
}

# Variable to be predicted: name -> NetCDF path.
target_variable = {
    "xco2": f"{_XCO2_DIR}/global_grid_0.1_2019_2025_xco2.nc",
}

# Predictor variables: name -> NetCDF path. Insertion order is kept identical
# to the hand-written listing because later cells iterate over this mapping.
feature_variables = {
    **{name: f"{_ERA5_DIR}/{name}_daily_0p1deg.nc" for name in _ERA5_NAMES},
    "NO2": f"{_XCO2_DIR}/global_grid_0.1_2019_2025_NO2.nc",
    "is_weekend": f"{_XCO2_DIR}/global_grid_0.1_2019_2025_weekday_weekend.nc",
    **{
        key: f"{_MINGJUAN_DIR}/{stem}_global_0.1degree_2019_2025_ns.nc"
        for key, stem in _ANCILLARY_STEMS.items()
    },
    "odiac": f"{_HAOHU_DIR}/odiac_interp_2019_2025.nc",
    "CO2_fire": f"{_PINYI_DIR}/GFAS_resample/GFAS_resample_final.nc",
}


## **1.2 Load modules**

In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.model_selection import train_test_split
import os
import gc
from sklearn.metrics import r2_score, mean_squared_error
import xgboost as xgb
import geopandas as gpd
from pathlib import Path

# **2 Split the data into monthly slices and save**

In [None]:
# Every column to export: the three coordinate columns first (they have no
# source path, hence None), followed by the target and then the predictors.
all_vars = dict.fromkeys(("time", "lat", "lon"))
all_vars.update(target_variable)
all_vars.update(feature_variables)

In [None]:
# Directory holding one flat .npy array per variable, and the directory that
# receives the per-month record arrays.
npy_dir = "/data3/interns/NRT_CO2_Emission_Map_Project/ML_XCO2/"
out_dir = "/data3/interns/NRT_CO2_Emission_Map_Project/ML_XCO2/monthly_slices/"
os.makedirs(out_dir, exist_ok=True)

# Column registry: coordinates first, then the target and the predictors.
all_vars = dict.fromkeys(("time", "lat", "lon"))
all_vars.update(target_variable)
all_vars.update(feature_variables)

# The time axis is memory-mapped rather than read eagerly; time_dt is simply a
# second name for the same array.
time_path = os.path.join(npy_dir, "time.npy")
time_arr = time_dt = np.load(time_path, mmap_mode="r")

# Truncate every timestamp to month resolution so that rows can be selected by
# comparing against a single datetime64[M] value.
time_month = time_dt.astype("datetime64[M]")

In [None]:
# Export one record array per calendar month.
#
# Relies on names bound by earlier cells: time_dt / time_month (time axis and
# its month-truncated copy), all_vars (column registry), npy_dir (per-variable
# .npy inputs) and out_dir (destination of the monthly files).
#
# np.arange excludes its stop value, so the stop must be 2025-07 for 2025-06 to
# be the last month produced; the prediction step later in this notebook
# iterates through 2025-06 and would otherwise skip that month.
months = np.arange(np.datetime64('2019-01'),
                   np.datetime64('2025-07'),
                   np.timedelta64(1, 'M'))

outer_pbar = tqdm(months, desc="Per-month processing")

# Variables whose .npy file is absent; used to report each one only once.
missing_reported = set()

for ym in outer_pbar:
    # Boolean row selector for this month over the full time axis.
    mask = (time_month == ym)
    n_mask = int(mask.sum())
    if n_mask == 0:
        outer_pbar.set_postfix_str(f"{ym}: no rows, skip")
        continue

    cols = {"time": time_dt[mask]}
    inner_pbar = tqdm(all_vars.keys(), leave=False, desc=f"{ym} variables")

    for var_name in inner_pbar:
        inner_pbar.set_postfix_str(var_name)

        # The time column was already taken from the in-memory time axis.
        if var_name == "time":
            continue

        var_path = os.path.join(npy_dir, f"{var_name}.npy")

        if not os.path.exists(var_path):
            # Keep the column so every monthly file has the same layout, but
            # say so instead of silently producing an all-NaN feature.
            if var_name not in missing_reported:
                missing_reported.add(var_name)
                print(f"⚠️ {var_name}: {var_path} not found, using NaN to fill")
            cols[var_name] = np.full(n_mask, np.nan, dtype="float64")
            continue

        # Memory-map the full array; only the masked rows are materialised.
        arr = np.load(var_path, mmap_mode="r")
        try:
            sub = arr[mask]
        except Exception as e:
            print(f"⚠️ {ym} {var_name}: Index failed({e})，Using NaN to fill")
            sub = np.full(n_mask, np.nan, dtype="float64")

        if np.issubdtype(sub.dtype, np.number):
            cols[var_name] = sub
        else:
            # Non-numeric payload: coerce to float where possible, otherwise
            # keep the raw values.
            try:
                cols[var_name] = pd.to_numeric(sub, errors="coerce").astype("float64")
            except Exception:
                cols[var_name] = sub
        del arr, sub
        gc.collect()

    # Every key of all_vars (including lat and lon) has been assigned above,
    # so no separate recovery pass for the coordinate columns is needed.
    names = list(cols.keys())
    arrays = [cols[k] for k in names]

    # np.rec.fromarrays needs equally long columns; skip the month otherwise.
    lens = {k: len(v) for k, v in zip(names, arrays)}
    if len(set(lens.values())) != 1:
        print(f"❌ Not consistent in length of row：{lens}；skip {ym}")
        del cols
        gc.collect()
        continue

    rec = np.rec.fromarrays(arrays, names=names)
    out_path = os.path.join(out_dir, f"{str(ym).replace('-', '_')}.npy")
    np.save(out_path, rec)

    # Drop the month's buffers before the next iteration allocates its own.
    del cols, arrays, rec, mask
    gc.collect()

del time_arr, time_dt, time_month
gc.collect()

print("✅ All Completed：", out_dir)

Per-month processing:  90%|████████▉ | 69/77 [21:57:32<2:19:24, 1045.62s/it] 

# **3. Prediction**

## **Feature engineering (same steps as in training)**

In [None]:
def fill_missing_value(df_month):
    """Replace missing values in selected ancillary columns with -2.

    Only the columns listed below are touched, and only if they are present in
    ``df_month``; all other columns keep their NaNs. The value -2 is the one
    used by this pipeline (presumably an out-of-range marker matching the
    training data - confirm against the training notebook).

    Parameters
    ----------
    df_month : pandas.DataFrame
        Monthly table; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df_month`` object, for call chaining.
    """
    fill_map = {
        "population": -2,
        "aspect":    -2,
        "slope":     -2,
        "ntl":       -2,
        "evi":       -2,
        "ndvi":      -2,
        "gpp":       -2,
        "lai":       -2,
    }

    for col, val in fill_map.items():
        if col in df_month.columns:
            # Assign the filled column back instead of calling
            # fillna(inplace=True) on the df_month[col] selection: the latter
            # is chained assignment, which pandas deprecates and which leaves
            # df_month unchanged when Copy-on-Write is active.
            df_month[col] = df_month[col].fillna(val)

    return df_month

def Process_LU(df_month):
    """One-hot encode the ``landuse`` column.

    The column is first converted to strings (this conversion is written back
    into ``df_month``), then expanded into indicator columns named
    ``lu_<value>``. The returned frame no longer contains ``landuse``.

    NOTE(review): the set of ``lu_*`` columns depends on the values present in
    this particular frame.
    """
    landuse_labels = df_month['landuse'].astype(str)
    df_month['landuse'] = landuse_labels

    encoded = pd.get_dummies(df_month, columns=['landuse'], prefix='lu')
    return encoded

def time_transfer(df_month):
    """Parse ``time`` as datetimes and add ``year``, ``month`` and ``day``.

    ``df_month`` is modified in place and also returned.
    """
    df_month["time"] = pd.to_datetime(df_month["time"])

    # Pull each calendar component off the datetime accessor in turn.
    stamp = df_month["time"].dt
    for part in ("year", "month", "day"):
        df_month[part] = getattr(stamp, part)

    return df_month

def Process_geo_month(df_month):
    """Add cyclic-month, unit-sphere and running-day columns.

    Reads ``month``, ``lat``, ``lon``, ``year`` and ``day`` and appends, in
    this order: ``month_sin``, ``month_cos``, ``geo_x``, ``geo_y``, ``geo_z``
    and ``n_day`` (days elapsed since 2019-01-01, counting that date as 1).
    ``lat``/``lon`` are converted with ``np.radians``, i.e. treated as degrees.
    ``df_month`` is modified in place and also returned.
    """
    # Month number mapped onto a circle with a 12-month period.
    month_angle = 2 * np.pi * df_month["month"] / 12
    df_month["month_sin"] = np.sin(month_angle)
    df_month["month_cos"] = np.cos(month_angle)

    # Latitude/longitude as Cartesian coordinates on the unit sphere.
    lat_rad = np.radians(df_month["lat"])
    lon_rad = np.radians(df_month["lon"])
    df_month["geo_x"] = np.cos(lat_rad) * np.cos(lon_rad)
    df_month["geo_y"] = np.cos(lat_rad) * np.sin(lon_rad)
    df_month["geo_z"] = np.sin(lat_rad)

    # Rebuild the calendar date from its parts to count days from the origin.
    origin = pd.Timestamp('2019-01-01')
    calendar_date = pd.to_datetime(df_month[['year', 'month', 'day']])
    df_month['n_day'] = (calendar_date - origin).dt.days + 1

    return df_month

def generate_features(df_month):
    """Return the model input column names, in the frame's column order.

    Every column of ``df_month`` is kept except coordinates, the target, the
    raw timestamp and the bookkeeping columns listed below.
    """
    excluded = {
        "lat", "lon", "xco2", "time", "split", "time_bin", "lat_bin",
        "lon_bin", "spacetime_block", "month", "emission",
    }
    return [name for name in df_month.columns if name not in excluded]

def xgb_predict(df_month, features, model):
    """Run ``model.predict`` on the feature columns and return a slim table.

    The predictions are written into ``df_month`` as ``xco2_pred`` (so the
    caller's frame gains that column); the returned frame holds only the
    coordinates, the timestamp, the observed ``xco2`` and the prediction.
    """
    predictions = model.predict(df_month[features])
    df_month["xco2_pred"] = predictions

    return df_month[["lat", "lon", "time", "xco2", "xco2_pred"]]

def evaluate_prediction(df_month):
    """Compute bias, RMSE and R^2 of ``xco2_pred`` against ``xco2``.

    Only rows where both values are finite take part in the metrics.

    Parameters
    ----------
    df_month : pandas.DataFrame
        Must contain the columns ``xco2`` and ``xco2_pred``.

    Returns
    -------
    tuple of float
        ``(bias, rmse, r2)`` where bias is the mean of prediction minus
        observation. All three are NaN when no row has a finite pair, so a
        month without observations yields empty metrics instead of an
        exception from the metric functions.
    """
    y_true = df_month["xco2"].to_numpy()
    y_pred = df_month["xco2_pred"].to_numpy()
    mask = np.isfinite(y_true) & np.isfinite(y_pred)

    # The sklearn metrics reject empty input; report "no data" explicitly.
    if not mask.any():
        nan = float("nan")
        return nan, nan, nan

    yt = y_true[mask]
    yp = y_pred[mask]
    bias = float((yp - yt).mean())
    rmse = float(np.sqrt(mean_squared_error(yt, yp)))
    r2 = float(r2_score(yt, yp))
    return bias, rmse, r2


## **Prediction**

In [None]:
# Predict XCO2 month by month with the saved XGBoost model, write one
# prediction file per month and collect per-month metrics into a CSV.
#
# NOTE(review): both directories are relative to the current working
# directory; confirm the notebook runs from the folder that contains
# "monthly_slices".
monthly_dir = Path("monthly_slices")
out_pred_dir = Path("XCO2_prediction_full")
out_pred_dir.mkdir(parents=True, exist_ok=True)
metrics_path = out_pred_dir / "metrics_2019_01_to_2025_06.csv"

model = xgb.XGBRegressor()
model.load_model("Trained_xgb_model_full/xgb_model_full_random.json")

# Feature names stored with the trained booster (None if it carries none).
# They are the reference for column selection and order, because the one-hot
# landuse columns produced for a single month depend on which classes occur in
# that month.
model_features = model.get_booster().feature_names

months = pd.period_range("2019-01", "2025-06", freq="M")

results = []

for p in tqdm(months, desc="Processing months", unit="month"):
    tag = f"{p.year}_{p.month:02d}"
    in_path = monthly_dir / f"{tag}.npy"

    if not in_path.exists():
        print(f"[SKIP] {in_path} ")
        continue

    try:
        # NOTE(review): allow_pickle=True executes pickled content when the
        # file holds object arrays; only load files this pipeline produced.
        arr = np.load(in_path, allow_pickle=True)
        df_month = pd.DataFrame(arr)

        df_month = fill_missing_value(df_month)
        df_month = Process_LU(df_month)
        df_month = time_transfer(df_month)
        df_month = Process_geo_month(df_month)

        if model_features:
            # A landuse class absent this month is simply "not that class"
            # for every row; add its indicator as all-False. Any other
            # missing feature is left missing so the lookup below fails
            # loudly instead of being invented.
            for col in model_features:
                if col.startswith("lu_") and col not in df_month.columns:
                    df_month[col] = False
            features = list(model_features)
        else:
            features = generate_features(df_month)

        df_month = xgb_predict(df_month, features, model)
        nrows = len(df_month)

        # Persist the predictions before computing metrics so that a metric
        # failure cannot discard an otherwise successful month.
        out_path = out_pred_dir / f"monthly_xco2_full_{tag}.npy"
        rec = df_month.to_records(index=False)
        np.save(out_path, rec)

        bias, rmse, r2 = evaluate_prediction(df_month)
        print(f"{tag}: Bias={bias:.4f} | RMSE={rmse:.4f} | R^2={r2:.4f} | n={nrows}")

        results.append({
            "month": str(p),
            "year": p.year,
            "mon": p.month,
            "n": nrows,
            "bias": bias,
            "rmse": rmse,
            "r2": r2,
        })
        del arr, df_month, rec

    except Exception as e:
        # Per-month boundary: report and carry on with the remaining months.
        print(f"[ERROR]  {tag} ：{e}")

if results:
    df_metrics = pd.DataFrame(results).sort_values(["year", "mon"]).reset_index(drop=True)
    df_metrics.to_csv(metrics_path, index=False)
    print(f"✅ All completed：{metrics_path}")
else:
    print("Failed")
