# **1 Global Set-up**

## **1.1 Define All Vars and Path**

In [None]:
# Root of the shared project storage; every input file below lives under it.
# Change this single constant to relocate the whole data set.
PROJECT_ROOT = "/data3/interns/NRT_CO2_Emission_Map_Project"
XCO2_DIR = f"{PROJECT_ROOT}/HaoHu_work/XCO2_resample"
ERA5_DIR = f"{PROJECT_ROOT}/HaoHu_work/ERA5_resample"
ANCILLARY_DIR = f"{PROJECT_ROOT}/MingjuanZhang_work"

# Variable short names stored under ERA5_resample; each one lives in a file
# named "<name>_daily_0p1deg.nc". Order here is the order in feature_variables.
ERA5_VARIABLES = (
    "t2m", "d2m", "u10", "v10", "msl", "sp", "skt",
    "tp", "e", "ssr", "str", "tcw", "blh",
)

# Prediction target: variable name -> NetCDF file that contains it.
target_variable = {
    "xco2": f"{XCO2_DIR}/global_grid_0.1_2019_2025_xco2.nc",
}

# Predictors: variable name -> NetCDF file that contains it.
# The key is also used as the variable name inside the file when it is opened.
feature_variables = {
    **{name: f"{ERA5_DIR}/{name}_daily_0p1deg.nc" for name in ERA5_VARIABLES},

    "NO2": f"{XCO2_DIR}/global_grid_0.1_2019_2025_NO2.nc",
    "is_weekend": f"{XCO2_DIR}/global_grid_0.1_2019_2025_weekday_weekend.nc",
    "population": f"{ANCILLARY_DIR}/Population_global_0.1degree_2019_2025_ns.nc",
    "elevation": f"{ANCILLARY_DIR}/SRTM_elevation_global_0.1degree_2019_2025_ns.nc",
    "landuse": f"{ANCILLARY_DIR}/Landuse_global_0.1degree_2019_2025_ns.nc",
    "aspect": f"{ANCILLARY_DIR}/SRTM_aspect_global_0.1degree_2019_2025_ns.nc",
    "ndvi": f"{ANCILLARY_DIR}/NDVI_global_0.1degree_2019_2025_ns.nc",
    "gpp": f"{ANCILLARY_DIR}/GPP_global_0.1degree_2019_2025_ns.nc",
    "lai": f"{ANCILLARY_DIR}/LAI_global_0.1degree_2019_2025_ns.nc",
    "ntl": f"{ANCILLARY_DIR}/VIIRS_NTL_global_0.1degree_2019_2025_ns.nc",
    "evi": f"{ANCILLARY_DIR}/EVI_global_0.1degree_2019_2025_ns.nc",
    "slope": f"{ANCILLARY_DIR}/SRTM_slope_global_0.1degree_2019_2025_ns.nc",
    "odiac": f"{PROJECT_ROOT}/HaoHu_work/odiac_interp_2019_2025.nc",
    "CO2_fire": f"{PROJECT_ROOT}/PinyiLu_work/GFAS_resample/GFAS_resample_final.nc",
}


## **1.2 Load Modules**

In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
#import cupy as cp
import time
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.model_selection import train_test_split
import os

# **2 Feature Engineering**

## **2.1 Convert raw .nc files to .npy files**

In [None]:
# Flatten every gridded variable into a 1-D array (stacked over time, lat, lon)
# and write it to "<var>.npy" in the current working directory. The stacked
# time/lat/lon coordinates are written once, taken from the first variable
# processed.
all_vars = {**target_variable, **feature_variables}

chunk_size = 30  # number of time steps stacked per chunk

# Skip the coordinate export only when all three files are already present,
# so a fresh run still produces them.
coord_files = ("time.npy", "lat.npy", "lon.npy")
coords_saved = all(os.path.exists(name) for name in coord_files)

for var, path in tqdm(all_vars.items(), desc="Variables", total=len(all_vars)):
    flattened_parts = []
    time_chunks, lat_chunks, lon_chunks = [], [], []

    # The context manager closes the file even if a chunk fails to load.
    with xr.open_dataset(path) as ds:
        da = ds[var]
        times = da["time"].values
        n_chunks = (len(times) + chunk_size - 1) // chunk_size  # ceiling division

        for i in tqdm(range(n_chunks), desc=f"{var} chunks", leave=False, total=n_chunks):
            start = i * chunk_size
            end   = min((i + 1) * chunk_size, len(times))
            da_chunk = da.isel(time=slice(start, end))
            # Collapse the three dimensions into one "point" axis
            da_pt = da_chunk.stack(point=("time", "lat", "lon"))
            flattened_parts.append(da_pt.values)

            if not coords_saved:
                time_chunks.append(da_pt["time"].values)
                lat_chunks.append(da_pt["lat"].values)
                lon_chunks.append(da_pt["lon"].values)

            del da_chunk, da_pt

    flat_array = np.concatenate(flattened_parts)
    np.save(f"{var}.npy", flat_array)
    del flattened_parts

    if not coords_saved:
        time_flat = np.concatenate(time_chunks)
        lat_flat  = np.concatenate(lat_chunks)
        lon_flat  = np.concatenate(lon_chunks)
        np.save("time.npy", time_flat)
        np.save("lat.npy",  lat_flat)
        np.save("lon.npy",  lon_flat)
        # Mark as done so the remaining variables do not rebuild the coordinates
        coords_saved = True
        del time_chunks, lat_chunks, lon_chunks

print("All vars has been Saved as .npy Files")

## **2.2 Save only valid XCO2 data (MASK)**

In [None]:
# Columns of the output table, in order: stacked coordinates, then the target,
# then every feature. Only the keys are used in this cell.
all_vars = {
    "time": None,
    "lat": None,
    "lon": None,
    **target_variable,
    **feature_variables
}

npy_dir = "/data3/interns/NRT_CO2_Emission_Map_Project/ML_XCO2/"

# First load xco2.npy and derive the validity mask from it
xco2_path = os.path.join(npy_dir, "xco2.npy")
xco2_data = np.load(xco2_path)
valid_mask = ~np.isnan(xco2_data)

data_dict = {}
for var_name in tqdm(all_vars, desc="Loading filtered .npy files"):
    npy_path = os.path.join(npy_dir, f"{var_name}.npy")
    if not os.path.exists(npy_path):
        # Best effort: a missing variable is reported and left out of the table
        print(f"⚠️ Can not find {npy_path} and skip")
        continue

    # xco2 is already in memory; do not read the file a second time
    data = xco2_data if npy_path == xco2_path else np.load(npy_path)

    # Every array must line up element-for-element with the XCO2 mask
    if data.shape != valid_mask.shape:
        raise ValueError(
            f"{npy_path} has shape {data.shape}, expected {valid_mask.shape} to match xco2.npy"
        )
    data_dict[var_name] = data[valid_mask]


df = pd.DataFrame(data_dict)

print("✅ Finish shifting, the shape of data is: ", df.shape)
print(df.head())

# Persist the filtered table as a NumPy record array in the working directory
np.save("xco2_nonnan.npy", df.to_records(index=False))

**Compute the proportion of non-NaN XCO2 values**

In [None]:
# Report how many entries of the flattened XCO2 array are NaN vs. non-NaN.
npy_dir = "/data3/interns/NRT_CO2_Emission_Map_Project/ML_XCO2/"
xco2_path = os.path.join(npy_dir, "xco2.npy")
xco2_data = np.load(xco2_path)

# True wherever XCO2 holds a value (not NaN); reused for the counts below
valid_mask = ~np.isnan(xco2_data)

total_n = xco2_data.size
n_non_nan = np.count_nonzero(valid_mask)
n_nan = total_n - n_non_nan

# Shares expressed in percent of all entries
pct_non_nan = n_non_nan / total_n * 100
pct_nan = n_nan / total_n * 100

print(f"XCO2 total values: {total_n}")
print(f"XCO2 non-NaN values: {n_non_nan} ({pct_non_nan:.2f}%)")
print(f"XCO2 NaN values: {n_nan} ({pct_nan:.2f}%)")


## **2.3 main process - feature engineering**

### 2.3.1 Fill NaN values

Check the number of NaN values

In [None]:
# Per-column tally of missing and present values in df
null_counts = df.isnull().sum()
present_counts = df.notnull().sum()

summary = (
    pd.DataFrame({'null_count': null_counts, 'non_null_count': present_counts})
    .reset_index()
    .rename(columns={'index': 'variable'})  # the former index holds the column names
)

summary

Fill NaN values

In [None]:
# Sentinel used for missing values, per column. Columns not listed here keep
# their NaNs.
# NOTE(review): -2 is presumably outside the valid range of each listed
# variable - confirm, especially for ndvi/evi.
fill_map = {
    "population": -2,
    "aspect":    -2,
    "slope":     -2,
    "ntl":       -2,
    "evi":       -2,
    "ndvi":      -2,
    "gpp":       -2,
    "lai":       -2,
}

# Assign the filled column back instead of calling fillna(inplace=True) on
# df[col]: the chained in-place form is deprecated and leaves df unchanged
# when pandas Copy-on-Write is active.
for col, val in fill_map.items():
    if col in df.columns:
        df[col] = df[col].fillna(val)

df

### **2.3.2 One-Hot Encoding**

In [None]:
# Cast the land-use codes to strings so that get_dummies treats them as labels,
# then expand them into one indicator column per label, prefixed with "lu".
landuse_col = 'landuse'
df[landuse_col] = df[landuse_col].astype(str)

df = pd.get_dummies(
    df,
    columns=[landuse_col],
    prefix='lu',
)

df

### **2.3.3 Split Time Vars**

In [None]:
# Break the timestamp into calendar parts, then remove the raw column.
timestamps = pd.to_datetime(df["time"])

for part in ("year", "month", "day"):
    df[part] = getattr(timestamps.dt, part)

# Drop the original column in place; only the derived parts are kept
del df["time"]

df

### **2.3.4 Transform month to sin/cos and (lat, lon) to (geo_x, geo_y, geo_z)**

In [None]:
# Map month onto a circle with period 12 so that month 12 and month 1 are close
month_angle = 2 * np.pi * df["month"] / 12
df["month_sin"] = np.sin(month_angle)
df["month_cos"] = np.cos(month_angle)

# Convert (lat, lon) in degrees to a point on the unit sphere
lat_rad = np.radians(df["lat"])
lon_rad = np.radians(df["lon"])
cos_lat = np.cos(lat_rad)

df["geo_x"] = cos_lat * np.cos(lon_rad)
df["geo_y"] = cos_lat * np.sin(lon_rad)
df["geo_z"] = np.sin(lat_rad)
df

### **2.3.5 Save processed data**

In [None]:
# Write the engineered table to the working directory as a NumPy record array
# (row index excluded).
processed_records = df.to_records(index=False)
np.save("xco2_nonnan_processed.npy", processed_records)