# **Step 1: Create Fish Net**

In [None]:
import numpy as np
import xarray as xr

# Cell-centre coordinates of a 0.1-degree global grid: the first centre sits
# half a cell inside the -90 / -180 edge.
lat_res, lon_res = 0.1, 0.1
lats = np.arange(-89.95, 90.05, lat_res)
lons = np.arange(-179.95, 180.05, lon_res)

# Daily time axis; np.arange excludes the stop value, so 2025-06-30 is the
# last day on the grid.
times = np.arange(np.datetime64("2019-01-01"), np.datetime64("2025-07-01"))

# Coordinate-only dataset (no data variables) that later steps open as the
# common target grid.
output_path = "global_grid_0.1_2019_2025.nc"
grid_coords = dict(lat=lats, lon=lons, time=times)
ds_empty = xr.Dataset(coords=grid_coords)
ds_empty.to_netcdf(output_path)

ds_empty

# **Step 2: Resample XCO2**

### **OCO-2**

In [None]:
import numpy as np
import xarray as xr
from datetime import datetime
from glob import glob
from collections import defaultdict
import os
from tqdm import tqdm

# --- Inputs and output of the OCO-2 gridding pass ---
oco2_folder = "../OCO2-HaoHu/"
grid_nc_path = "global_grid_0.1_2019_2025.nc"
output_nc_path = "global_grid_0.1_2019_2025_OCO-2.nc"

# Date window of the run.
_DATE_FMT = "%Y-%m-%d"
start_date = datetime.strptime("2019-01-01", _DATE_FMT)
end_date = datetime.strptime("2025-07-01", _DATE_FMT)

# Target grid: raw coordinate arrays plus the length of every axis.
ds_out = xr.open_dataset(grid_nc_path)
lat, lon, time_all = (ds_out[axis].values for axis in ("lat", "lon", "time"))
time_len, lat_len, lon_len = (len(ds_out[axis]) for axis in ("time", "lat", "lon"))


def _nan_cube(var_name):
    """Return a NaN-filled float32 (time, lat, lon) DataArray on the grid of ``ds_out``."""
    return xr.DataArray(
        np.full((time_len, lat_len, lon_len), np.nan, dtype=np.float32),
        coords={axis: ds_out[axis] for axis in ("time", "lat", "lon")},
        dims=["time", "lat", "lon"],
        name=var_name,
    )


# Output cubes, filled one day at a time further down.
xco2_all = _nan_cube("xco2_oco2")
oco2_uncertainty_all = _nan_cube("oco2_uncertainty")

# Grid spacing taken from the first two coordinate values.
lat_res = lat[1] - lat[0]
lon_res = lon[1] - lon[0]
grid_shape = (len(lat), len(lon))
# 0.05 degrees expressed in whole grid steps (truncated): the number of extra
# cells marked on each side of the cell an observation falls in.
half_box = int(0.05 / lat_res)

def fast_idw(lat0s, lon0s, obs_lat, obs_lon, obs_val, radius):
    """Inverse-distance-weighted mean of nearby observations at each target point.

    For every target ``(lat0s[k], lon0s[k])`` the observations whose distance
    ``sqrt(dlat**2 + dlon**2)`` is at most ``radius`` are averaged with
    weights ``1 / (distance**2 + 1e-12)``. Distances are computed directly on
    the coordinate values (same units as ``radius``); there is no
    great-circle correction and no wrap-around in longitude.

    Parameters
    ----------
    lat0s, lon0s : array-like, 1-D, equal length
        Coordinates of the target points.
    obs_lat, obs_lon, obs_val : array-like, 1-D, equal length
        Coordinates and values of the observations.
    radius : float
        Search radius around each target point.

    Returns
    -------
    numpy.ndarray
        float32 array shaped like ``lat0s``; NaN where no usable observation
        lies within ``radius``.

    Notes
    -----
    Observations with a non-finite value or coordinate are ignored, so a
    single NaN value inside the radius no longer turns the estimate into NaN.
    """
    obs_lat = np.asarray(obs_lat)
    obs_lon = np.asarray(obs_lon)
    obs_val = np.asarray(obs_val)

    # Drop unusable observations once, up front, rather than per target point.
    usable = np.isfinite(obs_val) & np.isfinite(obs_lat) & np.isfinite(obs_lon)
    if not usable.all():
        obs_lat, obs_lon, obs_val = obs_lat[usable], obs_lon[usable], obs_val[usable]

    results = np.full_like(lat0s, np.nan, dtype=np.float32)
    if obs_val.size == 0:
        return results

    for idx, (lat0, lon0) in enumerate(zip(lat0s, lon0s)):
        d = np.sqrt((obs_lat - lat0)**2 + (obs_lon - lon0)**2)
        mask = d <= radius
        if not np.any(mask):
            continue
        # The 1e-12 term keeps the weight finite when an observation
        # coincides with the target point.
        w = 1 / (d[mask]**2 + 1e-12)
        results[idx] = np.sum(w * obs_val[mask]) / np.sum(w)
    return results

# Index every *.nc4 file in the input folder by its acquisition day
# ("YYYY-MM-DD"), keeping only days inside [start_date, end_date].
files = sorted(glob(os.path.join(oco2_folder, "*.nc4")))
files_by_day = defaultdict(list)

for f in files:
    try:
        # The third "_"-separated token of the file name is the date as
        # yymmdd, e.g. '190101'.
        stamp = os.path.basename(f).split("_")[2]
        day = datetime.strptime(stamp, "%y%m%d")
        if start_date <= day <= end_date:
            files_by_day[day.strftime("%Y-%m-%d")].append(f)
    except Exception as e:
        # A name that does not follow the pattern is reported and skipped.
        print(f"Error parsing date for file {f}: {e}")

# Map each "YYYY-MM-DD" label to its position on the grid's time axis once,
# instead of re-formatting the whole time axis for every processed day.
date_to_index = {}
for k, day_label in enumerate(np.datetime_as_string(time_all, unit='D')):
    date_to_index.setdefault(str(day_label), k)  # keep the first match

n_lat, n_lon = grid_shape
box_offsets = range(-half_box, half_box + 1)

for date_str, filelist in tqdm(files_by_day.items(), desc="Processing days"):
    date_index = date_to_index.get(date_str)
    if date_index is None:
        print(f"Date {date_str} not in daily grid, skipping.")
        continue

    # Soundings of the day with quality flag 0 and finite xco2, pooled over
    # all of the day's files.
    lat_all, lon_all, xco2_all_day, unc_all_day = [], [], [], []
    for file in filelist:
        try:
            with xr.open_dataset(file) as ds:
                xco2_raw = ds["xco2"].values
                mask = (ds["xco2_quality_flag"].values == 0) & np.isfinite(xco2_raw)
                # Read every field before appending anything: a failure
                # half-way through a file must not leave the four lists
                # with different lengths.
                file_lat = ds["latitude"].values[mask]
                file_lon = ds["longitude"].values[mask]
                file_xco2 = xco2_raw[mask]
                file_unc = ds["xco2_uncertainty"].values[mask]
        except Exception as e:
            # Best effort: report the unreadable file and carry on.
            print(f"Skipping file {file}: {e}")
            continue
        lat_all.append(file_lat)
        lon_all.append(file_lon)
        xco2_all_day.append(file_xco2)
        unc_all_day.append(file_unc)

    if not lat_all:
        continue

    lats = np.concatenate(lat_all)
    lons = np.concatenate(lon_all)
    xco2_vals = np.concatenate(xco2_all_day)
    unc_vals = np.concatenate(unc_all_day)

    # lat[0] / lon[0] are used as cell centres (the interpolation below is
    # evaluated at lat[i], lon[j]), so the cell containing an observation is
    # the one with the NEAREST centre: shift by half a cell before flooring.
    # Plain truncation would pick the cell half a step to the south-west.
    lat_idx = np.floor((lats - lat[0]) / lat_res + 0.5).astype(int)
    lon_idx = np.floor((lons - lon[0]) / lon_res + 0.5).astype(int)

    # Mark the (2 * half_box + 1)^2 cells around every in-grid observation.
    # Offsets are clipped to the grid; a clipped index always stays inside
    # the observation's own box, so no cell outside the box is marked.
    in_grid = (lat_idx >= 0) & (lat_idx < n_lat) & (lon_idx >= 0) & (lon_idx < n_lon)
    rows, cols = lat_idx[in_grid], lon_idx[in_grid]
    valid_mask = np.zeros(grid_shape, dtype=bool)
    for di in box_offsets:
        shifted_rows = np.clip(rows + di, 0, n_lat - 1)
        for dj in box_offsets:
            valid_mask[shifted_rows, np.clip(cols + dj, 0, n_lon - 1)] = True

    valid_i, valid_j = np.where(valid_mask)
    if len(valid_i) == 0:
        print(f"No valid grid cells found for {date_str}")
        continue

    lat0s = lat[valid_i]
    lon0s = lon[valid_j]

    # Same geometry for both fields: value and its reported uncertainty.
    interpolated_vals = fast_idw(lat0s, lon0s, lats, lons, xco2_vals, radius=0.2)
    interpolated_unc = fast_idw(lat0s, lon0s, lats, lons, unc_vals, radius=0.2)

    out_grid = np.full(grid_shape, np.nan, dtype=np.float32)
    out_grid[valid_i, valid_j] = interpolated_vals
    xco2_all[date_index, :, :] = out_grid

    out_unc = np.full(grid_shape, np.nan, dtype=np.float32)
    out_unc[valid_i, valid_j] = interpolated_unc
    oco2_uncertainty_all[date_index, :, :] = out_unc

    print(f"Filled {date_str} with {len(valid_i)} valid grid cells")

# Attach both cubes to the grid dataset and write the result.
xco2_all.name = "xco2_oco2"
ds_out["xco2_oco2"] = xco2_all
ds_out["oco2_uncertainty"] = oco2_uncertainty_all
ds_out.to_netcdf(output_nc_path)
output_nc_path


In [None]:
import numpy as np
import xarray as xr
from datetime import datetime
from glob import glob
from collections import defaultdict
import os
from tqdm import tqdm

# --- Inputs and output of the OCO-3 gridding pass ---
# NOTE: the variable names below still say "oco2", but in this cell they
# hold the OCO-3 folder and the OCO-3 uncertainty cube.
oco2_folder = "../OCO3-HaoHu/"
grid_nc_path = "global_grid_0.1_2019_2025_OCO-2.nc"
output_nc_path = "global_grid_0.1_2019_2025_OCO-2_3.nc"

# Date window of the run.
_DATE_FMT = "%Y-%m-%d"
start_date = datetime.strptime("2019-01-01", _DATE_FMT)
end_date = datetime.strptime("2025-07-01", _DATE_FMT)

# Target grid: raw coordinate arrays plus the length of every axis.
ds_out = xr.open_dataset(grid_nc_path)
lat, lon, time_all = (ds_out[axis].values for axis in ("lat", "lon", "time"))
time_len, lat_len, lon_len = (len(ds_out[axis]) for axis in ("time", "lat", "lon"))


def _nan_cube(var_name):
    """Return a NaN-filled float32 (time, lat, lon) DataArray on the grid of ``ds_out``."""
    return xr.DataArray(
        np.full((time_len, lat_len, lon_len), np.nan, dtype=np.float32),
        coords={axis: ds_out[axis] for axis in ("time", "lat", "lon")},
        dims=["time", "lat", "lon"],
        name=var_name,
    )


# Output cubes, filled one day at a time further down.
xco2_all = _nan_cube("xco2_oco3")
oco2_uncertainty_all = _nan_cube("oco3_uncertainty")

# Grid spacing taken from the first two coordinate values.
lat_res = lat[1] - lat[0]
lon_res = lon[1] - lon[0]
grid_shape = (len(lat), len(lon))
# 0.05 degrees expressed in whole grid steps (truncated): the number of extra
# cells marked on each side of the cell an observation falls in.
half_box = int(0.05 / lat_res)

def fast_idw(lat0s, lon0s, obs_lat, obs_lon, obs_val, radius):
    """Inverse-distance-weighted mean of nearby observations at each target point.

    For every target ``(lat0s[k], lon0s[k])`` the observations whose distance
    ``sqrt(dlat**2 + dlon**2)`` is at most ``radius`` are averaged with
    weights ``1 / (distance**2 + 1e-12)``. Distances are computed directly on
    the coordinate values (same units as ``radius``); there is no
    great-circle correction and no wrap-around in longitude.

    Parameters
    ----------
    lat0s, lon0s : array-like, 1-D, equal length
        Coordinates of the target points.
    obs_lat, obs_lon, obs_val : array-like, 1-D, equal length
        Coordinates and values of the observations.
    radius : float
        Search radius around each target point.

    Returns
    -------
    numpy.ndarray
        float32 array shaped like ``lat0s``; NaN where no usable observation
        lies within ``radius``.

    Notes
    -----
    Observations with a non-finite value or coordinate are ignored, so a
    single NaN value inside the radius no longer turns the estimate into NaN.
    """
    obs_lat = np.asarray(obs_lat)
    obs_lon = np.asarray(obs_lon)
    obs_val = np.asarray(obs_val)

    # Drop unusable observations once, up front, rather than per target point.
    usable = np.isfinite(obs_val) & np.isfinite(obs_lat) & np.isfinite(obs_lon)
    if not usable.all():
        obs_lat, obs_lon, obs_val = obs_lat[usable], obs_lon[usable], obs_val[usable]

    results = np.full_like(lat0s, np.nan, dtype=np.float32)
    if obs_val.size == 0:
        return results

    for idx, (lat0, lon0) in enumerate(zip(lat0s, lon0s)):
        d = np.sqrt((obs_lat - lat0)**2 + (obs_lon - lon0)**2)
        mask = d <= radius
        if not np.any(mask):
            continue
        # The 1e-12 term keeps the weight finite when an observation
        # coincides with the target point.
        w = 1 / (d[mask]**2 + 1e-12)
        results[idx] = np.sum(w * obs_val[mask]) / np.sum(w)
    return results

# Index every *.nc4 file in the input folder by its acquisition day
# ("YYYY-MM-DD"), keeping only days inside [start_date, end_date].
# (In this cell ``oco2_folder`` points at the OCO-3 directory.)
files = sorted(glob(os.path.join(oco2_folder, "*.nc4")))
files_by_day = defaultdict(list)

for f in files:
    try:
        # The third "_"-separated token of the file name is the date as
        # yymmdd, e.g. '190101'.
        stamp = os.path.basename(f).split("_")[2]
        day = datetime.strptime(stamp, "%y%m%d")
        if start_date <= day <= end_date:
            files_by_day[day.strftime("%Y-%m-%d")].append(f)
    except Exception as e:
        # A name that does not follow the pattern is reported and skipped.
        print(f"Error parsing date for file {f}: {e}")

# Map each "YYYY-MM-DD" label to its position on the grid's time axis once,
# instead of re-formatting the whole time axis for every processed day.
date_to_index = {}
for k, day_label in enumerate(np.datetime_as_string(time_all, unit='D')):
    date_to_index.setdefault(str(day_label), k)  # keep the first match

n_lat, n_lon = grid_shape
box_offsets = range(-half_box, half_box + 1)

for date_str, filelist in tqdm(files_by_day.items(), desc="Processing days"):
    date_index = date_to_index.get(date_str)
    if date_index is None:
        print(f"Date {date_str} not in daily grid, skipping.")
        continue

    # Soundings of the day with quality flag 0 and finite xco2, pooled over
    # all of the day's files.
    lat_all, lon_all, xco2_all_day, unc_all_day = [], [], [], []
    for file in filelist:
        try:
            with xr.open_dataset(file) as ds:
                xco2_raw = ds["xco2"].values
                mask = (ds["xco2_quality_flag"].values == 0) & np.isfinite(xco2_raw)
                # Read every field before appending anything: a failure
                # half-way through a file must not leave the four lists
                # with different lengths.
                file_lat = ds["latitude"].values[mask]
                file_lon = ds["longitude"].values[mask]
                file_xco2 = xco2_raw[mask]
                file_unc = ds["xco2_uncertainty"].values[mask]
        except Exception as e:
            # Best effort: report the unreadable file and carry on.
            print(f"Skipping file {file}: {e}")
            continue
        lat_all.append(file_lat)
        lon_all.append(file_lon)
        xco2_all_day.append(file_xco2)
        unc_all_day.append(file_unc)

    if not lat_all:
        continue

    lats = np.concatenate(lat_all)
    lons = np.concatenate(lon_all)
    xco2_vals = np.concatenate(xco2_all_day)
    unc_vals = np.concatenate(unc_all_day)

    # lat[0] / lon[0] are used as cell centres (the interpolation below is
    # evaluated at lat[i], lon[j]), so the cell containing an observation is
    # the one with the NEAREST centre: shift by half a cell before flooring.
    # Plain truncation would pick the cell half a step to the south-west.
    lat_idx = np.floor((lats - lat[0]) / lat_res + 0.5).astype(int)
    lon_idx = np.floor((lons - lon[0]) / lon_res + 0.5).astype(int)

    # Mark the (2 * half_box + 1)^2 cells around every in-grid observation.
    # Offsets are clipped to the grid; a clipped index always stays inside
    # the observation's own box, so no cell outside the box is marked.
    in_grid = (lat_idx >= 0) & (lat_idx < n_lat) & (lon_idx >= 0) & (lon_idx < n_lon)
    rows, cols = lat_idx[in_grid], lon_idx[in_grid]
    valid_mask = np.zeros(grid_shape, dtype=bool)
    for di in box_offsets:
        shifted_rows = np.clip(rows + di, 0, n_lat - 1)
        for dj in box_offsets:
            valid_mask[shifted_rows, np.clip(cols + dj, 0, n_lon - 1)] = True

    valid_i, valid_j = np.where(valid_mask)
    if len(valid_i) == 0:
        print(f"No valid grid cells found for {date_str}")
        continue

    lat0s = lat[valid_i]
    lon0s = lon[valid_j]

    # Same geometry for both fields: value and its reported uncertainty.
    interpolated_vals = fast_idw(lat0s, lon0s, lats, lons, xco2_vals, radius=0.2)
    interpolated_unc = fast_idw(lat0s, lon0s, lats, lons, unc_vals, radius=0.2)

    out_grid = np.full(grid_shape, np.nan, dtype=np.float32)
    out_grid[valid_i, valid_j] = interpolated_vals
    xco2_all[date_index, :, :] = out_grid

    out_unc = np.full(grid_shape, np.nan, dtype=np.float32)
    out_unc[valid_i, valid_j] = interpolated_unc
    # Despite its name, this cube holds the OCO-3 uncertainty in this cell.
    oco2_uncertainty_all[date_index, :, :] = out_unc

    print(f"Filled {date_str} with {len(valid_i)} valid grid cells")

# Attach both cubes to the dataset (which already carries the OCO-2 fields
# read from grid_nc_path) and write the result.
xco2_all.name = "xco2_oco3"
ds_out["xco2_oco3"] = xco2_all
ds_out["oco3_uncertainty"] = oco2_uncertainty_all
ds_out.to_netcdf(output_nc_path)
output_nc_path


## Merge OCO2 and OCO3

In [None]:
import xarray as xr
import numpy as np
import pandas as pd
from tqdm import tqdm

# Gridded OCO-2 and OCO-3 fields written by the two resampling cells.
ds = xr.open_dataset("global_grid_0.1_2019_2025_OCO-2_3.nc")

xco2_oco2, xco2_oco3 = ds.get("xco2_oco2"), ds.get("xco2_oco3")
uncertainty_oco2, uncertainty_oco3 = ds.get("oco2_uncertainty"), ds.get("oco3_uncertainty")

combined_xco2 = xr.full_like(xco2_oco2, fill_value=np.nan)

# Merge one time slice at a time.
n_steps = xco2_oco2.sizes["time"]
for t in tqdm(range(n_steps), desc="Combining xco2"):
    x2, x3 = xco2_oco2.isel(time=t), xco2_oco3.isel(time=t)
    u2, u3 = uncertainty_oco2.isel(time=t), uncertainty_oco3.isel(time=t)

    has2, has3 = np.isfinite(x2), np.isfinite(x3)
    both_valid = has2 & has3

    # Where both sensors report a value: mean weighted by 1 / uncertainty**2
    # (the 1e-6 offset keeps the weight defined for a zero uncertainty).
    w2 = 1 / (u2 + 1e-6)**2
    w3 = 1 / (u3 + 1e-6)**2
    weighted = (x2 * w2 + x3 * w3) / (w2 + w3)

    # Elsewhere: whichever sensor is finite, NaN when neither is.
    single_sensor = x2.where(has2, x3.where(has3))
    result = weighted.where(both_valid, single_sensor)

    combined_xco2[t] = result

out_ds = xr.Dataset({"xco2": combined_xco2})
out_ds.to_netcdf("global_grid_0.1_2019_2025_xco2.nc")

## print the number of non-NaN

In [None]:
import xarray as xr
import numpy as np

# Report how many grid values each sensor contributed.
ds = xr.open_dataset("global_grid_0.1_2019_2025_OCO-2_3.nc")
xco2_oco3, xco2_oco2 = ds["xco2_oco3"], ds["xco2_oco2"]

# DataArray.count() tallies the non-NaN entries over all dimensions.
for label, field in (("OCO-3   non-NaN", xco2_oco3), ("OCO-2   non-NaN:", xco2_oco2)):
    print(label, field.count().item())


In [None]:
import xarray as xr
import numpy as np


# Compare the two sensors on the cells/days where both have a value.
ds = xr.open_dataset("global_grid_0.1_2019_2025_OCO-2_3.nc")
xco2_oco2, xco2_oco3 = ds["xco2_oco2"], ds["xco2_oco3"]

# True where neither field is NaN (same time, same cell).
mask = ~(np.isnan(xco2_oco2) | np.isnan(xco2_oco3))

# Number of grid cells that have a co-located pair on at least one day.
mask_anytime = mask.any(dim="time")
count = mask_anytime.sum().item()

# OCO-2 minus OCO-3, dims (time, lat, lon), kept only on co-located samples.
diff = xco2_oco2 - xco2_oco3
valid_diff = diff.where(mask)
mean_diff = valid_diff.mean().item()
std_diff = valid_diff.std().item()

print(f"同时有xco2_oco2和xco2_oco3的网格点数量: {count}")
print(f"OCO-2 和 OCO-3 的XCO2平均差值: {mean_diff:.4f}")
print(f"OCO-2 和 OCO-3 的XCO2标准差: {std_diff:.4f}")

# Step 3: Resample NO2

In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import os
from glob import glob
from tqdm import tqdm

input_folder = "/data3/interns/NRT_CO2_Emission_Map_Project/HaoHu_work/TROPOMI-NO2_HaoHu/"
target_grid_nc = "global_grid_0.1_2019_2025.nc"
output_nc = "global_grid_0.1_2019_2025_NO2.nc"

# Target grid onto which every daily NO2 field is interpolated.
target_grid = xr.open_dataset(target_grid_nc)
target_lat = target_grid['lat']
target_lon = target_grid['lon']
target_time = target_grid['time'].values

file_list = sorted(glob(os.path.join(input_folder, "EOSDISNO2_20????_15d.nc")))

# Month key "YYYYMM" (first six characters of the second "_"-separated token
# of the file name) -> path of that month's file.
file_dict = {}
for file in file_list:
    filename = os.path.basename(file)
    year_month = filename.split('_')[1][:6]  # "202001"
    file_dict[year_month] = file

NO2_all = xr.DataArray(
    np.full((len(target_time), len(target_lat), len(target_lon)), np.nan, dtype=np.float32),
    coords={"time": target_time, "lat": target_lat, "lon": target_lon},
    dims=["time", "lat", "lon"],
    name="NO2"
)

# The monthly file is opened once per month and closed when the loop moves
# on, instead of being re-opened (and never closed) for every single day.
loaded_month = None   # "YYYYMM" key of the file currently held in `ds`
month_handle = None   # object returned by xr.open_dataset; the one to close
ds = None             # month dataset with a dated "time" axis
try:
    for i, time_point in enumerate(tqdm(target_time, desc="Interpolating NO2")):
        yyyymm = pd.to_datetime(str(time_point)).strftime("%Y%m")
        if yyyymm not in file_dict:
            continue

        if yyyymm != loaded_month:
            if month_handle is not None:
                month_handle.close()
            loaded_month, month_handle, ds = None, None, None

            month_handle = xr.open_dataset(file_dict[yyyymm])
            ds = month_handle
            if "nday" in ds.dims:
                ds = ds.rename({"nday": "time"})

            # Label the day axis with consecutive dates starting on the
            # first day of the month.
            start_time = np.datetime64(f"{yyyymm[:4]}-{yyyymm[4:]}-01")
            n_days = ds.sizes['time']
            ds['time'] = start_time + np.arange(n_days)
            loaded_month = yyyymm

        # The file may hold fewer days than the calendar month.
        if time_point not in ds['time']:
            continue

        no2_day = ds['NO2'].sel(time=time_point)
        if "latitude" in no2_day.dims and "longitude" in no2_day.dims:
            no2_day = no2_day.rename({"latitude": "lat", "longitude": "lon"})

        interp_day = no2_day.interp(lat=target_lat, lon=target_lon, method="linear")

        # NOTE(review): besides replacing negative values, this also turns
        # NaN (missing data, or targets outside the source grid) into 0,
        # because `NaN >= 0` is False. Kept as is; confirm this is intended.
        interp_day = interp_day.where(interp_day >= 0, 0)

        NO2_all[i, :, :] = interp_day.values
finally:
    if month_handle is not None:
        month_handle.close()

out_ds = xr.Dataset({"NO2": NO2_all})
out_ds.to_netcdf(output_nc)

output_nc


# Step 4: Weekend or Weekday

In [None]:
import xarray as xr
import pandas as pd
import numpy as np
from tqdm import tqdm

input_nc = "global_grid_0.1_2019_2025.nc"
output_nc = "global_grid_0.1_2019_2025_weekday_weekend.nc"

ds = xr.open_dataset(input_nc)
time, lat, lon = ds['time'], ds['lat'], ds['lon']

# One 0/1 flag per day. Timestamp.weekday() counts Monday as 0, so values
# of 5 and above are Saturday and Sunday.
weekend_flags = []
for t in tqdm(time.values, desc="whether weekend"):
    weekend_flags.append(int(pd.Timestamp(t).weekday() >= 5))

_FLAG_MEANING = '1 = weekend (Saturday or Sunday), 0 = weekday'

is_weekend_1d = xr.DataArray(
    weekend_flags,
    dims='time',
    coords={'time': time},
    name='is_weekend_flag',
    attrs=dict(
        description=_FLAG_MEANING,
        long_name='Weekend Indicator (1D)',
        units='1',
    ),
)

# Repeat the daily flag over every (lat, lon) cell so it has the same
# (time, lat, lon) layout as the other gridded variables.
is_weekend_3d, _, _ = xr.broadcast(is_weekend_1d, lat, lon)
is_weekend_3d.name = 'is_weekend'
is_weekend_3d.attrs = dict(
    description=_FLAG_MEANING,
    long_name='Weekend Indicator (broadcasted to 3D)',
    units='1',
)

ds['is_weekend'] = is_weekend_3d
ds.to_netcdf(output_nc)
print(f"✅ Save: {output_nc}")