## Set up

 - Set filepaths
 - Import packages



In [1]:
# set filepaths
# Directory holding the raw input files. Later cells build full paths by
# string-concatenating a filename onto this value, so the trailing slash matters.
path_to_raw = '../data/raw/'

# import packages - general
import numpy as np
import pandas as pd

# import packages - geospatial
import xarray as xr

# import packages - plotting
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

# Read in data

In [2]:
# ERA5 files to combine; each one is looked up under path_to_raw
datafiles = ['era5_mslp_sst.nc', 'era5_sp_sst.nc','era5_500hpa.nc']

# walk through the files in order and fold them into a single dataset `ds`
for position, filename in enumerate(datafiles):
    print("Loading datafile: ", filename)
    filepath = path_to_raw + filename

    # the first file seeds the combined dataset; nothing to merge with yet
    if position == 0:
        ds = xr.open_dataset(filepath)
        continue

    # every later file is merged into what has been accumulated so far
    print("    Merging into main ds")
    ds = xr.merge([ds, xr.open_dataset(filepath)])


Loading datafile:  era5_mslp_sst.nc
Loading datafile:  era5_sp_sst.nc
    Merging into main ds
Loading datafile:  era5_500hpa.nc
    Merging into main ds


In [3]:
# check we've got what we think we should: display a preview of the merged dataset
# (presumably head() trims each dimension to its first few entries - confirm against the xarray docs)
ds.head()

## Clean Data - Check for nan values

There are no nan values except in the sst field. These likely correspond to grid cells over land, where sea surface temperature is undefined. No additional cleaning is needed here.

In [4]:
# The NaN count scans every field in full and takes a while, so it is opt-in:
# set this toggle to True to run the check.
RUN_NAN_CHECK = False

if RUN_NAN_CHECK:
    # Count the missing values in each field first, then report them together
    nan_counts = {
        field: ds[field].isnull().sum().item()
        for field in ('sst', 'msl', 'sp', 'z')
    }

    for field, count in nan_counts.items():
        print(f"NaN values in '{field}': {count}")

## Clean Data - Remove lakes and non-ocean water from SST dataset

I don't want to include these when thinking about ocean temperature.

Here I use the GEBCO bathymetry dataset to mask out the land areas.
 - First, I read in the dataset
 - Next I resample to match the spatial resolution of the ERA5 data in my xarray ds
 - Next, I apply a filter for bathymetry below sea level (-5m) to identify ocean areas.
 - Next, I set the SST field to nan everywhere outside those ocean areas, i.e. wherever the GEBCO elevation is not below the -5m threshold

In [5]:
# Read in GEBCO bathymetry dataset
gebco_path = path_to_raw + 'GEBCO_2024.nc'
gebco_ds = xr.open_dataset(gebco_path)

# Convert GEBCO longitudes to the 0-360 convention used for the ERA5 grid.
# The modulo shifts only the negative (western) longitudes by +360 and leaves
# non-negative ones untouched; adding 360 to every value would instead produce
# a 180-540 range that never overlaps longitudes 0-180.
# After wrapping, the coordinate is no longer monotonic, so sort by lon to
# restore increasing order before any interpolation along it.
# NOTE(review): assumes GEBCO longitudes run roughly -180..180 - confirm.
# NOTE(review): target longitudes that fall between the last and first GEBCO
# cell centres (e.g. exactly 0) may still lie outside the wrapped range and
# interpolate to NaN - verify at the seam.
gebco_ds = gebco_ds.assign_coords(lon=(gebco_ds.coords['lon'] % 360)).sortby('lon')

In [8]:
# regrid GEBCO elevation onto the ERA5 grid by interpolating at ERA5's lat/lon points
target_lat, target_lon = ds['lat'], ds['lon']
elevation = gebco_ds['elevation'].interp(lat=target_lat, lon=target_lon)

In [7]:

# Elevation cut-off in metres relative to sea level: grid cells whose regridded
# GEBCO elevation lies below this value are treated as ocean.
OCEAN_ELEVATION_THRESHOLD_M = -5

# True where the seabed is deeper than the threshold (ocean), False elsewhere.
# Cells where elevation is NaN compare as False and are therefore masked too.
ocean_mask = elevation < OCEAN_ELEVATION_THRESHOLD_M

# Keep SST only over ocean cells; every other cell (elevation at or above the
# threshold) is set to NaN.
ds['sst'] = ds['sst'].where(ocean_mask, other=np.nan)

## Save data now that it's cleaned

In [28]:
# write the cleaned dataset out as a NetCDF file
cleaned_path = '../data/cleaned/era5.nc'
ds.to_netcdf(cleaned_path)