# save synthetic float data for mapping as netcdf
#
# Note: Here, instead of requiring that a float stays within a region for the whole chosen time period, use all profiles in the region in the chosen time window
#
# USER SETTINGS: 
# see second cell in script
# the following can be set by user: years, depth levels, temporal frequency, region, variables
# (variables can be defined by user but number (3) is currently hard-coded (default: temperature, salinity & oxygen)
# there is also the option to randomly subselect a certain percentage of the data
# all variables are interpolated to the chosen depth levels
# 

In [1]:

import sys
sys.path.append('/global/homes/c/cnissen/scripts/seawater-3.3.4/seawater/')
import os
import glob
import numpy as np
import xarray as xr
import cartopy
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
from matplotlib import cm
#from seawater import dist
#import seawater as sw
import matplotlib.path as mpath
from cartopy.util import add_cyclic_point
import matplotlib.gridspec as gridspec
import matplotlib.ticker as mticker
from cartopy.mpl.ticker import (LongitudeFormatter, LatitudeFormatter,
                                LatitudeLocator)
from numba import njit
import time
from statsmodels.stats.weightstats import DescrStatsW
from tqdm import tqdm
from math import sin, cos, sqrt, atan2, radians
from datetime import datetime, timedelta
from tqdm import tqdm
from scipy import interpolate
from netCDF4 import Dataset, MFDataset
import random


In [2]:
#-----
# some user-defined settings
#-----
# NOTE: the location of the E3SM float data is currently hard-coded further down in the script

# where to store the resulting *nc file?
savepath     = '/global/homes/k/kefalc/code/data_files/Final/'
# check existence of paths
if not os.path.exists(savepath):
    print ('Created '+savepath)
    os.makedirs(savepath)

# where to store the final plot showing the distirbution of the data?
savepath_plots     = '/global/homes/k/kefalc/code/plots/'
# check existence of paths
if not os.path.exists(savepath_plots):
    print ('Created '+savepath_plots)
    os.makedirs(savepath_plots)
    
# optional: instead of talking all available profiles for a given region and time period, 
# the user can here select to only store xx% of all available profiles
# NOTE: the temporal distribution is kept unchanged, i.e., xx% of all profiles are chosen for each month and each year
reduce_floats = True
keep_how_many = 30 # in percent

# choose how often a float should sample (--> one sample every xx days)
# NOTE: the script further down assumes that there are 6 samples per day in E3SM output
xx_daily = 10 # 10-daily sampling

# define name of the region
# NOTE: this string needs to be defined further down in script to restrict the profiles correctly
# currently, I have defined different versions of the SO
region_string = 'SO_30S' # used in filenames and to restrict data to a specific region

# choose years
year1,year2 = 2010,2019  # currently, we have E3SM data from 1980-2019
print('year1,year2:',year1,year2)

# choose a list of target depth levels, interpolate (depth levels must be negative as in E3SM files)
# NOTE: extrapolation is currently set to "False" in script below
#   i.e., that will not be any data when it is shallower than maximum depth chosen here
#   further, the shallowest level might also be NaN when it is shallower than the shallowest level available for E3SM float
depth_interp=np.array([-5,-10,-100,-250,-500,-1000,-1500,-3000])
print('depth_interp',depth_interp)
    
# variables to process
# NOTE: while three variables are currently hard-coded in the script, the variables themselves are interchangeable
vari_temp = 'particleColumnTemperature'
vari_salt = 'particleColumnSalinity'
vari_oxy  = 'particleColumnO2'
# settings for writing the chosen variables into the final nc file
vari1_nc = 'temperature'
vari2_nc = 'salinity'
vari3_nc = 'oxygen'
unit1    = 'degC'
unit2    = 'ppt'
unit3    = 'mmol m-3'


year1,year2: 2010 2019
depth_interp [   -5   -10  -100  -250  -500 -1000 -1500 -3000]


In [3]:
#---
# some functions... 
#---

# convert day/month to day_of_the_year
# functions from here: https://stackoverflow.com/questions/620305/convert-year-month-day-to-day-of-year-in-python

def is_leap_year(year):
    """ if year is a leap year return True
        else return False """
    if year % 100 == 0:
        return year % 400 == 0
    return year % 4 == 0

def doy(Y,M,D):
    """ given year, month, day return day of year
        Astronomical Algorithms, Jean Meeus, 2d ed, 1998, chap 7 """
    if is_leap_year(Y):
        K = 1
    else:
        K = 2
    N = int((275 * M) / 9.0) - K * int((M + 9) / 12.0) + D - 30
    return N

def ymd(Y,N):
    """ given year = Y and day of year = N, return year, month, day
        Astronomical Algorithms, Jean Meeus, 2d ed, 1998, chap 7 """    
    if is_leap_year(Y):
        K = 1
    else:
        K = 2
    M = int((9 * (K + N)) / 275.0 + 0.98)
    if N < 32:
        M = 1
    D = N - int((275 * M) / 9.0) + K * int((M + 9) / 12.0) + 30
    return Y, M, D


In [4]:
#----
# load E3SM float data (positions only for now)
#----
# lat,lon
# construct arrays with year, day_of_the_year, hour

path1 = '/global/cfs/cdirs/m4003/maltrud/southernOceanRefined/nonProfilingFloats/'
year_list = np.arange(year1,year2+1,1) 
print(year_list)

rad_to_deg = 180.0/np.pi

hours_e3sm = np.asarray(len(year_list)*365*[0,4,8,12,16,20]) # num_years * 365 * 4-hourly sampling
doy_e3sm   = np.sort(np.asarray(6*list(np.arange(1,365+1,1)))) # repeat each day 6 times (6 samples per day)
doy_e3sm   = np.asarray(len(year_list)*list(doy_e3sm))
years_e3sm = np.sort(np.asarray(365*6*list(year_list)))
print(hours_e3sm.shape,doy_e3sm.shape,years_e3sm.shape)

for yy in tqdm(range(0,len(year_list))):
    
    f1   = xr.open_dataset(path1+'nonProfilingFloats.year'+str(year_list[yy])+'all.nc')
    lat1 = f1['latParticle'].values*rad_to_deg
    lon1 = f1['lonParticle'].values*rad_to_deg
    floatid1 = f1['indexToParticleID'].astype(int).values
    
    f1.close()
    
    if yy==0:
        lat_all = lat1
        lon_all = lon1
        floatid_all = floatid1
    else:
        lat_all = np.concatenate((lat_all,lat1))
        lon_all = np.concatenate((lon_all,lon1))
        floatid_all = np.concatenate((floatid_all, floatid1))
    del lat1,lon1, floatid1
    print('lat_all.shape',lat_all.shape)


[2010 2011 2012 2013 2014 2015 2016 2017 2018 2019]
(21900,) (21900,) (21900,)


 10%|█         | 1/10 [00:02<00:22,  2.53s/it]

lat_all.shape (2190, 3544)


 20%|██        | 2/10 [00:04<00:19,  2.47s/it]

lat_all.shape (4380, 3544)


 30%|███       | 3/10 [00:07<00:17,  2.51s/it]

lat_all.shape (6570, 3544)


 40%|████      | 4/10 [00:10<00:15,  2.61s/it]

lat_all.shape (8760, 3544)


 50%|█████     | 5/10 [00:12<00:13,  2.61s/it]

lat_all.shape (10950, 3544)


 60%|██████    | 6/10 [00:15<00:10,  2.65s/it]

lat_all.shape (13140, 3544)


 70%|███████   | 7/10 [00:18<00:08,  2.89s/it]

lat_all.shape (15330, 3544)


 80%|████████  | 8/10 [00:22<00:06,  3.01s/it]

lat_all.shape (17520, 3544)


 90%|█████████ | 9/10 [00:26<00:03,  3.31s/it]

lat_all.shape (19710, 3544)


100%|██████████| 10/10 [00:29<00:00,  2.96s/it]

lat_all.shape (21900, 3544)





In [5]:
#----
# load E3SM files with tracer information
#----
# use the indices for all colocated E3SM float data points
# use these when reading in tracer field
#

path1 = '/global/cfs/cdirs/m4003/maltrud/southernOceanRefined/profilingFloats/'
year_list = np.arange(year1,year2+1,1) 

# get list of files for chosen years
files_all = []
for file in glob.glob(path1+"/*.nc"):
    files_all.append(file) # get all files
# reduce to files for chosen years
filelist = []
for ff in range(0,len(files_all)):
    for yy in range(0,len(year_list)):
        if str(year_list[yy]) in files_all[ff]: 
            print(year_list[yy],'...')
            try:
                filelist = np.concatenate((filelist,[files_all[ff]]))
            except: 
                filelist = [files_all[ff]]
filelist = np.sort(filelist) 
print(filelist) 

# load files of chosen years
ff = xr.open_mfdataset(filelist,concat_dim='record',combine='nested')
#print(ff)

# depth levels of floats (varying in space and time because model uses zstar coordinates!)
ff_depth = ff['particleColumnDepth']
print(ff_depth)

# temperature as sampled by floats
ff_temp = ff[vari_temp]
print(ff_temp)

# salinity as sampled by floats
ff_salt = ff[vari_salt]
print(ff_salt)

# oxygen as sampled by floats
ff_o2 = ff[vari_oxy]
print(ff_o2)


2011 ...
2010 ...
2013 ...
2015 ...
2019 ...
2017 ...
2018 ...
2012 ...
2014 ...
2016 ...
['/global/cfs/cdirs/m4003/maltrud/southernOceanRefined/profilingFloats/profilingFloats.year2010all.nc'
 '/global/cfs/cdirs/m4003/maltrud/southernOceanRefined/profilingFloats/profilingFloats.year2011all.nc'
 '/global/cfs/cdirs/m4003/maltrud/southernOceanRefined/profilingFloats/profilingFloats.year2012all.nc'
 '/global/cfs/cdirs/m4003/maltrud/southernOceanRefined/profilingFloats/profilingFloats.year2013all.nc'
 '/global/cfs/cdirs/m4003/maltrud/southernOceanRefined/profilingFloats/profilingFloats.year2014all.nc'
 '/global/cfs/cdirs/m4003/maltrud/southernOceanRefined/profilingFloats/profilingFloats.year2015all.nc'
 '/global/cfs/cdirs/m4003/maltrud/southernOceanRefined/profilingFloats/profilingFloats.year2016all.nc'
 '/global/cfs/cdirs/m4003/maltrud/southernOceanRefined/profilingFloats/profilingFloats.year2017all.nc'
 '/global/cfs/cdirs/m4003/maltrud/southernOceanRefined/profilingFloats/profilingFloats

In [6]:
#---
# REDUCE DATA in space: only keep profiles within a certain region
#---
print(lon_all.shape)

if region_string=='SO_30S':
    latmin = -90 #-69.966
    latmax = -30 #-40.00035
    lonmin = 0 #-149.99954
    lonmax = 360 #-120.00199890136719
elif region_string=='SO_40S':
    latmin = -90 #-69.966
    latmax = -40 #-40.00035
    lonmin = 0 #-149.99954
    lonmax = 360 #-120.00199890136719
elif region_string=='SO_50S':
    latmin = -90 #-69.966
    latmax = -50 #-40.00035
    lonmin = 0 #-149.99954
    lonmax = 360 #-120.00199890136719
elif region_string=='SO_60S':
    latmin = -90 #-69.966
    latmax = -60 #-40.00035
    lonmin = 0 #-149.99954
    lonmax = 360 #-120.00199890136719
elif region_string=='SO_Test':
    latmin = -65
    latmax = -35
    lonmin = 215 ##-145
    lonmax = 235 ##-125 ##includes 5 extra degrees of lat/lon to account for border regions

# create a mask (data to keep: 1, data to discard: 0)
mask_SO = np.ones_like(lon_all)
mask_SO[lon_all>lonmax] = 0 #= np.ones_like(lon_all)
mask_SO[lon_all<lonmin] = 0 ##needed for test area but not for entire SO
mask_SO[lat_all>latmax] = 0 # set locations outside of SO to zero
mask_SO[lat_all<latmin] = 0 ##needed for test area but not for entire SO

# NOTE: no selection based on lon yet! Choose all longitudes for now.
# When selecting based on lon, carefully check min/max of lon array
print(np.min(lon_all),np.max(lon_all))


(21900, 3544)
7.225619220581268e-07 360.0000066217054


In [7]:
#---
# REDUCE DATA in time: 
#---
# create a mask to select only certain data for each float (speed up the data selection and interpolation in next step)

#---
# XX-daily, e.g., 10-daily: don't sample all floats on day 1,11,21 etc (instead, sample some at day 5,15,25, others at 2,12,22 etc)
#---
# account for the fact that we have multiple float observations per day in SO run!

nn = xx_daily*6   # output is 4-hourly for SO run (6 data points per day)
num_time   = ff_o2.shape[0]
num_depths = ff_o2.shape[1]
num_floats = ff_o2.shape[2]
print('num_time, num_depths, num_floats:',num_time,num_depths,num_floats)

# get num_floats random numbers between 0-(10*6-1) (10 daily), 0-(5*6-1) (5-daily)
ind_start = np.random.choice(np.arange(0,nn), size=num_floats, replace=True)
print('min/max ind_start:',np.min(ind_start),np.max(ind_start))
print(ind_start.shape,ind_start)

# create mask
mask_time = np.zeros_like(mask_SO)
print(mask_time.shape)
for fl in tqdm(range(0,num_floats)):
    mask_time[ind_start[fl]::nn,fl] = 1


num_time, num_depths, num_floats: 21900 60 3544
min/max ind_start: 0 59
(3544,) [53 51 59 ... 31 12 25]
(21900, 3544)


100%|██████████| 3544/3544 [00:00<00:00, 285391.44it/s]


In [8]:
%%time
#---
# extract tracer data for chosen subregion from synthetic float obs
#---
# NOTE: It takes too long to do the interpolation for all data first
# Therefore, integrate the data selection step into the vertical interpolation step
#  --> only do calculation for locations within the region selected above
#  --> only do calculation for times identified above
#
# vertical interpolation is following example in "create_BGC_netcdf.ipynb" (a script Kristen gave me)

def pc_interpolation(pressure, parameter):
    interp_values = interpolate.pchip_interpolate(pressure, temp, pres_interp)
    return(interp_values)

ind_pp = num_time # I only processed some time steps for testing --> set this to num_time if all data should be processed

# loop over all float data points but only compute if both mask_SO and mask_time are 1 
temp_interp = np.nan*np.ones([num_time,len(depth_interp),num_floats]) 
salt_interp = np.nan*np.ones([num_time,len(depth_interp),num_floats]) 
o2_interp   = np.nan*np.ones([num_time,len(depth_interp),num_floats]) 
for tt in tqdm(range(0,ind_pp)):
    for fl in range(0,num_floats):
        if (mask_SO[tt,fl]==1) & (mask_time[tt,fl]==1):
            #print(fl)
            temp_aux  = ff_temp.isel(record=tt).isel(nParticles=fl).values.flatten() #adjust this for different variables
            salt_aux  = ff_salt.isel(record=tt).isel(nParticles=fl).values.flatten() #adjust this for different variables
            o2_aux    = ff_o2.isel(record=tt).isel(nParticles=fl).values.flatten() #adjust this for different variables
            depth_aux = ff_depth.isel(record=tt).isel(nParticles=fl).values.flatten() #adjust this for different variables
          #  print(temp_aux)
          #  print(depth_aux)
            # remove all missing data (honestly, I don't fully understand why some data points are -2...)
            temp_aux  = temp_aux[o2_aux!=-1] # remove all missing data (set to -1 in E3SM data)
            salt_aux  = salt_aux[o2_aux!=-1] # remove all missing data (set to -1 in E3SM data)
            o2_aux    = o2_aux[o2_aux!=-1] # remove all missing data (set to -1 in E3SM data)
            depth_aux = depth_aux[depth_aux!=-1]
            temp_aux  = temp_aux[o2_aux!=-2] # remove all missing data (set to -1 in E3SM data)
            salt_aux  = salt_aux[o2_aux!=-2] # remove all missing data (set to -1 in E3SM data)
            o2_aux    = o2_aux[o2_aux!=-2] # remove all missing data (set to -1 in E3SM data)
            depth_aux = depth_aux[depth_aux!=-2]
            temp_aux  = temp_aux[o2_aux!=0] # remove all missing data (might be 0 after above steps)
            salt_aux  = salt_aux[o2_aux!=0] # remove all missing data (might be 0 after above steps)
            o2_aux    = o2_aux[o2_aux!=0] # remove all missing data (might be 0 after above steps)
            depth_aux = depth_aux[depth_aux!=0]

          #  print(temp_aux)
          #  print(depth_aux)
            
            if len(o2_aux)>0: # only continue if there is any data to be processed
                # NOTE: I don't yet fully understand why this is necessary. 
                # Shouldn't E3SM floats have data everywhere and at all time?
                
                # temp
                pressure_o, biogeochem = zip(*sorted(zip(depth_aux, temp_aux))) # depth levels must be ascending
                pressure_o = np.asarray(pressure_o)
                biogeochem = np.asarray(biogeochem) 
                temp_interpolant    = interpolate.PchipInterpolator(pressure_o, biogeochem, extrapolate = False)
                temp_interp_values  = temp_interpolant(depth_interp)
                temp_interp[tt,:,fl]   = temp_interp_values

                # salt
                pressure_o, biogeochem = zip(*sorted(zip(depth_aux, salt_aux))) # depth levels must be ascending
                pressure_o = np.asarray(pressure_o)
                biogeochem = np.asarray(biogeochem) 
                salt_interpolant    = interpolate.PchipInterpolator(pressure_o, biogeochem, extrapolate = False)
                salt_interp_values  = salt_interpolant(depth_interp)
                salt_interp[tt,:,fl]   = salt_interp_values

                # oxygen
                pressure_o, biogeochem = zip(*sorted(zip(depth_aux, o2_aux))) # depth levels must be ascending
                pressure_o = np.asarray(pressure_o)
                biogeochem = np.asarray(biogeochem) 
                oxy_interpolant   = interpolate.PchipInterpolator(pressure_o, biogeochem, extrapolate = False)
                oxy_interp_values = oxy_interpolant(depth_interp)
                o2_interp[tt,:,fl]   = oxy_interp_values
                #print('oxy_interp_values',oxy_interp_values)

                del pressure_o, biogeochem, oxy_interpolant, oxy_interp_values
                del temp_interpolant, temp_interp_values, salt_interpolant, salt_interp_values

            del o2_aux, temp_aux, salt_aux, depth_aux


100%|██████████| 21900/21900 [2:51:41<00:00,  2.13it/s]  

CPU times: user 1h 50min 3s, sys: 24min 39s, total: 2h 14min 43s
Wall time: 2h 51min 49s





In [9]:
#----
# reorganize data arrays: kick out all locations/time without any data
#----
print(xx_daily,'daily')
print(region_string)

print('temp_interp.shape',temp_interp.shape)
print('o2_interp.shape',o2_interp.shape)
print('lat_all.shape',lat_all.shape)

print()

aux = np.copy(temp_interp[0:ind_pp,:,:]) # I only processed some time steps for testing 
aux = np.transpose(aux,[0,2,1])

aux = np.reshape(aux,[aux.shape[0]*aux.shape[1],aux.shape[2]]) # num_profile x depth
display(aux.shape)
ind  = np.nansum(aux,axis=1) # ind is 0 if no data are available
ind1 = np.where(ind!=0)[0]
display(ind1)
display(ind1.size)

# TEMPERATURE
aux = np.copy(temp_interp[0:ind_pp,:,:]) # I only processed some time steps for testing 
aux = np.transpose(aux,[0,2,1])
aux = np.reshape(aux,[aux.shape[0]*aux.shape[1],aux.shape[2]]) # num_profile x depth
# only keep profiles with data
ind  = np.nansum(aux,axis=1) # ind is 0 if no data are available
ind1 = np.where(ind!=0)[0] # temp can be negative, i.e., ind can be negative! don't search for ">0" here!
data_to_save1 = aux[ind1,:]
ind1_noNaN = np.copy(ind1) # keep for creation fo time arrays below
print('data_to_save1.shape',data_to_save1.shape)
# process lat/lon
aux_l = np.copy(lat_all[0:ind_pp,:])
aux_l = np.reshape(aux_l,[aux_l.shape[0]*aux_l.shape[1]])
data_to_save_lat = aux_l[ind1]
print('data_to_save_lat.shape',data_to_save_lat.shape)
aux_l = np.copy(lon_all[0:ind_pp,:])
aux_l = np.reshape(aux_l,[aux_l.shape[0]*aux_l.shape[1]])
data_to_save_lon = aux_l[ind1]
idl = np.copy(floatid_all[0:ind_pp,:])
idl = np.reshape(idl, [idl.shape[0]*idl.shape[1]])
data_to_save_id = idl[ind1]
print('data_to_save_lon.shape',data_to_save_lon.shape)
del ind,ind1,aux,aux_l

# SALINITY
aux = np.copy(salt_interp[0:ind_pp,:,:]) # I only processed some time steps for testing 
aux = np.transpose(aux,[0,2,1])
aux = np.reshape(aux,[aux.shape[0]*aux.shape[1],aux.shape[2]]) # num_profile x depth
# only keep profiles with data
ind  = np.nansum(aux,axis=1) # ind is 0 if no data are available
ind1 = np.where(ind!=0)[0]
data_to_save2 = aux[ind1,:]
print('data_to_save2.shape',data_to_save2.shape)
del ind,ind1,aux

# OXYGEN
aux = np.copy(o2_interp[0:ind_pp,:,:]) # I only processed some time steps for testing 
aux = np.transpose(aux,[0,2,1])
aux = np.reshape(aux,[aux.shape[0]*aux.shape[1],aux.shape[2]]) # num_profile x depth
# only keep profiles with data
ind  = np.nansum(aux,axis=1) # ind is 0 if no data are available
ind1 = np.where(ind!=0)[0]
data_to_save3 = aux[ind1,:]
print('data_to_save3.shape',data_to_save3.shape)
del ind,ind1,aux

10 daily
SO_30S
temp_interp.shape (21900, 8, 3544)
o2_interp.shape (21900, 8, 3544)
lat_all.shape (21900, 3544)



(77613600, 8)

array([    3854,     4064,     4149, ..., 77613107, 77613521, 77613589])

766306

data_to_save1.shape (766306, 8)
data_to_save_lat.shape (766306,)
data_to_save_lon.shape (766306,)
data_to_save2.shape (766306, 8)
data_to_save3.shape (766306, 8)


In [10]:
for yy in range(0,len(year_list)):

    # create dates for xx-daily float sampling chosen for the data
    months = [31,28,31,30,31,30,31,31,30,31,30,31]
    
    days0   = np.concatenate((np.arange(1,months[0]+1),np.arange(1,months[1]+1),\
                          np.arange(1,months[2]+1),np.arange(1,months[3]+1),\
                          np.arange(1,months[4]+1),np.arange(1,months[5]+1),\
                          np.arange(1,months[6]+1),np.arange(1,months[7]+1),\
                          np.arange(1,months[8]+1),np.arange(1,months[9]+1),\
                          np.arange(1,months[10]+1),np.arange(1,months[11]+1)))
    days0 = np.repeat(days0,6) # 4-hourly data
    months0 = np.concatenate((1*np.ones(months[0]),2*np.ones(months[1]),\
                            3*np.ones(months[2]),4*np.ones(months[3]),\
                            5*np.ones(months[4]),6*np.ones(months[5]),\
                            7*np.ones(months[6]),8*np.ones(months[7]),\
                            9*np.ones(months[8]),10*np.ones(months[9]),\
                            11*np.ones(months[10]),12*np.ones(months[11])))
    months0 = np.repeat(months0,6) # 4-hourly data
    years0 = year_list[yy]*np.ones([365])
    years0 = np.repeat(years0,6) # 4-hourly data
        
    if yy==0: 
        days_all   = days0
        months_all = months0
        years_all  = years0
    else:
        days_all = np.concatenate((days_all,days0))
        months_all = np.concatenate((months_all,months0))
        years_all = np.concatenate((years_all,years0))
    del days0,months0,months,years0

print(days_all.shape,days_all)
print(months_all.shape,months_all)
print(years_all.shape,years_all)

(21900,) [ 1  1  1 ... 31 31 31]
(21900,) [ 1.  1.  1. ... 12. 12. 12.]
(21900,) [2010. 2010. 2010. ... 2019. 2019. 2019.]


In [11]:
#---
# CREATE TIME INFO TO STORE IN FILE
#---
# xx-daily (see above): don't sample all floats on day 1,11,21 etc (instead, sample some on day 5,15,25, others on day 3,13,23 etc)

year_list = np.arange(year1,year2+1,1) 

# creat day/month info for all years
for yy in range(0,len(year_list)):

    # create dates for xx-daily float sampling chosen for the data
    months = [31,28,31,30,31,30,31,31,30,31,30,31]
    
    days0   = np.concatenate((np.arange(1,months[0]+1),np.arange(1,months[1]+1),\
                          np.arange(1,months[2]+1),np.arange(1,months[3]+1),\
                          np.arange(1,months[4]+1),np.arange(1,months[5]+1),\
                          np.arange(1,months[6]+1),np.arange(1,months[7]+1),\
                          np.arange(1,months[8]+1),np.arange(1,months[9]+1),\
                          np.arange(1,months[10]+1),np.arange(1,months[11]+1)))
    days0 = np.repeat(days0,6) # 4-hourly data
    months0 = np.concatenate((1*np.ones(months[0]),2*np.ones(months[1]),\
                            3*np.ones(months[2]),4*np.ones(months[3]),\
                            5*np.ones(months[4]),6*np.ones(months[5]),\
                            7*np.ones(months[6]),8*np.ones(months[7]),\
                            9*np.ones(months[8]),10*np.ones(months[9]),\
                            11*np.ones(months[10]),12*np.ones(months[11])))
    months0 = np.repeat(months0,6) # 4-hourly data
    years0 = year_list[yy]*np.ones([365])
    years0 = np.repeat(years0,6) # 4-hourly data
        
    if yy==0: 
        days_all   = days0
        months_all = months0
        years_all  = years0
    else:
        days_all = np.concatenate((days0,days_all))
        months_all = np.concatenate((months0,months_all))
        years_all = np.concatenate((years_all, years0))
    del days0,months0,months,years0
    
print(days_all.shape,days_all)
print(months_all.shape,months_all)
print(years_all.shape,years_all)

(21900,) [ 1  1  1 ... 31 31 31]
(21900,) [ 1.  1.  1. ... 12. 12. 12.]
(21900,) [2010. 2010. 2010. ... 2019. 2019. 2019.]


In [12]:
# adapt shape -> include dimension for floats
days_all   = np.tile(days_all,[num_floats,1]).transpose() # all floats! (also outside of target region)
months_all = np.tile(months_all,[num_floats,1]).transpose()
years_all  = np.tile(years_all,[num_floats,1]).transpose()
print(days_all.shape,months_all.shape,years_all.shape)

# apply time mask to select the correct time entries and reduce arrays to the exisitng profiles
days_all   = np.multiply(days_all,mask_time)
months_all = np.multiply(months_all,mask_time)
years_all  = np.multiply(years_all,mask_time)

# reduce TIME arrays
# days
aux = np.copy(days_all[0:ind_pp,:]) # I only processed some time steps for testing 
aux = np.reshape(aux,[aux.shape[0]*aux.shape[1]]) # num_profile x depth
days_all = aux[ind1_noNaN]
print('days_all.shape',days_all.shape)
del aux
# months
aux = np.copy(months_all[0:ind_pp,:]) # I only processed some time steps for testing 
aux = np.reshape(aux,[aux.shape[0]*aux.shape[1]]) # num_profile x depth
months_all = aux[ind1_noNaN]
print('months_all.shape',months_all.shape)
del aux
# years
aux = np.copy(years_all[0:ind_pp,:]) # I only processed some time steps for testing 
aux = np.reshape(aux,[aux.shape[0]*aux.shape[1]]) # num_profile x depth
years_all = aux[ind1_noNaN]
print('years_all.shape',years_all.shape)
del aux

# get time in ns
time_in_ns = np.nan*np.ones([days_all.shape[0]]) # num_profiles
for tt in range(0,days_all.shape[0]):
    if ~np.isnan(months_all[tt]):
        # input datetime
        dt = datetime(int(years_all[tt]), int(months_all[tt]), int(days_all[tt]), 12, 0) # yy-mm-dd-hh-min
        # epoch time
        epoch_time = datetime(1970, 1, 1)
        # subtract Datetime from epoch datetime
        time_in_ns[tt] = ((dt - epoch_time).total_seconds())*1e9 # store as "ns since epoch_time)
        if (tt==0):
            print('for tt=0: Datetime to nano seconds since epoch:', time_in_ns[tt])
print(time_in_ns.shape)
#time_in_ns       = np.tile(time_in_ns,[len(ind_zz),1])
#time_in_ns       = np.expand_dims(time_in_ns,axis=0)
#print(time_in_ns.shape)   
print('min/max time:',np.min(time_in_ns),np.max(time_in_ns))
print('min/max years:',np.min(years_all),np.max(years_all))

(21900, 3544) (21900, 3544) (21900, 3544)
days_all.shape (766306,)
months_all.shape (766306,)
years_all.shape (766306,)
for tt=0: Datetime to nano seconds since epoch: 1.2623472e+18
(766306,)
min/max time: 1.2623472e+18 1.5777936e+18
min/max years: 2010.0 2019.0


In [13]:
#----
# OPTIONAL: further reduce the data coverage
# REDUCE FLOATS BY PROFILE (for optimized distribution)
#----
# e.g., keep only 50% of all available profiles
# NOTE: keep distribution of data across months, i.e., for each year and month, randomly select half of the data points to keep
print(region_string)

# this is set at the top of the script
# however, once all the data is loaded, the user might want to quickly save a variety of distributions
# for that, uncomment the two lines below and re-run the code from here until the end for as many times as necessary
# (no variables are overwritten, any variables stored in final nc file are defined here)
reduce_floats = False
keep_how_many = 15 # in percent

if reduce_floats:
    print('Keep '+str(keep_how_many)+'% of all profiles')

    try: 
        del ind_sel
    except: 
        pass
    for yy in year_list:
        for mm in range(1,12+1):
            # all indices for current year & month
            ind_xx = np.where(np.asarray(months_all==mm) & np.asarray(years_all==yy))[0]
            # randomly select half
            ind_random = np.random.choice(np.arange(0,len(ind_xx)), size=int(ind_xx.shape[0]/(100/keep_how_many)),replace=False)
            # collect indices to keep
            try: 
                ind_sel = np.concatenate((ind_sel,ind_xx[ind_random]))
            except: # the first time
                ind_sel = ind_xx[ind_random]
            #print(ind_sel.shape)
            del ind_xx,ind_random

    print(data_to_save1.shape)
    print(data_to_save2.shape)
    print(data_to_save3.shape)
    print(data_to_save_lat.shape)
    print(data_to_save_lon.shape)
    print(time_in_ns.shape)
    print(data_to_save_id.shape)

    data_to_save1b    = data_to_save1[ind_sel,:]
    data_to_save2b    = data_to_save2[ind_sel,:]
    data_to_save3b    = data_to_save3[ind_sel,:]
    data_to_save_latb = data_to_save_lat[ind_sel]
    data_to_save_lonb = data_to_save_lon[ind_sel]
    time_in_nsb       = time_in_ns[ind_sel]
    data_to_save_idb   = data_to_save_id[ind_sel]
    
else: 
    print('Keep all profiles')
    data_to_save1b    = data_to_save1
    data_to_save2b    = data_to_save2
    data_to_save3b    = data_to_save3
    data_to_save_latb = data_to_save_lat
    data_to_save_lonb = data_to_save_lon
    time_in_nsb       = time_in_ns
    data_to_save_idb  = data_to_save_id
    
print(data_to_save1b.shape)
print(data_to_save2b.shape)
print(data_to_save3b.shape)
print(data_to_save_latb.shape)
print(data_to_save_lonb.shape)
print(time_in_nsb.shape)
print(data_to_save_idb.shape)


SO_30S
Keep all profiles
(766306, 8)
(766306, 8)
(766306, 8)
(766306,)
(766306,)
(766306,)
(766306,)


In [14]:
#-----
# save as netcdf file: field at several depth levels (for testing of Kristen's routines)
#-----
# I a mimicking the data format of the example file Kristen used

fv = np.nan #-999
    
save_netcdf_multiple_depths = True
if save_netcdf_multiple_depths:
    if reduce_floats:
        reduced_where = '_keep_random_'+str(keep_how_many)+'perc'
        netcdf_name = 'floats_years_'+str(year_list[0])+'_'+str(year_list[-1])+\
                    '_'+vari1_nc+'_'+vari2_nc+'_'+vari3_nc+'_'+str(xx_daily)+'daily_'+region_string+reduced_where+'.nc'
    else:
        netcdf_name = 'floats_years_'+str(year_list[0])+'_'+str(year_list[-1])+\
                    '_'+vari1_nc+'_'+vari2_nc+'_'+vari3_nc+'_'+str(xx_daily)+'daily_'+region_string+'.nc'
    
    if not os.path.exists(savepath+netcdf_name):
        print('Create file '+savepath+netcdf_name)
        w_nc_fid = Dataset(savepath+netcdf_name, 'w', format='NETCDF4_CLASSIC')
        w_nc_fid.contact = 'Kristen Falcinelli, kefalc@uw.edu'
        w_nc_fid.source_file = path1
        w_nc_fid.script    = '/global/homes/k/kefalc/code/gpr-mapping-data/save_floats_E3SM_mapping_idealized_distributions.ipynb'
        # create dimension & variable
        w_nc_fid.createDimension('profile', data_to_save1b.shape[0]) 
        w_nc_fid.createDimension('pressure', data_to_save1b.shape[1]) 
        w_nc_var1 = w_nc_fid.createVariable(vari1_nc, 'f8',('pressure','profile'),fill_value=fv)
        w_nc_var1.description = str(xx_daily)+'-daily '+vari_temp+' on synthetic E3SM floats for the years '+\
        str(year_list[0])+'-'+str(year_list[-1])
        w_nc_var1.units = unit1
        w_nc_var1.coordinates = "lat lon time floatid"
        w_nc_var1 = w_nc_fid.createVariable(vari2_nc, 'f8',('pressure','profile'),fill_value=fv)
        w_nc_var1.description = str(xx_daily)+'-daily '+vari_salt+' on synthetic E3SM floats for the years '+\
                    str(year_list[0])+'-'+str(year_list[-1])
        w_nc_var1.units = unit2
        w_nc_var1.coordinates = "lat lon time floatid"
        
##Oxygen or other bgc variable- comment out for TS steps.
        w_nc_var1 = w_nc_fid.createVariable(vari3_nc, 'f8',('pressure','profile'),fill_value=fv)
        w_nc_var1.description = str(xx_daily)+'-daily '+vari_oxy+' on synthetic E3SM floats for the years '+\
                       str(year_list[0])+'-'+str(year_list[-1])
        w_nc_var1.units = unit3
        w_nc_var1.coordinates = "lat lon time floatid"
    
        w_nc_var1 = w_nc_fid.createVariable('lat', 'f8',('profile'),fill_value=fv)
        w_nc_var1.description = 'Latitude'
        w_nc_var1.units = 'deg N'
        w_nc_var1 = w_nc_fid.createVariable('lon', 'f8',('profile'),fill_value=fv)
        w_nc_var1.description = 'Longitude (-180:180)'
        w_nc_var1.units = 'deg E'
        w_nc_var1 = w_nc_fid.createVariable('pressure', 'f8',('pressure'),fill_value=fv)
        w_nc_var1.description = 'depth'
        w_nc_var1.units = 'm'
        w_nc_var1 = w_nc_fid.createVariable('time', 'f8',('profile'),fill_value=fv)
        w_nc_var1.description = 'Time'
        w_nc_var1.units = 'ns' #'ns since 1970-01-01 12:00'
        w_nc_var1 = w_nc_fid.createVariable('floatid', 'f8',('profile'),fill_value=fv)
        w_nc_var1.description = 'Float ID'
        
        w_nc_fid.close()
                 
    data_to_save1[np.isnan(data_to_save1)] = fv
    data_to_save2[np.isnan(data_to_save2)] = fv
    data_to_save3[np.isnan(data_to_save3)] = fv
    data_to_save_lat[np.isnan(data_to_save_lat)] = fv
    data_to_save_lon[np.isnan(data_to_save_lon)] = fv
    time_in_ns[np.isnan(time_in_ns)] = fv
   # data_to_save_id[np.isnan(data_to_save_id)] = fv

    # convert lon to -180 to 180
    data_to_save_lonb[data_to_save_lonb>180] = data_to_save_lonb[data_to_save_lonb>180]-360
    print('min/max lon saved:',np.min(data_to_save_lonb),np.max(data_to_save_lonb))
    
    w_nc_fid = Dataset(savepath+netcdf_name, 'r+', format='NETCDF4_CLASSIC') 
    w_nc_fid.variables[vari1_nc][:,:] = np.transpose(data_to_save1b)
    w_nc_fid.variables[vari2_nc][:,:] = np.transpose(data_to_save2b)
    w_nc_fid.variables[vari3_nc][:,:] = np.transpose(data_to_save3b)    
    w_nc_fid.variables['lat'][:]  = data_to_save_latb
    w_nc_fid.variables['lon'][:]  = data_to_save_lonb
    w_nc_fid.variables['time'][:] = time_in_nsb
    w_nc_fid.variables['pressure'][:] = depth_interp
    w_nc_fid.variables['floatid'][:] = data_to_save_id
    w_nc_fid.close()  
                
print ('done')


Create file /global/homes/k/kefalc/code/data_files/Final/floats_years_2010_2019_temperature_salinity_oxygen_10daily_SO_30S.nc
min/max lon saved: -179.99962601111304 179.99986215794988
done


In [15]:
#drop nans from the existing data set and save without NaNs
ds = xr.open_dataset('/global/homes/k/kefalc/code/data_files/Final/floats_years_2010_2019_temperature_salinity_oxygen_10daily_SO_30S.nc')
ds = ds.dropna(dim="profile", how = "any")
ds.to_netcdf('/global/homes/k/kefalc/code/data_files/Final/floats_years_2010_2019_temperature_salinity_oxygen_10daily_SO_30S_edit.nc')

In [16]:
# OPTIONAL: further reduce the data coverage
# REDUCE FLOATS BY FLOAT NUMBER
# This is better for real world situations. 

ds = xr.open_dataset('/global/homes/k/kefalc/code/data_files/Final/floats_years_2010_2019_temperature_salinity_oxygen_10daily_SO_30S_edit.nc')
floats = np.unique(ds.floatid.values)
floats = sorted(floats)
n = len(floats)

keep_how_many = 15 #in percent
keepfloats = round(n*(keep_how_many/100))
display(keepfloats)

reduced = random.sample(floats, keepfloats)

floatid = ds["floatid"].to_index()

#select indexes
index = []
for i in range(len(reduced)):
    index = np.append(index, np.where((reduced[i] == floatid)
                                     ))
# display(index_bgc)
#index=np.unique(index)
display(len(index)) #2877
display(len(np.unique(index)))
#display(index[0:100])

#Now I need to get rid of the indexes I do not need in the file. 
index = index.astype(int) ##These are the indexes we are keepign in the file!

ds_new = ds.isel(profile = index)
ds_new.to_netcdf(savepath + 'floats_years_'+str(year_list[0])+'_'+str(year_list[-1])+\
                    '_'+vari1_nc+'_'+vari2_nc+'_'+vari3_nc+'_'+str(xx_daily)+'daily_'+region_string+'keep_random_floats_'+str(keep_how_many)+'perc.nc')


294

70147

70147

In [None]:
#---
# plot a map with all available observations
#---

ms = 0.7
color1 = 'darkblue'
color2 = 'limegreen'
fs = 10

save_plots = True

fig = plt.figure(figsize=(18,7)) # x, y
ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
ax.stock_img()
ax.coastlines()
ax.set_extent([-150, -120, -90, latmax+5])
for nn in tqdm(range(0,data_to_save_lonb.shape[0])):
    plt.plot(data_to_save_lonb[nn], data_to_save_latb[nn],
                 color=color1, marker='o',markersize=ms,transform=ccrs.Geodetic())
ax.yaxis.tick_left()
#gl = ax.gridlines(crs=ccrs.PlateCarree(), draw_labels=True,
#                  linewidth=0.1, color='gray', alpha=0.5, linestyle='None')
#gl.xlabel_style = {'size': 12, 'color': 'k'}
#gl.ylabel_style = {'color': 'red', 'weight': 'bold', 'size':12}
#gl.top_labels = False
#gl.left_labels = False
ax.set_xticks([-150,-140,-130, -120],crs=ccrs.PlateCarree())
ax.set_yticks([-60, -30], crs=ccrs.PlateCarree())
lon_formatter = LongitudeFormatter(zero_direction_label=True)
lat_formatter = LatitudeFormatter()
ax.xaxis.set_major_formatter(lon_formatter)
ax.yaxis.set_major_formatter(lat_formatter)
#ax.xaxis.label.set_color('red')
#ax.yaxis.label.set_color('red')
ax.annotate(str(year1)+'-'+str(year2),xy=(0.2,0.15),\
                xycoords='axes fraction',fontsize=fs,ha='right',color='k',zorder=120,fontweight='bold')
ax.annotate(str(xx_daily)+'-daily',xy=(0.2,0.07),\
                xycoords='axes fraction',fontsize=fs,ha='right',color='k',zorder=120)#,fontweight='bold')
ax.annotate(str(data_to_save_lonb.shape[0])+' data points',xy=(0.92,0.15),\
                xycoords='axes fraction',fontsize=fs,ha='right',color='k',zorder=120,fontweight='bold')
ax.annotate('(of '+str(len(data_to_save_lon))+' in total)',xy=(0.92,0.07),\
                xycoords='axes fraction',fontsize=fs,ha='right',color='k',zorder=120)#,fontweight='bold')
if save_plots:
    dpicnt = 200
    if reduce_floats:
        reduced_where = '_keep_random_'+str(keep_how_many)+'perc'
        filename = 'Map_floats_years_'+str(year_list[0])+'_'+str(year_list[-1])+\
                    '_'+vari1_nc+'_'+vari2_nc+'_'+vari3_nc+'_'+str(xx_daily)+'daily_'+region_string+reduced_where+'.png'
    else:
        filename = 'Map_floats_years_'+str(year_list[0])+'_'+str(year_list[-1])+\
                    '_'+vari1_nc+'_'+vari2_nc+'_'+vari3_nc+'_'+str(xx_daily)+'daily_'+region_string+'.png'
    print(filename)
    #plt.savefig(savepath_plots+filename,dpi = dpicnt, bbox_inches='tight',format='png')#,transparent=True)
    
plt.show()


In [None]:
ds = xr.open_dataset('/global/homes/k/kefalc/code/data_files/floats_years_2011_2020_temperature_salinity_oxygen_10daily_SO_Test_edit.nc')
ds

In [None]:
len(np.unique(ds.floatid))

In [None]:
tplot = plt.scatter(ds.lon, ds.lat, c = ds.time)
cbar = plt.colorbar(tplot)
#plt.savefig('/global/homes/k/kefalc/code/plots/timeplot.png')