# Identify missing datasets

In [1]:
## import required packages
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr
import s3fs
import pandas as pd
from datetime import datetime
import json

### Generate list of dates for complete dataset

In [2]:
## create list of all expected dates for a complete daily mean dataset
dates = pd.date_range(start="2014-01-15",end="2021-02-07").tolist()
date_list_complete = []
for i in range(len(dates)):
    date_list_complete.append(str(dates[i])[:10])

In [3]:
len(date_list_complete)

2581

For a complete dataset, there should be 2581 netCDFs, one for each day

### Create dictionary for missing dates for each dataset using processed datasets (netCDF filenames)

In [5]:
def find_missing_dates(date_list_complete, file_dates_list):
    return [string for string in date_list_complete if string not in file_dates_list]

In [6]:
def identify_missing_datasets(variables_list, s3_netcdf_dir):

    ## create list of all expected dates for a complete daily mean dataset
    dates = pd.date_range(start="2014-01-15",end="2021-02-07").tolist()
    date_list_complete = []
    for i in range(len(dates)):
        date_list_complete.append(str(dates[i])[:10])

    s3 = s3fs.S3FileSystem(anon=False)

    ## create dict for final list of variables and missing dates
    sassie_ecco_variable_gaps = dict()
    
    for variable in variables_list:
        
        ## find filenames for variable
        file_list = np.sort(s3.glob(f'{s3_netcdf_dir}{variable}/*.nc'))
    
        # construct url form of filenames
        # data_urls = ['s3://' + f for f in file_list]

        ## get list of dates from netcdf filenames
        file_dates_list = []
        for netcdf in file_list:
            file_dates_list.append(netcdf.split('/')[-1].split('_')[3])

        ## identify missing dates
        missing_dates_list = find_missing_dates(date_list_complete, file_dates_list)

        sassie_ecco_variable_gaps[variable] = missing_dates_list

        print(f'{variable} is missing {len(missing_dates_list)}')

    return sassie_ecco_variable_gaps

#### Run function for all variable directories

In [7]:
## s3 directory
s3_netcdf_dir = 's3://ecco-processed-data/SASSIE/N1/V1/HH/NETCDF/'

In [8]:
var_list_all = ["SALT_AVG_DAILY",
                "THETA_AVG_DAILY",
                "UVEL_AVG_DAILY",
                "VVEL_AVG_DAILY",
                "WVEL_AVG_DAILY",
                "KPPdiffS_AVG_DAILY",
                "KPPdiffT_AVG_DAILY",
                "KPPviscA_AVG_DAILY",
                "ETAN_AVG_DAILY",
                "PHIBOT_AVG_DAILY",
                "sIceLoad_AVG_DAILY",
                "KPPhbl_AVG_DAILY",
                "EXFaqh_AVG_DAILY",
                "EXFatemp_AVG_DAILY",
                "EXFempmr_AVG_DAILY",
                "EXFevap_AVG_DAILY",
                "EXFpreci_AVG_DAILY",
                "EXFroff_AVG_DAILY",
                "EXFqnet_AVG_DAILY",
                "EXFhl_AVG_DAILY",
                "EXFhs_AVG_DAILY",
                "EXFlwdn_AVG_DAILY",
                "EXFlwnet_AVG_DAILY",
                "EXFswdn_AVG_DAILY",
                "EXFswnet_AVG_DAILY",
                "EXFuwind_AVG_DAILY",
                "EXFtaux_AVG_DAILY",
                "EXFvwind_AVG_DAILY",
                "EXFtauy_AVG_DAILY",
                "oceFWflx_AVG_DAILY",
                "oceQnet_AVG_DAILY",
                "oceQsw_AVG_DAILY",
                "oceTAUX_AVG_DAILY",
                "oceTAUY_AVG_DAILY",
                "SFLUX_AVG_DAILY",
                "TFLUX_AVG_DAILY",
                "PHIHYD_AVG_DAILY",
                "PHIHYDcR_AVG_DAILY",
                "RHOAnoma_AVG_DAILY",
                "SIatmFW_AVG_DAILY",
                "SIatmQnt_AVG_DAILY",
                "SIarea_AVG_DAILY",
                "SIheff_AVG_DAILY",
                "SIhsnow_AVG_DAILY",
                "SIuice_AVG_DAILY",
                "SIvice_AVG_DAILY",
                "ADVr_SLT_AVG_DAILY",
                "ADVr_TH_AVG_DAILY",
                "ADVxHEFF_AVG_DAILY",
                "ADVxSNOW_AVG_DAILY",
                "ADVyHEFF_AVG_DAILY",
                "ADVySNOW_AVG_DAILY",
                "ADVx_SLT_AVG_DAILY",
                "ADVx_TH_AVG_DAILY",
                "ADVy_SLT_AVG_DAILY",
                "ADVy_TH_AVG_DAILY",
                "DFrE_SLT_AVG_DAILY",
                "DFrE_TH_AVG_DAILY",
                "DFrI_SLT_AVG_DAILY",
                "DFrI_TH_AVG_DAILY",
                "DFxE_SLT_AVG_DAILY",
                "DFxE_TH_AVG_DAILY",
                "DFyE_SLT_AVG_DAILY",
                "DFyE_TH_AVG_DAILY",
                "UVELMASS_AVG_DAILY",
                "VVELMASS_AVG_DAILY",
                "WVELMASS_AVG_DAILY"]

In [9]:
## check number
len(var_list_all)

67

In [10]:
sassie_ecco_variable_gaps = identify_missing_datasets(var_list_all, s3_netcdf_dir)

SALT_AVG_DAILY is missing 2
THETA_AVG_DAILY is missing 2
UVEL_AVG_DAILY is missing 16
VVEL_AVG_DAILY is missing 16
WVEL_AVG_DAILY is missing 16
KPPdiffS_AVG_DAILY is missing 1811
KPPdiffT_AVG_DAILY is missing 1811
KPPviscA_AVG_DAILY is missing 2133
ETAN_AVG_DAILY is missing 16
PHIBOT_AVG_DAILY is missing 16
sIceLoad_AVG_DAILY is missing 16
KPPhbl_AVG_DAILY is missing 134
EXFaqh_AVG_DAILY is missing 2
EXFatemp_AVG_DAILY is missing 2
EXFempmr_AVG_DAILY is missing 2
EXFevap_AVG_DAILY is missing 2
EXFpreci_AVG_DAILY is missing 2
EXFroff_AVG_DAILY is missing 2
EXFqnet_AVG_DAILY is missing 2
EXFhl_AVG_DAILY is missing 2
EXFhs_AVG_DAILY is missing 2
EXFlwdn_AVG_DAILY is missing 2
EXFlwnet_AVG_DAILY is missing 2
EXFswdn_AVG_DAILY is missing 2
EXFswnet_AVG_DAILY is missing 2
EXFuwind_AVG_DAILY is missing 2
EXFtaux_AVG_DAILY is missing 2
EXFvwind_AVG_DAILY is missing 2
EXFtauy_AVG_DAILY is missing 2
oceFWflx_AVG_DAILY is missing 2
oceQnet_AVG_DAILY is missing 2
oceQsw_AVG_DAILY is missing 2
oceT

In [113]:
## look at one variable as an example
sassie_ecco_variable_gaps['ADVySNOW_AVG_DAILY']

['2014-04-24', '2015-04-09']

Save dictionary as json output

In [12]:
# save dictionary as json file
file_name = "sassie-ecco-missing-dates.json"

with open(file_name, 'w') as json_file:
    json.dump(sassie_ecco_variable_gaps, json_file)

Identify which variables are empty

In [13]:
def find_variable_with_large_list(dictionary):
    keys_with_gaps = []
    
    # Iterate through the dictionary
    for key, value in dictionary.items():
        # Check if the length of the list is more than XX dates
        if len(value) > 2580:
            keys_with_gaps.append(key)
    
    return keys_with_gaps

In [14]:
variables_with_large_gaps = find_variable_with_large_list(sassie_ecco_variable_gaps)

In [15]:
variables_with_large_gaps

['ADVr_SLT_AVG_DAILY',
 'ADVr_TH_AVG_DAILY',
 'ADVx_SLT_AVG_DAILY',
 'ADVx_TH_AVG_DAILY',
 'ADVy_SLT_AVG_DAILY',
 'ADVy_TH_AVG_DAILY',
 'DFrE_SLT_AVG_DAILY',
 'DFrE_TH_AVG_DAILY',
 'DFrI_SLT_AVG_DAILY',
 'DFrI_TH_AVG_DAILY',
 'DFxE_SLT_AVG_DAILY',
 'DFxE_TH_AVG_DAILY',
 'DFyE_SLT_AVG_DAILY',
 'DFyE_TH_AVG_DAILY']

### BOOKEEPING NOTES

63 datasets on s3 bucket now (identify missing 4)

9 items that need to be added to sassie cloud:

In [None]:
"ADVx_TH_AVG_DAILY"
"EXFuwind_AVG_DAILY"
"KPPdiffT_AVG_DAILY"
"KPPhbl_AVG_DAILY"
"oceFWflx_AVG_DAILY"
"SFLUX_AVG_DAILY"
"SIatmQnt_AVG_DAILY"
"SIuice_AVG_DAILY"
"VVEL_AVG_DAILY"

What's on sassie cloud now (54)

In [73]:
var_list = ["ADVr_SLT_AVG_DAILY",
"ADVr_TH_AVG_DAILY",
"ADVxHEFF_AVG_DAILY",
"ADVxSNOW_AVG_DAILY",
"ADVx_SLT_AVG_DAILY",
"ADVyHEFF_AVG_DAILY",
"ADVySNOW_AVG_DAILY",
"ADVy_SLT_AVG_DAILY",
"ADVy_TH_AVG_DAILY",
"DFrE_SLT_AVG_DAILY",
"DFrE_TH_AVG_DAILY",
"DFrI_SLT_AVG_DAILY",
"DFrI_TH_AVG_DAILY",
"ETAN_AVG_DAILY",
"EXFaqh_AVG_DAILY",
"EXFatemp_AVG_DAILY",
"EXFempmr_AVG_DAILY",
"EXFevap_AVG_DAILY",
"EXFhl_AVG_DAILY",
"EXFhs_AVG_DAILY",
"EXFlwdn_AVG_DAILY",
"EXFlwnet_AVG_DAILY",
"EXFpreci_AVG_DAILY",
"EXFqnet_AVG_DAILY",
"EXFroff_AVG_DAILY",
"EXFswdn_AVG_DAILY",
"EXFswnet_AVG_DAILY",
"EXFtaux_AVG_DAILY",
"EXFtauy_AVG_DAILY",
"EXFvwind_AVG_DAILY",
"KPPdiffS_AVG_DAILY",
"KPPviscA_AVG_DAILY",
"PHIBOT_AVG_DAILY",
"PHIHYD_AVG_DAILY",
"PHIHYDcR_AVG_DAILY",
"RHOAnoma_AVG_DAILY",
"SALT_AVG_DAILY",
"SIarea_AVG_DAILY",
"SIatmFW_AVG_DAILY",
"SIheff_AVG_DAILY",
"SIhsnow_AVG_DAILY",
"SIvice_AVG_DAILY",
"TFLUX_AVG_DAILY",
"THETA_AVG_DAILY",
"UVELMASS_AVG_DAILY",
"UVEL_AVG_DAILY",
"VVELMASS_AVG_DAILY",
"WVELMASS_AVG_DAILY",
"WVEL_AVG_DAILY",
"oceQnet_AVG_DAILY",
"oceQsw_AVG_DAILY",
"oceTAUX_AVG_DAILY",
"oceTAUY_AVG_DAILY",
"sIceLoad_AVG_DAILY"]

In [75]:
len(var_list)

54

In [78]:
var_list[0:32]

['ADVr_SLT_AVG_DAILY',
 'ADVr_TH_AVG_DAILY',
 'ADVxHEFF_AVG_DAILY',
 'ADVxSNOW_AVG_DAILY',
 'ADVx_SLT_AVG_DAILY',
 'ADVyHEFF_AVG_DAILY',
 'ADVySNOW_AVG_DAILY',
 'ADVy_SLT_AVG_DAILY',
 'ADVy_TH_AVG_DAILY',
 'DFrE_SLT_AVG_DAILY',
 'DFrE_TH_AVG_DAILY',
 'DFrI_SLT_AVG_DAILY',
 'DFrI_TH_AVG_DAILY',
 'ETAN_AVG_DAILY',
 'EXFaqh_AVG_DAILY',
 'EXFatemp_AVG_DAILY',
 'EXFempmr_AVG_DAILY',
 'EXFevap_AVG_DAILY',
 'EXFhl_AVG_DAILY',
 'EXFhs_AVG_DAILY',
 'EXFlwdn_AVG_DAILY',
 'EXFlwnet_AVG_DAILY',
 'EXFpreci_AVG_DAILY',
 'EXFqnet_AVG_DAILY',
 'EXFroff_AVG_DAILY',
 'EXFswdn_AVG_DAILY',
 'EXFswnet_AVG_DAILY',
 'EXFtaux_AVG_DAILY',
 'EXFtauy_AVG_DAILY',
 'EXFvwind_AVG_DAILY',
 'KPPdiffS_AVG_DAILY',
 'KPPviscA_AVG_DAILY']