# Analyse OOI netCDF Files For An Instrument

### Define Input variables:

In [5]:
# THREDDS catalog pages that contain the netCDF data files to analyse
# (one catalog per data request / delivery method).

url_list = ['https://opendap.oceanobservatories.org/thredds/catalog/ooi/leila.ocean@gmail.com/20190306T174413-CP05MOAS-GL379-03-CTDGVM000-recovered_host-ctdgv_m_glider_instrument_recovered/catalog.html',
'https://opendap.oceanobservatories.org/thredds/catalog/ooi/leila.ocean@gmail.com/20190306T174435-CP05MOAS-GL379-03-CTDGVM000-telemetered-ctdgv_m_glider_instrument/catalog.html']

# Alternative input (another glider, GL335); uncomment to analyse it instead:
# url_list = ['https://opendap.oceanobservatories.org/thredds/catalog/ooi/leila.ocean@gmail.com/20190319T195519-CP05MOAS-GL335-03-CTDGVM000-recovered_host-ctdgv_m_glider_instrument_recovered/catalog.html',
#             'https://opendap.oceanobservatories.org/thredds/catalog/ooi/leila.ocean@gmail.com/20190319T195533-CP05MOAS-GL335-03-CTDGVM000-telemetered-ctdgv_m_glider_instrument/catalog.html']

# Review list prepared in advance for this analysis; it is filtered below on the
# 'Reference Designator' and 'status' columns.
review_file = 'https://raw.githubusercontent.com/ooi-data-lab/data-review-prep/master/review_list/data_review_list.csv'

# Unused placeholder: `f` would be the location of a file listing the THREDDS urls
# with .nc files to analyze. The column containing the THREDDS urls must be
# labeled 'outputUrl'.

# Folder in which the summary output is saved.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
sDir =  '/Users/leila/Documents/NSFEduSupport/review/output'

### Import functions:

In [3]:
import os
import xarray as xr
import pandas as pd
import re
import numpy as np
import json
import datetime as dt
import netCDF4 as nc
import functions.common as cf
import functions.plotting as pf
from datetime import timedelta
from collections import OrderedDict

### Select Data Files With Status For Review

In [4]:
# Build `datasets`: the netCDF urls, across every catalog in url_list, that belong
# to deployments flagged 'for review' for the instrument named in the catalog url.
reviewlist = pd.read_csv(review_file)
datasets = []
for uu in url_list:
    # The request folder (second-to-last url component) is dash-separated:
    # element 0 is the request timestamp, elements 1-4 form the reference
    # designator, and the last two are the delivery method and the stream.
    elements = uu.split('/')[-2].split('-')
    rd = '-'.join([elements[k] for k in range(1, 5)])

    data = OrderedDict(deployments=OrderedDict())

    # output folder: <sDir>/<subsite>/<reference designator>
    save_dir = os.path.join(sDir, rd.split('-')[0], rd)
    cf.create_dir(save_dir)

    # rows of the review list for this instrument that are still to be reviewed
    is_this_refdes = reviewlist['Reference Designator'] == rd
    is_for_review = reviewlist['status'] == 'for review'
    rl_filtered = reviewlist.loc[is_this_refdes & is_for_review]

    # <refdes>-<method>-<stream>: used below to drop collocated instruments' files
    catalog_rms = '-'.join((rd, elements[-2], elements[-1]))
    # optional debugging output:
    # print(catalog_rms)
    # print(rl_filtered[['deploymentNumber', 'startDateTime', 'stopDateTime', 'in_am']])

    # every .nc url listed in this THREDDS catalog
    udatasets = cf.get_nc_urls([uu])

    # deployment numbers under review, spelled the way they appear in file names
    review_deployments = rl_filtered['deploymentNumber'].tolist()
    review_deployments_int = ['deployment{:04d}'.format(int(x)) for x in review_deployments]

    for rev_dep in review_deployments_int:
        rdatasets = [s for s in udatasets if rev_dep in s]
        for dss in rdatasets:
            # strip the 15-character 'deploymentNNNN_' prefix and the trailing
            # '_20...' time range; what is left must match this catalog exactly
            file_rms = dss.split('/')[-1].split('_20')[0][15:]
            if file_rms == catalog_rms:
                datasets.append(dss)
print(rd, ': number of files in THREDDS == ',len(datasets))

Data request has fulfilled.
Data request has fulfilled.
CP05MOAS-GL379-03-CTDGVM000 : number of files in THREDDS ==  7


In [5]:
# Look up the data-review record for the reference designator. `rd` is the value
# left behind by the last iteration of the url_list loop above, so this assumes
# every catalog in url_list belongs to the same instrument.
# NOTE(review): the result is passed to cf.get_deployment_information below;
# its exact structure is not visible here - confirm against functions.common.
dr_data = cf.refdes_datareview_json(rd)

In [6]:
# Empty summary table: one row per data file is added by the loop below.
summary_columns = ['refdes', 'method', 'data_stream',
                   'n_days_deployed', 'n_days_file', 'num_timestamps',
                   'sampling_rt_sec', 'time_order', 'gap_list', 'num_gaps',
                   'pressure_comp', 'coord_test', 'datasets']
df = pd.DataFrame({column: [] for column in summary_columns})

In [7]:
# For every selected netCDF file, run the timestamp, sampling-rate, pressure and
# coordinate checks and add one summary row (indexed by deployment number) to `df`.
for fname in datasets:
    # Identify the file from its name. Layout assumed from the slicing used in this
    # notebook: 'deploymentNNNN_<27-char refdes>-<method>-<stream>_<time range>.nc'
    # -- TODO confirm against the THREDDS naming convention.
    nc_name = fname.split('/')[-1]
    refdes = nc_name.split('_')[1][0:27]
    # Read all four digits of the 'deploymentNNNN' prefix so that deployments
    # numbered 10 and above keep their full number.
    deploy_num = int(nc_name.split('_')[0][-4:])
    deploy_info = cf.get_deployment_information(dr_data, deploy_num)
    deploy_depth = deploy_info['deployment_depth']

    # Calculate days deployed: start is rounded down to midnight, stop is rounded
    # up to the next midnight unless it already falls on one.
    deploy_start = str(deploy_info['start_date'])
    deploy_stop = str(deploy_info['stop_date'])
    if deploy_stop != 'None':
        r_deploy_start = pd.to_datetime(deploy_start).replace(hour=0, minute=0, second=0)
        if deploy_stop.split('T')[1] == '00:00:00':
            r_deploy_stop = pd.to_datetime(deploy_stop)
        else:
            r_deploy_stop = (pd.to_datetime(deploy_stop) + timedelta(days=1)).replace(hour=0, minute=0, second=0)
        n_days_deployed = (r_deploy_stop - r_deploy_start).days
    else:
        # no stop date recorded for this deployment
        n_days_deployed = None

    # Get time array (raw values: no masking or scaling applied on read)
    ds = xr.open_dataset(fname, mask_and_scale=False)
    ds = ds.swap_dims({'obs': 'time'})
    time = ds['time']

    # Check that the timestamps in the file are unique
    len_time = len(time)
    len_time_unique = len(np.unique(time))
    if len_time == len_time_unique:
        time_unique = 'pass'
    else:
        time_unique = 'fail'

    # Check that the timestamps in the file are in ascending order
    # convert time to number
    time_in = [dt.datetime.utcfromtimestamp(np.datetime64(x).astype('O')/1e9) for x in time.values]
    time_data = nc.date2num(time_in, 'seconds since 1900-01-01')

    # One flag per consecutive pair: True when the later timestamp is strictly greater
    result = [(time_data[k + 1] - time_data[k]) > 0 for k in range(len(time_data) - 1)]

    # Report the indices (and times) at which time is not increasing
    if result.count(True) == len(time) - 1:
        time_ascending = 'pass'
    else:
        # The flags may be numpy booleans, for which `v is False` is never true;
        # test truthiness so the failing indices are actually reported.
        ind_fail = {k: time_in[k] for k, v in enumerate(result) if not v}
        time_ascending = 'fail: {}'.format(ind_fail)

    # Count the number of days for which there is at least 1 timestamp
    n_days = len(np.unique(time.values.astype('datetime64[D]')))

    # Get a list of data gaps >1 day
    time_df = pd.DataFrame(time.values, columns=['time'])
    gap_list = cf.timestamp_gap_test(time_df)

    # Calculate the sampling rate to the nearest second
    time_df['diff'] = time_df['time'].diff().astype('timedelta64[s]')
    rates_df = time_df.groupby(['diff']).agg(['count'])
    n_diff_calc = len(time_df) - 1
    rates = dict(n_unique_rates=len(rates_df), common_sampling_rates=dict())
    for i, row in rates_df.iterrows():
        # keep only intervals that account for more than 10% of all differences
        percent = (float(row['time']['count']) / float(n_diff_calc))
        if percent > 0.1:
            rates['common_sampling_rates'].update({int(i): '{:.2%}'.format(percent)})
    # the file's sampling rate is the interval seen in more than half of the differences
    sampling_rt_sec = None
    for k, v in rates['common_sampling_rates'].items():
        if float(v.strip('%')) > 50.00:
            sampling_rt_sec = k

    if not sampling_rt_sec:
        sampling_rt_sec = 'no consistent sampling rate: {}'.format(rates['common_sampling_rates'])

    # Check deployment pressure from asset management against pressure variable in file
    # (look among the coordinates first, then among the data variables)
    press = pf.pressure_var(ds, list(ds.coords.keys()))
    if press is None:
        press = pf.pressure_var(ds, list(ds.data_vars.keys()))
    pressure_compare, pressure_max, pressure_mean = cf.calculate_mean_pressure(press, ds, refdes, deploy_depth)

    # Check coordinates: nodes whose name contains 'SBD' are not required to carry
    # a pressure coordinate.
    file_coordinates = list(ds.coords.keys())
    if 'SBD' not in refdes.split('-')[1]:
        check_coords = list(set(['obs', 'time', 'pressure', 'lat', 'lon']) - set(file_coordinates))
    else:
        check_coords = list(set(['obs', 'time', 'lat', 'lon']) - set(file_coordinates))

    # A missing 'pressure' is tolerated when exactly one coordinate has 'pressure'
    # somewhere in its name.
    if 'pressure' in check_coords and len([j for j in file_coordinates if 'pressure' in j]) == 1:
        check_coords.remove('pressure')
    if len(check_coords) > 0:
        coord_test = 'missing: {}'.format(check_coords)
    else:
        coord_test = 'pass'

    df0 = pd.DataFrame({'refdes': [refdes],
                        'method': [nc_name.split('-')[4]],
                        'data_stream': [nc_name.split('-')[5][:-23]],
                        'n_days_deployed': [n_days_deployed],
                        'n_days_file': [n_days],
                        'num_timestamps': [len_time],
                        'sampling_rt_sec': [sampling_rt_sec],
                        'time_order': [['Unique-'+time_unique,'Ascending-'+time_ascending]],
                        'gap_list': [gap_list],
                        'num_gaps': [int(len(gap_list))],
                        'pressure_comp': [[deploy_depth, pressure_max]],
                        'coord_test': [coord_test],
                        'datasets': [fname]
                        }, index=[deploy_num])
    # pd.concat instead of DataFrame.append, which was removed in pandas 2.0
    df = pd.concat([df, df0])

In [8]:
# Keep one delivery method per deployment: when a deployment has files from several
# methods, retain only the rows of the method that comes first in method_list.
method_list = ['streamed', 'recovered_inst', 'recovered_wfp', 'recovered_cspp', 'recovered_host', 'telemetered']
deployments = np.unique(np.sort(list(df.index.values)))
preferred_rows = []
for d in deployments:
    df_d = df[df.index.values == d]
    if len(df_d['method']) != 1: # sort methods in order of preference
        z = sorted(df_d['method'], key=method_list.index) # sorted method list
        df_d = df_d[df_d['method'] == z[0]]

    preferred_rows.append(df_d)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0, and
# growing a frame inside the loop copies it on every iteration.
df_info = pd.concat(preferred_rows) if preferred_rows else pd.DataFrame()
len(df_info.columns)

13

In [12]:
# For each selected file, compute per-science-variable statistics and the percentage
# of valid data, then store the grouped percentages for that file in valid_list.
valid_list = list(np.zeros(len(df_info)))
# Results are stored by row position: the frame index holds deployment numbers,
# which are not guaranteed to be exactly 1..N.
for pos, (index, row) in enumerate(df_info.iterrows()):
    sci_vars = cf.return_science_vars(row['data_stream'])
    ds = xr.open_dataset(row['datasets'], mask_and_scale=False)
    ds = ds.swap_dims({'obs': 'time'})
    # calculate statistics for science variables, excluding outliers +/- 5 SD
    valid_list_index = []
    pvd_test = dict()  # remains empty when the stream lists no science variables
    for sv in sci_vars:
        try:
            var = ds[sv]
            num_dims = len(var.dims)

            if num_dims > 1:
                print('variable has more than 1 dimension')
                n_all =  None
                num_outliers = None
                mean = None
                vmin = None
                vmax = None
                sd = None
                n_stats = 'variable has more than 1 dimension'
                var_units = var.units
                n_nan = None
                n_fv = None
                n_grange = None
                fv = None
            else:
                n_all =  len(var)
                # reject NaNs
                var_nonan = var.values[~np.isnan(var.values)]
                n_nan = len(var) - len(var_nonan)

                # reject fill values
                fv = var._FillValue
                var_nonan_nofv = var_nonan[var_nonan != fv]
                n_fv = len(var) - n_nan - len(var_nonan_nofv)

                # reject data outside of global ranges
                try:
                    [g_min, g_max] = cf.get_global_ranges(rd, sv)
                    if g_min is not None and g_max is not None:
                        gr_ind = cf.reject_global_ranges(var_nonan_nofv, g_min, g_max)
                        var_nonan_nofv_gr = var_nonan_nofv[gr_ind]
                        n_grange = len(var) - n_nan - n_fv - len(var_nonan_nofv_gr)
                    else:
                        n_grange = 'no global ranges'
                        var_nonan_nofv_gr = var_nonan_nofv
                except Exception:
                    print('uFrame is not responding to request for global ranges. Try again later.')
                    var_nonan_nofv_gr = var_nonan_nofv
                    # set explicitly so the previous variable's count is not reported
                    n_grange = 'global ranges not retrieved'

                if len(var_nonan_nofv_gr) > 1:
                    [num_outliers, mean, vmin, vmax, sd, n_stats] = cf.variable_statistics(var_nonan_nofv_gr, 5)
                elif len(var_nonan_nofv_gr) == 1:
                    num_outliers = 0
                    mean = (round(list(var_nonan_nofv_gr)[0], 4)).astype('float64')
                    vmin = None
                    vmax = None
                    sd = None
                    n_stats = 1
                else:
                    num_outliers = None
                    mean = None
                    vmin = None
                    vmax = None
                    sd = None
                    n_stats = 0

                var_units = var.units

        except KeyError:
            n_all =  None
            num_outliers = None
            mean = None
            vmin = None
            vmax = None
            sd = None
            n_stats = 'variable not found in file'
            var_units = None
            n_nan = None
            n_fv = None
            fv = None
            n_grange = None

        # n_stats is a message string when statistics could not be calculated;
        # otherwise it is the number of values that went into the statistics.
        if isinstance(n_stats, str):
            percent_valid_data = 'stats not calculated'
        else:
            percent_valid_data = round((float(n_stats)/float(n_all) * 100), 2)
        valid_list_index.append(percent_valid_data)

        # Running summary over the variables processed so far for this file
        pvd_test = dict()
        dlst = []  # only filled by cf.group_percents; defined here so the print never fails
        snc = len([x for x in valid_list_index if x == 'stats not calculated'])
        if snc > 0:
            pvd_test['stats not calculated'] = snc
        else:
            valid_list_index = [round(v) for v in valid_list_index]
            pvd_test, dlst = cf.group_percents(pvd_test, valid_list_index)
        print(sv, num_outliers, mean, vmin, vmax, sd, n_stats, var_units, n_nan, n_fv, fv, n_grange, percent_valid_data, pvd_test, dlst)

    valid_list[pos] = pvd_test
    print(valid_list[pos])
    

practical_salinity 1518 35.3557 33.0998 36.5786 0.4433 1696159 1 4 0 -9999999.0 0 99.91 {'99': 1} []
sci_water_cond 1518 4.1648 3.2019 5.5363 0.5035 1696159 S m-1 0 0 nan 4 99.91 {'99': 2} []
sci_water_pressure 0 18.5088 0.0 101.11 21.5955 1697681 bar 0 0 nan 0 100.0 {'99': 3} []
sci_water_temp 0 13.0989 0.0 25.4908 4.8591 1697681 ºC 0 0 nan 0 100.0 {'99': 4} []
sci_seawater_density 0 1027.3765 1023.3115 1032.3609 1.6905 1696159 kg m-3 4 0 nan 1518 99.91 {'99': 5} []
sci_water_pressure_dbar 0 185.0881 0.0 1011.1 215.9547 1697681 dbar 0 0 nan 0 100.0 {'99': 6} []
{'99': 6}
practical_salinity 639 35.2059 31.5974 38.8005 0.3742 1875850 1 9 0 -9999999.0 0 99.97 {'99': 1} []
sci_water_cond 577 3.7036 3.3225 5.5537 0.4065 1875912 S m-1 0 0 nan 9 99.97 {'99': 2} []
sci_water_pressure 0 47.5734 0.0 98.627 28.2113 1876498 bar 0 0 nan 0 100.0 {'99': 3} []
sci_water_temp 0 8.3477 0.0 26.0479 4.0622 1876498 ºC 0 0 nan 0 100.0 {'99': 4} []
sci_seawater_density 0 1029.4759 1021.0847 1032.2592 1.7614

In [13]:
# Attach the per-file percent-valid summaries as the last column. assign() returns
# a new frame and simply overwrites 'valid_data' if it already exists, so this cell
# can be re-run (insert() raises when the column is already present).
df_info = df_info.assign(valid_data=valid_list)
df_info

Unnamed: 0,refdes,method,data_stream,n_days_deployed,n_days_file,num_timestamps,sampling_rt_sec,time_order,gap_list,num_gaps,pressure_comp,coord_test,datasets,valid_data
1,CP05MOAS-GL379-03-CTDGVM000,recovered_host,ctdgv_m_glider_instrument_recovered,88.0,83.0,1697681.0,2.0,"[Unique-pass, Ascending-pass]",[],0.0,"[1000, 832.95]",pass,https://opendap.oceanobservatories.org/thredds...,{'99': 6}
2,CP05MOAS-GL379-03-CTDGVM000,recovered_host,ctdgv_m_glider_instrument_recovered,55.0,48.0,1876498.0,2.0,"[Unique-pass, Ascending-pass]",[],0.0,"[1000, 986.27]",pass,https://opendap.oceanobservatories.org/thredds...,{'99': 6}
3,CP05MOAS-GL379-03-CTDGVM000,recovered_host,ctdgv_m_glider_instrument_recovered,119.0,119.0,3822788.0,2.0,"[Unique-pass, Ascending-pass]",[],0.0,"[1000, 961.27]",pass,https://opendap.oceanobservatories.org/thredds...,{'99': 6}
4,CP05MOAS-GL379-03-CTDGVM000,telemetered,ctdgv_m_glider_instrument,1.0,1.0,212.0,60.0,"[Unique-pass, Ascending-pass]",[],0.0,"[1000, 261.65]",pass,https://opendap.oceanobservatories.org/thredds...,"{'99': 5, '75': 1}"


In [110]:
# for i, row in enumerate(df_info.values):
#     sci_vars = cf.return_science_vars(row['data_stream'])
#     ds = xr.open_dataset(row['datasets'], mask_and_scale=False)
#     ds = ds.swap_dims({'obs': 'time'})
#     # calculate statistics for science variables, excluding outliers +/- 5 SD
#     for sv in sci_vars:
#         print(sv)
#         try:
#             var = ds[sv]
#             num_dims = len(var.dims)

#             if num_dims > 1:
#                 print('variable has more than 1 dimension')
#                 num_outliers = None
#                 mean = None
#                 vmin = None
#                 vmax = None
#                 sd = None
#                 n_stats = 'variable has more than 1 dimension'
#                 var_units = var.units
#                 n_nan = None
#                 n_fv = None
#                 n_grange = None
#                 fv = None
#             else:
#                 # reject NaNs
#                 var_nonan = var.values[~np.isnan(var.values)]
#                 n_nan = len(var) - len(var_nonan)

#                 # reject fill values
#                 fv = var._FillValue
#                 var_nonan_nofv = var_nonan[var_nonan != fv]
#                 n_fv = len(var) - n_nan - len(var_nonan_nofv)

#                 # reject data outside of global ranges
#                 try: 
#                     [g_min, g_max] = cf.get_global_ranges(rd, sv)
#                     if g_min is not None and g_max is not None:
#                         gr_ind = cf.reject_global_ranges(var_nonan_nofv, g_min, g_max)
#                         var_nonan_nofv_gr = var_nonan_nofv[gr_ind]
#                         n_grange = len(var) - n_nan - n_fv - len(var_nonan_nofv_gr)
#                     else:
#                         n_grange = 'no global ranges'
#                         var_nonan_nofv_gr = var_nonan_nofv
#                 except Exception:
#                     print('uFrame is not responding to request for global ranges. Try again later.')
#                     var_nonan_nofv_gr = var_nonan_nofv

#                 if len(var_nonan_nofv_gr) > 1:
#                     [num_outliers, mean, vmin, vmax, sd, n_stats] = cf.variable_statistics(var_nonan_nofv_gr, 5)
#                 elif len(var_nonan_nofv_gr) == 1:
#                     num_outliers = 0
#                     mean = (round(list(var_nonan_nofv_gr)[0], 4)).astype('float64')
#                     vmin = None
#                     vmax = None
#                     sd = None
#                     n_stats = 1
#                 else:
#                     num_outliers = None
#                     mean = None
#                     vmin = None
#                     vmax = None
#                     sd = None
#                     n_stats = 0

#                 var_units = var.units

#         except KeyError:
#             num_outliers = None
#             mean = None
#             vmin = None
#             vmax = None
#             sd = None
#             n_stats = 'variable not found in file'
#             var_units = None
#             n_nan = None
#             n_fv = None
#             fv = None
#             n_grange = None

KeyError: 0

In [63]:
# # read data file
# notes = []
# time_ascending = ''
# if len(datasets) == 1:
#     ds = xr.open_dataset(datasets[0], mask_and_scale=False)
#     ds = ds.swap_dims({'obs': 'time'})
# #     fname, subsite, refdes, method, data_stream, deployment = cf.nc_attributes(datasets[0])
# elif len(datasets) > 1:
#     ds = xr.open_mfdataset(datasets, mask_and_scale=False)
#     ds = ds.swap_dims({'obs': 'time'})
#     ds = ds.chunk({'time': 100})
# #     fname, subsite, refdes, method, data_stream, deployment = cf.nc_attributes(datasets[0])
# #     fname = fname.split('_20')[0]
#     notes.append('multiple deployment .nc files')
#     # when opening multiple datasets, don't check that the timestamps are in ascending order
#     time_ascending = 'not_tested'

In [65]:
#  # Get info from the data review database
# dr_data = cf.refdes_datareview_json(refdes)
# stream_vars = cf.return_stream_vars(data_stream)
# sci_vars = cf.return_science_vars(data_stream)
# deploy_info = cf.get_deployment_information(dr_data, int(deployment[-4:]))

In [66]:
# Grab deployment Variables
# deploy_start = str(deploy_info['start_date'])
# deploy_stop = str(deploy_info['stop_date'])
# deploy_lon = deploy_info['longitude']
# deploy_lat = deploy_info['latitude']
# deploy_depth = deploy_info['deployment_depth']

In [8]:
# # Calculate days deployed
# if deploy_stop != 'None':
#     r_deploy_start = pd.to_datetime(deploy_start).replace(hour=0, minute=0, second=0)
#     if deploy_stop.split('T')[1] == '00:00:00':
#         r_deploy_stop = pd.to_datetime(deploy_stop)
#     else:
#         r_deploy_stop = (pd.to_datetime(deploy_stop) + timedelta(days=1)).replace(hour=0, minute=0, second=0)
#     n_days_deployed = (r_deploy_stop - r_deploy_start).days
# else:
#     n_days_deployed = None

In [10]:
# Add reference designator to dictionary (kept as-is when already present)
data.setdefault('refdes', refdes)

# NOTE(review): `deployments` re-uses the name of the deployment-number array built
# when df_info was assembled; from here on it is the dictionary's key view.
deployments = data['deployments'].keys()

# First and last timestamp in the currently open dataset. ndarray.min()/max() scan
# the array in numpy rather than comparing millions of elements one by one.
time_values = ds['time'].values
data_start = pd.to_datetime(time_values.min()).strftime('%Y-%m-%dT%H:%M:%S')
data_stop = pd.to_datetime(time_values.max()).strftime('%Y-%m-%dT%H:%M:%S')

# Add deployment and info to dictionary and initialize delivery method sub-dictionary
# NOTE(review): `deployment`, `deploy_lon` and `deploy_lat` are only assigned in
# commented-out cells of this notebook; they must be defined before this cell runs.
if deployment not in deployments:
    data['deployments'][deployment] = OrderedDict(deploy_start=deploy_start,
                                                  deploy_stop=deploy_stop,
                                                  n_days_deployed=n_days_deployed,
                                                  lon=deploy_lon,
                                                  lat=deploy_lat,
                                                  deploy_depth=deploy_depth,
                                                  method=OrderedDict())

In [11]:
# Make sure the nested dictionary has a slot for this delivery method ...
deployment_entry = data['deployments'][deployment]
methods = deployment_entry['method'].keys()
if method not in methods:
    deployment_entry['method'][method] = OrderedDict(stream=OrderedDict())

# ... and, under that method, a slot for this data stream with an empty file table.
# NOTE(review): `deployment`, `method` and `data_stream` are only assigned in
# commented-out cells of this notebook - confirm where they are meant to come from.
method_entry = deployment_entry['method'][method]
streams = method_entry['stream'].keys()
if data_stream not in streams:
    method_entry['stream'][data_stream] = OrderedDict(file=OrderedDict())

In [145]:
# # Get a list of data gaps >1 day
# time_df = pd.DataFrame(ds['time'].values, columns=['time'])
# gap_list = cf.timestamp_gap_test(time_df)

In [147]:
# # Calculate the sampling rate to the nearest second
# time_df['diff'] = time_df['time'].diff().astype('timedelta64[s]')
# rates_df = time_df.groupby(['diff']).agg(['count'])
# n_diff_calc = len(time_df) - 1
# rates = dict(n_unique_rates=len(rates_df), common_sampling_rates=dict())
# for i, row in rates_df.iterrows():
#     percent = (float(row['time']['count']) / float(n_diff_calc))
#     if percent > 0.1:
#         rates['common_sampling_rates'].update({int(i): '{:.2%}'.format(percent)})
# sampling_rt_sec = None
# for k, v in rates['common_sampling_rates'].items():
#     if float(v.strip('%')) > 50.00:
#         sampling_rt_sec = k

# if not sampling_rt_sec:
#     sampling_rt_sec = 'no consistent sampling rate: {}'.format(rates['common_sampling_rates'])

In [149]:
# # Check that the timestamps in the file are unique
# time = ds['time']
# len_time = time.__len__()
# len_time_unique = np.unique(time).__len__()
# if len_time == len_time_unique:
#     time_test = 'pass'
# else:
#     time_test = 'fail'

In [15]:
# # Check that the timestamps in the file are in ascending order
# if time_ascending != 'not_tested':
#     # convert time to number
#     time_in = [dt.datetime.utcfromtimestamp(np.datetime64(x).astype('O')/1e9) for x in
#                ds['time'].values]
#     time_data = nc.date2num(time_in, 'seconds since 1900-01-01')

#     # Create a list of True or False by iterating through the array of time and checking
#     # if every time stamp is increasing
#     result = [(time_data[k + 1] - time_data[k]) > 0 for k in range(len(time_data) - 1)]

#     # Print outcome of the iteration with the list of indices when time is not increasing
#     if result.count(True) == len(time) - 1:
#         time_ascending = 'pass'
#     else:
#         ind_fail = {k: time_in[k] for k, v in enumerate(result) if v is False}
#         time_ascending = 'fail: {}'.format(ind_fail)

In [16]:
# # Count the number of days for which there is at least 1 timestamp
# n_days = len(np.unique(time.values.astype('datetime64[D]')))

In [17]:
# Compare the variable names found in the file with those listed for the stream in
# the Data Review Database.
file_var_names = list(ds.data_vars.keys()) + list(ds.coords.keys())
# drop the names removed by cf.eliminate_common_variables, then any name containing 'qc'
ds_variables = [name for name in cf.eliminate_common_variables(file_var_names)
                if 'qc' not in name]
# Only the second item returned by cf.compare_lists is kept; it is reported further
# down as the variables missing from the file / from the database respectively.
# NOTE(review): `stream_vars` is only assigned in a commented-out cell of this notebook.
_, unmatch1 = cf.compare_lists(stream_vars, ds_variables)
_, unmatch2 = cf.compare_lists(ds_variables, stream_vars)

In [18]:
# # Check deployment pressure from asset management against pressure variable in file
# press = pf.pressure_var(ds, list(ds.coords.keys()))
# if press is None:
#     press = pf.pressure_var(ds, list(ds.data_vars.keys()))

In [None]:
# # calculate mean pressure from data, excluding outliers +/- 3 SD
# try:
#     pressure = ds[press]
#     num_dims = len(pressure.dims)
#     if len(pressure) > 1:
#         # reject NaNs
#         p_nonan = pressure.values[~np.isnan(pressure.values)]

#         # reject fill values
#         p_nonan_nofv = p_nonan[p_nonan != pressure._FillValue]

#         # reject data outside of global ranges
#         try:
#             [pg_min, pg_max] = cf.get_global_ranges(rd, press)
#             if pg_min is not None and pg_max is not None:
#                     pgr_ind = cf.reject_global_ranges(p_nonan_nofv, pg_min, pg_max)
#                     p_nonan_nofv_gr = p_nonan_nofv[pgr_ind]
#             else:
#                 p_nonan_nofv_gr = p_nonan_nofv
#         except Exception: 
#                 print('uFrame is not responding to request for global ranges. Try again later.')
#                 p_nonan_nofv_gr = p_nonan_nofv

#         if (len(p_nonan_nofv_gr) > 0) and (num_dims == 1):
#             [press_outliers, pressure_mean, _, pressure_max, _, _] = cf.variable_statistics(p_nonan_nofv_gr, 3)
#             pressure_mean = round(pressure_mean, 2)
#             pressure_max = round(pressure_max, 2)
#         elif (len(p_nonan_nofv_gr) > 0) and (num_dims > 1):
#             print('variable has more than 1 dimension')
#             press_outliers = 'not calculated: variable has more than 1 dimension'
#             pressure_mean = round(np.nanmean(p_nonan_nofv_gr), 2)
#             pressure_max = round(np.nanmax(p_nonan_nofv_gr), 2)
#         else:
#             press_outliers = None
#             pressure_mean = None
#             pressure_max = None
#             if len(pressure) > 0 and len(p_nonan) == 0:
#                 notes.append('Pressure variable all NaNs')
#             elif len(pressure) > 0 and len(p_nonan) > 0 and len(p_nonan_nofv) == 0:
#                 notes.append('Pressure variable all fill values')
#             elif len(pressure) > 0 and len(p_nonan) > 0 and len(p_nonan_nofv) > 0 and len(p_nonan_nofv_gr) == 0:
#                 notes.append('Pressure variable outside of global ranges')

#     else:  # if there is only 1 data point
#         press_outliers = 0
#         pressure_mean = round(ds[press].values.tolist()[0], 2)
#         pressure_max = round(ds[press].values.tolist()[0], 2)

#     try:
#         pressure_units = pressure.units
#     except AttributeError:
#         pressure_units = 'no units attribute for pressure'

#     if pressure_mean:
#         node = refdes.split('-')[1]
#         if ('WFP' in node) or ('MOAS' in subsite):
#             pressure_compare = int(round(pressure_max))
#         else:
#             pressure_compare = int(round(pressure_mean))

#         if pressure_units == '0.001 dbar':
#             pressure_max = round((pressure_max / 1000), 2)
#             pressure_mean = round((pressure_mean / 1000), 2)
#             notes.append('Pressure converted from 0.001 dbar to dbar for pressure comparison')
#     else:
#         pressure_compare = None

#     if (not deploy_depth) or (not pressure_mean):
#         pressure_diff = None
#     else:
#         pressure_diff = pressure_compare - deploy_depth

# except KeyError:
#     press = 'no seawater pressure in file'
#     pressure_diff = None
#     pressure_mean = None
#     pressure_max = None
#     pressure_compare = None
#     press_outliers = None
#     pressure_units = None

In [27]:
# Record this file's summary under deployment -> method -> stream -> file, unless an
# entry for the same file name already exists.
# NOTE(review): `deployment`, `method`, `data_stream`, `notes`, `time_test`,
# `pressure_units`, `press_outliers` and `pressure_diff` are only assigned in
# commented-out cells of this notebook (the per-file loop names its uniqueness
# result `time_unique`); confirm their source before relying on this cell.
file_entries = data['deployments'][deployment]['method'][method]['stream'][data_stream]['file']
filenames = file_entries.keys()
if fname not in filenames:
    file_entries[fname] = OrderedDict(
        # elements[0] is the leading timestamp of the THREDDS request folder name
        file_downloaded=pd.to_datetime(elements[0]).strftime('%Y-%m-%dT%H:%M:%S'),
        file_coordinates=list(ds.coords.keys()),
        sampling_rate_seconds=sampling_rt_sec,
        sampling_rate_details=rates,
        data_start=data_start,
        data_stop=data_stop,
        time_gaps=gap_list,
        unique_timestamps=time_test,
        n_timestamps=len_time,
        n_days=n_days,
        notes=notes,
        ascending_timestamps=time_ascending,
        pressure_comparison=dict(
            pressure_mean=pressure_mean,
            units=pressure_units,
            num_outliers=press_outliers,
            diff=pressure_diff,
            pressure_max=pressure_max,
            variable=press,
            pressure_compare=pressure_compare,
        ),
        vars_in_file=ds_variables,
        # time variables are left out of the "not in file" report
        vars_not_in_file=[x for x in unmatch1 if 'time' not in x],
        vars_not_in_db=unmatch2,
        sci_var_stats=OrderedDict(),
    )