## Data Evaluation - Datasets Comparison

- Missing Data
- Duplicate Data
- Data Comparison

### Python Packages.

In [2]:
import xarray as xr
import pandas as pd
# import numpy as np
# import datetime as dt
# import netCDF4 as nc
# from datetime import timedelta

### Data Files.

In [10]:
# This is how you change your directory to where your data file is stored:
%cd '/Users/leilabelabassi/Desktop/TAMU/online-class/612-DataQuality4theGeosciences/NetCDF-Files/'

filename = 'deployment0004_GP03FLMB-RIM01-02-CTDMOG060-telemetered-\
ctdmo_ghqr_sio_mule_instrument_20161008T080001-20161030T120001.nc'

# Load data
file_content = xr.open_dataset(filename,mask_and_scale=False) 
file_content = file_content.swap_dims({'obs': 'time'})


/Users/leilabelabassi/Desktop/TAMU/online-class/612-DataQuality4theGeosciences/NetCDF-Files


### Pressure Array.

In [50]:
# Names of all dataset variables whose name contains 'pressure'.
pressure_name = [name for name in file_content.variables if 'pressure' in name]
pressure_name

['pressure',
 'ctdmo_seawater_pressure',
 'ctdmo_seawater_pressure_qc_executed',
 'ctdmo_seawater_pressure_qc_results']

In [55]:
# Walk the candidate variables and keep the one whose 'units' attribute is
# 'dbar'; variables without a 'units' attribute are reported and skipped.
for x in pressure_name:
    try:
        x_unit = file_content[x].attrs['units']
    except KeyError:
        print('no unit attributes')
    else:
        if x_unit == 'dbar':
            pressure_parameter = x
            print(pressure_parameter)

ctdmo_seawater_pressure
no unit attributes
no unit attributes


In [None]:
def calculate_mean_pressure(press, ds, refdes, deploy_depth):
    """
    Calculate the mean and maximum pressure of a dataset variable.

    NaNs, fill values and (when the lookup succeeds) values outside the
    global ranges are discarded before the statistics are computed. For a
    1-dimensional variable the statistics come from
    ``variable_statistics(values, 3)``, which is expected to exclude outliers
    beyond +/- 3 SD; for a variable with more dimensions plain
    ``np.nanmean`` / ``np.nanmax`` are used.

    Parameters
    ----------
    press : str
        Name of the pressure variable to read from ``ds``.
    ds : mapping
        Dataset-like object; ``ds[press]`` must expose ``values``, ``dims``
        and ``len()``, and optionally ``_FillValue`` and ``units``.
    refdes : str
        Reference designator, assumed to look like
        'GP03FLMB-RIM01-02-CTDMOG060' (subsite first, node second).
    deploy_depth :
        Not used in the returned values; kept so existing callers that pass
        it keep working.

    Returns
    -------
    tuple
        ``(pressure_compare, pressure_max, pressure_mean)``.
        ``pressure_max`` and ``pressure_mean`` are rounded to 2 decimals and
        ``pressure_compare`` is the integer-rounded maximum (WFP nodes and
        MOAS subsites) or mean (everything else). All three are expressed in
        the same units: values reported in '0.001 dbar' are divided by 1000
        first. ``(None, None, None)`` is returned when the variable is
        missing from ``ds``, is empty, or has no usable values left after
        filtering.
    """
    # Imported locally so the function works even though the notebook's
    # top-level ``import numpy as np`` is commented out.
    import numpy as np

    no_result = (None, None, None)

    # The subsite is the FIRST field of the reference designator and the node
    # is the second one (the original read index 1 for both, so the
    # 'MOAS' subsite test could never match).
    refdes_parts = refdes.split('-')
    subsite = refdes_parts[0]
    node = refdes_parts[1]

    # Only the variable lookup is guarded: a KeyError raised anywhere else
    # is a real error and must not be reported as "no pressure in file".
    try:
        pressure = ds[press]
    except KeyError:
        return no_result

    n_points = len(pressure)
    if n_points == 0:
        return no_result

    values = pressure.values

    if n_points > 1:
        # reject NaNs (boolean indexing also flattens multi-dimensional data)
        valid = values[~np.isnan(values)]

        # reject fill values, when the variable declares one
        fill_value = getattr(pressure, '_FillValue', None)
        if fill_value is not None:
            valid = valid[valid != fill_value]

        # reject data outside of global ranges; this is deliberately
        # best-effort, any failure leaves the data unfiltered
        try:
            pg_min, pg_max = get_global_ranges(refdes, press)
            if pg_min is not None and pg_max is not None:
                in_range = reject_global_ranges(valid, pg_min, pg_max)
                valid = valid[in_range]
        except Exception:
            print('uFrame is not responding to request for global ranges. Try again later.')

        if len(valid) == 0:
            # all NaNs, all fill values, or everything outside global ranges
            return no_result

        if len(pressure.dims) == 1:
            # variable_statistics returns six values; the 2nd is the mean
            # and the 4th is the maximum
            _, pressure_mean, _, pressure_max, _, _ = variable_statistics(valid, 3)
        else:
            print('variable has more than 1 dimension')
            pressure_mean = np.nanmean(valid)
            pressure_max = np.nanmax(valid)

        pressure_mean = round(pressure_mean, 2)
        pressure_max = round(pressure_max, 2)
    else:
        # only 1 data point: it is both the mean and the maximum
        single_value = values.tolist()[0]
        pressure_mean = round(single_value, 2)
        pressure_max = round(single_value, 2)

    # Convert to dbar BEFORE deriving the comparison value so that all three
    # returned numbers share the same units.
    if getattr(pressure, 'units', None) == '0.001 dbar':
        pressure_max = round(pressure_max / 1000, 2)
        pressure_mean = round(pressure_mean / 1000, 2)

    # NOTE(review): WFP nodes and MOAS subsites are presumably profiling or
    # mobile platforms, hence the maximum rather than the mean - confirm.
    if ('WFP' in node) or ('MOAS' in subsite):
        pressure_compare = int(round(pressure_max))
    else:
        pressure_compare = int(round(pressure_mean))

    return pressure_compare, pressure_max, pressure_mean