**Data Time & Gap Tests**

In [85]:
import pandas as pd
import functions.common as cf
import functions.plotting as pf
import xarray as xr
from datetime import timedelta
import numpy as np
import datetime as dt
import netCDF4 as nc
from termcolor import colored

**Datasets Review List**

In [86]:
reviewlist = pd.read_csv('define_datareview_list.csv')
#reviewlist = pd.DataFrame(reviewlist).rename(index=str, columns={'Unnamed: 0': 'deployment'})
reviewlist.index = reviewlist['Unnamed: 0'].values

In [61]:
pd.set_option('display.max_colwidth', -1)
pd.DataFrame(reviewlist)[['method','datasets']]

Unnamed: 0,method,datasets
deployment0001,recovered_host,https://opendap.oceanobservatories.org/thredds/dodsC/ooi/leila.ocean@gmail.com/20190306T174413-CP05MOAS-GL379-03-CTDGVM000-recovered_host-ctdgv_m_glider_instrument_recovered/deployment0001_CP05MOAS-GL379-03-CTDGVM000-recovered_host-ctdgv_m_glider_instrument_recovered_20140415T132159.798580-20140706T212524.736690.nc
deployment0002,recovered_host,https://opendap.oceanobservatories.org/thredds/dodsC/ooi/leila.ocean@gmail.com/20190306T174413-CP05MOAS-GL379-03-CTDGVM000-recovered_host-ctdgv_m_glider_instrument_recovered/deployment0002_CP05MOAS-GL379-03-CTDGVM000-recovered_host-ctdgv_m_glider_instrument_recovered_20150509T113514.756870-20150625T150702.264890.nc
deployment0003,recovered_host,https://opendap.oceanobservatories.org/thredds/dodsC/ooi/leila.ocean@gmail.com/20190306T174413-CP05MOAS-GL379-03-CTDGVM000-recovered_host-ctdgv_m_glider_instrument_recovered/deployment0003_CP05MOAS-GL379-03-CTDGVM000-recovered_host-ctdgv_m_glider_instrument_recovered_20160121T150043.663730-20160518T015328.144100.nc
deployment0004,telemetered,https://opendap.oceanobservatories.org/thredds/dodsC/ooi/leila.ocean@gmail.com/20190306T174435-CP05MOAS-GL379-03-CTDGVM000-telemetered-ctdgv_m_glider_instrument/deployment0004_CP05MOAS-GL379-03-CTDGVM000-telemetered-ctdgv_m_glider_instrument_20170116T125355.760380-20170116T221950.330870.nc


**Get Deployment Information**

In [90]:
col = list(reviewlist.columns)
refdes = reviewlist[col[1]][0].split('/')[-1].split('_')[1][0:27]
dr_data = cf.refdes_datareview_json(refdes)
tf = pd.DataFrame(dr_data['instrument']['deployments'])[['deployment_number','start_date','stop_date']]
tf.index = tf['deployment_number'].values

In [91]:
pd.set_option('display.max_colwidth', -1)
tf[['start_date','stop_date']]

Unnamed: 0,start_date,stop_date
1,2014-04-15T13:15:00+00:00,2014-07-11T00:00:00+00:00
2,2015-05-09T11:02:00+00:00,2015-07-02T00:00:00+00:00
3,2016-01-21T15:00:00+00:00,2016-05-18T03:56:00+00:00
4,2017-01-16T12:46:00+00:00,2017-01-16T23:59:59+00:00
6,2019-02-05T07:54:00+00:00,


**Get Annotations**

In [74]:
tf = pd.DataFrame(dr_data['instrument']['annotations'])[['reference_designator','annotation','end_datetime','start_datetime']]
tf.index = tf['reference_designator'].values

In [76]:
pd.set_option('display.max_colwidth', -1)
tf[['annotation','end_datetime','start_datetime']]

Unnamed: 0,annotation,end_datetime,start_datetime
CP05MOAS-GL379-03-CTDGVM000,Science data were truncated on 2016-05-01 to extend the battery life and the CTD was finally shut down on 2016-05-06.,2016-05-17T20:00:00+00:00,2016-05-01T16:00:00+00:00
CP05MOAS-GL379,Glider leaked and was recovered within 12 hours. No recovered data are expected.,2017-01-16T18:59:59+00:00,2017-01-16T07:46:00+00:00
CP05MOAS-GL379,Glider recovered same day for leak.,2017-01-16T18:59:59+00:00,2017-01-16T07:46:00+00:00


<p style="color:green;">Deployment Days</p>

Number of days the instrument was deployed.

<p style="color:green;">File Days</p>
Number of days for which there is at least 1 timestamp available for the instrument.

<p style="color:green;">Start Gap</p>
Number of missing days at the start of a deployment: comparison of the deployment start date to the data start date.

<p style="color:green;">Timestamps</p>
Number of timestamps in a data file.

In [77]:
df = pd.DataFrame()
for ii in range(len(reviewlist)):         
    deploy_num = int(reviewlist[col[0]][ii].split('t')[-1])
    deploy_info = cf.get_deployment_information(dr_data, deploy_num)
    deploy_depth = deploy_info['deployment_depth']
    
    # Calculate days deployed
    deploy_start = str(deploy_info['start_date'])
    deploy_stop = str(deploy_info['stop_date'])    
    if deploy_stop != 'None':
        r_deploy_start = pd.to_datetime(deploy_start).replace(hour=0, minute=0, second=0)
        if deploy_stop.split('T')[1] == '00:00:00':
            r_deploy_stop = pd.to_datetime(deploy_stop)
        else:
            r_deploy_stop = (pd.to_datetime(deploy_stop) + timedelta(days=1)).replace(hour=0, minute=0, second=0)
        n_days_deployed = (r_deploy_stop - r_deploy_start).days
    else:
        n_days_deployed = None
    
    # Get time array
    ds = xr.open_dataset(reviewlist[col[1]][ii], mask_and_scale=False)
    ds = ds.swap_dims({'obs': 'time'})
    time = ds['time']
    
    # Check that the timestamps in the file are unique
    len_time = time.__len__()
    len_time_unique = np.unique(time).__len__()
    
    # calculate gaps size at start of deployment    
    start_gap = (pd.to_datetime(str(time.values[0])) - r_deploy_start).days
   
    # calculate gap size at end of deployment
    end_gap = (r_deploy_stop - pd.to_datetime(str(time.values[-1]))).days    
    
    # Count the number of days for which there is at least 1 timestamp    
    n_days = len(np.unique(time.values.astype('datetime64[D]')))
    
    df0 = pd.DataFrame({
                        'method': [reviewlist[col[2]][ii]],                       
                        'n_days_deployed': [n_days_deployed],
                        'n_days_file': [n_days], 
                        'num_timestamps': [len_time],
                        'start_gap': [start_gap],
                        'end_gap': [end_gap]
                        }, index=[deploy_num])

    df = df.append(df0)

In [78]:
pd.set_option('display.max_colwidth', -1)
(df)

Unnamed: 0,method,n_days_deployed,n_days_file,num_timestamps,start_gap,end_gap
1,recovered_host,88,83,1697681,0,5
2,recovered_host,55,48,1876498,0,7
3,recovered_host,119,119,3822788,0,0
4,telemetered,1,1,212,0,0


**Note**
<p style="color:red;">No annotation in the system to explain the gaps at the end of deployments 1 and 2.</p>

<p style="color:green;">End Gap</p>
Number of missing days at the end of a deployment: comparison of the deployment end date to the data end date.
<p style="color:green;">Gaps Count</p>
Number of gaps within a data file (exclusive of missing data at the beginning and end of a deployment). Gap is defined as >1 day of missing data.
<p style="color:green;">Gap Days</p>
Number of days of missing data within a data file (exclusive of missing data at the beginning and end of a deployment).


In [79]:
df = pd.DataFrame()
for ii in range(len(reviewlist)):
    deploy_num = int(reviewlist[col[0]][ii].split('t')[-1])
    # Get time array
    ds = xr.open_dataset(reviewlist[col[1]][ii], mask_and_scale=False)
    ds = ds.swap_dims({'obs': 'time'})
    time = ds['time']
    
    # Get a list of data gaps >1 day    
    time_df = pd.DataFrame(time.values, columns=['time'])
    gap_list = cf.timestamp_gap_test(time_df)
    df0 = pd.DataFrame({
                        'method': [reviewlist[col[2]][ii]],                                           
                        'gap_list': [gap_list],
                        'num_gaps': [int(len(gap_list))]
                        }, index=[deploy_num])

    df = df.append(df0)  

In [80]:
pd.set_option('display.max_colwidth', -1)
(df)

Unnamed: 0,method,gap_list,num_gaps
1,recovered_host,[],0
2,recovered_host,[],0
3,recovered_host,[],0
4,telemetered,[],0


<p style="color:green;">Sampling Rate</p>
Sampling rates are calculated from the differences in timestamps. The most common sampling rate is that which occurs >50%.



In [81]:
df = pd.DataFrame()
for ii in range(len(reviewlist)):
    deploy_num = int(reviewlist[col[0]][ii].split('t')[-1])
    # Get time array
    ds = xr.open_dataset(reviewlist[col[1]][ii], mask_and_scale=False)
    ds = ds.swap_dims({'obs': 'time'})
    time = ds['time']
    
    # Calculate the sampling rate to the nearest second
    time_df['diff'] = time_df['time'].diff().astype('timedelta64[s]')
    rates_df = time_df.groupby(['diff']).agg(['count'])
    n_diff_calc = len(time_df) - 1
    rates = dict(n_unique_rates=len(rates_df), common_sampling_rates=dict())
    for i, row in rates_df.iterrows():
        percent = (float(row['time']['count']) / float(n_diff_calc))
        if percent > 0.1:
            rates['common_sampling_rates'].update({int(i): '{:.2%}'.format(percent)})
    sampling_rt_sec = None
    for k, v in rates['common_sampling_rates'].items():
        if float(v.strip('%')) > 50.00:
            sampling_rt_sec = k

    if not sampling_rt_sec:
        sampling_rt_sec = 'no consistent sampling rate: {}'.format(rates['common_sampling_rates']) 
    
    df0 = pd.DataFrame({
                        'method': [reviewlist[col[2]][ii]],                                           
                        'sampling_rt_sec': [sampling_rt_sec]
                        }, index=[deploy_num])

    df = df.append(df0)  

In [82]:
pd.set_option('display.max_colwidth', -1)
(df)

Unnamed: 0,method,sampling_rt_sec
1,recovered_host,60
2,recovered_host,60
3,recovered_host,60
4,telemetered,60


<p style="color:green;">Time Order</p>
Test that timestamps in the file are unique and in ascending order.

In [83]:
df = pd.DataFrame()
for ii in range(len(reviewlist)):
    deploy_num = int(reviewlist[col[0]][ii].split('t')[-1])
    # Get time array
    ds = xr.open_dataset(reviewlist[col[1]][ii], mask_and_scale=False)
    ds = ds.swap_dims({'obs': 'time'})
    time = ds['time']
    
    # Check that the timestamps in the file are unique
    len_time = time.__len__()
    len_time_unique = np.unique(time).__len__()
    if len_time == len_time_unique:
        time_unique = 'pass'
    else:
        time_unique = 'fail'
        
    # Check that the timestamps in the file are in ascending order
    time_in = [dt.datetime.utcfromtimestamp(np.datetime64(x).astype('O')/1e9) for x in time.values]
    time_data = nc.date2num(time_in, 'seconds since 1900-01-01')

    # Create True/False list for every timestamps
    result = [(time_data[k + 1] - time_data[k]) > 0 for k in range(len(time_data) - 1)]

    # List indices when time is not increasing
    if result.count(True) == len(time) - 1:
        time_ascending = 'pass'
    else:
        ind_fail = {k: time_in[k] for k, v in enumerate(result) if v is False}
        time_ascending = 'fail: {}'.format(ind_fail)
        
    df0 = pd.DataFrame({
                        'method': [reviewlist[col[2]][ii]],                       
                        'time_order': [['Unique: '+time_unique,'Ascending: '+time_ascending]]
                        }, index=[deploy_num])

    df = df.append(df0)

In [84]:
pd.set_option('display.max_colwidth', -1)
(df)

Unnamed: 0,method,time_order
1,recovered_host,"[Unique: pass, Ascending: pass]"
2,recovered_host,"[Unique: pass, Ascending: pass]"
3,recovered_host,"[Unique: pass, Ascending: pass]"
4,telemetered,"[Unique: pass, Ascending: pass]"


**Summary of Results**

In [51]:
print(colored('Instrument', 'green'), colored(refdes, 'blue'))

[32mInstrument[0m [34mCP05MOAS-GL379-03-CTDGVM000[0m


In [63]:
df = pd.DataFrame()
for ii in range(len(reviewlist)):         
    deploy_num = int(reviewlist[col[0]][ii].split('t')[-1])
    deploy_info = cf.get_deployment_information(dr_data, deploy_num)
    deploy_depth = deploy_info['deployment_depth']
    
    # Calculate days deployed
    deploy_start = str(deploy_info['start_date'])
    deploy_stop = str(deploy_info['stop_date'])    
    if deploy_stop != 'None':
        r_deploy_start = pd.to_datetime(deploy_start).replace(hour=0, minute=0, second=0)
        if deploy_stop.split('T')[1] == '00:00:00':
            r_deploy_stop = pd.to_datetime(deploy_stop)
        else:
            r_deploy_stop = (pd.to_datetime(deploy_stop) + timedelta(days=1)).replace(hour=0, minute=0, second=0)
        n_days_deployed = (r_deploy_stop - r_deploy_start).days
    else:
        n_days_deployed = None
    
    # Get time array
    ds = xr.open_dataset(reviewlist[col[1]][ii], mask_and_scale=False)
    ds = ds.swap_dims({'obs': 'time'})
    time = ds['time']
    
    # Check that the timestamps in the file are unique
    len_time = time.__len__()
    len_time_unique = np.unique(time).__len__()
    if len_time == len_time_unique:
        time_unique = 'pass'
    else:
        time_unique = 'fail'
        
    # Check that the timestamps in the file are in ascending order
    # convert time to number
    time_in = [dt.datetime.utcfromtimestamp(np.datetime64(x).astype('O')/1e9) for x in time.values]
    time_data = nc.date2num(time_in, 'seconds since 1900-01-01')

    # Create a list of True or False by iterating through the array of time and checking
    # if every time stamp is increasing
    result = [(time_data[k + 1] - time_data[k]) > 0 for k in range(len(time_data) - 1)]

    # Print outcome of the iteration with the list of indices when time is not increasing
    if result.count(True) == len(time) - 1:
        time_ascending = 'pass'
    else:
        ind_fail = {k: time_in[k] for k, v in enumerate(result) if v is False}
        time_ascending = 'fail: {}'.format(ind_fail)
        
    # calculate gaps size at start of deployment    
    start_gap = (pd.to_datetime(str(time.values[0])) - r_deploy_start).days
   
    # calculate gap size at end of deployment
    end_gap = (r_deploy_stop - pd.to_datetime(str(time.values[-1]))).days    
    
    # Count the number of days for which there is at least 1 timestamp    
    n_days = len(np.unique(time.values.astype('datetime64[D]')))
    
    # Get a list of data gaps >1 day    
    time_df = pd.DataFrame(time.values, columns=['time'])
    gap_list = cf.timestamp_gap_test(time_df)
    
    # Calculate the sampling rate to the nearest second
    time_df['diff'] = time_df['time'].diff().astype('timedelta64[s]')
    rates_df = time_df.groupby(['diff']).agg(['count'])
    n_diff_calc = len(time_df) - 1
    rates = dict(n_unique_rates=len(rates_df), common_sampling_rates=dict())
    for i, row in rates_df.iterrows():
        percent = (float(row['time']['count']) / float(n_diff_calc))
        if percent > 0.1:
            rates['common_sampling_rates'].update({int(i): '{:.2%}'.format(percent)})
    sampling_rt_sec = None
    for k, v in rates['common_sampling_rates'].items():
        if float(v.strip('%')) > 50.00:
            sampling_rt_sec = k

    if not sampling_rt_sec:
        sampling_rt_sec = 'no consistent sampling rate: {}'.format(rates['common_sampling_rates']) 
        
    df0 = pd.DataFrame({
                        'method': [reviewlist[col[2]][ii]],                       
                        'n_days_deployed': [n_days_deployed],
                        'n_days_file': [n_days],
                        'start_gap': [start_gap],
                        'end_gap': [end_gap],        
                        'gap_list': [gap_list],
                        'num_gaps': [int(len(gap_list))],
                        'num_timestamps': [len_time],
                        'sampling_rt_sec': [sampling_rt_sec],
                        'time_order': [['Unique: '+time_unique,'Ascending: '+time_ascending]]
                        }, index=[deploy_num])

    df = df.append(df0)

In [64]:
pd.set_option('display.max_colwidth', -1)
(df)

Unnamed: 0,method,n_days_deployed,n_days_file,start_gap,end_gap,gap_list,num_gaps,num_timestamps,sampling_rt_sec,time_order
1,recovered_host,88,83,0,5,[],0,1697681,2,"[Unique: pass, Ascending: pass]"
2,recovered_host,55,48,0,7,[],0,1876498,2,"[Unique: pass, Ascending: pass]"
3,recovered_host,119,119,0,0,[],0,3822788,2,"[Unique: pass, Ascending: pass]"
4,telemetered,1,1,0,0,[],0,212,60,"[Unique: pass, Ascending: pass]"
