## Data Quality Report.

<blockquote>  </blockquote>

<blockquote> </blockquote> 


### Notebook Outline:


<span style='color:Orange' size=20 > **Attention:** </span> 
- To run the notebook, you need to follow the steps in order.
- For the code cell, run the cell before you move on to the next one. 
    - **Remember**: The output of a cell may be an input in the next cell.

In [3]:
import os
import pandas as pd
import netCDF4 as nc
import requests
import re
import itertools
import time
import xarray as xr
import numpy as np
import datetime as dt
from urllib.request import urlopen
import json
from ast import literal_eval
from datetime import timedelta
from datetime import datetime
from geopy.distance import geodesic
import urllib3

import matplotlib
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
sns.set()

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

ModuleNotFoundError: No module named 'matplotlib'

### Sensor Deployment Information.
The sensor deployment information is obtained from a system database.

In [None]:
def refdes_datareview_json(refdes):
    """
    Returns information about the sensor reference designator from the Data Review Database.
    Source URL: http://datareview.marine.rutgers.edu/instruments/view/

    Parameters
    ----------
    refdes : str
        Reference designator, e.g. 'GP03FLMB-RIM01-02-CTDMOG060'.

    Returns
    -------
    The decoded JSON body of ``<Source URL><refdes>.json``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    requests.Timeout
        If the server does not answer within the timeout.
    """
    base_url = 'http://datareview.marine.rutgers.edu/instruments/view/'
    # Plain string formatting: os.path.join is a file-system helper and is not
    # meant for assembling URLs.
    ref_des_url = '{}{}.json'.format(base_url, refdes)
    # A timeout keeps the notebook from hanging forever on an unresponsive server.
    response = requests.get(ref_des_url, timeout=60)
    # Fail with a clear HTTP error instead of a JSON decode error on an error page.
    response.raise_for_status()
    return response.json()

In [None]:
# Fetch the Data Review record for this reference designator; the returned JSON is the cell output.
refdes_datareview_json('GP03FLMB-RIM01-02-CTDMOG060')

### Data Annotations.

In [None]:
def sensor_annotation_check(refdes):
    """
    Collect the annotations stored for a reference designator in a DataFrame.

    The Data Review record is fetched with refdes_datareview_json and every
    entry of ``['instrument']['annotations']`` becomes one row, indexed by its
    position in that list, with the columns reference_designator, annotation,
    start_datetime and end_datetime.
    """
    annotations = refdes_datareview_json(refdes)['instrument']['annotations']
    fields = ('reference_designator', 'annotation', 'start_datetime', 'end_datetime')

    # {row position: {column: value}} is the layout from_dict(orient='index') expects
    sensor_anno = {
        row: {field: entry[field] for field in fields}
        for row, entry in enumerate(annotations)
    }

    return pd.DataFrame.from_dict(sensor_anno, orient='index').sort_index()


In [None]:
# Show the annotation text in full instead of truncating long cells.
pd.set_option('display.max_colwidth', None)

df_anno = sensor_annotation_check('GP03FLMB-RIM01-02-CTDMOG060')
df_anno

### Deploy Depth Check

In [None]:
def deploy_depth_check(refdes):
    """
    Tabulate the deployment depth and water depth of every deployment.

    The Data Review record is fetched with refdes_datareview_json and every
    entry of ``['instrument']['deployments']`` becomes one row, indexed by its
    position in that list, with the columns:

    - deployment  <- deployment_number
    - d_depth     <- deployment_depth
    - water_depth <- water_depth
    """
    # output column -> key in the Data Review deployment entry
    column_sources = {
        'deployment': 'deployment_number',
        'd_depth': 'deployment_depth',
        'water_depth': 'water_depth',
    }
    deployments = refdes_datareview_json(refdes)['instrument']['deployments']

    deploy_depth = {
        row: {column: entry[source] for column, source in column_sources.items()}
        for row, entry in enumerate(deployments)
    }

    df_depth = pd.DataFrame.from_dict(deploy_depth, orient='index').sort_index()
    return df_depth


In [None]:
# Depth table for this instrument; `df_depth` is read by the depth figure in the next cell.
df_depth = deploy_depth_check('GP03FLMB-RIM01-02-CTDMOG060')
df_depth

In [None]:
# Three vertically stacked panels share the deployment-number x-axis, and each
# panel shows a different slice of the depth range. For every deployment a bar
# is drawn from the deployment depth to the water depth.
n = 3
fig = plt.figure(1)

# Lower y-limit of each panel. The upper limit is the previous panel's lower
# limit (0 for the first panel), so depth increases downwards.
listy = [50, 4000, 4250]
# Only the bottom panel shows the x-axis.
visiblex = [False, False, True]

# create axis
axis_list = [plt.subplot(n, 1, j + 1) for j in range(n)]

# Deployments whose deployment depth is missing are drawn at the mean of the
# known depths and flagged with '?'. Computed once, not once per deployment.
fallback_depth = np.nanmean(df_depth['d_depth'])

for j, ax in enumerate(axis_list):
    # Panel-level settings do not depend on the deployment, so they are applied
    # once per panel instead of once per deployment.
    ax.set_xlim((0, 8))
    ax.get_xaxis().set_visible(visiblex[j])
    ax.get_yaxis().set_major_locator(ticker.FixedLocator(list(range(7))))

    if j == 0:
        ax.set_ylim((listy[j], 0))
        ax.get_yaxis().set_major_locator(ticker.FixedLocator([0, 30, 50]))
        ax.set(title="Deployment Depth (Green Circle), \n Water Depth (Red Square) \n ?: Unknown Deployment Depth ")
    else:
        ax.set_ylim((listy[j], listy[j - 1]))

    if j == n - 1:
        ax.get_yaxis().set_major_locator(ticker.FixedLocator([4145, 4176]))

    for i in range(len(df_depth)):
        x_start = int(df_depth['deployment'].iloc[i])
        y_end = int(df_depth['water_depth'].iloc[i])
        d_depth = df_depth['d_depth'].iloc[i]
        # pd.isna also accepts None, which np.isnan rejects with a TypeError
        if pd.isna(d_depth):
            y_start = fallback_depth
            texti = '?'
        else:
            y_start = int(d_depth)
            texti = ' '

        ax.add_patch(matplotlib.patches.Rectangle((x_start, y_start), 0.125, y_end - y_start, ec="red"))

        if j == 0:
            # top panel: mark the sensor depth (and '?' when it is unknown)
            ax.text(x_start, y_start - 5, texti)
            ax.plot(x_start, y_start, 'go-', label='Sensor Depth', linewidth=2)

        if j == n - 1:
            # bottom panel: mark the water depth
            ax.plot(x_start, y_end, 'rs', label='Water Depth', linewidth=2)

plt.subplots_adjust(wspace=0, hspace=0)
plt.xlabel('Deployment Number')
plt.ylabel('Water Depth')
plt.show()

### Deploy Location Check.

In [None]:
def deploy_location_check(refdes):
    """
    Compare the location of every deployment with all earlier deployments.

    Returns
    -------
    df : pandas.DataFrame
        One row per entry of ``['instrument']['deployments']`` in the Data
        Review record, with the columns deployment, lat and lon.
    y : dict
        Maps 'diff_km_D<later>_to_D<earlier>' to the geodesic distance in
        kilometers between the two deployment locations, rounded to 4 decimals.
    """
    deployments = refdes_datareview_json(refdes)['instrument']['deployments']
    deploy_loc = {
        row: {
            'deployment': entry['deployment_number'],
            'lat': entry['latitude'],
            'lon': entry['longitude'],
        }
        for row, entry in enumerate(deployments)
    }

    # put info in a data frame
    df = pd.DataFrame.from_dict(deploy_loc, orient='index').sort_index()

    y = {}
    # pair each deployment (from the second row on) with every row before it
    for later_pos in range(1, len(df)):
        later = df.iloc[later_pos]
        later_loc = [later['lat'], later['lon']]
        later_number = int(later['deployment'])
        for earlier_pos in range(later_pos):
            earlier = df.iloc[earlier_pos]
            earlier_loc = [earlier['lat'], earlier['lon']]
            key = 'diff_km_D{}_to_D{}'.format(later_number, int(earlier['deployment']))
            y[key] = round(geodesic(earlier_loc, later_loc).kilometers, 4)
    return df, y


In [None]:
# `location` is the per-deployment lat/lon table; `distance` maps each deployment pair to its separation in km.
location, distance = deploy_location_check('GP03FLMB-RIM01-02-CTDMOG060')

In [None]:
# Display the deployment location table returned by deploy_location_check.
location

In [None]:
# One circle per deployment pair, centred on the origin, with the pair's
# distance as radius. Pairs at most 0.5 km apart are drawn in red and left
# unlabelled; larger separations are drawn in blue and listed in the legend.
fig, ax = plt.subplots()
ax.set_xlim((-10, 10))
ax.set_ylim((-10, 10))
plt.title('Deployment Distance difference in km. \n Labeling only distances > 0.5 km')

handles = []
labels = []
for pair, dist_km in distance.items():
    is_near = dist_km <= 0.5
    circle = plt.Circle((0, 0), dist_km, color='r' if is_near else 'b', fill=False)
    ax.add_artist(circle)
    if not is_near:
        handles.append(circle)
        # The label is a plain string; wrapping it in a list made the legend
        # show the list repr, e.g. "['diff_km_...= 9.5km']".
        labels.append(pair + '= ' + str(round(dist_km, 2)) + 'km')

# Build the legend once, after all circles exist, and only if something is labelled.
if handles:
    ax.legend(handles, labels, bbox_to_anchor=(2, 1))
ax.get_yaxis().set_major_locator(ticker.NullLocator())

> **Observation:**
Deployments 5 and 7 are located more than 9 km away from the rest of the deployments [1, 2, 3, 4, 6].

### Deploy Time Line Check

In [None]:
def deploy_timeline_check(refdes):
    """
    Tabulate the start and stop date of every deployment.

    The Data Review record is fetched with refdes_datareview_json and every
    entry of ``['instrument']['deployments']`` becomes one row, indexed by its
    position in that list, with the columns:

    - deployment <- deployment_number
    - start_date <- start_date
    - stop_date  <- stop_date
    """
    # output column -> key in the Data Review deployment entry
    column_sources = {
        'deployment': 'deployment_number',
        'start_date': 'start_date',
        'stop_date': 'stop_date',
    }
    deployments = refdes_datareview_json(refdes)['instrument']['deployments']

    deploy_time = {
        row: {column: entry[source] for column, source in column_sources.items()}
        for row, entry in enumerate(deployments)
    }

    df_time = pd.DataFrame.from_dict(deploy_time, orient='index').sort_index()
    return df_time


In [None]:
# Deployment start/stop table; the following cells build the timeline from `df`.
df = deploy_timeline_check('GP03FLMB-RIM01-02-CTDMOG060')

In [None]:
# Display the deployment start/stop table.
df

In [None]:

# All start dates followed by all stop dates in one flat list; `names` holds
# the deployment number that belongs to the entry at the same position.
dates = [*df['start_date'].values, *df['stop_date'].values]
names = [*df['deployment'].values, *df['deployment'].values]
dates


In [None]:
# `datetime` comes from the import cell at the top of the notebook; it is not
# re-imported here so that all dependencies stay in one place.
# NOTE(review): datetime.now() is local wall-clock time - confirm that this is
# the right reference for the deployment dates it is compared with below.
now = datetime.now()

# Same "YYYY-MM-DDTHH:MM:SS" layout that the date strings are parsed with later on.
current_time = now.strftime("%Y-%m-%dT%H:%M:%S")
print("Current Time =", current_time)

In [None]:
# `dates` is all start dates followed by all stop dates, so the second half
# holds the stop dates. A deployment without a stop date (presumably one that
# is still active) gets the current time, so every entry can be sorted and
# parsed. All stop dates are checked, not only the last one, and an empty list
# is left untouched.
n_start = len(dates) // 2
dates = dates[:n_start] + [current_time if pd.isna(d) else d for d in dates[n_start:]]
dates

In [None]:
# Positions of `dates` in ascending order of their date string.
ind = sorted(range(len(dates)), key=lambda pos: dates[pos])

In [None]:
# Walk the entries in sorted order, keeping the part of each date string in
# front of the first '+' and the deployment number that goes with it.
date_list = []
deployment_list = []
for pos in ind:
    date_list.append(dates[pos].partition('+')[0])
    deployment_list.append(names[pos])
date_list, deployment_list

In [None]:
# Convert date strings (e.g. 2016-07-03T18:35:00) to datetime.
# Entries that are already datetime objects are kept as they are, so the cell
# can be re-run without strptime failing on its own output.
date_list = [
    d if isinstance(d, datetime) else datetime.strptime(d, "%Y-%m-%dT%H:%M:%S")
    for d in date_list
]
date_list

In [None]:
# Stem heights: repeat a fixed pattern cyclically until there is exactly one
# level per entry of date_list.
levels = np.resize([-5, 5, -3, 3, -1, 1], len(date_list))
levels

In [None]:
# Create figure and plot a stem plot with the date
fig, ax = plt.subplots(figsize=(8.8, 4), constrained_layout=True)
ax.set(title="Deployment Timeline")

# `use_line_collection` is deliberately not passed: the keyword was deprecated
# and later removed from Axes.stem, where passing it raises a TypeError.
markerline, stemline, baseline = ax.stem(date_list, levels,
                                         linefmt="C3-", basefmt="k-")

plt.setp(markerline, mec="k", mfc="w", zorder=3)

# Shift the markers to the baseline by replacing the y-data by zeros.
markerline.set_ydata(np.zeros(len(date_list)))

# annotate lines: each stem tip is labelled with its deployment number; labels
# of positive levels are anchored at their bottom edge, the others at the top
vert = np.array(['top', 'bottom'])[(levels > 0).astype(int)]
for d, l, r, va in zip(date_list, levels, deployment_list, vert):
    ax.annotate(r, xy=(d, l), xytext=(-3, np.sign(l)*3),
                textcoords="offset points", va=va, ha="right")

# format xaxis with 4 month intervals
ax.get_xaxis().set_major_locator(mdates.MonthLocator(interval=4))
ax.get_xaxis().set_major_formatter(mdates.DateFormatter("%b %Y"))
plt.setp(ax.get_xticklabels(), rotation=30, ha="right")

# remove y axis and spines
ax.get_yaxis().set_visible(False)
for spine in ["left", "top", "right"]:
    ax.spines[spine].set_visible(False)

ax.margins(y=0.1)
plt.show()

### Plot Timeseries Data
- load data files
- prepare figure
- combine with deployment information

In [None]:
# Directory holding the data-review CSV listings. Set the DATA_QUALITY_DATA_DIR
# environment variable to run the notebook on another machine; the default is
# the location the notebook was originally written against.
DATA_DIR = os.environ.get(
    'DATA_QUALITY_DATA_DIR',
    '/Users/leilabelabassi/Desktop/TAMU/online-class/612-DataQuality4theGeosciences/class_material/Module4_csvFiles/',
)
# Switch the working directory (as the former `%cd` did): the file names below
# and the later 'output.csv' export are relative to it.
os.chdir(DATA_DIR)

file_recovered = 'data_review_list_GP03FLMB-RIM01-02-CTDMOG060_recovered.csv'
file_telemetered = 'data_review_list_GP03FLMB-RIM01-02-CTDMOG060_telemetered.csv'
list_recovered = pd.read_csv(file_recovered)
list_telemetered = pd.read_csv(file_telemetered)
list_recovered

In [None]:
# Entries of the 'files' column: recovered listing first, then telemetered.
url_list = [*list_recovered['files'].values, *list_telemetered['files'].values]
url_list

In [None]:
# Load the first two data files.
# By default, OOI datasets use the 'obs' variable as the index, but time is
# more convenient, so the dimension is swapped right after opening.
d1 = xr.open_dataset(url_list[0]).swap_dims({'obs': 'time'})
d2 = xr.open_dataset(url_list[1]).swap_dims({'obs': 'time'})

In [None]:
# Exporting data
# Convert the xarray Dataset to a pandas DataFrame. The result gets its own
# name so that `d1` stays a Dataset and this cell can be re-run (a DataFrame
# has no to_dataframe method).
d1_df = d1.to_dataframe()

# Create a CSV file with the raw dataset (written to the current working directory)
d1_df.to_csv('output.csv')

# If we have a large file, you can compress it using:
# !gzip output.csv

In [None]:

# Quick plots: the deployment timeline (stems) overlaid with the recovered
# seawater temperature series, all on the same axes.
# Choose some nice levels
levels = np.tile([-3, 3, -2, 2, -1, 1],
                 int(np.ceil(len(date_list)/6)))[:len(date_list)]

fig, ax = plt.subplots(figsize=(8.8, 4), constrained_layout=True)

# `use_line_collection` is deliberately not passed: the keyword was deprecated
# and later removed from Axes.stem, where passing it raises a TypeError.
markerline, stemline, baseline = ax.stem(date_list, levels,
                                         linefmt="C3-", basefmt="k-")
plt.setp(markerline, mec="k", mfc="w", zorder=3)

# Shift the markers to the baseline by replacing the y-data by zeros.
markerline.set_ydata(np.zeros(len(date_list)))

# annotate lines with the deployment number
vert = np.array(['top', 'bottom'])[(levels > 0).astype(int)]
for d, l, r, va in zip(date_list, levels, deployment_list, vert):
    ax.annotate(r, xy=(d, l), xytext=(-3, np.sign(l)*3),
                textcoords="offset points", va=va, ha="right")

# One temperature line per recovered file; the target axes is passed
# explicitly instead of relying on the implicit "current axes".
for recovered_file in list_recovered['files']:
    d1 = xr.open_dataset(recovered_file)
    d1 = d1.swap_dims({'obs': 'time'})
    d1['ctdmo_seawater_temperature'].plot(ax=ax)
    

In [None]:
# Make a histogram of seawater temperature (100 bins) for every recovered file
for recovered_file in list_recovered['files']:
    d1 = xr.open_dataset(recovered_file).swap_dims({'obs': 'time'})
    d1['ctdmo_seawater_temperature'].plot.hist(bins=100)

In [None]:
# plot multiple variables at once: one panel per variable, shared time axis

fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, sharex=True, figsize=(14, 9))

# (variable name, panel it is drawn in)
panels = (
    ('ctdmo_seawater_temperature', ax1),
    ('practical_salinity', ax2),
    ('density', ax3),
    ('ctdmo_seawater_pressure', ax4),
)
for recovered_file in list_recovered['files']:
    d1 = xr.open_dataset(recovered_file).swap_dims({'obs': 'time'})
    for variable, panel in panels:
        d1[variable].plot(ax=panel)

In [None]:
# Same variables plus conductivity, drawn as small dots without connecting lines.
fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(5, 1, sharex=True, figsize=(14, 9))

dot_style = dict(linestyle='None', marker='.', markersize=1)
# (variable name, panel it is drawn in)
panels = (
    ('ctdmo_seawater_temperature', ax1),
    ('practical_salinity', ax2),
    ('ctdmo_seawater_conductivity', ax4),
    ('ctdmo_seawater_pressure', ax5),
    ('density', ax3),
)
for recovered_file in list_recovered['files']:
    d1 = xr.open_dataset(recovered_file).swap_dims({'obs': 'time'})
    for variable, panel in panels:
        d1[variable].plot(ax=panel, **dot_style)
# Let's change the salinity y-limits to account for outliers
#ax2.set_ylim(30,35);

In [None]:
# Parse every URL into a reference designator (rd) and a second token (rdm,
# presumably the delivery method - confirm against the URL layout).
rd_list = []          # unique reference designators, in order of first appearance
rdm_list = []         # unique rdm tokens, in order of first appearance
rdm_by_rd = {}        # reference designator -> rdm tokens seen together with it
for uu in url_list:
    elements = uu.split('_')[1].split('-')
    rd = '-'.join((elements[0], elements[1], elements[2], elements[3]))
    rdm = uu.split('-')[4]

    if rd not in rd_list:
        rd_list.append(rd)
    if rdm not in rdm_list:
        rdm_list.append(rdm)
    seen_with_rd = rdm_by_rd.setdefault(rd, [])
    if rdm not in seen_with_rd:
        seen_with_rd.append(rdm)
print(rd_list, rdm_list)

for r in rd_list:
    # The former substring test `r in k` could never match: `r` always contains
    # '-' while every rdm token is a piece of a '-'-split and contains none,
    # so the filtered list was always empty. Use the recorded association.
    rdm_filtered = rdm_by_rd[r]
    print(r, rdm_filtered)

In [None]:
    # Compare the datasets of one reference designator across its delivery methods.
    # NOTE(review): this cell uses `r` and `rdm_filtered` as left behind by the
    # last iteration of the loop in the previous cell, and `cf`, which is not
    # imported or defined in the visible notebook - confirm where it comes from.
    # The `continue` that followed the first print was removed: the cell is not
    # inside a loop, and the if/elif already skips the comparison in that case.
    dinfo = {}
    if len(rdm_filtered) == 1:
        print('Only one delivery method provided - no comparison.')
    # `and`-style range check; the former `> 1 & len(...) <= 3` was parsed as a
    # chained comparison on `1 & len(...)` and never enforced the upper bound.
    elif 1 < len(rdm_filtered) <= 3:
        print('\nComparing data from different methods for: {}'.format(r))
        for method in rdm_filtered:
            urls = [x for x in url_list if method in x]
            for u in urls:
                splitter = u.split('/')[-2].split('-')
                catalog_rms = '-'.join((r, splitter[-2], splitter[-1]))
                udatasets = cf.get_nc_urls([u])
                # first 14 characters of each file name, de-duplicated
                deployments = [str(k.split('/')[-1][0:14]) for k in udatasets]
                udeploy = np.unique(deployments).tolist()

<a id=7 ></a>
### Data Coordinates Function.

In [None]:
def missing_coordinates(datasets, expected_coords=('obs', 'time', 'pressure', 'lat', 'lon')):
    """
    Check file coordinates against the expected coordinates.

    Parameters
    ----------
    datasets : object with a mapping-like ``coords`` attribute
        Only ``datasets.coords.keys()`` is read.
    expected_coords : iterable of str, optional
        Coordinate names that should be present. Defaults to
        ('obs', 'time', 'pressure', 'lat', 'lon').

    Returns
    -------
    list
        The expected coordinate names that are absent from ``datasets.coords``,
        in the order given by ``expected_coords``; empty when nothing is missing.
    """
    file_coordinates = set(datasets.coords.keys())
    return [name for name in expected_coords if name not in file_coordinates]


In [None]:
 # check variable coordinates
# NOTE(review): `ds` is not assigned in any cell visible above - presumably an
# opened dataset is expected here; confirm before running.
missing_coords = missing_coordinates(ds)

In [None]:
# One-row summary for the current deployment.
# NOTE(review): `sv`, `units`, `coords` and `deployment` are not assigned in any
# cell visible above - confirm where they are expected to come from.
df0 = pd.DataFrame({'sv': [sv], 'var_units': [units],
                    'Missing Coordinates': [(coords, 'missing:', missing_coords)]
                   }, index=[deployment])
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df = pd.concat([df, df0])

# None (not -1) switches off column-width truncation, as in the annotation cell above.
pd.set_option('display.max_colwidth', None)
df

- The missing coordinates in the parameter attributes should be addressed to improve data integrity and usability with other datasets.  