# Find Argo floats that were near Saildrones -- downloading the data

***Note: Only run this script once; it's already been run***

# 0. Setup

In [1]:
# data analysis
import numpy    as np
import pandas   as pd
import datetime as dt   
# import seawater as sw                                 # calculate distance saildrone travelled
import argopy
# import gsw                                            # calculate density
# import metpy.calc  as mpcalc                          # calculate gradients
# from   metpy.units import units                       # calculations with units

# netcdf files
import xarray as xr

# plotting
import matplotlib                                     # this is for the SO axes setup function
import matplotlib.pyplot as plt
# import seaborn           as sns

# for loading argo data functions
from   scipy    import interpolate
from   datetime import datetime, timedelta
import requests
import time
import os
# import urllib3
# import shutil

# 1. Downloard Argo data

## Define functions for downloading argo data

Note that `download_file()` and `argo_gdac()` are from Katy Christensen and Ethan Campbell (UW).

See: https://colab.research.google.com/drive/1IAAWxqbbLMM7ZIigC8m7y-bFrlC3x8vL#scrollTo=LyvDQHVxWlWo for examples of runnning functions

In [2]:
##############################################################################################################
# Function to download a single file

def download_file(url_path,filename,save_to=None,overwrite=False,verbose=True):
    """ Downloads and saves a file from a given URL using HTTP protocol.

    Note: If '404 file not found' error returned, function will return without downloading anything.

    Arguments:
        url_path: root URL to download from including trailing slash ('/')
        filename: filename to download including suffix
        save_to: None (to download to root Google Drive GO-BGC directory)
                 or directory path
        overwrite: False to leave existing files in place
                   or True to overwrite existing files
        verbose: True to announce progress
                 or False to stay silent

    """
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    root = '/Users/hannah/Documents/Fulbright/Polar_Gliders/Research/SO_CO2_repo/'

    if save_to is None:
        save_to = root

    try:
        if filename in os.listdir(save_to):
            if not overwrite:
                if verbose: print('>>> File ' + filename + ' already exists. Leaving current version.')
                return
            else:
                if verbose: print('>>> File ' + filename + ' already exists. Overwriting with new version.')

        def get_func(url,stream=True):
            try:
                return requests.get(url,stream=stream,auth=None,verify=False)
            except requests.exceptions.ConnectionError as error_tag:
                print('Error connecting:',error_tag)
                time.sleep(1)
                return get_func(url,stream=stream)

        response = get_func(url_path + filename,stream=True)

        if response.status_code == 404:
            if verbose: print('>>> File ' + filename + ' returned 404 error during download.')
            return
        with open(save_to + filename,'wb') as out_file:
            shutil.copyfileobj(response.raw,out_file)
        del response
        if verbose: print('>>> Successfully downloaded ' + filename + '.')

    except:
        if verbose: print('>>> An error occurred while trying to download ' + filename + '.')
        
        
        
##############################################################################################################
# Function to download and parse GDAC synthetic profile index file
def argo_gdac(lat_range=None,lon_range=None,start_date=None,end_date=None,sensors=None,floats=None,
              overwrite_index=False,overwrite_profiles=False,skip_download=False,
              download_individual_profs=False,save_to=None,verbose=True):
    """ Downloads GDAC Sprof index file, then selects float profiles based on criteria.
      Either returns information on profiles and floats (if skip_download=True) or downloads them (if False).

      Arguments:
          lat_range: None, to select all latitudes
                     or [lower, upper] within -90 to 90 (selection is inclusive)
          lon_range: None, to select all longitudes
                     or [lower, upper] within either -180 to 180 or 0 to 360 (selection is inclusive)
                     NOTE: longitude range is allowed to cross -180/180 or 0/360
          start_date: None or datetime object
          end_date:   None or datetime object
          sensors: None, to select profiles with any combination of sensors
                   or string or list of strings to specify required sensors
                   > note that common options include PRES, TEMP, PSAL, DOXY, CHLA, BBP700,
                                                      PH_IN_SITU_TOTAL, and NITRATE
          floats: None, to select any floats matching other criteria
                  or int or list of ints specifying floats' WMOID numbers
          overwrite_index: False to keep existing downloaded GDAC index file, or True to download new index
          overwrite_profiles: False to keep existing downloaded profile files, or True to download new files
          skip_download: True to skip download and return: (<list of WMOIDs>, <DataFrame of index file subset>,
                                                            <list of downloaded filenames [if applicable]>)
                         or False to download those profiles
          download_individual_profs: False to download single Sprof file containing all profiles for each float
                                     or True to download individual profile files for each float
          save_to: None to download to Google Drive "/GO-BGC Workshop/Profiles" directory
                   or string to specify directory path for profile downloads
          verbose: True to announce progress, or False to stay silent

    """
    # Paths
    url_root = 'https://www.usgodae.org/ftp/outgoing/argo/'
    dac_url_root = url_root + 'dac/'
    index_filename = 'argo_synthetic-profile_index.txt'
    root = '/Users/hannah/Documents/Fulbright/Polar_Gliders/Research/SO_CO2_repo/'

    if save_to is None: save_to = root

    # Download GDAC synthetic profile index file
    download_file(url_root,index_filename,overwrite=overwrite_index)

    # Load index file into Pandas DataFrame
    gdac_index = pd.read_csv(root + index_filename,delimiter=',',header=8,parse_dates=['date','date_update'],
                          date_parser=lambda x: pd.to_datetime(x,format='%Y%m%d%H%M%S'))

    # Establish time and space criteria
    if lat_range is None:  lat_range = [-90.0,90.0]
    if lon_range is None:  lon_range = [-180.0,180.0]
    elif lon_range[0] > 180 or lon_range[1] > 180:
        if lon_range[0] > 180: lon_range[0] -= 360
        if lon_range[1] > 180: lon_range[1] -= 360
    if start_date is None: start_date = datetime(1900,1,1)
    if end_date is None:   end_date = datetime(2200,1,1)

    float_wmoid_regexp = r'[a-z]*/[0-9]*/profiles/[A-Z]*([0-9]*)_[0-9]*[A-Z]*.nc'
    gdac_index['wmoid'] = gdac_index['file'].str.extract(float_wmoid_regexp).astype(int)
    filepath_main_regexp = '([a-z]*/[0-9]*/)profiles/[A-Z]*[0-9]*_[0-9]*[A-Z]*.nc'
    gdac_index['filepath_main'] = gdac_index['file'].str.extract(filepath_main_regexp)
    filepath_regexp = '([a-z]*/[0-9]*/profiles/)[A-Z]*[0-9]*_[0-9]*[A-Z]*.nc'
    gdac_index['filepath'] = gdac_index['file'].str.extract(filepath_regexp)
    filename_regexp = '[a-z]*/[0-9]*/profiles/([A-Z]*[0-9]*_[0-9]*[A-Z]*.nc)'
    gdac_index['filename'] = gdac_index['file'].str.extract(filename_regexp)

    # Subset profiles based on time and space criteria
    gdac_index_subset = gdac_index.loc[np.logical_and.reduce([gdac_index['latitude'] >= lat_range[0],
                                                              gdac_index['latitude'] <= lat_range[1],
                                                              gdac_index['date'] >= start_date,
                                                              gdac_index['date'] <= end_date]),:]
    if lon_range[1] >= lon_range[0]:    # range does not cross -180/180 or 0/360
        gdac_index_subset = gdac_index_subset.loc[np.logical_and(gdac_index_subset['longitude'] >= lon_range[0],
                                                             gdac_index_subset['longitude'] <= lon_range[1])]
    elif lon_range[1] < lon_range[0]:   # range crosses -180/180 or 0/360
        gdac_index_subset = gdac_index_subset.loc[np.logical_or(gdac_index_subset['longitude'] >= lon_range[0],
                                                            gdac_index_subset['longitude'] <= lon_range[1])]

    # If requested, subset profiles using float WMOID criteria
    if floats is not None:
        if type(floats) is not list: floats = [floats]
        gdac_index_subset = gdac_index_subset.loc[gdac_index_subset['wmoid'].isin(floats),:]

    # If requested, subset profiles using sensor criteria
    if sensors is not None:
        if type(sensors) is not list: sensors = [sensors]
        for sensor in sensors:
            gdac_index_subset = gdac_index_subset.loc[gdac_index_subset['parameters'].str.contains(sensor),:]

    # Examine subsetted profiles
    wmoids = gdac_index_subset['wmoid'].unique()
    wmoid_filepaths = gdac_index_subset['filepath_main'].unique()

    # Just return list of floats and DataFrame with subset of index file, or download each profile
    if not skip_download:
        downloaded_filenames = []
        if download_individual_profs:
            for p_idx in gdac_index_subset.index:
                download_file(dac_url_root + gdac_index_subset.loc[p_idx]['filepath'],
                              gdac_index_subset.loc[p_idx]['filename'],
                              save_to=save_to,overwrite=overwrite_profiles,verbose=verbose)
                downloaded_filenames.append(gdac_index_subset.loc[p_idx]['filename'])
        else:
            for f_idx, wmoid_filepath in enumerate(wmoid_filepaths):
                download_file(dac_url_root + wmoid_filepath,str(wmoids[f_idx]) + '_Sprof.nc',
                              save_to=save_to,overwrite=overwrite_profiles,verbose=verbose)
                downloaded_filenames.append(str(wmoids[f_idx]) + '_Sprof.nc')
        return wmoids, gdac_index_subset, downloaded_filenames
    else:
        return wmoids, gdac_index_subset

## Run functions to download Argo data

### Set up directories

*Note: change below directories for other users*

In [3]:
# Base filepath
root        = '/Users/hannah/Documents/Fulbright/Polar_Gliders/Research/SO_CO2_repo/'
profile_dir = root + 'data/01_raw/argo/profiles/'
natl_dir    = profile_dir + '2019_SO/'

### Run function -- this step does the actual downloaded and takes a little while

In [3]:
wmoids, gdac_index, downloaded_filenames = argo_gdac(lat_range  = [-65,-45], lon_range = None,
                                                     start_date = dt.datetime(2019, 1, 1, 0, 0, 0, 0), 
                                                     end_date   = dt.datetime(2019, 9, 1, 0, 0, 0, 0),
                                                     sensors = None,    floats = None,
                                                     overwrite_index = False, overwrite_profiles = False,
                                                     skip_download = False, download_individual_profs = False,
                                                     save_to = profile_dir, verbose = True)

>>> File argo_synthetic-profile_index.txt already exists. Leaving current version.
>>> Successfully downloaded 5904179_Sprof.nc.
>>> Successfully downloaded 5904180_Sprof.nc.
>>> Successfully downloaded 5904186_Sprof.nc.
>>> Successfully downloaded 5904397_Sprof.nc.
>>> Successfully downloaded 5904467_Sprof.nc.
>>> Successfully downloaded 5904468_Sprof.nc.
>>> Successfully downloaded 5904469_Sprof.nc.
>>> Successfully downloaded 5904481_Sprof.nc.
>>> Successfully downloaded 5904482_Sprof.nc.
>>> Successfully downloaded 5904483_Sprof.nc.
>>> Successfully downloaded 5904598_Sprof.nc.
>>> Successfully downloaded 5904599_Sprof.nc.
>>> Successfully downloaded 5904657_Sprof.nc.
>>> Successfully downloaded 5904658_Sprof.nc.
>>> Successfully downloaded 5904659_Sprof.nc.
>>> Successfully downloaded 5904660_Sprof.nc.
>>> Successfully downloaded 5904661_Sprof.nc.
>>> Successfully downloaded 5904663_Sprof.nc.
>>> Successfully downloaded 5904671_Sprof.nc.
>>> Successfully downloaded 5904673_Sprof.n

Notes from Katy and Ethan's code:

After downloading the data, the function returns three variables, which we named `wmoids`, `gdac_index`, and `downloaded_filenames`.

The variable `wmoids` is a list of the WMOID numbers of floats that were downloaded.

The variable `downloaded_filenames` is a list of the actual files that were downloaded.

`gdac_index` contains information on the profiles that matched the criteria specified in `argo_gdac()`.

# 2. Load netCDF Argo files

### Variables of interest

In [2]:
# list of variables you are interested in (this is random)

variable_list = [
    'PLATFORM_NUMBER',
#     'CHLA_ADJUSTED',
    'PSAL_ADJUSTED',
    'TEMP_ADJUSTED',
#     'BBP700_ADJUSTED',
]


variable_list = [
    'PLATFORM_NUMBER',
#     'CHLA_ADJUSTED',
    'PSAL_ADJUSTED',
    'TEMP_ADJUSTED',
#     'BBP700_ADJUSTED',
]

In [3]:
variable_list

['PLATFORM_NUMBER', 'PSAL_ADJUSTED', 'TEMP_ADJUSTED']

### Notes

- Bio variables didn't work in this function -- I guess not all profiles had chl and bbp -- figure out which profiles have what after subsetting dataset for space/time

### Function to format data after it's been downloaded

Open files with a loop, defined in the function below (`load_argo_data()`).

In [3]:
# run your load_data function (slightly adjusted)
# loop through each file, re-label the coordinates and save as new nc file to new directory 

def load_argo_data(data_directory, variable_list, output_directory):
    """
    Adjusted code to read and save each nc file in a new format with selected variables only
    Input path to directory and variable_list 
    """
    import os
#     global data
#     data = {}
    for filename in os.listdir(data_directory):
        filepath = data_directory + '/' + filename
        
        if filename.endswith('.nc'):
            ds    = xr.open_dataset(filepath)
            index = os.listdir(data_directory).index(filename)
#             print(filename)

            ds.coords['LATITUDE']        = ds.LATITUDE
            ds.coords['LONGITUDE']       = ds.LONGITUDE
            ds.coords['N_PROF']          = ds.N_PROF
            ds.coords['N_LEVELS']        = ds.N_LEVELS
            ds.coords['TIME']            = ds.JULD
            ds.coords['PLATFORM_NUMBER'] = ds.PLATFORM_NUMBER
            
            ds_points = ds.argo.profile2point()
            ds_points = ds_points[variable_list]
            ds_points.to_netcdf(output_directory + '/' + '{}_points.nc'.format(filename[:-3]))   # can add path to output directory here
        else:
            continue

### Run function to open all Argo profiles in a list:

In [4]:
argo_profiles = load_argo_data(data_directory   = r'/Users/hannah/Documents/Fulbright/Polar_Gliders/Research/SO_CO2_repo/data/01_raw/argo/profiles/2019_SO/',
                               variable_list    = variable_list,
                               output_directory = r'/Users/hannah/Documents/Fulbright/Polar_Gliders/Research/SO_CO2_repo/data/02_intermediate/argo/profiles_2019_SO/')

PermissionError: [Errno 13] Permission denied: b'/Users/hannah/Documents/Fulbright/Polar_Gliders/Research/SO_CO2_repo/data/02_intermediate/argo/profiles_2019_SO/6902734_Sprof_points.nc'

Look at one of the datasets:

In [11]:
argo_profiles

NameError: name 'argo_profiles' is not defined

# 3. Load BGC netCDF files that were near Saildrone

## 3.1 Add in variables that are missing from one of the floats (argo_5905395)

In [5]:
argo_5905395_raw = xr.open_dataset('/Users/hannah/Documents/Fulbright/Polar_Gliders/Research/SO_CO2_repo/data/01_raw/argo/profiles/2019_SO/bgc_copied/5905395_Sprof.nc')

In [7]:
argo_5905395_raw

In [22]:
# Add in missing nitrate as nan

empty_array = np.empty(len(argo_5905395_raw.CHLA_ADJUSTED)) * np.nan

argo_5905395_raw['NITRATE_ADJUSTED']       = xr.DataArray(data=empty_array, dims=['N_PROF'])
argo_5905395_raw['NITRATE_ADJUSTED_QC']    = xr.DataArray(data=empty_array, dims=['N_PROF'])
argo_5905395_raw['NITRATE_ADJUSTED_ERROR'] = xr.DataArray(data=empty_array, dims=['N_PROF'])

In [23]:
argo_5905395_raw

In [25]:
# Save new file with nitrate included
argo_5905395_raw.to_netcdf('/Users/hannah/Documents/Fulbright/Polar_Gliders/Research/SO_CO2_repo/data/01_raw/argo/profiles/2019_SO/bgc_copied/' + '5905395_Sprof_wNitrate.nc'.format('.nc'))

In [28]:
# list of variables you are interested in

variable_list_bgc = [
    'PLATFORM_NUMBER',
    'PLATFORM_TYPE',
    'PRES',
    'PRES_QC',
    'PRES_ADJUSTED',
    'PRES_ADJUSTED_QC',
    'PRES_ADJUSTED_ERROR',
    'PSAL_ADJUSTED',
    'PSAL_ADJUSTED_QC',
    'PSAL_ADJUSTED_ERROR',
    'TEMP_ADJUSTED',
    'TEMP_ADJUSTED_QC',
    'TEMP_ADJUSTED_ERROR',
    'CHLA_ADJUSTED',
    'CHLA_ADJUSTED_QC',
    'CHLA_ADJUSTED_ERROR',
    'NITRATE_ADJUSTED',
    'NITRATE_ADJUSTED_QC',
    'NITRATE_ADJUSTED_ERROR',
]

# Note that argo_5905395 is missing nitrate data -- if I can figure out how to add in NAs for this

In [30]:
argo_profiles_bgc_saildrone = load_argo_data(data_directory   = r'/Users/hannah/Documents/Fulbright/Polar_Gliders/Research/SO_CO2_repo/data/01_raw/argo/profiles/2019_SO/bgc_copied/',
                               variable_list    = variable_list_bgc,
                               output_directory = r'/Users/hannah/Documents/Fulbright/Polar_Gliders/Research/SO_CO2_repo/data/02_intermediate/argo/profiles_2019_SO_bgc_near_saildrones/')

In [19]:
argo_profiles_bgc_saildrone

# Notes

- Bio variables didn't work in this function -- I guess not all profiles had chl and bbp -- figure out which profiles have what after subsetting dataset for space/time
- Clean up script
- Plot points on map

import netCDF4 as nc
fn = '/path/to/file.nc4'
ds = nc.Dataset(fn)# Questions

1. Swap dimensions of all datasets?
2. Should I try to merge all profiles into one data set?
    - If I don't do this, I guess I'll write another loop to plot the locations of each float and decide which ones I want

In [5]:
import netCDF4 as nc
fn = '/Users/hannah/Documents/UW-PMEL/Research/so_co2_flux_repo/data/01_raw/SOCCOM_bgc_argo_float_data/SOCCOM_HiResQC_LIAR_21Dec2021_netcdf/1901378_HRQC.nc'
ds = nc.Dataset(fn)
ds

In [8]:
ds.to_xarray()

AttributeError: NetCDF: Attribute not found

In [9]:
ds = xr.open_dataset(fn)

In [10]:
ds