Gemini prompt:

> Using python, can you download the .nc files found at https://www.ncei.noaa.gov/data/oceans/ioos/atn/california_state_university_long_beach/ and convert them to the Darwin Core standard? Assign an occurrence as the first detection per location per hour.

In [1]:
import os
import glob
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin, urlparse
import xarray as xr
import netCDF4
from geopy.geocoders import Nominatim
import re
from jinja2 import Template
import codecs
#import pyobistools
from shapely.geometry import LineString
import shapely.wkt

## Create a function to recursively download files

In [4]:
def recursive_wget(url, output_dir):
    """
    Recursively downloads files from a given URL to a specified output directory,
    mirroring the directory structure of the website.

    Args:
        url (str): The URL to start downloading from.
        output_dir (str): The local directory to save files to.
    """
    print(f"Accessing: {url}")
    try:
        # --- Create the output directory if it doesn't exist ---
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            print(f"Created directory: {output_dir}")

        # --- Send a GET request and parse the HTML ---
        response = requests.get(url)
        # Raise an exception for bad status codes (like 404 Not Found)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # --- Find all links on the page ---
        for link in soup.find_all('a'):
            href = link.get('href')

            # --- Skip invalid or parent directory links ---
            if not href or href.startswith('?') or href.startswith('/') or '..' in href:
                continue

            # --- Construct the full, absolute URL for the link ---
            absolute_url = urljoin(url, href)

            # Get the path component of the URL to create local directories/files
            path = urlparse(absolute_url).path
            # Create a valid local path from the last part of the URL path
            local_path = os.path.join(output_dir, os.path.basename(path))

            # --- If the link points to a directory, recurse into it ---
            if href.endswith('/'):
                print(f"\nEntering directory: {absolute_url}")
                # Call the function again for the new directory
                recursive_wget(absolute_url, local_path)
            # --- If the link points to a file, download it ---
            else:
                download_file(absolute_url, local_path)

    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error accessing URL {url}: {e}")
    except requests.exceptions.RequestException as e:
        print(f"Error accessing URL {url}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


def download_file(url, local_path):
    """
    Downloads a single file from a URL and saves it to a local path.

    Args:
        url (str): The URL of the file to download.
        local_path (str): The local path where the file will be saved.
    """
    try:
        print(f"  Downloading file: {os.path.basename(local_path)}")
        # Use stream=True to efficiently download large files
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            # Open the file in binary write mode
            with open(local_path, 'wb') as f:
                # Write the file in chunks
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # print(f"  Successfully downloaded {os.path.basename(local_path)}")
    except requests.exceptions.RequestException as e:
        print(f"  Failed to download {url}: {e}")
    except IOError as e:
        print(f"  Failed to write file {local_path}: {e}")

## Execute download

In [5]:
# --- Main execution block ---
start_url = "https://www.ncei.noaa.gov/data/oceans/ioos/atn/"
# Create a base directory for all the downloads
download_directory = "data/src/"

print("--- Starting Recursive Download ---")
print(f"Source URL: {start_url}")
print(f"Local Directory: {download_directory}\n")

recursive_wget(start_url, download_directory)

print("\n--- Recursive Download Finished ---")

--- Starting Recursive Download ---
Source URL: https://www.ncei.noaa.gov/data/oceans/ioos/atn/
Local Directory: data/src/

Accessing: https://www.ncei.noaa.gov/data/oceans/ioos/atn/
  Downloading file: ACCESSION_UPDATE_LOG.TXT

Entering directory: https://www.ncei.noaa.gov/data/oceans/ioos/atn/california_state_university_long_beach/
Accessing: https://www.ncei.noaa.gov/data/oceans/ioos/atn/california_state_university_long_beach/
  Downloading file: atn_45866_great-white-shark_trajectory_20090923-20091123.nc
  Downloading file: atn_45869_great-white-shark_trajectory_20090923-20091213.nc

Entering directory: https://www.ncei.noaa.gov/data/oceans/ioos/atn/cascadia_research_collective/
Accessing: https://www.ncei.noaa.gov/data/oceans/ioos/atn/cascadia_research_collective/
  Downloading file: atn_53631_false-killer-whale_trajectory_20100927-20101001.nc
  Downloading file: atn_53644_false-killer-whale_trajectory_20100927-20101118.nc
  Downloading file: atn_53652_false-killer-whale_trajector

In [2]:
def create_dwc_occurrence(ds, output_csv):

  dwc_df = pd.DataFrame()
  dwc_df['occurrenceID'] = "ioos_atn_"+ds.ptt_id+"_"+ds['time'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')+"_"+ds['z'].astype(str)+"_"+ds.animal_common_name.replace(" ","_")
  dwc_df['eventID'] = "ioos_atn_"+ds.ptt_id
  #dwc_df['eventID'] = ds.ptt_id+"_"+ds.animal_common_name.replace(" ","_") +"_"+ds['time'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')
  dwc_df['organismID'] = ds.platform_id+"_"+ds.animal_common_name.replace(" ","_")
  dwc_df['occurrenceStatus'] = 'present'
  dwc_df['basisOfRecord'] = ds['type']
  dwc_df['eventDate'] = ds['time'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')
  dwc_df['decimalLatitude'] = ds['lat']
  dwc_df['decimalLongitude'] = ds['lon']
  dwc_df['geodeticDatum'] = ds.crs.epsg_code
  dwc_df['scientificName'] = ds['taxon_name'].values.tolist()
  dwc_df['scientificNameID'] = ds['taxon_lsid'].values.tolist()
  dwc_df['samplingProtocol'] = 'satellite telemetry'
  dwc_df['kingdom'] = ds['animal'].attrs['kingdom']
  dwc_df['taxonRank'] = ds['animal'].attrs['rank']
  dwc_df['lifeStage'] = ds['animal_life_stage'].values.tolist()
  dwc_df['sex'] = ds['animal_sex'].values.tolist()
  dwc_df['associatedReferences'] = "https://doi.org/10.25921/wp4e-ph20"
  dwc_df['minimumDepthInMeters'] = ds['z'].values.tolist()
  dwc_df['maximumDepthInMeters'] = ds['z'].values.tolist()
  dwc_df['bibliographicCitation'] = ds.citation # barking about this.

  # set basisOfRecord
  dwc_df.loc[dwc_df['basisOfRecord'] == 'User','basisOfRecord'] = 'HumanObservation'
  dwc_df.loc[dwc_df['basisOfRecord'] == 'Argos','basisOfRecord'] = 'MachineObservation'
  dwc_df.loc[dwc_df['basisOfRecord'] == 'FastGPS','basisOfRecord'] = 'MachineObservation'

  # filter to respectable locations
  dwc_df['location_class'] = ds['location_class'].to_series()

  dwc_df.drop(dwc_df.loc[
      (dwc_df['location_class'] == 'A') |
      (dwc_df['location_class'] == 'B') |
      (dwc_df['location_class'] == 'Z')].index, inplace=True)

  # test using xarray
  # ds['time'].where((ds['location_class'] != 'A') &
  #     (ds['location_class'] != 'B') &
  #     (ds['location_class'] != 'Z'),drop=True).values

  print(f"  Extracted {len(dwc_df)} occurrences with valid locations.")

  # assign value to codes
  dwc_df.loc[dwc_df['location_class'] == 'nan','location_class'] = 0
  dwc_df.loc[dwc_df['location_class'] == 'G','location_class'] = 200
  dwc_df.loc[dwc_df['location_class'] == '3','location_class'] = 250
  dwc_df.loc[dwc_df['location_class'] == '2','location_class'] = 500
  dwc_df.loc[dwc_df['location_class'] == '1','location_class'] = 1500
  dwc_df.loc[dwc_df['location_class'] == '0','location_class'] = 10000

  # --- Define Occurrences: First detection per location per hour ---
  dwc_df['event_hour'] = pd.to_datetime(dwc_df['eventDate']).dt.strftime('%Y-%m-%dT%H')
  dwc_df.sort_values('event_hour', inplace=True)
  duplicate_counts = dwc_df.groupby(by='event_hour').transform('size')
  dwc_df['dataGeneralizations'] = f'first of ' + duplicate_counts.astype(str) + ' records for this hour.'
  dwc_df.loc[dwc_df['dataGeneralizations']=='first of 1 records for this hour.','dataGeneralizations'] = ''
  dwc_df = dwc_df.drop_duplicates(subset=['event_hour'], keep='first').copy()

  dwc_df['occurrenceRemarks'] = 'This is a representative occurrence from a full deployment. For the complete dataset please see https://doi.org/10.25921/wp4e-ph20.'

  print(f"  Extracted {len(dwc_df)} occurrences to first row in hour.")

  # --- Rename a and drop few columns --
  dwc_df.rename(columns={'location_class': 'coordinateUncertaintyInMeters',
                          },
                inplace=True)

  dwc_df.drop(columns=['event_hour'], inplace=True)

  # only pick specific columns to save
  cols = ['eventID', 'occurrenceID', 'occurrenceStatus', 'basisOfRecord',
          'organismID', 'eventDate', 'decimalLatitude',
          'decimalLongitude', 'geodeticDatum',
          'scientificName', 'scientificNameID',
          'samplingProtocol', 'kingdom', 'taxonRank', 'lifeStage',
          'sex', 'associatedReferences',
          'coordinateUncertaintyInMeters', 
          'minimumDepthInMeters', 'maximumDepthInMeters',
          'dataGeneralizations', 'bibliographicCitation',
          'occurrenceRemarks']
  
  # Save the individual CSV
  dwc_df.to_csv(output_csv, columns=cols, index=False)
  print(f"  Saved data to '{output_csv}'")

  return dwc_df, cols


def create_dwc_event(dwc_df, output_csv):

  # --- Processing for Event ---
  # event_df = dwc_df.loc[dwc_df['basisOfRecord']=='HumanObservation',
  # ['eventID','eventDate','decimalLatitude','decimalLongitude','geodeticDatum',
  #   'minimumDepthInMeters','maximumDepthInMeters']]
  #event_df['parentEventID'] = dwc_df['occurrenceID'].str.split('_',expand=True).iloc[:,:3].apply(lambda x: '_'.join(x), axis=1).unique()
  
  # create parent event that is a summary of dwc_df
  event_df = pd.DataFrame()
  event_df['eventID'] = dwc_df['eventID'].unique()

  #event_df['eventID'] = dwc_df['occurrenceID'].str.split('_',expand=True).iloc[:,:3].apply(lambda x: '_'.join(x), axis=1).unique()
  event_df['eventDate'] = dwc_df['eventDate'].min() + '/' + dwc_df['eventDate'].max()
  event_df['footprintWKT'] = LineString(list(zip(dwc_df['decimalLongitude'], dwc_df['decimalLatitude'])))
  event_df['footprintWKT'] = event_df['footprintWKT'][0].wkt
  event_df['footprintWKT'] = event_df['footprintWKT'].str.replace('LINESTRING', 'MULTIPOINT')
  event_df['minimumDepthInMeters'] = dwc_df['minimumDepthInMeters'].min()
  event_df['maximumDepthInMeters'] = dwc_df['maximumDepthInMeters'].max()
  #event_df['basisOfRecord'] = 'Event' # barking about this
  event_df['eventType'] = 'deployment'

  #event_df = pd.concat([event_parent_df, event_df])

  if event_df.empty:
      print("No HumanObservations found in the dataset.")
      return pd.DataFrame()  # Return an empty DataFrame if no observations are found
  else:  
    print(f"  found {len(event_df)} HumanObservations.")
    event_df['countryCode'] = 'US'
    event_df['samplingProtocol'] = 'satellite telemetry'
    
    # # initialize Nominatim API - not trusted enough yet
    # # see https://nominatim.org/release-docs/develop/api/Reverse/
    # geolocator = Nominatim(user_agent="my_geopy_app")

    # lat = event_df['decimalLatitude'][0].astype(str)
    # lon = event_df['decimalLongitude'][0].astype(str)

    # location = geolocator.reverse(lat+","+lon)

    # event_df['countryCode'] = location.raw['address'].get('country_code').upper()
    event_df.to_csv(output_csv.replace("occurrence","event"), index=False)
    print(f"  Created {len(event_df)} events.")
    print(f"  Saved data to {output_csv.replace('occurrence','event')}")

    return event_df


def create_dwc_emof(ds, dwc_df, output_csv):
  ## Add more details about the sensor following https://www.gbif.org/occurrence/5068380514
  # See https://vocab.nerc.ac.uk/search_nvs/MVB/
  # add serial number and all the other details here.
  # --- Processing for emof ---
  vars = list(ds.keys())
  animal_vars = [x for x in vars if re.match(r'animal_(?!life_stage\b|sex\b).*',x)]
  new_rows = pd.DataFrame()

  for animal_var in animal_vars:
    row = pd.DataFrame({
        'measurementValue': ds[animal_var].values.tolist(),
        'measurementType': [f'{animal_var}: {ds[animal_var].long_name}'],
        'measurementMethod': ds[animal_var].attrs[animal_var],
        'measurementUnit': [ds[animal_var].units if 'units' in ds[animal_var].attrs else ''],
    })
    new_rows = pd.concat([new_rows,
                          row])

  emof_df = dwc_df.loc[dwc_df['basisOfRecord']=='HumanObservation',
                      ['eventID','occurrenceID']
                      ].merge(
                          new_rows,
                          left_index=True,
                          right_index=True)

  emof_df.dropna(axis=0, subset=['measurementValue'], inplace=True)
  # barking about eventID in emof.
  if emof_df.empty:
    print(f'  no emof data found')
    return pd.DataFrame()  # Return an empty DataFrame if no observations are found
  else:
    emof_df.to_csv(output_csv.replace("occurrence","emof"), index=False)
    print(f"  Created {len(emof_df)} emofs.")
    print(f"  Saved data to {output_csv.replace('occurrence','emof')}")
    return emof_df

In [3]:
# EML generation

# borrowed from https://gitlab.oceantrack.org/otn-partner-nodes/ipython-utilities/-/blob/main/dbtools/publish_to_obis.py?ref_type=heads



def save_eml_file(eml_metadata:dict) -> str:
    """
    Save EML dictionary in a file
    Author: Jon Pye, Angela Dini
    Maintainer: Angela Dini
    :param eml_metadata: dictionary of EML metadata
    :return: filepath of where the EML filepath will be
    """
    # Write it out to the package
    template_file = codecs.open('templates/eml.xml.j2', 'r', 'UTF-8').read()
    template = Template(template_file)
    result_string = template.render(eml_metadata)
    eml_file = 'data/dwc/{ptt_id}/eml.xml'.format(**eml_metadata)
    fh = codecs.open(eml_file, 'wb+', 'UTF-8')
    fh.write(result_string)
    fh.close()
    eml_full_path = os.path.abspath(eml_file)
    print(f"  EML metadata has been written to '{eml_full_path}'.")
    return eml_full_path

def create_eml(ds):
    eml_metadata = ds.attrs

    contributors = dict()
    for attr in [x for x in ds.attrs if re.match(r'contributor_(?!role_vocabulary\b).*',x)]:
        contributors[attr] = ds.attrs[attr].split(",")

    contributors_list = [
        {key: contributors[key][i] for key in contributors}
        for i in range(len(next(iter(contributors.values()))))
    ]

    other_meta = {
        'dataset_ipt_id': None,
        'dataset_short_name': ds.encoding.get('source').split("\\")[-1].replace(".nc",""),
        'data_manager_firstname': 'Megan',
        'data_manager_lastname': 'McKinzie',
        'data_manager_title': 'Data Manager',
        'data_manager_phone': '',
        'data_manager_email': 'mmckinzie@mbari.org',
        'contributors': contributors_list,
    }

    eml_metadata.update(other_meta)

    save_eml_file(eml_metadata)
    
    return eml_metadata

In [4]:
def create_meta_xml(dwc_df, emof_df, event_df, output_csv, cols):
    """
    Create meta.xml file for the Darwin Core dataset.
    
    Args:
        dwc_df (DataFrame): DataFrame containing Darwin Core occurrence data.
        emof_df (DataFrame): DataFrame containing eMoF data.
        event_df (DataFrame): DataFrame containing event data.
        output_csv (str): Path to the output CSV file.
        dir (str): Directory where the meta.xml will be saved.
    """
    # Ensure the directory exists
    try:
        os.path.exists(output_csv)
    except:
        print(f"Missing directory: {output_csv}")

    # create and include the meta.xml and eml.xml
    # set the meta.xml paramaters by hand, using the format of the dataframes above
    meta_xml_vars = {}

    # when writing dwc occurrence file, we only save some columns
    dwc_df = dwc_df[cols].copy()
    
    meta_xml_vars['cols_list'] = dwc_df.columns.tolist()
    meta_xml_vars['occurrence_filename'] = output_csv

    if not emof_df.empty:
        meta_xml_vars ['emof_cols_list'] = emof_df.columns.tolist()
        meta_xml_vars['emof_filename'] = output_csv.replace("occurrence","emof")

    if not event_df.empty:
        meta_xml_vars['event_cols_list'] = event_df.columns.tolist()
        meta_xml_vars['event_filename'] = output_csv.replace("occurrence","event")

    # grab the template file for making meta.xml
    meta_template_file = codecs.open('templates/meta.xml.j2', 'r', 'UTF-8').read()
    meta_template = Template(meta_template_file)
    meta_result_string = meta_template.render(meta_xml_vars)
    dir = os.path.join(*output_csv.split("\\")[:-1])
    meta_file = f'{dir}/meta.xml'

    fh = codecs.open(meta_file, 'wb+', 'UTF-8')
    fh.write(meta_result_string)
    fh.close()
    meta_full_path = os.path.abspath(meta_file)
    print(f"  Meta XML has been written to '{meta_full_path}'.")

In [5]:
import zipfile

def package_dwc_zip(output_dir="data/dwc", zip_filename="data/dwc_package.zip"):
    """
    Packages all CSV and XML files in the specified output directory into a zip file.

    Args:
        output_dir (str): The directory containing the files to package.
        zip_filename (str): The path for the output zip file.
    """
    print(f"Packaging Darwin Core files from '{output_dir}' into '{zip_filename}'...")
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(output_dir):
            for file in files:
                if file.endswith(('.csv', '.xml')):
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, output_dir)
                    zipf.write(file_path, arcname)
    print(f"  ✅ Packaged files into '{zip_filename}'")

In [6]:
def convert_to_dwc_individual(file_paths, output_dir="data/dwc"):
    """
    Converts a list of NetCDF files to individual Darwin Core Occurrence CSVs.

    An "occurrence" is the first detection of an animal at a specific
    location within a given hour.

    Args:
        file_paths (list): A list of paths to the .nc files.
        output_dir (str): The directory to save the individual CSV files.
    """
    print(f"\n--- 2. Starting Darwin Core Conversion (Individual Files) ---")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")

    processed_count = 0

    for nc_file in file_paths:

        base_filename = os.path.basename(nc_file)

        if not os.path.exists(f"{output_dir}/{base_filename.split('_')[1]}"):
          os.makedirs(f"{output_dir}/{base_filename.split('_')[1]}")
          print(f"Created output directory: {output_dir}/{base_filename.split('_')[1]}")

        output_csv = os.path.join(output_dir, f"{base_filename.split('_')[1]}/{os.path.splitext(base_filename)[0]}_occurrence.csv")
        output_csv = os.path.normpath(output_csv)
        #output_dir = os.path.join(*output_csv.split("\\")[:-1])
        print(f"Processing {base_filename}...")

        try:
            with xr.open_dataset(nc_file, engine='netcdf4') as ds:
                df = ds.to_dataframe().reset_index()

                print(f"Found {len(df)} records.")

                # --- Data Cleaning and Preparation ---
                if 'lat' not in df.columns or 'lon' not in df.columns:
                    print(f"  Skipping {base_filename}: missing location data.")
                    continue

                df.dropna(subset=['lat', 'lon', 'time'], inplace=True)
                if df.empty:
                    print(f"  Skipping {base_filename}: no valid records.")
                    continue

                # --- Map to Darwin Core Occurrence Terms ---
                dwc_df, cols = create_dwc_occurrence(ds, output_csv)

                # Create and save eml
                create_eml(ds)

                # --- Event and eMoF (as needed) ---
                event_df = create_dwc_event(dwc_df, output_csv)
                emof_df = create_dwc_emof(ds, dwc_df, output_csv)

                # --- Create meta.xml file ---
                create_meta_xml(dwc_df, emof_df, event_df, output_csv, cols)

                # --- Package into DwC-A ---
                output_dir_zip = f"data/dwc/{base_filename.split('_')[1]}/"
                zip_filename=f"data/dwc/{base_filename.split('_')[1]}/{base_filename.replace('.nc','.zip')}"
                package_dwc_zip(output_dir=output_dir_zip, zip_filename=zip_filename)

                processed_count += 1

        except Exception as e:
            print(f"  Could not process {base_filename}: {e}")

    print(f"\n--- 3. Conversion Complete ---")
    print(f"✅ Success! Processed {processed_count} files.")

Convert data to DarwinCore

In [23]:
import glob

# Step 1: Download all .nc files from the URL
#local_files = glob.glob('data\\src\\*.nc')#[:10]

local_files = ['data\\src\\atn_137491_spotted-seal_trajectory_20180418-20180526.nc',
               'data\\src\\atn_137494_ribbon-seal_trajectory_20140426-20140426.nc',
               'data\\src\\atn_38553_bearded-seal_trajectory_20110618-20120314.nc'
             ]
# Step 2: Convert the downloaded files to individual Darwin Core CSVs
if local_files:
    convert_to_dwc_individual(local_files)
else:
    print("No files were downloaded, so conversion cannot proceed.")


--- 2. Starting Darwin Core Conversion (Individual Files) ---
Processing atn_137491_spotted-seal_trajectory_20180418-20180526.nc...
Found 107 records.
  Extracted 12 occurrences with valid locations.
  Extracted 5 occurrences to first row in hour.
  Saved data to 'data\dwc\137491\atn_137491_spotted-seal_trajectory_20180418-20180526_occurrence.csv'
  EML metadata has been written to 'c:\Users\Mathew.Biddle\Documents\GitProjects\bio_data_guide\datasets\atn_satellite_telemetry\data\dwc\137491\eml.xml'.
  found 1 HumanObservations.
  Created 1 events.
  Saved data to data\dwc\137491\atn_137491_spotted-seal_trajectory_20180418-20180526_event.csv
  Created 2 emofs.
  Saved data to data\dwc\137491\atn_137491_spotted-seal_trajectory_20180418-20180526_emof.csv
  Meta XML has been written to 'c:\Users\Mathew.Biddle\Documents\GitProjects\bio_data_guide\datasets\atn_satellite_telemetry\data\dwc\137491\meta.xml'.
Packaging Darwin Core files from 'data/dwc/137491/' into 'data/dwc/137491/atn_137491_

Testing

In [15]:
import xarray as xr
import pandas as pd


ds = xr.open_dataset('data/src/atn_74626_bearded-seal_trajectory_20090625-20100128.nc', engine='netcdf4')
# #ds = xr.open_dataset('data/src/atn_174787_spotted-seal_trajectory_20180410-20180610.nc', engine='netcdf4')
output_csv = 'data//dwc//137491//atn_137491_spotted-seal_trajectory_20180418-20180526_occurrence.csv'

cols = ['occurrenceID', 'occurrenceStatus', 'basisOfRecord',
          'organismID', 'eventDate', 'decimalLatitude',
          'decimalLongitude', 'geodeticDatum',
          'scientificName', 'scientificNameID', 'eventID',
          'samplingProtocol', 'kingdom', 'taxonRank', 'lifeStage',
          'sex', 'associatedReferences',
          'coordinateUncertaintyInMeters', 'dataGeneralizations', 'bibliographicCitation',
          'occurrenceRemarks']

dwc_df, cols = create_dwc_occurrence(ds, output_csv)

# event_df = create_dwc_event(dwc_df)

# emof_df = create_dwc_emof(ds, dwc_df)

dwc_df

  Extracted 1040 occurrences with valid locations.
  Extracted 789 occurrences to first row in hour.
  Saved data to 'data//dwc//137491//atn_137491_spotted-seal_trajectory_20180418-20180526_occurrence.csv'


Unnamed: 0,occurrenceID,eventID,organismID,occurrenceStatus,basisOfRecord,eventDate,decimalLatitude,decimalLongitude,geodeticDatum,scientificName,...,taxonRank,lifeStage,sex,associatedReferences,minimumDepthInMeters,maximumDepthInMeters,bibliographicCitation,coordinateUncertaintyInMeters,dataGeneralizations,occurrenceRemarks
0,ioos_atn_74626_2009-06-25T03:30:00Z_0.0_bearde...,74626_bearded_seal_2009-06-25T03:30:00Z,137079_bearded_seal,present,HumanObservation,2009-06-25T03:30:00Z,66.5000,-162.7000,EPSG:4326,Erignathus barbatus,...,Species,adult,male,https://doi.org/10.25921/wp4e-ph20,0.0,0.0,"Boveng, Peter; London, Josh; Cameron, Michael;...",0,first of 3 records for this hour.,This is a representative occurrence from a ful...
3,ioos_atn_74626_2009-06-25T04:03:31Z_0.0_bearde...,74626_bearded_seal_2009-06-25T04:03:31Z,137079_bearded_seal,present,MachineObservation,2009-06-25T04:03:31Z,66.5454,-162.6788,EPSG:4326,Erignathus barbatus,...,Species,adult,male,https://doi.org/10.25921/wp4e-ph20,0.0,0.0,"Boveng, Peter; London, Josh; Cameron, Michael;...",1500,,This is a representative occurrence from a ful...
4,ioos_atn_74626_2009-06-25T05:05:54Z_0.0_bearde...,74626_bearded_seal_2009-06-25T05:05:54Z,137079_bearded_seal,present,MachineObservation,2009-06-25T05:05:54Z,66.5607,-162.6883,EPSG:4326,Erignathus barbatus,...,Species,adult,male,https://doi.org/10.25921/wp4e-ph20,0.0,0.0,"Boveng, Peter; London, Josh; Cameron, Michael;...",10000,first of 4 records for this hour.,This is a representative occurrence from a ful...
8,ioos_atn_74626_2009-06-25T06:54:23Z_0.0_bearde...,74626_bearded_seal_2009-06-25T06:54:23Z,137079_bearded_seal,present,MachineObservation,2009-06-25T06:54:23Z,66.5434,-162.7785,EPSG:4326,Erignathus barbatus,...,Species,adult,male,https://doi.org/10.25921/wp4e-ph20,0.0,0.0,"Boveng, Peter; London, Josh; Cameron, Michael;...",10000,,This is a representative occurrence from a ful...
9,ioos_atn_74626_2009-06-25T07:19:22Z_0.0_bearde...,74626_bearded_seal_2009-06-25T07:19:22Z,137079_bearded_seal,present,MachineObservation,2009-06-25T07:19:22Z,66.5516,-162.7268,EPSG:4326,Erignathus barbatus,...,Species,adult,male,https://doi.org/10.25921/wp4e-ph20,0.0,0.0,"Boveng, Peter; London, Josh; Cameron, Michael;...",10000,,This is a representative occurrence from a ful...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5651,ioos_atn_74626_2010-03-08T18:16:54Z_0.0_bearde...,74626_bearded_seal_2010-03-08T18:16:54Z,137079_bearded_seal,present,MachineObservation,2010-03-08T18:16:54Z,64.3625,-161.7444,EPSG:4326,Erignathus barbatus,...,Species,adult,male,https://doi.org/10.25921/wp4e-ph20,0.0,0.0,"Boveng, Peter; London, Josh; Cameron, Michael;...",10000,,This is a representative occurrence from a ful...
5652,ioos_atn_74626_2010-03-08T19:03:21Z_0.0_bearde...,74626_bearded_seal_2010-03-08T19:03:21Z,137079_bearded_seal,present,MachineObservation,2010-03-08T19:03:21Z,64.3717,-161.8104,EPSG:4326,Erignathus barbatus,...,Species,adult,male,https://doi.org/10.25921/wp4e-ph20,0.0,0.0,"Boveng, Peter; London, Josh; Cameron, Michael;...",10000,first of 2 records for this hour.,This is a representative occurrence from a ful...
5654,ioos_atn_74626_2010-03-09T05:05:32Z_0.0_bearde...,74626_bearded_seal_2010-03-09T05:05:32Z,137079_bearded_seal,present,FastGPS,2010-03-09T05:05:32Z,64.3360,-161.8394,EPSG:4326,Erignathus barbatus,...,Species,adult,male,https://doi.org/10.25921/wp4e-ph20,0.0,0.0,"Boveng, Peter; London, Josh; Cameron, Michael;...",200,first of 3 records for this hour.,This is a representative occurrence from a ful...
5658,ioos_atn_74626_2010-03-09T07:31:34Z_0.0_bearde...,74626_bearded_seal_2010-03-09T07:31:34Z,137079_bearded_seal,present,MachineObservation,2010-03-09T07:31:34Z,64.3370,-161.8410,EPSG:4326,Erignathus barbatus,...,Species,adult,male,https://doi.org/10.25921/wp4e-ph20,0.0,0.0,"Boveng, Peter; London, Josh; Cameron, Michael;...",10000,,This is a representative occurrence from a ful...


In [10]:
# convert_to_dwc_individual(['data/src/atn_137491_spotted-seal_trajectory_20180418-20180526.nc'])

In [None]:
ds

<xarray.Dataset>
Dimensions:               (obs: 5660)
Coordinates:
    time                  (obs) datetime64[ns] ...
    z                     (obs) float64 ...
    lat                   (obs) float64 ...
    lon                   (obs) float64 ...
Dimensions without coordinates: obs
Data variables: (12/32)
    deploy_id             object ...
    ptt                   (obs) float64 ...
    instrument            (obs) object ...
    type                  (obs) object ...
    location_class        (obs) object ...
    error_radius          (obs) float64 ...
    ...                    ...
    animal                object ...
    instrument_tag        object ...
    instrument_location   object ...
    taxon_name            object ...
    taxon_lsid            object ...
    comment               (obs) object ...
Attributes: (12/89)
    date_created:                 2024-08-28T00:18:16Z
    featureType:                  trajectory
    cdm_data_type:                Trajectory
    Convent

## Push to IPT following

https://github.com/cioos-siooc/pyobistools/blob/ipt_publishing/pyobistools/publish_to_ipt.py

In [26]:
import requests
import os
from requests_toolbelt.multipart.encoder import MultipartEncoder
from bs4 import BeautifulSoup

# TODO: extend this to allow checking for different versions of the IPT as the forms may change?
# Functional for IPT 2.6.3

def open_ipt_session(ipt_auth, ipt_url):
    """
    Begin a session with the target IPT
    Author: Jon Pye
    :param ipt_auth: Authentication details for the ipt, of the form {'email': 'email@mailserver.com', 'password':'cleartextPassword'}
    :param ipt_url: URL of the IPT we are authenticating with.
    :return: None
    """
    
    # relative path to IPT login form
    login_url = ipt_url + 'login.do'
    
    s = requests.Session()  # open a session
    
    # retrieve the login form
    resp = s.get(login_url)
    
    # login forms generate a CSRF token that we have to persist in our response  
    soup = BeautifulSoup(resp.text, 'lxml')
    
    # Add it to our credentials dictionary
    ipt_auth['csrfToken'] = soup.find("input", {"name": "csrfToken"})['value']
    login = s.post(login_url, data=ipt_auth)
    
    if login.status_code != 200:
        print("Login failed, status code {}".format(login.status_code))
        print(login.text)
        return None
    else:
        return s


def create_new_ipt_project(projname: str, filepath: str, ipt_url: str, ipt_session):
    
    """
    Create a new project on the given IPT using an existing DwC archive zip
    Author: Jon Pye
    :param projname: the project name as given by get_obis_shortname()
    :param filepath: payload resource filepath
    :param ipt_url: URL of the IPT to publish to
    :param ipt_session: authenticated requests session for the IPT
    :return: URL of the resource
    """
    
    path, filename = os.path.split(filepath)
    
    if not filename:  # if the filepath has no name in it
        print('no file specified in filepath, aborting')
        return None
    else:
        print(path, filename)
        print(filepath)
    
    # if there IS a file and it is not a valid DwC Archive, do we want to do anything here? The IPT runs its own checks...
    
    values = MultipartEncoder(fields={'create': 'Create',  # hidden form fields with values
                                      'shortname': projname,
                                      'resourceType': 'samplingevent',
                                      '__checkbox_importDwca': 'true',
                                      'importDwca': 'true',
                                      'file': (filename, 
                                               open(filepath, 'rb'),
                                               'application/x-zip-compressed'),
                                     }
                             )
    create_dataset = ipt_session.post(ipt_url + 'manage/create.do',
                                      data=values,
                                      headers={'Content-Type': values.content_type}
                                     )
    return create_dataset

def refresh_ipt_project_files(projname: str, filepath: str, ipt_url: str, ipt_session):
    """
    Update data for a project on the given IPT using an existing DwC archive zip
    Author: Jon Pye
    :param projname: the project name as given by get_obis_shortname()
    :param filepath: payload resource filepath
    :param ipt_url: URL of the IPT to publish to
    :param ipt_session: authenticated requests session for the IPT
    :return: URL of the resource
    """
    
    path, filename = os.path.split(filepath)
    
    if not filename:  # if the filepath has no name in it
        print('no file specified in filepath, aborting')
        return None
    
    values = MultipartEncoder(fields={  'add': 'Add',
                                        'r': projname,
                                        'sourceType': 'source-file',
                                        'validate': 'false', 
                                        'file': (filename,
                                                 open(filepath, 'rb'),
                                                 'application/x-zip-compressed'),
                                     })
    
    update_dataset = ipt_session.post(ipt_url + 'manage/addsource.do',
                                      data=values, 
                                      headers = {'Content-Type': values.content_type}
                                     )
    if update_dataset.status_code == 200:
        # Handle the Are you Sure popup.        
        print("Publication successful")
        return update_dataset
    else:
        print("publication error, check landing page output")
        return update_dataset


def refresh_ipt_project_metadata(projname: str, filepath: str, ipt_url: str, ipt_session):
    """
    Update metadata for a project on the given IPT using an existing eml.xml file
    Author: Jon Pye
    :param projname: the project name as given by get_obis_shortname()
    :param filepath: payload resource filepath
    :param ipt_url: URL of the IPT to publish to
    :param ipt_session: authenticated requests session for the IPT
    :return: URL of the resource
    """
    
    path, filename = os.path.split(filepath)
    
    if not filename:  # if the filepath has no name in it
        print('no file specified in filepath, aborting')
        return None
    
    values = MultipartEncoder(fields={  'emlReplace': 'Replace',
                                        'r': projname,
                                        'sourceType': 'source-file',
                                        'validateEml': 'true',
                                        '__checkbox_validateEml': 'true',
                                        'emlFile': (filename,
                                                    open(filepath, 'rb'),
                                                    'application/xml'),
                                     })

    update_metadata = ipt_session.post(ipt_url + 'manage/replace-eml.do',
                                       data=values, 
                                       headers = {'Content-Type':values.content_type}
                                      )
    return update_metadata


def make_public_ipt_project(projname: str, ipt_url: str, ipt_session):
    """
    Update metadata for a project on the given IPT
    Author: Jon Pye
    :param projname: the project name as given by get_obis_shortname()
    :param ipt_url: URL of the IPT to publish to
    :param ipt_session: authenticated requests session for the IPT
    :return: URL of the resource
    """
    pub_params = {'r' : projname,          # resource = dataset name
                  'makePrivate': 'Public'
                 }
    
    contents = ipt_session.post(ipt_url + 'manage/resource-makePublic.do', data = pub_params)
    return contents


def publish_ipt_project(projname: str, ipt_url: str, ipt_session, publishing_notes: str = ""):
    """
    Update metadata for a project on the given IPT
    Author: Jon Pye
    :param projname: the project name as given by get_obis_shortname()
    :param ipt_url: URL of the IPT to publish to
    :param ipt_session: authenticated requests session for the IPT
    :param publishing_notes: optional message to publish this version with
    :return: URL of the resource
    """
    
    pub_params = {'r' : projname,      # resource = dataset name
                  'autopublish': '',
                  'currPubMode' : 'AUTO_PUBLISH_OFF',
                  'pubMode': '',
                  'currPubFreq': '',
                  'pubFreq': '',
                  'publish': 'Publish',
                  'summary': publishing_notes
             }
    contents = ipt_session.post(ipt_url + 'manage/publish.do', data = pub_params)
    return contents


def check_if_project_exists(projname: str, ipt_url: str, ipt_session):
    """
    Test if a project exists on the IPT already
    Author: Jon Pye
    :param projname: the project name as given by get_obis_shortname()
    :param ipt_url: URL of the IPT to check for this publication
    :param ipt_session: authenticated requests session for the IPT
    :return: True if the project already exists on the IPT in question
    """

    checkUrl = '{ipt_url}ipt/resource?r={projname}'.format(ipt_url=ipt_url, projname=projname)

    contents = ipt_session.post(checkUrl)

    # if it's not found, the IPT returns a 404
    if contents.status_code == 404:
        print("No existing repository by this name: '{}'".format(projname))
        return False
    elif contents.status_code == 200:
        print("Found existing project by name: '{}'".format(projname))
        return True

In [30]:
from dotenv import dotenv_values

config = dotenv_values(".env")

ipt_auth = {
    'email': config['IPT_ADMIN_EMAIL'],
    'password': config['IPT_PASSWORD'],
}

ipt_url = 'https://ipt-obis.gbif.us/'

ipt_session = open_ipt_session(ipt_auth, ipt_url)

#filepath = 'data/dwc/38553/atn_38553_bearded-seal_trajectory_20110618-20120314.zip'
#filepath = 'data/dwc/137491/atn_137491_spotted-seal_trajectory_20180418-20180526.zip'
filepath = 'data/dwc/137494/atn_137494_ribbon-seal_trajectory_20140426-20140426.zip'

projname = filepath.split("/")[-1].replace(".zip","")

create_dataset = create_new_ipt_project(projname, filepath, ipt_url, ipt_session)

print(create_dataset)

data/dwc/137494 atn_137494_ribbon-seal_trajectory_20140426-20140426.zip
data/dwc/137494/atn_137494_ribbon-seal_trajectory_20140426-20140426.zip
<Response [200]>
