Gemini prompt:

> Using python, can you download the .nc files found at https://www.ncei.noaa.gov/data/oceans/ioos/atn/california_state_university_long_beach/ and convert them to the Darwin Core standard? Assign an occurrence as the first detection per location per hour.

In [1]:
import os
import glob
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin, urlparse
import xarray as xr
import netCDF4
from geopy.geocoders import Nominatim
import re
#import pyobistools

## Create a function to recursively download files

In [2]:
def recursive_wget(url, output_dir):
    """
    Recursively downloads files from a given URL to a specified output directory,
    mirroring the directory structure of the website.

    Args:
        url (str): The URL to start downloading from.
        output_dir (str): The local directory to save files to.
    """
    print(f"Accessing: {url}")
    try:
        # --- Create the output directory if it doesn't exist ---
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            print(f"Created directory: {output_dir}")

        # --- Send a GET request and parse the HTML ---
        response = requests.get(url)
        # Raise an exception for bad status codes (like 404 Not Found)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # --- Find all links on the page ---
        for link in soup.find_all('a'):
            href = link.get('href')

            # --- Skip invalid or parent directory links ---
            if not href or href.startswith('?') or href.startswith('/') or '..' in href:
                continue

            # --- Construct the full, absolute URL for the link ---
            absolute_url = urljoin(url, href)

            # Get the path component of the URL to create local directories/files
            path = urlparse(absolute_url).path
            # Create a valid local path from the last part of the URL path
            local_path = os.path.join(output_dir, os.path.basename(path))

            # --- If the link points to a directory, recurse into it ---
            if href.endswith('/'):
                print(f"\nEntering directory: {absolute_url}")
                # Call the function again for the new directory
                recursive_wget(absolute_url, local_path)
            # --- If the link points to a file, download it ---
            else:
                download_file(absolute_url, local_path)

    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error accessing URL {url}: {e}")
    except requests.exceptions.RequestException as e:
        print(f"Error accessing URL {url}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


def download_file(url, local_path):
    """
    Downloads a single file from a URL and saves it to a local path.

    Args:
        url (str): The URL of the file to download.
        local_path (str): The local path where the file will be saved.
    """
    try:
        print(f"  Downloading file: {os.path.basename(local_path)}")
        # Use stream=True to efficiently download large files
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            # Open the file in binary write mode
            with open(local_path, 'wb') as f:
                # Write the file in chunks
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # print(f"  Successfully downloaded {os.path.basename(local_path)}")
    except requests.exceptions.RequestException as e:
        print(f"  Failed to download {url}: {e}")
    except IOError as e:
        print(f"  Failed to write file {local_path}: {e}")

## Execute download

In [3]:
# --- Main execution block ---
start_url = "https://www.ncei.noaa.gov/data/oceans/ioos/atn/"
# Create a base directory for all the downloads
download_directory = "data/src/"

print("--- Starting Recursive Download ---")
print(f"Source URL: {start_url}")
print(f"Local Directory: {download_directory}\n")

recursive_wget(start_url, download_directory)

print("\n--- Recursive Download Finished ---")

--- Starting Recursive Download ---
Source URL: https://www.ncei.noaa.gov/data/oceans/ioos/atn/
Local Directory: data/src/

Accessing: https://www.ncei.noaa.gov/data/oceans/ioos/atn/
  Downloading file: ACCESSION_UPDATE_LOG.TXT

Entering directory: https://www.ncei.noaa.gov/data/oceans/ioos/atn/california_state_university_long_beach/
Accessing: https://www.ncei.noaa.gov/data/oceans/ioos/atn/california_state_university_long_beach/
  Downloading file: atn_45866_great-white-shark_trajectory_20090923-20091123.nc
  Downloading file: atn_45869_great-white-shark_trajectory_20090923-20091213.nc

Entering directory: https://www.ncei.noaa.gov/data/oceans/ioos/atn/cascadia_research_collective/
Accessing: https://www.ncei.noaa.gov/data/oceans/ioos/atn/cascadia_research_collective/
  Downloading file: atn_53631_false-killer-whale_trajectory_20100927-20101001.nc
  Downloading file: atn_53644_false-killer-whale_trajectory_20100927-20101118.nc
  Downloading file: atn_53652_false-killer-whale_trajector

In [4]:
def create_dwc_occurrence(ds):

  dwc_df = pd.DataFrame()
  dwc_df['occurrenceID'] = "ioos_atn_"+ds['time'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')+"_"+ds['z'].astype(str)+"_"+ds.animal_common_name.replace(" ","_")
  dwc_df['occurrenceStatus'] = 'present'
  dwc_df['basisOfRecord'] = ds['type']
  dwc_df['organismID'] = ds.platform_id+"_"+ds.animal_common_name.replace(" ","_")
  dwc_df['eventDate'] = ds['time'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')
  dwc_df['decimalLatitude'] = ds['lat']
  dwc_df['decimalLongitude'] = ds['lon']
  dwc_df['geodeticDatum'] = ds.crs.epsg_code
  dwc_df['scientificName'] = ds['taxon_name'].values.tolist()
  dwc_df['scientificNameID'] = ds['taxon_lsid'].values.tolist()
  dwc_df['eventID'] = ds.animal_common_name.replace(" ","_") +"_"+ds['time'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')
  dwc_df['samplingProtocol'] = 'satellite telemetry'
  dwc_df['kingdom'] = ds['animal'].attrs['kingdom']
  dwc_df['taxonRank'] = ds['animal'].attrs['rank']
  dwc_df['lifeStage'] = ds['animal_life_stage'].values.tolist()
  dwc_df['sex'] = ds['animal_sex'].values.tolist()
  dwc_df['associatedReferences'] = "https://doi.org/10.25921/wp4e-ph20"
  dwc_df['minimumDepthInMeters'] = ds['z'].values.tolist()
  dwc_df['maximumDepthInMeters'] = ds['z'].values.tolist()

  # set basisOfRecord
  dwc_df.loc[dwc_df['basisOfRecord'] == 'User','basisOfRecord'] = 'HumanObservation'
  dwc_df.loc[dwc_df['basisOfRecord'] == 'Argos','basisOfRecord'] = 'MachineObservation'

  # filter to respectable locations
  dwc_df['location_class'] = ds['location_class'].to_series()

  dwc_df.drop(dwc_df.loc[
      (dwc_df['location_class'] == 'A') |
      (dwc_df['location_class'] == 'B') |
      (dwc_df['location_class'] == 'Z')].index, inplace=True)

  # test using xarray
  # ds['time'].where((ds['location_class'] != 'A') &
  #     (ds['location_class'] != 'B') &
  #     (ds['location_class'] != 'Z'),drop=True).values

  print(f"  Extracted {len(dwc_df)} occurrences with valid locations.")

  # assign value to codes
  dwc_df.loc[dwc_df['location_class'] == 'nan','location_class'] = 0
  dwc_df.loc[dwc_df['location_class'] == 'G','location_class'] = 200
  dwc_df.loc[dwc_df['location_class'] == '3','location_class'] = 250
  dwc_df.loc[dwc_df['location_class'] == '2','location_class'] = 500
  dwc_df.loc[dwc_df['location_class'] == '1','location_class'] = 1500
  dwc_df.loc[dwc_df['location_class'] == '0','location_class'] = 10000

  # --- Define Occurrences: First detection per location per hour ---
  dwc_df['event_hour'] = pd.to_datetime(dwc_df['eventDate']).dt.strftime('%Y-%m-%dT%H')
  dwc_df.sort_values('event_hour', inplace=True)
  duplicate_counts = dwc_df.groupby(by='event_hour').transform('size')
  dwc_df['dataGeneralizations'] = f'first of ' + duplicate_counts.astype(str) + ' records.'
  dwc_df.loc[dwc_df['dataGeneralizations']=='first of 1 records.','dataGeneralizations'] = ''
  dwc_df = dwc_df.drop_duplicates(subset=['event_hour'], keep='first').copy()

  print(f"  Extracted {len(dwc_df)} occurrences to first row in hour.")

  # --- Rename a and drop few columns --
  dwc_df.rename(columns={'location_class': 'coordinateUncertaintyInMeters',
                          },
                inplace=True)

  dwc_df.drop(columns=['event_hour'], inplace=True)

  return dwc_df


def create_dwc_event(dwc_df):

  # --- Processing for Event ---
  event_df = dwc_df.loc[dwc_df['basisOfRecord']=='HumanObservation',
  ['eventID','eventDate','decimalLatitude','decimalLongitude','geodeticDatum',
    'minimumDepthInMeters','maximumDepthInMeters']]

  # # initialize Nominatim API - not trusted enough yet
  # # see https://nominatim.org/release-docs/develop/api/Reverse/
  # geolocator = Nominatim(user_agent="my_geopy_app")

  # lat = event_df['decimalLatitude'][0].astype(str)
  # lon = event_df['decimalLongitude'][0].astype(str)

  # location = geolocator.reverse(lat+","+lon)

  # event_df['countryCode'] = location.raw['address'].get('country_code').upper()

  event_df['countryCode'] = 'US'

  event_df['samplingProtocol'] = 'satellite telemetry'

  return event_df


def create_dwc_emof(ds, dwc_df):

  # --- Processing for emof ---
  vars = list(ds.keys())
  animal_vars = [x for x in vars if re.match(r'animal_(?!life_stage\b|sex\b).*',x)]
  new_rows = pd.DataFrame()

  for animal_var in animal_vars:
    row = pd.DataFrame({
        'measurementValue': ds[animal_var].values.tolist(),
        'measurementType': [f'{animal_var}: {ds[animal_var].long_name}'],
        'measurementMethod': ds[animal_var].attrs[animal_var],
        'measurementUnit': [ds[animal_var].units if 'units' in ds[animal_var].attrs else ''],
    })
    new_rows = pd.concat([new_rows,
                          row])

  emof_df = dwc_df.loc[dwc_df['basisOfRecord']=='HumanObservation',
                      ['organismID','occurrenceID','eventID']
                      ].merge(
                          new_rows,
                          left_index=True,
                          right_index=True)

  emof_df.dropna(axis=0, subset=['measurementValue'], inplace=True)

  return emof_df

In [5]:
# EML generation

# borrowed from https://gitlab.oceantrack.org/otn-partner-nodes/ipython-utilities/-/blob/main/dbtools/publish_to_obis.py?ref_type=heads

from jinja2 import Template
import codecs

def save_eml_file(eml_metadata:dict) -> str:
    """
    Save EML dictionary in a file
    Author: Jon Pye, Angela Dini
    Maintainer: Angela Dini
    :param eml_metadata: dictionary of EML metadata
    :return: filepath of where the EML filepath will be
    """
    # Write it out to the package
    template_file = codecs.open('templates/eml.xml.j2', 'r', 'UTF-8').read()
    template = Template(template_file)
    result_string = template.render(eml_metadata)
    eml_file = 'data/dwc/{ptt_id}/eml.xml'.format(**eml_metadata)
    fh = codecs.open(eml_file, 'wb+', 'UTF-8')
    fh.write(result_string)
    fh.close()
    eml_full_path = os.path.abspath(eml_file)
    print(f"EML metadata has been written to '{eml_full_path}'.")
    return eml_full_path

def create_eml(ds):
    eml_metadata = ds.attrs

    contributors = dict()
    for attr in [x for x in ds.attrs if re.match(r'contributor_(?!role_vocabulary\b).*',x)]:
        contributors[attr] = ds.attrs[attr].split(",")

    contributors_list = [
        {key: contributors[key][i] for key in contributors}
        for i in range(len(next(iter(contributors.values()))))
    ]

    other_meta = {
        'dataset_ipt_id': None,
        'dataset_short_name': ds.encoding.get('source').split("\\")[-1].replace(".nc",""),
        'data_manager_firstname': 'Mathew',
        'data_manager_lastname': 'Biddle',
        'data_manager_title': 'Physical Scientist',
        'data_manager_phone': '',
        'data_manager_email': 'mathew.biddle@noaa.gov',
        'contributors': contributors_list,
    }

    eml_metadata.update(other_meta)

    save_eml_file(eml_metadata)
    
    return eml_metadata

In [6]:
def convert_to_dwc_individual(file_paths, output_dir="data/dwc"):
    """
    Converts a list of NetCDF files to individual Darwin Core Occurrence CSVs.

    An "occurrence" is the first detection of an animal at a specific
    location within a given hour.

    Args:
        file_paths (list): A list of paths to the .nc files.
        output_dir (str): The directory to save the individual CSV files.
    """
    print(f"\n--- 2. Starting Darwin Core Conversion (Individual Files) ---")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")

    processed_count = 0

    for nc_file in file_paths:

        base_filename = os.path.basename(nc_file)

        if not os.path.exists(f"{output_dir}/{base_filename.split('_')[1]}"):
          os.makedirs(f"{output_dir}/{base_filename.split('_')[1]}")
          print(f"Created output directory: {output_dir}/{base_filename.split('_')[1]}")

        output_csv = os.path.join(output_dir, f"{base_filename.split('_')[1]}/{os.path.splitext(base_filename)[0]}_occurrence.csv")
        print(f"Processing {base_filename}...")

        try:
            with xr.open_dataset(nc_file, engine='netcdf4') as ds:
                df = ds.to_dataframe().reset_index()

                print(f"Found {len(df)} records.")

                # --- Data Cleaning and Preparation ---
                if 'lat' not in df.columns or 'lon' not in df.columns:
                    print(f"  Skipping {base_filename}: missing location data.")
                    continue

                df.dropna(subset=['lat', 'lon', 'time'], inplace=True)
                if df.empty:
                    print(f"  Skipping {base_filename}: no valid records.")
                    continue

                # --- Map to Darwin Core Occurrence Terms ---
                dwc_df = create_dwc_occurrence(ds)

                # only pick specific columns to save
                cols = ['occurrenceID', 'occurrenceStatus', 'basisOfRecord',
                        'organismID', 'eventDate', 'decimalLatitude',
                        'decimalLongitude', 'geodeticDatum',
                        'scientificName', 'scientificNameID', 'eventID',
                        'samplingProtocol', 'kingdom', 'taxonRank', 'lifeStage',
                        'sex', 'associatedReferences',
                        'coordinateUncertaintyInMeters', 'dataGeneralizations']

                # Save the individual CSV
                dwc_df.to_csv(output_csv, columns=cols, index=False)
                print(f"  Saved data to '{output_csv}'")

                # Create and save eml
                create_eml(ds)

                processed_count += 1

                # --- Event and eMoF ---
                # bail out if there are no HumanObservations
                if len(dwc_df.loc[dwc_df['basisOfRecord']=='HumanObservation']) == 0:
                  print(f'  no HumanObservation events found')
                  continue
                else:

                  print(f"  found {len(dwc_df.loc[dwc_df['basisOfRecord']=='HumanObservation'])} HumanObservations.")

                  event_df = create_dwc_event(dwc_df)

                  event_df.to_csv(output_csv.replace("occurrence","event"), index=False)
                  print(f"  Created {len(event_df)} events.")
                  print(f"  Saved data to {output_csv.replace('occurrence','event')}")

                  emof_df = create_dwc_emof(ds, dwc_df)
                  if len(emof_df) == 0:
                    print(f'  no emof data found')
                    continue
                  else:
                    emof_df.to_csv(output_csv.replace("occurrence","emof"), index=False)
                    print(f"  Created {len(emof_df)} emofs.")
                    print(f"  Saved data to {output_csv.replace('occurrence','emof')}")

        except Exception as e:
            print(f"  Could not process {base_filename}: {e}")

    print(f"\n--- 3. Conversion Complete ---")
    print(f"✅ Success! Processed {processed_count} files.")

Convert data to DarwinCore

In [7]:
import glob

# Step 1: Download all .nc files from the URL
local_files = glob.glob('data/src/*.nc')

# Step 2: Convert the downloaded files to individual Darwin Core CSVs
if local_files:
    convert_to_dwc_individual(local_files)
else:
    print("No files were downloaded, so conversion cannot proceed.")


--- 2. Starting Darwin Core Conversion (Individual Files) ---
Created output directory: data/dwc/131373
Processing atn_131373_ribbon-seal_trajectory_20140428-20141213.nc...
Found 1215 records.
  Extracted 389 occurrences with valid locations.
  Extracted 179 occurrences to first row in hour.
  Saved data to 'data/dwc\131373/atn_131373_ribbon-seal_trajectory_20140428-20141213_occurrence.csv'
EML metadata has been written to 'c:\Users\Mathew.Biddle\Documents\GitProjects\bio_data_guide\datasets\atn_satellite_telemetry\data\dwc\131373\eml.xml'.
  no HumanObservation events found
Created output directory: data/dwc/137487
Processing atn_137487_ribbon-seal_trajectory_20140412-20140413.nc...
Found 17 records.
  Extracted 15 occurrences with valid locations.
  Extracted 3 occurrences to first row in hour.
  Saved data to 'data/dwc\137487/atn_137487_ribbon-seal_trajectory_20140412-20140413_occurrence.csv'
EML metadata has been written to 'c:\Users\Mathew.Biddle\Documents\GitProjects\bio_data_gu

Testing

In [8]:
# import xarray as xr
# import pandas as pd


# ds = xr.open_dataset('data/src/atn_74626_bearded-seal_trajectory_20090625-20100128.nc', engine='netcdf4')
# #ds = xr.open_dataset('data/src/atn_174787_spotted-seal_trajectory_20180410-20180610.nc', engine='netcdf4')

# dwc_df = create_dwc_occurrence(ds)

# event_df = create_dwc_event(dwc_df)

# emof_df = create_dwc_emof(ds, dwc_df)

In [9]:
# convert_to_dwc_individual(['data/src/atn_137491_spotted-seal_trajectory_20180418-20180526.nc'])

In [10]:
save_eml_file(eml_metadata)

NameError: name 'eml_metadata' is not defined