In [None]:
# Assemble a new Darwin Core Archive using data from another archive
# This is based on a DwC-A exported from Symbiota
# The process only uses the occurrences.csv and images.csv files for filtering
# and creates a copy of the original meta.xml file.
# All other files are ignored.
# Intended for upload to BioSpex
# -----------------
# Extract a DwC Archive file and put the contents in a directory named dwc_source in same path as this notebook
# Create a directory called dwc_out to store output
# To create the new DwC archive file, ZIP the contents of dwc_out (not the directory itself)

In [None]:
from shutil import copyfile
import pandas as pd
# Notebook display options: show at most 10 columns and do not truncate cell text.
pd.set_option('display.max_columns', 10)
# None means "no limit". The old -1 sentinel has been deprecated since pandas 1.0
# in favor of None, so None is used here.
pd.set_option('display.max_colwidth', None)

In [None]:
# Read the two tables of the source Darwin Core Archive that this notebook filters.
# low_memory=False is passed to both reads, exactly as for any other column-typed CSV load here.
occurrences_csv = "dwc_source/occurrences.csv"
images_csv = "dwc_source/images.csv"

# Occurrence (core) records
df_occurrences = pd.read_csv(occurrences_csv, low_memory=False)
# Image (extension) records
df_images = pd.read_csv(images_csv, low_memory=False)

In [None]:
# Make sure imported records match what you expect.
# Displays the (rows, columns) tuple of the occurrences table loaded from dwc_source/occurrences.csv.
df_occurrences.shape

In [None]:
# Displays the (rows, columns) tuple of the images table loaded from dwc_source/images.csv.
df_images.shape

In [None]:
# Filter the occurrence records from Symbiota to include the records you want to import into BioSpex.
# processingStatus isn't in the Symbiota DwC-A that is generated using DwC Publishing; use a backup DwC file instead.
# Use one or more filters to determine what will be included/excluded from the output DwC-A.
# Each filter below narrows the result of the previous one, so the filters combine (logical AND).
# Comment out any filter you do not want to apply.

# Filter 1: keep records that match a particular Symbiota processingStatus.
df_filtered_occurrences = df_occurrences[df_occurrences['processingStatus'] == 'pending review-nfn']

# Filter 2: keep records that have particular DwC fields unpopulated.
# The mask is built from df_filtered_occurrences (not df_occurrences) so that this
# filter refines filter 1 instead of silently replacing its result.
missing_key_fields = (
    df_filtered_occurrences['stateProvince'].isnull()
    & df_filtered_occurrences['recordedBy'].isnull()
    & df_filtered_occurrences['scientificName'].isnull()
)
df_filtered_occurrences = df_filtered_occurrences[missing_key_fields]

# If not filtering, disable the filters above and just assign to the new DF:
#df_filtered_occurrences = df_occurrences

In [None]:
# Check to make sure the record count is what you expect.
# Displays the (rows, columns) tuple of the filtered occurrences.
df_filtered_occurrences.shape

In [None]:
# Keep only the image rows whose coreid matches the id of a filtered occurrence record.
has_filtered_occurrence = df_images['coreid'].isin(df_filtered_occurrences['id'])
df_filtered_images = df_images[has_filtered_occurrence]

In [None]:
# Displays the (rows, columns) tuple of the images that belong to the filtered occurrences.
df_filtered_images.shape

In [None]:
# If you want to exclude any records from the filtered set based on catalog numbers, first load the catalog numbers here
#df_exclude = pd.read_csv("exclude_catalog_numbers.csv", low_memory=False)

In [None]:
# Exclude records (e.g. those already in BioSpex or in a separate transcription workflow)
#df_filtered_occurrences_use = df_filtered_occurrences[~df_filtered_occurrences['catalogNumber'].isin(df_exclude['catalog_number'])]

In [None]:
# Specify catalog numbers to include.
# The include filter later in this notebook reads df_include['catalog_number'], so the
# CSV must provide a catalog_number column. This load has to run before that filter;
# leaving it commented out makes the notebook fail with a NameError on df_include.
df_include = pd.read_csv("example_include_catnums.csv", low_memory=False)

In [None]:
# Restrict the filtered occurrences to those whose catalogNumber appears in the
# catalog_number column of the include list.
is_included = df_filtered_occurrences['catalogNumber'].isin(df_include['catalog_number'])
df_filtered_occurrences_use = df_filtered_occurrences[is_included]

In [None]:
# Displays the (rows, columns) tuple of the occurrence records that will be written out.
df_filtered_occurrences_use.shape

In [None]:
# Pick the image rows whose coreid matches the id of an occurrence record selected for output.
belongs_to_selected_occurrence = df_images['coreid'].isin(df_filtered_occurrences_use['id'])
df_filtered_images_use = df_images[belongs_to_selected_occurrence]

In [None]:
# Displays the (rows, columns) tuple of the image records that will be written out.
df_filtered_images_use.shape

In [None]:
# Check for duplicates if you wish
#print(df_filtered_images_use[df_filtered_images_use.duplicated(subset='coreid', keep=False)]['accessURI'])

In [None]:
# Save the selected occurrence records into the output archive directory.
# index=False keeps the DataFrame index out of the CSV.
df_filtered_occurrences_use.to_csv(
    path_or_buf='dwc_out/occurrences.csv',
    index=False,
)

In [None]:
# Save the selected image records into the output archive directory.
# index=False keeps the DataFrame index out of the CSV.
df_filtered_images_use.to_csv(
    path_or_buf='dwc_out/images.csv',
    index=False,
)

In [None]:
# Carry the archive descriptor over unchanged from the source archive to the output directory.
copyfile(src='dwc_source/meta.xml', dst='dwc_out/meta.xml')