In [87]:
# Assemble a new Darwin Core Archive using data from another archive
# This is based on a DwC-A exported from Symbiota
# The process only uses the occurrences.csv and images.csv files for filtering
# and creates a copy of the original meta.xml file.
# All other files are ignored.
# Intended for upload to BioSpex
# -----------------
# Extract a DwC Archive file and put the contents in a directory named dwc_source in the same path as this notebook
# Create a directory called dwc_out to store output
# To create the new DwC archive file, ZIP the contents of dwc_out (not the directory itself)

In [88]:
import os
from shutil import copyfile

import pandas as pd
# Notebook display settings: render at most 10 columns of a frame and never
# truncate the text inside a cell.
pd.set_option('display.max_columns', 10)
# None means "no width limit". The older spelling (-1) was deprecated in
# pandas 1.0 and newer pandas releases raise ValueError for negative widths.
pd.set_option('display.max_colwidth', None)

In [89]:
# Read the two tables of the source Darwin Core Archive: first the occurrences
# file, then the images file. low_memory=False makes pandas parse each file in
# one pass instead of in chunks.
df_occurrences, df_images = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ("dwc_source/occurrences.csv", "dwc_source/images.csv")
)

In [91]:
# Sanity check: display the (rows, columns) tuple of the occurrences table and
# make sure the imported record count matches what you expect
df_occurrences.shape

(158391, 84)

In [92]:
# Sanity check: display the (rows, columns) tuple of the images table as loaded
df_images.shape

(82233, 18)

In [93]:
# Choose which Symbiota occurrence records should be imported into BioSpex.
# The Symbiota DwC-A generated by DwC Publishing has no processingStatus column;
# to filter on it, load the backup DwC file instead and use:
#df_filtered_occurrences = df_occurrences[df_occurrences['processingStatus'] == 'pending review-nfn']

# Current selection: rows whose stateProvince is 'Texas' and whose recordedBy
# and scientificName values are both missing.
df_filtered_occurrences = df_occurrences.loc[
    (df_occurrences['stateProvince'] == 'Texas')
    & df_occurrences['recordedBy'].isna()
    & df_occurrences['scientificName'].isna()
]

In [94]:
# Sanity check: display (rows, columns) of the filtered occurrences and make
# sure the record count is what you expect
df_filtered_occurrences.shape

(27758, 84)

In [95]:
# Keep an image row only when its coreid appears among the ids of the
# filtered occurrence records
df_filtered_images = df_images.loc[
    df_images['coreid'].isin(df_filtered_occurrences['id'])
]

In [96]:
# Display (rows, columns) of the image rows linked to the filtered occurrences
df_filtered_images.shape

(27777, 18)

In [97]:
# Optional exclusions: read the catalog numbers that should be dropped from
# the filtered set
df_exclude = pd.read_csv(
    filepath_or_buffer="exclude_catalog_numbers.csv",
    low_memory=False,
)

In [98]:
# Drop every filtered occurrence whose catalogNumber is listed in the
# catalog_number column of the exclusion table (e.g. records already in BioSpex)
df_filtered_occurrences_use = df_filtered_occurrences.loc[
    lambda frame: ~frame['catalogNumber'].isin(df_exclude['catalog_number'])
]

In [99]:
# Display (rows, columns) of the occurrences that remain after the exclusions
df_filtered_occurrences_use.shape

(21347, 84)

In [100]:
# From the full images table, keep the rows whose coreid matches the id of an
# occurrence that survived the exclusion step
df_filtered_images_use = df_images.loc[
    df_images['coreid'].isin(df_filtered_occurrences_use['id'])
]

In [101]:
# Display (rows, columns) of the images that will be exported. The row count
# can exceed the occurrence count when one occurrence has several image rows.
df_filtered_images_use.shape

(21366, 18)

In [102]:
# Optional: uncomment to print the accessURI of every image row that shares its coreid with another image row
#print(df_filtered_images_use[df_filtered_images_use.duplicated(subset='coreid', keep=False)]['accessURI'])

In [103]:
# Write occurrences to destination directory.
# to_csv does not create missing directories, so make sure dwc_out exists
# first; exist_ok=True leaves an existing directory untouched.
os.makedirs('dwc_out', exist_ok=True)
# index=False keeps the pandas row index out of the exported file
df_filtered_occurrences_use.to_csv('dwc_out/occurrences.csv', index=False)

In [104]:
# Export the selected image rows to the output directory, leaving the pandas
# row index out of the file
df_filtered_images_use.to_csv(
    path_or_buf='dwc_out/images.csv',
    index=False,
)

In [105]:
# Reuse the source archive's meta.xml unchanged by copying it into the output
# directory; the cell displays the destination path returned by copyfile
copyfile(src='dwc_source/meta.xml', dst='dwc_out/meta.xml')

'dwc_out/meta.xml'