# Curation Script - Harvard College Obervatory Announcement Cards

**About**: This script contains the curation and deposit workflow for adding data from the Harvard College Observatory Announcement Cards to Harvard Dataverse. 

**Notes**: 
- Uses metadata.ipynb to build a metadata spreadsheet for each dataset and data file
- Be sure that `curation.py` is in the correct directory. 

#### 0.0 Set global variables and import libraries

In [1]:
#set globals
# set curation source path
g_module_path = '/Users/katherinemika/Desktop/curation/historic_datasets/hco'

# path to output file
g_dataverse_inventory_file = '/Users/katherinemika/Desktop/curation/historic_datasets/hco/dataset_metadata.csv'
# series names
g_series_names = []

# dataset inventories (keyed on series name)
g_series_inventories = {}

# dataset metadata (keyed on series name)
g_dataset_metadata = {}

# dataverse installation
g_dataverse_installation_url = 'https://demo.dataverse.org'

# dataverse API key
g_dataverse_api_key = '5187fd33-cef0-42b9-a3ea-95ac104afda9'

# dataverse collection name
g_dataverse_collection = 'hco_card_test'

# dataverse inventory dataframe
g_dataverse_inventory_df = None

# dataset author
g_dataset_author = 'Mika, Katherine'

# dataset author affiliation
g_dataset_author_affiliation = 'Harvard Library'

# dataset contact information
g_dataset_contact = 'Mika, Katherine'
g_dataset_contact_email = 'katherine_mika@harvard.edu'

# full path to location of datafiles (e.g., ../data/trade_statistics)
g_datafiles_path = '/Users/katherinemika/Desktop/curation/historic_datasets/hco/data_files'
# dataverse dataset information (keyed on series name)
g_dataverse_dataset_info = {}

# datafile metadata (dataframe of datafile metadata, keyed on series name)
g_datafile_metadata = {}

# datafile description template
g_datafile_description_template_txt = 'File contains OCR text with data from: '
g_datafile_description_template_csv = 'File contains csv table with data from: '
g_datafile_description_template_xml = 'File contains xml tree with OCR bounding box data from: '
g_datafile_description_template_jpg = 'File contains jpg image of: '


# dataset batches (array of batches of series to create/upload)
g_dataset_batches = []

In [2]:
#import libraries
import sys
if g_module_path not in sys.path:
    sys.path.append(g_module_path)

import curate
import requests
import numpy as np
import pandas as pd
import pprint as pprint
import rich
import yaml
import os

from easyDataverse import Dataverse

#### 0.1 Build local functions

In [3]:
 #add files to datasets 
def add_files_to_dataset(dataset, file_inventory, datafiles_path):
    for index, row in file_inventory.iterrows():
        dataset.add_file(
            local_path = datafiles_path + "/" + row['filename'],
            description = row['description'],
            #mimetype = row['mimetype'], -- no arg yet for mimetype
            categories = row['tags']
        )
    return dataset

#create config file to upload datafiles to datasets via DOI
def generate_config_from_dataframe(df, doi, output_dir):
    config = {
        'persistent_id': doi,
        'dataverse_url': 'https://demo.dataverse.org/',
        'api_token': 'd09f4047-417c-4768-879a-9ae5e98b3a91',
        'files': []
    }
    for _, row in df.iterrows():
        file_entry = {
            'filepath': g_datafiles_path + "/" + row['filename']
        }
        if pd.notnull(row['description']):
            file_entry['description'] = row['description']
        if pd.notnull(row['mimetype']):
            file_entry['mimetype'] = row['mimetype']
        
        tags = row['tags']
        if isinstance(tags, list) and len(tags) > 0:
            file_entry['categories'] = tags
        elif pd.notnull(tags):  # Handle case where 'tags' is a single string, not a list
            file_entry['categories'] = [tags]

        config['files'].append(file_entry)
        
        # Define the config filename using the DOI
    filename_safe_doi = doi.replace("/", "_")  # Make filename safe by replacing slashes
    config_filename = f'config_{filename_safe_doi}.yml'

    # Combine output directory with the filename
    config_filepath = os.path.join(output_dir, config_filename)

    # Save the config to a file
    with open(config_filepath, 'w') as file:
        yaml.dump(config, file, default_flow_style=False)
    
    print(f"Config file created: {config_filename}")
    return config_filename


## Curate Inventory
### 1.0. Prepare inventory data for curation
#### 1.1 Read `dataverse_inventory`

- Review example_spreadsheet.csv for proper table formatting
- Create `DataFrame` for later use
- Note: the `upload_datafiles()` function expects all files to be in a single, flat directory (i.e. not grouped by file type)

In [4]:
# read the dataverse inventory file
import chardet

with open(g_dataverse_inventory_file, 'rb') as f: 
    result = chardet.detect(f.read())
    encoding = result['encoding']
    print(f"detected encoding: {encoding}")
    
g_dataverse_inventory_df = pd.read_csv(g_dataverse_inventory_file,
                                       index_col=None,
                                       dtype={'card_date_year': str,'card_date_month': str, 'card_date_day': str},
                                       encoding=encoding, 
                                       low_memory=False)

detected encoding: utf-8


In [5]:
g_dataverse_inventory_df

Unnamed: 0,filename,file_type,card_number,card_date_year,card_date_month,card_date_day,contributor,observation,all_observations,series_name,url,subjects,topic_class,permalink
0,HCOAnnouncement0001_0001.innodata.xml,xml,1,1926,3,12,Harlow Shapley,,BLATHWAYT’S COMET,Announcement Card number: 1,http://www.cbat.eps.harvard.edu/IAUCs/HAC0001.jpg,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://www.cbat.eps.harvard.edu
1,HCOAnnouncement0001_0001.innodata.jpg,jpg,1,1926,3,12,Harlow Shapley,,BLATHWAYT’S COMET,Announcement Card number: 1,http://www.cbat.eps.harvard.edu/IAUCs/HAC0001.jpg,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://www.cbat.eps.harvard.edu
2,HCOAnnouncement0001_0001.innodata.txt,txt,1,1926,3,12,Harlow Shapley,,BLATHWAYT’S COMET,Announcement Card number: 1,http://www.cbat.eps.harvard.edu/IAUCs/HAC0001.jpg,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://www.cbat.eps.harvard.edu
3,HCOAnnouncement0001_0001_a.innodata.csv,csv,1,1926,3,12,Harlow Shapley,BLATHWAYT’S COMET,BLATHWAYT’S COMET,Announcement Card number: 1,http://www.cbat.eps.harvard.edu/IAUCs/HAC0001.jpg,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://www.cbat.eps.harvard.edu
4,HCOAnnouncement0001_0001_b.innodata.csv,csv,1,1926,3,12,Harlow Shapley,BLATHWAYT’S COMET,BLATHWAYT’S COMET,Announcement Card number: 1,http://www.cbat.eps.harvard.edu/IAUCs/HAC0001.jpg,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://www.cbat.eps.harvard.edu
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8430,HCOAnnouncement0039_0043_c.innodata.csv,csv,1675,1964,12,23,Richard Southworth For Owen Gingerich,COMET IKEYA (1964f),COMET IKEYA (1964f),Announcement Card number: 1675,,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://www.cbat.eps.harvard.edu
8431,HCOAnnouncement0039_0044.innodata.xml,xml,1676,1965,12,30,David Latham For Owen Gingerich,,COMET TOMITA-GERBER-HONDA (1964c),Announcement Card number: 1676,,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://www.cbat.eps.harvard.edu
8432,HCOAnnouncement0039_0044.innodata.jpg,jpg,1676,1965,12,30,David Latham For Owen Gingerich,,COMET TOMITA-GERBER-HONDA (1964c),Announcement Card number: 1676,,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://www.cbat.eps.harvard.edu
8433,HCOAnnouncement0039_0044.innodata.txt,txt,1676,1965,12,30,David Latham For Owen Gingerich,,COMET TOMITA-GERBER-HONDA (1964c),Announcement Card number: 1676,,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://www.cbat.eps.harvard.edu


In [6]:
g_dataverse_inventory_df = g_dataverse_inventory_df.dropna(subset=["card_number"]).reset_index(drop=True)
#g_dataverse_inventory_df = g_dataverse_inventory_df[g_dataverse_inventory_df['card_number'].str.strip() != ""]


In [7]:
g_dataverse_inventory_df.iloc[419:425]

Unnamed: 0,filename,file_type,card_number,card_date_year,card_date_month,card_date_day,contributor,observation,all_observations,series_name,url,subjects,topic_class,permalink
419,HCOAnnouncement0004_0012.innodata.txt,txt,91,1929,8,12,Harlow Shapley,,NEUJMIN'S COMET,Announcement Card number: 91,,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://www.cbat.eps.harvard.edu
420,HCOAnnouncement0004_0012_a.innodata.csv,csv,91,1929,8,12,Harlow Shapley,NEUJMIN'S COMET,NEUJMIN'S COMET,Announcement Card number: 91,,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://www.cbat.eps.harvard.edu
421,HCOAnnouncement0004_0014.innodata.xml,xml,92,1929,8,14,Harlow Shapley,,NEW OBJECT; NEUJMIN'S COMET,Announcement Card number: 92,,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://www.cbat.eps.harvard.edu
422,HCOAnnouncement0004_0014.innodata.jpg,jpg,92,1929,8,14,Harlow Shapley,,NEW OBJECT; NEUJMIN'S COMET,Announcement Card number: 92,,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://www.cbat.eps.harvard.edu
423,HCOAnnouncement0004_0014.innodata.txt,txt,92,1929,8,14,Harlow Shapley,,NEW OBJECT; NEUJMIN'S COMET,Announcement Card number: 92,,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://www.cbat.eps.harvard.edu
424,HCOAnnouncement0004_0014_a.innodata.csv,csv,92,1929,8,14,Harlow Shapley,NEW OBJECT,NEW OBJECT; NEUJMIN'S COMET,Announcement Card number: 92,,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://www.cbat.eps.harvard.edu


#### 1.2 Create dataset inventories
- Get the list of series names
- Crate  a `dict` of inventories keyed on series name

In [8]:
# get list of series in the full inventory
g_series_names = list(g_dataverse_inventory_df.series_name.unique())

# create series inventories
for name in g_series_names:
    # get series inventory
    g_series_inventories[name] = g_dataverse_inventory_df.loc[g_dataverse_inventory_df['series_name'] == name]

pprint.pprint(g_series_names)

['Announcement Card number: 1',
 'Announcement Card number: 2',
 'Announcement Card number: 3',
 'Announcement Card number: 4',
 'Announcement Card number: 5',
 'Announcement Card number: 6',
 'Announcement Card number: 7',
 'Announcement Card number: 8',
 'Announcement Card number: 9',
 'Announcement Card number: 10',
 'Announcement Card number: 11',
 'Announcement Card number: 12',
 'Announcement Card number: 13',
 'Announcement Card number: 14',
 'Announcement Card number: 15',
 'Announcement Card number: 16',
 'Announcement Card number: 17',
 'Announcement Card number: 18',
 'Announcement Card number: 19',
 'Announcement Card number: 20',
 'Announcement Card number: 21',
 'Announcement Card number: 22',
 'Announcement Card number: 23',
 'Announcement Card number: 24',
 'Announcement Card number: 25',
 'Announcement Card number: 26',
 'Announcement Card number: 27',
 'Announcement Card number: 28',
 'Announcement Card number: 29',
 'Announcement Card number: 30',
 'Announcement Card

#### 1.3 Create dataset metadata
- Create `dict` of dataset metadata extracted from each series 

In [9]:
# for each series name, create dataset metadata
for series_name in g_series_names:
    # get series inventory
    series_inventory = g_series_inventories[series_name]
    md = curate.create_dataset_metadata(g_dataset_author, g_dataset_author_affiliation, 
                                        g_dataset_contact, g_dataset_contact_email,
                                        series_name, series_inventory)
    g_dataset_metadata[series_name] = md

pprint.pprint(g_dataset_metadata)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



'keywordVocabularyURI': 'https://www.loc.gov/aba/cataloging/subject/'},
                                               {'keywordValue': ' Women '
                                                                'astronomers',
                                                'keywordVocabulary': 'LCSH',
                                                'keywordVocabularyURI': 'https://www.loc.gov/aba/cataloging/subject/'},
                                               {'keywordValue': ' ',
                                                'keywordVocabulary': 'LCSH',
                                                'keywordVocabularyURI': 'https://www.loc.gov/aba/cataloging/subject/'}],
                                  'license': 'CC0 1.0',
                                  'subject': ['Astronomy and Astrophysics'],
                                  'title': 'Announcement Card number: 79',
                                  'topic_classification': [{'topicClassValue': 'History '
             

#### 1.4 Create datafile metadata
- Create `dict` of `DataFrames` containing metadata about individual files

In [10]:
for series_name in g_series_names:
    # get dataset metadata for the series
    series_metadata = g_dataset_metadata[series_name]
    # get the series inventory
    series_inventory_df = g_series_inventories[series_name]
    print(series_inventory_df)
    # create datafile metadata
    g_datafile_metadata[series_name] = curate.create_datafile_metadata(series_inventory_df,
                                                                       g_datafile_description_template_csv,
                                                                       g_datafile_description_template_txt,
                                                                       g_datafile_description_template_xml, 
                                                                       g_datafile_description_template_jpg)

                                  filename file_type card_number  \
0    HCOAnnouncement0001_0001.innodata.xml       xml           1   
1    HCOAnnouncement0001_0001.innodata.jpg       jpg           1   
2    HCOAnnouncement0001_0001.innodata.txt       txt           1   
3  HCOAnnouncement0001_0001_a.innodata.csv       csv           1   
4  HCOAnnouncement0001_0001_b.innodata.csv       csv           1   

  card_date_year card_date_month card_date_day     contributor  \
0           1926               3            12  Harlow Shapley   
1           1926               3            12  Harlow Shapley   
2           1926               3            12  Harlow Shapley   
3           1926               3            12  Harlow Shapley   
4           1926               3            12  Harlow Shapley   

         observation   all_observations                  series_name  \
0                NaN  BLATHWAYT’S COMET  Announcement Card number: 1   
1                NaN  BLATHWAYT’S COMET  Announcem

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [11]:
g_datafile_metadata['Announcement Card number: 92']

Unnamed: 0,filename,file_type,description,tags
0,HCOAnnouncement0004_0014.innodata.xml,xml,File contains OCR text with data from: Announ...,[Data]
1,HCOAnnouncement0004_0014.innodata.jpg,jpg,File contains OCR text with data from: Announ...,[Data]
2,HCOAnnouncement0004_0014.innodata.txt,txt,File contains OCR text with data from: Announ...,[Data]
3,HCOAnnouncement0004_0014_a.innodata.csv,csv,File contains OCR text with data from: Announ...,"[Data, NEW OBJECT]"
4,HCOAnnouncement0004_0014_b.innodata.csv,csv,File contains OCR text with data from: Announ...,"[Data, NEUJMIN'S COMET]"


### 2.0 Initilize easyDataverse
- Connect to your server


In [12]:
dataverse = Dataverse(
    server_url = g_dataverse_installation_url,
    api_token = g_dataverse_api_key
)

Output()





In [16]:
#poking around
dataset = dataverse.create_dataset()

In [17]:
dataverse.list_metadatablocks(detailed=False)

geospatial
socialscience
astrophysics
biomedical
journal
customPSI
customMRA
customPSRI
customGSD
customDigaai
customCHIA
customARCS
custom_hbgdki
citation
custom3d
AdditionalMetadataAboutDataset
customNDR
customSAEF
computationalworkflow
customMORU
customCAFEDataSources
customCAFEDataLocation


In [18]:
dataset.astrophysics.info()

In [15]:
# Singular fields
dataset.astrophysics.astro_type = ["Observation"]

print(dataset)

metadatablocks:
  astrophysics:
    astroType:
      - Observation



In [16]:
#single fields - with validation - having trouble with astroType....
from pydantic import ValidationError

try:
    dataset.astrophysics.astro_object = ["BLATHWAYT'S COMET"]
except ValidationError as e:
    rich.print(e)

#Turns out I just failed to read the output in cell 13: astroType == astro_type. Leaving this validation cell here in case I need it in the future for other fields

### 3.0 Create datasets
- Create datasets from metadata inventory
- Add datafiles to datasets

#### 3.1. Create datasets

In [13]:
#for each series, create a dataset and save its information

for series_name in g_series_names:
    #initiate datset for each series
    series_dataset = dataverse.create_dataset()
    #get the series metadata
    series_metadata = g_dataset_metadata[series_name]
    #create the dataset
    g_dataverse_dataset_info[series_name] = curate.create_dataset(series_dataset, series_metadata)

In [14]:
rich.print(g_dataverse_dataset_info['Announcement Card number: 92'].dataverse_dict())

#### 3.2 Add files to datasets

In [15]:
for dataset_name, dataset in g_dataverse_dataset_info.items():
    file_inventory = g_datafile_metadata.get(dataset_name)
    add_files_to_dataset(dataset, file_inventory, g_datafiles_path)
    title = dataset.citation.title
    rich.print(f"Added {len(dataset.files)} files to {title}")

### 4.0 Deposit datasets to the repository
- Deposit datasets to create drafts
- Upload files to datasets with python DVUploader (using .yml config files??)
- Publish datasets

In [16]:
# Convert dictionary items to a list so we can slice easily.
items = list(g_dataverse_dataset_info.items())
test_set = items[0:9]
print(test_set)

[('Announcement Card number: 1', metadatablocks:
  astrophysics:
    astroFacility:
      - Harvard Bureau of Astronomical Telegrams
    astroObject:
      - "BLATHWAYT\u2019S COMET"
  citation:
    title: 'Announcement Card number: 1'
    subject:
      - Astronomy and Astrophysics
    distributionDate: 1926-3-12
    dataSources:
      - http://www.cbat.eps.harvard.edu
    author:
      - authorName: Mika, Katherine
        authorAffiliation: Harvard Library
    datasetContact:
      - datasetContactName: Mika, Katherine
        datasetContactEmail: katherine_mika@harvard.edu
    dsDescription:
      - dsDescriptionValue: "Announcement Card number: 1 is a series of tables and\
          \ text files associated with HCO Announcement Card number: 1. Compiled by:\
          \ Harlow Shapley. Objects observed include BLATHWAYT\u2019S COMET."
    keyword:
      - keywordValue: Astronomy--Observations
        keywordVocabulary: LCSH
        keywordVocabularyURI: !!python/object/new:pydantic

In [17]:
#test deposit
collection_pids = {}

for dataset_name, dataset in test_set:
    pid = dataset.upload(dataverse_name = g_dataverse_collection, n_parallel = 4)
    collection_pids[dataset_name] = pid

Dataset with pid 'doi:10.70122/FK2/OEYLUQ' created.




Output()









Output()





Dataset with pid 'doi:10.70122/FK2/0HMSY2' created.




Output()









Output()

ClientResponseError: 500, message='Internal Server Error', url=URL('https://demo.dataverse.org/api/datasets/:persistentId/replaceFiles?persistentId=doi:10.70122/FK2/0HMSY2')

In [41]:
#deposit datasets
collection_pids = {}
for dataset_name, dataset in g_dataverse_dataset_info.items():
    pid = dataset.upload(dataverse_name = g_dataverse_collection)
    collection_pids[dataset_name] = pid

HTTPStatusError: Client error '403 Forbidden' for url 'https://dataverse.harvard.edu/api/v1/dataverses/hco_card/datasets'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/403

In [15]:
collection_pids

{'Announcement Card number: 1': 'doi:10.70122/FK2/LM9SMC',
 'Announcement Card number: 2': 'doi:10.70122/FK2/FEOJWQ',
 'Announcement Card number: 3': 'doi:10.70122/FK2/CY1LIH',
 'Announcement Card number: 4': 'doi:10.70122/FK2/TNJS4T',
 'Announcement Card number: 5': 'doi:10.70122/FK2/YNWHIC',
 'Announcement Card number: 6': 'doi:10.70122/FK2/OCBZBO',
 'Announcement Card number: 7': 'doi:10.70122/FK2/0LEAHM',
 'Announcement Card number: 8': 'doi:10.70122/FK2/ULF2GK',
 'Announcement Card number: 9': 'doi:10.70122/FK2/DJRVNB',
 'Announcement Card number: 10': 'doi:10.70122/FK2/QUXXH5',
 'Announcement Card number: 11': 'doi:10.70122/FK2/4TCIDB',
 'Announcement Card number: 12': 'doi:10.70122/FK2/X02SQI',
 'Announcement Card number: 13': 'doi:10.70122/FK2/QQFGBX'}

In [39]:
#connect to pydataverse api 
from pyDataverse.api import NativeApi

g_api = NativeApi(g_dataverse_installation_url, g_dataverse_api_key)

# print results
print('{}'.format(g_api))

Native API: https://demo.dataverse.org/api/v1


In [43]:
#Upload files to deposited datasets (ignore 500 errors...I know, not great, JR is fixing it...)
for dataset, pid in collection_pids.items():
    try:
        curate.python_dvuploader(g_api, g_dataverse_installation_url, pid, g_datafiles_path, g_datafile_metadata[dataset]) 
    except Exception as e:
        if "500" in str(e):
            print(f"500 error encountered for {series_name}. Skipping to the next series.")
            continue
        else:
            raise e





Output()









Output()





Output()









Output()

500 error encountered for Announcement Card number: 13. Skipping to the next series.




Output()









Output()





Output()









Output()





Output()









Output()





Output()









Output()





Output()









Output()





Output()









Output()

500 error encountered for Announcement Card number: 13. Skipping to the next series.




Output()









Output()





Output()









Output()





Output()









Output()





Output()









Output()





Output()









Output()

In [35]:
#unlock locked datasets when needed
from pyDataverse.api import NativeApi

In [36]:
g_api = NativeApi(g_dataverse_installation_url, g_dataverse_api_key)

# print results
print('{}'.format(g_api))

Native API: https://demo.dataverse.org/api/v1


In [44]:
curate.unlock_datasets(g_api, g_dataverse_collection)

{'doi:10.70122/FK2/YNWHIC': {'status': True,
  'message': 'publish_dataset::Success - unlocked dataset: 200:doi:10.70122/FK2/YNWHIC'},
 'doi:10.70122/FK2/QQFGBX': {'status': True,
  'message': 'publish_dataset::Success - unlocked dataset: 200:doi:10.70122/FK2/QQFGBX'}}

In [42]:
import importlib
importlib.reload(curate)

<module 'curate' from '/Users/katherinemika/Desktop/curation/historic_datasets/hco/curate.py'>

## Testing section

### Metadata

In [28]:
# get the first series
first_series = g_series_names[0]
first_series_metadata = g_dataset_metadata[first_series]
first_series_inventory_df = g_series_inventories[first_series]

In [29]:
first_series_inventory_df

Unnamed: 0,filename,file_type,card_number,card_date_year,card_date_month,card_date_day,contributor,observation,all_observations,series_name,url,volume_title,published,subjects,topic_class,permalink
0,HCOAnnouncement0001_0001.innodata.xml,xml,1,1926,3,12,Harlow Shapley,,BLATHWAYT'S COMET,Announcement Card number: 1,http://tamkin2.eps.harvard.edu/IAUCs/HAC0001.jpg,Harvard College Observatory Announcement Cards,Harvard College Observatory,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://tamkin2.eps.harvard.edu/services/HACs.html
1,HCOAnnouncement0001_0001.innodata.jpg,jpg,1,1926,3,12,Harlow Shapley,,BLATHWAYT'S COMET,Announcement Card number: 1,http://tamkin2.eps.harvard.edu/IAUCs/HAC0001.jpg,Harvard College Observatory Announcement Cards,Harvard College Observatory,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://tamkin2.eps.harvard.edu/services/HACs.html
2,HCOAnnouncement0001_0001.innodata.txt,txt,1,1926,3,12,Harlow Shapley,,BLATHWAYT'S COMET,Announcement Card number: 1,http://tamkin2.eps.harvard.edu/IAUCs/HAC0001.jpg,Harvard College Observatory Announcement Cards,Harvard College Observatory,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://tamkin2.eps.harvard.edu/services/HACs.html
3,HCOAnnouncement0001_0001_a.innodata.csv,csv,1,1926,3,12,Harlow Shapley,BLATHWAYT‚ÄôS COMET,BLATHWAYT'S COMET,Announcement Card number: 1,http://tamkin2.eps.harvard.edu/IAUCs/HAC0001.jpg,Harvard College Observatory Announcement Cards,Harvard College Observatory,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://tamkin2.eps.harvard.edu/services/HACs.html
4,HCOAnnouncement0001_0001_b.innodata.csv,csv,1,1926,3,12,Harlow Shapley,BLATHWAYT‚ÄôS COMET,BLATHWAYT'S COMET,Announcement Card number: 1,http://tamkin2.eps.harvard.edu/IAUCs/HAC0001.jpg,Harvard College Observatory Announcement Cards,Harvard College Observatory,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://tamkin2.eps.harvard.edu/services/HACs.html


In [30]:
g_datafile_metadata[first_series] = curate.create_datafile_metadata(first_series_inventory_df,
                                                                    g_datafile_description_template_csv,
                                                                    g_datafile_description_template_txt,
                                                                    g_datafile_description_template_xml)

In [31]:
g_datafile_metadata[first_series]

Unnamed: 0,filename,file_type,description,mimetype,tags
0,HCOAnnouncement0001_0001.innodata.xml,xml,File contains xml tree with OCR bounding box d...,application/xml,[Data]
1,HCOAnnouncement0001_0001.innodata.jpg,jpg,File contains OCR text with data from: Announ...,image/jpeg,[Data]
2,HCOAnnouncement0001_0001.innodata.txt,txt,File contains OCR text with data from: Announ...,text/plain,[Data]
3,HCOAnnouncement0001_0001_a.innodata.csv,csv,File contains OCR text with data from: Announ...,text/csv,"[Data, BLATHWAYT‚ÄôS COMET]"
4,HCOAnnouncement0001_0001_b.innodata.csv,csv,File contains OCR text with data from: Announ...,text/csv,"[Data, BLATHWAYT‚ÄôS COMET]"


### Create datasets 

In [32]:
dataset = dataverse.create_dataset()
print(dataset)

metadatablocks: {}



In [18]:
dataset.astrophysics.info()

In [19]:
dataset.astrophysics.astro_object = ['blathwayts comet']

In [20]:
dataset.citation.info()

In [41]:
#create dataset function
def create_dataset(api, dataset_metadata):
    """
    Create a dataverse dataset using easyDataverse.
    Note that metadata fields are hardcoded to reflect dataset's requirements. 

    Parameters
    ----------
    api : easyDataverse initialized dataverse 
    dataset_metadata : dict
        Dictionary of dataset metadata values

    Return
    ------
    dict: 
        {status: bool, dataset_id: int, dataset_pid: str}

    """
    # validate parameters
    if ((not dataverse) or
        (not dataset_metadata)):
        return {
            'status':False, 
            'dataset_id':-1, 
            'dataset_pid':''
        }

    # create the easyDataverse dataset model
    ds = dataverse.create_dataset()
    # populate the dataset model with metadata values
    ds.citation.title = dataset_metadata.get('title')

    for authors in dataset_metadata.get('author'):
        ds.citation.add_author(name = authors['authorName'],
                              affiliation = authors['authorAffiliation'])

    for desc in dataset_metadata.get('description'):
        ds.citation.add_ds_description(value=desc['dsDescriptionValue'])
    
    for contact in dataset_metadata.get('contact'):
        ds.citation.add_dataset_contact(name = contact['datasetContactName'],
                                        email = contact['datasetContactEmail'])

    ds.citation.subject = dataset_metadata.get('subject')

    for keyword in dataset_metadata.get('keywords'):
        ds.citation.add_keyword(value = keyword['keywordValue'],
                                vocabulary = keyword['keywordVocabulary'],
                                vocabulary_uri = keyword['keywordVocabularyURI'])

    for topic in dataset_metadata.get('topic_classification'):
            ds.citation.add_topic_classification(value = topic['topicClassValue'])

    ds.citation.data_sources = dataset_metadata.get('data_source')
    ds.citation.distribution_date = dataset_metadata.get('creation_date')
    ds.astrophysics.astro_object = dataset_metadata.get('astroObject')
    ds.astrophysics.astro_facility = dataset_metadata.get('astroFacility')


    #dict = rich.print(ds.dataverse_dict())
    return ds


In [33]:
dataverse = Dataverse(
    server_url = g_dataverse_installation_url,
    api_token = g_dataverse_api_key
)

Output()





In [66]:
dataset = create_dataset(dataverse, first_series_metadata)

In [67]:
rich.print(dataset.dataverse_dict())

In [68]:
dataset.add_file(
    local_path = g_datafiles_path + "/HCOAnnouncement0001_0001.innodata.txt", # Path to the file on your system
    description = g_datafile_description_template_txt + "Announcement Card number: 1",
    categories = ["Data"]
)

In [69]:
dataset.add_file(
    local_path = g_datafiles_path + "/HCOAnnouncement0001_0001.innodata.xml", # Path to the file on your system
    description = g_datafile_description_template_xml + "Announcement Card number: 1",
    categories = ["Data"]
)

In [70]:
dataset.add_file(
    local_path = g_datafiles_path + "/HCOAnnouncement0001_0001_a.innodata.csv", # Path to the file on your system
    description = g_datafile_description_template_txt + "Announcement Card number: 1",
    categories = ["Data", "Blathwayt's Comet"]
)

In [71]:
dataset.add_file(
    local_path = g_datafiles_path + "/HCOAnnouncement0001_0001_b.innodata.csv", # Path to the file on your system
    description = g_datafile_description_template_txt + "Announcement Card number: 1",
    categories = ["Data", "Blathwayt's Comet"]
)

In [72]:
dataset.add_file(
    local_path = g_datafiles_path + "/HCOAnnouncement0001_0001.innodata.jpg", # Path to the file on your system
    description = g_datafile_description_template_jpg + "Announcement Card number: 1",
    categories = ["Data"]
)

In [73]:
#upload everything
pid = dataset.upload(dataverse_name= g_dataverse_collection, n_parallel=4)

Dataset with pid 'doi:10.70122/FK2/XCMDK2' created.




Output()









Output()





In [None]:
#Testing config file dvuploader strategy (does it correctly assign mimetypes??)
#create config files from metadata dataframe
datasets = {}
for dataset, pid in collection_pids.items(): 
    config = generate_config_from_dataframe(g_datafile_metadata[dataset], pid, g_datafiles_path)
    datasets[pid] = config

In [None]:
#upload files in easyDataverse
#upload datasets and deal with 500error (Jan is fixing it - in the meantime this "works")
collection_pids = []
for dataset_name, dataset in g_dataverse_dataset_info.items():
    try:
        pid = dataset.upload(dataverse_name = g_dataverse_collection, n_parallel=4)
        collection_pids.append(pid)
    except Exception as e:
        if "500" in str(e):
            print(f"500 error encountered for {dataset_name}. Skipping to the next dataset.")
            continue
        else:
            raise e

In [None]:
dataset.add_directory(
    dirpath= g_datafiles_path + "/data_files/test",
    ignores=[
        "^\..*",         # Ignore hidden files and dirs
    ]
)

In [163]:
import importlib
importlib.reload(curate)

<module 'curate' from '/Users/katherinemika/Desktop/curation/historic_datasets/hco/curate.py'>

In [24]:
#single fields - with validation - having trouble with astroType....
from pydantic import ValidationError

try:
    dataset.astrophysics.astroType = "Observation"
except ValidationError as e:
    rich.print(e)

In [30]:
#delete all datasets in collection
headers = {
    'X-Dataverse-key': g_dataverse_api_key
}

for pid in collection_pids:
    url = '%s/api/datasets/:persistentId/destroy/?persistentId=%s' % (g_dataverse_installation_url, pid)

    r = requests.delete(url=url, headers=headers)
    print(r.json())

{'status': 'OK', 'data': {'message': 'Dataset :persistentId destroyed'}}
{'status': 'OK', 'data': {'message': 'Dataset :persistentId destroyed'}}
{'status': 'OK', 'data': {'message': 'Dataset :persistentId destroyed'}}
{'status': 'OK', 'data': {'message': 'Dataset :persistentId destroyed'}}
{'status': 'OK', 'data': {'message': 'Dataset :persistentId destroyed'}}
{'status': 'OK', 'data': {'message': 'Dataset :persistentId destroyed'}}
{'status': 'OK', 'data': {'message': 'Dataset :persistentId destroyed'}}
{'status': 'OK', 'data': {'message': 'Dataset :persistentId destroyed'}}
{'status': 'OK', 'data': {'message': 'Dataset :persistentId destroyed'}}
{'status': 'OK', 'data': {'message': 'Dataset :persistentId destroyed'}}
{'status': 'OK', 'data': {'message': 'Dataset :persistentId destroyed'}}
{'status': 'OK', 'data': {'message': 'Dataset :persistentId destroyed'}}
{'status': 'OK', 'data': {'message': 'Dataset :persistentId destroyed'}}
