# Curation Script - Chinese Maritime Trade Data


## About
- This script contains the basic curation workflow for creating datasets and uploading files from a metadata spreadsheet of data files.   
- **Be SURE `curate.py` and `directupload.py` are set in correct directories**
<p>
- **Created:** 2023/10/12
- **Updated:** 

## Globals
- Global variables for this script. 
- Set variable names (e.g., `g_api_key` as needed)

In [1]:
# set curation source path
g_module_path = '/Users/katherinemika/Desktop/curation/historic_datasets/hco'

# path to output file
g_dataverse_inventory_file = '/Users/katherinemika/Desktop/curation/historic_datasets/hco/hco_batch_metadata.csv'
# series names
g_series_names = []

# dataset inventories (keyed on series name)
g_series_inventories = {}

# dataset metadata (keyed on series name)
g_dataset_metadata = {}

# dataverse installation
g_dataverse_installation_url = 'https://demo.dataverse.org'

# dataverse API key
g_dataverse_api_key = 'e487101b-6f0e-47dc-8109-7a72a9a9b0ed'

# dataverse collection name
g_dataverse_collection = 'hco_card'

# dataverse inventory dataframe
g_dataverse_inventory_df = None

# dataset author
g_dataset_author = 'Mika, Katherine'

# dataset author affiliation
g_dataset_author_affiliation = 'Harvard Library'

# dataset contact information
g_dataset_contact = 'Mika, Katherine'
g_dataset_contact_email = 'katherine_mika@harvard.edu'

# full path to location of datafiles (e.g., ../data/trade_statistics)
g_datafiles_path = '/Users/katherinemika/Desktop/curation/historic_datasets/hco/data_files'
# demo dataverse dataset information (keyed on series name)
g_dataverse_dataset_info = {}

# datafile metadata (dataframe of datafile metadata, keyed on series name)
g_datafile_metadata = {}

# datafile description template
g_datafile_description_template_txt = 'File contains OCR text with data from Announcement Card number:  '
g_datafile_description_template_csv = 'File contains csv table with data from Announcement Card number:  '
g_datafile_description_template_xml = 'File contains xml tree with OCR bounding box data from Annoucement Card: '


# dataset batches (array of batches of series to create/upload)
g_dataset_batches = []

## Modules

- Add local modules path to Jupyter system path
- Load all modules including local modules such as `curate`

In [2]:
import sys
if g_module_path not in sys.path:
    sys.path.append(g_module_path)

import curate
import requests
import numpy as np
import pandas as pd
import pprint as pprint
from pyDataverse.api import NativeApi

## Local Functions

In [3]:
# get a dictionary of dataset pids keyed on series name. Collected from output of "create dataset" function
def get_dataset_pids(batch, dataset_info):
    pids = {}
    for series_name in batch:
        pids[series_name] = dataset_info[series_name].get('dataset_pid')
    return pids

def get_dataset_pids_from_hdv(api, dataverse_collection):
    ds_tree = api.get_children(dataverse_collection, children_types=["datasets"])
    key = api.api_token
    base_url = api.base_url
    headers = {'X-Dataverse-key': key}
    
    formatted_ds_tree = {}
    
    for t in ds_tree:
        pid = t['pid']
        request_url = '{}/api/datasets/:persistentId/?persistentId={}'.format(base_url, pid)
        response = requests.get(request_url, headers=headers)
        if response.status_code == 200:
            r = response.json()
            title = r['data']['latestVersion']['metadataBlocks']['citation']['fields'][0]['value']
            dataset_id = r['data']['id']
            formatted_ds_tree[title] = {
                'dataset_id': dataset_id,
                'dataset_pid': pid,
                'status': True
            }
        else:
            print(f"Failed to retrieve data for PID: {pid}")
    
    return formatted_ds_tree

# get dictionary of datafile inventories keyed on series name
def get_datafile_inventories(batch, datafile_metadata):
    inventories = {}
    for series_name in batch:
        inventories[series_name] = datafile_metadata[series_name]
    return inventories 

# upload the datafiles associated with a batch
def upload_dataset_batch(api, dataverse_url, batch_list, batch_pids, batch_datafile_metadata, data_directory):
    # upload the datafiles associated with each series in the batch
    results = {}
    for series_name in batch_list:
        pid = batch_pids[series_name]
        datafiles_metadata = batch_datafile_metadata[series_name]
        results[series_name] = curate.direct_upload_datafiles(api, dataverse_url, pid, path, datafiles_metadata)
    return results

# upload datafiles for a batch of datasets
def upload_files_batch(api, dataverse_url, batch_list, batch_pids, data_directory, batch_datafile_metadata):
    results = {}
    for series_name in batch_list:
        pid = batch_pids[series_name]
        batch_datafile_metadata = g_datafile_metadata[series_name]
        results[series_name] = curate.python_dvuploader(api, dataverse_url, pid, data_directory, batch_datafile_metadata)
    return results

# upload datafiles for a batch of datasets - but ignoring 500 errors that the python_dvuploader keeps issuing...
def upload_files_batch_errors(api, dataverse_url, batch_list, batch_pids, data_directory, g_datafile_metadata):
    results = {}
    for series_name in batch_list:
        try:
            pid = batch_pids[series_name]
            batch_datafile_metadata = g_datafile_metadata[series_name]
            results[series_name] = curate.python_dvuploader(api, dataverse_url, pid, data_directory, batch_datafile_metadata)
        except Exception as e:
            if "500" in str(e):
                print(f"500 error encountered for {series_name}. Skipping to the next series.")
                continue
            else:
                # Handle other exceptions if necessary
                raise e
    return results

## Curate Inventory

### 1. Prepare inventory data for curation

#### 1.1 Read `dataverse_inventory`
- **Add instructions about spreadsheet format** Eg. Column names, table org, etc.
- Create a `DataFrame` for later use
- Note: Also, the `curate:direct_upload_datafiles` function expects all files to be in a single directory (not grouped by file type)

In [4]:
# read the dataverse inventory file
import chardet

with open(g_dataverse_inventory_file, 'rb') as f: 
    result = chardet.detect(f.read())
    encoding = result['encoding']
    print(f"detected encoding: {encoding}")
    
g_dataverse_inventory_df = pd.read_csv(g_dataverse_inventory_file,index_col=None, encoding=encoding, low_memory=False)

detected encoding: UTF-8-SIG


In [5]:
g_dataverse_inventory_df

Unnamed: 0,filename,file_type,card_number,card_date_year,card_date_month,card_date_day,contributor,observation,all_observations,series_name,url,volume_title,published,subjects,topic_class,permalink
0,HCOAnnouncement0001_0001.innodata.xml,xml,1,1926,3,12,Harlow Shapley,,BLATHWAYT'S COMET,Announcement Card number: 1,http://tamkin2.eps.harvard.edu/IAUCs/HAC0001.jpg,Harvard College Observatory Announcement Cards,Harvard College Observatory,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://tamkin2.eps.harvard.edu/services/HACs.html
1,HCOAnnouncement0001_0001.innodata.jpg,jpg,1,1926,3,12,Harlow Shapley,,BLATHWAYT'S COMET,Announcement Card number: 1,http://tamkin2.eps.harvard.edu/IAUCs/HAC0001.jpg,Harvard College Observatory Announcement Cards,Harvard College Observatory,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://tamkin2.eps.harvard.edu/services/HACs.html
2,HCOAnnouncement0001_0001.innodata.txt,txt,1,1926,3,12,Harlow Shapley,,BLATHWAYT'S COMET,Announcement Card number: 1,http://tamkin2.eps.harvard.edu/IAUCs/HAC0001.jpg,Harvard College Observatory Announcement Cards,Harvard College Observatory,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://tamkin2.eps.harvard.edu/services/HACs.html
3,HCOAnnouncement0001_0001_a.innodata.csv,csv,1,1926,3,12,Harlow Shapley,BLATHWAYT‚ÄôS COMET,BLATHWAYT'S COMET,Announcement Card number: 1,http://tamkin2.eps.harvard.edu/IAUCs/HAC0001.jpg,Harvard College Observatory Announcement Cards,Harvard College Observatory,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://tamkin2.eps.harvard.edu/services/HACs.html
4,HCOAnnouncement0001_0001_b.innodata.csv,csv,1,1926,3,12,Harlow Shapley,BLATHWAYT‚ÄôS COMET,BLATHWAYT'S COMET,Announcement Card number: 1,http://tamkin2.eps.harvard.edu/IAUCs/HAC0001.jpg,Harvard College Observatory Announcement Cards,Harvard College Observatory,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://tamkin2.eps.harvard.edu/services/HACs.html
5,HCOAnnouncement0001_0002.innodata.xml,xml,2,1926,3,18,Harlow Shapley,,ENSOR'S COMET; BLATHWAYT'S COMET,Announcement Card number: 2,http://tamkin2.eps.harvard.edu/IAUCs/HAC0002.jpg,Harvard College Observatory Announcement Cards,Harvard College Observatory,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://tamkin2.eps.harvard.edu/services/HACs.html
6,HCOAnnouncement0001_0002.innodata.jpg,jpg,2,1926,3,18,Harlow Shapley,,ENSOR'S COMET; BLATHWAYT'S COMET,Announcement Card number: 2,http://tamkin2.eps.harvard.edu/IAUCs/HAC0002.jpg,Harvard College Observatory Announcement Cards,Harvard College Observatory,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://tamkin2.eps.harvard.edu/services/HACs.html
7,HCOAnnouncement0001_0002.innodata.txt,txt,2,1926,3,18,Harlow Shapley,,ENSOR'S COMET; BLATHWAYT'S COMET,Announcement Card number: 2,http://tamkin2.eps.harvard.edu/IAUCs/HAC0002.jpg,Harvard College Observatory Announcement Cards,Harvard College Observatory,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://tamkin2.eps.harvard.edu/services/HACs.html
8,HCOAnnouncement0001_0002_a.innodata.csv,csv,2,1926,3,18,Harlow Shapley,ENSOR'S COMET,ENSOR'S COMET; BLATHWAYT'S COMET,Announcement Card number: 2,http://tamkin2.eps.harvard.edu/IAUCs/HAC0002.jpg,Harvard College Observatory Announcement Cards,Harvard College Observatory,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://tamkin2.eps.harvard.edu/services/HACs.html
9,HCOAnnouncement0001_0002_b.innodata.csv,csv,2,1926,3,18,Harlow Shapley,ENSOR'S COMET,ENSOR'S COMET; BLATHWAYT'S COMET,Announcement Card number: 2,http://tamkin2.eps.harvard.edu/IAUCs/HAC0002.jpg,Harvard College Observatory Announcement Cards,Harvard College Observatory,Astronomy--Observations; Astronomy--Research; ...,History of Science; History of Astronomy,http://tamkin2.eps.harvard.edu/services/HACs.html


#### 1.2 Create Dataset Inventories
- Get the list of series names
- Create a `dict` of file inventories keyed on series name

In [6]:
# get list of series in the full inventory
g_series_names = list(g_dataverse_inventory_df.series_name.unique())

# create series inventories
for name in g_series_names:
    # get series inventory
    g_series_inventories[name] = g_dataverse_inventory_df.loc[g_dataverse_inventory_df['series_name'] == name]

pprint.pprint(g_series_names)

['Announcement Card number: 1',
 'Announcement Card number: 2',
 'Announcement Card number: 3',
 'Announcement Card number: 4',
 'Announcement Card number: 5',
 'Announcement Card number: 6',
 'Announcement Card number: 7',
 'Announcement Card number: 8',
 'Announcement Card number: 9',
 'Announcement Card number: 10',
 'Announcement Card number: 11',
 'Announcement Card number: 12',
 'Announcement Card number: 13']


#### 1.3 Create Dataset Metadata
- Create a `dict` of dataset metadata extracted from each inventory

In [7]:
# for each series name, create dataset metadata
for series_name in g_series_names:
    # get series inventory
    series_inventory = g_series_inventories[series_name]
    md = curate.create_dataset_metadata(g_dataset_author, g_dataset_author_affiliation, 
                                        g_dataset_contact, g_dataset_contact_email,
                                        series_name, series_inventory)
    g_dataset_metadata[series_name] = md

pprint.pprint(g_dataset_metadata)

{'Announcement Card number: 1': {'astroFacility': ['Harvard Bureau of '
                                                   'Astronomical Telegrams'],
                                 'astroObject': [{'objectValue': "BLATHWAYT'S "
                                                                 'COMET'}],
                                 'author': [{'authorAffiliation': 'Harvard '
                                                                  'Library',
                                             'authorName': 'Mika, Katherine'}],
                                 'contact': [{'datasetContactAffiliation': 'Harvard '
                                                                           'Library',
                                              'datasetContactEmail': 'katherine_mika@harvard.edu',
                                              'datasetContactName': 'Mika, '
                                                                    'Katherine'}],
                             

### 1.4 Create Datafile Metadata
- Create a `dict` of `DataFrames` containing metadata about individual files

In [8]:
for series_name in g_series_names:
    # get dataset metadata for the series
    series_metadata = g_dataset_metadata[series_name]
    # get the series inventory
    series_inventory_df = g_series_inventories[series_name]
    # create datafile metadata
    g_datafile_metadata[series_name] = curate.create_datafile_metadata(series_inventory_df,
                                                                       g_datafile_description_template_csv,
                                                                       g_datafile_description_template_txt,
                                                                       g_datafile_description_template_xml)

In [9]:
g_datafile_metadata['Announcement Card number: 3']

Unnamed: 0,filename,file_type,description,mimetype,tags
0,HCOAnnouncement0001_0003.innodata.xml,xml,File contains xml tree with OCR bounding box d...,application/xml,"[""Data""]"
1,HCOAnnouncement0001_0003.innodata.jpg,jpg,File contains OCR text with data from Announce...,UNKNOWN,"[""Data""]"
2,HCOAnnouncement0001_0003.innodata.txt,txt,File contains OCR text with data from Announce...,text/plain,"[""Data""]"


### 1.4 Create Series Batches
- Create a set of (approximately) equal length batches of series (to create dataset and upload datafiles)
- Generally, there are too many series in a volume to create the related datasets and then upload all their datafiles in a single tight loop. Therefore, it's useful to create batches of these series and perform the create/upload operation on a single batch at a time.

In [10]:
# max number of series in a batch
batch_size = 3
g_batches = np.array_split(g_series_names, len(g_series_names)/batch_size)
pprint.pprint(g_batches)

[array(['Announcement Card number: 1', 'Announcement Card number: 2',
       'Announcement Card number: 3', 'Announcement Card number: 4'],
      dtype='<U28'),
 array(['Announcement Card number: 5', 'Announcement Card number: 6',
       'Announcement Card number: 7'], dtype='<U28'),
 array(['Announcement Card number: 8', 'Announcement Card number: 9',
       'Announcement Card number: 10'], dtype='<U28'),
 array(['Announcement Card number: 11', 'Announcement Card number: 12',
       'Announcement Card number: 13'], dtype='<U28')]


### 2. Initialize `pyDataverse` API
- Use `pyDataverse` to initialize the API to the dataverse installation

In [11]:
# set pyDataverse API adapter
g_api = NativeApi(g_dataverse_installation_url, g_dataverse_api_key)

# print results
print('{}'.format(g_api))

Native API: https://demo.dataverse.org/api/v1


### 3. Create Datasets and Upload Datafiles

#### 3.1 Create all datasets
- For each series name, create a dataset and retain status information

In [None]:
# for each series, create a dataset and save its information
for series_name in g_series_names:
    # get the series metadata
    series_metadata = g_dataset_metadata[series_name]
    # create the dataset
    g_dataverse_dataset_info[series_name] = curate.create_dataset(g_api, g_dataverse_collection, series_metadata)

pprint.pprint(g_dataverse_dataset_info)

In [None]:
for series_name in g_series_names:
    # get dataset metadata for the series
    series_metadata = g_dataset_metadata[series_name]


#### 3.1.1 Get dataset PIDs from HDV collection

For use if datasets have been previously created

In [None]:
#only necessary if you made changes to the curate module before this section
import importlib
importlib.reload(curate)

In [None]:
g_dataverse_dataset_info = get_dataset_pids_from_hdv(g_api, g_dataverse_collection)

In [None]:
pprint.pprint(g_dataverse_dataset_info)

#### 3.2 Upload dataset datafiles, one batch at a time
- Upload the datafiles associated with each dataset in a batch
- *MAKE SURE DIRECT UPLOAD IS ENABLED*
- problem with "registering files" in dvuploader python library. Usually throws 500 error, but works with small number of files (eg. 1925 Kiaochow). Indexing issue?
- Try looping batches with exception for 500 error (add in pause)

In [None]:
index = 0
batch = g_batches[index]
pids = get_dataset_pids(batch, g_dataverse_dataset_info)
datafile_metadata = get_datafile_inventories(batch, g_datafile_metadata)
print('Uploading batch: {}, series: {}'.format(index, batch))
ret = upload_files_batch_errors(g_api, g_dataverse_installation_url, batch, pids, g_datafiles_path, datafile_metadata) 

In [None]:
index = 42
batch = g_batches[index]
pids = get_dataset_pids(batch, g_dataverse_dataset_info)
datafile_metadata = get_datafile_inventories(batch, g_datafile_metadata)
print('Uploading batch: {}, series: {}'.format(index, batch))
upload_files_batch(g_api, g_dataverse_installation_url, batch, pids, g_datafiles_path, datafile_metadata)

#### 3.3 Publish datasets

In [None]:
# create dataset
import importlib
importlib.reload(curate)

# publish the datasets
errors = curate.publish_datasets(g_api, g_dataverse_collection, version='major')

pprint.pprint(errors)

## Create Inventories

combine metadata files into single, long dataframe

In [None]:
base_dir = "/Users/katherinemika/Desktop/curation/historic_datasets/"
annual_trade_reports_1923 = base_dir + 'annual_trade_reports/1923/1923_metadata.csv'
annual_trade_reports_1924 = base_dir + 'annual_trade_reports/1924/1924_metadata.csv'
annual_trade_reports_1925 = base_dir + 'annual_trade_reports/1925/1925_metadata.csv'
annual_trade_reports_1926 = base_dir + 'annual_trade_reports/1926/1926_metadata.csv'
annual_trade_reports_1927 = base_dir + 'annual_trade_reports/1927/1927_metadata.csv'
annual_trade_reports_1928 = base_dir + 'annual_trade_reports/1928/1928_metadata.csv'
returns_trade_ports = base_dir + 'returns_trade_ports/1866/1866_returns_trades_ports_metadata.csv'
shanghai_returns = base_dir + 'shanghai_returns/1936/1936_shanghai_returns_metadata.csv'
trade_statistics_treaty_ports = base_dir + 'trade_statistics_treaty_ports/1873/1873_metadata.csv'

In [None]:
with open(annual_trade_reports_1923, 'rb') as f: 
    result = chardet.detect(f.read())
    encoding = result['encoding']
    print(f"detected encoding: {encoding}")
    
df_annual_trade_reports_1923 = pd.read_csv(annual_trade_reports_1923,index_col=None, encoding=encoding, low_memory=False)

In [None]:
with open(annual_trade_reports_1924, 'rb') as f: 
    result = chardet.detect(f.read())
    encoding = result['encoding']
    print(f"detected encoding: {encoding}")
    
df_annual_trade_reports_1924 = pd.read_csv(annual_trade_reports_1924,index_col=None, encoding=encoding, low_memory=False)

In [None]:
with open(annual_trade_reports_1925, 'rb') as f: 
    result = chardet.detect(f.read())
    encoding = result['encoding']
    print(f"detected encoding: {encoding}")
    
df_annual_trade_reports_1925 = pd.read_csv(annual_trade_reports_1925,index_col=None, encoding=encoding, low_memory=False)

In [None]:
with open(annual_trade_reports_1926, 'rb') as f: 
    result = chardet.detect(f.read())
    encoding = result['encoding']
    print(f"detected encoding: {encoding}")
    
df_annual_trade_reports_1926 = pd.read_csv(annual_trade_reports_1926,index_col=None, encoding=encoding, low_memory=False)

In [None]:
with open(annual_trade_reports_1927, 'rb') as f: 
    result = chardet.detect(f.read())
    encoding = result['encoding']
    print(f"detected encoding: {encoding}")
    
df_annual_trade_reports_1927 = pd.read_csv(annual_trade_reports_1927,index_col=None, encoding=encoding, low_memory=False)

In [None]:
with open(annual_trade_reports_1928, 'rb') as f: 
    result = chardet.detect(f.read())
    encoding = result['encoding']
    print(f"detected encoding: {encoding}")
    
df_annual_trade_reports_1928 = pd.read_csv(annual_trade_reports_1928,index_col=None, encoding=encoding, low_memory=False)

In [None]:
with open(returns_trade_ports, 'rb') as f: 
    result = chardet.detect(f.read())
    encoding = result['encoding']
    print(f"detected encoding: {encoding}")
    
df_returns_trade_ports = pd.read_csv(returns_trade_ports,index_col=None, encoding=encoding, low_memory=False)

In [None]:
with open(shanghai_returns, 'rb') as f: 
    result = chardet.detect(f.read())
    encoding = result['encoding']
    print(f"detected encoding: {encoding}")
    
df_shanghai_returns = pd.read_csv(shanghai_returns,index_col=None, encoding=encoding, low_memory=False)

In [None]:
with open(trade_statistics_treaty_ports, 'rb') as f: 
    result = chardet.detect(f.read())
    encoding = result['encoding']
    print(f"detected encoding: {encoding}")
    
df_trade_statistics_treaty_ports = pd.read_csv(trade_statistics_treaty_ports,index_col=None, encoding=encoding, low_memory=False)

In [None]:
#concatenate dataframes
dfs = [df_annual_trade_reports_1923,
       df_annual_trade_reports_1924,
       df_annual_trade_reports_1925,
       df_annual_trade_reports_1926,
       df_annual_trade_reports_1927,
       df_annual_trade_reports_1928,
       df_returns_trade_ports,
       df_shanghai_returns, 
       df_trade_statistics_treaty_ports]

result = pd.concat(dfs, axis=0).reset_index(drop=True)
print(result)

In [None]:
result.to_csv(base_dir+ "chinese_maritime_customs_metadata_inventory.csv", index=False)

-----------

## Test Curation Process

### Test: Create a single dataset
This test allows users to create a single dataset and upload its related datafiles. 
Useful for troubleshooting and to test other collections.

#### 1 Test: Create datafile metadata

In [12]:
# create datafile metadata
# get the first series
first_series = g_series_names[0]
first_series_metadata = g_dataset_metadata[first_series]
first_series_inventory_df = g_series_inventories[first_series]

# set the template

datafile_metadata_df = curate.create_datafile_metadata(first_series_inventory_df, g_datafile_description_template_csv, g_datafile_description_template_txt, g_datafile_description_template_xml)

In [13]:
first_series_metadata['journalArticleType'] = 'abstract'
first_series_metadata['astroType'] = 'Observation'
first_series_metadata['universe'] = 'The Universe'

In [14]:
first_series_metadata

{'title': 'Announcement Card number: 1',
 'author': [{'authorName': 'Mika, Katherine',
   'authorAffiliation': 'Harvard Library'}],
 'description': [{'dsDescriptionValue': "Announcement Card number: 1 is a series of tables and text files associated with HCO Announcement Card number: 1. Compiled by: Harlow Shapley. Objects observed include BLATHWAYT'S COMET."}],
 'contact': [{'datasetContactName': 'Mika, Katherine',
   'datasetContactAffiliation': 'Harvard Library',
   'datasetContactEmail': 'katherine_mika@harvard.edu'}],
 'subject': ['Astronomy and Astrophysics'],
 'license': 'CC0 1.0',
 'keywords': [{'keywordValue': 'Astronomy--Observations',
   'keywordVocabulary': 'LCSH',
   'keywordVocabularyURI': 'https://www.loc.gov/aba/cataloging/subject/'},
  {'keywordValue': ' Astronomy--Research',
   'keywordVocabulary': 'LCSH',
   'keywordVocabularyURI': 'https://www.loc.gov/aba/cataloging/subject/'},
  {'keywordValue': ' Astronomical observatories',
   'keywordVocabulary': 'LCSH',
   'keyw

In [15]:
from pyDataverse.models import Dataset

In [28]:
ds = Dataset()
ds.title = first_series_metadata.get('title')
ds.author = first_series_metadata.get('author')
ds.dsDescription =  first_series_metadata.get('description')
ds.datasetContact = first_series_metadata.get('contact')
ds.subject = first_series_metadata.get('subject')
ds.license = first_series_metadata.get('license')
ds.keyword = first_series_metadata.get('keywords')
ds.topicClassification = first_series_metadata.get('topic_classification')
ds.astroObject = first_series_metadata.get('astroObject')
ds.astroFacility = first_series_metadata.get('astroFacility')
ds.astroType = first_series_metadata.get('astroType')
ds.universe = first_series_metadata.get('universe')
ds.journalArticleType = first_series_metadata.get('journalArticleType')

#json_dataset = ds.json()
#pprint.pprint(json_dataset)

pprint.pprint(ds.json())

('{\n'
 '  "datasetVersion": {\n'
 '    "metadataBlocks": {\n'
 '      "citation": {\n'
 '        "fields": [\n'
 '          {\n'
 '            "typeName": "subject",\n'
 '            "multiple": true,\n'
 '            "typeClass": "controlledVocabulary",\n'
 '            "value": [\n'
 '              "Astronomy and Astrophysics"\n'
 '            ]\n'
 '          },\n'
 '          {\n'
 '            "typeName": "title",\n'
 '            "multiple": false,\n'
 '            "typeClass": "primitive",\n'
 '            "value": "Announcement Card number: 1"\n'
 '          },\n'
 '          {\n'
 '            "typeName": "author",\n'
 '            "multiple": true,\n'
 '            "typeClass": "compound",\n'
 '            "value": [\n'
 '              {\n'
 '                "authorName": {\n'
 '                  "typeName": "authorName",\n'
 '                  "typeClass": "primitive",\n'
 '                  "multiple": false,\n'
 '                  "value": "Mika, Katherine"\n'
 '         

In [36]:
ds.json()

'{\n  "datasetVersion": {\n    "metadataBlocks": {\n      "citation": {\n        "fields": [\n          {\n            "typeName": "subject",\n            "multiple": true,\n            "typeClass": "controlledVocabulary",\n            "value": [\n              "Astronomy and Astrophysics"\n            ]\n          },\n          {\n            "typeName": "title",\n            "multiple": false,\n            "typeClass": "primitive",\n            "value": "Announcement Card number: 1"\n          },\n          {\n            "typeName": "author",\n            "multiple": true,\n            "typeClass": "compound",\n            "value": [\n              {\n                "authorName": {\n                  "typeName": "authorName",\n                  "typeClass": "primitive",\n                  "multiple": false,\n                  "value": "Mika, Katherine"\n                },\n                "authorAffiliation": {\n                  "typeName": "authorAffiliation",\n                  

In [None]:
print(ds.astroType)

In [None]:
print(ds.validate_json())

In [None]:
ds = Dataset()
dataset = ds.set(first_series_metadata)
pprint.pprint(ds.json(dataset))

### manually debugging the json() function in Dataset() class 

In [None]:
__attr_import_dv_up_datasetVersion_values = [
    "license",
    "termsOfAccess",
    "fileAccessRequest",
    "protocol",
    "authority",
    "identifier",        
    "termsOfUse",
]

__attr_import_dv_up_citation_fields_values = [
    "accessToSources",
    "alternativeTitle",
    "alternativeURL",
    "characteristicOfSources",
    "dateOfDeposit",
    "dataSources",
    "depositor",
    "distributionDate",
    "kindOfData",
    "language",
    "notesText",
    "originOfSources",
    "otherReferences",
    "productionDate",
    "productionPlace",
    "relatedDatasets",
    "relatedMaterial",
    "subject",
    "subtitle",
    "title",
]

__attr_import_dv_up_citation_fields_arrays = {
    "author": [
        "authorName",
        "authorAffiliation",
        "authorIdentifierScheme",
        "authorIdentifier",
    ],
    "contributor": ["contributorType", "contributorName"],
    "dateOfCollection": ["dateOfCollectionStart", "dateOfCollectionEnd"],
    "datasetContact": [
        "datasetContactName",
        "datasetContactAffiliation",
        "datasetContactEmail",
    ],
    "distributor": [
        "distributorName",
        "distributorAffiliation",
        "distributorAbbreviation",
        "distributorURL",
        "distributorLogoURL",
    ],
    "dsDescription": ["dsDescriptionValue", "dsDescriptionDate"],
    "grantNumber": ["grantNumberAgency", "grantNumberValue"],
    "keyword": ["keywordValue", "keywordVocabulary", "keywordVocabularyURI"],
    "producer": [
        "producerName",
        "producerAffiliation",
        "producerAbbreviation",
        "producerURL",
        "producerLogoURL",
    ],
    "otherId": ["otherIdAgency", "otherIdValue"],
    "publication": [
        "publicationCitation",
        "publicationIDType",
        "publicationIDNumber",
        "publicationURL",
    ],
    "software": ["softwareName", "softwareVersion"],
    "timePeriodCovered": ["timePeriodCoveredStart", "timePeriodCoveredEnd"],
    "topicClassification": [
        "topicClassValue",
        "topicClassVocab",
        "topicClassVocabURI",
    ],
}

__attr_import_dv_up_journal_fields_values = ["journalArticleType"]

__attr_import_dv_up_journal_fields_arrays = {
    'journalVolumeIssue': ["journalVolume", "journalIssue", "journalPubDate"]
}

__attr_import_dv_up_astrophysics_fields_values = [
    "astroType",
    "astroFacility",
    "astroInstrumnet",
    "astroObject",
    "resolution.Spatial", 
    "resolution.Temporal",
    "coverage.Spectral.Bandpass",
    "coverage.Spectral.CentralWavelength",
    "coverage.Spatial",
    "coverage.Depth",
    "coverage.ObjectDensity",
    "coverage.ObjectCount",
    "coverage.SkyFraction",
    "coverage.Polarization",
    "redshiftType",
    "resolution.Redshift"
]

__attr_import_dv_up_astrophysics_fields_arrays = {
    "coverage.Spectral.Wavelength": ["coverage.Spectral.MinimumWavelength", "coverage.Spectral.MaximumWavelength"],
    "coverage.Temporal": ["coverage.Temporal.StartTime", "coverage.Temporal.StopTime"],
    "coverage.RedshiftValue": ["coverage.Redshift.MinimumValue", "coverage.Redshift.MaximumValue"]
}
__attr_dict_dv_up_type_class_primitive = (
    [
        "accessToSources",
        "alternativeTitle",
        "alternativeURL",
        "authorAffiliation",
        "authorIdentifier",
        "authorName",
        "astroType",
        "astroFacility",
        "astroInstrumnet",
        "astroObject",
        "characteristicOfSources",
        "city",
        "contributorName",
        "coverage.Spectral.Bandpass",
        "coverage.Spectral.CentralWavelength",
        "coverage.Spatial",
        "coverage.Depth",
        "coverage.ObjectDensity",
        "coverage.ObjectCount",
        "coverage.SkyFraction",
        "coverage.Polarization",
        "dateOfDeposit",
        "dataSources",
        "depositor",
        "distributionDate",
        "kindOfData",
        "notesText",
        "originOfSources",
        "otherGeographicCoverage",
        "otherReferences",
        "productionDate",
        "productionPlace",
        "publicationCitation",
        "publicationIDNumber",
        "publicationURL",
        "relatedDatasets",
        "relatedMaterial",
        "resolution.Spatial", 
        "resolution.Temporal",
        "redshiftType",
        "resolution.Redshift"
        "seriesInformation",
        "seriesName",
        "state",
        "subtitle",
        "title",
    ]
    + __attr_import_dv_up_citation_fields_arrays["dateOfCollection"]
    + __attr_import_dv_up_citation_fields_arrays["datasetContact"]
    + __attr_import_dv_up_citation_fields_arrays["distributor"]
    + __attr_import_dv_up_citation_fields_arrays["dsDescription"]
    + __attr_import_dv_up_citation_fields_arrays["grantNumber"]
    + __attr_import_dv_up_citation_fields_arrays["keyword"]
    + __attr_import_dv_up_citation_fields_arrays["producer"]
    + __attr_import_dv_up_citation_fields_arrays["otherId"]
    + __attr_import_dv_up_citation_fields_arrays["software"]
    + __attr_import_dv_up_citation_fields_arrays["timePeriodCovered"]
    + __attr_import_dv_up_citation_fields_arrays["topicClassification"]
    + __attr_import_dv_up_journal_fields_arrays["journalVolumeIssue"]
    + __attr_import_dv_up_astrophysics_fields_values
    + __attr_import_dv_up_astrophysics_fields_arrays["coverage.Temporal"]
    + __attr_import_dv_up_astrophysics_fields_arrays["coverage.Spectral.Wavelength"]
    + __attr_import_dv_up_astrophysics_fields_arrays["coverage.RedshiftValue"]
)

__attr_dict_dv_up_type_class_compound = (
    list(__attr_import_dv_up_citation_fields_arrays.keys())
    + list(__attr_import_dv_up_astrophysics_fields_arrays.keys())
)
__attr_dict_dv_up_type_class_controlled_vocabulary = [
    "authorIdentifierScheme",
    "astroType",
    "contributorType",
    "country",
    "journalArticleType",
    "language",
    "publicationIDType",
    "subject",
]

__attr_displayNames = [
    "citation_displayName",
    "geospatial_displayName",
    "socialscience_displayName",
    "journal_displayName",
    "astrophysics_displayName"
]

In [None]:
def __parse_field_array(data, attr_list):
    data_tmp = []

    for d in data:
        tmp_dict = {}
        for key, val in d.items():
            if key in attr_list:
                tmp_dict[key] = val['value']
            else:
                print("Key '{0}' not in attribute list".format(key))
        data_tmp.append(tmp_dict)

    return data_tmp

In [None]:
def __generate_field_arrays(attr, key, sub_keys):
    tmp_list = []
    for d in attr: 
        tmp_dict = {}
        for k, v in d.items():
            if k in sub_keys:
                multiple = None
                type_class = None
                if isinstance(v, list):
                    multiple = True
                else:
                    multiple = False
                if k in __attr_dict_dv_up_type_class_primitive:
                    type_class = "primitive"
                elif k in __attr_dict_dv_up_type_class_compount:
                    type_class = "compound"
                elif k in __attr_dict_dv_up_type_class_controlled_vocabulary:
                    type_class = "controlledVocabulary"
                tmp_dict[k] = {}
                tmp_dict[k]["typeName"] = k
                tmp_dict[k]["typeClass"] = type_class
                tmp_dict[k]["multiple"] = multiple
                tmp_dict[k]["value"] = v
        tmp_list.append(tmp_dict)

    return(tmp_list)

In [None]:
data = {}
ds = Dataset()
data_dict = ds.set(first_series_metadata)
print(ds.json(data_dict))

In [None]:
for attr in (
    __attr_import_dv_up_journal_fields_values
    + list(__attr_import_dv_up_journal_fields_arrays.keys())
    + ["journal_displayName"]
):
    if attr in ds:
        journal = {}
        if attr != "journal_displayName":
            journal["fields"] = []
            break

    if "journal_displayName" in ds:
        journal["displayName"] = ds["journal_displayName"]

print(journal['displayName'])

In [None]:
for attr in (
    __attr_import_dv_up_journal_fields_values
    + list(__attr_import_dv_up_journal_fields_arrays.keys())
    + ["journal_displayName"]
):
    if attr in data_dict:
        journal = {}
        if attr != "journal_displayName":
            journal["fields"] = []
            break



for attr in __attr_import_dv_up_journal_fields_values:
    if attr in data_dict:
        v = data_dict[attr]
        if isinstance(v, list):
            multiple = True
        else:
            multiple = False
        if attr in __attr_dict_dv_up_type_class_primitive:
            type_class = "primitive"
        elif attr in __attr_dict_dv_up_type_class_compound:
            type_class = "compound"
        elif (
            attr in __attr_dict_dv_up_type_class_controlled_vocabulary
        ):
            type_class = "controlledVocabulary"
        journal["fields"].append(
            {
                "typeName": attr,
                "multiple": multiple,
                "typeClass": type_class,
                "value": v,
            }
        )

for (
    key,
    val,
) in __attr_import_dv_up_journal_fields_arrays.items():
    if key in data_dict:
        journal["fields"].append(
            {
                "typeName": key,
                "multiple": True,
                "typeClass": "compound",
                "value": self.__generate_field_arrays(key, val),
            }
        )

print(journal)

In [None]:
data["datasetVersion"] = {}
data["datasetVersion"]["metadataBlocks"] = {}

for attr in (
    __attr_import_dv_up_astrophysics_fields_values
    + list(__attr_import_dv_up_astrophysics_fields_arrays.keys())
    + ["astrophysics_displayName"]
):
    if attr in data_dict:
        astrophysics = {}
        if attr != "astrophysics_displayName":
            astrophysics["fields"] = []
            break

if "astrophysics_displayName" in data_dict:
    astrophysics["displayName"] = data_dict["astrophysics_displayName"]

for attr in __attr_import_dv_up_astrophysics_fields_values:
    if attr in data_dict:
        v = data_dict[attr]
        if isinstance(v, list):
            multiple = True
        else:
            multiple = False
        if attr in __attr_dict_dv_up_type_class_primitive:
            type_class = "primitive"
        elif attr in __attr_dict_dv_up_type_class_compound:
            type_class = "compound"
        elif (
            attr in __attr_dict_dv_up_type_class_controlled_vocabulary
        ):
            type_class = "controlledVocabulary"
        astrophysics["fields"].append(
            {
                "typeName": attr,
                "multiple": multiple,
                "typeClass": type_class,
                "value": v,
            }
        )

data["datasetVersion"]["metadataBlocks"]["astrophysics"] = astrophysics
pprint.pprint(data)

In [None]:
json_dataset = ds.json()
pprint.pprint(json_dataset)

In [30]:
import json
filename = "social_data.json"

# Save the object as a JSON file
with open(filename, 'w') as json_file:
    json.dump(social_data, json_file, indent=4)

In [22]:
# get the base url
base_url = g_api.base_url
# get the api token
api_token = g_api.api_token
# dataverse collection url 
dataverse_alias = g_dataverse_collection
# create the headers
headers = {'X-Dataverse-key': api_token, 'Content-Type' : 'application/json'}
# create the request url
request_url = '{}/api/dataverses/{}/datasets'.format(base_url, dataverse_alias)

In [33]:
response = requests.post(request_url, headers=headers, data="social_data.json")
# get the status and message from the response
print(response.text)

{"status":"ERROR","message":"Error parsing Json: Unexpected char 115 at (line no=1, column no=1, offset=0)"}


In [34]:
filename = "social_data.json"

try:
    with open(filename, 'r') as file:
        content = file.read()  # Read the entire file content
        if len(content) >= 115:  # Check if the file has at least 115 characters
            char_115 = content[114]  # Index 114 corresponds to the 115th character (0-based indexing)
            print(f"The 115th character in the file is: '{char_115}'")
        else:
            print(f"The file has less than 115 characters.")
except FileNotFoundError:
    print(f"The file {filename} does not exist.")
except Exception as e:
    print(f"An error occurred: {e}")

The 115th character in the file is: 'n'


In [None]:
Dataset.validate_json(ds)

In [None]:
from pyDataverse.utils import read_csv_as_dicts

In [None]:
csv_datasets_filename = g_module_path + "/test_dataset.csv"
print(csv_datasets_filename)

In [None]:
ds_data = read_csv_as_dicts(csv_datasets_filename)
print(ds_data)

In [None]:
ds_lst = []
for ds in ds_data: 
    ds_obj = Dataset()
    ds_obj.set(ds)
    ds_lst.append(ds_obj)

In [None]:
g_api

In [None]:
dv_alias = g_dataverse_collection
dataset_id_2_pid = {}
for ds in ds_lst:
    resp = g_api.create_dataset(dv_alias, ds.json())
    dataset_id_2_pid[ds.get()["org.dataset_id"]] = resp.json()['data']['persistentId']

In [None]:
resp

In [None]:
pprint.pprint(ds.json())

#### 2. Test: Create the dataset

In [21]:
# create the test dataset
dataset_ret = curate.create_dataset(g_api, g_dataverse_collection, first_series_metadata)
print(dataset_ret.json())

Error: 400 - failed to create dataset Announcement Card number: 1


AttributeError: 'dict' object has no attribute 'json'

In [23]:
print(dataset_ret)

{'status': False, 'dataset_id': -1, 'dataset_pid': ''}


In [None]:
import importlib
importlib.reload(curate)

#### 2.25. Test: using easyDataverse

In [None]:
import easyDataverse ### IMPORTANT ##### easyDataverse works with an earlier version of pydantic. DVUploader (python) works with a newer version.

#### 2.5 Test: test single file with python DVUploader 
Hard coded doi & file directory 

In [None]:
import dvuploader as dv

files = [
    #File(filepath="/Users/katherinemika/Desktop/test/43675367_a.csv"),
    #File(directoryLabel="txt", filepath="/Users/katherinemika/Desktop/test/txt/43675367.txt"),
    *dv.add_directory("/Users/katherinemika/Desktop/curation/historic_datasets/annual_trade_reports/1925/test_batch/"),
]

dvuploader = DVUploader(files=files)
dvuploader.upload(
    api_token=g_dataverse_api_key,
    dataverse_url="https://dataverse.harvard.edu",
    persistent_id="doi:10.7910/DVN/WDBXNN",
    n_parallel_uploads= 2
)

#### 3. Test: Direct upload the datafiles associated with the dataset (series name)

In [None]:
# upload the series dataset datafiles 
#test_datafiles_path = "/Users/katherinemika/Desktop/test/"
pid = dataset_ret.get('dataset_pid')
ret = curate.direct_upload_datafiles(g_api, g_dataverse_installation_url, pid, g_datafiles_path, datafile_metadata_df)

#### 4. Test: Examine a directory to make certain all files exist before attempting an upload of datafiles

In [None]:
# test to see if all files are there and report the ones that aren't

import os
errors = {}
for row in g_dataverse_inventory_df.iterrows():
    filename = row[1].get('filename_osn')
    filepath = g_datafiles_path + '/' + filename
    if (os.path.exists(filepath)):
        errors[filepath] = True
    else:
        print('File not found: {}'.format(filepath))
        errors[filepath] = False

#### Test and build functions for adding files to datasets via python dvuploader

psuedo code: 

1. list of files to add to each dataset
2. loop dvuploader.upload module for each dataset

In [None]:
import dvuploader as dv

In [None]:
def python_dvuploader(api, dataverse_url, dataset_pid, data_directory, metadata_df):
    """
    Upload Open Metadata datafiles to dataverse repository using direct upload method

    Parameters
    ----------
    api : pyDataverse api
    dataverse_url : str
        Dataverse installation url (e.g., https://demo.dataverse.org)
    dataset_pid : str
        Persistent identifier for the dataset (its DOI, takes form: doi:xxxxx)
    data_directory : str
        Directory where datafiles are kept
    metadata_df : DataFrame
        DataFrame containing metadata about datafiles to upload

    Return
    ------
    dict
        {upload: bool, errors: list, finalize: bool}
    """

    #validate params
    if ((not api) or 
        (not dataverse_url) or
        (not dataset_pid) or
        (not data_directory) or
        (metadata_df.empty==True)):
        return False

    #error msg
    errors = []

    json_data = []
    cats = None

    #add each file in metadata_df to files list for dvuploader
    
    files = []
    
    for row in metadata_df.iterrows():
        file = row[1].get('filename_osn')
        filepath = g_datafiles_path + "/" + file
        file_name = row[1].get('custom_name')
        desc = row[1].get('description')
        mime_type = row[1].get('mimetype')

        #format tags
        tags = row[1].get('tags')
        tags_lst = eval(tags)
        
        files.append(dv.File(filepath = filepath,
                             file_name = file_name,
                             description = desc,
                             mimeType = mime_type,
                             categories = tags_lst
                            )
                    )
        
        print('Uploading: {}/{} - {} {}'.format(data_directory, filepath, desc, mimeType))

        
    key = api.api_token
    dvuploader = dv.DVUploader(files=files)
        
    dvuploader.upload(
        api_token = key,
        dataverse_url = dataverse_url,
        persistent_id = dataset_pid,
        n_parallel_uploads= 2 #however many your installation can handle
    )

In [None]:
pid = dataset_ret.get('dataset_pid')
ret = python_dvuploader(g_api, g_dataverse_installation_url, pid, g_datafiles_path, datafile_metadata_df)

#### 5. Test: Delete all the datasets in the collection and start again
- WARNING: This is a permanent operation. Be very certain you want to perform this operation!

In [None]:
# delete all the datasets
# ARE YOU SURE ABOUT THIS? if so, uncomment the next line and execute
ret = curate.delete_datasets(g_api, g_dataverse_collection)

**End document.**