# Curation Script - Chinese Maritime Trade Data


## About
- This script contains the basic curation workflow for creating datasets and uploading files from a metadata spreadsheet of data files.   
- **Be SURE `curate.py` and `directupload.py` are set in correct directories**
<p>
- **Created:** 2023/10/12
- **Updated:** 

## Globals
- Global variables for this script. 
- Set variable values as needed (e.g., `g_dataverse_api_key`).

In [1]:
# directory that is appended to sys.path so the local `curate` module can be imported
g_module_path = '/Users/katherinemika/Desktop/curation/historic_datasets/shanghai_returns/1936'

# path to the metadata inventory CSV that this notebook reads (input, not output)
g_dataverse_inventory_file = '/Users/katherinemika/Desktop/curation/historic_datasets/shanghai_returns/1936/1936_shanghai_returns_metadata.csv'
# series names (filled from the unique values of the inventory's `series_name` column)
g_series_names = []

# dataset inventories (keyed on series name)
g_series_inventories = {}

# dataset metadata (keyed on series name)
g_dataset_metadata = {}

# base URL of the dataverse installation
g_dataverse_installation_url = 'https://dataverse.harvard.edu'

# dataverse API key (placeholder value; replace with a real token before running
# and avoid saving the real token in a shared copy of this notebook)
g_dataverse_api_key = 'xxx-xxxxx-xxxxx-xxxxx-xxx'

# dataverse collection name (alias of the collection the datasets are created in)
g_dataverse_collection = 'shanghai_returns'

# dataverse inventory dataframe (None until the inventory file is read)
g_dataverse_inventory_df = None

# dataset author
g_dataset_author = 'Mika, Katherine'

# dataset author affiliation
g_dataset_author_affiliation = 'Harvard Library'

# dataset contact information
g_dataset_contact = 'Mika, Katherine'
g_dataset_contact_email = 'katherine_mika@harvard.edu'

# full path to location of datafiles (e.g., ../data/trade_statistics)
g_datafiles_path = '/Users/katherinemika/Desktop/curation/historic_datasets/shanghai_returns/1936/csv_and_txt'
# dataverse dataset information (keyed on series name); each value holds
# `dataset_id`, `dataset_pid` and `status`
g_dataverse_dataset_info = {}

# datafile metadata (dataframe of datafile metadata, keyed on series name)
g_datafile_metadata = {}

# datafile description templates, one for OCR text files and one for csv tables
g_datafile_description_template_txt = 'File contains OCR text with data from Port: '
g_datafile_description_template_csv = 'File contains csv table with data from Port: '


# dataset batches (array of batches of series to create/upload)
# NOTE(review): this name is not referenced anywhere else in the notebook as shown;
# the batching cell assigns `g_batches` instead - confirm which name is intended
g_dataset_batches = []

## Modules

- Add local modules path to Jupyter system path
- Load all modules including local modules such as `curate`

In [2]:
import sys
if g_module_path not in sys.path:
    sys.path.append(g_module_path)

import curate
import requests
import numpy as np
import pandas as pd
import pprint as pprint
from pyDataverse.api import NativeApi

## Local Functions

In [3]:
# Map every series name in a batch to the PID recorded for its dataset.
# `dataset_info` is the output collected from the "create dataset" step.
def get_dataset_pids(batch, dataset_info):
    """Return a dict of dataset PIDs keyed on series name.

    A series whose info entry has no 'dataset_pid' key maps to None; a series
    missing from `dataset_info` altogether raises KeyError.
    """
    return {
        series_name: dataset_info[series_name].get('dataset_pid')
        for series_name in batch
    }

def get_dataset_pids_from_hdv(api, dataverse_collection, timeout=60):
    """Rebuild the dataset info dict from datasets that already exist in a collection.

    Parameters
    ----------
    api : pyDataverse api
        Must expose `get_children`, `api_token` and `base_url`.
    dataverse_collection : str
        Alias of the dataverse collection whose datasets are listed.
    timeout : float, optional
        Seconds to wait for each HTTP request before `requests` raises a
        timeout error (without it a stalled connection would hang forever).

    Return
    ------
    dict
        {title: {'dataset_id': ..., 'dataset_pid': ..., 'status': True}} for
        every dataset whose metadata request returned HTTP 200. Datasets
        whose request failed are reported with a printed message and omitted.
    """
    ds_tree = api.get_children(dataverse_collection, children_types=["datasets"])
    headers = {'X-Dataverse-key': api.api_token}
    base_url = api.base_url

    formatted_ds_tree = {}

    for child in ds_tree:
        pid = child['pid']
        request_url = '{}/api/datasets/:persistentId/?persistentId={}'.format(base_url, pid)
        response = requests.get(request_url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to retrieve data for PID: {pid}")
            continue

        data = response.json()['data']
        fields = data['latestVersion']['metadataBlocks']['citation']['fields']
        # Find the title by its field type rather than trusting its position;
        # fall back to the first field (the previous behavior) if none is tagged.
        title = next(
            (field['value'] for field in fields if field.get('typeName') == 'title'),
            fields[0]['value'],
        )
        formatted_ds_tree[title] = {
            'dataset_id': data['id'],
            'dataset_pid': pid,
            'status': True
        }

    return formatted_ds_tree

# Pick out the datafile metadata for just the series in a batch
def get_datafile_inventories(batch, datafile_metadata):
    """Return a dict of datafile inventories keyed on series name.

    Raises KeyError if a series in `batch` has no entry in `datafile_metadata`.
    """
    return {series_name: datafile_metadata[series_name] for series_name in batch}

# upload the datafiles associated with a batch (direct upload)
def upload_dataset_batch(api, dataverse_url, batch_list, batch_pids, batch_datafile_metadata, data_directory):
    """Direct-upload the datafiles of every series in a batch.

    Parameters
    ----------
    api : pyDataverse api
    dataverse_url : str
        Dataverse installation url
    batch_list : iterable of str
        Series names to upload
    batch_pids : dict
        Dataset PIDs keyed on series name
    batch_datafile_metadata : dict
        Datafile metadata keyed on series name
    data_directory : str
        Directory where the datafiles are kept

    Return
    ------
    dict
        Whatever `curate.direct_upload_datafiles` returns, keyed on series name.
    """
    results = {}
    for series_name in batch_list:
        pid = batch_pids[series_name]
        datafiles_metadata = batch_datafile_metadata[series_name]
        # pass the caller's data directory (the original referenced an undefined `path`)
        results[series_name] = curate.direct_upload_datafiles(
            api, dataverse_url, pid, data_directory, datafiles_metadata
        )
    return results

# upload datafiles for a batch of datasets
def upload_files_batch(api, dataverse_url, batch_list, batch_pids, data_directory, batch_datafile_metadata):
    """Upload the datafiles of every series in a batch with `curate.python_dvuploader`.

    Parameters
    ----------
    api : pyDataverse api
    dataverse_url : str
        Dataverse installation url
    batch_list : iterable of str
        Series names to upload
    batch_pids : dict
        Dataset PIDs keyed on series name
    data_directory : str
        Directory where the datafiles are kept
    batch_datafile_metadata : dict
        Datafile metadata keyed on series name

    Return
    ------
    dict
        Whatever `curate.python_dvuploader` returns, keyed on series name.
    """
    results = {}
    for series_name in batch_list:
        pid = batch_pids[series_name]
        # Read from the argument. The original read the notebook global
        # `g_datafile_metadata` and overwrote this parameter on each iteration.
        series_datafile_metadata = batch_datafile_metadata[series_name]
        results[series_name] = curate.python_dvuploader(
            api, dataverse_url, pid, data_directory, series_datafile_metadata
        )
    return results

# upload datafiles for a batch of datasets - but ignoring 500 errors that the python_dvuploader keeps issuing...
def upload_files_batch_errors(api, dataverse_url, batch_list, batch_pids, data_directory, g_datafile_metadata):
    """Upload the datafiles of every series in a batch, skipping series that hit a 500 error.

    Parameters
    ----------
    api : pyDataverse api
    dataverse_url : str
        Dataverse installation url
    batch_list : iterable of str
        Series names to upload
    batch_pids : dict
        Dataset PIDs keyed on series name
    data_directory : str
        Directory where the datafiles are kept
    g_datafile_metadata : dict
        Datafile metadata keyed on series name (this parameter shadows the
        notebook global of the same name inside the function)

    Return
    ------
    dict
        Whatever `curate.python_dvuploader` returns, keyed on series name.
        A series whose upload raised an error mentioning "500" has no entry.

    Raises
    ------
    KeyError
        If a series is missing from `batch_pids` or `g_datafile_metadata`.
    Exception
        Any upload error whose message does not contain "500" is re-raised.
    """
    results = {}
    for series_name in batch_list:
        # Do the lookups outside the try so a bad series name surfaces as a
        # KeyError instead of being mistaken for an upload failure.
        pid = batch_pids[series_name]
        series_datafile_metadata = g_datafile_metadata[series_name]
        try:
            results[series_name] = curate.python_dvuploader(
                api, dataverse_url, pid, data_directory, series_datafile_metadata
            )
        except Exception as e:
            if "500" not in str(e):
                # not the known 500 problem: re-raise with the original traceback
                raise
            print(f"500 error encountered for {series_name}. Skipping to the next series.")
    return results

## Curate Inventory

### 1. Prepare inventory data for curation

#### 1.1 Read `dataverse_inventory`
- **Add instructions about spreadsheet format** Eg. Column names, table org, etc.
- Create a `DataFrame` for later use
- Note: the `curate.direct_upload_datafiles` function expects all files to be in a single directory (not grouped by file type)

In [30]:
# Detect the inventory file's text encoding from its raw bytes,
# then load the file into the inventory dataframe
import chardet

with open(g_dataverse_inventory_file, 'rb') as f:
    result = chardet.detect(f.read())
encoding = result['encoding']
print(f"detected encoding: {encoding}")

g_dataverse_inventory_df = pd.read_csv(
    g_dataverse_inventory_file,
    index_col=None,
    encoding=encoding,
    low_memory=False,
)

detected encoding: UTF-8-SIG


In [31]:
g_dataverse_inventory_df['series_name']

0      Analysis of Imports
1      Analysis of Imports
2      Analysis of Imports
3      Analysis of Imports
4      Analysis of Imports
              ...         
251    Analysis of Exports
252           By Countries
253           By Countries
254           By Countries
255           By Countries
Name: series_name, Length: 256, dtype: object

#### 1.2 Create Dataset Inventories
- Get the list of series names
- Create a `dict` of file inventories keyed on series name

In [32]:
# Distinct series names found in the full inventory
g_series_names = list(g_dataverse_inventory_df['series_name'].unique())

# One inventory (the rows belonging to that series) per series name
for name in g_series_names:
    in_series = g_dataverse_inventory_df['series_name'] == name
    g_series_inventories[name] = g_dataverse_inventory_df.loc[in_series]

pprint.pprint(g_series_names)

['Analysis of Imports', 'Analysis of Exports', 'By Countries']


#### 1.3 Create Dataset Metadata
- Create a `dict` of dataset metadata extracted from each inventory

In [33]:
# Build the dataset-level metadata for every series from its inventory
for series_name in g_series_names:
    series_inventory = g_series_inventories[series_name]
    md = curate.create_dataset_metadata(
        g_dataset_author,
        g_dataset_author_affiliation,
        g_dataset_contact,
        g_dataset_contact_email,
        series_name,
        series_inventory,
    )
    g_dataset_metadata[series_name] = md

pprint.pprint(g_dataset_metadata)

{'Analysis of Exports': {'author': [{'authorAffiliation': 'Harvard Library',
                                     'authorName': 'Mika, Katherine'}],
                         'contact': [{'datasetContactAffiliation': 'Harvard '
                                                                   'Library',
                                      'datasetContactEmail': 'katherine_mika@harvard.edu',
                                      'datasetContactName': 'Mika, Katherine'}],
                         'creation_date': '1936-01-01',
                         'data_source': ['http://nrs.harvard.edu/urn-3:FHCL:9446183'],
                         'description': [{'dsDescriptionValue': 'Analysis of '
                                                                'Exports is a '
                                                                'series of '
                                                                'tables and '
                                                                't

#### 1.4 Create Datafile Metadata
- Create a `dict` of `DataFrames` containing metadata about individual files

In [34]:
# Build the datafile metadata for every series from its inventory.
# The csv and txt description templates are both passed to the curate helper.
for series_name in g_series_names:
    series_inventory_df = g_series_inventories[series_name]
    g_datafile_metadata[series_name] = curate.create_datafile_metadata(
        series_inventory_df,
        g_datafile_description_template_csv,
        g_datafile_description_template_txt,
    )

In [25]:
g_datafile_metadata['Analysis of Imports']

Unnamed: 0,filename_osn,custom_name,file_type,description,mimetype,tags,test
0,44421113.txt,Imports_44421113.txt,txt,File contains OCR text with data from Port: A...,text/plain,"[""Data"", ""Shanghai"", "" Limited"", "" HARVARD"", ""...",44421113.txt
1,44421114.txt,Imports_44421114.txt,txt,File contains OCR text with data from Port: A...,text/plain,"[""Data"", ""HARVARD"", "" LIBRARY"", "" ARTHUR"", "" O...",44421114.txt
2,44421115.txt,Imports_44421115.txt,txt,File contains OCR text with data from Port: A...,text/plain,"[""Data"", ""Shanghai"", "" Limited"", "" SERIES"", "" ...",44421115.txt
3,44421116.txt,Imports_44421116.txt,txt,File contains OCR text with data from Port: A...,text/plain,"[""Data"", ""HARVARD"", "" HARVARD UNIVERSITY"", "" J...",44421116.txt
4,44421117.txt,Imports_44421117.txt,txt,File contains OCR text with data from Port: A...,text/plain,"[""Data"", ""Chinese"", "" Weights"", "" Measures, ""]",44421117.txt
...,...,...,...,...,...,...,...
165,44421418.txt,Imports_44421418.txt,txt,File contains OCR text with data from Port: A...,text/plain,"[""Data""]",44421418.txt
166,44421420.txt,Imports_44421420.txt,txt,File contains OCR text with data from Port: A...,text/plain,"[""Data""]",44421420.txt
167,44421423.txt,Imports_44421423.txt,txt,File contains OCR text with data from Port: A...,text/plain,"[""Data""]",44421423.txt
168,44421425.txt,Imports_44421425.txt,txt,File contains OCR text with data from Port: A...,text/plain,"[""Data""]",44421425.txt


#### 1.5 Create Series Batches
- Create a set of (approximately) equal length batches of series (to create dataset and upload datafiles)
- Generally, there are too many series in a volume to create the related datasets and then upload all their datafiles in a single tight loop. Therefore, it's useful to create batches of these series and perform the create/upload operation on a single batch at a time.

In [35]:
# max number of series in a batch
batch_size = 3
# Number of batches = ceiling of len / batch_size, and at least 1.
# Plain division gave a float that truncated, so a batch could exceed
# batch_size, and it raised when there were fewer series than batch_size.
num_batches = max(1, -(-len(g_series_names) // batch_size))
g_batches = np.array_split(g_series_names, num_batches)
pprint.pprint(g_batches)

[array(['Analysis of Imports', 'Analysis of Exports', 'By Countries'],
      dtype='<U19')]


### 2. Initialize `pyDataverse` API
- Use `pyDataverse` to initialize the API to the dataverse installation

In [36]:
# Create the pyDataverse API adapter for the configured installation and key
g_api = NativeApi(g_dataverse_installation_url, g_dataverse_api_key)

# Show the adapter so the target installation can be checked
print(f'{g_api}')

Native API: https://dataverse.harvard.edu/api/v1


### 3. Create Datasets and Upload Datafiles

#### 3.1 Create all datasets
- For each series name, create a dataset and retain status information

In [37]:
# Create one dataset per series and keep what the create call returns
for series_name in g_series_names:
    series_metadata = g_dataset_metadata[series_name]
    g_dataverse_dataset_info[series_name] = curate.create_dataset(
        g_api,
        g_dataverse_collection,
        series_metadata,
    )

pprint.pprint(g_dataverse_dataset_info)

{'Analysis of Exports': {'dataset_id': 10391014,
                         'dataset_pid': 'doi:10.7910/DVN/HYUX15',
                         'status': True},
 'Analysis of Imports': {'dataset_id': 10391013,
                         'dataset_pid': 'doi:10.7910/DVN/WIZ6ZM',
                         'status': True},
 'By Countries': {'dataset_id': 10391015,
                  'dataset_pid': 'doi:10.7910/DVN/HIHES0',
                  'status': True}}


#### 3.1.1 Get dataset PIDs from HDV collection

For use if datasets have been previously created

In [None]:
#only necessary if you made changes to the curate module before this section
import importlib
importlib.reload(curate)

In [38]:
g_dataverse_dataset_info = get_dataset_pids_from_hdv(g_api, g_dataverse_collection)

In [39]:
pprint.pprint(g_dataverse_dataset_info)

{'Analysis of Exports': {'dataset_id': 10391014,
                         'dataset_pid': 'doi:10.7910/DVN/HYUX15',
                         'status': True},
 'Analysis of Imports': {'dataset_id': 10391013,
                         'dataset_pid': 'doi:10.7910/DVN/WIZ6ZM',
                         'status': True},
 'By Countries': {'dataset_id': 10391015,
                  'dataset_pid': 'doi:10.7910/DVN/HIHES0',
                  'status': True}}


#### 3.2 Upload dataset datafiles, one batch at a time
- Upload the datafiles associated with each dataset in a batch
- *MAKE SURE DIRECT UPLOAD IS ENABLED*
- Known problem: the python `dvuploader` library usually throws a 500 error while "registering files", but works with a small number of files (e.g., 1925 Kiaochow). Possibly an indexing issue?
- Try looping batches with exception for 500 error (add in pause)

In [40]:
# Upload one batch (chosen by index), skipping any series that hits a 500 error
index = 0
batch = g_batches[index]
pids = get_dataset_pids(batch, g_dataverse_dataset_info)
datafile_metadata = get_datafile_inventories(batch, g_datafile_metadata)
print(f'Uploading batch: {index}, series: {batch}')
ret = upload_files_batch_errors(
    g_api,
    g_dataverse_installation_url,
    batch,
    pids,
    g_datafiles_path,
    datafile_metadata,
)

Uploading batch: 0, series: ['Analysis of Imports' 'Analysis of Exports' 'By Countries']




Output()









Output()

500 error encountered for Analysis of Imports. Skipping to the next series.




Output()









Output()

500 error encountered for Analysis of Exports. Skipping to the next series.




Output()









Output()

In [None]:
# Upload one batch (chosen by index) without any special error handling
index = 42
batch = g_batches[index]
pids = get_dataset_pids(batch, g_dataverse_dataset_info)
datafile_metadata = get_datafile_inventories(batch, g_datafile_metadata)
print(f'Uploading batch: {index}, series: {batch}')
upload_files_batch(
    g_api,
    g_dataverse_installation_url,
    batch,
    pids,
    g_datafiles_path,
    datafile_metadata,
)

#### 3.3 Publish datasets

In [None]:
# reload the curate module so any edits made to it since import take effect
import importlib
importlib.reload(curate)

# publish the datasets in the collection as a major version;
# the return value is kept in `errors` and printed below
errors = curate.publish_datasets(g_api, g_dataverse_collection, version='major')

pprint.pprint(errors)

## Create Inventories

combine metadata files into single, long dataframe

In [59]:
# root directory holding one sub-directory per publication and volume year
base_dir = "/Users/katherinemika/Desktop/curation/historic_datasets/"
# metadata inventory CSVs for the annual trade reports, one per year (1923-1928)
annual_trade_reports_1923 = base_dir + 'annual_trade_reports/1923/1923_metadata.csv'
annual_trade_reports_1924 = base_dir + 'annual_trade_reports/1924/1924_metadata.csv'
annual_trade_reports_1925 = base_dir + 'annual_trade_reports/1925/1925_metadata.csv'
annual_trade_reports_1926 = base_dir + 'annual_trade_reports/1926/1926_metadata.csv'
annual_trade_reports_1927 = base_dir + 'annual_trade_reports/1927/1927_metadata.csv'
annual_trade_reports_1928 = base_dir + 'annual_trade_reports/1928/1928_metadata.csv'
# metadata inventory CSVs for the other three publications
returns_trade_ports = base_dir + 'returns_trade_ports/1866/1866_returns_trades_ports_metadata.csv'
shanghai_returns = base_dir + 'shanghai_returns/1936/1936_shanghai_returns_metadata.csv'
trade_statistics_treaty_ports = base_dir + 'trade_statistics_treaty_ports/1873/1873_metadata.csv'

In [72]:
# Detect the 1923 inventory's encoding from its raw bytes, then load it
with open(annual_trade_reports_1923, 'rb') as f:
    result = chardet.detect(f.read())
encoding = result['encoding']
print(f"detected encoding: {encoding}")

df_annual_trade_reports_1923 = pd.read_csv(
    annual_trade_reports_1923,
    index_col=None,
    encoding=encoding,
    low_memory=False,
)

detected encoding: UTF-8-SIG


In [73]:
# Detect the 1924 inventory's encoding from its raw bytes, then load it
with open(annual_trade_reports_1924, 'rb') as f:
    result = chardet.detect(f.read())
encoding = result['encoding']
print(f"detected encoding: {encoding}")

df_annual_trade_reports_1924 = pd.read_csv(
    annual_trade_reports_1924,
    index_col=None,
    encoding=encoding,
    low_memory=False,
)

detected encoding: utf-8


In [74]:
# Detect the 1925 inventory's encoding from its raw bytes, then load it
with open(annual_trade_reports_1925, 'rb') as f:
    result = chardet.detect(f.read())
encoding = result['encoding']
print(f"detected encoding: {encoding}")

df_annual_trade_reports_1925 = pd.read_csv(
    annual_trade_reports_1925,
    index_col=None,
    encoding=encoding,
    low_memory=False,
)

detected encoding: utf-8


In [75]:
# Detect the 1926 inventory's encoding from its raw bytes, then load it
with open(annual_trade_reports_1926, 'rb') as f:
    result = chardet.detect(f.read())
encoding = result['encoding']
print(f"detected encoding: {encoding}")

df_annual_trade_reports_1926 = pd.read_csv(
    annual_trade_reports_1926,
    index_col=None,
    encoding=encoding,
    low_memory=False,
)

detected encoding: utf-8


In [76]:
# Detect the 1927 inventory's encoding from its raw bytes, then load it
with open(annual_trade_reports_1927, 'rb') as f:
    result = chardet.detect(f.read())
encoding = result['encoding']
print(f"detected encoding: {encoding}")

df_annual_trade_reports_1927 = pd.read_csv(
    annual_trade_reports_1927,
    index_col=None,
    encoding=encoding,
    low_memory=False,
)

detected encoding: utf-8


In [77]:
# Detect the 1928 inventory's encoding from its raw bytes, then load it
with open(annual_trade_reports_1928, 'rb') as f:
    result = chardet.detect(f.read())
encoding = result['encoding']
print(f"detected encoding: {encoding}")

df_annual_trade_reports_1928 = pd.read_csv(
    annual_trade_reports_1928,
    index_col=None,
    encoding=encoding,
    low_memory=False,
)

detected encoding: utf-8


In [78]:
# Detect the returns-of-trade inventory's encoding from its raw bytes, then load it
with open(returns_trade_ports, 'rb') as f:
    result = chardet.detect(f.read())
encoding = result['encoding']
print(f"detected encoding: {encoding}")

df_returns_trade_ports = pd.read_csv(
    returns_trade_ports,
    index_col=None,
    encoding=encoding,
    low_memory=False,
)

detected encoding: UTF-8-SIG


In [79]:
# Detect the Shanghai returns inventory's encoding from its raw bytes, then load it
with open(shanghai_returns, 'rb') as f:
    result = chardet.detect(f.read())
encoding = result['encoding']
print(f"detected encoding: {encoding}")

df_shanghai_returns = pd.read_csv(
    shanghai_returns,
    index_col=None,
    encoding=encoding,
    low_memory=False,
)

detected encoding: UTF-8-SIG


In [80]:
# Detect the treaty-ports inventory's encoding from its raw bytes, then load it
with open(trade_statistics_treaty_ports, 'rb') as f:
    result = chardet.detect(f.read())
encoding = result['encoding']
print(f"detected encoding: {encoding}")

df_trade_statistics_treaty_ports = pd.read_csv(
    trade_statistics_treaty_ports,
    index_col=None,
    encoding=encoding,
    low_memory=False,
)

detected encoding: UTF-8-SIG


In [81]:
# Stack every per-volume inventory into one long dataframe with a fresh 0..n-1 index
dfs = [
    df_annual_trade_reports_1923,
    df_annual_trade_reports_1924,
    df_annual_trade_reports_1925,
    df_annual_trade_reports_1926,
    df_annual_trade_reports_1927,
    df_annual_trade_reports_1928,
    df_returns_trade_ports,
    df_shanghai_returns,
    df_trade_statistics_treaty_ports,
]

stacked = pd.concat(dfs, axis=0)
result = stacked.reset_index(drop=True)
print(result)

      series_name  table_type  year    filepath_osn  \
0           Notes       Notes  1923    43675355.txt   
1           Aigun       Notes  1923    43675356.txt   
2           Aigun    Contents  1923    43675357.txt   
3           Aigun       Notes  1923    43675358.txt   
4           Aigun       Notes  1923    43675359.txt   
...           ...         ...   ...             ...   
16284    Tientsin  Population  1873    44319950.txt   
16285    Tientsin  Population  1873  44319950_a.csv   
16286    Tientsin  Population  1873  44319950_b.csv   
16287    Tientsin  Population  1873    44319951.txt   
16288    Tientsin  Population  1873    44319952.txt   

                              custom_name    filename_osn file_type  \
0                            43675355.txt    43675355.txt       txt   
1                            43675356.txt    43675356.txt       txt   
2                            43675357.txt    43675357.txt       txt   
3                            43675358.txt    43675358.t

In [82]:
result.to_csv(base_dir+ "chinese_maritime_customs_metadata_inventory.csv", index=False)

-----------

## Test Curation Process

### Test: Create a single dataset
This test allows users to create a single dataset and upload its related datafiles. 
Useful for troubleshooting and to test other collections.

#### 1 Test: Create datafile metadata

In [None]:
# create datafile metadata for a single series (test run)
# NOTE(review): the variable names say "first series" but index 1 selects the
# second entry of g_series_names - confirm whether index 0 was intended
first_series = g_series_names[1]
first_series_metadata = g_dataset_metadata[first_series]
first_series_inventory_df = g_series_inventories[first_series]

# set the template
# NOTE(review): `template` is not passed to the call below, which uses the
# global csv/txt description templates instead - confirm whether it is still needed
template = 'File associated with data tables series:'
datafile_metadata_df = curate.create_datafile_metadata(first_series_inventory_df, g_datafile_description_template_csv, g_datafile_description_template_txt)

In [None]:
datafile_metadata_df

#### 2. Test: Create the dataset

In [None]:
# create the test dataset
dataset_ret = curate.create_dataset(g_api, g_dataverse_collection, first_series_metadata)
pprint.pprint(dataset_ret)

#### 2.25. Test: using easyDataverse

In [None]:
import easyDataverse ### IMPORTANT ##### easyDataverse works with an earlier version of pydantic. DVUploader (python) works with a newer version.

#### 2.5 Test: test single file with python DVUploader 
Hard coded doi & file directory 

In [None]:
import dvuploader as dv

# Files to upload: everything in the hard-coded test directory.
# The commented entries show how to add individual files instead.
files = [
    #dv.File(filepath="/Users/katherinemika/Desktop/test/43675367_a.csv"),
    #dv.File(directoryLabel="txt", filepath="/Users/katherinemika/Desktop/test/txt/43675367.txt"),
    *dv.add_directory("/Users/katherinemika/Desktop/curation/historic_datasets/annual_trade_reports/1925/test_batch/"),
]

# DVUploader must be reached through the `dv` alias; the bare name is not
# imported and raised NameError
dvuploader = dv.DVUploader(files=files)
dvuploader.upload(
    api_token=g_dataverse_api_key,
    dataverse_url="https://dataverse.harvard.edu",
    persistent_id="doi:10.7910/DVN/WDBXNN",
    n_parallel_uploads=2
)

#### 3. Test: Direct upload the datafiles associated with the dataset (series name)

In [None]:
# upload the series dataset datafiles 
#test_datafiles_path = "/Users/katherinemika/Desktop/test/"
pid = dataset_ret.get('dataset_pid')
ret = curate.direct_upload_datafiles(g_api, g_dataverse_installation_url, pid, g_datafiles_path, datafile_metadata_df)

#### 4. Test: Examine a directory to make certain all files exist before attempting an upload of datafiles

In [None]:
# Check that every file named in the inventory exists in the datafiles
# directory, and report the ones that are missing

import os

# maps each expected file path to True (found) or False (missing)
errors = {}
for _, record in g_dataverse_inventory_df.iterrows():
    filename = record.get('filename_osn')
    filepath = g_datafiles_path + '/' + filename
    found = os.path.exists(filepath)
    if not found:
        print('File not found: {}'.format(filepath))
    errors[filepath] = found

#### Test and build functions for adding files to datasets via python dvuploader

Pseudocode:

1. list of files to add to each dataset
2. loop dvuploader.upload module for each dataset

In [None]:
import dvuploader as dv

In [None]:
def python_dvuploader(api, dataverse_url, dataset_pid, data_directory, metadata_df):
    """
    Upload the datafiles listed in a metadata DataFrame to a dataverse
    dataset using the python `dvuploader` library

    Parameters
    ----------
    api : pyDataverse api
        Only its `api_token` attribute is read here
    dataverse_url : str
        Dataverse installation url (e.g., https://demo.dataverse.org)
    dataset_pid : str
        Persistent identifier for the dataset (its DOI, takes form: doi:xxxxx)
    data_directory : str
        Directory where datafiles are kept
    metadata_df : DataFrame
        DataFrame containing metadata about datafiles to upload. The columns
        read are `filename_osn`, `custom_name`, `description`, `mimetype`
        and `tags` (a list, or the string form of a Python list literal)

    Return
    ------
    bool or None
        False when an argument is missing or `metadata_df` is empty (nothing
        is uploaded); otherwise None once the upload call has returned
    """
    # local import: only needed here, to parse the serialized tag lists
    import ast

    # validate params
    if ((not api) or
        (not dataverse_url) or
        (not dataset_pid) or
        (not data_directory) or
        (metadata_df is None) or
        metadata_df.empty):
        return False

    # add each file in metadata_df to files list for dvuploader
    files = []

    for _, row in metadata_df.iterrows():
        file = row.get('filename_osn')
        # build the path from the data_directory argument rather than the
        # notebook global, so the function works for any directory
        filepath = data_directory + "/" + file
        file_name = row.get('custom_name')
        desc = row.get('description')
        mime_type = row.get('mimetype')

        # format tags: parse the list literal without executing arbitrary
        # code (the original used eval on the spreadsheet value)
        tags = row.get('tags')
        tags_lst = ast.literal_eval(tags) if isinstance(tags, str) else tags

        files.append(dv.File(filepath = filepath,
                             file_name = file_name,
                             description = desc,
                             mimeType = mime_type,
                             categories = tags_lst
                            )
                    )

        # the original referenced an undefined `mimeType` here (NameError)
        print('Uploading: {}/{} - {} {}'.format(data_directory, file, desc, mime_type))

    key = api.api_token
    dvuploader = dv.DVUploader(files=files)

    dvuploader.upload(
        api_token = key,
        dataverse_url = dataverse_url,
        persistent_id = dataset_pid,
        n_parallel_uploads= 2 #however many your installation can handle
    )

In [None]:
pid = dataset_ret.get('dataset_pid')
ret = python_dvuploader(g_api, g_dataverse_installation_url, pid, g_datafiles_path, datafile_metadata_df)

#### 5. Test: Delete all the datasets in the collection and start again
- WARNING: This is a permanent operation. Be very certain you want to perform this operation!

In [None]:
# delete all the datasets
# ARE YOU SURE ABOUT THIS? if so, set confirm_delete to True and execute.
# The delete call used to run as soon as the cell was executed, even though
# the instructions said it had to be uncommented first.
confirm_delete = False
if confirm_delete:
    ret = curate.delete_datasets(g_api, g_dataverse_collection)
else:
    print('Delete skipped: set confirm_delete = True to permanently delete every dataset in the collection.')

**End document.**