In [1]:
import httpx
import orjson
import pandas as pd
import re


The source dataset to be tested: https://borealisdata.ca/dataset.xhtml?persistentId=doi:10.5683/SP3/3N6JVZ&version=1.0

In [2]:
# Read the dataset's native JSON through the metadata export endpoint.
# The HTTP status is checked before the body is parsed so that an API error
# (unknown DOI, exporter unavailable, server failure) stops the notebook here
# instead of surfacing later as a confusing failure on a missing key.
native_response = httpx.get('https://borealisdata.ca/api/datasets/export?exporter=dataverse_json&persistentId=doi%3A10.5683/SP3/3N6JVZ')
native_response.raise_for_status()
native_json = native_response.json()

In [3]:
# Read the dataset's Croissant JSON through the metadata export endpoint.
# The HTTP status is checked before the body is parsed so that an API error
# (unknown DOI, exporter unavailable, server failure) stops the notebook here
# instead of surfacing later as a confusing failure on a missing key.
croissant_response = httpx.get('https://borealisdata.ca/api/datasets/export?exporter=croissant&persistentId=doi%3A10.5683/SP3/3N6JVZ')
croissant_response.raise_for_status()
croissant_json = croissant_response.json()

In [None]:
# URL prefix for downloading a data file; the numeric file id is appended to it
DF_BASE_URL = "https://borealisdata.ca/api/access/datafile/"

In [None]:
# Walk the native JSON export and record, for every file, its absolute path,
# file id, download URL and MD5 checksum.
# `or []` keeps the loop working when the dataset version carries no 'files' entry.
native_json_files_list = native_json.get('datasetVersion').get('files') or []

# One record (dict) per file, in the order the export lists them
native_json_files_list_parsed = []
for item in native_json_files_list:
    # Look the 'dataFile' object up once instead of once per field
    data_file = item.get('dataFile')
    # Prefer the original file name when the export provides one
    original_file_name = data_file.get('originalFileName')
    file_name = original_file_name or data_file.get('filename')
    # Prefix the file name with its directory label, when there is one
    file_directory = item.get('directoryLabel')
    file_abs_path = f"{file_directory}/{file_name}" if file_directory else file_name
    file_id = data_file.get('id')
    # NOTE(review): presumably every file carries an 'md5' entry; files without
    # one get None here, and 'file_md5' is the key used to pair files later - confirm.
    file_md5 = data_file.get('md5')
    # Files that have an original file name are requested in their original format
    download_url = f"{DF_BASE_URL}{file_id}"
    if original_file_name:
        download_url += "?format=original"
    native_json_files_list_parsed.append({
        'file_abs_path': file_abs_path,
        'file_id': file_id,
        'download_url': download_url,
        'file_md5': file_md5
    })

In [None]:
# Walk the Croissant export and record, for every file object, its absolute
# path, file id, download URL and MD5 checksum.
# `or []` keeps the loop working when the export carries no 'distribution' entry.
croissant_json_files_list = croissant_json.get('distribution') or []

# The file id is the number that follows '/datafile/' in the access URL.
# Anchoring on that path segment avoids picking up unrelated digits elsewhere
# in the URL (host name, port, query string) and avoids an IndexError when the
# URL contains no digits at all.
FILE_ID_IN_URL = re.compile(r'/datafile/(\d+)')

# One record (dict) per file object, in the order the export lists them
croissant_json_files_list_parsed = []
for item in croissant_json_files_list:
    # Only file objects are compared; any other distribution entry is skipped
    if item.get('@type') != 'cr:FileObject':
        continue
    # The '@id' of a file object is used as its absolute path
    file_abs_path = item.get('@id')
    download_url = item.get('contentUrl')
    # Derive the file id from the access URL; None when the URL is missing
    # or does not contain a '/datafile/<id>' segment
    id_match = FILE_ID_IN_URL.search(download_url) if download_url else None
    file_id = int(id_match.group(1)) if id_match else None
    file_md5 = item.get('md5')
    croissant_json_files_list_parsed.append({
        'file_abs_path': file_abs_path,
        'file_id': file_id,
        'download_url': download_url,
        'file_md5': file_md5
    })


In [None]:
# Compare the two lists: the native JSON is the source, the Croissant JSON the target.
# The columns are named explicitly so that an empty record list still produces a
# frame with a 'file_md5' column; without them the merge below raises a KeyError
# when one export lists no files.
FILE_RECORD_COLUMNS = ['file_abs_path', 'file_id', 'download_url', 'file_md5']
native_json_df = pd.DataFrame(native_json_files_list_parsed, columns=FILE_RECORD_COLUMNS)
croissant_json_df = pd.DataFrame(croissant_json_files_list_parsed, columns=FILE_RECORD_COLUMNS)

# Pair the files of both exports by checksum. The outer join keeps files found
# in only one export, with missing values on the other side.
# Note: files sharing the same checksum are paired many-to-many by this join.
merged_df = pd.merge(native_json_df, croissant_json_df, on='file_md5', how='outer', suffixes=('_native', '_croissant'))

In [8]:
# Flag, row by row, whether both exports agree on the path and on the id of each file.
# A missing value on either side never compares equal, so such rows are flagged False.
merged_df = merged_df.assign(
    file_abs_path_match=merged_df['file_abs_path_native'].eq(merged_df['file_abs_path_croissant']),
    file_id_match=merged_df['file_id_native'].eq(merged_df['file_id_croissant']),
)

In [9]:
# Write the comparison table to disk, leaving out the row index
OUTPUT_CSV = 'borealis_ds_json_vs_croissant.csv'
merged_df.to_csv(OUTPUT_CSV, index=False)