In [15]:
import csv
from functools import reduce
import io
import numpy as np
import pandas as pd
import requests

# Import data

Import CSV files that contain:
- The PIDs of all datasets and which repositories published them
- Author field metadata entered in all datasets published by many repositories that use the Dataverse software.

In [45]:
# Get PIDs of all datasets and which repositories published them
datasetPIDsDF = (pd
    .read_csv(
        'dataset_pids_from_most_known_dataverse_installations_2023.08.csv',
        sep=',', na_filter=False)
    .query(
        '(dataverse_json_export_saved == True) and\
        (dataverse_installation_name != "ODISSEI_Portal")')
    .drop(columns=['dataverse_json_export_saved'])
    .reset_index(drop=True, inplace=False)
 )

datasetPIDsDF.head()

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type
0,UNB_Libraries_Dataverse,https://doi.org/10.25545/NVN79Z,snb,Service New Brunswick,ORGANIZATIONS_INSTITUTIONS
1,UNB_Libraries_Dataverse,https://doi.org/10.25545/YB60PU,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP
2,UNB_Libraries_Dataverse,https://doi.org/10.25545/WHD3KC,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP
3,UNB_Libraries_Dataverse,https://doi.org/10.25545/6TZWCG,Ir-CationBinding,Elucidation of Charge Contributions in Iridium-Chelated Hydrogen-Bonding Systems,JOURNALS
4,UNB_Libraries_Dataverse,https://doi.org/10.25545/NYM13B,IFMLAB,Integrated Forest Management Lab,RESEARCH_GROUP


In [29]:
# Get Author field metadata entered in all datasets in Dataverse repositories
authorMetadataDF = (pd
    .read_csv(
        'author(citation)_2023.08.22-2023.08.28.csv',
        sep=',', na_filter=False)
     .drop(columns=['dataset_pid'])
     .reset_index(drop=True, inplace=False)
     )

authorMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_publication_date,dataset_version_number,dataset_version_create_time,authorName,authorAffiliation,authorIdentifierScheme,authorIdentifier
0,https://doi.org/10.48370/OFD/8TTRLC,2022-05-13,1.0,2022-05-11T14:44:29Z,"Mammal Research Institute, Polish Academy of Sciences",,,
1,https://doi.org/10.17026/dans-26a-cq4r,2015-09-18,2.0,2022-02-18T19:29:27Z,H.J. Hesseling,RAAP Archeologisch Adviesbureau B.V.,,
2,https://doi.org/10.17026/dans-zc2-mc2g,2016-12-31,1.0,2022-02-24T21:01:03Z,S. Moerman,,,
3,https://doi.org/10.17026/dans-x9z-bmn6,2020-12-22,1.0,2022-02-14T05:11:03Z,G. Zielman,RAAP,,
4,https://doi.org/10.7910/DVN/QD0V0H,2021-09-20,1.0,2021-07-29T19:45:43Z,"Master, Daniel M.",Wheaton College,,


In [46]:
print(f'Count of datasets in datasetPIDsDF: {len(datasetPIDsDF)}')
datasetCountInAuthorMetadataDF = len(pd.unique(authorMetadataDF['dataset_pid_url']))
print(f'Number of datasets in authorMetadataDF: {datasetCountInAuthorMetadataDF}')

Count of datasets in datasetPIDsDF: 390401
Number of datasets in authorMetadataDF: 390401


- Get count of all author metadata in each installation
- Get count of each type of author identifier in each installation

In [9]:
# Join the latestDatasetVersionsDF and the datasetPIDsDF to add the installation column,
# so we know which installations published each dataset

geospatialMetadataLatestVersionDF = (pd
    .merge(geospatialMetadataDF, basicDatasetMetadataDF,
            how='inner',
            on=['dataset_pid_url', 'dataset_version_number'])
    .drop(columns=[
        'dataset_version_create_time', 'installation',
        'dataverse_alias'])
    .reset_index(drop=True, inplace=False))

# Make sure the count of rows is the same as the count of total datasets: 340,857
print(len(basicDatasetMetadataDF))

340857


In [10]:
basicDatasetMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,installation,dataverse_alias
0,http://dx.doi.org/10.26193/00HBWG,2.0,2019-05-13 08:43:01+00:00,ADA Dataverse,
1,http://dx.doi.org/10.26193/01P0AI,2.0,2019-05-13 06:23:26+00:00,ADA Dataverse,
2,http://dx.doi.org/10.26193/04F7C1,2.0,2019-05-13 10:02:15+00:00,ADA Dataverse,
3,http://dx.doi.org/10.26193/07R31R,2.0,2019-05-13 06:50:29+00:00,ADA Dataverse,
4,http://dx.doi.org/10.26193/0AF6TZ,5.0,2022-02-02 23:01:26+00:00,ADA Dataverse,


In [13]:
# Combine basicDatasetMetadataDF and grantInformationLatestVersionDF with a
# full outer join on dataset_pid_url and dataset_version_number columns
dataframes = [basicDatasetMetadataDF, geospatialMetadataLatestVersionDF]
indexList = ['dataset_pid_url', 'dataset_version_number']
for df in dataframes:
    df.set_index(indexList, inplace=True)

In [14]:
geospatialDatasetMetadataInDataverseInstallationsDF = reduce(lambda left, right: left.join(right, how='outer'), dataframes)
fundingDatasetMetadataInDataverseInstallationsDF = (geospatialDatasetMetadataInDataverseInstallationsDF.reset_index(drop=False, inplace=True))
geospatialDatasetMetadataInDataverseInstallationsDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,installation,dataverse_alias,westLongitude,eastLongitude,northLongitude,southLongitude
0,http://dx.doi.org/10.26193/00HBWG,2.0,2019-05-13 08:43:01+00:00,ADA Dataverse,,,,,
1,http://dx.doi.org/10.26193/01P0AI,2.0,2019-05-13 06:23:26+00:00,ADA Dataverse,,,,,
2,http://dx.doi.org/10.26193/04F7C1,2.0,2019-05-13 10:02:15+00:00,ADA Dataverse,,,,,
3,http://dx.doi.org/10.26193/07R31R,2.0,2019-05-13 06:50:29+00:00,ADA Dataverse,,,,,
4,http://dx.doi.org/10.26193/0AF6TZ,5.0,2022-02-02 23:01:26+00:00,ADA Dataverse,,,,,


In [15]:
# geospatialDatasetMetadataInDataverseInstallationsDF.to_csv('geospatialDatasetMetadataInDataverseInstallationsDF.csv', index=False)

In [16]:
# Retain only datasets that have values in any of the four bounding box fields
geospatialDatasetMetadataInDataverseInstallationsDF = (
    geospatialDatasetMetadataInDataverseInstallationsDF
        .query(
            '(westLongitude == westLongitude) or\
            (eastLongitude == eastLongitude) or\
            (northLongitude == northLongitude) or\
            (southLongitude == southLongitude)'
        )
        .reset_index(drop=True, inplace=False)
)
geospatialDatasetMetadataInDataverseInstallationsDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,installation,dataverse_alias,westLongitude,eastLongitude,northLongitude,southLongitude
0,https://doi.org/10.11588/data/10000,1.1,2017-04-06 07:16:00+00:00,HeiDATA,iwrgraphics,49.425272,49.397574,8.649282,8.720865
1,https://doi.org/10.11588/data/10039,2.2,2017-07-06 07:00:46+00:00,HeiDATA,healtheco,3.8667,,12.7333,
2,https://doi.org/10.11588/data/10044,1.1,2017-04-06 07:14:50+00:00,HeiDATA,iwrgraphics,49.425272,49.397574,8.649282,8.720865
3,https://doi.org/10.11588/data/10045,1.1,2017-04-06 07:15:31+00:00,HeiDATA,iwrgraphics,49.425272,49.397574,8.649282,8.720865
4,https://doi.org/10.11588/data/10046,1.1,2017-04-06 07:13:29+00:00,HeiDATA,iwrgraphics,49.425272,49.397574,8.649282,8.720865


In [17]:
print(f'Number of bounding boxes in latestDatasetVersionsDF: {len(geospatialDatasetMetadataInDataverseInstallationsDF)}')
print(f'Number of datasets in geospatialDatasetMetadataInDataverseInstallationsDF: {(len(pd.unique(geospatialDatasetMetadataInDataverseInstallationsDF["dataset_pid_url"])))}')

Number of bounding boxes in latestDatasetVersionsDF: 116054
Number of datasets in geospatialDatasetMetadataInDataverseInstallationsDF: 115444


In [18]:
# Create list of dataset PIDs with "invalid" bounding box metadata

datasetsWithInvalidBBMetadataList = []
for index, row in geospatialDatasetMetadataInDataverseInstallationsDF.iterrows():
    datasetPid = row['dataset_pid_url']
    boxes = [
        row['westLongitude'],
        row['eastLongitude'],
        row['northLongitude'],
        row['southLongitude']
    ]
    invalidCount = 0
    for box in boxes:
        try:
            value = float(box)
            if np.isnan(value):
                invalidCount += 1
        except Exception:
            invalidCount += 1
    if invalidCount > 0:
        datasetsWithInvalidBBMetadataList.append(datasetPid)

datasetsWithInvalidBBMetadataList = list(set(datasetsWithInvalidBBMetadataList))

In [19]:
print(len(datasetsWithInvalidBBMetadataList))

12332


In [20]:
print(datasetsWithInvalidBBMetadataList[:10])

['https://doi.org/10.48370/OFD/DENPIN', 'https://doi.org/10.48370/OFD/QSNSLR', 'https://doi.org/10.48370/OFD/3TP8N9', 'https://doi.org/10.48370/OFD/9RVBWV', 'https://doi.org/10.48370/OFD/W7J7LI', 'https://doi.org/10.48370/OFD/RZQFYZ', 'https://doi.org/10.48370/OFD/44D4VR', 'https://doi.org/10.48370/OFD/UKRIIM', 'https://doi.org/10.48370/OFD/84I1VG', 'https://doi.org/10.21223/P3/AZRDGH']


In [21]:
# Create dataframe with metadata of only datasets with "invalid" bounding box metadata
datasetsWithInvalidBBMetadataDF = (
    # Drop all but the needed columns
    geospatialDatasetMetadataInDataverseInstallationsDF
        # Remove all datasets except those in given list of collection aliases
        .query(
        'dataset_pid_url in @datasetsWithInvalidBBMetadataList')
)
datasetsWithInvalidBBMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,installation,dataverse_alias,westLongitude,eastLongitude,northLongitude,southLongitude
1,https://doi.org/10.11588/data/10039,2.2,2017-07-06 07:00:46+00:00,HeiDATA,healtheco,3.8667,,12.7333,
12,https://doi.org/10.11588/data/LSG8TN,1.0,2020-09-01 13:38:28+00:00,HeiDATA,geomorph,"6°37'56.27""E","6°54'33.22""E","50°59'31.93""N","50°46'19.53""N"
24,https://doi.org/10.15454/2TWG8B,1.0,2021-05-28 09:16:27+00:00,Recherche Data Gouv,TEMPO,,2°49,48°80,
37,https://doi.org/10.15454/CLYPS4,2.0,2020-01-30 17:38:02+00:00,Recherche Data Gouv,TEMPO,,"4°52'41.69""E","43°54'59.63""N",
38,https://doi.org/10.15454/D4MJMJ,3.1,2022-01-14 10:22:28+00:00,Recherche Data Gouv,pheno_ueh,-0.5966699246002216,47.674806960947606,,


In [22]:
datasetsWithInvalidBBMetadataDF.to_csv('datasetsWithInvalidBBMetadataDF.csv', index=False)