In [1]:
import pandas as pd

# Import data

Import CSV files that contain:
- The PIDs of all datasets published by many repositories that use the Dataverse software and which repositories published them
- Author field metadata entered in all of those datasets

In [2]:
# Load the PID of every dataset together with the installation that published it.
# Only datasets whose Dataverse JSON export was saved are kept, and datasets from
# the "ODISSEI_Portal" installation are left out.
datasetPIDsDF = (
    pd.read_csv(
        'dataset_pids_from_most_known_dataverse_installations_2023.08.csv',
        na_filter=False)  # no missing-value detection: blank fields stay as empty strings
    .loc[lambda pidsDF:
        (pidsDF['dataverse_json_export_saved'] == True)
        & (pidsDF['dataverse_installation_name'] != 'ODISSEI_Portal')]
    # Every remaining row has the export flag set, so the column carries no information
    .drop('dataverse_json_export_saved', axis='columns')
    .reset_index(drop=True)
)

datasetPIDsDF.head()

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type
0,UNB_Libraries_Dataverse,https://doi.org/10.25545/NVN79Z,snb,Service New Brunswick,ORGANIZATIONS_INSTITUTIONS
1,UNB_Libraries_Dataverse,https://doi.org/10.25545/YB60PU,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP
2,UNB_Libraries_Dataverse,https://doi.org/10.25545/WHD3KC,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP
3,UNB_Libraries_Dataverse,https://doi.org/10.25545/6TZWCG,Ir-CationBinding,Elucidation of Charge Contributions in Iridium-Chelated Hydrogen-Bonding Systems,JOURNALS
4,UNB_Libraries_Dataverse,https://doi.org/10.25545/NYM13B,IFMLAB,Integrated Forest Management Lab,RESEARCH_GROUP


In [3]:
# Load the Author field metadata entered in the datasets of the Dataverse repositories.
# The dataset_pid column is dropped; dataset_pid_url is the PID column that is kept.
authorMetadataDF = (
    pd.read_csv(
        'author(citation)_2023.08.22-2023.08.28.csv',
        na_filter=False)  # no missing-value detection: blank fields stay as empty strings
    .drop('dataset_pid', axis='columns')
    .reset_index(drop=True)
)

authorMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_publication_date,dataset_version_number,dataset_version_create_time,authorName,authorAffiliation,authorIdentifierScheme,authorIdentifier
0,https://doi.org/10.48370/OFD/8TTRLC,2022-05-13,1.0,2022-05-11T14:44:29Z,"Mammal Research Institute, Polish Academy of Sciences",,,
1,https://doi.org/10.17026/dans-26a-cq4r,2015-09-18,2.0,2022-02-18T19:29:27Z,H.J. Hesseling,RAAP Archeologisch Adviesbureau B.V.,,
2,https://doi.org/10.17026/dans-zc2-mc2g,2016-12-31,1.0,2022-02-24T21:01:03Z,S. Moerman,,,
3,https://doi.org/10.17026/dans-x9z-bmn6,2020-12-22,1.0,2022-02-14T05:11:03Z,G. Zielman,RAAP,,
4,https://doi.org/10.7910/DVN/QD0V0H,2021-09-20,1.0,2021-07-29T19:45:43Z,"Master, Daniel M.",Wheaton College,,


In [4]:
# Number of distinct dataset PIDs that appear in the author metadata
# (dropna=False so a missing PID, if any, is still counted as one value)
datasetCountInAuthorMetadataDF = authorMetadataDF['dataset_pid_url'].nunique(dropna=False)

# Report the size of the PID table next to the distinct-PID count of the author table
print(f'Count of datasets in datasetPIDsDF: {len(datasetPIDsDF)}')
print(f'Number of datasets in authorMetadataDF: {datasetCountInAuthorMetadataDF}')

Count of datasets in datasetPIDsDF: 390401
Number of datasets in authorMetadataDF: 390401


- Get count of all author metadata in each installation
- Get count of each type of author identifier in each installation

In [5]:
# Attach the installation and collection columns to the author metadata by joining
# both frames on the dataset PID URL, so each author row is tied to the installation
# that published its dataset. The dataset version columns are not carried along.
datasetPIDsAndAuthorMetadataDF = (
    datasetPIDsDF
    .merge(authorMetadataDF, how='outer', on='dataset_pid_url')
    .drop(
        ['dataset_version_number', 'dataset_version_create_time'],
        axis='columns')
    .reset_index(drop=True)
)

# Sanity check: the number of distinct dataset PIDs in the merged frame should
# equal the total count of datasets: 390401
datasetCountInDatasetPIDsAndAuthorMetadataDF = (
    datasetPIDsAndAuthorMetadataDF['dataset_pid_url'].nunique(dropna=False)
)
print(f'Number of datasets in datasetCountInDatasetPIDsAndAuthorMetadataDF: {datasetCountInDatasetPIDsAndAuthorMetadataDF}')

Number of datasets in datasetCountInDatasetPIDsAndAuthorMetadataDF: 390401


In [6]:
# Preview the first five rows of the merged frame
datasetPIDsAndAuthorMetadataDF.head(n=5)

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type,dataset_publication_date,authorName,authorAffiliation,authorIdentifierScheme,authorIdentifier
0,UNB_Libraries_Dataverse,https://doi.org/10.25545/NVN79Z,snb,Service New Brunswick,ORGANIZATIONS_INSTITUTIONS,2018-05-10,Service New Brunswick,Government of New Brunswick,,
1,UNB_Libraries_Dataverse,https://doi.org/10.25545/YB60PU,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP,2018-06-20,"Blight, Barry A",University of New Brunswick,ORCID,0000-0003-1166-6206
2,UNB_Libraries_Dataverse,https://doi.org/10.25545/WHD3KC,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP,2019-06-23,"Balonova, Barbora",University of New Brunswick,,
3,UNB_Libraries_Dataverse,https://doi.org/10.25545/6TZWCG,Ir-CationBinding,Elucidation of Charge Contributions in Iridium-Chelated Hydrogen-Bonding Systems,JOURNALS,2021-05-18,"Blight, Barry A",University of New Brunswick,,
4,UNB_Libraries_Dataverse,https://doi.org/10.25545/NYM13B,IFMLAB,Integrated Forest Management Lab,RESEARCH_GROUP,2020-06-30,"Chen, Yingbing",University of New Brunswick,,


Some datasets have no author metadata. Let's remove those datasets.

In [None]:
# Rows of the merged frame that carry no author metadata at all, i.e. every author
# field (name, affiliation, identifier scheme, identifier) is missing.
#
# A field counts as missing when it is NaN or a blank string. Both can occur here:
# the CSVs are read with na_filter=False, which leaves blank fields as '' instead of
# NaN, while the outer merge fills in NaN for a dataset that has no author rows.
# A NaN-only test (the `x == x` trick) would therefore overlook every blank field.
datasetsWithNoAuthorsDf = (
    datasetPIDsAndAuthorMetadataDF
        .loc[lambda df: (
            df[[
                'authorName',
                'authorAffiliation',
                'authorIdentifierScheme',
                'authorIdentifier']]
            .fillna('')
            .astype(str)
            # Treat whitespace-only values as blank too
            .apply(lambda column: column.str.strip())
            .eq('')
            .all(axis=1))]
        .reset_index(drop=True)
)

In [None]:

# Get count of author metadata that includes any values in the identifier type or identifier fields

# Get count of "valid author identifier metadata", that is, author metadata that includes any values in both the identifier type and identifier fields

# Get count of "valid author identifier metadata" for each type of identifier, e.g. ORCID, GND.

# Get count of "valid author identifier metadata" for each type of identifier, e.g. ORCID, GND, in each installation

In [None]:
# How many author entries have a value in both the Author Identifier Scheme and the
# Author Identifier fields?
#
# A field counts as filled in only when it is neither NaN nor a blank string. The CSVs
# in this notebook are read with na_filter=False, which leaves blank fields as ''
# instead of NaN, so a NaN-only test (the `x == x` trick) would treat blanks as values.
#
# NOTE(review): authorMetadataLatestVersionDF is not defined in any cell visible
# here - confirm it is created before this cell runs on a fresh kernel.
authorIdentifierMetadataComplete = (
    authorMetadataLatestVersionDF
        .drop(columns=['authorAffiliation'])
        .loc[lambda df:
            df['authorIdentifierScheme'].fillna('').astype(str).str.strip().ne('')
            & df['authorIdentifier'].fillna('').astype(str).str.strip().ne('')]
        .reset_index(drop=True)
)

# NOTE(review): a bare expression is rendered only when it is the last line of its
# cell; move this preview to its own cell if it should be shown.
authorIdentifierMetadataComplete.head()

# How many author entries have a value in the Author Identifier field but not in the
# Author Identifier Scheme field?
#
# A field counts as missing when it is NaN or a blank string. The CSVs in this
# notebook are read with na_filter=False, which leaves blank fields as '' instead of
# NaN, so a NaN-only test (the `x != x` trick) would never flag a blank scheme.
#
# NOTE(review): authorMetadataLatestVersionDF is not defined in any cell visible
# here - confirm it is created before this cell runs on a fresh kernel.
authorIdentifiersMissingScheme = (
    authorMetadataLatestVersionDF
        .drop(columns=['authorAffiliation'])
        .loc[lambda df:
            df['authorIdentifier'].fillna('').astype(str).str.strip().ne('')
            & df['authorIdentifierScheme'].fillna('').astype(str).str.strip().eq('')]
        .reset_index(drop=True)
)

# NOTE(review): a bare expression is rendered only when it is the last line of its
# cell; move this preview to its own cell if it should be shown.
authorIdentifiersMissingScheme.head()

# How many author entries have a value in the Author Identifier Scheme field but not
# in the Author Identifier field?
#
# A field counts as missing when it is NaN or a blank string. The CSVs in this
# notebook are read with na_filter=False, which leaves blank fields as '' instead of
# NaN, so a NaN-only test (the `x != x` trick) would never flag a blank identifier.
#
# NOTE(review): authorMetadataLatestVersionDF is not defined in any cell visible
# here - confirm it is created before this cell runs on a fresh kernel.
authorIdentifierSchemeMissingIdentifier = (
    authorMetadataLatestVersionDF
        .drop(columns=['authorAffiliation'])
        .loc[lambda df:
            df['authorIdentifier'].fillna('').astype(str).str.strip().eq('')
            & df['authorIdentifierScheme'].fillna('').astype(str).str.strip().ne('')]
        .reset_index(drop=True)
)

# NOTE(review): a bare expression is rendered only when it is the last line of its
# cell; move this preview to its own cell if it should be shown.
authorIdentifierSchemeMissingIdentifier.head()

# Author entries that carry an ORCID: either the identifier itself contains
# "orcid.org", or the identifier scheme is recorded as "ORCID".
#
# The identifier test is a plain substring match (regex=False), so the "." in
# "orcid.org" is matched literally rather than as "any character". Missing
# identifiers are treated as empty strings so they simply do not match.
#
# NOTE(review): authorMetadataWithIdentifiers is not defined in any cell visible
# here - confirm it is created before this cell runs on a fresh kernel.
authorMetadataWithORCIDs = (
    authorMetadataWithIdentifiers
        .loc[lambda df:
            df['authorIdentifier'].fillna('').astype(str).str.contains('orcid.org', regex=False)
            | df['authorIdentifierScheme'].eq('ORCID')]
        .reset_index(drop=True)
)

authorMetadataWithORCIDs.head()