In [61]:
import pandas as pd

# Import, check and prepare data

Import CSV files that contain:
- The PIDs of all datasets published by many repositories that use the Dataverse software and which repositories published them
- Author field metadata entered in all of those datasets

In [62]:
# Get basic info about datasets in repositories and remove ODISSEI Portal
datasetPIDsDF = (pd
    .read_csv(
        'dataset_pids_from_most_known_dataverse_installations_2023.08.csv',
        sep=',',
        na_filter=False)
    .query(
        '(dataverse_json_export_saved == True) and\
        (dataverse_installation_name != "ODISSEI_Portal")')
    .drop(columns=['dataverse_json_export_saved'])
    .reset_index(drop=True, inplace=False)
 )

datasetPIDsDF.head()

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type
0,UNB_Libraries_Dataverse,https://doi.org/10.25545/NVN79Z,snb,Service New Brunswick,ORGANIZATIONS_INSTITUTIONS
1,UNB_Libraries_Dataverse,https://doi.org/10.25545/YB60PU,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP
2,UNB_Libraries_Dataverse,https://doi.org/10.25545/WHD3KC,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP
3,UNB_Libraries_Dataverse,https://doi.org/10.25545/6TZWCG,Ir-CationBinding,Elucidation of Charge Contributions in Iridium-Chelated Hydrogen-Bonding Systems,JOURNALS
4,UNB_Libraries_Dataverse,https://doi.org/10.25545/NYM13B,IFMLAB,Integrated Forest Management Lab,RESEARCH_GROUP


In [63]:
# Get Author field metadata entered in all datasets in Dataverse repositories
# and remove datasets have have no author metadata
authorMetadataDF = (pd
    .read_csv(
        'author(citation)_2023.08.22-2023.08.28.csv',
        sep=',',
        na_filter=True,
        parse_dates=['dataset_publication_date', 'dataset_version_create_time'])
    .drop(columns=['dataset_pid', 'authorAffiliation'])
    .query('authorName != "N/A"')
    .reset_index(drop=True, inplace=False)
    )

authorMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_publication_date,dataset_version_number,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,https://doi.org/10.48370/OFD/8TTRLC,2022-05-13,1.0,2022-05-11T14:44:29Z,"Mammal Research Institute, Polish Academy of Sciences",,
1,https://doi.org/10.17026/dans-26a-cq4r,2015-09-18,2.0,2022-02-18T19:29:27Z,H.J. Hesseling,,
2,https://doi.org/10.17026/dans-zc2-mc2g,2016-12-31,1.0,2022-02-24T21:01:03Z,S. Moerman,,
3,https://doi.org/10.17026/dans-x9z-bmn6,2020-12-22,1.0,2022-02-14T05:11:03Z,G. Zielman,,
4,https://doi.org/10.7910/DVN/QD0V0H,2021-09-20,1.0,2021-07-29T19:45:43Z,"Master, Daniel M.",,


In [64]:
# Sanity check data by making sure count of datasets is the same in both dataframes
print(f'Number of datasets in datasetPIDsDF: {len(datasetPIDsDF)}')
datasetCountInAuthorMetadataDF = len(pd.unique(authorMetadataDF['dataset_pid_url']))
print(f'Number of datasets in authorMetadataDF: {datasetCountInAuthorMetadataDF}')

Number of datasets in datasetPIDsDF: 390401
Number of datasets in authorMetadataDF: 390401


In [65]:
# Join the datasetPIDsDF and the authorMetadataDF to add the installation column,
# so we know which installations published each dataset
datasetPIDsAndAuthorMetadataDF = (pd
    .merge(datasetPIDsDF, authorMetadataDF,
        how='inner',
        on=['dataset_pid_url'])
    .drop(columns=['dataset_version_number'])
    .reset_index(drop=True, inplace=False))

# Make sure the count of datasets in datasetPIDsAndAuthorMetadataDF
# is the same as in datasetPIDsDF: 390401
datasetCountInDatasetPIDsAndAuthorMetadataDF = len(pd.unique(datasetPIDsAndAuthorMetadataDF['dataset_pid_url']))
print(f'Number of datasets in datasetPIDsAndAuthorMetadataDF: {datasetCountInDatasetPIDsAndAuthorMetadataDF}')

# Get count of author metadata
print(f'Number of author metadata in datasetPIDsAndAuthorMetadataDF: {len(datasetPIDsAndAuthorMetadataDF)}')

# Get count of installations. Should by 84: the 85 installations in my dataset minus ODISSEI Portal
allInstallationsList = list(set(datasetPIDsAndAuthorMetadataDF['dataverse_installation_name'].tolist()))
countOfInstallations = len(allInstallationsList)
print(f'Number of installations in datasetPIDsAndAuthorMetadataDF: {countOfInstallations}')

Number of datasets in datasetPIDsAndAuthorMetadataDF: 390401
Number of author metadata in datasetPIDsAndAuthorMetadataDF: 667435
Number of installations in datasetPIDsAndAuthorMetadataDF: 84


In [66]:
datasetPIDsAndAuthorMetadataDF.head()

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type,dataset_publication_date,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,UNB_Libraries_Dataverse,https://doi.org/10.25545/NVN79Z,snb,Service New Brunswick,ORGANIZATIONS_INSTITUTIONS,2018-05-10,2022-03-23T19:49:10Z,Service New Brunswick,,
1,UNB_Libraries_Dataverse,https://doi.org/10.25545/YB60PU,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP,2018-06-20,2018-02-21T18:26:43Z,"Blight, Barry A",ORCID,0000-0003-1166-6206
2,UNB_Libraries_Dataverse,https://doi.org/10.25545/WHD3KC,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP,2019-06-23,2019-06-21T18:30:39Z,"Balonova, Barbora",,
3,UNB_Libraries_Dataverse,https://doi.org/10.25545/6TZWCG,Ir-CationBinding,Elucidation of Charge Contributions in Iridium-Chelated Hydrogen-Bonding Systems,JOURNALS,2021-05-18,2021-05-18T16:39:24Z,"Blight, Barry A",,
4,UNB_Libraries_Dataverse,https://doi.org/10.25545/NYM13B,IFMLAB,Integrated Forest Management Lab,RESEARCH_GROUP,2020-06-30,2020-11-30T22:44:12Z,"Chen, Yingbing",,


# Get metrics

In a given time frame, such as 12 months, what percentage of author metadata published in each Dataverse installation includes an ORCID?

In [67]:
publicationStartDate = '2022-01-01'
publicationEndDate = '2022-12-31'

In [68]:
countOfAuthorMetadataPerInstallationDf = (
    datasetPIDsAndAuthorMetadataDF
        .query(
            'dataset_version_create_time >= @publicationStartDate and\
            dataset_version_create_time <= @publicationEndDate and\
            authorName != "N/A"',
            engine='python')
    .assign(dataset_version_create_time_dt=lambda datasetPIDsAndAuthorMetadataDF: pd.to_datetime(
        datasetPIDsAndAuthorMetadataDF['dataset_version_create_time']))
    .assign(dataset_version_create_year=lambda datasetPIDsAndAuthorMetadataDF: pd.to_datetime(
        datasetPIDsAndAuthorMetadataDF['dataset_version_create_time_dt']).dt.year)
    [[
        'dataverse_installation_name',
        'dataset_version_create_year',
        'authorName',
        'authorIdentifierScheme',
        'authorIdentifier'
    ]]
    # Within each installation, drop duplicate author metadata. This will mitigate the effect of
    # hundreds or thousands of datasets being published with the same author metadata,
    # such as during a dataset migration or bulk publishing using APIs
    .drop_duplicates(
        subset=[
            'authorName',
            'authorIdentifierScheme',
            'authorIdentifier'],
        keep='first')
    [[
        'dataverse_installation_name',
        'dataset_version_create_year'
    ]]

    .reset_index(drop=True, inplace=False)

    # Group by count of rows for each year-month
    .groupby(pd.Grouper(key='dataverse_installation_name', axis=0)).count()
    .rename(columns={'dataset_version_create_year': 'count_of_author_metadata'})
    .reset_index(drop=False, inplace=False)
)

countOfAuthorMetadataPerInstallationDf.head()

Unnamed: 0,dataverse_installation_name,count_of_author_metadata
0,UNB_Libraries_Dataverse,17
1,Harvard_Dataverse,10972
2,PAPYRUS,23
3,Dataverse_e-cienciaDatos,124
4,NIOZ_Dataverse,36


In [70]:
countOfORCIDsPerInstallationDf = (
    datasetPIDsAndAuthorMetadataDF
        .query(
            'dataset_version_create_time >= @publicationStartDate and\
            dataset_version_create_time <= @publicationEndDate and\
            (authorIdentifier.str.contains("orcid", case=False) or\
            authorIdentifier.str.match(".{4}-.{4}-.{4}-.{4}") or\
            (authorIdentifierScheme == "ORCID" and\
            authorIdentifier == authorIdentifier))',
            engine='python')
    .drop(columns=[
        'dataset_pid_url',
        'dataverse_collection_alias',
        'dataverse_collection_name',
        'dataverse_collection_type'])
    .assign(dataset_version_create_time_dt=lambda datasetPIDsAndAuthorMetadataDF: pd.to_datetime(
        datasetPIDsAndAuthorMetadataDF['dataset_version_create_time']))
    .assign(dataset_version_create_year=lambda datasetPIDsAndAuthorMetadataDF: pd.to_datetime(
        datasetPIDsAndAuthorMetadataDF['dataset_version_create_time_dt']).dt.year)
    [[
        'dataverse_installation_name',
        'dataset_version_create_year',
        'authorName',
        'authorIdentifierScheme',
        'authorIdentifier'
    ]]

    # Within each installation, drop duplicate author metadata. This will mitigate the effect of
    # hundreds or thousands of datasets being published with the same author metadata,
    # such as during a dataset migration or bulk publishing using APIs
    .drop_duplicates(
        subset=[
            'authorName',
            'authorIdentifierScheme',
            'authorIdentifier'],
        keep='first')
    [[
        'dataverse_installation_name',
        'dataset_version_create_year'
    ]]

    .reset_index(drop=True, inplace=False)

    # Group by count of rows for each year-month
    .groupby(pd.Grouper(key='dataverse_installation_name', axis=0)).count()
    .rename(columns={'dataset_version_create_year': 'count_of_orcids'})
    .reset_index(drop=False, inplace=False)
)

countOfORCIDsPerInstallationDf.head()

Unnamed: 0,dataverse_installation_name,count_of_orcids
0,UNB_Libraries_Dataverse,2
1,Harvard_Dataverse,4004
2,PAPYRUS,20
3,Dataverse_e-cienciaDatos,89
4,NIOZ_Dataverse,27


In [71]:
# Merge the two dataframes
countsAndPercentsOfAuthorMetadataAndORCIDsPerInstallationDf = (pd
     .merge(countOfAuthorMetadataPerInstallationDf, countOfORCIDsPerInstallationDf,
        how='outer',
        on=['dataverse_installation_name'])
     .fillna(0)
     # Make dataverse_installation_name the index column
     .set_index('dataverse_installation_name', inplace=False)
     # Make sure all non-indexed columns are integers
     .astype('int32')
     .reset_index(drop=False, inplace=False)
     )

# Add column for percentage of ORCIDs
countsAndPercentsOfAuthorMetadataAndORCIDsPerInstallationDf['percentage_of_orcids'] = (
        (countsAndPercentsOfAuthorMetadataAndORCIDsPerInstallationDf['count_of_orcids']
         / countsAndPercentsOfAuthorMetadataAndORCIDsPerInstallationDf['count_of_author_metadata'])
        * 100
)

countsAndPercentsOfAuthorMetadataAndORCIDsPerInstallationDf.head()

Unnamed: 0,dataverse_installation_name,count_of_author_metadata,count_of_orcids,percentage_of_orcids
0,UNB_Libraries_Dataverse,17,2,11.764706
1,Harvard_Dataverse,10972,4004,36.492891
2,PAPYRUS,23,20,86.956522
3,Dataverse_e-cienciaDatos,124,89,71.774194
4,NIOZ_Dataverse,36,27,75.0


In [73]:
# Export the dataframe as a CSV file
countsAndPercentsOfAuthorMetadataAndORCIDsPerInstallationDf.to_csv(
    'countsAndPercentsOfAuthorMetadataAndORCIDsPerInstallation.csv',
    index=False)