In [1]:
import pandas as pd

# Import, check and prepare data

Import CSV files that contain:
- The PIDs of all datasets published by many repositories that use the Dataverse software and which repositories published them
- Author field metadata entered in all of those datasets

In [2]:
# Load the Author field metadata and keep only the rows for datasets whose latest
# version was created in the 12-month window from 2023-08-26 through 2024-08-25.
# The upper bound is exclusive (start of 2024-08-26), so all of 2024-08-25 is included.

authorMetadataDF = (
    pd.read_csv(
        'author_2024.08.25-2024.08.30.csv',
        # Do not convert empty cells or NA markers to NaN
        na_filter=False,
        parse_dates=['dataset_publication_date', 'dataset_version_create_time'],
    )
    # Apply the date window first; it only needs dataset_version_create_time
    .query(
        'dataset_version_create_time >= "2023-08-26T00:00:00Z" and\
        dataset_version_create_time < "2024-08-26T00:00:00Z"')
    # Drop the columns that the cells below do not use
    .drop(columns=['dataset_pid', 'authorAffiliation'])
    .reset_index(drop=True)
)

authorMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_publication_date,dataset_version_number,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,https://doi.org/10.48370/OFD/8TTRLC,2022-05-13,1.0,2022-05-11 14:44:29+00:00,"Mammal Research Institute, Polish Academy of Sciences",,
1,https://doi.org/10.17026/dans-xk3-33d2,2023-04-06,1.0,2023-10-16 13:53:23+00:00,Portable Antiquities of the Netherlands,,
2,https://doi.org/10.17026/dans-26a-cq4r,2015-09-18,2.0,2022-02-18 19:29:27+00:00,H.J. Hesseling,,
3,https://hdl.handle.net/2014/29246,1995-01-01,2.0,2024-04-02 14:37:17+00:00,"Feynman, J.",,
4,https://hdl.handle.net/2014/29246,1995-01-01,2.0,2024-04-02 14:37:17+00:00,"Ruzmaikin, A.",,


In [3]:
# Load basic info about the datasets in each installation, keeping only datasets
# that also appear in authorMetadataDF and whose Dataverse JSON export was saved
datasetPidUrlList = authorMetadataDF['dataset_pid_url'].unique()

datasetPIDsDF = (
    pd.read_csv(
        'dataset_pids_from_most_known_dataverse_installations_2024.08.csv',
        # Do not convert empty cells or NA markers to NaN
        na_filter=False,
    )
    .query(
        '(dataverse_json_export_saved == True) and\
        dataset_pid_url in @datasetPidUrlList')
    # The export flag has served its purpose as a filter; dataset_pid is not used below
    .drop(columns=['dataset_pid', 'dataverse_json_export_saved'])
    .reset_index(drop=True)
)

datasetPIDsDF.head()

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type
0,CROSSDA,https://doi.org/10.23669/SIHKQ9,ffzg,University of Zagreb Faculty of Humanities and Social Sciences,ORGANIZATIONS_INSTITUTIONS
1,CROSSDA,https://doi.org/10.23669/2IFBSL,ffzg,University of Zagreb Faculty of Humanities and Social Sciences,ORGANIZATIONS_INSTITUTIONS
2,CROSSDA,https://doi.org/10.23669/JVNVNR,erudito,E-Rudito: An Advanced Online Education System for Smart Specialization and Jobs of the Future,RESEARCH_PROJECTS
3,CROSSDA,https://doi.org/10.23669/CLYQHG,respoc,(Re)building society: A longitudinal study of post-corona social recovery in Croatian general population (ReSPoC),RESEARCH_PROJECTS
4,CROSSDA,https://doi.org/10.23669/VO50RW,his,Croatian Election Studies,RESEARCH_PROJECTS


In [4]:
# Attach installation and collection info to every author metadata row by matching
# on the dataset's PID URL, so we know which installation published each dataset
datasetPIDsAndAuthorMetadataDF = (
    datasetPIDsDF
    .merge(authorMetadataDF, how='inner', on='dataset_pid_url')
    .drop(columns='dataset_version_number')
    .reset_index(drop=True)
)

print(f'Count of installations in datasetPIDsAndAuthorMetadataDF: {len(datasetPIDsAndAuthorMetadataDF["dataverse_installation_name"].unique())}')

Count of installations in datasetPIDsAndAuthorMetadataDF: 101


In [5]:
# Preview the first rows of the merged installation and author metadata
datasetPIDsAndAuthorMetadataDF.head()

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type,dataset_publication_date,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,CROSSDA,https://doi.org/10.23669/SIHKQ9,ffzg,University of Zagreb Faculty of Humanities and Social Sciences,ORGANIZATIONS_INSTITUTIONS,2020-12-02,2020-12-16 09:02:40+00:00,"Jokić-Begić, Nataša",ORCID,0000-0003-2597-535X
1,CROSSDA,https://doi.org/10.23669/SIHKQ9,ffzg,University of Zagreb Faculty of Humanities and Social Sciences,ORGANIZATIONS_INSTITUTIONS,2020-12-02,2020-12-16 09:02:40+00:00,"Lauri Korajlija, Anita",ORCID,0000-0001-8561-9870
2,CROSSDA,https://doi.org/10.23669/SIHKQ9,ffzg,University of Zagreb Faculty of Humanities and Social Sciences,ORGANIZATIONS_INSTITUTIONS,2020-12-02,2020-12-16 09:02:40+00:00,"Mikac, Una",ORCID,0000-0001-9369-6462
3,CROSSDA,https://doi.org/10.23669/2IFBSL,ffzg,University of Zagreb Faculty of Humanities and Social Sciences,ORGANIZATIONS_INSTITUTIONS,2022-01-12,2022-01-07 16:01:29+00:00,"Jelić, Margareta",ORCID,0000-0002-2478-0756
4,CROSSDA,https://doi.org/10.23669/JVNVNR,erudito,E-Rudito: An Advanced Online Education System for Smart Specialization and Jobs of the Future,RESEARCH_PROJECTS,2019-08-28,2021-07-19 13:23:04+00:00,"Vlašiček, Denis",ORCID,0000-0003-1925-6818


# Get metrics

What percentage of author metadata published in each Dataverse installation in the last 12 months (between 2023-08-26 and 2024-08-25) includes an ORCID?

In [6]:
# First, count the author metadata rows per installation. The rows were already limited
# to the 12-month window (2023-08-26 through 2024-08-25) when authorMetadataDF was loaded.

countOfAuthorMetadataPerInstallationDf = (
    datasetPIDsAndAuthorMetadataDF
        # sort=True orders the result alphabetically by installation name
        .groupby('dataverse_installation_name', sort=True)
        .size()
        .rename('count_of_author_metadata')
        .reset_index()
)

print(f'Count of installations: {len(countOfAuthorMetadataPerInstallationDf)}')

Count of installations: 101


In [7]:
# Preview the per-installation counts of author metadata
countOfAuthorMetadataPerInstallationDf.head()

Unnamed: 0,dataverse_installation_name,count_of_author_metadata
0,ACSS Dataverse,170
1,ADA Dataverse,1963
2,ARP,279
3,ASU Library Research Data Repository,277
4,AUSSDA Dataverse,2462


In [8]:
# Then count, per installation, the author metadata rows that include an ORCID
# (in the format we expect)

countOfORCIDsPerInstallationDf = (
    datasetPIDsAndAuthorMetadataDF
        # Keep only rows whose identifier scheme mentions "orcid" (any letter case) and whose
        # identifier starts with four dash-separated groups of four characters, optionally
        # preceded by an http(s)://orcid.org/ prefix. The pattern is a raw string so the
        # backslash escapes reach the regex engine without invalid-escape warnings.
        # NOTE(review): ".{4}" accepts any characters and str.match only anchors at the start
        # of the value, so this is a loose shape check rather than strict ORCID validation.
        .loc[lambda df: (
            df['authorIdentifierScheme'].str.contains('orcid', case=False)
            & df['authorIdentifier'].str.match(r'(https?://orcid\.org/)?.{4}-.{4}-.{4}-.{4}')
        )]
        # One row per installation, ordered by name; count() tallies non-null authorName values.
        # A plain column key replaces pd.Grouper(..., axis=0), whose axis keyword is deprecated.
        .groupby('dataverse_installation_name', sort=True)['authorName']
        .count()
        .rename('count_of_orcids_in_author_metadata')
        .reset_index()
)

print(f'Count of installations in countOfORCIDsPerInstallationDf: {len(countOfORCIDsPerInstallationDf)}')

Count of installations in countOfORCIDsPerInstallationDf: 95


In [9]:
countOfORCIDsPerInstallationDf.head()

Unnamed: 0,dataverse_installation_name,count_of_orcids_in_author_metadata
0,ACSS Dataverse,6
1,ADA Dataverse,708
2,ARP,122
3,ASU Library Research Data Repository,122
4,AUSSDA Dataverse,345


In [10]:
# Combine the two per-installation tables so each row shows the total count of author
# metadata and the count of that author metadata that includes a properly-formatted ORCID
authorMetadataAndORCIDsPerInstallation = (
    countOfAuthorMetadataPerInstallationDf
    .merge(countOfORCIDsPerInstallationDf, how='outer', on='dataverse_installation_name')
    # Installations with no matching ORCID rows get a count of 0 instead of NaN
    .fillna(0)
    # Store both count columns as integers
    .astype({
        'count_of_author_metadata': 'int32',
        'count_of_orcids_in_author_metadata': 'int32',
    })
    .sort_values('dataverse_installation_name')
    .reset_index(drop=True)
    # Share of author metadata with an ORCID, expressed as a fraction (not multiplied by 100)
    .assign(
        percentage_of_orcids_in_author_metadata=lambda df: (
            df['count_of_orcids_in_author_metadata'] / df['count_of_author_metadata']
        )
    )
)

authorMetadataAndORCIDsPerInstallation.head()

Unnamed: 0,dataverse_installation_name,count_of_author_metadata,count_of_orcids_in_author_metadata,percentage_of_orcids_in_author_metadata
0,ACSS Dataverse,170,6,0.035294
1,ADA Dataverse,1963,708,0.360672
2,ARP,279,122,0.437276
3,ASU Library Research Data Repository,277,122,0.440433
4,AUSSDA Dataverse,2462,345,0.14013


In [11]:
# Save the per-installation counts and ORCID share to a CSV file, leaving out the row index
authorMetadataAndORCIDsPerInstallation.to_csv(
    path_or_buf='authorMetadataAndORCIDsPerInstallation_2023.08.26-2024.08.25.csv',
    index=False,
)