In [1]:
import pandas as pd

# Import, check and prepare data

Import CSV files that contain:
- The PIDs of all datasets published by many repositories that use the Dataverse software and which repositories published them
- Author field metadata entered in all of those datasets

In [2]:
# Load the Author field metadata of datasets in Dataverse repositories and keep only rows whose
# latest dataset version was created in the last 12 months (between 2023-08-26 and 2024-08-25, UTC)

authorMetadataDF = (
    pd.read_csv(
        'author_2024.08.25-2024.08.30.csv',
        # Turn off NA detection so empty cells are not converted to NaN
        na_filter=False,
        parse_dates=['dataset_publication_date', 'dataset_version_create_time'],
    )
    # Drop columns that the cells below do not use
    .drop(['dataset_pid', 'authorAffiliation'], axis='columns')
    # Half-open window: from 2023-08-26 up to, but not including, 2024-08-26
    .query(
        'dataset_version_create_time >= "2023-08-26T00:00:00Z" and\
        dataset_version_create_time < "2024-08-26T00:00:00Z"')
    .reset_index(drop=True)
)

authorMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_publication_date,dataset_version_number,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,https://doi.org/10.17026/dans-xk3-33d2,2023-04-06,1.0,2023-10-16 13:53:23+00:00,Portable Antiquities of the Netherlands,,
1,https://hdl.handle.net/2014/29246,1995-01-01,2.0,2024-04-02 14:37:17+00:00,"Feynman, J.",,
2,https://hdl.handle.net/2014/29246,1995-01-01,2.0,2024-04-02 14:37:17+00:00,"Ruzmaikin, A.",,
3,https://hdl.handle.net/2014/29246,1995-01-01,2.0,2024-04-02 14:37:17+00:00,"Smith, E. J.",,
4,https://doi.org/10.17026/dans-z7f-pft2,2023-08-17,1.0,2023-10-16 19:05:45+00:00,Portable Antiquities of the Netherlands,,


In [3]:
# Load basic info about the datasets in each installation, keeping only datasets that also appear in authorMetadataDF
datasetPidUrlList = authorMetadataDF['dataset_pid_url'].unique()

datasetPIDsDF = (
    pd.read_csv(
        'dataset_pids_from_most_known_dataverse_installations_2024.08.csv',
        # Turn off NA detection so empty cells are not converted to NaN
        na_filter=False,
    )
    # Keep rows flagged as having a saved JSON export whose PID URL is in the author metadata
    .query(
        '(dataverse_json_export_saved == True) and\
        dataset_pid_url in @datasetPidUrlList')
    .drop(['dataverse_json_export_saved', 'dataset_pid'], axis='columns')
    .reset_index(drop=True)
)

datasetPIDsDF.head()

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type
0,CROSSDA,https://doi.org/10.23669/TTB7HV,unizd,University of Zadar,ORGANIZATIONS_INSTITUTIONS
1,CROSSDA,https://doi.org/10.23669/JBIXC5,his,Croatian Election Studies,RESEARCH_PROJECTS
2,CROSSDA,https://doi.org/10.23669/RMVRHL,his,Croatian Election Studies,RESEARCH_PROJECTS
3,CROSSDA,https://doi.org/10.23669/ZTAA3M,his,Croatian Election Studies,RESEARCH_PROJECTS
4,CROSSDA,https://doi.org/10.23669/RN40FI,his,Croatian Election Studies,RESEARCH_PROJECTS


In [4]:
# Attach each dataset's installation (and collection) info to its author metadata rows by
# inner-joining the two dataframes on the dataset PID URL
datasetPIDsAndAuthorMetadataDF = (
    datasetPIDsDF
    .merge(authorMetadataDF, how='inner', on='dataset_pid_url')
    .drop('dataset_version_number', axis='columns')
    .reset_index(drop=True)
)

# dropna=False so that the count covers every distinct value in the column
print(f'Count of installations in datasetPIDsAndAuthorMetadataDF: {datasetPIDsAndAuthorMetadataDF["dataverse_installation_name"].nunique(dropna=False)}')

Count of installations in datasetPIDsAndAuthorMetadataDF: 98


In [5]:
# Preview the first five rows of the joined dataframe
datasetPIDsAndAuthorMetadataDF.head(n=5)

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type,dataset_publication_date,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,CROSSDA,https://doi.org/10.23669/TTB7HV,unizd,University of Zadar,ORGANIZATIONS_INSTITUTIONS,2024-06-26,2023-11-27 15:15:40+00:00,"Cepić, Dražen",ORCID,0000-0003-4544-5778
1,CROSSDA,https://doi.org/10.23669/TTB7HV,unizd,University of Zadar,ORGANIZATIONS_INSTITUTIONS,2024-06-26,2023-11-27 15:15:40+00:00,"Škacan, Mislav",ORCID,0000-0002-8432-201X
2,CROSSDA,https://doi.org/10.23669/TTB7HV,unizd,University of Zadar,ORGANIZATIONS_INSTITUTIONS,2024-06-26,2023-11-27 15:15:40+00:00,"Puzek, Ivan",ORCID,0000-0001-7545-9578
3,CROSSDA,https://doi.org/10.23669/TTB7HV,unizd,University of Zadar,ORGANIZATIONS_INSTITUTIONS,2024-06-26,2023-11-27 15:15:40+00:00,"Ančić, Branko",ORCID,0000-0003-1438-2647
4,CROSSDA,https://doi.org/10.23669/TTB7HV,unizd,University of Zadar,ORGANIZATIONS_INSTITUTIONS,2024-06-26,2023-11-27 15:15:40+00:00,"Kanski, Danijel",ORCID,0009-0004-3846-3550


# Get metrics

What percentage of author metadata published in each Dataverse installation in the last 12 months (between 2023-08-26 and 2024-08-25) includes an ORCID?

In [6]:
# First we'll count the author metadata rows in each installation. The rows were already
# limited to the last 12 months when authorMetadataDF was filtered.

countOfAuthorMetadataPerInstallationDf = (
    datasetPIDsAndAuthorMetadataDF
        # One group per installation; groupby returns the groups sorted by installation name
        .groupby('dataverse_installation_name')
        .size()
        .rename('count_of_author_metadata')
        .reset_index()
)

print(f'Count of installations: {countOfAuthorMetadataPerInstallationDf.shape[0]}')

Count of installations: 98


In [7]:
# Preview the first five per-installation counts
countOfAuthorMetadataPerInstallationDf.head(n=5)

Unnamed: 0,dataverse_installation_name,count_of_author_metadata
0,ACSS Dataverse,30
1,ADA Dataverse,138
2,ARP,249
3,ASU Library Research Data Repository,75
4,AUSSDA Dataverse,233


In [8]:
# Then we'll get a count of author metadata in each installation that includes an ORCID (in the format we expect)

# Pattern for an ORCID, optionally prefixed with http(s)://orcid.org/.
# It is a raw string so the regex backslash is not read as an (invalid) Python string escape.
# NOTE(review): each "." matches any character and str.match anchors only at the start of the
# value, so this is a loose format check rather than strict ORCID validation. Kept as-is so
# the counts stay comparable with earlier runs.
orcidPattern = r'(https?://orcid\.org/)?.{4}-.{4}-.{4}-.{4}'

# Rows whose identifier scheme mentions "orcid" (case-insensitive)
hasOrcidScheme = (
    datasetPIDsAndAuthorMetadataDF['authorIdentifierScheme']
        .str.contains('orcid', case=False)
)

# Rows whose identifier starts with something shaped like an ORCID
hasOrcidLikeIdentifier = (
    datasetPIDsAndAuthorMetadataDF['authorIdentifier']
        .str.match(orcidPattern)
)

countOfORCIDsPerInstallationDf = (
    datasetPIDsAndAuthorMetadataDF
        # Count only author metadata that includes ORCIDs, and keep only the two columns needed for counting
        .loc[
            hasOrcidScheme & hasOrcidLikeIdentifier,
            ['dataverse_installation_name', 'authorName']
        ]
        # Count rows for each dataverse_installation_name; sort=True orders the result by installation name
        .groupby('dataverse_installation_name', sort=True)
        .count()
        .rename(columns={'authorName': 'count_of_orcids_in_author_metadata'})
        .reset_index()
)

print(f'Count of installations in countOfORCIDsPerInstallationDf: {len(countOfORCIDsPerInstallationDf)}')

Count of installations in countOfORCIDsPerInstallationDf: 91


In [9]:
# Preview the first five per-installation ORCID counts
countOfORCIDsPerInstallationDf.head(n=5)

Unnamed: 0,dataverse_installation_name,count_of_orcids_in_author_metadata
0,ACSS Dataverse,1
1,ADA Dataverse,51
2,ARP,106
3,ASU Library Research Data Repository,55
4,AUSSDA Dataverse,135


In [10]:
# Now we'll combine the two dataframes so each installation has its count of author metadata next to
# the count of that author metadata that includes a "properly-formatted" ORCID
authorMetadataAndORCIDsPerInstallation = (
    countOfAuthorMetadataPerInstallationDf
    # Outer join keeps installations that are missing from the ORCID counts
    .merge(
        countOfORCIDsPerInstallationDf,
        how='outer',
        on='dataverse_installation_name')
    # Installations with no ORCID count get 0 instead of NaN
    .fillna(0)
    # Move the installation name into the index so that only the count columns are cast to integers
    .set_index('dataverse_installation_name')
    .astype('int32')
    .sort_index(ascending=True)
    .reset_index()
    # Share of author metadata with an ORCID. Despite the column name, this is stored
    # as a fraction between 0 and 1, not multiplied by 100.
    .assign(
        percentage_of_orcids_in_author_metadata=lambda counts: (
            counts['count_of_orcids_in_author_metadata']
            / counts['count_of_author_metadata']))
)

authorMetadataAndORCIDsPerInstallation.head()

Unnamed: 0,dataverse_installation_name,count_of_author_metadata,count_of_orcids_in_author_metadata,percentage_of_orcids_in_author_metadata
0,ACSS Dataverse,30,1,0.033333
1,ADA Dataverse,138,51,0.369565
2,ARP,249,106,0.425703
3,ASU Library Research Data Repository,75,55,0.733333
4,AUSSDA Dataverse,233,135,0.579399


In [11]:
# Save the per-installation summary as a CSV file, leaving out the row index
authorMetadataAndORCIDsPerInstallation.to_csv(
    path_or_buf='authorMetadataAndORCIDsPerInstallation_2023.08.26-2024.08.25.csv',
    index=False,
)