In [1]:
import pandas as pd

# Import, check and prepare data

Import CSV files that contain:
- The PIDs of all datasets published by many repositories that use the Dataverse software, along with the repository that published each dataset
- Author field metadata entered in all of those datasets

In [26]:
# Load the Author field metadata of datasets in Dataverse repositories and keep
# only the rows whose latest dataset version was created in the 12 months from
# 2023-08-26 up to, but not including, 2024-08-26 (UTC).
# NOTE: the `authorName != "N/A"` condition below is commented out, so rows
# with that author name are NOT removed here.
authorMetadataDF = (
    pd.read_csv(
        'author_2024.08.25-2024.08.30.csv',
        # Do not turn empty / NA-like cells into NaN
        na_filter=False,
        parse_dates=['dataset_publication_date', 'dataset_version_create_time'],
    )
    .drop(columns=['dataset_pid', 'authorAffiliation'])
    .query(
        # 'authorName != "N/A" and\
        'dataset_version_create_time >= "2023-08-26T00:00:00Z" and\
        dataset_version_create_time < "2024-08-26T00:00:00Z"')
    .reset_index(drop=True)
)

authorMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_publication_date,dataset_version_number,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,https://doi.org/10.17026/dans-xk3-33d2,2023-04-06,1.0,2023-10-16 13:53:23+00:00,Portable Antiquities of the Netherlands,,
1,https://hdl.handle.net/2014/29246,1995-01-01,2.0,2024-04-02 14:37:17+00:00,"Feynman, J.",,
2,https://hdl.handle.net/2014/29246,1995-01-01,2.0,2024-04-02 14:37:17+00:00,"Ruzmaikin, A.",,
3,https://hdl.handle.net/2014/29246,1995-01-01,2.0,2024-04-02 14:37:17+00:00,"Smith, E. J.",,
4,https://doi.org/10.17026/dans-z7f-pft2,2023-08-17,1.0,2023-10-16 19:05:45+00:00,Portable Antiquities of the Netherlands,,


In [27]:
# Collect the distinct PID URLs present in authorMetadataDF, then load the basic
# dataset info and keep only the datasets whose Dataverse JSON export was saved
# and whose PID URL is among those collected.
datasetPidUrlList = authorMetadataDF['dataset_pid_url'].unique()

datasetPIDsDF = (
    pd.read_csv(
        'dataset_pids_from_most_known_dataverse_installations_2024.08.csv',
        # Do not turn empty / NA-like cells into NaN
        na_filter=False,
    )
    .query(
        '(dataverse_json_export_saved == True) and\
        dataset_pid_url in @datasetPidUrlList')
    .drop(columns=['dataverse_json_export_saved', 'dataset_pid'])
    .reset_index(drop=True)
)

datasetPIDsDF.head()

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type
0,CROSSDA,https://doi.org/10.23669/TTB7HV,unizd,University of Zadar,ORGANIZATIONS_INSTITUTIONS
1,CROSSDA,https://doi.org/10.23669/JBIXC5,his,Croatian Election Studies,RESEARCH_PROJECTS
2,CROSSDA,https://doi.org/10.23669/RMVRHL,his,Croatian Election Studies,RESEARCH_PROJECTS
3,CROSSDA,https://doi.org/10.23669/ZTAA3M,his,Croatian Election Studies,RESEARCH_PROJECTS
4,CROSSDA,https://doi.org/10.23669/RN40FI,his,Croatian Election Studies,RESEARCH_PROJECTS


In [64]:
# Inner-join the dataset info onto the author metadata by dataset PID URL so
# every author metadata row carries the installation that published its dataset.
datasetPIDsAndAuthorMetadataDF = (
    datasetPIDsDF
    .merge(authorMetadataDF, how='inner', on='dataset_pid_url')
    .drop(columns='dataset_version_number')
    .reset_index(drop=True)
)

print(f'Count of installations in datasetPIDsAndAuthorMetadataDF: {datasetPIDsAndAuthorMetadataDF["dataverse_installation_name"].nunique(dropna=False)}')

Count of installations in datasetPIDsAndAuthorMetadataDF: 98


In [47]:
# Preview the first five rows of the joined dataframe
datasetPIDsAndAuthorMetadataDF.head(n=5)

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type,dataset_publication_date,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,CROSSDA,https://doi.org/10.23669/TTB7HV,unizd,University of Zadar,ORGANIZATIONS_INSTITUTIONS,2024-06-26,2023-11-27 15:15:40+00:00,"Cepić, Dražen",ORCID,0000-0003-4544-5778
1,CROSSDA,https://doi.org/10.23669/TTB7HV,unizd,University of Zadar,ORGANIZATIONS_INSTITUTIONS,2024-06-26,2023-11-27 15:15:40+00:00,"Škacan, Mislav",ORCID,0000-0002-8432-201X
2,CROSSDA,https://doi.org/10.23669/TTB7HV,unizd,University of Zadar,ORGANIZATIONS_INSTITUTIONS,2024-06-26,2023-11-27 15:15:40+00:00,"Puzek, Ivan",ORCID,0000-0001-7545-9578
3,CROSSDA,https://doi.org/10.23669/TTB7HV,unizd,University of Zadar,ORGANIZATIONS_INSTITUTIONS,2024-06-26,2023-11-27 15:15:40+00:00,"Ančić, Branko",ORCID,0000-0003-1438-2647
4,CROSSDA,https://doi.org/10.23669/TTB7HV,unizd,University of Zadar,ORGANIZATIONS_INSTITUTIONS,2024-06-26,2023-11-27 15:15:40+00:00,"Kanski, Danijel",ORCID,0009-0004-3846-3550


# Get metrics

What percentage of author metadata published in each Dataverse installation includes an ORCID?

In [60]:
# Count the author metadata rows of each installation. groupby() orders the
# groups by installation name, so the result comes out sorted ascending.
countOfAuthorMetadataPerInstallationDf = (
    datasetPIDsAndAuthorMetadataDF
        .groupby('dataverse_installation_name')
        .size()
        .to_frame('count_of_author_metadata')
        .reset_index()
)
print(f'Count of installations: {countOfAuthorMetadataPerInstallationDf.shape[0]}')

Count of installations: 98


In [61]:
# Count, per installation, the author metadata rows that appear to carry an ORCID.
# A row is counted when at least one of these holds:
#   1. authorIdentifier contains "orcid" (case-insensitive), e.g. an ORCID URL;
#   2. authorIdentifier starts with four dash-separated groups of four characters;
#   3. authorIdentifierScheme is "ORCID" and authorIdentifier is not blank.
#
# The source CSVs are read with na_filter=False, so a missing identifier is an
# empty string rather than NaN. The previous `authorIdentifier == authorIdentifier`
# NaN self-test was therefore always true and let rows with the ORCID scheme but
# no identifier through; condition 3 now checks for blank text explicitly.
countOfORCIDsPerInstallationDf = (
    datasetPIDsAndAuthorMetadataDF
        .loc[lambda df: (
            df['authorIdentifier'].str.contains('orcid', case=False, na=False)
            | df['authorIdentifier'].str.match('.{4}-.{4}-.{4}-.{4}', na=False)
            | (
                (df['authorIdentifierScheme'] == 'ORCID')
                & df['authorIdentifier'].notna()
                & (df['authorIdentifier'].str.strip() != '')
            )
        )]
        # Plain column-name grouping replaces pd.Grouper(..., axis=0), whose
        # `axis` argument is deprecated. Groups come out sorted by name.
        .groupby('dataverse_installation_name')['authorName']
        .count()
        .rename('count_of_orcids_in_author_metadata')
        .reset_index()
)

print(f'Count of installations: {len(countOfORCIDsPerInstallationDf)}')

Count of installations: 97


In [15]:
# Combine the per-installation totals with the per-installation ORCID counts and
# compute the percentage of author metadata rows that include an ORCID.
# NOTE(review): the "2022" suffix in this variable name does not match the
# 2023-08-26 to 2024-08-26 window used when loading the data; confirm before
# renaming, since the export cell uses the same name.
authorMetadataAndORCIDsPerInstallation2022 = (pd
    .merge(countOfAuthorMetadataPerInstallationDf, countOfORCIDsPerInstallationDf,
        how='outer',
        on=['dataverse_installation_name'])
    # The ORCID-count dataframe names its column
    # 'count_of_orcids_in_author_metadata', while this cell and the exported CSV
    # use 'count_of_orcids'. Without this rename the percentage calculation
    # raises KeyError on a fresh run. rename() ignores missing labels, so this is
    # a no-op if the column already has the short name.
    .rename(columns={'count_of_orcids_in_author_metadata': 'count_of_orcids'})
    # Installations with no ORCID rows are absent from the ORCID counts
    .fillna(0)
    # Index by installation name so astype() only touches the count columns
    .set_index('dataverse_installation_name')
    .astype('int32')
    .reset_index()
    .sort_values('dataverse_installation_name')
    .reset_index(drop=True)
    # Add column for percentage of ORCIDs
    .assign(
        percentage_of_orcids=lambda df: (
            (df['count_of_orcids'] / df['count_of_author_metadata']) * 100
        )
    )
)

authorMetadataAndORCIDsPerInstallation2022.head()

Unnamed: 0,dataverse_installation_name,count_of_author_metadata,count_of_orcids,percentage_of_orcids
0,ACSS_Dataverse,5,3,60.0
1,ADA_Dataverse,109,58,53.211009
2,ASU_Library_Research_Data_Repository,62,32,51.612903
3,AUSSDA_Dataverse,111,52,46.846847
4,Abacus,207,0,0.0


In [16]:
# Write the per-installation summary to a CSV file, omitting the row index
authorMetadataAndORCIDsPerInstallation2022.to_csv('authorMetadataAndORCIDsPerInstallation2022.csv', index=False)