In [None]:
import pandas as pd

# Import, check and prepare data

Import CSV files that contain:
- The PIDs of all datasets published by many repositories that use the Dataverse software, along with the repository that published each dataset
- Author field metadata entered in all of those datasets

In [None]:
# Load the dataset PID listing, keeping only datasets whose Dataverse JSON export
# was saved and that do not belong to the ODISSEI Portal installation
datasetPIDsDF = (
    pd.read_csv(
        'dataset_pids_from_most_known_dataverse_installations_2023.08.csv',
        na_filter=False,
    )
    .query(
        '(dataverse_json_export_saved == True) and'
        '        (dataverse_installation_name != "ODISSEI_Portal")'
    )
    # Every remaining row has the export flag set, so the column is dropped
    .drop(columns='dataverse_json_export_saved')
    .reset_index(drop=True)
)

datasetPIDsDF.head()

In [None]:
# Get Author field metadata entered in all datasets in Dataverse repositories
# and remove rows that have no author name.
#
# Because the CSV is read with na_filter=True, pandas turns the literal text "N/A"
# (one of its default missing-value markers) into NaN while parsing. A string
# comparison such as authorName != "N/A" therefore never removes anything, so
# rows without an author name are dropped with dropna() instead.
authorMetadataDF = (
    pd.read_csv(
        'author(citation)_2023.08.22-2023.08.28.csv',
        sep=',',
        na_filter=True,
        parse_dates=['dataset_publication_date', 'dataset_version_create_time'])
    # These two columns are not used by the rest of the notebook
    .drop(columns=['dataset_pid', 'authorAffiliation'])
    .dropna(subset=['authorName'])
    .reset_index(drop=True)
)

authorMetadataDF.head()

In [None]:
# Sanity check: the row count of datasetPIDsDF is printed next to the number of
# distinct dataset_pid_url values in authorMetadataDF so the two can be compared
datasetCountInAuthorMetadataDF = len(authorMetadataDF['dataset_pid_url'].unique())

print(f'Number of datasets in datasetPIDsDF: {len(datasetPIDsDF)}')
print(f'Number of datasets in authorMetadataDF: {datasetCountInAuthorMetadataDF}')

In [None]:
# Attach the installation information from datasetPIDsDF to every author metadata
# row by joining the two dataframes on the dataset PID URL
datasetPIDsAndAuthorMetadataDF = (
    datasetPIDsDF
    .merge(authorMetadataDF, how='inner', on='dataset_pid_url')
    .drop(columns='dataset_version_number')
    .reset_index(drop=True)
)

# The number of distinct datasets after the join is expected to equal
# the number in datasetPIDsDF: 390401
datasetCountInDatasetPIDsAndAuthorMetadataDF = len(
    datasetPIDsAndAuthorMetadataDF['dataset_pid_url'].unique())
print(f'Number of datasets in datasetPIDsAndAuthorMetadataDF: {datasetCountInDatasetPIDsAndAuthorMetadataDF}')

# Each row of the joined dataframe is one author metadata entry
print(f'Number of author metadata in datasetPIDsAndAuthorMetadataDF: {len(datasetPIDsAndAuthorMetadataDF)}')

# Distinct installations. Should be 84: the 85 installations in my dataset minus ODISSEI Portal
allInstallationsList = list(
    set(datasetPIDsAndAuthorMetadataDF['dataverse_installation_name'].tolist()))
countOfInstallations = len(allInstallationsList)
print(f'Number of installations in datasetPIDsAndAuthorMetadataDF: {countOfInstallations}')

In [None]:
# Preview the first rows of the joined dataframe
datasetPIDsAndAuthorMetadataDF.head()

# Explore data

In a given time frame, such as 12 months, what percentage of author metadata published in each Dataverse installation includes an ORCID?

In [None]:
# Bounds of the time frame to analyze, written as ISO 8601 date strings.
# The query cells further down compare dataset_version_create_time against them.
publicationStartDate = '2022-01-01'
publicationEndDate = '2022-12-31'

In [None]:
# Count the distinct author metadata entries published by each installation
# during the time frame given by publicationStartDate and publicationEndDate.

# Comparing a timestamp with a bare date string such as '2022-12-31' compares it with
# midnight at the START of that day, so "<= publicationEndDate" would leave out
# everything created later on the final day. To include the whole final day, filter on
# "earlier than the day after publicationEndDate" instead. The bound is kept as a
# string so the comparison behaves the same for timezone-aware and naive timestamps.
# Assumes publicationEndDate is a date without a time-of-day component.
publicationEndExclusive = (
    pd.Timestamp(publicationEndDate) + pd.Timedelta(days=1)
).strftime('%Y-%m-%d')

allAuthorMetadataDf = (
    datasetPIDsAndAuthorMetadataDF
    .query(
        'dataset_version_create_time >= @publicationStartDate and\
        dataset_version_create_time < @publicationEndExclusive and\
        authorName != "N/A"',
        engine='python')
    # Year in which the dataset version was created; used below as the column to count
    .assign(dataset_version_create_year=lambda df: pd.to_datetime(
        df['dataset_version_create_time']).dt.year)
    [[
        'dataverse_installation_name',
        'dataset_version_create_year',
        'authorName',
        'authorIdentifierScheme',
        'authorIdentifier'
    ]]
    # Drop duplicate author metadata across the whole time frame. This will mitigate
    # the effect of hundreds or thousands of datasets being published with the same
    # author metadata in a short time frame, such as during a dataset migration or
    # bulk publishing using APIs.
    # NOTE(review): duplicates are detected without looking at the installation, so
    # identical author metadata published by two installations is counted only for
    # the installation whose row comes first - confirm this is intended.
    .drop_duplicates(
        subset=[
            'authorName',
            'authorIdentifierScheme',
            'authorIdentifier'],
        keep='first')
    [[
        'dataverse_installation_name',
        'dataset_version_create_year'
    ]]
    # Count the remaining rows for each installation. A plain column key is used
    # because the axis keyword of pd.Grouper is deprecated.
    .groupby('dataverse_installation_name')
    .count()
    .rename(columns={'dataset_version_create_year': 'count_of_author_metadata'})
    .reset_index(drop=False)
)

allAuthorMetadataDf.head()

In [None]:
# Number of rows in allAuthorMetadataDf, which has one row per installation
print(len(allAuthorMetadataDf))

In [None]:
# Count the distinct author metadata entries that include an ORCID, for each
# installation, during the time frame given by publicationStartDate and
# publicationEndDate.

# Comparing a timestamp with a bare date string such as '2022-12-31' compares it with
# midnight at the START of that day, so "<= publicationEndDate" would leave out
# everything created later on the final day. To include the whole final day, filter on
# "earlier than the day after publicationEndDate" instead. The bound is kept as a
# string so the comparison behaves the same for timezone-aware and naive timestamps.
# Assumes publicationEndDate is a date without a time-of-day component.
publicationEndExclusive = (
    pd.Timestamp(publicationEndDate) + pd.Timedelta(days=1)
).strftime('%Y-%m-%d')

orcidsDf = (
    datasetPIDsAndAuthorMetadataDF
    # A row is treated as having an ORCID when any of these holds:
    #   - the identifier contains "orcid" (case-insensitive)
    #   - the identifier starts with four hyphen-separated groups of four characters
    #     (the pattern accepts any characters, not only digits)
    #   - the identifier scheme is "ORCID" and an identifier value is present
    # na=False makes rows with a missing identifier evaluate to False explicitly
    # instead of carrying NaN through the boolean expression.
    .query(
        'dataset_version_create_time >= @publicationStartDate and\
        dataset_version_create_time < @publicationEndExclusive and\
        (authorIdentifier.str.contains("orcid", case=False, na=False) or\
        authorIdentifier.str.match(".{4}-.{4}-.{4}-.{4}", na=False) or\
        (authorIdentifierScheme == "ORCID" and\
        authorIdentifier.notna()))',
        engine='python')
    # Year in which the dataset version was created; used below as the column to count
    .assign(dataset_version_create_year=lambda df: pd.to_datetime(
        df['dataset_version_create_time']).dt.year)
    [[
        'dataverse_installation_name',
        'dataset_version_create_year',
        'authorName',
        'authorIdentifierScheme',
        'authorIdentifier'
    ]]
    # Drop duplicate author metadata across the whole time frame. This will mitigate
    # the effect of hundreds or thousands of datasets being published with the same
    # author metadata in a short time frame, such as during a dataset migration or
    # bulk publishing using APIs.
    # NOTE(review): duplicates are detected without looking at the installation, so
    # identical author metadata published by two installations is counted only for
    # the installation whose row comes first - confirm this is intended.
    .drop_duplicates(
        subset=[
            'authorName',
            'authorIdentifierScheme',
            'authorIdentifier'],
        keep='first')
    [[
        'dataverse_installation_name',
        'dataset_version_create_year'
    ]]
    # Count the remaining rows for each installation. A plain column key is used
    # because the axis keyword of pd.Grouper is deprecated.
    .groupby('dataverse_installation_name')
    .count()
    .rename(columns={'dataset_version_create_year': 'count_of_orcids'})
    .reset_index(drop=False)
)

orcidsDf.head()

In [None]:
# Combine the per-installation counts into a single table. An installation that
# appears in only one of the two dataframes gets 0 for the count it is missing.
allAuthorMetadataVersusORCIDsDf = (
    allAuthorMetadataDf
    .merge(orcidsDf, how='outer', on='dataverse_installation_name')
    .fillna(0)
    # Move the installation name into the index so that only the count
    # columns are cast to integers, then restore it as a regular column
    .set_index('dataverse_installation_name')
    .astype('int32')
    .reset_index()
)

# Percentage of author metadata that includes an ORCID
allAuthorMetadataVersusORCIDsDf['percentage_of_orcids'] = (
    allAuthorMetadataVersusORCIDsDf['count_of_orcids']
    .div(allAuthorMetadataVersusORCIDsDf['count_of_author_metadata'])
    .mul(100)
)

allAuthorMetadataVersusORCIDsDf.head()

In [None]:
# Write the per-installation counts and ORCID percentages to a CSV file,
# leaving out the dataframe's row index
allAuthorMetadataVersusORCIDsDf.to_csv('allAuthorMetadataVersusORCIDs.csv', index=False)