In [2]:
from functools import reduce
import numpy as np
import pandas as pd

# Import, check and prepare data

Import CSV files that contain:
- The PIDs of all datasets published by many repositories that use the Dataverse software and which repositories published them
- Author field metadata entered in all of those datasets

In [3]:
# Get basic info about datasets in repositories except for ODISSEI Portal
datasetPIDsDF = (pd
    .read_csv(
        'dataset_pids_from_most_known_dataverse_installations_2023.08.csv',
        sep=',',
        na_filter=False)
    .query(
        '(dataverse_json_export_saved == True) and\
        (dataverse_installation_name != "ODISSEI_Portal")')
    .drop(columns=['dataverse_json_export_saved'])
    .reset_index(drop=True, inplace=False)
 )

datasetPIDsDF.head()

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type
0,UNB_Libraries_Dataverse,https://doi.org/10.25545/NVN79Z,snb,Service New Brunswick,ORGANIZATIONS_INSTITUTIONS
1,UNB_Libraries_Dataverse,https://doi.org/10.25545/YB60PU,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP
2,UNB_Libraries_Dataverse,https://doi.org/10.25545/WHD3KC,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP
3,UNB_Libraries_Dataverse,https://doi.org/10.25545/6TZWCG,Ir-CationBinding,Elucidation of Charge Contributions in Iridium-Chelated Hydrogen-Bonding Systems,JOURNALS
4,UNB_Libraries_Dataverse,https://doi.org/10.25545/NYM13B,IFMLAB,Integrated Forest Management Lab,RESEARCH_GROUP


In [4]:
# Get Author field metadata entered in all datasets in Dataverse repositories
# and remove datasets have have no author metadata
authorMetadataDF = (pd
    .read_csv(
        'author(citation)_2023.08.22-2023.08.28.csv',
        sep=',',
        na_filter=True)
    .drop(columns=['dataset_pid', 'authorAffiliation'])
    .query('authorName != "N/A"')
    .reset_index(drop=True, inplace=False)
    )

authorMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_publication_date,dataset_version_number,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,https://doi.org/10.48370/OFD/8TTRLC,2022-05-13,1.0,2022-05-11T14:44:29Z,"Mammal Research Institute, Polish Academy of Sciences",,
1,https://doi.org/10.17026/dans-26a-cq4r,2015-09-18,2.0,2022-02-18T19:29:27Z,H.J. Hesseling,,
2,https://doi.org/10.17026/dans-zc2-mc2g,2016-12-31,1.0,2022-02-24T21:01:03Z,S. Moerman,,
3,https://doi.org/10.17026/dans-x9z-bmn6,2020-12-22,1.0,2022-02-14T05:11:03Z,G. Zielman,,
4,https://doi.org/10.7910/DVN/QD0V0H,2021-09-20,1.0,2021-07-29T19:45:43Z,"Master, Daniel M.",,


In [5]:
print(f'Number of datasets in datasetPIDsDF: {len(datasetPIDsDF)}')
datasetCountInAuthorMetadataDF = len(pd.unique(authorMetadataDF['dataset_pid_url']))
print(f'Number of datasets in authorMetadataDF: {datasetCountInAuthorMetadataDF}')

Number of datasets in datasetPIDsDF: 390401
Number of datasets in authorMetadataDF: 390401


In [6]:
# Join the datasetPIDsDF and the authorMetadataDF to add the installation column,
# so we know which installations published each dataset
# and remove datasets that have no author
datasetPIDsAndAuthorMetadataDF = (pd
    .merge(datasetPIDsDF, authorMetadataDF,
        how='inner',
        on=['dataset_pid_url'])
    .drop(columns=['dataset_version_number'])
    .reset_index(drop=True, inplace=False))

# Make sure the count of rows in datasetPIDsAndAuthorMetadataDF
# is the same as the count of total datasets in datasetPIDsDF: 390401
datasetCountInDatasetPIDsAndAuthorMetadataDF = len(pd.unique(datasetPIDsAndAuthorMetadataDF['dataset_pid_url']))
print(f'Number of datasets in datasetPIDsAndAuthorMetadataDF: {datasetCountInDatasetPIDsAndAuthorMetadataDF}')

print(f'Number of author metadata in datasetPIDsAndAuthorMetadataDF: {len(datasetPIDsAndAuthorMetadataDF)}')

countOfInstallations = len(datasetPIDsAndAuthorMetadataDF.dataverse_installation_name.unique())
print(f'Number of installations in datasetPIDsAndAuthorMetadataDF: {countOfInstallations}')

Number of datasets in datasetPIDsAndAuthorMetadataDF: 390401
Number of author metadata in datasetPIDsAndAuthorMetadataDF: 667435
Number of installations in datasetPIDsAndAuthorMetadataDF: 84


In [7]:
datasetPIDsAndAuthorMetadataDF.head()

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type,dataset_publication_date,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,UNB_Libraries_Dataverse,https://doi.org/10.25545/NVN79Z,snb,Service New Brunswick,ORGANIZATIONS_INSTITUTIONS,2018-05-10,2022-03-23T19:49:10Z,Service New Brunswick,,
1,UNB_Libraries_Dataverse,https://doi.org/10.25545/YB60PU,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP,2018-06-20,2018-02-21T18:26:43Z,"Blight, Barry A",ORCID,0000-0003-1166-6206
2,UNB_Libraries_Dataverse,https://doi.org/10.25545/WHD3KC,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP,2019-06-23,2019-06-21T18:30:39Z,"Balonova, Barbora",,
3,UNB_Libraries_Dataverse,https://doi.org/10.25545/6TZWCG,Ir-CationBinding,Elucidation of Charge Contributions in Iridium-Chelated Hydrogen-Bonding Systems,JOURNALS,2021-05-18,2021-05-18T16:39:24Z,"Blight, Barry A",,
4,UNB_Libraries_Dataverse,https://doi.org/10.25545/NYM13B,IFMLAB,Integrated Forest Management Lab,RESEARCH_GROUP,2020-06-30,2020-11-30T22:44:12Z,"Chen, Yingbing",,


# Explore data

In [11]:
# Get info about author metadata that includes no identifier metadata,
# partial identifier metadata, and complete identifier metadata
authorsWithNoIdentifiersDf = (
    datasetPIDsAndAuthorMetadataDF
        .query(
            'authorIdentifierScheme != authorIdentifierScheme and\
            authorIdentifier != authorIdentifier')
        .reset_index(drop=True, inplace=False)
)

authorsWithPartialIdentifiersDf = (
    datasetPIDsAndAuthorMetadataDF
        .query(
            '(authorIdentifierScheme == authorIdentifierScheme and\
             authorIdentifier != authorIdentifier) or\
            (authorIdentifierScheme != authorIdentifierScheme and\
             authorIdentifier == authorIdentifier)')
        .fillna('')
        .reset_index(drop=True, inplace=False)
)

authorsWithCompleteIdentifiersDf = (
    datasetPIDsAndAuthorMetadataDF
        .query(
            'authorIdentifierScheme == authorIdentifierScheme and\
             authorIdentifier == authorIdentifier')
        .reset_index(drop=True, inplace=False)
)

sumOfAuthorMetadata = (
    len(authorsWithPartialIdentifiersDf)
    + len(authorsWithCompleteIdentifiersDf)
    + len(authorsWithNoIdentifiersDf)
)
print(f'Number of author metadata with partial, complete, and no identifier metadata: {sumOfAuthorMetadata}')

print(f'Number of author metadata with partial identifier metadata: {len(authorsWithPartialIdentifiersDf)}')
print(f'Number of author metadata with complete identifier metadata: {len(authorsWithCompleteIdentifiersDf)}')
print(f'Number of author metadata with no identifier metadata: {len(authorsWithNoIdentifiersDf)}')

Number of author metadata with partial, complete, and no identifier metadata: 667435
Number of author metadata with partial identifier metadata: 4438
Number of author metadata with complete identifier metadata: 82078
Number of author metadata with no identifier metadata: 580919


Get count of author metadata with and without identifiers for each installation

In [12]:
# Count of author metadata in authorsWithNoIdentifiersDf per installation
authorsWithNoIdentifiersByInstallationDf = (
    authorsWithNoIdentifiersDf
        .value_counts(subset=['dataverse_installation_name'])
        .to_frame('count_of_authors_with_no_identifiers')
)

# Count of author metadata in authorsWithPartialIdentifiersDf per installation
authorsWithPartialIdentifiersByInstallationDf = (
    authorsWithPartialIdentifiersDf
        .value_counts(subset=['dataverse_installation_name'])
        .to_frame('count_of_authors_with_partial_identifiers')
)

# Count of author metadata in authorsWithCompleteIdentifiersDf per installation
authorsWithCompleteIdentifiersByInstallationDf = (
    authorsWithCompleteIdentifiersDf
        .value_counts(subset=['dataverse_installation_name'])
        .to_frame('count_of_authors_with_complete_identifiers')
)

# Join the three dataframes
dataframes = [
    authorsWithNoIdentifiersByInstallationDf,
    authorsWithPartialIdentifiersByInstallationDf,
    authorsWithCompleteIdentifiersByInstallationDf
]

countOfAuthorsByInstallationDf = reduce(lambda left, right: left.join(right, how='outer'), dataframes)
countOfAuthorsByInstallationDf = (
    countOfAuthorsByInstallationDf
        .fillna(0)
        .astype('int32')
        .reset_index(drop=False, inplace=False)
        # Reorder columns
        .loc[:,[
            'dataverse_installation_name',
            'count_of_authors_with_complete_identifiers',
            'count_of_authors_with_partial_identifiers',
            'count_of_authors_with_no_identifiers']]
)

countOfAuthorsByInstallationDf.head()

Unnamed: 0,dataverse_installation_name,count_of_authors_with_complete_identifiers,count_of_authors_with_partial_identifiers,count_of_authors_with_no_identifiers
0,ACSS_Dataverse,5,0,141
1,ADA_Dataverse,1102,22,2140
2,ASU_Library_Research_Data_Repository,92,4,115
3,AUSSDA_Dataverse,258,0,2104
4,Abacus,0,1,4083


In [13]:
# Get number of author metadata with each type of identifier, e.g. ORCID, GND
identifierTypesByInstallationDf = (pd
    .crosstab(
        authorsWithCompleteIdentifiersDf.dataverse_installation_name,
        authorsWithCompleteIdentifiersDf.authorIdentifierScheme,
        margins=True, margins_name='Total')
    .reset_index(drop=False, inplace=False)
    .rename_axis(None, axis=1)
)

identifierTypesByInstallationDf.head()

Unnamed: 0,dataverse_installation_name,DAI,DOI,GND,ISNI,LCNA,ORCID,ResearcherID,ScopusID,VIAF,idHAL,Total
0,ACSS_Dataverse,0,0,0,0,0,5,0,0,0,0,5
1,ADA_Dataverse,0,0,0,0,0,1102,0,0,0,0,1102
2,ASU_Library_Research_Data_Repository,0,0,0,0,0,92,0,0,0,0,92
3,AUSSDA_Dataverse,0,0,0,0,0,258,0,0,0,0,258
4,Arca_Dados,0,0,0,0,0,132,0,0,0,0,132


In [None]:
# Export the dataframe as a CSV file
identifierTypesByInstallationDf.to_csv('identifierTypesByInstallation.csv', index=False)


In [33]:
# How much author metadata are missing identifier types but may be ORCIDs
authorMetadataWithPossibleORCIDsDf = (
    authorsWithPartialIdentifiersDf
        .query(
            'authorIdentifier.str.contains("orcid", case=False) or\
            authorIdentifier.str.match("some-regex-\\d+$")',
            engine='python')
        .reset_index(drop=True, inplace=False)
)

# authorMetadataWithPossibleORCIDs = (
#     authorsWithPartialIdentifiersDf[
#         authorsWithPartialIdentifiersDf['authorIdentifier']
#             .str.contains('orcid', case=False, na=False)])

authorMetadataWithPossibleORCIDsDf.head()

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type,dataset_publication_date,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,Harvard_Dataverse,https://doi.org/10.7910/DVN/4FNQ8M,IAI_ecosystems,"Ecosystems, biodiversity and land use",ORGANIZATIONS_INSTITUTIONS,2019-05-28,2019-05-27T20:57:57Z,"Espírito-Santo, Mario M.",,https://orcid.org/0000-0001-8274-3075
1,Harvard_Dataverse,https://doi.org/10.7910/DVN/HJJJPJ,PSRM,Political Science Research and Methods (PSRM) Dataverse,JOURNALS,2018-10-02,2018-08-21T12:59:09Z,"Manow, Philip",,orcid.org/0000-0002-7154-7789
2,Harvard_Dataverse,https://doi.org/10.7910/DVN/AUFSID,Weather,Weather Data,ORGANIZATIONS_INSTITUTIONS,2018-04-05,2018-03-22T08:52:55Z,"van Oort, Pepijn",,orcid.org/0000-0001-7617-5382
3,Harvard_Dataverse,https://doi.org/10.7910/DVN/PB8X8P,ajps,American Journal of Political Science (AJPS) Dataverse,JOURNALS,2018-10-22,2019-11-22T16:50:37Z,"Nagler, Jonathan",,https://orcid.org/0000-0001-6918-9428
4,Harvard_Dataverse,https://doi.org/10.7910/DVN/FCT9VO,contextuality,Contextuality Dataverse,RESEARCH_PROJECTS,2019-02-21,2019-02-21T18:13:54Z,"Dzhafarov, Ehtibar",,https://orcid.org/0000-0003-1909-7706


In [None]:
print(len(authorMetadataWithPossibleORCIDs))

In [19]:
# print(authorsWithPartialIdentifiersDf.dtypes)
authorsWithPartialIdentifiersDf.head()

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type,dataset_publication_date,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,Harvard_Dataverse,https://doi.org/10.7910/DVN/NEF7ZD,ABDAssessments_Mali,ABD Assessments - Mali Dataverse,RESEARCH_PROJECTS,2016-12-01,2018-12-21T13:00:28Z,"Vodouhe, R.",,Bioversity International
1,Harvard_Dataverse,https://doi.org/10.7910/DVN/O94VWV,AD_EMWaterSupersaturation,Antony Delavois Dataverse,RESEARCH_PROJECTS,2022-06-07,2022-06-07T14:43:32Z,"Delavois, Antony",ORCID,
2,Harvard_Dataverse,https://doi.org/10.7910/DVN/9XLF4O,AQ_BangladeshandNigeria,"Aquaculture: increasing income, diversifying diets, and empowering women in Bangladesh and Nigeria",RESEARCH_PROJECTS,2020-07-10,2020-07-21T13:37:54Z,"Cheong,Kai Ching",,0000-0002-8115-417X
3,Harvard_Dataverse,https://doi.org/10.7910/DVN/L5O0XS,ActiveTrachoma,Dedefo Gebre Repository,RESEARCH_PROJECTS,2022-07-20,2022-07-20T20:01:25Z,"Gebre, Dedefo",ORCID,
4,Harvard_Dataverse,https://doi.org/10.7910/DVN/OCYAPW,AffectiveRatingData,Affective Rating Data,RESEARCH_PROJECTS,2018-05-02,2018-05-02T14:07:42Z,"Tchernichovski, Ofer",ORCID,
