In [1]:
from functools import reduce
import numpy as np
import pandas as pd

# Import, check and prepare data

Import CSV files that contain:
- The PIDs of all datasets published by many repositories that use the Dataverse software and which repositories published them
- Author field metadata entered in all of those datasets

In [2]:
# Get basic info about datasets in repositories except for ODISSEI Portal
datasetPIDsDF = (pd
    .read_csv(
        'dataset_pids_from_most_known_dataverse_installations_2023.08.csv',
        sep=',',
        na_filter=False)
    .query(
        '(dataverse_json_export_saved == True) and\
        (dataverse_installation_name != "ODISSEI_Portal")')
    .drop(columns=['dataverse_json_export_saved'])
    .reset_index(drop=True, inplace=False)
 )

datasetPIDsDF.head()

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type
0,UNB_Libraries_Dataverse,https://doi.org/10.25545/NVN79Z,snb,Service New Brunswick,ORGANIZATIONS_INSTITUTIONS
1,UNB_Libraries_Dataverse,https://doi.org/10.25545/YB60PU,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP
2,UNB_Libraries_Dataverse,https://doi.org/10.25545/WHD3KC,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP
3,UNB_Libraries_Dataverse,https://doi.org/10.25545/6TZWCG,Ir-CationBinding,Elucidation of Charge Contributions in Iridium-Chelated Hydrogen-Bonding Systems,JOURNALS
4,UNB_Libraries_Dataverse,https://doi.org/10.25545/NYM13B,IFMLAB,Integrated Forest Management Lab,RESEARCH_GROUP


In [3]:
# Get Author field metadata entered in all datasets in Dataverse repositories
# and remove datasets have have no author metadata
authorMetadataDF = (pd
    .read_csv(
        'author(citation)_2023.08.22-2023.08.28.csv',
        sep=',',
        na_filter=True)
    .drop(columns=['dataset_pid', 'authorAffiliation'])
    .query('authorName != "N/A"')
    .reset_index(drop=True, inplace=False)
    )

authorMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_publication_date,dataset_version_number,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,https://doi.org/10.48370/OFD/8TTRLC,2022-05-13,1.0,2022-05-11T14:44:29Z,"Mammal Research Institute, Polish Academy of Sciences",,
1,https://doi.org/10.17026/dans-26a-cq4r,2015-09-18,2.0,2022-02-18T19:29:27Z,H.J. Hesseling,,
2,https://doi.org/10.17026/dans-zc2-mc2g,2016-12-31,1.0,2022-02-24T21:01:03Z,S. Moerman,,
3,https://doi.org/10.17026/dans-x9z-bmn6,2020-12-22,1.0,2022-02-14T05:11:03Z,G. Zielman,,
4,https://doi.org/10.7910/DVN/QD0V0H,2021-09-20,1.0,2021-07-29T19:45:43Z,"Master, Daniel M.",,


In [4]:
# Sanity check data by making sure count of datasets is the same in both dataframes
print(f'Number of datasets in datasetPIDsDF: {len(datasetPIDsDF)}')
datasetCountInAuthorMetadataDF = len(pd.unique(authorMetadataDF['dataset_pid_url']))
print(f'Number of datasets in authorMetadataDF: {datasetCountInAuthorMetadataDF}')

Number of datasets in datasetPIDsDF: 390401
Number of datasets in authorMetadataDF: 390401


In [5]:
# Join the datasetPIDsDF and the authorMetadataDF to add the installation column,
# so we know which installations published each dataset
datasetPIDsAndAuthorMetadataDF = (pd
    .merge(datasetPIDsDF, authorMetadataDF,
        how='inner',
        on=['dataset_pid_url'])
    .drop(columns=['dataset_version_number'])
    .reset_index(drop=True, inplace=False))

# Make sure the count of datasets in datasetPIDsAndAuthorMetadataDF
# is the same as in datasetPIDsDF: 390401
datasetCountInDatasetPIDsAndAuthorMetadataDF = len(pd.unique(datasetPIDsAndAuthorMetadataDF['dataset_pid_url']))
print(f'Number of datasets in datasetPIDsAndAuthorMetadataDF: {datasetCountInDatasetPIDsAndAuthorMetadataDF}')

# Get count of author metadata
print(f'Number of author metadata in datasetPIDsAndAuthorMetadataDF: {len(datasetPIDsAndAuthorMetadataDF)}')

# Get count of installations. Should by 84: the 85 installations in my dataset minus ODISSEI Portal
allInstallationsList = list(set(datasetPIDsAndAuthorMetadataDF['dataverse_installation_name'].tolist()))
countOfInstallations = len(allInstallationsList)
print(f'Number of installations in datasetPIDsAndAuthorMetadataDF: {countOfInstallations}')

Number of datasets in datasetPIDsAndAuthorMetadataDF: 390401
Number of author metadata in datasetPIDsAndAuthorMetadataDF: 667435
Number of installations in datasetPIDsAndAuthorMetadataDF: 84


In [6]:
datasetPIDsAndAuthorMetadataDF.head()

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type,dataset_publication_date,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,UNB_Libraries_Dataverse,https://doi.org/10.25545/NVN79Z,snb,Service New Brunswick,ORGANIZATIONS_INSTITUTIONS,2018-05-10,2022-03-23T19:49:10Z,Service New Brunswick,,
1,UNB_Libraries_Dataverse,https://doi.org/10.25545/YB60PU,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP,2018-06-20,2018-02-21T18:26:43Z,"Blight, Barry A",ORCID,0000-0003-1166-6206
2,UNB_Libraries_Dataverse,https://doi.org/10.25545/WHD3KC,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP,2019-06-23,2019-06-21T18:30:39Z,"Balonova, Barbora",,
3,UNB_Libraries_Dataverse,https://doi.org/10.25545/6TZWCG,Ir-CationBinding,Elucidation of Charge Contributions in Iridium-Chelated Hydrogen-Bonding Systems,JOURNALS,2021-05-18,2021-05-18T16:39:24Z,"Blight, Barry A",,
4,UNB_Libraries_Dataverse,https://doi.org/10.25545/NYM13B,IFMLAB,Integrated Forest Management Lab,RESEARCH_GROUP,2020-06-30,2020-11-30T22:44:12Z,"Chen, Yingbing",,


# Explore data

Get info about author metadata that includes no identifier metadata, partial identifier metadata, and complete identifier metadata

In [7]:
authorsWithNoIdentifiersDf = (
    datasetPIDsAndAuthorMetadataDF
        .query(
            'authorIdentifierScheme != authorIdentifierScheme and\
            authorIdentifier != authorIdentifier')
        .reset_index(drop=True, inplace=False)
)

authorsWithPartialIdentifiersDf = (
    datasetPIDsAndAuthorMetadataDF
        .query(
            '(authorIdentifierScheme == authorIdentifierScheme and\
             authorIdentifier != authorIdentifier) or\
            (authorIdentifierScheme != authorIdentifierScheme and\
             authorIdentifier == authorIdentifier)')
        .fillna('')
        .reset_index(drop=True, inplace=False)
)

authorsWithCompleteIdentifiersDf = (
    datasetPIDsAndAuthorMetadataDF
        .query(
            'authorIdentifierScheme == authorIdentifierScheme and\
             authorIdentifier == authorIdentifier')
        .reset_index(drop=True, inplace=False)
)

sumOfAuthorMetadata = (
    len(authorsWithPartialIdentifiersDf)
    + len(authorsWithCompleteIdentifiersDf)
    + len(authorsWithNoIdentifiersDf)
)
print(f'Number of author metadata with partial, complete, and no identifier metadata: {sumOfAuthorMetadata}')

print(f'Number of author metadata with partial identifier metadata: {len(authorsWithPartialIdentifiersDf)}')
print(f'Number of author metadata with complete identifier metadata: {len(authorsWithCompleteIdentifiersDf)}')
print(f'Number of author metadata with no identifier metadata: {len(authorsWithNoIdentifiersDf)}')

Number of author metadata with partial, complete, and no identifier metadata: 667435
Number of author metadata with partial identifier metadata: 4438
Number of author metadata with complete identifier metadata: 82078
Number of author metadata with no identifier metadata: 580919


Get count of author metadata with and without identifiers for each installation

In [8]:
# Count of author metadata in authorsWithNoIdentifiersDf per installation
authorsWithNoIdentifiersByInstallationDf = (
    authorsWithNoIdentifiersDf
        .value_counts(subset=['dataverse_installation_name'])
        .to_frame('count_of_authors_with_no_identifiers')
)

# Count of author metadata in authorsWithPartialIdentifiersDf per installation
authorsWithPartialIdentifiersByInstallationDf = (
    authorsWithPartialIdentifiersDf
        .value_counts(subset=['dataverse_installation_name'])
        .to_frame('count_of_authors_with_partial_identifiers')
)

# Count of author metadata in authorsWithCompleteIdentifiersDf per installation
authorsWithCompleteIdentifiersByInstallationDf = (
    authorsWithCompleteIdentifiersDf
        .value_counts(subset=['dataverse_installation_name'])
        .to_frame('count_of_authors_with_complete_identifiers')
)

# Join the three dataframes
dataframes = [
    authorsWithNoIdentifiersByInstallationDf,
    authorsWithPartialIdentifiersByInstallationDf,
    authorsWithCompleteIdentifiersByInstallationDf
]

countOfAuthorsByInstallationDf = reduce(lambda left, right: left.join(right, how='outer'), dataframes)
countOfAuthorsByInstallationDf = (
    countOfAuthorsByInstallationDf
        .fillna(0)
        .astype('int32')
        .reset_index(drop=False, inplace=False)
        # Reorder columns
        .loc[:,[
            'dataverse_installation_name',
            'count_of_authors_with_complete_identifiers',
            'count_of_authors_with_partial_identifiers',
            'count_of_authors_with_no_identifiers']]
)

countOfAuthorsByInstallationDf.head()

Unnamed: 0,dataverse_installation_name,count_of_authors_with_complete_identifiers,count_of_authors_with_partial_identifiers,count_of_authors_with_no_identifiers
0,ACSS_Dataverse,5,0,141
1,ADA_Dataverse,1102,22,2140
2,ASU_Library_Research_Data_Repository,92,4,115
3,AUSSDA_Dataverse,258,0,2104
4,Abacus,0,1,4083


Get number of author metadata with each type of identifier, e.g. ORCID, GND

In [29]:
countOfValidIdentifiersByTypeAndInstallationDf = (pd
    .crosstab(
        authorsWithCompleteIdentifiersDf.dataverse_installation_name,
        authorsWithCompleteIdentifiersDf.authorIdentifierScheme)#,
        # margins=True, margins_name='Total')
    .reset_index(drop=False, inplace=False)
    .rename_axis(None, axis=1)
)

# Add rows for installations that have no complete identifiers
installationsWithCompleteIdentifiers = countOfValidIdentifiersByTypeAndInstallationDf['dataverse_installation_name'].tolist()
installationsWithNoCompleteIdentifiers = set(allInstallationsList) - set(installationsWithCompleteIdentifiers)
for installation in installationsWithNoCompleteIdentifiers:
    countOfValidIdentifiersByTypeAndInstallationDf.loc[-1] = [
        installation, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]
    countOfValidIdentifiersByTypeAndInstallationDf.index = countOfValidIdentifiersByTypeAndInstallationDf.index + 1

# Create new dataframe with count of invalid ORCIDs in each installation

authorMetadataWithInvalidORCIDsDf = (
    authorsWithPartialIdentifiersDf
        .query(
        'authorIdentifier.str.contains("orcid", case=False) or\
        authorIdentifier.str.match(".*\d{4}-\d{4}-\d{4}-\d{4}.*")',
        engine='python')
        .reset_index(drop=True, inplace=False)
)

countOfAuthorsWithInvalidORCIDsByInstallationDf = (
    authorMetadataWithInvalidORCIDsDf
        .value_counts(subset=['dataverse_installation_name'])
        .to_frame('count_of_authors_with_invalid_orcids')
        .reset_index(drop=False, inplace=False)
)

countOfIdentifiersByTypeAndInstallationDf = (pd
    .merge(countOfValidIdentifiersByTypeAndInstallationDf, countOfAuthorsWithInvalidORCIDsByInstallationDf,
        how='outer',
        on=['dataverse_installation_name'])
    .fillna(0)
    # Make dataverse_installation_name the index column
    .set_index('dataverse_installation_name', inplace=False)
    # Make sure all non-indexed columns are integers
    .astype('int32')
    .reset_index(drop=False, inplace=False)
    )

# Combine the ORCID and invalid_orcid columns
countOfIdentifiersByTypeAndInstallationDf['ORCID_total'] = (
        countOfIdentifiersByTypeAndInstallationDf['ORCID']
        + countOfIdentifiersByTypeAndInstallationDf['count_of_authors_with_invalid_orcids']
)

# Remove ORCID and count_of_authors_with_invalid_orcids columns and rename ORCID_total column
countOfIdentifiersByTypeAndInstallationDf = (countOfIdentifiersByTypeAndInstallationDf
    .drop(columns=['ORCID', 'count_of_authors_with_invalid_orcids'])
    .rename(columns={'ORCID_total': 'ORCID'})
    .set_index('dataverse_installation_name', inplace=False)
    )

# Alphabetize columns
countOfIdentifiersByTypeAndInstallationDf = (countOfIdentifiersByTypeAndInstallationDf
    .reindex(sorted(countOfIdentifiersByTypeAndInstallationDf.columns), axis=1)
    .reset_index(drop=False, inplace=False)
    .sort_values('dataverse_installation_name')
    .reset_index(drop=True, inplace=False)
    )

countOfIdentifiersByTypeAndInstallationDf.head()

Unnamed: 0,dataverse_installation_name,DAI,DOI,GND,ISNI,LCNA,ORCID,ResearcherID,ScopusID,VIAF,idHAL
0,ACSS_Dataverse,0,0,0,0,0,5,0,0,0,0
1,ADA_Dataverse,0,0,0,0,0,1119,0,0,0,0
2,ASU_Library_Research_Data_Repository,0,0,0,0,0,92,0,0,0,0
3,AUSSDA_Dataverse,0,0,0,0,0,258,0,0,0,0
4,Abacus,0,0,0,0,0,0,0,0,0,0


In [11]:
# Export the dataframe as a CSV file
countOfIdentifiersByTypeAndInstallationDf.to_csv('countOfIdentifiersByTypeAndInstallationDf.csv', index=False)

In [30]:
# Create dataframe showing count of ORCIDs versus all other identifiers
countOfORCIDsVersusOtherIdentifiersByInstallationDf = countOfIdentifiersByTypeAndInstallationDf

countOfORCIDsVersusOtherIdentifiersByInstallationDf['Other_identifier'] = countOfORCIDsVersusOtherIdentifiersByInstallationDf[countOfIdentifiersByTypeAndInstallationDf.columns.delete([0, 6])].apply(
    lambda x: sum(x), axis=1
)

countOfORCIDsVersusOtherIdentifiersByInstallationDf = (
    countOfORCIDsVersusOtherIdentifiersByInstallationDf[[
        'dataverse_installation_name',
        'ORCID',
        'Other_identifier']]
)

countOfORCIDsVersusOtherIdentifiersByInstallationDf.head(10)

Unnamed: 0,dataverse_installation_name,ORCID,Other_identifier
0,ACSS_Dataverse,5,0
1,ADA_Dataverse,1119,0
2,ASU_Library_Research_Data_Repository,92,0
3,AUSSDA_Dataverse,258,0
4,Abacus,0,0
5,Arca_Dados,135,0
6,Borealis,3709,35
7,CESA_-_Repositorio_de_datos_académicos,4,0
8,CIDACS,10,0
9,CIFOR,95,0


In [31]:
# Export the dataframe as a CSV file
countOfORCIDsVersusOtherIdentifiersByInstallationDf.to_csv('countOfORCIDsVersusOtherIdentifiersByInstallationDf.csv', index=False)

In [69]:
# How often do installations publish datasets with
# author identifiers that aren't ORCIDs
countOfInstallations = len(countOfIdentifiersByTypeAndInstallationDf.dataverse_installation_name.unique())
print(countOfInstallations)

84


In [27]:
# print(countOfIdentifiersByTypeAndInstallationDf.columns[1:].remove('ORCID'))
print(countOfIdentifiersByTypeAndInstallationDf.columns.delete([0, 6]))


Index(['DAI', 'DOI', 'GND', 'ISNI', 'LCNA', 'ResearcherID', 'ScopusID', 'VIAF',
       'idHAL', 'Other_identifier'],
      dtype='object')
