In [None]:
from functools import reduce
import numpy as np
import pandas as pd

# Import, check and prepare data

Import CSV files that contain:
- The PIDs of all datasets published by many repositories that use the Dataverse software and which repositories published them
- Author field metadata entered in all of those datasets

In [None]:
# Load basic info about every dataset, keeping only datasets whose Dataverse JSON
# export was saved and leaving out datasets published by the ODISSEI Portal
datasetPIDsDF = (
    pd.read_csv(
        'dataset_pids_from_most_known_dataverse_installations_2023.08.csv',
        # Read empty cells as empty strings instead of converting them to NaN
        na_filter=False,
    )
    .query(
        '(dataverse_json_export_saved == True) and\
        (dataverse_installation_name != "ODISSEI_Portal")')
    # The flag column has served its purpose once the rows are filtered
    .drop(columns='dataverse_json_export_saved')
    .reset_index(drop=True)
)

datasetPIDsDF.head()

In [None]:
# Get Author field metadata entered in all datasets in Dataverse repositories
# and remove datasets that have no author metadata.
#
# With na_filter=True, read_csv converts its default NA markers to NaN. Those
# markers include empty cells and the literal text "N/A". NaN never equals a
# string, so the comparison `authorName != "N/A"` is True for such rows and,
# on its own, keeps them. Dropping rows whose authorName is NaN is what
# actually removes the entries that have no author name.
authorMetadataDF = (pd
    .read_csv(
        'author(citation)_2023.08.22-2023.08.28.csv',
        sep=',',
        na_filter=True)
    .drop(columns=['dataset_pid', 'authorAffiliation'])
    .dropna(subset=['authorName'])
    # Kept as a guard in case the NA parsing options above are ever changed
    # so that "N/A" is read as a plain string
    .query('authorName != "N/A"')
    .reset_index(drop=True)
    )

authorMetadataDF.head()

In [None]:
# Report how many datasets each dataframe covers. datasetPIDsDF is counted by
# its rows, while authorMetadataDF can hold several author rows per dataset,
# so its datasets are counted by distinct PID URL.
print(f'Number of datasets in datasetPIDsDF: {len(datasetPIDsDF)}')
datasetCountInAuthorMetadataDF = len(authorMetadataDF['dataset_pid_url'].unique())
print(f'Number of datasets in authorMetadataDF: {datasetCountInAuthorMetadataDF}')

In [None]:
# Attach the columns of datasetPIDsDF (including the installation name) to each
# author metadata row by matching on the dataset PID URL. The inner join keeps
# only datasets found in both dataframes, so datasets with no author metadata
# are left out.
datasetPIDsAndAuthorMetadataDF = (
    datasetPIDsDF
    .merge(authorMetadataDF, how='inner', on='dataset_pid_url')
    .drop(columns=['dataset_version_number', 'dataset_version_create_time'])
    .reset_index(drop=True)
)

# Report the number of distinct datasets after the join, so it can be compared
# with the total number of datasets in datasetPIDsDF: 390401
datasetCountInDatasetPIDsAndAuthorMetadataDF = len(
    datasetPIDsAndAuthorMetadataDF['dataset_pid_url'].unique()
)
print(f'Number of datasets in datasetPIDsAndAuthorMetadataDF: {datasetCountInDatasetPIDsAndAuthorMetadataDF}')

# Each row is one author entry
print(f'Number of author metadata in datasetPIDsAndAuthorMetadataDF: {len(datasetPIDsAndAuthorMetadataDF)}')

countOfInstallations = len(
    datasetPIDsAndAuthorMetadataDF['dataverse_installation_name'].unique()
)
print(f'Number of installations in datasetPIDsAndAuthorMetadataDF: {countOfInstallations}')

In [None]:
# Preview the first rows of the joined dataframe
datasetPIDsAndAuthorMetadataDF.head(n=5)

# Explore data

In [None]:
# Split the author metadata into three mutually exclusive groups, based on which
# of the two identifier fields (authorIdentifierScheme and authorIdentifier)
# are filled in:
#   - no identifiers:       both fields are missing
#   - partial identifiers:  exactly one of the two fields is missing
#   - complete identifiers: both fields are filled in
#
# Missing values are detected once with isna(), instead of repeating the
# "column != column" NaN comparison in three separate query strings.
schemeIsMissing = datasetPIDsAndAuthorMetadataDF['authorIdentifierScheme'].isna()
identifierIsMissing = datasetPIDsAndAuthorMetadataDF['authorIdentifier'].isna()

authorsWithNoIdentifiersDf = (
    datasetPIDsAndAuthorMetadataDF
        .loc[schemeIsMissing & identifierIsMissing]
        .reset_index(drop=True)
)

# XOR is True when exactly one of the two fields is missing
authorsWithPartialIdentifiersDf = (
    datasetPIDsAndAuthorMetadataDF
        .loc[schemeIsMissing ^ identifierIsMissing]
        .reset_index(drop=True)
)

authorsWithCompleteIdentifiersDf = (
    datasetPIDsAndAuthorMetadataDF
        .loc[~schemeIsMissing & ~identifierIsMissing]
        .reset_index(drop=True)
)

sumOfAuthorMetadata = (
    len(authorsWithPartialIdentifiersDf)
    + len(authorsWithCompleteIdentifiersDf)
    + len(authorsWithNoIdentifiersDf)
)

# The three groups cover every row exactly once, so their sizes must add up
# to the number of rows in the joined dataframe
assert sumOfAuthorMetadata == len(datasetPIDsAndAuthorMetadataDF)

print(f'Number of author metadata with partial, complete, and no identifier metadata: {sumOfAuthorMetadata}')

print(f'Number of author metadata with partial identifier metadata: {len(authorsWithPartialIdentifiersDf)}')
print(f'Number of author metadata with complete identifier metadata: {len(authorsWithCompleteIdentifiersDf)}')
print(f'Number of author metadata with no identifier metadata: {len(authorsWithNoIdentifiersDf)}')

Get the count of author metadata with no, partial, and complete identifiers for each installation

In [None]:
# For each installation, count the author metadata rows that have neither an
# identifier type nor an identifier, and store the counts in a one-column dataframe
authorsWithNoIdentifiersByInstallationDf = authorsWithNoIdentifiersDf.value_counts(
    subset=['dataverse_installation_name']
).to_frame(name='count_of_authors_with_no_identifiers')

authorsWithNoIdentifiersByInstallationDf.head()

In [None]:
# For each installation, count the author metadata rows where only one of the
# identifier type and identifier fields is filled in, and store the counts in a
# one-column dataframe
authorsWithPartialIdentifiersByInstallationDf = authorsWithPartialIdentifiersDf.value_counts(
    subset=['dataverse_installation_name']
).to_frame(name='count_of_authors_with_partial_identifiers')

authorsWithPartialIdentifiersByInstallationDf.head()

In [None]:
# For each installation, count the author metadata rows that have both an
# identifier type and an identifier, and store the counts in a one-column dataframe
authorsWithCompleteIdentifiersByInstallationDf = authorsWithCompleteIdentifiersDf.value_counts(
    subset=['dataverse_installation_name']
).to_frame(name='count_of_authors_with_complete_identifiers')

authorsWithCompleteIdentifiersByInstallationDf.head()

In [None]:
# Combine the three per-installation count dataframes into one table with a row
# per installation and a column per identifier category
dataframes = [
    authorsWithNoIdentifiersByInstallationDf,
    authorsWithPartialIdentifiersByInstallationDf,
    authorsWithCompleteIdentifiersByInstallationDf
]

countOfAuthorsByInstallationDf = (
    # Outer joins keep installations that are missing from one or more of the
    # count dataframes
    reduce(lambda joined, nextCounts: joined.join(nextCounts, how='outer'), dataframes)
    # An installation absent from a count dataframe has no rows in that category
    .fillna(0)
    .astype('int32')
    # Turn the installation name from the index into a regular column
    .reset_index()
)

# Reorder columns
countOfAuthorsByInstallationDf = countOfAuthorsByInstallationDf[[
    'dataverse_installation_name',
    'count_of_authors_with_complete_identifiers',
    'count_of_authors_with_partial_identifiers',
    'count_of_authors_with_no_identifiers'
]]

countOfAuthorsByInstallationDf.head()

In [None]:
# Cross-tabulate installations against identifier types (e.g. ORCID, GND) to count
# the author metadata with complete identifiers for each combination. The margins
# add a "Total" row and a "Total" column.
identifierTypesByInstallationDf = (
    pd.crosstab(
        index=authorsWithCompleteIdentifiersDf['dataverse_installation_name'],
        columns=authorsWithCompleteIdentifiersDf['authorIdentifierScheme'],
        margins=True,
        margins_name='Total',
    )
    # Turn the installation name from the index into a regular column
    .reset_index()
    # Remove the name that crosstab gives to the columns axis
    .rename_axis(None, axis='columns')
)

identifierTypesByInstallationDf.head()

In [None]:
# Save the identifier type counts per installation to a CSV file,
# leaving out the dataframe's row index
identifierTypesByInstallationDf.to_csv(
    path_or_buf='identifierTypesByInstallation.csv',
    index=False,
)


In [None]:
# Find author metadata with partial identifier metadata whose identifier value
# contains the text "orcid" in any letter case. Rows with a missing identifier
# are treated as not matching, so the matches are rows that have an identifier
# but no identifier type and may be ORCIDs.
authorMetadataWithPossibleORCIDs = authorsWithPartialIdentifiersDf.loc[
    lambda df: df['authorIdentifier'].str.contains('orcid', case=False, na=False)
]
authorMetadataWithPossibleORCIDs.head()

In [None]:
# Number of author metadata rows that may contain an ORCID without an identifier type
print(authorMetadataWithPossibleORCIDs.shape[0])

In [None]:
# Show the data type that pandas assigned to each column of authorsWithPartialIdentifiersDf
print(authorsWithPartialIdentifiersDf.dtypes)