In [1]:
from functools import reduce
import numpy as np
import pandas as pd

In [35]:
# Load the basic metadata of every dataset version, skipping the columns this
# analysis does not use and parsing the version creation time as a datetime.
# na_filter=False switches off pandas' missing-value detection for this file.
# Only rows whose publisher is "Harvard Dataverse" are kept; the publisher
# column is dropped afterwards and the index is renumbered from 0.
datasetVersionMetadataHDVDF = (
    pd.read_csv(
        'basic_metadata_2022.10.02-2022.10.03.csv',
        usecols=lambda column: column not in {
            'dataset_pid',
            'dataset_publication_date',
            'dataset_version_state',
        },
        parse_dates=['dataset_version_create_time'],
        na_filter=False,
    )
    .loc[lambda versions: versions['publisher'] == 'Harvard Dataverse']
    .drop(columns='publisher')
    .reset_index(drop=True)
)

datasetVersionMetadataHDVDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time
0,https://doi.org/10.7910/DVN/5PRYPC,4.0,2020-06-17 23:49:50+00:00
1,https://doi.org/10.7910/DVN/VIE1H,248.0,2015-06-18 19:49:12+00:00
2,https://doi.org/10.7910/DVN/KKUJWW,1.0,2015-07-15 00:57:41+00:00
3,https://doi.org/10.7910/DVN/5E6GBN,1.2,2020-04-05 21:32:37+00:00
4,https://doi.org/10.7910/DVN/9MKISZ,10.0,2013-01-29 23:42:10+00:00


In [31]:
# Rows are dataset versions; distinct persistent URLs are datasets.
print(f'Count of dataset versions: {datasetVersionMetadataHDVDF.shape[0]}')
print(f'Number of datasets: {datasetVersionMetadataHDVDF["dataset_pid_url"].nunique(dropna=False)}')

Count of dataset versions: 130768
Number of datasets: 80278


In [39]:
# Keep one row per dataset: the version with the most recent creation time.
#
# idxmax() returns index *labels*, so the rows must be selected with the
# label-based .loc. The positional .iloc only gave the right rows as long as
# the frame's index happened to be a fresh 0..n-1 range.
# If several versions of a dataset share the latest creation time, idxmax()
# picks the first of them.
latestDatasetVersionsHDVDF = (
    datasetVersionMetadataHDVDF
        .loc[
            datasetVersionMetadataHDVDF
                .groupby('dataset_pid_url')['dataset_version_create_time']
                .idxmax()]
        .reset_index(drop=True))

latestDatasetVersionsHDVDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time
0,https://doi.org/10.7910/DVN/00234,1.0,2014-03-23 22:12:34+00:00
1,https://doi.org/10.7910/DVN/004HG6,1.0,2020-08-28 15:42:11+00:00
2,https://doi.org/10.7910/DVN/005SCF,1.0,2021-08-26 03:17:40+00:00
3,https://doi.org/10.7910/DVN/006UPU,1.0,2021-08-13 18:15:36+00:00
4,https://doi.org/10.7910/DVN/007GT,5.0,2017-02-05 23:43:33+00:00


In [38]:
# Both numbers should match: one (latest) version row per dataset.
print(f'Count of dataset versions: {latestDatasetVersionsHDVDF.shape[0]}')
print(f'Number of datasets: {latestDatasetVersionsHDVDF["dataset_pid_url"].nunique(dropna=False)}')

Count of dataset versions: 80278
Number of datasets: 80278


In [42]:
# Load the author (citation) metadata export. No publisher or version filter
# is applied here. Empty cells are read as NaN (pandas' default missing-value
# handling), and the dataset_pid column is dropped.
authorMetadataDF = (
    pd.read_csv('author(citation)_2022.10.02-2022.10.03.csv')
    .drop(columns='dataset_pid')
    .reset_index(drop=True)
)

authorMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,authorName,authorAffiliation,authorIdentifierScheme,authorIdentifier
0,https://doi.org/10.48370/OFD/DBJUEM,1.0,"Mammal Research Institute, Polish Academy of Sciences",,,
1,https://doi.org/10.21410/7E4/4WG94W,2.1,"Pagès, Jean-Pierre","Commissariat à l'Énergie Atomique, Laboratoire de statistiques et d'études économiques et sociales (LSESS), IPSN",,
2,https://doi.org/10.21410/7E4/4WG94W,2.1,"Agrafiotis, Démosthène","Commissariat à l'Énergie Atomique, Laboratoire de statistiques et d'études économiques et sociales (LSESS), IPSN",,
3,https://doi.org/10.7910/DVN/5PRYPC,4.0,China Data Lab,China Data Lab,,
4,https://doi.org/10.17026/dans-2zm-dsmz,1.0,Portable Antiquities of the Netherlands,,,


In [46]:
# Attach author metadata to the latest version of each dataset. The inner join
# on persistent URL + version number means only author metadata belonging to
# the most recent versions remains.
# NOTE(review): dataset_version_number is displayed as a float (e.g. 1.2); if
# the CSVs contain minor versions such as "1.10", those would compare equal to
# "1.1" in this join - confirm against the source files.
authorMetadataLatestVersionDF = (
    latestDatasetVersionsHDVDF
        .merge(
            authorMetadataDF,
            how='inner',
            on=['dataset_pid_url', 'dataset_version_number'])
        .reset_index(drop=True)
)

authorMetadataLatestVersionDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,authorName,authorAffiliation,authorIdentifierScheme,authorIdentifier
0,https://doi.org/10.7910/DVN/00234,1.0,2014-03-23 22:12:34+00:00,"Mazzolari, Francesca",Centro Studi Confindustria,,
1,https://doi.org/10.7910/DVN/00234,1.0,2014-03-23 22:12:34+00:00,"Ragusa, Giuseppe",Luiss University,,
2,https://doi.org/10.7910/DVN/004HG6,1.0,2020-08-28 15:42:11+00:00,"Master, Daniel M.",Wheaton College,,
3,https://doi.org/10.7910/DVN/004HG6,1.0,2020-08-28 15:42:11+00:00,"Stager, Lawrence E.",Harvard University,,
4,https://doi.org/10.7910/DVN/005SCF,1.0,2021-08-26 03:17:40+00:00,"Master, Daniel M.",Wheaton College,,


In [48]:
# One row per author entry; several entries can belong to the same dataset.
print(f'Count of author metadata values: {authorMetadataLatestVersionDF.shape[0]}')
print(f'Number of datasets: {authorMetadataLatestVersionDF["dataset_pid_url"].nunique(dropna=False)}')

Count of author metadata values: 155445
Number of datasets: 80278


Of the 155,445 author metadata entries, how many have a value in the Author Identifier Scheme field, the Author Identifier field, or both?

In [69]:
# Author entries that have a value in at least one of the two identifier
# fields: a row is removed only when both the scheme and the identifier are
# missing (NaN). The affiliation column is not needed from here on.
authorIdentifierMetadata = (
    authorMetadataLatestVersionDF
        .drop(columns='authorAffiliation')
        .dropna(
            subset=['authorIdentifierScheme', 'authorIdentifier'],
            how='all')
        .reset_index(drop=True)
)

authorIdentifierMetadata.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,https://doi.org/10.7910/DVN/00INMO,1.0,2021-10-20 06:53:39+00:00,Yozo Mitsui,ORCID,orcid.org/0000-0003-2919-0058
1,https://doi.org/10.7910/DVN/00WMEO,1.0,2020-06-06 01:16:06+00:00,"Nolte, Christoph",ORCID,0000-0001-7827-689X
2,https://doi.org/10.7910/DVN/01BU1N,1.0,2022-02-03 18:14:17+00:00,"Wolaver, Amy",ORCID,0000-0002-0905-9397
3,https://doi.org/10.7910/DVN/01BU1N,1.0,2022-02-03 18:14:17+00:00,"Doces, John",ORCID,0000-0002-4671-2885
4,https://doi.org/10.7910/DVN/01KX3V,4.0,2016-12-07 13:19:48+00:00,"Zhong, Qing",ORCID,0000-0002-5340-301X


In [70]:
# Size of the subset with an identifier and/or a scheme, and the datasets it spans.
print(f'Author metadata with at least identifier or identifier scheme: {authorIdentifierMetadata.shape[0]}')
print(f'Number of datasets: {authorIdentifierMetadata["dataset_pid_url"].nunique(dropna=False)}')

Author metadata with at least identifier or identifier scheme: 20231
Number of datasets: 12975


In [65]:
# Of the 155,445 author metadata entries, which have a value in the
# Author Identifier field but none in the Author Identifier Scheme field?
# A field counts as empty when it is NaN.
authorIdentifiersMissingScheme = (
    authorMetadataLatestVersionDF
        .drop(columns='authorAffiliation')
        .loc[lambda authors:
             authors['authorIdentifier'].notna()
             & authors['authorIdentifierScheme'].isna()]
        .reset_index(drop=True)
)

authorIdentifiersMissingScheme.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,https://doi.org/10.7910/DVN/0ILLXT,2.0,2022-02-15 00:25:53+00:00,"Rivero, Albert H.",,0000-0002-9669-8452
1,https://doi.org/10.7910/DVN/1APNEJ,1.0,2021-09-21 23:49:12+00:00,"Resh, William",,https://orcid.org/0000-0002-2324-4118
2,https://doi.org/10.7910/DVN/2JKWNS,1.0,2021-04-18 20:57:41+00:00,"Marble, William",,0000-0001-9352-5540
3,https://doi.org/10.7910/DVN/2JKWNS,1.0,2021-04-18 20:57:41+00:00,"Mousa,Salma",,0000-0002-1482-4276
4,https://doi.org/10.7910/DVN/2JKWNS,1.0,2021-04-18 20:57:41+00:00,"Siegel,Alexandra",,0000-0003-0792-7813


In [66]:
# Identifier present, scheme absent: entry count and number of datasets affected.
print(f'Author identifier metadata with no identifier scheme: {authorIdentifiersMissingScheme.shape[0]}')
print(f'Number of datasets: {authorIdentifiersMissingScheme["dataset_pid_url"].nunique(dropna=False)}')

Author identifier metadata with no identifier scheme: 274
Number of datasets: 236


In [67]:
# Of the 155,445 author metadata entries, which have a value in the
# Author Identifier Scheme field but none in the Author Identifier field?
# A field counts as empty when it is NaN.
authorIdentifierSchemeMissingIdentifier = (
    authorMetadataLatestVersionDF
        .drop(columns='authorAffiliation')
        .loc[lambda authors:
             authors['authorIdentifier'].isna()
             & authors['authorIdentifierScheme'].notna()]
        .reset_index(drop=True)
)

authorIdentifierSchemeMissingIdentifier.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,https://doi.org/10.7910/DVN/01LN9W,1.0,2017-10-26 19:03:01+00:00,"Perez, Efren",ORCID,
1,https://doi.org/10.7910/DVN/02FGSY,1.0,2020-02-16 02:41:41+00:00,"Wen, Yangmao",ORCID,
2,https://doi.org/10.7910/DVN/06EDUI,2.0,2021-01-08 11:05:57+00:00,"Schoolman, Ethan",ORCID,
3,https://doi.org/10.7910/DVN/08OJUV,1.1,2019-08-07 23:51:37+00:00,"Dutkiewicz, Stephanie",ORCID,
4,https://doi.org/10.7910/DVN/0AMJUC,1.0,2021-12-19 18:26:39+00:00,"Eguda, Felix",ORCID,


In [68]:
# Scheme present, identifier absent: entry count and number of datasets affected.
print(f'Author identifier scheme metadata with no identifier: {authorIdentifierSchemeMissingIdentifier.shape[0]}')
print(f'Number of datasets: {authorIdentifierSchemeMissingIdentifier["dataset_pid_url"].nunique(dropna=False)}')

Author identifier scheme metadata with no identifier: 1155
Number of datasets: 998


In [58]:
# Author metadata entries that carry an ORCID: either the identifier scheme is
# "ORCID", or the identifier itself contains the text "orcid.org".
#
# Built from authorIdentifierMetadata (entries with an identifier and/or a
# scheme), which is defined earlier in this notebook, so the cell works after
# Restart & Run All.
#   * regex=False matches "orcid.org" literally, so "." is not a wildcard.
#   * na=False makes entries without an identifier count as "no match"; they
#     are still selected when their scheme is "ORCID".
authorMetadataWithORCIDs = (
    authorIdentifierMetadata
        .loc[lambda authors:
             authors['authorIdentifierScheme'].eq('ORCID')
             | authors['authorIdentifier'].str.contains(
                 'orcid.org', regex=False, na=False)]
        .reset_index(drop=True)
)

authorMetadataWithORCIDs.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,authorName,authorIdentifierScheme,authorIdentifier
0,https://doi.org/10.7910/DVN/00INMO,1.0,2021-10-20 06:53:39+00:00,Yozo Mitsui,ORCID,orcid.org/0000-0003-2919-0058
1,https://doi.org/10.7910/DVN/00WMEO,1.0,2020-06-06 01:16:06+00:00,"Nolte, Christoph",ORCID,0000-0001-7827-689X
2,https://doi.org/10.7910/DVN/01BU1N,1.0,2022-02-03 18:14:17+00:00,"Wolaver, Amy",ORCID,0000-0002-0905-9397
3,https://doi.org/10.7910/DVN/01BU1N,1.0,2022-02-03 18:14:17+00:00,"Doces, John",ORCID,0000-0002-4671-2885
4,https://doi.org/10.7910/DVN/01KX3V,4.0,2016-12-07 13:19:48+00:00,"Zhong, Qing",ORCID,0000-0002-5340-301X


In [59]:
# Number of author entries with an ORCID and the datasets they belong to.
print(f'Author metadata with ORCID IDs: {authorMetadataWithORCIDs.shape[0]}')
print(f'Number of datasets: {authorMetadataWithORCIDs["dataset_pid_url"].nunique(dropna=False)}')

Author metadata with ORCID IDs: 19037
Number of datasets: 12121
