In [17]:
from functools import reduce
import pandas as pd

The Dataverse team is planning additions to the dataset deposit form in the Harvard Dataverse Repository. To inform that work, we need to know:

In Dataverse repositories, how many datasets have funding metadata, and among these, how many have funding names? Which users include funding metadata in their deposits most often?

## Prepare the data

In [2]:
# Load the CSV that lists the PID of every dataset and the Dataverse installation
# it is published in, keep only the datasets whose metadata export was saved
# (dataverse_json_export_saved is True), renumber the rows, and discard the
# columns that are not used in the rest of the notebook
datasetPIDsDF = (
    pd.read_csv(
        'dataset_pids_from_most_known_dataverse_installations.csv',
        # Missing-value detection is turned off, so empty cells are read as-is
        na_filter=False)
    .query('(dataverse_json_export_saved == True)')
    .reset_index(drop=True)
    .drop(columns=['dataset_pid', 'dataverse_name', 'dataverse_json_export_saved'])
)

datasetPIDsDF.head()

Unnamed: 0,installation,dataset_pid_url,dataverse_alias
0,CIDACS,https://doi.org/10.57833/cidacs/WV4JWB,ProjZika
1,CIDACS,https://doi.org/10.57833/cidacs/TGAIVO,ProjZika
2,CIDACS,https://doi.org/10.57833/cidacs/0FZP7V,ProjZika
3,CIDACS,https://doi.org/10.57833/cidacs/5BMSIX,ProjGates0
4,CIDACS,https://doi.org/10.57833/cidacs/I56ZVA,ProjGates0


In [3]:
# Load the basic metadata recorded for every version of every dataset
datasetVersionMetadataDF = pd.read_csv(
    'basic_metadata_2022.10.02-2022.10.03.csv', na_filter=False)

# Discard the dataset_pid, dataset_publication_date, dataset_version_state
# and publisher columns, which this analysis does not use
datasetVersionMetadataDF = datasetVersionMetadataDF.drop(
    ['dataset_pid', 'dataset_publication_date', 'dataset_version_state', 'publisher'],
    axis='columns')

# Parse every date column so its values are datetimes rather than strings
dateColumns = ['dataset_version_create_time']
for dateColumn in dateColumns:
    datasetVersionMetadataDF[dateColumn] = pd.to_datetime(
        datasetVersionMetadataDF[dateColumn])

datasetVersionMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time
0,https://doi.org/10.48370/OFD/DBJUEM,1.0,2022-05-25 10:17:50+00:00
1,https://doi.org/10.21410/7E4/4WG94W,2.1,2020-05-13 16:06:28+00:00
2,https://doi.org/10.7910/DVN/5PRYPC,4.0,2020-06-17 23:49:50+00:00
3,https://doi.org/10.17026/dans-2zm-dsmz,1.0,2022-01-27 20:15:47+00:00
4,https://doi.org/10.7910/DVN/VIE1H,248.0,2015-06-18 19:49:12+00:00


In [4]:
# Report how many datasets and how many dataset versions were loaded
# (shape[0] is the row count of each dataframe)
print(f'Count of datasets: {datasetPIDsDF.shape[0]}')
print(f'Count of dataset versions: {datasetVersionMetadataDF.shape[0]}')


Count of datasets: 340857
Count of dataset versions: 473910


In [5]:
# From the datasetVersionMetadataDF, let's use the version creation times
# (dataset_version_create_time) to keep, for each dataset, only the row of its most
# recently created version. The resulting dataframe should contain the same number
# of rows as the datasetPIDsDF.

# idxmax() returns index *labels*, so the matching rows must be selected with .loc.
# Selecting them with .iloc (by position) only returns the right rows while the
# dataframe still has its default 0..n-1 index.
latestVersionLabels = (datasetVersionMetadataDF
    .groupby('dataset_pid_url')['dataset_version_create_time']
    .idxmax())

latestDatasetVersionsDF = (datasetVersionMetadataDF
    .loc[latestVersionLabels]
    .reset_index(drop=True))

In [6]:
# Preview the first rows of the latest-version table
latestDatasetVersionsDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time
0,http://dx.doi.org/10.26193/00HBWG,2.0,2019-05-13 08:43:01+00:00
1,http://dx.doi.org/10.26193/01P0AI,2.0,2019-05-13 06:23:26+00:00
2,http://dx.doi.org/10.26193/04F7C1,2.0,2019-05-13 10:02:15+00:00
3,http://dx.doi.org/10.26193/07R31R,2.0,2019-05-13 06:50:29+00:00
4,http://dx.doi.org/10.26193/0AF6TZ,5.0,2022-02-02 23:01:26+00:00


In [7]:
# The two counts should match: one latest-version row per dataset
# (shape[0] is the row count of each dataframe)
print(f'Count of datasets: {datasetPIDsDF.shape[0]}')
print(f'Count of rows in latestDatasetVersionsDF: {latestDatasetVersionsDF.shape[0]}')

Count of datasets: 340857
Count of rows in latestDatasetVersionsDF: 340857


In [36]:
# Attach the columns of datasetPIDsDF (including the installation that published
# each dataset) to the latest version of each dataset, using an inner join on the
# dataset's PID URL, then renumber the rows of the result
basicDatasetMetadataDF = (
    latestDatasetVersionsDF
    .merge(datasetPIDsDF, how='inner', on='dataset_pid_url')
    .reset_index(drop=True)
)
print(len(basicDatasetMetadataDF))

340857


In [37]:
# Preview the first rows of the merged latest-version / installation table
basicDatasetMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,installation,dataverse_alias
0,http://dx.doi.org/10.26193/00HBWG,2.0,2019-05-13 08:43:01+00:00,ADA Dataverse,
1,http://dx.doi.org/10.26193/01P0AI,2.0,2019-05-13 06:23:26+00:00,ADA Dataverse,
2,http://dx.doi.org/10.26193/04F7C1,2.0,2019-05-13 10:02:15+00:00,ADA Dataverse,
3,http://dx.doi.org/10.26193/07R31R,2.0,2019-05-13 06:50:29+00:00,ADA Dataverse,
4,http://dx.doi.org/10.26193/0AF6TZ,5.0,2022-02-02 23:01:26+00:00,ADA Dataverse,


In [10]:
# basicDatasetMetadataDF.to_csv('basicDatasetMetadataDF.csv', index=False)

In [22]:
# Load the funding ("grant information") metadata and discard the dataset_pid
# column in the same expression
grantInformationMetadataDF = (
    pd.read_csv(
        'grant_information(citation)_2022.10.02-2022.10.03.csv',
        na_filter=False)
    .drop(columns=['dataset_pid'])
)
print(len(grantInformationMetadataDF))

124072


In [40]:
# Keep only the grant information that belongs to each dataset's latest version:
# an inner join with basicDatasetMetadataDF on PID URL and version number leaves
# out the rows that have no match among the latest versions
grantInformationLatestVersionDF = (
    grantInformationMetadataDF
    .merge(basicDatasetMetadataDF,
           how='inner',
           on=['dataset_pid_url', 'dataset_version_number'])
    .reset_index(drop=True)
    # Drop these columns from the merged result; the later join takes them
    # from basicDatasetMetadataDF
    .drop(columns=['dataset_version_create_time', 'installation', 'dataverse_alias'])
)

grantInformationLatestVersionDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,grantNumberAgency,grantNumberValue
0,https://doi.org/10.17026/dans-2zm-dsmz,1.0,NWO,380-60-007
1,https://doi.org/10.17026/dans-xq9-4s24,1.0,NWO,380-60-007
2,https://doi.org/10.17026/dans-z2f-qdzn,1.0,NWO,380-60-007
3,https://doi.org/10.17026/dans-xg6-z9ke,1.0,NWO,380-60-007
4,https://doi.org/10.7910/DVN/IJEJGR,1.0,Bill and Melinda Gates Foundation (BMGF),


In [24]:
# Import contributor metadata, where a funding agency might be listed as a contributor
contributorMetadataDF = pd.read_csv(
    'contributor(citation)_2022.10.02-2022.10.03.csv',
    sep=',', na_filter = False)

# Remove the dataset_pid column
contributorMetadataDF = contributorMetadataDF.drop(columns=['dataset_pid'])

print(len(contributorMetadataDF))

126012


In [41]:
# Keep only the contributor metadata that belongs to each dataset's latest
# version: an inner join with basicDatasetMetadataDF on PID URL and version
# number leaves out the rows that have no match among the latest versions
contributorLatestVersionDF = (
    contributorMetadataDF
    .merge(basicDatasetMetadataDF,
           how='inner',
           on=['dataset_pid_url', 'dataset_version_number'])
    .reset_index(drop=True)
    # Drop these columns from the merged result; the later join takes them
    # from basicDatasetMetadataDF
    .drop(columns=['dataset_version_create_time', 'installation', 'dataverse_alias'])
)

contributorLatestVersionDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,contributorType,contributorName
0,https://doi.org/10.7910/DVN/IJEJGR,1.0,Data Collector,Institute of Social and Medicine Studies (ISMS)
1,https://doi.org/10.7910/DVN/IJEJGR,1.0,Funder,Bill and Melinda Gates Foundation (BMGF)
2,https://doi.org/10.7910/DVN/IJEJGR,1.0,Researcher,"Kim, Sunny (International Food Policy Research Institute (IFPRI))"
3,https://doi.org/10.7910/DVN/IJEJGR,1.0,Researcher,"Menon, Purnima (International Food Policy Research Institute (IFPRI))"
4,https://doi.org/10.7910/DVN/IJEJGR,1.0,Researcher,"Nguyen, Phuong Hong (International Food Policy Research Institute (IFPRI))"


In [42]:
# Prepare the basicDatasetMetadataDF, grantInformationLatestVersionDF, and
# contributorLatestVersionDF for a full outer join by indexing each of them on the
# dataset_pid_url and dataset_version_number columns

dataframes = [basicDatasetMetadataDF, grantInformationLatestVersionDF, contributorLatestVersionDF]
indexList = ['dataset_pid_url', 'dataset_version_number']
for dataframe in dataframes:
    # set_index(..., inplace=True) moves the key columns out of the dataframe and
    # into its index, so calling it a second time raises a KeyError. Skipping the
    # frames that are already indexed makes this cell safe to re-run.
    if list(dataframe.index.names) != indexList:
        dataframe.set_index(indexList, inplace=True)

In [49]:
# Outer-join the dataframes one after another, left to right, on their index.
# Note: when a dataset has several rows in more than one of the joined
# dataframes, the result holds one row per combination of those rows.
joinedDF = dataframes[0]
for nextDataframe in dataframes[1:]:
    joinedDF = joinedDF.join(nextDataframe, how='outer')

# Turn the index levels back into ordinary columns
joinedDF = joinedDF.reset_index()
joinedDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,installation,dataverse_alias,grantNumberAgency,grantNumberValue,contributorType,contributorName
0,http://dx.doi.org/10.26193/00HBWG,2.0,2019-05-13 08:43:01+00:00,ADA Dataverse,,,,Funder,"General Practice Evaluation Program, Department of Health, Housing and Community Services"
1,http://dx.doi.org/10.26193/01P0AI,2.0,2019-05-13 06:23:26+00:00,ADA Dataverse,,,,Data Collector,Roy Morgan
2,http://dx.doi.org/10.26193/04F7C1,2.0,2019-05-13 10:02:15+00:00,ADA Dataverse,,,,Funder,Australian Research Council
3,http://dx.doi.org/10.26193/04F7C1,2.0,2019-05-13 10:02:15+00:00,ADA Dataverse,,,,Data Collector,Workplace Research Centre
4,http://dx.doi.org/10.26193/07R31R,2.0,2019-05-13 06:50:29+00:00,ADA Dataverse,,,,,


In [50]:
# Count the distinct dataset PID URLs that appear in the joined table
uniqueDatasetCount = len(joinedDF['dataset_pid_url'].unique())
print('Number of datasets in joinedDF: %s' % uniqueDatasetCount)

Number of datasets in joinedDF: 340857
