In [1]:
from functools import reduce
import numpy as np
import pandas as pd

The Dataverse team is planning additions to the dataset deposit form in the Harvard Dataverse Repository. To inform that work, we need to know:

- In Dataverse repositories, how many datasets have funding metadata?
- And among those, how many have funder names?
- Which users include funding metadata in their deposits most often?

## Prepare the data

In [2]:
# Load the CSV that lists the PID of every dataset and the Dataverse installation
# it's published in, skipping the dataset_pid and dataverse_name columns.
# Keep only the datasets whose metadata could be saved (dataverse_json_export_saved
# is True), then drop that flag column because it is no longer needed.
datasetPIDsDF = (
    pd.read_csv(
        'dataset_pids_from_most_known_dataverse_installations.csv',
        usecols=lambda columnName: columnName not in ('dataset_pid', 'dataverse_name'),
        sep=',',
        na_filter=False)
    .loc[lambda pidsDF: pidsDF['dataverse_json_export_saved'] == True]
    .drop(columns=['dataverse_json_export_saved'])
    .reset_index(drop=True)
)

datasetPIDsDF.head()

Unnamed: 0,installation,dataset_pid_url,dataverse_alias
0,CIDACS,https://doi.org/10.57833/cidacs/WV4JWB,ProjZika
1,CIDACS,https://doi.org/10.57833/cidacs/TGAIVO,ProjZika
2,CIDACS,https://doi.org/10.57833/cidacs/0FZP7V,ProjZika
3,CIDACS,https://doi.org/10.57833/cidacs/5BMSIX,ProjGates0
4,CIDACS,https://doi.org/10.57833/cidacs/I56ZVA,ProjGates0


In [3]:
# Load the basic metadata of every dataset version, skipping the columns that
# this analysis does not use. parse_dates makes sure the version creation times
# are read as datetimes rather than strings.
unusedVersionColumns = (
    'dataset_pid', 'dataset_publication_date',
    'dataset_version_state', 'publisher')

datasetVersionMetadataDF = pd.read_csv(
    'basic_metadata_2022.10.02-2022.10.03.csv',
    usecols=lambda columnName: columnName not in unusedVersionColumns,
    parse_dates=['dataset_version_create_time'],
    sep=',',
    na_filter=False)

datasetVersionMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time
0,https://doi.org/10.48370/OFD/DBJUEM,1.0,2022-05-25 10:17:50+00:00
1,https://doi.org/10.21410/7E4/4WG94W,2.1,2020-05-13 16:06:28+00:00
2,https://doi.org/10.7910/DVN/5PRYPC,4.0,2020-06-17 23:49:50+00:00
3,https://doi.org/10.17026/dans-2zm-dsmz,1.0,2022-01-27 20:15:47+00:00
4,https://doi.org/10.7910/DVN/VIE1H,248.0,2015-06-18 19:49:12+00:00


In [4]:
# Report how many datasets and how many dataset versions were loaded
print(f'Count of datasets: {datasetPIDsDF.shape[0]}')
print(f'Count of dataset versions: {datasetVersionMetadataDF.shape[0]}')

Count of datasets: 340857
Count of dataset versions: 473910


In [6]:
# From datasetVersionMetadataDF, use each version's creation time to keep only the
# most recently created version of each dataset. The resulting dataframe should
# contain the same number of rows as datasetPIDsDF.
#
# idxmax returns index *labels*, so the rows are selected with .loc (label-based)
# rather than .iloc (position-based). The two only agree while the dataframe still
# has its default 0..n-1 index, so .loc keeps this correct if the dataframe is ever
# filtered or re-indexed before this cell runs.
latestDatasetVersionsDF = (datasetVersionMetadataDF
    .loc[
        datasetVersionMetadataDF
        .groupby('dataset_pid_url')['dataset_version_create_time']
        .idxmax()]
    .reset_index(drop=True))

latestDatasetVersionsDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time
0,http://dx.doi.org/10.26193/00HBWG,2.0,2019-05-13 08:43:01+00:00
1,http://dx.doi.org/10.26193/01P0AI,2.0,2019-05-13 06:23:26+00:00
2,http://dx.doi.org/10.26193/04F7C1,2.0,2019-05-13 10:02:15+00:00
3,http://dx.doi.org/10.26193/07R31R,2.0,2019-05-13 06:50:29+00:00
4,http://dx.doi.org/10.26193/0AF6TZ,5.0,2022-02-02 23:01:26+00:00


In [7]:
# The two counts should match: one latest version per dataset
print(f'Count of datasets: {datasetPIDsDF.shape[0]}')
print(f'Count of rows in latestDatasetVersionsDF: {latestDatasetVersionsDF.shape[0]}')

Count of datasets: 340857
Count of rows in latestDatasetVersionsDF: 340857


In [8]:
# Match each dataset's latest version to its row in datasetPIDsDF by PID, which
# adds the installation column so we know which installation published each dataset
basicDatasetMetadataDF = (
    latestDatasetVersionsDF
    .merge(datasetPIDsDF, how='inner', on='dataset_pid_url')
    .reset_index(drop=True)
)

# The count of rows should equal the count of total datasets: 340,857
print(len(basicDatasetMetadataDF))

340857


In [9]:
# Preview the first rows of the joined dataframe
basicDatasetMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,installation,dataverse_alias
0,http://dx.doi.org/10.26193/00HBWG,2.0,2019-05-13 08:43:01+00:00,ADA Dataverse,
1,http://dx.doi.org/10.26193/01P0AI,2.0,2019-05-13 06:23:26+00:00,ADA Dataverse,
2,http://dx.doi.org/10.26193/04F7C1,2.0,2019-05-13 10:02:15+00:00,ADA Dataverse,
3,http://dx.doi.org/10.26193/07R31R,2.0,2019-05-13 06:50:29+00:00,ADA Dataverse,
4,http://dx.doi.org/10.26193/0AF6TZ,5.0,2022-02-02 23:01:26+00:00,ADA Dataverse,


In [10]:
# basicDatasetMetadataDF.to_csv('basicDatasetMetadataDF.csv', index=False)

In [11]:
# Load the funding (grant information) metadata, skipping the dataset_pid column.
# na_filter is left at its default here, so blank cells are read as NaN.
grantInformationMetadataDF = pd.read_csv(
    'grant_information(citation)_2022.10.02-2022.10.03.csv',
    usecols=lambda columnName: columnName != 'dataset_pid',
    sep=',')

print(f'Count of rows in grantInformationMetadataDF: {grantInformationMetadataDF.shape[0]}')

Count of rows in grantInformationMetadataDF: 124072


In [12]:
# Keep only the grant information that belongs to each dataset's latest version
# by matching on PID and version number. The columns brought in from
# basicDatasetMetadataDF are dropped again; they were only needed for the match.
columnsFromBasicMetadata = [
    'dataset_version_create_time', 'installation', 'dataverse_alias']

grantInformationLatestVersionDF = (
    grantInformationMetadataDF
    .merge(
        basicDatasetMetadataDF,
        how='inner',
        on=['dataset_pid_url', 'dataset_version_number'])
    .drop(columns=columnsFromBasicMetadata)
    .reset_index(drop=True)
)

In [13]:
# Load the contributor metadata, where a funding agency might be listed as a
# contributor, skipping the dataset_pid column.
# NOTE(review): na_filter=False keeps blank cells as empty strings instead of NaN,
# unlike the grant information import. The later `contributorName == contributorName`
# checks only exclude NaN, so a blank contributor name would still pass them.
# Confirm this is intended.
contributorMetadataDF = pd.read_csv(
    'contributor(citation)_2022.10.02-2022.10.03.csv',
    usecols=lambda columnName: columnName != 'dataset_pid',
    sep=',',
    na_filter=False)

print(contributorMetadataDF.shape[0])

126012


In [14]:
# Keep only the contributor metadata that belongs to each dataset's latest version
# by matching on PID and version number. The columns brought in from
# basicDatasetMetadataDF are dropped again; they were only needed for the match.
columnsFromBasicMetadata = [
    'dataset_version_create_time', 'installation', 'dataverse_alias']

contributorLatestVersionDF = (
    contributorMetadataDF
    .merge(
        basicDatasetMetadataDF,
        how='inner',
        on=['dataset_pid_url', 'dataset_version_number'])
    .drop(columns=columnsFromBasicMetadata)
    .reset_index(drop=True)
)

In [16]:
# List the columns of the three dataframes that are about to be combined.
# All three should have the 'dataset_pid_url' and 'dataset_version_number' columns.
columnReports = [
    ('Columns in basicDatasetMetadataDF:', basicDatasetMetadataDF),
    ('\nColumns in grantInformationLatestVersionDF:', grantInformationLatestVersionDF),
    ('\nColumns in contributorLatestVersionDF:', contributorLatestVersionDF),
]

for header, frame in columnReports:
    print(header)
    for columnName in frame.columns:
        print(columnName)

Columns in basicDatasetMetadataDF:
dataset_pid_url
dataset_version_number
dataset_version_create_time
installation
dataverse_alias

Columns in grantInformationLatestVersionDF:
dataset_pid_url
dataset_version_number
grantNumberAgency
grantNumberValue

Columns in contributorLatestVersionDF:
dataset_pid_url
dataset_version_number
contributorType
contributorName


In [17]:
# Prepare basicDatasetMetadataDF, grantInformationLatestVersionDF, and
# contributorLatestVersionDF for a full outer join on the dataset_pid_url and
# dataset_version_number columns by moving those columns into the index.
#
# set_index is called without inplace=True so that it returns new dataframes and
# leaves the three originals untouched. Mutating them in place would make this cell
# fail with a KeyError when run a second time, and would break re-running the
# earlier cells that merge on those columns.
indexList = ['dataset_pid_url', 'dataset_version_number']
dataframes = [
    df.set_index(indexList)
    for df in (
        basicDatasetMetadataDF,
        grantInformationLatestVersionDF,
        contributorLatestVersionDF)
]

In [18]:

# Outer-join the indexed dataframes one after another, left to right, then turn the
# index back into ordinary columns.
# A dataset that has several grant rows and several contributor rows appears once
# per combination, so count datasets by unique PID rather than by row.
combinedDF = dataframes[0]
for nextDF in dataframes[1:]:
    combinedDF = combinedDF.join(nextDF, how='outer')

fundingDatasetMetadataInDataverseInstallationsDF = combinedDF.reset_index()
fundingDatasetMetadataInDataverseInstallationsDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,installation,dataverse_alias,grantNumberAgency,grantNumberValue,contributorType,contributorName
0,http://dx.doi.org/10.26193/00HBWG,2.0,2019-05-13 08:43:01+00:00,ADA Dataverse,,,,Funder,"General Practice Evaluation Program, Department of Health, Housing and Community Services"
1,http://dx.doi.org/10.26193/01P0AI,2.0,2019-05-13 06:23:26+00:00,ADA Dataverse,,,,Data Collector,Roy Morgan
2,http://dx.doi.org/10.26193/04F7C1,2.0,2019-05-13 10:02:15+00:00,ADA Dataverse,,,,Funder,Australian Research Council
3,http://dx.doi.org/10.26193/04F7C1,2.0,2019-05-13 10:02:15+00:00,ADA Dataverse,,,,Data Collector,Workplace Research Centre
4,http://dx.doi.org/10.26193/07R31R,2.0,2019-05-13 06:50:29+00:00,ADA Dataverse,,,,,


In [19]:
# Finally, check that the number of distinct dataset PIDs in the combined dataframe
# matches the number of total datasets: 340,857
numberOfDatasets = (
    fundingDatasetMetadataInDataverseInstallationsDF['dataset_pid_url']
    .nunique(dropna=False)
)
print(f'Number of datasets in fundingDatasetMetadataInDataverseInstallationsDF: {numberOfDatasets}')

Number of datasets in fundingDatasetMetadataInDataverseInstallationsDF: 340857


## Exploring the data

Now that we've got the funding metadata of the latest versions of all datasets in the Dataverse installations, let's start answering our questions

### Collections with most funding metadata and most used funder agency names
- In the Harvard Dataverse Repository, which collections have datasets with the most funding metadata? This will help us figure out who to learn from when we make changes to how funding metadata is entered.
- Which funder agency names are entered most often? Knowing that might help us figure out how effective our efforts to standardize funder agency name metadata could be. For example, do the changes to the metadata fields (the "CV javascript") make it easier for depositors to enter the most popular funder agency names? How much easier?

In [20]:
# Let's start by keeping only the metadata of datasets published in the
# Harvard Dataverse Repository. The installation column is the same for every
# remaining row, so it is dropped.
datasetInHDVDF = (
    fundingDatasetMetadataInDataverseInstallationsDF
    .loc[lambda allInstallationsDF: allInstallationsDF['installation'] == 'Harvard Dataverse']
    .drop(columns=['installation'])
    .reset_index(drop=True)
)

datasetInHDVDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,dataverse_alias,grantNumberAgency,grantNumberValue,contributorType,contributorName
0,https://doi.org/10.7910/DVN/00234,1.0,2014-03-23 22:12:34+00:00,restat,,,,
1,https://doi.org/10.7910/DVN/004HG6,1.0,2020-08-28 15:42:11+00:00,levy_photos_2012,The Leon Levy Foundation,,,
2,https://doi.org/10.7910/DVN/005SCF,1.0,2021-08-26 03:17:40+00:00,levy_photos_2015,The Leon Levy Foundation,,,
3,https://doi.org/10.7910/DVN/006UPU,1.0,2021-08-13 18:15:36+00:00,levy_photos_2014,The Leon Levy Foundation,,,
4,https://doi.org/10.7910/DVN/007GT,5.0,2017-02-05 23:43:33+00:00,antislaverypetitionsma,,,,


In [21]:
# Count distinct PIDs, because a dataset can span several rows
hdvDatasetCount = datasetInHDVDF['dataset_pid_url'].nunique(dropna=False)
print('Number of datasets in datasetInHDVDF: %s' % hdvDatasetCount)

Number of datasets in datasetInHDVDF: 80278


In [22]:
# Now let's keep the rows that have funding metadata in any of the three places we
# expect it: grantNumberAgency, grantNumberValue, or contributorName when
# contributorType is "Funder". A value counts as present when it is not NaN.
fundingDatasetMetadataInHDVDF = (
    datasetInHDVDF
    .loc[lambda hdvDF:
        hdvDF['grantNumberAgency'].notna()
        | hdvDF['grantNumberValue'].notna()
        | ((hdvDF['contributorType'] == 'Funder') & hdvDF['contributorName'].notna())]
    .reset_index(drop=True)
)

In [23]:
# Preview the first rows that have funding metadata
fundingDatasetMetadataInHDVDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,dataverse_alias,grantNumberAgency,grantNumberValue,contributorType,contributorName
0,https://doi.org/10.7910/DVN/004HG6,1.0,2020-08-28 15:42:11+00:00,levy_photos_2012,The Leon Levy Foundation,,,
1,https://doi.org/10.7910/DVN/005SCF,1.0,2021-08-26 03:17:40+00:00,levy_photos_2015,The Leon Levy Foundation,,,
2,https://doi.org/10.7910/DVN/006UPU,1.0,2021-08-13 18:15:36+00:00,levy_photos_2014,The Leon Levy Foundation,,,
3,https://doi.org/10.7910/DVN/00IIWB,1.0,2022-03-16 01:03:47+00:00,levy_photos_2013,The Leon Levy Foundation,,,
4,https://doi.org/10.7910/DVN/00VQS7,1.0,2021-12-16 19:14:00+00:00,levy_photos_1986,The Leon Levy Foundation,,,


In [24]:
# Count distinct PIDs, because a dataset can span several rows
print(f'Number of datasets in fundingDatasetMetadataInHDVDF: {fundingDatasetMetadataInHDVDF["dataset_pid_url"].nunique(dropna=False)}')

Number of datasets in fundingDatasetMetadataInHDVDF: 30225


So of the 80,278 datasets in the Harvard Dataverse Repository as of early October 2022, 30,225 had metadata about funding in one of the three fields where we expect it.

What's entered most often in the grantNumberAgency (Funding Information Name) field?

In [25]:
# Build a dataframe that lists each funder name entered in grantNumberAgency along
# with the number of datasets that have that funder name, most common first.
# Duplicate (dataset, funder name) pairs are removed first so that each dataset is
# counted once per funder name.
datasetCountByFundingAgencyNamesInHDV = (
    fundingDatasetMetadataInHDVDF[['dataset_pid_url', 'grantNumberAgency']]
    .dropna(subset=['grantNumberAgency'])
    .drop_duplicates()
    .groupby(['grantNumberAgency'])
    .count()
    .rename(columns={'dataset_pid_url': 'count_of_datasets'})
    .sort_values(by=['count_of_datasets'], ascending=False)
    .reset_index()
)

datasetCountByFundingAgencyNamesInHDV.head(10)

Unnamed: 0,grantNumberAgency,count_of_datasets
0,The Leon Levy Foundation,28167
1,United States Agency for International Development (USAID),230
2,National Science Foundation,89
3,Bill and Melinda Gates Foundation,74
4,Bill and Melinda Gates Foundation (BMGF),70
5,NSF,62
6,Bill & Melinda Gates Foundation,57
7,NIH,57
8,"U.S. Department of Energy, Office of Science, Basic Energy Sciences",47
9,NASA,32


It's most likely that the Leon Levy collection has the most datasets with a funder name in their grantNumberAgency (Funding Information Name) field.

It also looks like, for datasets with other funder name metadata, we might have to deal with the variations in spellings, maybe using a deduplication or fuzzy matching library, in order to see which funders are added to the funding metadata most often. For example, this shows only 57 datasets with the value "NIH" in the Funding Information Agency field, but I've found more than 57 datasets created from NIH-funded research, with other values in the field, such as "National Institute of Health".

For now, let's see which collections have the most datasets with funding metadata, excluding the Leon Levy collections.

In [26]:
# Count the datasets with funding metadata in each collection, most first, leaving
# out every collection whose alias contains "levy_photos".
# After the groupby the collection alias is the index, so the filter reads it from there.
datasetsInHDVByCollection_NoLevy = (
    fundingDatasetMetadataInHDVDF[['dataset_pid_url', 'dataverse_alias']]
    .drop_duplicates()
    .groupby(['dataverse_alias'])
    .count()
    .loc[lambda countsDF: ~countsDF.index.str.contains("levy_photos")]
    .rename(columns={'dataset_pid_url': 'count_of_datasets'})
    .sort_values(by=['count_of_datasets'], ascending=False)
    .reset_index()
)

datasetsInHDVByCollection_NoLevy.head(10)

Unnamed: 0,dataverse_alias,count_of_datasets
0,IFPRI,339
1,harvard,201
2,AfricaRISING,136
3,worldfish,113
4,CIAT,108
5,ipa,65
6,AllianceBioversityCIAT,50
7,IMASC_Publication_Data,42
8,cenaptnmr,40
9,hgis-indias,30


The IFPRI collections (IFPRI and AfricaRISING), the worldfish collection and the CIAT collection have the most datasets with funding metadata. ("harvard" is the alias of the repository's main collection, where anyone can add data.)

What have the depositors of those datasets entered in the metadata?


In [39]:
# fundingDatasetMetadataInHDVDF.to_csv('fundingDatasetMetadataInHDVDF.csv', index=False)

In [29]:
def funder_names_in_collections(dataframe, collectionAliasesList):
    """Count the datasets per funder name within the given collections.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'dataset_pid_url', 'grantNumberAgency' and
        'dataverse_alias'.
    collectionAliasesList : list of str
        Aliases of the collections whose datasets should be counted.

    Returns
    -------
    pandas.DataFrame
        One row per funder name, with the columns 'grantNumberAgency' and
        'count_of_datasets', sorted by 'count_of_datasets' in descending order.
    """
    # Rows that belong to one of the requested collections
    inRequestedCollections = dataframe['dataverse_alias'].isin(collectionAliasesList)
    # Rows that actually have a funding agency name
    hasFunderName = dataframe['grantNumberAgency'].notna()

    # One row per distinct (dataset, funder name) pair, so that a dataset is
    # counted only once for each funder name
    datasetFunderPairsDF = (
        dataframe
        .loc[
            inRequestedCollections & hasFunderName,
            ['dataset_pid_url', 'grantNumberAgency']]
        .drop_duplicates()
    )

    # Count the datasets for each funder name, most common first
    datasetCountsDF = datasetFunderPairsDF.groupby(['grantNumberAgency']).count()
    datasetCountsDF = datasetCountsDF.rename(
        columns={'dataset_pid_url': 'count_of_datasets'})
    datasetCountsDF = datasetCountsDF.sort_values(
        by=['count_of_datasets'], ascending=False)
    return datasetCountsDF.reset_index()


In [30]:
# Funder names entered for datasets in the two IFPRI collections
ifpriCollectionAliases = ['IFPRI', 'AfricaRISING']
datasetCountByFundingAgencyNamesIFPRI = funder_names_in_collections(
    fundingDatasetMetadataInHDVDF, ifpriCollectionAliases)

datasetCountByFundingAgencyNamesIFPRI.head()

Unnamed: 0,grantNumberAgency,count_of_datasets
0,United States Agency for International Development (USAID),226
1,Bill and Melinda Gates Foundation (BMGF),64
2,Bill and Melinda Gates Foundation,24
3,World Bank,24
4,Bill & Melinda Gates Foundation (BMGF),23


In [31]:
# Funder names entered for datasets in the worldfish collection
worldfishCollectionAliases = ['worldfish']
datasetCountByFundingAgencyNamesWorldfish = funder_names_in_collections(
    fundingDatasetMetadataInHDVDF, worldfishCollectionAliases)

datasetCountByFundingAgencyNamesWorldfish.head()

Unnamed: 0,grantNumberAgency,count_of_datasets
0,European Commission - IFAD,2
1,European Commission-IFAD Grant,1
2,Feed the Future Bangladesh Aquaculture and Nutrition Activity,1
3,Feed the Future Innovation Lab for Fish,1
4,GTZ,1


In [43]:
# Funder names entered for datasets in the CIAT collection
ciatCollectionAliases = ['CIAT']
datasetCountByFundingAgencyNamesCIAT = funder_names_in_collections(
    fundingDatasetMetadataInHDVDF, ciatCollectionAliases)

datasetCountByFundingAgencyNamesCIAT.head()

Unnamed: 0,grantNumberAgency,count_of_datasets
0,"CGIAR Research Program on Climate Change, Agriculture and Food Security - CCAFS",16
1,International Center for Tropical Agriculture - CIAT,15
2,Bill & Melinda Gates Foundation,14
3,United States Agency for International Development - USAID,13
4,International Fund for Agricultural Development - IFAD,7


### Duplicate funding metadata

There are two fields where depositors can enter the names of funders: the Funding Information Agency field and the Contributor field when they choose the Contributor Type "Funder". How often have depositors filled in both fields for the same dataset, and who's done it? By learning these things, we can see how big the problem is and who's used both fields. Later we can ask those people why. We need to learn if the design of the fields is meeting some need that we weren't aware of.

For now, let's continue looking only at the latest version of each dataset. This might cause a further undercount of the number of times this issue has actually occurred. For example, the first version of a dataset might have both fields filled but the latest might have only one. By considering only the latest version of each dataset, we'll miss cases like this. Eventually we'll have to think about what to do about the metadata of previous dataset versions.

Questions
- In the Harvard Dataverse Repository, how many datasets have values in the Funder Information fields and in the Contributor field when Contributor Type is "Funder"?
- How many datasets have funding metadata in their Contributor field and not in their Funder Information fields?
- How often are the same values in both fields? For example, one dataset might have "NIH" in the Funder Information Agency field and in the Contributor Name field when the Contributor Type is "Funder". How often do things like this happen?
- How often are different values in both fields?

In [80]:
# Let's remind ourselves what information we have in the fundingDatasetMetadataInHDVDF dataframe
fundingDatasetMetadataInHDVDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,dataverse_alias,grantNumberAgency,grantNumberValue,contributorType,contributorName
0,https://doi.org/10.7910/DVN/004HG6,1.0,2020-08-28 15:42:11+00:00,levy_photos_2012,The Leon Levy Foundation,,,
1,https://doi.org/10.7910/DVN/005SCF,1.0,2021-08-26 03:17:40+00:00,levy_photos_2015,The Leon Levy Foundation,,,
2,https://doi.org/10.7910/DVN/006UPU,1.0,2021-08-13 18:15:36+00:00,levy_photos_2014,The Leon Levy Foundation,,,
3,https://doi.org/10.7910/DVN/00IIWB,1.0,2022-03-16 01:03:47+00:00,levy_photos_2013,The Leon Levy Foundation,,,
4,https://doi.org/10.7910/DVN/00VQS7,1.0,2021-12-16 19:14:00+00:00,levy_photos_1986,The Leon Levy Foundation,,,


In [32]:
# Now let's keep only the rows that have a funder name in both places: a
# grantNumberAgency value and a contributorName whose contributorType is "Funder".
# The rows are ordered by PID so that each dataset's rows sit together.
duplicateFundingFieldsInHDV = (
    fundingDatasetMetadataInHDVDF
    .loc[lambda fundingDF:
        fundingDF['grantNumberAgency'].notna()
        & (fundingDF['contributorType'] == 'Funder')
        & fundingDF['contributorName'].notna()]
    .sort_values(by=['dataset_pid_url'], ascending=True)
    .reset_index(drop=True)
)

duplicateFundingFieldsInHDV.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,dataverse_alias,grantNumberAgency,grantNumberValue,contributorType,contributorName
0,https://doi.org/10.7910/DVN/064X5M,1.3,2019-04-10 15:48:38+00:00,AfricaRISING,United States Agency for International Development (USAID),,Funder,United States Agency for International Development (USAID)
1,https://doi.org/10.7910/DVN/0GVFDK,1.0,2021-04-13 13:50:58+00:00,LAPS,Inter-American Development Bank,1300600-01-PEC,Funder,Inter-American Development Bank
2,https://doi.org/10.7910/DVN/0TGTF0,2.3,2019-04-10 15:55:53+00:00,AfricaRISING,United States Agency for International Development (USAID),,Funder,United States Agency for International Development (USAID)
3,https://doi.org/10.7910/DVN/1R3F3U,1.0,2022-08-30 12:23:56+00:00,IFPRI,United States Agency for International Development (USAID),,Funder,United States Agency for International Development (USAID)
4,https://doi.org/10.7910/DVN/1R3F3U,1.0,2022-08-30 12:23:56+00:00,IFPRI,United States Agency for International Development (USAID),,Funder,Livelihoods and Food Security Trust Fund (LIFT)


In [33]:
# Count distinct PIDs, because a dataset can span several rows
datasetCountDuplicateFundingFields = (
    duplicateFundingFieldsInHDV['dataset_pid_url'].nunique(dropna=False)
)
print(f'Number of datasets with metadata in both funding metadata fields: {datasetCountDuplicateFundingFields}')

Number of datasets with metadata in both funding metadata fields: 250


In [34]:
# Save the rows that have funder names in both fields to a CSV file, leaving out
# the dataframe's index
duplicateFundingFieldsInHDV.to_csv('duplicateFundingFieldsInHDV.csv', index=False)

How many datasets have funding metadata in their Contributor field and not in their Grant Information fields?

In [97]:
# Keep the rows that name a funder only in the Contributor field: contributorType is
# "Funder" and contributorName has a value, while grantNumberAgency is missing
contributorsButNoFundingInfomration = (
    fundingDatasetMetadataInHDVDF
    .loc[lambda fundingDF:
        (fundingDF['contributorType'] == 'Funder')
        & fundingDF['contributorName'].notna()
        & fundingDF['grantNumberAgency'].isna()]
    .reset_index(drop=True)
)

In [98]:
# Preview the first rows where the funder is named only in the Contributor field
contributorsButNoFundingInfomration.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,dataverse_alias,grantNumberAgency,grantNumberValue,contributorType,contributorName
0,https://doi.org/10.7910/DVN/07W79B,1.0,2022-01-05 03:05:32+00:00,worldfish,,,Funder,FISH CRP
1,https://doi.org/10.7910/DVN/0EOCBE,1.0,2021-12-02 07:07:56+00:00,worldfish,,,Funder,"Feed the Future, USAID"
2,https://doi.org/10.7910/DVN/0RTJP3,1.2,2021-08-25 07:43:29+00:00,NPP,,,Funder,FISH CRP
3,https://doi.org/10.7910/DVN/1732LM,1.5,2020-10-13 07:12:01+00:00,worldfish,,,Funder,Indonesia Endowment Fund for Education (LPDP Scholarship)
4,https://doi.org/10.7910/DVN/1732LM,1.5,2020-10-13 07:12:01+00:00,worldfish,,,Funder,Aceh Aquaculture Cooperative (AAC)


In [99]:
# contributorsButNoFundingInfomration.to_csv('contributorsButNoFundingInfomration.csv', index=False)