In [1]:
from functools import reduce
import pandas as pd

## Prepare the data

In [4]:
# Load the CSV file that lists the PID of every dataset, the Dataverse installation
# it is published in and the Dataverse collection it is published in.
# The 'dataset_pid' and 'dataverse_name' columns are skipped while reading.
datasetPIDsDF = (
    pd.read_csv(
        filepath_or_buffer='dataset_pids_from_most_known_dataverse_installations_2023.08.csv',
        usecols=lambda columnName: columnName not in ('dataset_pid', 'dataverse_name'),
        sep=',',
        na_filter=False)
    # Keep only datasets whose metadata export was saved, and leave out ODISSEI_Portal,
    # since that installation only indexes metadata from other repositories
    .query(
        '(dataverse_json_export_saved == True) and\
        (dataverse_installation_name != "ODISSEI_Portal")')
    # The flag column has served its purpose once the rows are filtered
    .drop('dataverse_json_export_saved', axis='columns')
    .reset_index(drop=True)
)

datasetPIDsDF.head()

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type
0,UNB_Libraries_Dataverse,https://doi.org/10.25545/NVN79Z,snb,Service New Brunswick,ORGANIZATIONS_INSTITUTIONS
1,UNB_Libraries_Dataverse,https://doi.org/10.25545/YB60PU,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP
2,UNB_Libraries_Dataverse,https://doi.org/10.25545/WHD3KC,blightchemistry,Blight Chemistry Research Omniverse,RESEARCH_GROUP
3,UNB_Libraries_Dataverse,https://doi.org/10.25545/6TZWCG,Ir-CationBinding,Elucidation of Charge Contributions in Iridium-Chelated Hydrogen-Bonding Systems,JOURNALS
4,UNB_Libraries_Dataverse,https://doi.org/10.25545/NYM13B,IFMLAB,Integrated Forest Management Lab,RESEARCH_GROUP


In [52]:
# Load the funding information metadata, skipping the dataset_pid column
grantInformationMetadataDF = pd.read_csv(
    filepath_or_buffer='funding_information(citation)_2023.08.22-2023.08.28.csv',
    usecols=lambda columnName: columnName != 'dataset_pid',
    sep=',',
    na_filter=False)

# nunique(dropna=False) gives the same number as len(pd.unique(...))
print(f'Count of distinct dataset PIDs in grantInformationMetadataDF: {grantInformationMetadataDF["dataset_pid_url"].nunique(dropna=False)}')

grantInformationMetadataDF.head()

Count of datasets: 390401


Unnamed: 0,dataverse_installation_name,count_of_all_datasets
0,ACSS_Dataverse,137
1,ADA_Dataverse,1648
2,ASU_Library_Research_Data_Repository,61
3,AUSSDA_Dataverse,1517
4,Abacus,2417
5,Arca_Dados,726
6,Borealis,15001
7,CESA_-_Repositorio_de_datos_académicos,4
8,CIDACS,114
9,CIFOR,283


In [54]:
# Load the contributor metadata without the dataset_pid column, then keep only the
# contributors whose contributor type is "Funder"
contributorMetadataDF = (
    pd.read_csv(
        filepath_or_buffer='contributor(citation)_2023.08.22-2023.08.28.csv',
        usecols=lambda columnName: columnName != 'dataset_pid',
        sep=',',
        na_filter=False)
    .query('(contributorType == "Funder")')
    .reset_index(drop=True)
)

# nunique(dropna=False) gives the same number as len(pd.unique(...))
print(f'Count of distinct dataset PIDs in contributorMetadataDF: {contributorMetadataDF["dataset_pid_url"].nunique(dropna=False)}')

contributorMetadataDF.head()

Count of distinct dataset PIDs in contributorMetadataDF: 8655


Unnamed: 0,dataset_pid_url,dataset_publication_date,dataset_version_number,dataset_version_create_time,contributorType,contributorName
0,https://doi.org/10.5683/SP3/6FFGZN,2021-11-04,1.2,2021-12-09T18:34:39Z,Funder,Compute Ontario
1,https://doi.org/10.5683/SP3/XCS9W2,2021-11-04,1.2,2021-12-10T15:12:54Z,Funder,Compute Ontario
2,https://doi.org/10.5683/SP3/N4FNRC,2021-11-04,1.2,2021-12-09T14:49:06Z,Funder,Compute Ontario
3,https://doi.org/10.5683/SP3/DGXLXB,2021-11-04,1.2,2021-12-10T15:55:51Z,Funder,Compute Ontario
4,https://doi.org/10.5683/SP3/4WSVNS,2021-11-04,1.2,2021-12-08T21:45:32Z,Funder,Compute Ontario


In [55]:
# Outer-join the "Funder" contributors with the funding information on the columns
# that identify a dataset version, so a dataset with metadata in only one of the
# two sources is kept (the other source's columns are NaN for it)
funderAndContributorFunderMetadataDF = (
    contributorMetadataDF
    .merge(
        right=grantInformationMetadataDF,
        how='outer',
        on=[
            'dataset_pid_url',
            'dataset_publication_date',
            'dataset_version_number',
            'dataset_version_create_time'])
    .reset_index(drop=True)
)

funderAndContributorFunderMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_publication_date,dataset_version_number,dataset_version_create_time,grantNumberAgency,grantNumberValue,contributorType,contributorName
0,https://doi.org/10.7910/DVN/QD0V0H,2021-09-20,1.0,2021-07-29T19:45:43Z,The Leon Levy Foundation,,,
1,https://doi.org/10.17026/dans-xhp-2h3w,2019-10-02,1.0,2022-01-31T03:58:32Z,NWO,380-60-007,,
2,https://doi.org/10.7910/DVN/FHUNGL,2021-10-19,1.0,2021-08-10T15:31:09Z,The Leon Levy Foundation,,,
3,https://hdl.handle.net/11272.1/AB2/ZZMOPP,2019-05-15,1.0,2020-08-21T23:23:32Z,Air Force Research Laboratory and Defense Advance Research Projects Agency,Air Force Research Laboratory and Defense Advance Research Projects Agency: FA8750-13-2-0045,,
4,https://doi.org/10.17026/dans-xxz-qvjx,2020-06-20,1.0,2022-01-29T06:23:21Z,NWO,380-60-007,,


In [99]:
# Attach installation and collection details to the funder metadata. The inner join
# keeps only datasets that appear in both datasetPIDsDF and the funder metadata.
allFunderNameDataDF = (
    datasetPIDsDF
    .merge(
        right=funderAndContributorFunderMetadataDF,
        how='inner',
        on='dataset_pid_url')
    .reset_index(drop=True)
)

allFunderNameDataDF.head()

Unnamed: 0,dataverse_installation_name,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type,dataset_publication_date,dataset_version_number,dataset_version_create_time,grantNumberAgency,grantNumberValue,contributorType,contributorName
0,UNB_Libraries_Dataverse,https://doi.org/10.25545/ZXDJZ7,MarsLab,Mars Science Laboratory at UNB,LABORATORY,2020-09-12,1.0,2020-09-10T18:59:57Z,Canadian Space Agency,,,
1,Harvard_Dataverse,https://doi.org/10.7910/DVN/RXBNCZ,0000-0002-0068-6933,IgboSynCorp Dataverse,RESEARCH_GROUP,2022-07-08,1.0,2022-07-04T22:48:06Z,"Meridian Institute, 105 Village Place, Dillion, Colorado 80435, United States of America",2020-02--3075551051,Funder,Lacuna Fund
2,Harvard_Dataverse,https://doi.org/10.7910/DVN/YB9FWK,0000-0002-0068-6933,IgboSynCorp Dataverse,RESEARCH_GROUP,2022-07-07,1.0,2022-06-21T06:21:20Z,Lacuna Fund of the Meridian Institute,2020-02--3075551051,,
3,Harvard_Dataverse,https://doi.org/10.7910/DVN/EPP4MA,2001_US_race_crime_survey,2001 US Race & Crime Survey Dataverse,RESEARCH_PROJECTS,2015-06-18,1.0,2015-06-18T22:55:51Z,National Science Foundation,9906346,,
4,Harvard_Dataverse,https://doi.org/10.7910/DVN/97Q2B8,3EA-Lebanon,3EA Lebanon Impact Data,RESEARCH_PROJECTS,2022-01-31,1.1,2023-03-02T14:59:48Z,,,Funder,Dubai Cares


In [57]:
# Export dataframe to a CSV file
# allFunderNameDataDF.to_csv('allFunderNameDataDF.csv', index=False)

## Exploring the data

Now that we've got the funding metadata of many datasets in known Dataverse repositories, as of late August 2023, let's start answering our questions.

In [101]:
# Let's save the queries we'll be using later to find datasets that have metadata in either or both fields.
#
# How "missing" shows up in allFunderNameDataDF:
# - The CSV files are read with na_filter=False, so a blank cell in a CSV is an empty string.
# - The outer merge of the contributor and funding information tables leaves NaN in the
#   columns of whichever table had no row for a dataset version.
# `column == column` is False only for NaN, so it means "the dataset has a row from that table";
# `column != ""` additionally requires that a name was actually entered.

# Evaluates to an empty tuple, which createCountsDF's query() call treats as "keep every row"
returnAllRowsQuery = 'tuple()'

# A funder agency name was entered and there is no "Funder" contributor
onlyFunderAgencyNameQuery = (
    '(grantNumberAgency == grantNumberAgency and grantNumberAgency != "") and '
    'contributorName.isnull()')

# There is a "Funder" contributor and no funding information row at all
onlyContributorQuery = (
    '(contributorType == "Funder" and contributorName == contributorName) and '
    'grantNumberAgency.isnull()')

# A funder agency name was entered and there is a "Funder" contributor
bothFieldsQuery = (
    '(grantNumberAgency == grantNumberAgency and grantNumberAgency != "") and '
    '(contributorType == "Funder" and contributorName == contributorName)')

# Helper that builds a per-installation table of dataset counts for a given query
def createCountsDF(dataframe, countsQuery, nameOfCountsColumn):
    """Count distinct datasets per Dataverse installation among rows matching a query.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the 'dataverse_installation_name' and 'dataset_pid_url' columns,
        plus any column that countsQuery refers to.
    countsQuery : str
        Expression handed to DataFrame.query (evaluated with the python engine).
    nameOfCountsColumn : str
        Name given to the column holding the counts.

    Returns
    -------
    pandas.DataFrame
        One row per installation, sorted by installation name, with the columns
        'dataverse_installation_name' and nameOfCountsColumn.
    """
    matchingRows = dataframe.query(countsQuery, engine='python')

    # A dataset can span several rows (one per funder), so reduce the rows to
    # distinct (installation, dataset) pairs before counting
    distinctDatasets = (matchingRows
        .filter(items=['dataverse_installation_name', 'dataset_pid_url'])
        .drop_duplicates())

    datasetsPerInstallation = (distinctDatasets
        .groupby('dataverse_installation_name')['dataset_pid_url']
        .count()
        .sort_index(ascending=True)
        .rename(nameOfCountsColumn))

    return datasetsPerInstallation.reset_index()

In [107]:
# Now let's use that function to build one counts table per question. Each table has
# one row per installation; the third argument names the counts column.

# All datasets
countOfDatasetsPerInstallationDF = createCountsDF(
    datasetPIDsDF, returnAllRowsQuery, 'count_of_datasets')

# Datasets with funder names in either of the two fields
countOfDatasetsWithAnyFunderMetadataPerInstallationDF = createCountsDF(
    allFunderNameDataDF, returnAllRowsQuery,
    'count_of_datasets_with_funder_names_either_field')

# Datasets that have funder names only in the Contributor field
countOfFunderNameinOnlyContributorFieldsDF = createCountsDF(
    allFunderNameDataDF, onlyContributorQuery,
    'count_of_datasets_with_funders_names_only_contributor_field')

# Datasets that have funder names only in the Funder Information field
countOfFunderNameinOnlyFunderAgencyNameFieldsDF = createCountsDF(
    allFunderNameDataDF, onlyFunderAgencyNameQuery,
    'count_of_datasets_with_funder_names_only_funder_agency_name_field')

# Datasets that have funder names in both fields
countOfFunderNameinBothFieldsDF = createCountsDF(
    allFunderNameDataDF, bothFieldsQuery,
    'count_of_datasets_with_funder_names_both_fields')

In [108]:
# Let's merge these five dataframes into one table keyed on the installation name.
# set_index() is called without inplace=True so the five source dataframes are left
# untouched; mutating them in place made this cell fail with a KeyError when re-run.
indexList = ['dataverse_installation_name']
dataframes = [
    countsDF.set_index(indexList)
    for countsDF in [
        countOfDatasetsPerInstallationDF,
        countOfDatasetsWithAnyFunderMetadataPerInstallationDF,
        countOfFunderNameinOnlyContributorFieldsDF,
        countOfFunderNameinOnlyFunderAgencyNameFieldsDF,
        countOfFunderNameinBothFieldsDF
    ]
]
# Outer joins keep installations that are missing from some of the count tables
allCountsDF = reduce(lambda left, right: left.join(right, how='outer'), dataframes)

# Format data so that...
allCountsDF = (allCountsDF
    # NaN values (installation absent from a count table) are changed to 0
    .fillna(0)
    # Values in new count columns are integers instead of floats
    .astype({
        'count_of_datasets_with_funder_names_either_field': 'int',
        'count_of_datasets_with_funders_names_only_contributor_field': 'int',
        'count_of_datasets_with_funder_names_only_funder_agency_name_field': 'int',
        'count_of_datasets_with_funder_names_both_fields': 'int'})
    # Reset index so the installation name is a regular column again
    .reset_index(drop=False))

# TODO: add a column that shows the percentage of all datasets that have funder metadata in either field, e.g.
# allCountsDF['percentage_with_funder_metadata'] = (
#     (
#         allCountsDF['count_of_datasets_with_funder_names_either_field'] /
#         allCountsDF['count_of_datasets'])*100).round(1)

allCountsDF.head()

# allCountsDF.to_csv('allCountsDF.csv', index=False)

Unnamed: 0,dataverse_installation_name,count_of_datasets,count_of_datasets_with_funder_names_either_field,count_of_datasets_with_funders_names_only_contributor_field,count_of_datasets_with_funder_names_only_funder_agency_name_field,count_of_datasets_with_funder_names_both_fields
0,ACSS_Dataverse,137,17,0,17,0
1,ADA_Dataverse,1648,753,512,95,90
2,ASU_Library_Research_Data_Repository,61,14,0,14,0
3,AUSSDA_Dataverse,1517,1497,0,1497,0
4,Abacus,2417,84,0,84,0


## Exploring ADA's metadata

In [61]:
# Pull out the metadata of the datasets in ADA; the installation column is dropped
# because it would hold the same value on every row
datasetInADADF = (
    allFunderNameDataDF
    .query('dataverse_installation_name == "ADA_Dataverse"')
    .drop('dataverse_installation_name', axis='columns')
    .reset_index(drop=True)
)

print(f'Count of datasets in ADA with funder metadata: {datasetInADADF["dataset_pid_url"].nunique(dropna=False)}')
datasetInADADF.head()

Count of datasets in ADA with funder metadata: 753


Unnamed: 0,dataset_pid_url,dataverse_collection_alias,dataverse_collection_name,dataverse_collection_type,dataset_publication_date,dataset_version_number,dataset_version_create_time,grantNumberAgency,grantNumberValue,contributorType,contributorName
0,http://dx.doi.org/10.26193/00HBWG,ada-general-collection,ADA General Collection Dataverse,UNCATEGORIZED,2018-12-17,2.0,2019-05-13T08:43:01Z,,,Funder,"General Practice Evaluation Program, Department of Health, Housing and Community Services"
1,http://dx.doi.org/10.26193/04F7C1,australia-at-work,Australia at Work Dataverse,UNCATEGORIZED,2019-01-25,2.0,2019-05-13T10:02:15Z,,,Funder,Australian Research Council
2,http://dx.doi.org/10.26193/0AF6TZ,bnla,Building a New Life in Australia Dataverse,RESEARCH_PROJECTS,2019-09-18,5.0,2022-02-02T23:01:26Z,,,Funder,Department of Social Services
3,http://dx.doi.org/10.26193/0AF6TZ,bnla,Building a New Life in Australia Dataverse,RESEARCH_PROJECTS,2019-09-18,5.0,2022-02-02T23:01:26Z,,,Funder,Department of Immigration and Border Protection
4,http://dx.doi.org/10.26193/0AGXC9,australian_historical_criminal_justice_data,Australian Historical Criminal Justice Data Dataverse,RESEARCH_PROJECTS,2020-05-12,1.0,2020-05-12T00:14:01Z,Australian Research Council,FL130100050,,


In [62]:
# datasetInADADF.to_csv('datasetInADADF.csv', index=False)

Let's get a count of these datasets that have:
- Funder metadata in only the Contributor field, where Contributor Type is "Funder"
- Funder metadata in only the Funder Information fields
- Funder metadata in both the Contributor field, where Contributor Type is "Funder", and the Funder Information field
- Funder metadata in the Contributor field, where Contributor Type is "Funder" and in the Funding Information ID field, but nothing in the Funding Information Agency field. (There are at least 57 of these datasets.)

In [63]:
# ADA datasets whose funder names appear only in the Contributor field
# (Contributor Type "Funder"), selected with the saved onlyContributorQuery
contributorFunderOnlyinADADF = (
    datasetInADADF
    .query(onlyContributorQuery, engine='python')
    .reset_index(drop=True)
)

print(f'Count of datasets with funder metadata only as a Contributor: {contributorFunderOnlyinADADF["dataset_pid_url"].nunique(dropna=False)}')

Count of datasets with funder metadata only as a Contributor: 512


In [64]:
# ADA datasets whose funder names appear only in the Funder Information Name field,
# selected with the saved onlyFunderAgencyNameQuery
funderAgencyNameOnlyinADADF = (
    datasetInADADF
    .query(onlyFunderAgencyNameQuery, engine='python')
    .reset_index(drop=True)
)

print(f'Count of datasets with funder metadata only in the Funder Information field: {funderAgencyNameOnlyinADADF["dataset_pid_url"].nunique(dropna=False)}')

Count of datasets with funder metadata only in the Funder Information field: 95


In [65]:
# ADA datasets with funder names in both the Funder Information Name field and the
# Contributor Name field, selected with the saved bothFieldsQuery
funderAgencyNameinBothFieldsADADF = (
    datasetInADADF
    .query(bothFieldsQuery, engine='python')
    .reset_index(drop=True)
)

print(f'Count of datasets with funder metadata in both fields: {funderAgencyNameinBothFieldsADADF["dataset_pid_url"].nunique(dropna=False)}')

Count of datasets with funder metadata in both fields: 90


In [92]:
# Save the per-installation counts table (without the row index) as a CSV file
allCountsDF.to_csv(path_or_buf='allCounts.csv', index=False)

## Exploring Harvard Dataverse's metadata
- In the Harvard Dataverse Repository, which collections have datasets with the most funding metadata? This will help us figure out who to learn from when we make changes to how funding metadata is entered.
- Which funder agency names are entered most often? Knowing that might help us figure out how effective our efforts to standardize funder agency name metadata could be. For example, do the changes to the metadata fields (the "CV javascript") make it easier for depositors to enter the most popular funder agency names? How much easier?

In [None]:
# Let's start by creating a dataframe containing only metadata of datasets published in the Harvard Dataverse Repository.
# allFunderNameDataDF identifies the installation in its 'dataverse_installation_name' column,
# where Harvard's value is "Harvard_Dataverse" (with an underscore).
datasetInHDVDF = (allFunderNameDataDF
    .query('(dataverse_installation_name == "Harvard_Dataverse")')
    # Every remaining row has the same installation name, so the column is dropped
    .drop(columns=['dataverse_installation_name'])
    .reset_index(drop=True)
    )

datasetInHDVDF.head()

In [None]:
# Number of distinct dataset PIDs in the Harvard subset (a dataset can span several rows)
countOfDatasetsInHDVDF = datasetInHDVDF['dataset_pid_url'].nunique(dropna=False)
print(f'Number of datasets in datasetInHDVDF: {countOfDatasetsInHDVDF}')

In [None]:
# Keep the rows that have funding metadata in at least one of three places:
# grantNumberAgency, grantNumberValue, or contributorName when contributorType is "Funder".
# (`column == column` is False only where the value is NaN.)
fundingDatasetMetadataInHDVDF = (
    datasetInHDVDF
    .query(
        '(grantNumberAgency == grantNumberAgency) or\
        (grantNumberValue == grantNumberValue) or\
        (contributorType == "Funder" and contributorName == contributorName)')
    .reset_index(drop=True)
)

In [None]:
# Preview the first five rows of the Harvard funding metadata
fundingDatasetMetadataInHDVDF.head(n=5)

In [None]:
# Distinct dataset PIDs among the Harvard rows that have funding metadata
print(f'Number of datasets in fundingDatasetMetadataInHDVDF: {fundingDatasetMetadataInHDVDF["dataset_pid_url"].nunique(dropna=False)}')

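So of the 80,278 datasets in the Harvard Dataverse Repository as of early October 2022, 30,222 had metadata about funding in one of the three fields where we expect it. (These figures come from an earlier run of this analysis; the data loaded at the top of this notebook was collected in late August 2023, so re-run the cells above to refresh them.)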

What's entered most often in the grantNumberAgency (Funding Information Name) field?

In [None]:
# Create a new dataframe that lists each funder name entered in grantNumberAgency and the number of datasets with that funder name
datasetCountByFundingAgencyNamesInHDV = (
    fundingDatasetMetadataInHDVDF[['dataset_pid_url', 'grantNumberAgency']]
        # Keep rows where a funder name was actually entered: NaN means the dataset has no
        # funding information row, and "" means the agency name was left blank (the CSVs
        # are read with na_filter=False, so blank cells are empty strings)
        .query('grantNumberAgency == grantNumberAgency and grantNumberAgency != ""')
        # Count each dataset once per funder name
        .drop_duplicates()
        .groupby(['grantNumberAgency']).count()
        .rename(columns={'dataset_pid_url': 'count_of_datasets'})
        .sort_values(by=['count_of_datasets'], ascending=False)
        .reset_index(drop=False)
)

datasetCountByFundingAgencyNamesInHDV.head(10)

It's most likely that the Leon Levy collection has the most datasets with a funder name in their grantNumberAgency (Funding Information Name) field.

It also looks like, for datasets with other funder name metadata, we might have to deal with the variations in spellings, maybe using a deduplication or fuzzy matching library, in order to see which funders are added to the funding metadata most often. For example, this shows only 57 datasets with the value "NIH" in the Funding Information Agency field, but I've found more than 57 datasets created from NIH-funded research, with other values in the field, such as "National Institute of Health".

For now, let's see which collections have the most datasets with funding metadata, excluding the Leon Levy collections.

In [None]:
# Count datasets with funding metadata per collection, leaving out the Leon Levy
# collections (collection aliases that contain "levy_photos").
# The collection alias column in this data is 'dataverse_collection_alias'.
datasetsInHDVByCollection_NoLevy = (
    fundingDatasetMetadataInHDVDF[['dataset_pid_url', 'dataverse_collection_alias']]
    # Count each dataset once per collection
    .drop_duplicates()
    # Filter with a boolean mask before grouping, which avoids calling .str methods inside query()
    .loc[lambda frame: ~frame['dataverse_collection_alias'].str.contains(
        'levy_photos', regex=False, na=False)]
    .groupby(['dataverse_collection_alias']).count()
    .rename(columns={'dataset_pid_url': 'count_of_datasets'})
    .sort_values(by=['count_of_datasets'], ascending=False)
    .reset_index(drop=False)
)

datasetsInHDVByCollection_NoLevy.head(10)

The IFPRI collections (IFPRI and AfricaRISING), the worldfish collection and the CIAT collection have the most datasets with funding metadata. ("harvard" is the alias of the repository's main collection, where anyone can add data.)

What have the depositors of those datasets entered in the metadata?


In [None]:
# fundingDatasetMetadataInHDVDF.to_csv('fundingDatasetMetadataInHDVDF.csv', index=False)

In [None]:
def funder_names_in_collections(dataframe, collectionAliasesList):
    """Count datasets per funder agency name within the given collections.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'dataset_pid_url', 'grantNumberAgency' and
        'dataverse_collection_alias'.
    collectionAliasesList : list of str
        Aliases of the collections whose datasets should be counted.

    Returns
    -------
    pandas.DataFrame
        Columns 'grantNumberAgency' and 'count_of_datasets', sorted from the
        most-used funder name to the least-used one.
    """
    # Keep only datasets in the given list of collection aliases
    inGivenCollections = dataframe['dataverse_collection_alias'].isin(collectionAliasesList)

    # Keep only rows that have a funding agency name: neither NaN nor an empty string
    agencyNames = dataframe['grantNumberAgency']
    hasAgencyName = agencyNames.notna() & (agencyNames != '')

    funderNamesInCollectionDF = (
        # Drop all but the needed rows and columns
        dataframe.loc[
            inGivenCollections & hasAgencyName,
            ['dataset_pid_url', 'grantNumberAgency']]
        # Count each dataset once per funder name
        .drop_duplicates()
        # Group by funding agency name
        .groupby(['grantNumberAgency']).count()
        .rename(columns={'dataset_pid_url': 'count_of_datasets'})
        # Sort by count of occurrences of each funder name
        .sort_values(by=['count_of_datasets'], ascending=False)
        .reset_index(drop=False))
    return funderNamesInCollectionDF


In [None]:
# Funder agency names used in the two IFPRI collections
datasetCountByFundingAgencyNamesIFPRI = funder_names_in_collections(
    fundingDatasetMetadataInHDVDF,
    ['IFPRI', 'AfricaRISING'])

datasetCountByFundingAgencyNamesIFPRI.head(n=5)

In [None]:
# Funder agency names used in the worldfish collection
datasetCountByFundingAgencyNamesWorldfish = funder_names_in_collections(
    fundingDatasetMetadataInHDVDF,
    ['worldfish'])

datasetCountByFundingAgencyNamesWorldfish.head(n=5)

# funderNamesList_Worldfish = list(set(datasetCountByFundingAgencyNamesWorldfish["grantNumberAgency"].values.tolist()))

In [None]:
# Funder agency names used in the collections I think are associated with CIAT
datasetCountByFundingAgencyNamesCIAT = funder_names_in_collections(
    fundingDatasetMetadataInHDVDF,
    [
        'CIAT', 'AgBio', 'AICCRA', 'CIFOR', 'gender', 'crp6', 'dapa',
        'AllianceBioversityCIATFoodConsumer',
        'AllianceBioversityCIATLandscapes',
        'AllianceBioversityCIATClimate',
        'AllianceBioversityCIATBiodiversity',
        'AllianceBioversityCIATDigital',
        'AllianceBioversityCIATCrops4NH',
        'soils',
        'AllianceBioversityCIAT',
    ])

datasetCountByFundingAgencyNamesCIAT.head(n=20)

In [None]:
# Funder agency names used in the DFEEP, ipa and jpal collections
datasetCountByFundingAgencyNamesDFEEP = funder_names_in_collections(
    fundingDatasetMetadataInHDVDF,
    ['DFEEP', 'ipa', 'jpal'])

datasetCountByFundingAgencyNamesDFEEP.head(n=5)

### Duplicate funding metadata in Harvard Dataverse

There are two fields where depositors can enter the names of funders: the Funding Information Agency field, and the Contributor field when they choose the Contributor Type "Funder". How often has this happened and who's done it? By learning these things, we can see how big the problem is and who's used both fields. Later we can ask those people why. We need to learn if the design of the fields is meeting some need that we weren't aware of.

For now, let's continue looking only at the latest version of each dataset. This might cause a further undercount of the number of times this issue has actually occurred. For example, the first version of a dataset might have both fields filled but the latest might have only one. By considering only the latest version of each dataset, we'll miss cases like this. Eventually we'll have to think about what to do about the metadata of previous dataset versions.

Questions
- In the Harvard Dataverse Repository, how many datasets have values in the Funder Information fields and in the Contributor field when Contributor Type is "Funder"?
- How many datasets have funding metadata in their Contributor field and not in their Funder Information fields?
- How often are the same values in both fields? For example, one dataset might have "NIH" in the Funder Information Agency field and in the Contributor Name field when the Contributor Type is "Funder". How often do things like this happen?
- How often are different values in both fields?

In [None]:
# A quick reminder of the columns and values held in fundingDatasetMetadataInHDVDF
fundingDatasetMetadataInHDVDF.head(n=5)

In [None]:
# Now let's query it to get only datasets that have values in both metadata fields.
# The saved bothFieldsQuery is reused so this selection agrees with the per-installation
# counts; unlike a bare NaN check, it also rejects rows whose funder agency name is an
# empty string (the CSVs are read with na_filter=False, so blank cells are "").
duplicateFundingFieldsInHDV = (fundingDatasetMetadataInHDVDF
        .query(bothFieldsQuery, engine='python')
        .sort_values(by=['dataset_pid_url'], ascending=True)
        .reset_index(drop=True))

duplicateFundingFieldsInHDV.head()

In [None]:
# Distinct dataset PIDs among the rows that have metadata in both funding fields
datasetCountDuplicateFundingFields = duplicateFundingFieldsInHDV['dataset_pid_url'].nunique(dropna=False)
print(f'Number of datasets with metadata in both funding metadata fields: {datasetCountDuplicateFundingFields}')

In [None]:
# duplicateFundingFieldsInHDV.to_csv('duplicateFundingFieldsInHDV.csv', index=False)

In [None]:
# Let's see which collections have most of these datasets.
# The collection alias column in this data is 'dataverse_collection_alias'.
countOfDuplicateFundingFieldsInHDVByCollection = (
    # Drop all but the needed columns
    duplicateFundingFieldsInHDV[[
        'dataset_pid_url', 'dataverse_collection_alias']]
        # Drop duplicate rows so each dataset is counted once per collection
        .drop_duplicates()
        # Group by collection alias
        .groupby(['dataverse_collection_alias']).count()
        .rename(columns={'dataset_pid_url': 'count_of_datasets'})
        # Sort by count of datasets in each collection
        .sort_values(by=['count_of_datasets'], ascending=False)
        .reset_index(drop=False))

countOfDuplicateFundingFieldsInHDVByCollection.head(100)

How many datasets have funding metadata in their Contributor field and not in their Grant Information fields?

### Duplicate funder metadata in other Dataverse installations

In [None]:
# Preview the funder metadata of datasets across all installations
allFunderNameDataDF.head(n=5)

In [None]:
# fundingDatasetMetadataInDataverseInstallationsDF.to_csv('fundingDatasetMetadataInDataverseInstallationsDF.csv', index=False)

In [None]:
# Let's see how many of these datasets in each installation have values in both fields.
# The saved bothFieldsQuery is reused so this selection agrees with the per-installation
# counts; unlike a bare NaN check, it also rejects rows whose funder agency name is an
# empty string (the CSVs are read with na_filter=False, so blank cells are "").
duplicateFundingFieldsInAllInstallations = (allFunderNameDataDF
    .query(bothFieldsQuery, engine='python')
    .sort_values(by=['dataset_pid_url'], ascending=True)
    .reset_index(drop=True))

duplicateFundingFieldsInAllInstallations.head()

In [None]:
# duplicateFundingFieldsInAllInstallations.to_csv('duplicateFundingFieldsInAllInstallations.csv', index=False)

In [None]:
# Count, per installation, the datasets that have funder names in both fields.
# The installation column in this data is 'dataverse_installation_name'.
countOfDatasetsWithDuplicateFundingFieldsInEachInstallationDF = (
    # Drop all but the needed columns
    duplicateFundingFieldsInAllInstallations[[
        'dataset_pid_url', 'dataverse_installation_name']]
        # Drop duplicate rows so each dataset is counted once
        .drop_duplicates()
        # Group by installation name
        .groupby(['dataverse_installation_name']).count()
        .rename(columns={'dataset_pid_url': 'count_of_datasets'})
        # Sort by count of datasets in each installation
        .sort_values(by=['count_of_datasets'], ascending=False)
        .reset_index(drop=False))

countOfDatasetsWithDuplicateFundingFieldsInEachInstallationDF.head(100)