In [1]:
import pandas as pd

# Who uses MeSH terms to describe deposits published in Dataverse repositories?

## Import, query and format metadata collected from Dataverse installations in August 2024

In [2]:
# Import metadata about all datasets
datasetInfoDf = pd.read_csv(
    '/Users/juliangautier/Documents/all_installation_metadata_2024.08.25_03.34.11/dataset_pids_from_most_known_dataverse_installations_2024.08.csv',
    usecols=lambda x: x in [
        'dataset_pid_url', 'dataverse_installation_name',
        'dataverse_collection_name', 'dataverse_collection_alias',
        'dataverse_collection_type'],
    low_memory=False
)

# Import keyword metadata
keywordsDf = pd.read_csv(
    '/Users/juliangautier/Documents/all_installation_metadata_2024.08.25_03.34.11/csv_files_with_metadata_from_most_known_dataverse_installations/keyword_2024.08.25-2024.08.30.csv',
    usecols=lambda x: x in [
        'dataset_pid_url', 'dataset_publication_date',
        'keywordValue', 'keywordVocabulary',
        'keywordVocabularyURI', 'keywordTermURI']
)

# Import topic classification metadata
topicClassificationDf = pd.read_csv(
    '/Users/juliangautier/Documents/all_installation_metadata_2024.08.25_03.34.11/csv_files_with_metadata_from_most_known_dataverse_installations/topic_classification_2024.08.25-2024.08.30.csv',
    usecols=lambda x: x in [
        'dataset_pid_url', 'dataset_publication_date',
        'topicClassValue', 'topicClassVocab',
        'topicClassVocabURI'])

# Merge the topicClassificationDf and keywordsDf dataframes
keywordsAndTopicClassDf = keywordsDf.merge(
    topicClassificationDf, how='outer',
    on=['dataset_pid_url', 'dataset_publication_date'])

In [3]:
# Include only datasets where a user wanted to use a MeSH term in a keyword or topic classification field
meshBaseUrls = ['nlm.nih.gov/mesh', 'meshb.nlm.nih.gov']
meshKeywordsAndTopicClassDf = (keywordsAndTopicClassDf
    .query(
        'keywordValue.str.contains("|".join(@meshBaseUrls)) or\
        keywordVocabulary.str.contains("mesh")==True or\
        keywordVocabularyURI.str.contains("|".join(@meshBaseUrls)) or\
        keywordTermURI.str.contains("|".join(@meshBaseUrls)) or\
        topicClassVocab.str.contains("mesh")==True or\
        topicClassValue.str.contains("|".join(@meshBaseUrls)) or\
        topicClassVocabURI.str.contains("|".join(@meshBaseUrls))',
        engine="python"
    )
)

# Merge meshKeywordsAndTopicClassDf with datasetInfoDf so we see where these datasets are published
meshKeywordsAndTopicClassDf = meshKeywordsAndTopicClassDf.merge(
    datasetInfoDf,
    how='inner',
    on=['dataset_pid_url'])

# Reorder columns
meshKeywordsAndTopicClassDf = meshKeywordsAndTopicClassDf[[
    'dataverse_installation_name',
    'dataverse_collection_name',
    'dataverse_collection_alias',
    'dataverse_collection_type',
    'dataset_pid_url',
    'dataset_publication_date',
    'keywordValue',
    'keywordVocabulary',
    'keywordTermURI',
    'keywordVocabularyURI',
    'topicClassValue',
    'topicClassVocab',
    'topicClassVocabURI'
]]

# Check dataframe
meshKeywordsAndTopicClassDf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3785 entries, 0 to 3784
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   dataverse_installation_name  3785 non-null   object
 1   dataverse_collection_name    3785 non-null   object
 2   dataverse_collection_alias   3785 non-null   object
 3   dataverse_collection_type    3212 non-null   object
 4   dataset_pid_url              3785 non-null   object
 5   dataset_publication_date     3785 non-null   object
 6   keywordValue                 3746 non-null   object
 7   keywordVocabulary            3028 non-null   object
 8   keywordTermURI               42 non-null     object
 9   keywordVocabularyURI         2999 non-null   object
 10  topicClassValue              1608 non-null   object
 11  topicClassVocab              1755 non-null   object
 12  topicClassVocabURI           1614 non-null   object
dtypes: object(13)
memory usage: 414.0

In [None]:
# Export dataframe to CSV file
meshKeywordsAndTopicClassDf.to_csv('/Users/juliangautier/Desktop/meshKeywordsAndTopicClassDf.csv', index=False)

# Explore meshKeywordsAndTopicClassDf

- How many datasets are there and which installations publish them?
- In Harvard Dataverse, which collections contain datasets where MeSH terms are used in the keyword and topic classification fields?
- In Harvard Dataverse, which datasets outside of collections include MeSH terms?

In [4]:
# How many datasets are there and which installations publish them?
datasetCount = len(pd.unique(meshKeywordsAndTopicClassDf['dataset_pid_url']))
print(datasetCount)

683


In [44]:
# Count of datasets by installation
installationCount = len(pd.unique(meshKeywordsAndTopicClassDf['dataverse_installation_name']))
print(installationCount)
datasetCountByInstallation = (
    meshKeywordsAndTopicClassDf
        [['dataverse_installation_name', 'dataset_pid_url']]
        .drop_duplicates()
        .value_counts(subset=['dataverse_installation_name'])
        .to_frame('count')
        .reset_index(drop=False, inplace=False)
    )
datasetCountByInstallation.head(installationCount)

29


Unnamed: 0,dataverse_installation_name,count
0,Harvard Dataverse,206
1,Borealis,156
2,CORA. Research Data Repository (RDR),62
3,SciELO Data,62
4,DataverseNL,44
5,Recherche Data Gouv,29
6,DataverseNO,27
7,DaRUS,16
8,UNC Dataverse,12
9,ASU Library Research Data Repository,12


In [45]:
# In Harvard Dataverse, which collections contain datasets where MeSH terms are used?
hdvMeshKeywordsAndTopicClassDf = (meshKeywordsAndTopicClassDf
    .query(
        'dataverse_installation_name == "Harvard Dataverse"'
    )
)
hdvMeshKeywordsAndTopicClassDf.info()
hdvMeshKeywordsAndTopicClassDf.to_csv('/Users/juliangautier/Desktop/hdvMeshKeywordsAndTopicClassDf.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1328 entries, 0 to 3770
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   dataverse_installation_name  1328 non-null   object
 1   dataverse_collection_name    1328 non-null   object
 2   dataverse_collection_alias   1328 non-null   object
 3   dataverse_collection_type    1328 non-null   object
 4   dataset_pid_url              1328 non-null   object
 5   dataset_publication_date     1328 non-null   object
 6   keywordValue                 1314 non-null   object
 7   keywordVocabulary            1222 non-null   object
 8   keywordTermURI               0 non-null      object
 9   keywordVocabularyURI         1210 non-null   object
 10  topicClassValue              651 non-null    object
 11  topicClassVocab              652 non-null    object
 12  topicClassVocabURI           510 non-null    object
dtypes: object(13)
memory usage: 145.2

In [27]:
collectionCount = len(pd.unique(meshKeywordsAndTopicClassDf['dataverse_collection_alias']))
print(installationCount)

datasetCountByCollection = (
    hdvMeshKeywordsAndTopicClassDf
    [['dataverse_collection_alias', 'dataset_pid_url']]
        .drop_duplicates()
        .value_counts(subset=['dataverse_collection_alias'])
        .to_frame('count')
        .reset_index(drop=False, inplace=False)
)
datasetCountByCollection.head(collectionCount)

29


Unnamed: 0,dataverse_collection_alias,count
0,harvard,85
1,biolumhub,14
2,brown,6
3,minerva-disease-atlas,5
4,acetamide,5
5,HPRC-NPA_PlanHlth_Res,4
6,McCormick_Research,4
7,HSPH_LSCHD_DatAdmin_Social,4
8,covidbr,4
9,HPRC-NPA_PlanHlth_Admin,3
