In [None]:
import csv
from functools import reduce
import numpy as np
import pandas as pd
import panel as pn
import plotly.express as px
import statsmodels

# Initialise the Panel extension; it is called with no arguments, so no optional
# Panel extensions are requested here.
pn.extension()

The terms_metadata.tab file contains some basic metadata and the "Terms" metadata of all published versions of every dataset published in 49 known Dataverse repositories. 

Getting the data in terms_metadata.tab:
- Download the 49 zipped files at https://doi.org/10.7910/DVN/DCDKZQ. Together, the zipped files contain the metadata of every published version of every dataset published in the 49 known Dataverse repositories.
- Using your preferred method, move all JSON files into a single folder
- Run the two scripts "get_basic_metadata.py" and "get_terms_metadata.py" at https://github.com/jggautier/dataverse_scripts/tree/master/get-dataverse-metadata/parse_metadata_fields with that folder as the input. This produces two CSV files: one containing the basic metadata of all datasets (publisher names, PIDs, publication dates, version numbers, etc.) and one containing the Terms metadata for each version of each dataset.
- Using your preferred method, retain from the basic_metadata file only the 'publisher', 'persistentUrl', 'datasetVersionId', 'majorVersionNumber', and 'minorVersionNumber' columns.
- Using your preferred method, join both CSV files on their persistentUrl and datasetVersionId columns
- Export the results as a .tab file (or export as a CSV and convert it to TAB). Because the Dataverse software prefers .tab files, this format is easier to work with if you plan to publish the data in a Dataverse repository.

In [None]:
# Load the tab-separated metadata export. na_filter=False turns off pandas'
# missing-value detection, so blank cells are read as empty strings, not NaN.
data = pd.read_csv(
    'terms_metadata.tab',
    sep='\t',
    na_filter=False,
)

In [None]:
# Sanity check: distinct dataset PIDs versus total rows (one row per dataset version)
print('Number of datasets: %s' % len(data['persistentUrl'].unique()))
print('Number of dataset versions: %s' % len(data))
      

In [None]:
# Preview the first five rows (head() returns five rows by default)
data.head()


In [None]:
# Get only the metadata for the latest version of each dataset, i.e. the row with
# the highest datasetVersionId within each persistentUrl group.
# idxmax() returns index *labels*, so rows are selected with .loc. Positional
# .iloc only gives the same rows while `data` keeps its default 0..n-1 index.
# The result is sorted by publisher (ascending) and given a fresh 0..n-1 index.
latestversion = (
    data.loc[data.groupby('persistentUrl')['datasetVersionId'].idxmax()]
    .sort_values(by=['publisher'])
    .reset_index(drop=True)
)


In [None]:
# Sanity check: after keeping one row per dataset, both counts should match
print('Number of datasets: %s' % len(latestversion['persistentUrl'].unique()))
print('Number of dataset versions: %s' % len(latestversion))


In [None]:
# Preview the first five rows (head() returns five rows by default)
latestversion.head()


In [None]:
# Turn empty or whitespace-only cells into NaN so they are treated as missing
latestversion = latestversion.replace(
    to_replace=r'^\s*$',
    value=np.nan,
    regex=True,
)

In [None]:
# What are repositories putting in their datasets' license fields?
latestversion['license'].unique()

In [None]:
# Which datasets mention the CC0 waiver, CC BY or "creative commons" in their
# termsOfUse field?
# The first three patterns are case-sensitive; "creative commons" is matched
# case-insensitively.
# termsOfUse may hold NaN (blank cells are replaced with NaN earlier), and
# str.contains() propagates NaN by default. na=False makes those rows an explicit
# non-match, so the filter is a strictly boolean mask.
data_with_licenses = latestversion[
    latestversion['termsOfUse'].str.contains('CC0', na=False)
    | latestversion['termsOfUse'].str.contains('CCBY', na=False)
    | latestversion['termsOfUse'].str.contains('CC BY', na=False)
    | latestversion['termsOfUse'].str.contains('creative commons', case=False, na=False)
]

In [None]:
# Sanity check: number of distinct datasets whose termsOfUse matched a license pattern
print('Number of datasets: %s' % len(data_with_licenses['persistentUrl'].unique()))


In [None]:
# Of those datasets, which have any text in their other Terms metadata?
# A field counts as filled when its value is not NaN. notna() states that
# directly, instead of relying on the `x == x` trick (NaN never equals itself).
other_terms_fields = [
    'termsOfAccess',
    'availabilityStatus',
    'citationRequirements',
    'conditions',
    'confidentialityDeclaration',
    'contactForAccess',
    'dataaccessPlace',
    'depositorRequirements',
    'disclaimer',
    'originalArchive',
    'restrictions',
    'sizeOfCollection',
    'specialPermissions',
    'studyCompletion',
]

# Keep the rows where at least one of the other Terms fields is filled in
data_with_licenses_and_other_terms = data_with_licenses[
    data_with_licenses[other_terms_fields].notna().any(axis=1)
]


In [None]:
# Sanity check: number of distinct datasets that also have other Terms metadata
print('Number of datasets: %s' % len(data_with_licenses_and_other_terms['persistentUrl'].unique()))


In [None]:
# Preview the first five rows (head() returns five rows by default)
data_with_licenses_and_other_terms.head()

In [None]:
# Write the filtered datasets to a CSV file, leaving out the DataFrame index
file = 'data_with_licenses_and_other_terms.csv'
data_with_licenses_and_other_terms.to_csv(
    path_or_buf=file,
    index=False,
)