<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-modules-and-load-functions" data-toc-modified-id="Import-modules-and-load-functions-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import modules and load functions</a></span></li><li><span><a href="#Load-misc.-functions" data-toc-modified-id="Load-misc.-functions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load misc. functions</a></span></li><li><span><a href="#Get-dataverse-info" data-toc-modified-id="Get-dataverse-info-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Get dataverse info</a></span><ul class="toc-item"><li><span><a href="#Get-IDs-of-any-sub-dataverses-in-the-given-dataverse" data-toc-modified-id="Get-IDs-of-any-sub-dataverses-in-the-given-dataverse-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Get IDs of any sub-dataverses in the given dataverse</a></span></li></ul></li><li><span><a href="#Get-dataset-info" data-toc-modified-id="Get-dataset-info-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Get dataset info</a></span><ul class="toc-item"><li><span><a href="#Get-dataset-IDs-and-(sub)dataverse-names" data-toc-modified-id="Get-dataset-IDs-and-(sub)dataverse-names-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Get dataset IDs and (sub)dataverse names</a></span></li><li><span><a href="#Collect-info-about-each-dataset-and-files" data-toc-modified-id="Collect-info-about-each-dataset-and-files-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Collect info about each dataset and files</a></span></li></ul></li><li><span><a href="#Export-report-to-CSV" data-toc-modified-id="Export-report-to-CSV-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Export report to CSV</a></span></li><li><span><a href="#Get-existing-dataverse-data" data-toc-modified-id="Get-existing-dataverse-data-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Get existing dataverse data</a></span></li><li><span><a href="#Create-summary-stats-of-dataverse" 
data-toc-modified-id="Create-summary-stats-of-dataverse-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Create summary stats of dataverse</a></span></li></ul></div>

## Import modules and load functions

In [1]:
from datetime import datetime, timezone
from functools import reduce
import json
import numpy as np
import pandas as pd
import requests
import sys
import time
from urllib.parse import urlparse


# def improved_get(_dict, path, default=None):
#     for key in path.split('.'):
#         try:
#             _dict = _dict[key]
#         except KeyError:
#             return default
#     return _dict

## Load misc. functions

In [2]:
def list_to_string(list):
    """Join the given strings into one comma-separated string, sorted case-insensitively."""
    # Sort on the casefolded value so upper- and lower-case items interleave alphabetically
    ordered = sorted(list, key=lambda item: item.casefold())
    return ",".join(ordered)


def string_to_list(string):
    """Split a comma-separated string into a list of its parts."""
    # str.split already returns a new list, so the result can be returned directly
    return string.split(",")


def string_to_datetime(string):
    """Parse a timestamp in the format '%Y-%m-%dT%H:%M:%S%z' into a timezone-aware datetime."""
    return datetime.strptime(string, '%Y-%m-%dT%H:%M:%S%z')


# Timezone-aware (UTC) reference time captured once when this cell runs; later cells use it
# to compute dataset ages and to stamp the names of the exported CSV files
currentTime = datetime.now(timezone.utc)

## Get dataverse info

In [3]:
# URL of the dataverse to report on. The server root and the dataverse alias are derived from it;
# when the URL path has no alias segment, the alias falls back to an empty string (no error is raised here).
mainDataverseUrl = 'https://dataverse.harvard.edu/dataverse/mit'

parsed = urlparse(mainDataverseUrl)
server = '%s://%s' % (parsed.scheme, parsed.netloc)

# For a path like '/dataverse/<alias>', the alias is the third '/'-separated piece
pathSegments = parsed.path.split('/')
mainDataverseAlias = pathSegments[2] if len(pathSegments) > 2 else ''

In [4]:
def get_repository_metadatablocks(server):
    """Return the names of the metadata blocks listed by the repository at ``server``.

    Requests ``<server>/api/v1/metadatablocks`` and collects the 'name' of every entry
    in the response's 'data' list, in the order the API returns them.
    """
    repositoryMetadataBlocksApi = '%s/api/v1/metadatablocks' % (server)
    repositoryMetadataBlocks = requests.get(repositoryMetadataBlocksApi).json()
    return [block['name'] for block in repositoryMetadataBlocks['data']]

# Names of every metadata block known to the server; the per-dataset loop later checks
# which of these blocks each dataset actually has fields in
repositoryMetadataBlockNames = get_repository_metadatablocks(server)

In [6]:
# Get metadata about dataverse
def get_main_dataverse_json(mainDataverseUrl):
    """Fetch the metadata of the dataverse that ``mainDataverseUrl`` points to.

    The server root and dataverse alias are derived from the URL argument itself
    (previously the argument was ignored and notebook-level globals were used), so
    the function returns the metadata of whichever dataverse URL it is given.

    Parameters
    ----------
    mainDataverseUrl : str
        Dataverse page URL of the form '<scheme>://<host>/dataverse/<alias>'.

    Returns
    -------
    dict
        Parsed JSON body of ``<server>/api/dataverses/<alias>``.
    """
    parsedUrl = urlparse(mainDataverseUrl)
    dataverseServer = '%s://%s' % (parsedUrl.scheme, parsedUrl.netloc)

    # The alias is the third '/'-separated piece of a '/dataverse/<alias>' path; empty if absent
    pathSegments = parsedUrl.path.split('/')
    dataverseAlias = pathSegments[2] if len(pathSegments) > 2 else ''

    dataverseInfoApi = '%s/api/dataverses/%s' % (dataverseServer, dataverseAlias)
    response = requests.get(dataverseInfoApi)
    return response.json()

# Fetch the main dataverse's metadata once; later cells read its 'status' and 'data' entries
dataverseMetadata = get_main_dataverse_json(mainDataverseUrl)

In [7]:
if dataverseMetadata['status'] == 'ERROR':
    print('No dataverse found. Is the dataverse published on Harvard Dataverse?')

elif dataverseMetadata['status'] == 'OK':
    # The helpers below are only defined when the dataverse metadata was retrieved successfully

    def dataverse_description_exists():
        """Return True when the dataverse metadata contains a 'description' entry."""
        return 'description' in dataverseMetadata['data']


    def dataverse_tagline_exists():
        """Return True when the dataverse metadata has a 'theme' entry that contains a 'tagline'."""
        dataverseData = dataverseMetadata['data']
        return 'theme' in dataverseData and 'tagline' in dataverseData['theme']


    def dataverse_facets():
        """Return the entries of the 'data' list from the main dataverse's facets API response."""
        dataverseFacetsApi = '%s/api/dataverses/%s/facets' % (server, mainDataverseAlias)
        dataverseFacets = requests.get(dataverseFacetsApi).json()
        return list(dataverseFacets['data'])


    def dataverse_metadatablocks():
        """Return the names of the metadata blocks listed by the main dataverse's metadatablocks API."""
        dataverseMetadatablocksApi = '%s/api/dataverses/%s/metadatablocks' % (server, mainDataverseAlias)
        dataverseMetadatablocks = requests.get(dataverseMetadatablocksApi).json()
        return [block['name'] for block in dataverseMetadatablocks['data']]


### Get IDs of any sub-dataverses in the given dataverse

In [8]:
def dataverse_ids():
    """Return the IDs of the main dataverse and of every dataverse nested beneath it.

    The main dataverse's ID comes first; IDs of sub-dataverses follow in the order
    they are discovered through each dataverse's 'contents' API listing.
    """
    dataverseIDs = [dataverseMetadata['data']['id']]

    # Walk the list by position: IDs discovered while reading one dataverse are appended
    # to the end and visited in turn, so every nesting level gets covered
    position = 0
    while position < len(dataverseIDs):
        getContentsApi = '%s/api/dataverses/%s/contents' % (server, dataverseIDs[position])
        dataverseContents = requests.get(getContentsApi).json()

        dataverseIDs.extend(
            item['id'] for item in dataverseContents['data'] if item['type'] == 'dataverse'
        )
        position += 1
    return dataverseIDs

# Report how many sub-dataverses were found (the main dataverse itself is subtracted from the count).
# Note: dataverse_ids() issues one API request per dataverse it visits each time it is called.
print('\nFound 1 dataverse and %s subdataverses' % (len(dataverse_ids()) - 1))


Found 1 dataverse and 0 subdataverses


## Get dataset info

### Get dataset IDs and (sub)dataverse names

In [66]:
# Get PIDs of all published datasets in each of the dataverses
def get_datasets():
    """Return one row per dataset found in the main dataverse and its sub-dataverses.

    For every ID returned by ``dataverse_ids()`` the dataverse's name and alias are
    fetched, then its 'contents' listing is scanned for items of type 'dataset'.

    Returns
    -------
    pandas.DataFrame
        Columns 'datasetPID' (persistent URL with 'https://doi.org/' replaced by 'doi:'),
        'dataverseName' and 'dataverseUrl'. The columns are present even when no dataset
        is found, so callers can always select ``['datasetPID']``.
    """
    columns = ['datasetPID', 'dataverseName', 'dataverseUrl']
    datasetInfoDict = []
    for dataverseID in dataverse_ids():
        getDataverseInfoApi = '%s/api/dataverses/%s' % (server, dataverseID)
        dataverseInfo = requests.get(getDataverseInfoApi).json()
        dataverseName = dataverseInfo['data']['name']
        dataverseUrl = '%s/dataverse/%s' % (server, dataverseInfo['data']['alias'])

        getDataverseContentsApi = '%s/api/dataverses/%s/contents' % (server, dataverseID)
        dataverseContents = requests.get(getDataverseContentsApi).json()
        for item in dataverseContents['data']:
            if item['type'] == 'dataset':
                datasetInfoDict.append({
                    'datasetPID': item['persistentUrl'].replace('https://doi.org/', 'doi:'),
                    'dataverseName': dataverseName,
                    'dataverseUrl': dataverseUrl,
                })

    # Passing the columns explicitly keeps the frame's schema stable for an empty result
    return pd.DataFrame(datasetInfoDict, columns=columns)


In [67]:
# Number of datasets found; note that every get_datasets() call queries the API again
len(get_datasets().index)# len(report.index)

48

Create a dataframe of dataset info, such as the publication date, the release date of the latest version, and the number of versions.

_Getting this info can be slow. For example, getting the info of ~375 datasets might take 45 minutes._

### Collect info about each dataset and files

In [68]:
# MIME types of tabular files that Dataverse can convert to .tab files during ingest. A file in the
# latest version that still has one of these types is counted as an uningested tabular file.
uningestedFileTypes = ['application/x-rlang-transport', 'application/x-stata-13', 'application/x-spss-por',
                      'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'text/csv', 'text/tsv',
                      'application/x-spss-sav', 'text/comma-separated-values', 'application/x-stata',
                      'application/x-stata-14']


def _build_dataset_row(datasetPID, datasetVersions):
    """Summarize one dataset as a single report row.

    Parameters
    ----------
    datasetPID : str
        Persistent ID of the dataset (the 'doi:...' form produced by get_datasets()).
    datasetVersions : dict
        Parsed JSON of the dataset's "versions" API response. Its 'data' list must be
        non-empty; index 0 is treated as the latest version and the last index as the
        first published version.

    Returns
    -------
    dict
        One report row; the keys become the report's column names.
    """
    versions = datasetVersions['data']
    latestDatasetVersion = versions[0]
    firstDatasetVersion = versions[-1]

    publicationDate = string_to_datetime(firstDatasetVersion['releaseTime'])
    latestReleaseDate = string_to_datetime(latestDatasetVersion['releaseTime'])

    # Ages in whole days, measured from the reference time captured at the top of the notebook
    ageOfDataset = (currentTime - publicationDate).days
    # Clamp at 0 in case the latest release is stamped after currentTime was captured
    ageOfLastUpdate = max((currentTime - latestReleaseDate).days, 0)

    # Description length and related-publication counts both come from the citation block
    descriptionLength = 0
    relPubCount = 0
    relPubPIDCount = 0
    for field in latestDatasetVersion['metadataBlocks']['citation']['fields']:
        if field['typeName'] == 'dsDescription':
            descriptions = [entry['dsDescriptionValue']['value'] for entry in field['value']]
            # "N/A" is the value assigned when no description was given (pre Dataverse 4)
            if descriptions == ['N/A']:
                descriptionLength = 0
            else:
                descriptionLength += sum(len(text) for text in descriptions)
        elif field['typeName'] == 'publication':
            for value in field['value']:
                relPubCount += 1
                # Both subfields must be present for the related publication to count as having a PID
                # (the previous `'a' and 'b' in value` expression only tested the second key)
                if 'publicationIDType' in value and 'publicationIDNumber' in value:
                    relPubPIDCount += 1

    # See whether CC0 or Terms of Use metadata exists
    # NOTE(review): this comparison assumes 'license' is a plain string such as 'CC0';
    # confirm against the response format of the target Dataverse installation
    datasetLicense = latestDatasetVersion.get('license', 'None')
    termsOfUse = 'termsOfUse' in latestDatasetVersion
    termsOfAccess = 'termsOfAccess' in latestDatasetVersion
    termsExist = datasetLicense == 'CC0' or termsOfUse

    # Metadata blocks in which the dataset has at least one field
    metadataBlocks = latestDatasetVersion['metadataBlocks']
    usedMetadataBlocks = list_to_string([
        blockName for blockName in repositoryMetadataBlockNames
        if len(metadataBlocks.get(blockName, {}).get('fields', [])) > 0
    ])

    # File-level statistics of the latest version
    files = latestDatasetVersion['files']
    contentTypes = [file['dataFile']['contentType'] for file in files]
    noFileDescriptionCount = sum(1 for file in files if 'description' not in file)
    restrictedFilesCount = sum(1 for file in files if file['restricted'] is True)
    uningestedTabFilesCount = sum(1 for contentType in contentTypes if contentType in uningestedFileTypes)
    ingestedTabFilesCount = sum(1 for contentType in contentTypes if contentType == 'text/tab-separated-values')
    # File tags are stored under the optional 'categories' key of each file
    fileTagsExist = any(len(file.get('categories', [])) > 0 for file in files)

    if len(contentTypes) == 0:
        uniqueContentTypes = 'NA'
    else:
        uniqueContentTypes = list_to_string(set(contentTypes))

    return {'datasetPID': datasetPID,
            'datasetPIDUrl': datasetPID.replace('doi:', 'https://doi.org/'),
            'numberOfVersions': len(versions),
            'numberOfMajorVersions': latestDatasetVersion['versionNumber'],
            'publicationDate': publicationDate,
            'latestReleaseDate': latestReleaseDate,
            'ageOfDataset(Days)': ageOfDataset,
            'ageOfLastUpdate(Days)': ageOfLastUpdate,
            'descriptionLenth': descriptionLength,
            'termsExist': termsExist,
            'license': datasetLicense,
            'termsOfUseExists': termsOfUse,
            'termsOfAccessExists': termsOfAccess,
            'relPubCount': relPubCount,
            'relPubPIDCount': relPubPIDCount,
            'usedMetadataBlocks': usedMetadataBlocks,
            'numberOfFiles': len(files),
            'noFileDescriptionCount': noFileDescriptionCount,
            'fileTagsExist': fileTagsExist,
            'uniqueContentTypes': uniqueContentTypes,
            'tabularDataFileCount': ingestedTabFilesCount + uningestedTabFilesCount,
            'ingestedTabFilesCount': ingestedTabFilesCount,
            'uningestedTabFilesCount': uningestedTabFilesCount,
            'restrictedFilesCount': restrictedFilesCount
           }


# Fetch the dataset list once: get_datasets() queries the API on every call, so calling it
# inside the loop (as the progress message used to) multiplied the run time
datasetDataverseInfoDF = get_datasets()
totalDatasetCount = len(datasetDataverseInfoDF.index)

rowList = []
datasetCount = 0
# .get() falls back to an empty list when the frame has no 'datasetPID' column (no datasets found)
for datasetPID in datasetDataverseInfoDF.get('datasetPID', []):
    getAllVersionsApi = '%s/api/datasets/:persistentId/versions?persistentId=%s' % (server, datasetPID)
    response = requests.get(getAllVersionsApi)
    datasetVersions = response.json()

    # Get only datasets with metadata (exclude responses with no values in 'data' key, e.g. deaccessioned datasets)
    if datasetVersions['status'] == 'OK' and len(datasetVersions['data']) > 0:
        rowList.append(_build_dataset_row(datasetPID, datasetVersions))
        datasetCount += 1
        print('%s of %s (%s)' % (datasetCount, totalDatasetCount, datasetPID), end='\r', flush=True)

if totalDatasetCount != datasetCount:
    print('The metadata of %s dataset(s) could not be retrieved' % (totalDatasetCount - datasetCount))


20 of 48 (doi:10.7910/DVN/WOKBPF)

KeyboardInterrupt: 

In [295]:
# One row per dataset whose metadata was collected into rowList by the loop above
datasetInfoDF = pd.DataFrame(rowList)


In [296]:
# Dataverse name and URL for every dataset PID. Fetched explicitly in this cell so the merge
# does not depend on a variable left over from an earlier kernel session.
datasetDataverseInfoDF = get_datasets()
dataframes = [datasetDataverseInfoDF, datasetInfoDF]

# Index a copy of each dataframe on the common column to join on. The originals are left
# untouched (no inplace=True), so this cell can be re-run without a KeyError.
indexedDataframes = [dataframe.set_index(['datasetPID']) for dataframe in dataframes]

# Merge both dataframes and save to report
report = reduce(lambda left, right: left.join(right, how='outer'), indexedDataframes)

# Turn the datasetPID index back into a regular column
report = report.reset_index(drop=False)


In [297]:
# Preview the first five rows of the merged report
report.head(5)

Unnamed: 0,datasetPID,dataverseName,dataverseUrl,datasetPIDUrl,numberOfVersions,numberOfMajorVersions,publicationDate,latestReleaseDate,ageOfDataset(Days),ageOfLastUpdate(Days),...,relPubPIDCount,usedMetadataBlocks,numberOfFiles,noFileDescriptionCount,fileTagsExist,uniqueContentTypes,tabularDataFileCount,ingestedTabFilesCount,uningestedTabFilesCount,restrictedFilesCount
0,doi:10.70122/FK2/HZTO03,Julian Gautier (SU) Dataverse,https://demo.dataverse.org/dataverse/sefsef,https://doi.org/10.70122/FK2/HZTO03,3,1,2020-08-04 19:48:40+00:00,2020-10-26 03:44:39+00:00,86,3,...,0,"citation,geospatial",0,0,False,,0,0,0,0
1,doi:10.70122/FK2/CMFTOD,Julian Gautier (SU) Dataverse,https://demo.dataverse.org/dataverse/sefsef,https://doi.org/10.70122/FK2/CMFTOD,1,1,2020-10-14 20:07:47+00:00,2020-10-14 20:07:47+00:00,15,15,...,0,citation,2,2,False,image/jpeg,0,0,0,0
2,doi:10.70122/FK2/ZYUGHH,Julian Gautier (SU) Dataverse,https://demo.dataverse.org/dataverse/sefsef,https://doi.org/10.70122/FK2/ZYUGHH,16,5,2020-09-17 16:08:53+00:00,2020-10-29 03:16:38+00:00,42,0,...,1,"astrophysics,biomedical,citation,geospatial,so...",3,2,True,"image/jpeg,image/png,text/tab-separated-values",1,1,0,0


## Export report to CSV

In [298]:
# Export report to CSV
# Format the timestamp explicitly: str(currentTime) contains spaces and colons,
# and colons are not allowed in Windows file names
file = '%s_%s.csv' % (mainDataverseAlias, currentTime.strftime('%Y.%m.%d_%H.%M.%S'))
report.to_csv(file, index=False)


## Get existing dataverse data

In [9]:
# Load a previously exported dataset report; this replaces `report` and `datasetCount` from the cells above.
# na_filter=False keeps empty cells as empty strings instead of converting them to NaN.
report = pd.read_csv('mit_datasets.csv', na_filter=False)
datasetCount = report.shape[0]

In [418]:
# ((len(report[(report['ingestedTabFilesCount']!=0)]))+(len(report[(report['uningestedTabFilesCount']!=0)])))/datasetCount*100

In [10]:
# Get list of metadatablocks used by all datasets
allUsedMetadataBlocks = set()
for usedBlocks in report['usedMetadataBlocks']:
    # Skip empty names: ''.split(',') yields [''] for a dataset with no recorded blocks,
    # which would otherwise show up as a blank entry in the combined list
    allUsedMetadataBlocks.update(blockName for blockName in usedBlocks.split(",") if blockName)

# Alphabetize the deduplicated names and change them to a comma-separated string
allUsedMetadataBlocks = list_to_string(allUsedMetadataBlocks)

In [11]:
# Collect the content types of all datasets into one flat list (duplicates included);
# 'NA' marks a dataset without files and contributes nothing
allContentTypes = [
    contentType
    for datasetContentTypes in report['uniqueContentTypes']
    if datasetContentTypes != 'NA'
    for contentType in datasetContentTypes.split(",")
]


## Create summary stats of dataverse

In [15]:
# Create summary
def _percent(part, whole):
    """Return ``part / whole`` as a percentage rounded to 2 decimals, or NaN when ``whole`` is 0."""
    if whole == 0:
        return float('nan')
    return round(part / whole * 100, 2)


# Per-dataset boolean masks shared by several summary figures
hasFiles = report['numberOfFiles'] != 0
hasIngestedTabFiles = report['ingestedTabFilesCount'] != 0
# A dataset counts once as "tabular" even when it has both ingested and uningested tabular files
hasTabularData = hasIngestedTabFiles | (report['uningestedTabFilesCount'] != 0)
# A dataset has at least one file description when fewer files lack a description than it has files
hasFileDescriptions = report['numberOfFiles'] > report['noFileDescriptionCount']
totalFileCount = report['numberOfFiles'].sum()

summaryDict = {
    'Summary': mainDataverseAlias,
    'Has description': dataverse_description_exists(),
    'Has tagline': dataverse_tagline_exists(),
    'Number of search facets': len(dataverse_facets()),
    # NOTE(review): the "- 1" presumably excludes the always-enabled citation block — confirm
    'Metadatablocks enabled': len(dataverse_metadatablocks()) - 1,
    'Dataset count': datasetCount,
    'Versions (avg # of major and minor versions)': round(report['numberOfVersions'].mean(), 2),
    'Major versions (average #)': round(report['numberOfMajorVersions'].mean(), 2),
    'Description length (avg # of characters)': round(report['descriptionLenth'].mean(), 2),
    'CC0 datasets (% of total datasets)': _percent(len(report[(report['license'] == 'CC0')]), datasetCount),
    'Age of datasets (average)': round(report['ageOfDataset(Days)'].mean(), 2),
    'No terms (% of datasets with no terms metadata)': _percent((~report['termsExist']).values.sum(), datasetCount),
    'Related pub metadata (% of datasets with rel pub metadata)': _percent(len(report[(report['relPubCount'] != 0)]), datasetCount),
    'Related pub PIDs (% of datasets with rel pub PIDs)': _percent(len(report[(report['relPubPIDCount'] != 0)]), datasetCount),
    'Metadatablocks used (list)': allUsedMetadataBlocks,
    'No files (# of datasets with no files)': len(report[~hasFiles]),
    'File descriptions (% of datasets with 1 or more file descriptions)': _percent(len(report[hasFileDescriptions]), datasetCount),
    'File tags (% of datasets with 1 or more file tags)': _percent((report['fileTagsExist']).values.sum(), datasetCount),
    'Unique file types (count)': len(set(allContentTypes)),
    'Tabular data (% of datasets with tabular data) ': _percent(len(report[hasTabularData]), len(report[hasFiles])),
    'Tabular data ingest successes (% of datasets with tabular data that has been ingested)': _percent(len(report[hasIngestedTabFiles]), len(report[hasTabularData])),
    'Public files (% of unrestricted files)': _percent(totalFileCount - report['restrictedFilesCount'].sum(), totalFileCount)
}


In [17]:
# Build a one-row frame from the summary, then flip it so that each metric becomes a row
# and the single column is labelled with the 'Summary' value (the dataverse alias)
summaryRecord = pd.DataFrame.from_records([summaryDict])
summaryDF = summaryRecord.set_index('Summary').T

In [18]:
# Display the transposed summary (one row per metric)
summaryDF

Summary,mit
Has description,True
Has tagline,True
Number of search facets,9
Metadatablocks enabled,2
Dataset count,85
Versions (avg # of major and minor versions),3.55
Major versions (average #),1.71
Description length (avg # of characters),339.08
CC0 datasets (% of total datasets),7.06
Age of datasets (average),1527.98


In [425]:
# Export summary to CSV
# Format the timestamp explicitly: str(currentTime) contains spaces and colons,
# and colons are not allowed in Windows file names
file = '%s_summary_%s.csv' % (mainDataverseAlias, currentTime.strftime('%Y.%m.%d_%H.%M.%S'))

# The metric names are the index of summaryDF (it was transposed), so the index must be written;
# with index=False the CSV would contain only unlabeled values
summaryDF.to_csv(file, index=True, index_label='Summary')