How many datasets have CC0 (or CCBY for forked repositories) and all 8 "Terms of Use" fields filled? How many have 7? How many have 6? Etc...

In [153]:
import csv
import numpy as np
import pandas as pd


In [154]:
# Load the tab-separated metadata export. NA detection is switched off here;
# blank cells are converted to NaN explicitly in a later cell.
data = pd.read_csv(
    filepath_or_buffer='terms_metadata.tab',
    sep='\t',
    na_filter=False,
)

In [155]:
# Keep only the metadata row for the latest version of each dataset.
# idxmax() yields the index *labels* of the rows with the highest
# datasetVersionId per persistentUrl, so the rows are picked with label-based
# .loc (positional .iloc is only correct while the index is a default
# RangeIndex). The result is ordered by publisher and re-indexed from 0.
latestversion = (
    data
    .loc[data.groupby('persistentUrl')['datasetVersionId'].idxmax()]
    .sort_values(by=['publisher'])
    .reset_index(drop=True)
)


In [156]:
# Treat empty or whitespace-only cells as missing values (NaN)
latestversion = latestversion.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

# Rename the publisher "Root" to "Dartmouth"
latestversion = latestversion.assign(
    publisher=latestversion['publisher'].replace(to_replace=['Root'], value='Dartmouth')
)


In [157]:
# Drop the publisher/version columns and the other fields not used in the
# counts, leaving persistentUrl, license and the remaining Terms of Use fields.
latestversion_termsofuse = latestversion.drop(
    [
        'publisher',
        'datasetVersionId',
        'majorVersionNumber',
        'minorVersionNumber',
        'termsOfUse',
        'termsOfAccess',
        'availabilityStatus',
        'contactForAccess',
        'dataaccessPlace',
        'originalArchive',
        'sizeOfCollection',
        'studyCompletion',
    ],
    axis='columns',
)


In [158]:
# List the distinct license values present in the data
latestversion_termsofuse['license'].unique()

array(['NONE', 'CC0', 'CC BY', nan, 'CCBY'], dtype=object)

In [159]:
# Keep only datasets whose license is CC0 or CC BY (the latter appears both as
# "CCBY" and "CC BY"), indexed by their persistent URL.
latestversionCC0 = (
    latestversion_termsofuse
    .loc[lambda frame: frame['license'].isin(['CC0', 'CCBY', 'CC BY'])]
    .set_index('persistentUrl')
)

In [160]:
# Preview the first five rows (head() defaults to n=5)
latestversionCC0.head()

Unnamed: 0_level_0,license,citationRequirements,conditions,confidentialityDeclaration,depositorRequirements,disclaimer,restrictions,specialPermissions
persistentUrl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
http://dx.doi.org/10.4225/87/B5AJXD,CC0,,,,,,,
https://hdl.handle.net/11529/10548094,CC0,,,,,,,
https://hdl.handle.net/11529/10548095,CC0,,,,,,,
https://hdl.handle.net/11529/10548096,CC0,,,,,,,
https://hdl.handle.net/11529/10548097,CC0,,,,,,,


In [161]:
# Number of datasets with a CC0 or CC BY license
latestversionCC0.shape[0]

33961

In [162]:
# For each dataset, count the non-missing fields other than the license and
# keep only the datasets that have at least one of them filled in.
countsByDataset = (
    latestversionCC0
    .drop('license', axis='columns')
    .count(axis='columns')
    .loc[lambda filled: filled > 0]
    .to_frame(name='count')
)

In [163]:
# Preview the first five per-dataset counts (head() defaults to n=5)
countsByDataset.head()

Unnamed: 0_level_0,count
persistentUrl,Unnamed: 1_level_1
https://doi.org/10.21979/N9/OF5ZDK,2
https://doi.org/10.21979/N9/DHYM9H,2
https://doi.org/10.34894/FUTGYT,3
https://doi.org/10.34894/Q0XNJS,1
https://doi.org/10.34894/AXFRPB,2


In [164]:
# Number of datasets with at least one field filled
countsByDataset.shape[0]

498

Looks like there are 498 datasets with:
- Either the CC0 waiver or CCBY license
- One or more Terms of Use fields filled (excluding the Terms of Use field itself)

Let's join the countsByDataset dataframe to the latestversion dataframe, which still has the publisher and version columns:

In [176]:
# Attach each dataset's metadata from latestversion to its filled-field count
# (left join on persistentUrl), then drop the fields that were not counted.
countsByDataset_merged = (
    countsByDataset
    .merge(latestversion, how='left', on='persistentUrl')
    .drop(
        [
            'termsOfUse',
            'termsOfAccess',
            'availabilityStatus',
            'contactForAccess',
            'dataaccessPlace',
            'originalArchive',
            'sizeOfCollection',
            'studyCompletion',
        ],
        axis='columns',
    )
)


In [177]:
# Preview the first five merged rows (head() defaults to n=5)
countsByDataset_merged.head()

Unnamed: 0,persistentUrl,count,publisher,datasetVersionId,majorVersionNumber,minorVersionNumber,license,citationRequirements,conditions,confidentialityDeclaration,depositorRequirements,disclaimer,restrictions,specialPermissions
0,https://doi.org/10.21979/N9/OF5ZDK,2,DR-NTU (Data),1432,1,0,CC0,"To cite this dataset: \nStyles, Suzy; Travers ...",,,,,Researchers must give their name and current r...,
1,https://doi.org/10.21979/N9/DHYM9H,2,DR-NTU (Data),1431,1,0,CC0,"To cite this dataset:\nStyles, Suzy J; Bin Mus...",,,,,"To download these audio files, researchers mus...",
2,https://doi.org/10.34894/FUTGYT,3,DataverseNL,2997,2,0,CC0,Mention publication Habets et al 2020 when usi...,Anonimyzed by changing patients' age in age ra...,,,,Not available for commercial use.,
3,https://doi.org/10.34894/Q0XNJS,1,DataverseNL,1412,1,0,CC0,,"The data, syntaxes, and reports of this projec...",,,,,
4,https://doi.org/10.34894/AXFRPB,2,DataverseNL,1877,1,2,CC0,If active involvement of AIGHD/IFPRI researche...,,,,,,The data are available upon request through th...


In [178]:
# Row count after the merge
countsByDataset_merged.shape[0]

498

In [179]:
# Write the merged table to a CSV file, leaving out the row index
file = 'countsByDataset_merged.csv'
countsByDataset_merged.to_csv(path_or_buf=file, index=False)