In [32]:
import csv
from functools import reduce
from google_trans_new import google_translator
import pandas as pd


## Get data

Get data about whether or not each dataset as one or more restricted files

In [41]:
restrictedFilesCountDF = pd.read_csv('restricted_files_count.tab', sep='\t', na_filter = False)

# Get only metadata for the latest versions of each dataset
restrictedFilesCountLatestversionDF = (restrictedFilesCountDF
    .iloc[restrictedFilesCountDF.groupby('persistentUrl')['datasetVersionId'].agg(pd.Series.idxmax)]
    .reset_index(drop=True, inplace=False)
    .drop(columns=['datasetVersionId'])
    )


In [38]:
restrictedFilesCountLatestversionDF.head(5)

Unnamed: 0,persistentUrl,restricted_files
0,http://dx.doi.org/10.26193/00HBWG,NA (not recorded)
1,http://dx.doi.org/10.26193/01P0AI,NA (not recorded)
2,http://dx.doi.org/10.26193/04F7C1,NA (not recorded)
3,http://dx.doi.org/10.26193/07R31R,NA (not recorded)
4,http://dx.doi.org/10.26193/0AF6TZ,NA (not recorded)


In [42]:
termsMetadataDF = pd.read_csv('terms_metadata.tab', sep='\t', na_filter = False)

# Get only metadata for the latest versions of each dataset
termsMetadataLatestversionDF = (termsMetadataDF
    .iloc[termsMetadataDF.groupby('persistentUrl')['datasetVersionId']
    .agg(pd.Series.idxmax)]
    .sort_values(by=['publisher'], inplace=False, ascending=True)
    .drop(columns=['datasetVersionId'])
    .reset_index(drop=True, inplace=False)
    )


In [39]:
termsMetadataLatestversionDF.head(5)

Unnamed: 0,publisher,persistentUrl,datasetVersionId,majorVersionNumber,minorVersionNumber,license,termsOfAccess,termsOfUse,availabilityStatus,citationRequirements,...,confidentialityDeclaration,contactForAccess,dataaccessPlace,depositorRequirements,disclaimer,originalArchive,restrictions,sizeOfCollection,specialPermissions,studyCompletion
0,ACSS Dataverse,https://doi.org/10.25825/FK2/VXVPVP,18,1,0,NONE,,<b>Acceptance of Terms</b><br></br>\nThe follo...,,,...,,,,,,,,,,
1,ACSS Dataverse,https://doi.org/10.25825/FK2/8YKSQV,59,1,0,NONE,,<b>Acceptance of Terms</b><br></br>\nThe follo...,,,...,,,,,,,,,,
2,ACSS Dataverse,https://doi.org/10.25825/FK2/9QFRW2,82,1,0,NONE,,<b>Acceptance of Terms</b><br></br>\nThe follo...,,,...,,,,,,,,,,
3,ACSS Dataverse,https://doi.org/10.25825/FK2/A3JWCN,107,1,0,NONE,,<b>Acceptance of Terms</b><br></br>\nThe follo...,,,...,,,,,,,,,,
4,ACSS Dataverse,https://doi.org/10.25825/FK2/AGZJI8,121,1,0,NONE,,<b>Acceptance of Terms</b><br></br>\nThe follo...,,,...,,,,,,,,,,


Join restrictedFilesCountLatestversionDF to termsMetadataLatestversionDF

In [50]:
termsAndRestrictedFilesDF = pd.merge(termsMetadataLatestversionDF, restrictedFilesCountLatestversionDF)
termsAndRestrictedFilesDF = termsAndRestrictedFilesDF[['publisher','persistentUrl', 'termsOfAccess', 'restricted_files']]


In [51]:
termsAndRestrictedFilesDF.head(5)

Unnamed: 0,publisher,persistentUrl,termsOfAccess,restricted_files
0,ACSS Dataverse,https://doi.org/10.25825/FK2/VXVPVP,,0
1,ACSS Dataverse,https://doi.org/10.25825/FK2/8YKSQV,,0
2,ACSS Dataverse,https://doi.org/10.25825/FK2/9QFRW2,,0
3,ACSS Dataverse,https://doi.org/10.25825/FK2/A3JWCN,,0
4,ACSS Dataverse,https://doi.org/10.25825/FK2/AGZJI8,,0


Verify data in joined table: Compare count of datasets in restrictedFilesCountLatestversionDf to count of datasets termsMetadataLatestversionDF

In [54]:
print('Number of datasets in termsMetadataLatestversionDF: %s' % (len(pd.unique(termsMetadataLatestversionDF['persistentUrl']))))
print('Number of datasets in termsAndRestrictedFilesDF: %s' % (len(pd.unique(termsAndRestrictedFilesDF['persistentUrl']))))


Number of datasets in termsAndRestrictedFilesDF: 133253
Number of datasets in termsMetadataLatestversionDF: 133253


In [98]:
termsAndRestrictedFilesDF.restricted_files.unique()

array(['0', 'NA (not recorded)', '2', '1', '4', '7', '5', '6', '3', '8',
       'NA (no files)', '30', '9', '47', '15', '19', '54', '16', '12',
       '21', '17', '110', '32', '11', '10', '18', '13', '45', '75', '20',
       '23', '22', '499', '14', '74', '27', '33', '84', '262', '29', '31',
       '326', '392', '25', '249', '288', '24', '46', '43', '37', '52',
       '26', '40', '71', '73', '28', '192', '77', '44', '162', '383',
       '334', '49', '79', '243', '70', '250', '42', '53', '380', '141',
       '211', '66', '34', '159', '365', '182', '520', '121', '41', '35',
       '51', '57', '483', '36', '62', '212', '306', '188', '156', '104',
       '134', '848', '111', '94', '48', '39', '89', '38', '855', '506',
       '196', '152', '308', '58', '265', '195', '168', '72', '59', '55',
       '80', '247', '50', '56', '335', '61', '86', '1227', '85', '93',
       '312', '488', '160', '107', '64', '81', '105', '83', '98', '139',
       '2143', '374', '60', '126', '82', '177', '258', '99'

Count number of datasets whose latest version:
- Is in repositories whose JSON exports don't include if files are restricted or not
- That have Terms of Access metadata
- That have Terms of Access metadata and one or more restricted files
- That have Terms of Access metadata and no restricted files
- That have restricted files but no Terms of Access metadata

In [104]:
print('Number of datasets: %s' % (len(pd.unique(termsAndRestrictedFilesDF['persistentUrl']))))

df1 = (termsAndRestrictedFilesDF.query('termsOfAccess != ""'))
print('Number of datasets with Terms of Access metadata: %s' % (len(pd.unique(df1['persistentUrl']))))

df0 = (termsAndRestrictedFilesDF.query('restricted_files == "NA (not recorded)"'))
print('Number of datasets in repositories whose JSON exports do not include\
 if files are restricted or not: %s' % (len(pd.unique(df0['persistentUrl']))))

df2 = (
    termsAndRestrictedFilesDF.query(
        'termsOfAccess != ""\
        and restricted_files.str.contains("NA") == False\
        and restricted_files != "0"'))
print('Number of datasets with Terms of Access metadata and one or more restricted files: %s' % (len(pd.unique(df2['persistentUrl']))))

df3 = (
    termsAndRestrictedFilesDF.query(
        'termsOfAccess != ""\
        and restricted_files == "0"'))
print('Number of datasets with Terms of Access metadata and no restricted files: %s' % (len(pd.unique(df3['persistentUrl']))))


Number of datasets: 133253
Number of datasets with Terms of Access metadata: 6798
Number of datasets in repositories whose JSON exports do not include if files are restricted or not: 1561
Number of datasets with Terms of Access metadata and one or more restricted files: 2354
Number of datasets with Terms of Access metadata and no restricted files: 2845
