In [32]:
import csv
from functools import reduce
from google_trans_new import google_translator
import pandas as pd


## Get data

Get data about whether or not each dataset has one or more restricted files

In [41]:
# Load the per-version restricted-file counts. na_filter=False keeps empty
# fields as empty strings instead of converting them to NaN.
restrictedFilesCountDF = pd.read_csv('restricted_files_count.tab', sep='\t', na_filter=False)

# Get only metadata for the latest versions of each dataset, i.e. the row with
# the highest datasetVersionId for each persistentUrl.
# idxmax() returns index *labels*, so the rows are selected with the
# label-based .loc. (The positional .iloc only gives the same rows while the
# frame happens to carry the default 0..n-1 index produced by read_csv.)
restrictedFilesCountLatestversionDF = (restrictedFilesCountDF
    .loc[restrictedFilesCountDF.groupby('persistentUrl')['datasetVersionId'].idxmax()]
    .reset_index(drop=True)
    .drop(columns=['datasetVersionId'])
    )


In [38]:
# Preview the first five latest-version rows of the restricted-files table.
restrictedFilesCountLatestversionDF.head(n=5)

Unnamed: 0,persistentUrl,restricted_files
0,http://dx.doi.org/10.26193/00HBWG,NA (not recorded)
1,http://dx.doi.org/10.26193/01P0AI,NA (not recorded)
2,http://dx.doi.org/10.26193/04F7C1,NA (not recorded)
3,http://dx.doi.org/10.26193/07R31R,NA (not recorded)
4,http://dx.doi.org/10.26193/0AF6TZ,NA (not recorded)


In [42]:
# Load the per-version terms metadata. na_filter=False keeps empty fields as
# empty strings instead of converting them to NaN.
termsMetadataDF = pd.read_csv('terms_metadata.tab', sep='\t', na_filter=False)

# Get only metadata for the latest versions of each dataset, i.e. the row with
# the highest datasetVersionId for each persistentUrl, ordered by publisher.
# idxmax() returns index *labels*, so the rows are selected with the
# label-based .loc. (The positional .iloc only gives the same rows while the
# frame happens to carry the default 0..n-1 index produced by read_csv.)
termsMetadataLatestversionDF = (termsMetadataDF
    .loc[termsMetadataDF.groupby('persistentUrl')['datasetVersionId'].idxmax()]
    .sort_values(by=['publisher'])
    .drop(columns=['datasetVersionId'])
    .reset_index(drop=True)
    )


In [39]:
# Preview the first five latest-version rows of the terms metadata table.
termsMetadataLatestversionDF.head(n=5)

Unnamed: 0,publisher,persistentUrl,datasetVersionId,majorVersionNumber,minorVersionNumber,license,termsOfAccess,termsOfUse,availabilityStatus,citationRequirements,...,confidentialityDeclaration,contactForAccess,dataaccessPlace,depositorRequirements,disclaimer,originalArchive,restrictions,sizeOfCollection,specialPermissions,studyCompletion
0,ACSS Dataverse,https://doi.org/10.25825/FK2/VXVPVP,18,1,0,NONE,,<b>Acceptance of Terms</b><br></br>\nThe follo...,,,...,,,,,,,,,,
1,ACSS Dataverse,https://doi.org/10.25825/FK2/8YKSQV,59,1,0,NONE,,<b>Acceptance of Terms</b><br></br>\nThe follo...,,,...,,,,,,,,,,
2,ACSS Dataverse,https://doi.org/10.25825/FK2/9QFRW2,82,1,0,NONE,,<b>Acceptance of Terms</b><br></br>\nThe follo...,,,...,,,,,,,,,,
3,ACSS Dataverse,https://doi.org/10.25825/FK2/A3JWCN,107,1,0,NONE,,<b>Acceptance of Terms</b><br></br>\nThe follo...,,,...,,,,,,,,,,
4,ACSS Dataverse,https://doi.org/10.25825/FK2/AGZJI8,121,1,0,NONE,,<b>Acceptance of Terms</b><br></br>\nThe follo...,,,...,,,,,,,,,,


Join restrictedFilesCountLatestversionDF to termsMetadataLatestversionDF

In [50]:
# Join the two latest-version tables on the dataset identifier and keep only
# the columns needed for the Terms of Access / restricted files comparison.
# The join key and join type are spelled out so the merge cannot silently
# start matching on additional columns the two tables might come to share.
# validate='one_to_one' makes pandas raise if either table has more than one
# row per persistentUrl.
termsAndRestrictedFilesDF = (
    pd.merge(
        termsMetadataLatestversionDF,
        restrictedFilesCountLatestversionDF,
        on='persistentUrl',
        how='inner',
        validate='one_to_one',
    )
    [['publisher', 'persistentUrl', 'termsOfAccess', 'restricted_files']]
)


In [51]:
# Preview the first five rows of the joined table.
termsAndRestrictedFilesDF.head(n=5)

Unnamed: 0,publisher,persistentUrl,termsOfAccess,restricted_files
0,ACSS Dataverse,https://doi.org/10.25825/FK2/VXVPVP,,0
1,ACSS Dataverse,https://doi.org/10.25825/FK2/8YKSQV,,0
2,ACSS Dataverse,https://doi.org/10.25825/FK2/9QFRW2,,0
3,ACSS Dataverse,https://doi.org/10.25825/FK2/A3JWCN,,0
4,ACSS Dataverse,https://doi.org/10.25825/FK2/AGZJI8,,0


Verify data in joined table: Compare count of datasets in termsAndRestrictedFilesDF to count of datasets in termsMetadataLatestversionDF

In [54]:
# Count the distinct datasets (persistentUrl values) before and after the
# join, then report both numbers so they can be compared.
termsDatasetCount = len(termsMetadataLatestversionDF['persistentUrl'].unique())
joinedDatasetCount = len(termsAndRestrictedFilesDF['persistentUrl'].unique())

print('Number of datasets in termsMetadataLatestversionDF: %s' % termsDatasetCount)
print('Number of datasets in termsAndRestrictedFilesDF: %s' % joinedDatasetCount)


Number of datasets in termsAndRestrictedFilesDF: 133253
Number of datasets in termsMetadataLatestversionDF: 133253


Count number of datasets whose latest version has:

 - Terms of Access metadata and one or more restricted files

 - Terms of Access metadata and no restricted files

 - Restricted files but no Terms of Access metadata