In [29]:
import numpy as np
from numpy import nan
import pandas as pd

In [9]:
licenseMetadataDF = pd.read_csv('licenses_and_terms.csv', sep=',', na_filter = False)
licenseMetadataDF.head()

Unnamed: 0,dataset_pid,dataset_pid_url,dataset_version_number,license_name,license_uri,terms_of_use,confidentiality_declaration,special_permissions,restrictions,citation_requirements,depositor_requirements,conditions,disclaimer,terms_of_access,data_access_place,original_archive,availability_status,contact_for_access,size_of_collection,study_completion
0,doi:10.48370/OFD/DBJUEM,https://doi.org/10.48370/OFD/DBJUEM,1.0,NONE,,"<a href=""http://creativecommons.org/licenses/by/4.0/"" target=""_blank"" rel=""nofollow""><img alt=""Creative Commons License"" src=""https://i.creativecommons.org/l/by/4.0/88x31.png""></a> <br> These data and documents are licensed under a <a href=""http://creativecommons.org/licenses/by/4.0/"" target=""_blank"" rel=""nofollow""> Creative Commons Attribution 4.0 International license.</a> You may copy, distribute and transmit the data as long as you acknowledge the source through proper <a href=""http://best-practices.dataverse.org/data-citation/"" target=""_blank"" rel=""nofollow"">data citation</a>.",,,,,,,,,,,,,,
1,doi:10.21410/7E4/4WG94W,https://doi.org/10.21410/7E4/4WG94W,2.1,,,This dataset is made available without information on how it can be used. You should communicate with the Contact(s) specified before use.,,,,,,,,,,,,,,
2,doi:10.7910/DVN/5PRYPC,https://doi.org/10.7910/DVN/5PRYPC,4.0,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,
3,doi:10.17026/dans-2zm-dsmz,https://doi.org/10.17026/dans-2zm-dsmz,1.0,CC-BY-NC-SA-4.0,http://creativecommons.org/licenses/by-nc-sa/4.0,,,,,,,,,,,,,,,
4,doi:10.7910/DVN/VIE1H,https://doi.org/10.7910/DVN/VIE1H,248.0,,,,,,,,,,,,,,,,,


In [22]:
basicMetadataDF = pd.read_csv('basic_metadata.csv', sep=',', na_filter = False)
basicMetadataDF.head()

Unnamed: 0,dataset_pid,dataset_pid_url,dataset_version_number,dataset_publication_date,dataset_version_create_time,dataset_version_state,publisher
0,doi:10.48370/OFD/DBJUEM,https://doi.org/10.48370/OFD/DBJUEM,1.0,5/25/22,2022-05-25T10:17:50Z,RELEASED,openforestdata.pl
1,doi:10.21410/7E4/4WG94W,https://doi.org/10.21410/7E4/4WG94W,2.1,5/5/20,2020-05-13T16:06:28Z,RELEASED,data.sciencespo
2,doi:10.7910/DVN/5PRYPC,https://doi.org/10.7910/DVN/5PRYPC,4.0,5/27/20,2020-06-17T23:49:50Z,RELEASED,Harvard Dataverse
3,doi:10.17026/dans-2zm-dsmz,https://doi.org/10.17026/dans-2zm-dsmz,1.0,7/13/18,2022-01-27T20:15:47Z,RELEASED,DANS Data Station Archaeology
4,doi:10.7910/DVN/VIE1H,https://doi.org/10.7910/DVN/VIE1H,248.0,2/11/15,2015-06-18T19:49:12Z,RELEASED,Harvard Dataverse


In [4]:
# Check data

# Get the unique counts of datasets in the licenseMetadataDF and datasetsDF dataframes
print('Number of dataset versions licenseMetadataDF: %s' %(len(licenseMetadataDF)))
print('Number of datasets in licenseMetadataDF: %s' % (len(pd.unique(licenseMetadataDF['dataset_pid']))))

print('Number of dataset versions in basicMetadataDF: %s' %(len(basicMetadataDF)))
print('Number of datasets in basicMetadataDF: %s' % (len(pd.unique(basicMetadataDF['dataset_pid']))))

Number of dataset versions licenseMetadataDF: 473865
Number of datasets in licenseMetadataDF: 340813
Number of dataset versions in basicMetadataDF: 473865
Number of datasets in basicMetadataDF: 340813


In [18]:
# Make sure both dataframes contain the same datasets

# Get list of dataset PIDs in both dataframes
datasetsInlicenseMetadataDFList = set(licenseMetadataDF['dataset_pid'].tolist())
datasetsInbasicMetadataDFList = set(basicMetadataDF['dataset_pid'].tolist())

print(f'Count of pids in datasetsInlicenseMetadataDFList: {len(datasetsInlicenseMetadataDFList)}')
print(f'Count of pids in datasetsInbasicMetadataDFList: {len(datasetsInbasicMetadataDFList)}')

# Create list of dataset PIDs that exist in on list and not the other, if any
difference = datasetsInlicenseMetadataDFList - datasetsInbasicMetadataDFList
difference = list(difference)
print(f'Number of dataset PIDs that exist in on list and not the other: {len(difference)}')
if len(difference) > 0:
    list(difference[:10])

Count of pids in datasetsInlicenseMetadataDFList: 340813
Count of pids in datasetsInbasicMetadataDFList: 340813
Number of dataset PIDs that exist in on list and not the other: 0


In [19]:
# Create dataframe, licenseMetadataLatestVersionDF, that contains the license and terms metadata of only the latest
# version of each dataset
latestversion_licenseMetadataDF = (licenseMetadataDF
                 .iloc[licenseMetadataDF.groupby('dataset_pid')['dataset_version_number']
                 .agg(pd.Series.idxmax)]
                 .reset_index(drop=True, inplace=False))

# Replace any blank values with NaN
latestversion_licenseMetadataDF = latestversion_licenseMetadataDF.replace(r'^\s*$', np.nan, regex=True)

print(f'Count of pids in latestversion_licenseMetadataDF: {len(latestversion_licenseMetadataDF)}')


Count of pids in latestversion_licenseMetadataDF: 340813


In [23]:
# Deduplicate basicMetadataDF to that each row

# Remove dataset_version_number,  dataset_version_create_time, and dataset_version_state columns
basicMetadataDF = basicMetadataDF.drop(columns=[
    'dataset_version_number',
    'dataset_version_create_time',
    'dataset_version_state'])

# Drop duplicate rows
basicMetadataDF = basicMetadataDF.drop_duplicates()
print(len(basicMetadataDF))

340813


In [24]:
# Join the licenseMetadataLatestVersionDF and datasetsDF on the persistent_id columns
mergedDF = pd.merge(latestversion_licenseMetadataDF, basicMetadataDF,
         how='inner', on=['dataset_pid', 'dataset_pid_url'])

In [25]:
print(len(mergedDF))

340813


In [39]:
# Get list of license names in the license_name column
licenseNameList = list(set(mergedDF['license_name'].tolist()))

# Remove null and NONE values
licenseList = []
for licenseName in licenseNameList:
    if licenseName is not None and licenseName != 'NONE':
        licenseName = str(licenseName)
        licenseList.append(licenseName)
print(f'Count of license names: {len(licenseList)}')

Count of license names: 34


In [44]:
# Normalize names by removing all dashes
licenseListNormalized = []
for licenseName in licenseList:
    licenseName = licenseName.replace('-', ' ')
    licenseListNormalized.append(licenseName)
licenseListNormalized = list(set(licenseListNormalized))
print(len(licenseListNormalized))

29


In [37]:
print(licenseList)

[nan, 'CC-BY-NC-ND-3.0-ES', 'ODbL v1.0', 'CC-BY-ND-4.0', 'ODC-By v1.0', 'CC0', 'CC-BY-SA-4.0', 'DL-DE/BY-2-0', 'CC BY', 'CC BY 4.0', 'CC-BY-NC-SA-3.0-ES', 'CC-BY-SA-4.0-ES', 'Controlled Access', 'DANS MA KA Licence', 'CC-BY-3.0-ES', 'CC-BY-NC-ND-4.0', 'CC BY-NC-SA 4.0', 'MIT', 'CC-BY-NC-4.0-ES', 'CC BY-SA 4.0', 'CC BY-NC-ND 4.0', 'CC BY-NC 4.0', 'CC-BY-SA-3.0', 'CC-BY-NC-SA-4.0', 'CC-BY-4.0', 'Standard Access', 'DANS MA KI Licence', 'AGPL-3.0-or-later', 'DANS Licence', 'ODC-By 1.0', 'CCBY', 'Etalab (CC-BY)', 'CC-BY-NC-4.0', 'CC0 1.0']


In [None]:
# What licenses have people entered in the Terms of Use field
