In [17]:
import numpy as np
from numpy import nan
import pandas as pd

In [18]:
licenseMetadataDF = pd.read_csv('licenses_and_terms_metadata_hdv_2022-10-03.csv', sep=',', na_filter = False)
licenseMetadataDF.head()

Unnamed: 0,dataset_pid,dataset_pid_url,dataset_version_number,license_name,license_uri,terms_of_use,confidentiality_declaration,special_permissions,restrictions,citation_requirements,depositor_requirements,conditions,disclaimer,terms_of_access,data_access_place,original_archive,availability_status,contact_for_access,size_of_collection,study_completion
0,doi:10.7910/DVN/5PRYPC,https://doi.org/10.7910/DVN/5PRYPC,4.0,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,
1,doi:10.7910/DVN/VIE1H,https://doi.org/10.7910/DVN/VIE1H,248.0,,,,,,,,,,,,,,,,,
2,doi:10.7910/DVN/KKUJWW,https://doi.org/10.7910/DVN/KKUJWW,1.0,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,
3,doi:10.7910/DVN/5E6GBN,https://doi.org/10.7910/DVN/5E6GBN,1.2,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,
4,doi:10.7910/DVN/9MKISZ,https://doi.org/10.7910/DVN/9MKISZ,10.0,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,


In [19]:
basicMetadataDF = pd.read_csv('basic_metadata_hdv_2022-10-03.csv', sep=',', na_filter = False)
basicMetadataDF.head()

Unnamed: 0,dataset_pid,dataset_pid_url,dataset_version_number,dataset_publication_date,dataset_version_create_time,dataset_version_state,publisher
0,doi:10.7910/DVN/5PRYPC,https://doi.org/10.7910/DVN/5PRYPC,4.0,2020-05-27,2020-06-17T23:49:50Z,RELEASED,Harvard Dataverse
1,doi:10.7910/DVN/VIE1H,https://doi.org/10.7910/DVN/VIE1H,248.0,2015-02-11,2015-06-18T19:49:12Z,RELEASED,Harvard Dataverse
2,doi:10.7910/DVN/KKUJWW,https://doi.org/10.7910/DVN/KKUJWW,1.0,2015-07-23,2015-07-15T00:57:41Z,RELEASED,Harvard Dataverse
3,doi:10.7910/DVN/5E6GBN,https://doi.org/10.7910/DVN/5E6GBN,1.2,2020-04-05,2020-04-05T21:32:37Z,RELEASED,Harvard Dataverse
4,doi:10.7910/DVN/9MKISZ,https://doi.org/10.7910/DVN/9MKISZ,10.0,2011-06-09,2013-01-29T23:42:10Z,RELEASED,Harvard Dataverse


In [20]:
# Check data

# Get the unique counts of datasets in the licenseMetadataDF and datasetsDF dataframes
print('Number of dataset versions licenseMetadataDF: %s' %(len(licenseMetadataDF)))
print('Number of datasets in licenseMetadataDF: %s' % (len(pd.unique(licenseMetadataDF['dataset_pid']))))

print('\nNumber of dataset versions in basicMetadataDF: %s' %(len(basicMetadataDF)))
print('Number of datasets in basicMetadataDF: %s' % (len(pd.unique(basicMetadataDF['dataset_pid']))))

Number of dataset versions licenseMetadataDF: 130768
Number of datasets in licenseMetadataDF: 80278

Number of dataset versions in basicMetadataDF: 130768
Number of datasets in basicMetadataDF: 80278


In [22]:
# Make sure the two dataframes, allDatasetPIDsDF and basicMetadataDF, contain the same number of datasets
datasetsInlicenseMetadataDFList = set(licenseMetadataDF['dataset_pid'].tolist())
datasetsInbasicMetadataDFList = set(basicMetadataDF['dataset_pid'].tolist())

print(f'Datasets in datasetsInlicenseMetadataDFList: {len(datasetsInlicenseMetadataDFList)}')
print(f'Datasets in basicMetadataDF: {len(datasetsInbasicMetadataDFList)}')

# Create list of dataset PIDs that exist in one list and not the other, if any
difference = datasetsInlicenseMetadataDFList - datasetsInbasicMetadataDFList
difference = list(difference)
print(f'Number of datasets in one list and not the other: {len(difference)}')


Datasets in datasetsInlicenseMetadataDFList: 80278
Datasets in basicMetadataDF: 80278
Number of datasets in one list and not the other: 0


In [19]:
# Create dataframe, licenseMetadataLatestVersionDF, that contains the license and terms metadata of only the latest
# version of each dataset
latestversion_licenseMetadataDF = (licenseMetadataDF
                 .iloc[licenseMetadataDF.groupby('dataset_pid')['dataset_version_number']
                 .agg(pd.Series.idxmax)]
                 .reset_index(drop=True, inplace=False))

# Replace any blank values with NaN
latestversion_licenseMetadataDF = latestversion_licenseMetadataDF.replace(r'^\s*$', np.nan, regex=True)

print(f'Count of pids in latestversion_licenseMetadataDF: {len(latestversion_licenseMetadataDF)}')


Count of pids in latestversion_licenseMetadataDF: 340813


In [23]:
# Deduplicate basicMetadataDF to that each row

# Remove dataset_version_number,  dataset_version_create_time, and dataset_version_state columns
basicMetadataDF = basicMetadataDF.drop(columns=[
    'dataset_version_number',
    'dataset_version_create_time',
    'dataset_version_state'])

# Drop duplicate rows
basicMetadataDF = basicMetadataDF.drop_duplicates()
print(len(basicMetadataDF))

340813


In [24]:
# Join the licenseMetadataLatestVersionDF and datasetsDF on the persistent_id columns
mergedDF = pd.merge(latestversion_licenseMetadataDF, basicMetadataDF,
         how='inner', on=['dataset_pid', 'dataset_pid_url'])

In [25]:
print(len(mergedDF))

340813


In [39]:
# Get list of license names in the license_name column
licenseNameList = list(set(mergedDF['license_name'].tolist()))

# Remove null and NONE values
licenseList = []
for licenseName in licenseNameList:
    if licenseName is not None and licenseName != 'NONE':
        licenseName = str(licenseName)
        licenseList.append(licenseName)
print(f'Count of license names: {len(licenseList)}')

Count of license names: 34


In [44]:
# Normalize names by removing all dashes
licenseListNormalized = []
for licenseName in licenseList:
    licenseName = licenseName.replace('-', ' ')
    licenseListNormalized.append(licenseName)
licenseListNormalized = list(set(licenseListNormalized))
print(len(licenseListNormalized))

29


In [37]:
print(licenseList)

[nan, 'CC-BY-NC-ND-3.0-ES', 'ODbL v1.0', 'CC-BY-ND-4.0', 'ODC-By v1.0', 'CC0', 'CC-BY-SA-4.0', 'DL-DE/BY-2-0', 'CC BY', 'CC BY 4.0', 'CC-BY-NC-SA-3.0-ES', 'CC-BY-SA-4.0-ES', 'Controlled Access', 'DANS MA KA Licence', 'CC-BY-3.0-ES', 'CC-BY-NC-ND-4.0', 'CC BY-NC-SA 4.0', 'MIT', 'CC-BY-NC-4.0-ES', 'CC BY-SA 4.0', 'CC BY-NC-ND 4.0', 'CC BY-NC 4.0', 'CC-BY-SA-3.0', 'CC-BY-NC-SA-4.0', 'CC-BY-4.0', 'Standard Access', 'DANS MA KI Licence', 'AGPL-3.0-or-later', 'DANS Licence', 'ODC-By 1.0', 'CCBY', 'Etalab (CC-BY)', 'CC-BY-NC-4.0', 'CC0 1.0']


In [None]:
# What licenses have people entered in the Terms of Use field
