In [20]:
import numpy as np
from numpy import nan
import pandas as pd
import re
import requests
import htmlmin

In [2]:
def to_str(item):
    return str(item)

In [3]:
licenseMetadataDF = pd.read_csv(
    'licenses_and_terms_metadata_hdv_2022-10-03.csv',
    sep=',', na_filter=False,
    converters={'dataset_version_number': to_str})
licenseMetadataDF.head()

Unnamed: 0,dataset_pid,dataset_pid_url,dataset_version_number,license_name,license_uri,terms_of_use,confidentiality_declaration,special_permissions,restrictions,citation_requirements,depositor_requirements,conditions,disclaimer,terms_of_access,data_access_place,original_archive,availability_status,contact_for_access,size_of_collection,study_completion
0,doi:10.7910/DVN/5PRYPC,https://doi.org/10.7910/DVN/5PRYPC,4.0,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,
1,doi:10.7910/DVN/VIE1H,https://doi.org/10.7910/DVN/VIE1H,248.0,,,,,,,,,,,,,,,,,
2,doi:10.7910/DVN/KKUJWW,https://doi.org/10.7910/DVN/KKUJWW,1.0,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,
3,doi:10.7910/DVN/5E6GBN,https://doi.org/10.7910/DVN/5E6GBN,1.2,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,
4,doi:10.7910/DVN/9MKISZ,https://doi.org/10.7910/DVN/9MKISZ,10.0,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,


In [4]:
basicMetadataDF = pd.read_csv(
    'basic_metadata_hdv_2022-10-03.csv',
    sep=',', na_filter=False,
    parse_dates=['dataset_publication_date', 'dataset_version_create_time'],
    converters={'dataset_version_number': to_str})

basicMetadataDF.head()

Unnamed: 0,dataset_pid,dataset_pid_url,dataset_version_number,dataset_publication_date,dataset_version_create_time,dataset_version_state,publisher
0,doi:10.7910/DVN/5PRYPC,https://doi.org/10.7910/DVN/5PRYPC,4.0,2020-05-27,2020-06-17 23:49:50+00:00,RELEASED,Harvard Dataverse
1,doi:10.7910/DVN/VIE1H,https://doi.org/10.7910/DVN/VIE1H,248.0,2015-02-11,2015-06-18 19:49:12+00:00,RELEASED,Harvard Dataverse
2,doi:10.7910/DVN/KKUJWW,https://doi.org/10.7910/DVN/KKUJWW,1.0,2015-07-23,2015-07-15 00:57:41+00:00,RELEASED,Harvard Dataverse
3,doi:10.7910/DVN/5E6GBN,https://doi.org/10.7910/DVN/5E6GBN,1.2,2020-04-05,2020-04-05 21:32:37+00:00,RELEASED,Harvard Dataverse
4,doi:10.7910/DVN/9MKISZ,https://doi.org/10.7910/DVN/9MKISZ,10.0,2011-06-09,2013-01-29 23:42:10+00:00,RELEASED,Harvard Dataverse


In [5]:
# Check data

# Get the unique counts of datasets in the licenseMetadataDF and datasetsDF dataframes
print('Number of dataset versions licenseMetadataDF: %s' %(len(licenseMetadataDF)))
print('Number of datasets in licenseMetadataDF: %s' % (len(pd.unique(licenseMetadataDF['dataset_pid']))))

print('\nNumber of dataset versions in basicMetadataDF: %s' %(len(basicMetadataDF)))
print('Number of datasets in basicMetadataDF: %s' % (len(pd.unique(basicMetadataDF['dataset_pid']))))

Number of dataset versions licenseMetadataDF: 130768
Number of datasets in licenseMetadataDF: 80278

Number of dataset versions in basicMetadataDF: 130768
Number of datasets in basicMetadataDF: 80278


In [6]:
# Make sure the two dataframes, allDatasetPIDsDF and basicMetadataDF, contain the datasets
datasetsInlicenseMetadataDFList = set(licenseMetadataDF['dataset_pid'].tolist())
datasetsInbasicMetadataDFList = set(basicMetadataDF['dataset_pid'].tolist())

print(f'Datasets in datasetsInlicenseMetadataDFList: {len(datasetsInlicenseMetadataDFList)}')
print(f'Datasets in basicMetadataDF: {len(datasetsInbasicMetadataDFList)}')

# Create list of dataset PIDs that exist in one list and not the other, if any
difference = datasetsInlicenseMetadataDFList - datasetsInbasicMetadataDFList
difference = list(difference)
print(f'Number of datasets in one table and not the other: {len(difference)}')


Datasets in datasetsInlicenseMetadataDFList: 80278
Datasets in basicMetadataDF: 80278
Number of datasets in one table and not the other: 0


In [7]:
# Merge the two dataframes
licenseAndBasicMetadataDF = pd.merge(
    licenseMetadataDF, basicMetadataDF,
    how='inner',
    on=[
        'dataset_pid',
        'dataset_pid_url',
        'dataset_version_number'])

In [8]:
# Check count of unique datasets in the new mergedDF
print(f'Number of dataset versions in licenseAndBasicMetadataDF: {len(licenseAndBasicMetadataDF)}')
uniqueDatasetCount = len(pd.unique(licenseAndBasicMetadataDF['dataset_pid']))
print(f'Number of datasets in licenseAndBasicMetadataDF: {uniqueDatasetCount}')

Number of dataset versions in licenseAndBasicMetadataDF: 130768
Number of datasets in licenseAndBasicMetadataDF: 80278


In [11]:
# Create dataframe, licenseMetadataLatestVersionDF, that contains the license and terms metadata of only the latest
# version of each dataset, using the latest version release time from the merged basicMetadataDF
latestversion_licenseAndBasicMetadataDF = (licenseAndBasicMetadataDF
                 .iloc[licenseAndBasicMetadataDF.groupby('dataset_pid')['dataset_version_create_time']
                 .agg(pd.Series.idxmax)]
                 .reset_index(drop=True, inplace=False))

# Replace any blank values with NaN
# latestversion_licenseAndBasicMetadataDF = latestversion_licenseAndBasicMetadataDF.replace(r'^\s*$', np.nan, regex=True)

# Remove dataset_version_state and publisher columns and reorder remaining columns
latestversion_licenseAndBasicMetadataDF = latestversion_licenseAndBasicMetadataDF[[
    'dataset_pid_url', 'dataset_publication_date', 'dataset_version_number', 'dataset_version_create_time', 'license_name', 'license_uri', 'terms_of_use', 'confidentiality_declaration', 'special_permissions', 'restrictions', 'citation_requirements', 'depositor_requirements', 'conditions', 'disclaimer', 'terms_of_access', 'data_access_place', 'original_archive', 'availability_status', 'contact_for_access', 'size_of_collection', 'study_completion'
]]

print(f'Count of pids in latestversion_mergedDF: {len(latestversion_licenseAndBasicMetadataDF)}')


Count of pids in latestversion_mergedDF: 80278


In [12]:
# Let's make sure each column is the object type we want. Everything should be a string except for the two date columns.
latestversion_licenseAndBasicMetadataDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80278 entries, 0 to 80277
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype              
---  ------                       --------------  -----              
 0   dataset_pid_url              80278 non-null  object             
 1   dataset_publication_date     80278 non-null  datetime64[ns]     
 2   dataset_version_number       80278 non-null  object             
 3   dataset_version_create_time  80278 non-null  datetime64[ns, UTC]
 4   license_name                 80278 non-null  object             
 5   license_uri                  80278 non-null  object             
 6   terms_of_use                 80278 non-null  object             
 7   confidentiality_declaration  80278 non-null  object             
 8   special_permissions          80278 non-null  object             
 9   restrictions                 80278 non-null  object             
 10  citation_requirements        80278 non-null  o

Now that we have the data we need, let's answer our question:
- Which standard licences have depositors entered into the Terms fields?

As part of the Dataverse software's multiple license update:
1. If a dataset has a CC0 waiver and nothing entered in any of its "Terms of Use" fields, the dataset is considered to have a "standard" CC0 license, and the metadata of those datasets, the "license_name" field has "CC0".
2. If a dataset has a CC0 waiver and anything entered in any of its "Terms of Use" fields, the dataset is considered to have a custom license, where "This dataset is made available under a Creative Commons CC0 license with the following additional/modified terms and conditions:" is entered in the Terms of Use field, followed by whatever text was entered in any of the "Terms of Use" fields.
3. If the dataset does not have a CC0 waiver and anything entered in any of its "Terms of Use" fields, the dataset is considered to have a custom license, where what's   in any of the "Terms of Use" fields makes us the custom license.
4. If the CC0 waiver is not applied to the dataset and nothing is entered in any of the Terms of Use fields, the text "This dataset is made available without information on how it can be used. You should communicate with the Contact(s) specified before use." was entered in the Terms of Use field

To find datasets with "custom licenses", we want to find datasets that match the second or third conditions.

In [17]:
datasetsWithCustomLicense = (latestversion_licenseAndBasicMetadataDF
                     .query('license_name == "" and terms_of_use != "This dataset is made available without information on how it can be used. You should communicate with the Contact(s) specified before use."')
                     .reset_index(drop=True, inplace=False)
                     )

datasetsWithCustomLicense.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32645 entries, 0 to 32644
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype              
---  ------                       --------------  -----              
 0   dataset_pid_url              32645 non-null  object             
 1   dataset_publication_date     32645 non-null  datetime64[ns]     
 2   dataset_version_number       32645 non-null  object             
 3   dataset_version_create_time  32645 non-null  datetime64[ns, UTC]
 4   license_name                 32645 non-null  object             
 5   license_uri                  32645 non-null  object             
 6   terms_of_use                 32645 non-null  object             
 7   confidentiality_declaration  32645 non-null  object             
 8   special_permissions          32645 non-null  object             
 9   restrictions                 32645 non-null  object             
 10  citation_requirements        32645 non-null  o

In [25]:
# For each of these datasets, in order to determine which standard licenses might be mentioned in any of their Terms of Use or Terms of Access fields, let's create a new column that's a concatenation of what's entered in all of the Terms fields
termsColumns = [
    'terms_of_use',
    'confidentiality_declaration',
    'special_permissions',
    'restrictions',
    'citation_requirements',
    'depositor_requirements',
    'conditions',
    'disclaimer',
    'terms_of_access',
    'data_access_place',
    'original_archive',
    'availability_status',
    'contact_for_access',
    'size_of_collection',
    'study_completion'
]
datasetsWithCustomLicense['combined_terms'] = datasetsWithCustomLicense[termsColumns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

datasetsWithCustomLicense.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32645 entries, 0 to 32644
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype              
---  ------                       --------------  -----              
 0   dataset_pid_url              32645 non-null  object             
 1   dataset_publication_date     32645 non-null  datetime64[ns]     
 2   dataset_version_number       32645 non-null  object             
 3   dataset_version_create_time  32645 non-null  datetime64[ns, UTC]
 4   license_name                 32645 non-null  object             
 5   license_uri                  32645 non-null  object             
 6   terms_of_use                 32645 non-null  object             
 7   confidentiality_declaration  32645 non-null  object             
 8   special_permissions          32645 non-null  object             
 9   restrictions                 32645 non-null  object             
 10  citation_requirements        32645 non-null  o

In [54]:
# Let's get the unique list of
customMetadataList = list(set(datasetsWithCustomLicense['combined_terms'].tolist()))
print(f'Number of unique strings entered in Terms fields: {len(customMetadataList)}')

Number of unique strings entered in Terms fields: 2638


In [55]:
# Let's figure out which custom licenses are in these strings

# Some of the things entered in the Terms fields are HTML. Let's remove the formatting so that it's faster to search through.
for place, metadata in enumerate(customMetadataList):
    metadata = htmlmin.minify(metadata, remove_empty_space=True)
    customMetadataList[place] = metadata

In [64]:
# There's probably a lot of variety in the ways that standard licenses are identified by name, but there might be more standardization in how they're identified by their URLs, so let's extract the URLs from this custom metadata
urlsList = []
regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

for metadata in customMetadataList:
    urls = re.search(regex, metadata)
    if urls is not None:
        # Try normalizing the URLs a bit more
        urls = urls.group().replace('http://', 'https://').replace('%2F', '/').replace('%3A', ':')
        urls = urls.rstrip('/')
        urlsList.append(urls)

# Deduplicate list of URLs
urlsList = list(set(urlsList))
print(f'Count of unique URLs: {len(urlsList)}')
for url in urlsList:
    print(url)

Count of unique URLs: 225
https://www.canvaslms.com/policies/api-policy
https://dataverse.harvard.edu/file.xhtml?fileId=3102931&version=RELEASED&version=.1
https://adsabs.harvard.edu/abs/2008ApJS..175..277D
https://www.socio.com/teenpregnancydata.php
https://dx.doi.org/10.7910/DVN/YZBRYX
https://dx.doi.org/10.7910/DVN/28038
https://afrobarometer.org/data/merged-round-6-data-36-countries-2016
https://doi.org/10.7910/DVN/A8O38Mdoi/10.7910/DVN/A8O38M
https://mediaproject.wesleyan.edu/dataaccess
https://pcouncil.wufoo.com/forms/udaya-adolescent-survey-2018a19
https://dx.doi.org/10.7910/DVN/25450
https://ccafs.cgiar.org
https://hdl.handle.net/1902.1/20796
https://forms.gle/2bhAuMMVRSZyLVfF7
www.jewishdatabank.org
https://dss.princeton.edu/catalog/resource1825
https://docs.cmv.io/en/latest/about/license
https://dx.doi.org/10.7910/DVN/26086
https://doi.org/10.7910/DVN/97Q2B8
https://creativecommons.org/licenses/by-nc/4.0
https://trec.nist.gov/data/reuters/ind_appl_reuters_v4.html
https://www.

In [65]:
# Now we just need to find the URLs that point to licenses. This is should be easy for Creative Commons licenses:
creativeCommonsUrls = []
for url in urlsList:
    if 'creativecommons.org/licenses' in url:
        creativeCommonsUrls.append(url)
creativeCommonsUrls.sort()
print(f'Count of Creative Commons URLs: {len(creativeCommonsUrls)}')
for url in creativeCommonsUrls:
    print(url)

Count of Creative Commons URLs: 23
https://creativecommons.org/licenses/by-nc-nd/4.0
https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode
https://creativecommons.org/licenses/by-nc-sa/3.0
https://creativecommons.org/licenses/by-nc-sa/3.0/deed.en
https://creativecommons.org/licenses/by-nc-sa/3.0/us
https://creativecommons.org/licenses/by-nc-sa/4.0
https://creativecommons.org/licenses/by-nc/2.0
https://creativecommons.org/licenses/by-nc/3.0
https://creativecommons.org/licenses/by-nc/4.0
https://creativecommons.org/licenses/by-nc/4.0/legalcode
https://creativecommons.org/licenses/by-nd/4.0
https://creativecommons.org/licenses/by-sa/2.0
https://creativecommons.org/licenses/by-sa/4.0
https://creativecommons.org/licenses/by-sa/4.0/legalcode
https://creativecommons.org/licenses/by/2.0
https://creativecommons.org/licenses/by/2.5/it
https://creativecommons.org/licenses/by/3.0
https://creativecommons.org/licenses/by/3.0/us
https://creativecommons.org/licenses/by/4.0
https://creativecommon

In [66]:
# Looks like there are 23 unique URLs, but there might be some duplicates.
# Some URLs end in "/legalcode", which are just the official text of the license. So let's remove that.
# Some URLs end in "/deed" or "/deed.en". When those parts are removed from those URLs, they point to the same pages, so let's remove those, too.
# And one URL seems to have been generated by Outlook. The license, BY-NC-SA 3.0, is already in the list, so let's remove that "safelinks URL

for place, url in enumerate(creativeCommonsUrls):
    if '/legalcode' or '/deed' in url:
        url = url.replace('/legalcode', '').replace('/deed.en', '').replace('/deed', '')
        creativeCommonsUrls[place] = url
    if 'safelinks' in url:
        creativeCommonsUrls.pop(place)

# Deduplicate the list
creativeCommonsUrls = list(set(creativeCommonsUrls))

print(len(creativeCommonsUrls))
for url in creativeCommonsUrls:
    print(url)

15
https://creativecommons.org/licenses/by-sa/4.0
https://creativecommons.org/licenses/by-sa/2.0
https://creativecommons.org/licenses/by/4.0
https://creativecommons.org/licenses/by-nc-sa/3.0
https://creativecommons.org/licenses/by-nc/3.0
https://creativecommons.org/licenses/by/2.5/it
https://creativecommons.org/licenses/by/3.0/us
https://creativecommons.org/licenses/by-nc/2.0
https://creativecommons.org/licenses/by-nd/4.0
https://creativecommons.org/licenses/by/2.0
https://creativecommons.org/licenses/by-nc-nd/4.0
https://creativecommons.org/licenses/by-nc-sa/4.0
https://creativecommons.org/licenses/by-nc-sa/3.0/us
https://creativecommons.org/licenses/by-nc/4.0
https://creativecommons.org/licenses/by/3.0


In [67]:
# What about URLs that point to different types of licenses? For that, lets see what URLs are in SPDX, which maintains structured information about standard licenses

spdxDataUrl = 'https://raw.githubusercontent.com/spdx/license-list-data/master/json/licenses.json'
response = requests.get(spdxDataUrl)
spdxData = response.json()

# Let's save the license information from this list, remove the Creative Commons licenses
licenseDict = []
for license in spdxData['licenses']:
    if 'Creative Commons' not in license['name']:
        # reference = license['reference']
        name = license['name']
        # licenseNames.append(name)
        licenseId = license['licenseId']
        seeAlso = license['seeAlso']
        licenseDict.append(license)

print(len(licenseDict))

447


That's still a lot of licenses. Let's try to find the URLs of common types of licenses based on what licenses other Dataverse repositories are using. I've collected  this license information using the Dataverse API and sorted the licenses by type. Here's a list of the types of licenses and the license URLs for each.

- Apache: https://www.apache.org/licenses
- BSD: https://opensource.org/licenses
- GNU: https://www.gnu.org/licenses
- MIT: https://opensource.org/licenses/MIT
- Mozilla: https://mozilla.org/MPL

All of these licenses are meant to be applied to software. The Harvard Dataverse Repository requires that depositors always include data, and the recommended way to apply different licenses for different parts of a dataset, such as applying one license for the data files and one for software files, is to create a custom license,
we should exclude licenses meant only for software.

The Digital Curation Centre maintains a page at https://www.dcc.ac.uk/guidance/how-guides/license-research-data that describes two other tyeps of data licenses.

Let's see if any of these licenses' URLs are in our list:
- Open Data Commons: https://opendatacommons.org/licenses
- Open Government Licence (OGL): https://www.nationalarchives.gov.uk/doc/open-government-licence
