# Import Python libraries and functions

In [1]:
import pandas as pd
import re
import requests
import htmlmin

In [2]:
def to_str(item):
    return str(item)

# Get and prepare the data

In [3]:
# Read in license and terms metadata, making sure that all values are string values
licenseMetadataDF = pd.read_csv(
    'licenses_and_terms_metadata_hdv_2022-10-03.csv',
    sep=',', na_filter=False,
    converters={'dataset_version_number': to_str})

# Preview data
licenseMetadataDF.head()

Unnamed: 0,dataset_pid,dataset_pid_url,dataset_version_number,license_name,license_uri,terms_of_use,confidentiality_declaration,special_permissions,restrictions,citation_requirements,depositor_requirements,conditions,disclaimer,terms_of_access,data_access_place,original_archive,availability_status,contact_for_access,size_of_collection,study_completion
0,doi:10.7910/DVN/5PRYPC,https://doi.org/10.7910/DVN/5PRYPC,4.0,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,
1,doi:10.7910/DVN/VIE1H,https://doi.org/10.7910/DVN/VIE1H,248.0,,,,,,,,,,,,,,,,,
2,doi:10.7910/DVN/KKUJWW,https://doi.org/10.7910/DVN/KKUJWW,1.0,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,
3,doi:10.7910/DVN/5E6GBN,https://doi.org/10.7910/DVN/5E6GBN,1.2,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,
4,doi:10.7910/DVN/9MKISZ,https://doi.org/10.7910/DVN/9MKISZ,10.0,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,


In [4]:
# Read in basic metadata of each dataset, making sure that dates are recognized as dates and that the version number is treated as a string
basicMetadataDF = pd.read_csv(
    'basic_metadata_hdv_2022-10-03.csv',
    sep=',', na_filter=False,
    parse_dates=['dataset_publication_date', 'dataset_version_create_time'],
    converters={'dataset_version_number': to_str})

# Preview data
basicMetadataDF.head()

Unnamed: 0,dataset_pid,dataset_pid_url,dataset_version_number,dataset_publication_date,dataset_version_create_time,dataset_version_state,publisher
0,doi:10.7910/DVN/5PRYPC,https://doi.org/10.7910/DVN/5PRYPC,4.0,2020-05-27,2020-06-17 23:49:50+00:00,RELEASED,Harvard Dataverse
1,doi:10.7910/DVN/VIE1H,https://doi.org/10.7910/DVN/VIE1H,248.0,2015-02-11,2015-06-18 19:49:12+00:00,RELEASED,Harvard Dataverse
2,doi:10.7910/DVN/KKUJWW,https://doi.org/10.7910/DVN/KKUJWW,1.0,2015-07-23,2015-07-15 00:57:41+00:00,RELEASED,Harvard Dataverse
3,doi:10.7910/DVN/5E6GBN,https://doi.org/10.7910/DVN/5E6GBN,1.2,2020-04-05,2020-04-05 21:32:37+00:00,RELEASED,Harvard Dataverse
4,doi:10.7910/DVN/9MKISZ,https://doi.org/10.7910/DVN/9MKISZ,10.0,2011-06-09,2013-01-29 23:42:10+00:00,RELEASED,Harvard Dataverse


In [5]:
# Get the counts of dataset versions and counts of datasets in the licenseMetadataDF and datasetsDF dataframes
print('Number of dataset versions licenseMetadataDF: %s' %(len(licenseMetadataDF)))
print('Number of datasets in licenseMetadataDF: %s' % (len(pd.unique(licenseMetadataDF['dataset_pid']))))

print('\nNumber of dataset versions in basicMetadataDF: %s' %(len(basicMetadataDF)))
print('Number of datasets in basicMetadataDF: %s' % (len(pd.unique(basicMetadataDF['dataset_pid']))))

Number of dataset versions licenseMetadataDF: 130768
Number of datasets in licenseMetadataDF: 80278

Number of dataset versions in basicMetadataDF: 130768
Number of datasets in basicMetadataDF: 80278


In [6]:
# Make sure the two dataframes, allDatasetPIDsDF and basicMetadataDF, contain the same datasets
datasetsInlicenseMetadataDFList = set(licenseMetadataDF['dataset_pid'].tolist())
datasetsInbasicMetadataDFList = set(basicMetadataDF['dataset_pid'].tolist())

print(f'Datasets in datasetsInlicenseMetadataDFList: {len(datasetsInlicenseMetadataDFList)}')
print(f'Datasets in basicMetadataDF: {len(datasetsInbasicMetadataDFList)}')

# Create list of dataset PIDs that exist in one list and not the other, if any
difference = datasetsInlicenseMetadataDFList - datasetsInbasicMetadataDFList
difference = list(difference)
print(f'Number of datasets in one table and not the other: {len(difference)}')


Datasets in datasetsInlicenseMetadataDFList: 80278
Datasets in basicMetadataDF: 80278
Number of datasets in one table and not the other: 0


In [7]:
# Merge the two dataframes
licenseAndBasicMetadataDF = pd.merge(
    licenseMetadataDF, basicMetadataDF,
    how='inner',
    on=[
        'dataset_pid',
        'dataset_pid_url',
        'dataset_version_number'])

In [8]:
# Check count of unique datasets in the new licenseAndBasicMetadataDF dataframe
print(f'Number of dataset versions in licenseAndBasicMetadataDF: {len(licenseAndBasicMetadataDF)}')
uniqueDatasetCount = len(pd.unique(licenseAndBasicMetadataDF['dataset_pid']))
print(f'Number of datasets in licenseAndBasicMetadataDF: {uniqueDatasetCount}')

Number of dataset versions in licenseAndBasicMetadataDF: 130768
Number of datasets in licenseAndBasicMetadataDF: 80278


In [9]:
# Create dataframe, latestversion_licenseAndBasicMetadataDF, that contains the license and terms
# metadata of only the latest version of each dataset, using the latest version release time
# from licenseAndBasicMetadataDF
latestversion_licenseAndBasicMetadataDF = (licenseAndBasicMetadataDF
                 .iloc[licenseAndBasicMetadataDF.groupby('dataset_pid')['dataset_version_create_time']
                 .agg(pd.Series.idxmax)]
                 .reset_index(drop=True, inplace=False))

# Remove dataset_version_state and publisher columns and reorder remaining columns
latestversion_licenseAndBasicMetadataDF = latestversion_licenseAndBasicMetadataDF[[
    'dataset_pid_url', 'dataset_publication_date', 'dataset_version_number', 'dataset_version_create_time', 'license_name', 'license_uri', 'terms_of_use', 'confidentiality_declaration', 'special_permissions', 'restrictions', 'citation_requirements', 'depositor_requirements', 'conditions', 'disclaimer', 'terms_of_access', 'data_access_place', 'original_archive', 'availability_status', 'contact_for_access', 'size_of_collection', 'study_completion'
]]

# Check count of datasets in new dataframe
print(f'Count of datasets in latestversion_licenseAndBasicMetadataDF: {len(latestversion_licenseAndBasicMetadataDF)}')


Count of datasets in latestversion_licenseAndBasicMetadataDF: 80278


In [10]:
# Let's make sure each column is the data type we want. Everything should be a string except for the two date columns.
latestversion_licenseAndBasicMetadataDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80278 entries, 0 to 80277
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype              
---  ------                       --------------  -----              
 0   dataset_pid_url              80278 non-null  object             
 1   dataset_publication_date     80278 non-null  datetime64[ns]     
 2   dataset_version_number       80278 non-null  object             
 3   dataset_version_create_time  80278 non-null  datetime64[ns, UTC]
 4   license_name                 80278 non-null  object             
 5   license_uri                  80278 non-null  object             
 6   terms_of_use                 80278 non-null  object             
 7   confidentiality_declaration  80278 non-null  object             
 8   special_permissions          80278 non-null  object             
 9   restrictions                 80278 non-null  object             
 10  citation_requirements        80278 non-null  o

# Use the data

## Which standard licences have depositors entered into the Terms fields?

Now that we have the data we need, let's answer our question:
- Which standard licences have depositors entered into the Terms fields?

As part of the Dataverse software's v5.10 multiple license update:
1. If a dataset has a CC0 waiver and nothing entered in any of its "Terms of Use" fields, the dataset is considered to have a "standard" CC0 license, and in the metadata of those datasets, the "license_name" field contains "CC0 1.0".
2. If a dataset has a CC0 waiver and anything entered in any of its "Terms of Use" fields, the dataset is considered to have a custom license, where "This dataset is made available under a Creative Commons CC0 license with the following additional/modified terms and conditions:" is added to the Terms of Use field, and the additional/modified terms and conditions are whatever has been entered in any of the other "Terms of Use" fields.
3. If the dataset does not have a CC0 waiver but has anything in any of its "Terms of Use" fields, the dataset is considered to have a custom license, where what's in any of the "Terms of Use" fields makes up the custom license.
4. If the dataset does not have a CC0 waiver and nothing is entered in any of the "Terms of Use" fields, the text "This dataset is made available without information on how it can be used. You should communicate with the Contact(s) specified before use." is entered in the Terms of Use field.

To find datasets with "custom licenses", and then search for standard licenses entered in those custom licenses, we want to find datasets that match the second or third conditions.

In [12]:
datasetsWithCustomLicense = (latestversion_licenseAndBasicMetadataDF
                     .query('license_name == "" and terms_of_use != "This dataset is made available without information on how it can be used. You should communicate with the Contact(s) specified before use."')
                     .reset_index(drop=True, inplace=False)
                     )

# Get summary of data
datasetsWithCustomLicense.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32645 entries, 0 to 32644
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype              
---  ------                       --------------  -----              
 0   dataset_pid_url              32645 non-null  object             
 1   dataset_publication_date     32645 non-null  datetime64[ns]     
 2   dataset_version_number       32645 non-null  object             
 3   dataset_version_create_time  32645 non-null  datetime64[ns, UTC]
 4   license_name                 32645 non-null  object             
 5   license_uri                  32645 non-null  object             
 6   terms_of_use                 32645 non-null  object             
 7   confidentiality_declaration  32645 non-null  object             
 8   special_permissions          32645 non-null  object             
 9   restrictions                 32645 non-null  object             
 10  citation_requirements        32645 non-null  o

In [13]:
# For each of these 32,645 datasets, in order to determine which standard licenses might be mentioned in any of their Terms of Use or Terms of Access fields, let's create a new column that's a concatenation of what's entered in all of the Terms fields. Then we can search in just one column.
termsColumns = [
    'terms_of_use',
    'confidentiality_declaration',
    'special_permissions',
    'restrictions',
    'citation_requirements',
    'depositor_requirements',
    'conditions',
    'disclaimer',
    'terms_of_access',
    'data_access_place',
    'original_archive',
    'availability_status',
    'contact_for_access',
    'size_of_collection',
    'study_completion'
]
datasetsWithCustomLicense['combined_terms'] = datasetsWithCustomLicense[termsColumns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

# Strip any leading and trailing white spaces from values the columns
datasetsWithCustomLicense = datasetsWithCustomLicense.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

# Remove from the dataframe any datasets whose combined_terms column is empty. These datasets also have no "custom license", although the metadata says that it does (see bug reported at https://github.com/IQSS/dataverse.harvard.edu/issues/169)
datasetsWithCustomLicense = (datasetsWithCustomLicense
                             .query('combined_terms != ""')
                             .reset_index(drop=True, inplace=False)
                             )

# Export the dataframe as a CSV file to explore in a spreadsheet application
datasetsWithCustomLicense.to_csv('datasets_with_custom_license.csv', index=False)

# Get summary of data
datasetsWithCustomLicense.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32436 entries, 0 to 32435
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype              
---  ------                       --------------  -----              
 0   dataset_pid_url              32436 non-null  object             
 1   dataset_publication_date     32436 non-null  datetime64[ns]     
 2   dataset_version_number       32436 non-null  object             
 3   dataset_version_create_time  32436 non-null  datetime64[ns, UTC]
 4   license_name                 32436 non-null  object             
 5   license_uri                  32436 non-null  object             
 6   terms_of_use                 32436 non-null  object             
 7   confidentiality_declaration  32436 non-null  object             
 8   special_permissions          32436 non-null  object             
 9   restrictions                 32436 non-null  object             
 10  citation_requirements        32436 non-null  o

In [14]:
# Let's get a unique list of what's been entered in each combined Terms field
customMetadataList = list(set(datasetsWithCustomLicense['combined_terms'].tolist()))
print(f'Number of unique strings entered in Terms fields: {len(customMetadataList)}')

Number of unique strings entered in Terms fields: 2616


In [15]:
# Let's figure out which custom licenses are in these strings

# Some of the things entered in the Terms fields are HTML. Let's remove the formatting so that it's faster to search through.
for place, metadata in enumerate(customMetadataList):
    metadata = htmlmin.minify(metadata, remove_empty_space=True)
    customMetadataList[place] = metadata

In [16]:
# There's probably a lot of variety in the ways that standard licenses are identified by license name,
# but there will be less variety, or more standardization, in how the licenses are identified by their
# URLs, so let's extract the URLs from this custom metadata
urlsList = []
regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

for metadata in customMetadataList:
    urls = re.search(regex, metadata)
    if urls is not None:
        # Try normalizing the URLs a bit more to get a better count of the unique URLs
        urls = urls.group().replace('http://', 'https://').replace('%2F', '/').replace('%3A', ':')
        urls = urls.rstrip('/')
        urlsList.append(urls)

# Deduplicate the list of URLs
urlsList = list(set(urlsList))

# Print count of unique URLs and the first five items in the list
print(f'First five URLs in the list of {len(urlsList)} URLs:')
for url in urlsList[:5]:
    print(url)

First five URLs in the list of 225 URLs:
https://thedata.org/citation
https://creativecommons.org/licenses/by-nc/3.0
https://dx.doi.org/10.7910/DVN/BVPQFH
https://www.comifac.org/fr/content/projet-%C2%AB-beyond-timber-%C2%BB-au-del%C3%A0-du-bois-la-comifac-et-ses-partenaires-pr%C3%A9sentent-les
https://lindat.mff.cuni.cz/repository/xmlui/page/licence-UD-1.2


In [18]:
# Looks like the search worked. Now we just need to find the URLs that point to licenses. This is should be easy for Creative Commons licenses:

creativeCommonsUrls = []
for url in urlsList:
    if 'creativecommons.org/licenses' in url:
        creativeCommonsUrls.append(url)
creativeCommonsUrls.sort()

# Print count of unique URLs and all items in the list
print(f'Count of Creative Commons license URLs: {len(creativeCommonsUrls)}')
for url in creativeCommonsUrls:
    print(url)

Count of Creative Commons license URLs: 23
https://creativecommons.org/licenses/by-nc-nd/4.0
https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode
https://creativecommons.org/licenses/by-nc-sa/3.0
https://creativecommons.org/licenses/by-nc-sa/3.0/deed.en
https://creativecommons.org/licenses/by-nc-sa/3.0/us
https://creativecommons.org/licenses/by-nc-sa/4.0
https://creativecommons.org/licenses/by-nc/2.0
https://creativecommons.org/licenses/by-nc/3.0
https://creativecommons.org/licenses/by-nc/4.0
https://creativecommons.org/licenses/by-nc/4.0/legalcode
https://creativecommons.org/licenses/by-nd/4.0
https://creativecommons.org/licenses/by-sa/2.0
https://creativecommons.org/licenses/by-sa/4.0
https://creativecommons.org/licenses/by-sa/4.0/legalcode
https://creativecommons.org/licenses/by/2.0
https://creativecommons.org/licenses/by/2.5/it
https://creativecommons.org/licenses/by/3.0
https://creativecommons.org/licenses/by/3.0/us
https://creativecommons.org/licenses/by/4.0
https://creati

In [19]:
# Looks like there are 23 unique URLs for Creative Commons licenses, but there are duplicates.
# Some URLs end in "/legalcode", which are just the official text of the license. So let's remove that.
# Some URLs end in "/deed" or "/deed.en". When those parts are removed from those URLs, they point to the same web pages, so let's remove those, too.
# And one URL seems to have been generated by the Outlook email client and contains the URL for the BY-NC-SA 3.0 license. That license is already in the list, so let's remove that "safelinks" URL.

for place, url in enumerate(creativeCommonsUrls):
    if '/legalcode' or '/deed' in url:
        url = url.replace('/legalcode', '').replace('/deed.en', '').replace('/deed', '')
        creativeCommonsUrls[place] = url
    if 'safelinks' in url:
        creativeCommonsUrls.pop(place)

# Deduplicate the list
creativeCommonsUrls = list(set(creativeCommonsUrls))

# Print count of unique URLs and all items in the list
print(len(creativeCommonsUrls))
for url in creativeCommonsUrls:
    print(url)

15
https://creativecommons.org/licenses/by-nc-sa/3.0/us
https://creativecommons.org/licenses/by-nc-sa/3.0
https://creativecommons.org/licenses/by-nd/4.0
https://creativecommons.org/licenses/by-sa/4.0
https://creativecommons.org/licenses/by-nc/4.0
https://creativecommons.org/licenses/by/3.0/us
https://creativecommons.org/licenses/by-nc/3.0
https://creativecommons.org/licenses/by/2.0
https://creativecommons.org/licenses/by/3.0
https://creativecommons.org/licenses/by-nc-nd/4.0
https://creativecommons.org/licenses/by/2.5/it
https://creativecommons.org/licenses/by/4.0
https://creativecommons.org/licenses/by-nc/2.0
https://creativecommons.org/licenses/by-sa/2.0
https://creativecommons.org/licenses/by-nc-sa/4.0


So there are 15 different Creative Commons licenses used in datasets published in the Harvard Dataverse Repository.

## Other types of licenses
What about URLs that point to different types of licenses? For that, lets see what URLs are in SPDX, which maintains structured information about standard licenses.

In [39]:
spdxDataUrl = 'https://raw.githubusercontent.com/spdx/license-list-data/master/json/licenses.json'
response = requests.get(spdxDataUrl)
spdxData = response.json()

# Let's save the license information from this list, removing the Creative Commons licenses
licenseDict = []
for license in spdxData['licenses']:
    if 'Creative Commons' not in license['name']:
        licenseDict.append(license)

print(f'Count of licenses in SPDX (excluding Creative Commons licenses: {len(licenseDict)}')

Count of licenses in SPDX (excluding Creative Commons licenses: 447


447 is a lot of licenses. Trying to find if any of these licenses have been applied to datasets in the Harvard repository might not be worth the work.

Instead, let's see what types of licenses other Dataverse repositories are using. I've collected this information using the Dataverse API and sorted the licenses by type.

## What types of licenses are other Dataverse repositories using?

Here's a list of the types of licenses used by other Dataverse repositories and the base URLs used for each license.

- Apache: https://www.apache.org/licenses
- BSD: https://opensource.org/licenses
- GNU: https://www.gnu.org/licenses
- MIT: https://opensource.org/licenses/MIT
- Mozilla: https://mozilla.org/MPL

All of these licenses are meant to be applied to software. The Harvard Dataverse Repository requires that depositors always include data, and the recommended way to apply different licenses for different parts of a dataset, such as applying one license for the data files and one for software files, is to create a custom license.

So we should exclude licenses meant only for software.


## What licenses are generally recommended for data?

The Digital Curation Centre maintains a page at https://www.dcc.ac.uk/guidance/how-guides/license-research-data that describes two other types of data licenses.

- Open Data Commons: https://opendatacommons.org/licenses
- Open Government Licence (OGL): https://www.nationalarchives.gov.uk/doc/open-government-licence

Let's see if any of these licenses' URLs are already used in datasets published in the Harvard repository.

In [40]:
# List any Open Data Commons license URLs found in dataset metadata
openDataCommonsUrls = []
for url in urlsList:
    if 'opendatacommons.org/licenses' in url:
        openDataCommonsUrls.append(url.strip())
openDataCommonsUrls = list(set(openDataCommonsUrls))
openDataCommonsUrls.sort()
print(f'Count of Open Data Commons License URLs: {len(openDataCommonsUrls)}')
for url in openDataCommonsUrls:
    print(url)

# List any Open Government License URLs found in dataset metadata
openGovernmentLicenseUrls = []
for url in urlsList:
    if 'open-government-licence' in url:
        openGovernmentLicenseUrls.append(url.strip())
openGovernmentLicenseUrls = list(set(openGovernmentLicenseUrls))
openGovernmentLicenseUrls.sort()
print(f'\nCount of Open Government Licence URLs: {len(openGovernmentLicenseUrls)}')
for url in openGovernmentLicenseUrls:
    print(url)

Count of Open Data Commons License URLs: 4
https://opendatacommons.org/licenses/by/1.0
https://opendatacommons.org/licenses/odbl
https://opendatacommons.org/licenses/odbl/1-0
https://opendatacommons.org/licenses/odbl/1.0

Count of Open Government Licence URLs: 1
https://open.canada.ca/en/open-government-licence-canada


Two Open Data Commons licenses are used.

It looks like there are no Open Government Licences from the UK's National Archives, but by searching for URLs with the string "open-government-license", we found that Canada's Open Government License has been used.