In [82]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from numpy import nan

In [48]:
def to_str(item) -> str:
    """Return the string form of ``item``.

    Used as a ``pd.read_csv`` converter so that dataset version numbers
    are kept as text instead of being type-inferred.
    """
    text = str(item)
    return text

In [49]:
# Load the license and terms metadata export.
# - na_filter=False: pandas does not convert empty cells to NaN at load time
# - dataset_version_number is passed through to_str so it is stored as text
licenseReadOptions = dict(
    sep=',',
    na_filter=False,
    converters={'dataset_version_number': to_str},
)
licenseMetadataDF = pd.read_csv(
    'licenses_and_terms_metadata_hdv_2022-10-03.csv', **licenseReadOptions)
licenseMetadataDF.head()

Unnamed: 0,dataset_pid,dataset_pid_url,dataset_version_number,license_name,license_uri,terms_of_use,confidentiality_declaration,special_permissions,restrictions,citation_requirements,depositor_requirements,conditions,disclaimer,terms_of_access,data_access_place,original_archive,availability_status,contact_for_access,size_of_collection,study_completion
0,doi:10.7910/DVN/5PRYPC,https://doi.org/10.7910/DVN/5PRYPC,4.0,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,
1,doi:10.7910/DVN/VIE1H,https://doi.org/10.7910/DVN/VIE1H,248.0,,,,,,,,,,,,,,,,,
2,doi:10.7910/DVN/KKUJWW,https://doi.org/10.7910/DVN/KKUJWW,1.0,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,
3,doi:10.7910/DVN/5E6GBN,https://doi.org/10.7910/DVN/5E6GBN,1.2,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,
4,doi:10.7910/DVN/9MKISZ,https://doi.org/10.7910/DVN/9MKISZ,10.0,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,


In [50]:
# Load the basic metadata export.
# - the publication date and version creation time columns are parsed as dates
# - na_filter=False: pandas does not convert empty cells to NaN at load time
# - dataset_version_number is passed through to_str so it is stored as text
basicReadOptions = dict(
    sep=',',
    na_filter=False,
    parse_dates=['dataset_publication_date', 'dataset_version_create_time'],
    converters={'dataset_version_number': to_str},
)
basicMetadataDF = pd.read_csv('basic_metadata_hdv_2022-10-03.csv', **basicReadOptions)

basicMetadataDF.head()

Unnamed: 0,dataset_pid,dataset_pid_url,dataset_version_number,dataset_publication_date,dataset_version_create_time,dataset_version_state,publisher
0,doi:10.7910/DVN/5PRYPC,https://doi.org/10.7910/DVN/5PRYPC,4.0,2020-05-27,2020-06-17 23:49:50+00:00,RELEASED,Harvard Dataverse
1,doi:10.7910/DVN/VIE1H,https://doi.org/10.7910/DVN/VIE1H,248.0,2015-02-11,2015-06-18 19:49:12+00:00,RELEASED,Harvard Dataverse
2,doi:10.7910/DVN/KKUJWW,https://doi.org/10.7910/DVN/KKUJWW,1.0,2015-07-23,2015-07-15 00:57:41+00:00,RELEASED,Harvard Dataverse
3,doi:10.7910/DVN/5E6GBN,https://doi.org/10.7910/DVN/5E6GBN,1.2,2020-04-05,2020-04-05 21:32:37+00:00,RELEASED,Harvard Dataverse
4,doi:10.7910/DVN/9MKISZ,https://doi.org/10.7910/DVN/9MKISZ,10.0,2011-06-09,2013-01-29 23:42:10+00:00,RELEASED,Harvard Dataverse


In [51]:
# Check data

# Count the rows (dataset versions) and the distinct dataset PIDs in the
# licenseMetadataDF and basicMetadataDF dataframes
licenseVersionCount = len(licenseMetadataDF)
licenseDatasetCount = len(pd.unique(licenseMetadataDF['dataset_pid']))
basicVersionCount = len(basicMetadataDF)
basicDatasetCount = len(pd.unique(basicMetadataDF['dataset_pid']))

print('Number of dataset versions licenseMetadataDF: %s' % licenseVersionCount)
print('Number of datasets in licenseMetadataDF: %s' % licenseDatasetCount)

print('\nNumber of dataset versions in basicMetadataDF: %s' % basicVersionCount)
print('Number of datasets in basicMetadataDF: %s' % basicDatasetCount)

Number of dataset versions licenseMetadataDF: 130768
Number of datasets in licenseMetadataDF: 80278

Number of dataset versions in basicMetadataDF: 130768
Number of datasets in basicMetadataDF: 80278


In [52]:
# Make sure the two dataframes, licenseMetadataDF and basicMetadataDF, contain the same datasets
datasetsInlicenseMetadataDFList = set(licenseMetadataDF['dataset_pid'].tolist())
datasetsInbasicMetadataDFList = set(basicMetadataDF['dataset_pid'].tolist())

print(f'Datasets in datasetsInlicenseMetadataDFList: {len(datasetsInlicenseMetadataDFList)}')
print(f'Datasets in basicMetadataDF: {len(datasetsInbasicMetadataDFList)}')

# Create list of dataset PIDs that exist in one list and not the other, if any.
# The symmetric difference (^) is used because a plain set difference (A - B)
# would only report PIDs missing from basicMetadataDF and silently ignore PIDs
# that are missing from licenseMetadataDF.
difference = list(datasetsInlicenseMetadataDFList ^ datasetsInbasicMetadataDFList)
print(f'Number of datasets in one table and not the other: {len(difference)}')


Datasets in datasetsInlicenseMetadataDFList: 80278
Datasets in basicMetadataDF: 80278
Number of datasets in one table and not the other: 0


In [53]:
# Merge the two dataframes on the columns that identify a dataset version.
# An inner join keeps only the versions present in both dataframes.
mergeKeys = ['dataset_pid', 'dataset_pid_url', 'dataset_version_number']
licenseAndBasicMetadataDF = licenseMetadataDF.merge(
    basicMetadataDF, how='inner', on=mergeKeys)

In [54]:
# Check the merged dataframe: its row count (dataset versions) and its number
# of distinct dataset PIDs should match the counts of the two input dataframes
uniqueDatasetCount = len(licenseAndBasicMetadataDF['dataset_pid'].unique())
print(f'Number of dataset versions in licenseAndBasicMetadataDF: {len(licenseAndBasicMetadataDF)}')
print(f'Number of datasets in licenseAndBasicMetadataDF: {uniqueDatasetCount}')

Number of dataset versions in licenseAndBasicMetadataDF: 130768
Number of datasets in licenseAndBasicMetadataDF: 80278


In [68]:
# Create dataframe, latestversion_licenseAndBasicMetadataDF, that contains the license and terms metadata of only the
# latest version of each dataset, using the latest version create time from the merged basicMetadataDF.
# idxmax returns index *labels*, so the rows are selected with label-based .loc; positional .iloc would
# pick the wrong rows whenever the index is not the default 0..n-1 range.
latestVersionRowLabels = (licenseAndBasicMetadataDF
                 .groupby('dataset_pid')['dataset_version_create_time']
                 .idxmax())
latestversion_licenseAndBasicMetadataDF = (licenseAndBasicMetadataDF
                 .loc[latestVersionRowLabels]
                 .reset_index(drop=True))

# Replace any blank (empty or whitespace-only) values with NaN
latestversion_licenseAndBasicMetadataDF = latestversion_licenseAndBasicMetadataDF.replace(r'^\s*$', np.nan, regex=True)

# Keep only the columns listed below, in this order. Every other column
# (including dataset_pid, dataset_version_state and publisher) is dropped.
latestversion_licenseAndBasicMetadataDF = latestversion_licenseAndBasicMetadataDF[[
    'dataset_pid_url', 'dataset_publication_date', 'dataset_version_number', 'dataset_version_create_time', 'license_name', 'license_uri', 'terms_of_use', 'confidentiality_declaration', 'special_permissions', 'restrictions', 'citation_requirements', 'depositor_requirements', 'conditions', 'disclaimer', 'terms_of_access', 'data_access_place', 'original_archive', 'availability_status', 'contact_for_access', 'size_of_collection', 'study_completion'
]]

print(f'Count of pids in latestversion_mergedDF: {len(latestversion_licenseAndBasicMetadataDF)}')


Count of pids in latestversion_mergedDF: 80278


In [69]:
# Let's make sure each column has the dtype we want. Everything should be a string (object dtype)
# except for the two date columns, which should be datetime64.
latestversion_licenseAndBasicMetadataDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80278 entries, 0 to 80277
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype              
---  ------                       --------------  -----              
 0   dataset_pid_url              80278 non-null  object             
 1   dataset_publication_date     80278 non-null  datetime64[ns]     
 2   dataset_version_number       80278 non-null  object             
 3   dataset_version_create_time  80278 non-null  datetime64[ns, UTC]
 4   license_name                 35351 non-null  object             
 5   license_uri                  35351 non-null  object             
 6   terms_of_use                 44716 non-null  object             
 7   confidentiality_declaration  82 non-null     object             
 8   special_permissions          1301 non-null   object             
 9   restrictions                 1099 non-null   object             
 10  citation_requirements        2190 non-null   o

Now that we have the data we need, let's get some answers:
- How many datasets have a CC0 waiver?
- How many datasets have other types of licenses or other terms of use?
- What standard licenses have been applied to datasets that don't have a CC0 waiver?

In [70]:
# Create a dataframe containing the datasets whose latest versions have a CC0 waiver:
# either the license is "CC0 1.0" or the terms of use start from the CC0 custom-terms boilerplate.
cc0CustomTermsMessage = 'This dataset is made available under a Creative Commons CC0 license with the following additional/modified terms and conditions'

hasCC0License = latestversion_licenseAndBasicMetadataDF['license_name'] == 'CC0 1.0'
# regex=False: the boilerplate is matched as a literal sentence, not as a pattern.
# na=False: datasets with no terms of use (NaN) count explicitly as "does not contain"
# instead of leaving NaN in the boolean mask.
hasCC0CustomTerms = (latestversion_licenseAndBasicMetadataDF['terms_of_use']
                       .str.contains(cc0CustomTermsMessage, regex=False, na=False))

datasetsWithCC0DF = (latestversion_licenseAndBasicMetadataDF
                       .loc[hasCC0License | hasCC0CustomTerms]
                       .reset_index(drop=True)
                       )
datasetsWithCC0DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36580 entries, 0 to 36579
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype              
---  ------                       --------------  -----              
 0   dataset_pid_url              36580 non-null  object             
 1   dataset_publication_date     36580 non-null  datetime64[ns]     
 2   dataset_version_number       36580 non-null  object             
 3   dataset_version_create_time  36580 non-null  datetime64[ns, UTC]
 4   license_name                 35351 non-null  object             
 5   license_uri                  35351 non-null  object             
 6   terms_of_use                 1229 non-null   object             
 7   confidentiality_declaration  27 non-null     object             
 8   special_permissions          68 non-null     object             
 9   restrictions                 104 non-null    object             
 10  citation_requirements        239 non-null    o

As part of the Dataverse software's multiple license update:
- If a dataset has a CC0 waiver and nothing entered in any of its "Terms of Use" fields, the dataset is considered to have a "standard" CC0 license
- If a dataset has a CC0 waiver and anything entered in any of its "Terms of Use" fields, the dataset is considered to have a custom license, where "This dataset is made available under a Creative Commons CC0 license with the following additional/modified terms and conditions:" is entered in the Terms of Use field, followed by whatever text was entered in any of the "Terms of Use" fields.

For this second case, let's see what's entered in these "custom licenses".

In [58]:
# Keep only the CC0 datasets that have text in their terms_of_use column
hasTermsOfUse = datasetsWithCC0DF['terms_of_use'].notnull()
datasetsWithCC0CustomDF = datasetsWithCC0DF.loc[hasTermsOfUse].reset_index(drop=True)
print(f'Datasets with a custom license that includes CC0: {len(datasetsWithCC0CustomDF)}')

Datasets with a custom license that includes CC0: 1229


In [59]:
# Let's see what's in these custom licenses
termsOfUseList = pd.unique(datasetsWithCC0CustomDF['terms_of_use'])
print(f'Count of unique "terms of use" strings that include CC0: {len(termsOfUseList)}')

# Strip the boilerplate message first so it's easier to see what the
# "additional/modified terms and conditions" are
strippedTermsList = [
    rawTerms.replace('This dataset is made available under a Creative Commons CC0 license with the following additional/modified terms and conditions:', '')
    for rawTerms in termsOfUseList
]

# Print the stripped terms as a numbered list
count = 0  # stays 0 when there are no terms to list
for count, terms in enumerate(strippedTermsList, start=1):
    print(f'{count}. {terms}')

Count of unique "terms of use" strings that include CC0: 16
1.  CC0 Waiver
2.  
3.  Please contact me for permission to re-use data
4.  <a rel="license" href="http://creativecommons.org/publicdomain/mark/1.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/p/mark/1.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/publicdomain/mark/1.0/">Creative Commons Public Domain Mark 1.0 License</a>.
5.  Attribution 4.0 International (CC BY 4.0)
6.  One year embargo
7.  Licensed electronic resources are restricted to members of the MIT community and for the purposes of research, education, and scholarship.  Under MIT's licenses for electronic resources, users generally may not: - 
<p><ul><li>redistribute the materials or permit anyone other than a member of the MIT community to use them</li> <li>remove, obscure or modify any copyright or other notices included in the materials</li> <li>use the ma

It looks like the only standard license mentioned in this list of custom licenses is the CC BY 4.0 license.

While we're interested here only in seeing which other standard licenses are included in these "custom terms", let's note some issues from this list of 16 "custom licenses" with how terms of use metadata have been recorded:
- In the first term in the list, it looks like there are cases where the "custom" parts of some datasets' terms of use are actually just CC0 waivers. We should probably see which datasets these are and if they can be updated to have only a CC0 waiver (and not a custom license)
- The second term in the list is blank. So the custom license reads that "This dataset is made available under a Creative Commons CC0 license with the following additional/modified terms and conditions:" but nothing follows the colon. We should probably see which datasets these are. It's possible that they can also be updated to have only a CC0 waiver (and not a custom license).
- Some of the other terms in the list include a CC BY 4.0 license or language that might conflict with the CC0 waiver. While the Harvard Dataverse Repository isn't obligated to make sure that terms applied to deposits make sense, this should be looked into.

Now let's look at datasets that have something entered in their Terms of Use fields but don't have a CC0 waiver.

In [71]:
# Swap missing values for the placeholder string 'NONE' so the query below can
# compare every cell as a plain string
latestversion_licenseAndBasicMetadataDF = latestversion_licenseAndBasicMetadataDF.replace(np.nan, 'NONE')

# Create a dataframe containing the datasets whose latest versions have custom licenses and no CC0 waivers:
# no license name, some terms of use, and terms that are neither the CC0 custom-terms boilerplate
# nor the "made available without information on how it can be used" notice
customLicenseQuery = (
    'license_name == "NONE" and terms_of_use != "NONE" '
    'and ~terms_of_use.str.contains("This dataset is made available under a Creative Commons CC0 license with the following additional/modified terms and conditions") '
    'and terms_of_use != "This dataset is made available without information on how it can be used. You should communicate with the Contact(s) specified before use."'
)
datasetsWithCustomLicenseDF = (
    latestversion_licenseAndBasicMetadataDF
    .query(customLicenseQuery)
    .reset_index(drop=True)
)
print(f'Count of datasets whose latest versions have custom licenses and no CC0 waivers: {len(datasetsWithCustomLicenseDF)}')

Count of datasets whose latest versions have custom licenses and no CC0 waivers: 31205


In [72]:
# For each of these datasets, in order to determine which standard licenses might be mentioned in any of their Terms of Use or Terms of Access fields, let's create a new column that's a concatenation of what's entered in all of the Terms fields
termsColumns = [
    'terms_of_use',
    'confidentiality_declaration',
    'special_permissions',
    'restrictions',
    'citation_requirements',
    'depositor_requirements',
    'conditions',
    'disclaimer',
    'terms_of_access',
    'data_access_place',
    'original_archive',
    'availability_status',
    'contact_for_access',
    'size_of_collection',
    'study_completion'
]
# Join the Terms fields of each row into one ' | '-separated string
datasetsWithCustomLicenseDF['combined_terms'] = datasetsWithCustomLicenseDF[termsColumns].apply(lambda row: ' | '.join(row.values.astype(str)), axis=1)

# Remove line breaks embedded in the text columns (including combined_terms).
# regex=True is required: without it DataFrame.replace only matches cells whose
# *entire* value is '\r' or '\n', so line breaks inside longer text were left in place.
# Each line break becomes a single space so that words and URLs on adjacent lines
# are not fused together.
textColumns = datasetsWithCustomLicenseDF.select_dtypes(include='object').columns
datasetsWithCustomLicenseDF[textColumns] = (
    datasetsWithCustomLicenseDF[textColumns].replace(r'\r\n|\r|\n', ' ', regex=True))

In [73]:
# Check the columns of datasetsWithCustomLicenseDF, which now includes the combined_terms column
datasetsWithCustomLicenseDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31205 entries, 0 to 31204
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype              
---  ------                       --------------  -----              
 0   dataset_pid_url              31205 non-null  object             
 1   dataset_publication_date     31205 non-null  datetime64[ns]     
 2   dataset_version_number       31205 non-null  object             
 3   dataset_version_create_time  31205 non-null  datetime64[ns, UTC]
 4   license_name                 31205 non-null  object             
 5   license_uri                  31205 non-null  object             
 6   terms_of_use                 31205 non-null  object             
 7   confidentiality_declaration  31205 non-null  object             
 8   special_permissions          31205 non-null  object             
 9   restrictions                 31205 non-null  object             
 10  citation_requirements        31205 non-null  o

In [63]:
# Get license info from SPDX's list of licenses
# spdxDataUrl = 'https://raw.githubusercontent.com/spdx/license-list-data/master/json/licenses.json'
# response = requests.get(spdxDataUrl)
# spdxData = response.json()

# licenseNames = []
# for license in spdxData['licenses']:
#     reference = license['reference']
#     name = license['name']
#     licenseNames.append(name)
#     licenseId = license['licenseId']
#     seeAlso = license['seeAlso']
#
# print(licenseNames)


In [81]:
# Let's get a unique list of what's been entered in these Terms fields. Then we can extract the Creative Commons URLs from the list to see which Creative Commons licenses have been used
uniqueCombinedTerms = set(datasetsWithCustomLicenseDF['combined_terms'].tolist())

# Remove the "| NONE" entries (placeholders for empty Terms fields) from each item
customMetadataList = [
    combinedTerms.replace(' | NONE', '')
    for combinedTerms in uniqueCombinedTerms
]

# Count items in the list
print(len(customMetadataList))

2418


In [115]:
# For each item in the customMetadataList, use regex to extract every URL it contains.
# The pattern is compiled once, outside the loop, because it never changes.
urlRegex = re.compile(r"(?i)\b((?:https?://creativecommons|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")

urlsList = []
for metadata in customMetadataList:
    # finditer collects all matches; re.search returned only the first URL in
    # each text, so a Creative Commons URL that came after any other URL was missed.
    # match.group() is the whole URL (the pattern's inner groups are not needed).
    urlsList.extend(match.group() for match in urlRegex.finditer(metadata))

# Keep only the Creative Commons license URLs, deduplicated and sorted so the
# exported file is identical from run to run
creativeCommonsUrlsList = sorted(
    {url for url in urlsList if 'creativecommons.org/licenses' in url})

# Export list to a txt file, one URL per line
with open('list_of_license_urls.txt', 'w', encoding='utf-8') as f:
    f.writelines(f'{url}\n' for url in creativeCommonsUrlsList)


In [42]:
# Export the datasets with custom licenses to a CSV file on the current user's Desktop.
# Path.home() replaces the hard-coded home directory so the cell also runs under
# other user accounts; the file name and the Desktop location are unchanged.
customLicenseCsvPath = Path.home() / 'Desktop' / 'datasetsWithCustomLicenseDF.csv'
datasetsWithCustomLicenseDF.to_csv(customLicenseCsvPath, index=False)