In [1]:
# from distutils.version import StrictVersion, LooseVersion
import numpy as np
# from numpy import nan
import pandas as pd
import packaging

In [2]:
def to_str(item: object) -> str:
    """Return the ``str()`` representation of ``item``.

    Used in this notebook as the ``converters`` callback for the
    ``dataset_version_number`` column when the CSV files are read
    (presumably so version numbers such as "1.10" are kept as text instead of
    being parsed as floats -- confirm against the intended analysis).
    """
    text = str(item)
    return text

In [12]:
# Load the license and terms metadata export.
# - na_filter=False: empty cells are read as empty strings, not NaN.
# - dataset_version_number is passed through to_str so it stays text.
licenseMetadataDF = pd.read_csv(
    'licenses_and_terms_metadata_hdv_2022-10-03.csv',
    sep=',',
    na_filter=False,
    converters={'dataset_version_number': to_str},
)

# Preview the first rows as the cell output.
licenseMetadataDF.head()

Unnamed: 0,dataset_pid,dataset_pid_url,dataset_version_number,license_name,license_uri,terms_of_use,confidentiality_declaration,special_permissions,restrictions,citation_requirements,depositor_requirements,conditions,disclaimer,terms_of_access,data_access_place,original_archive,availability_status,contact_for_access,size_of_collection,study_completion
0,doi:10.7910/DVN/5PRYPC,https://doi.org/10.7910/DVN/5PRYPC,4.0,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,
1,doi:10.7910/DVN/VIE1H,https://doi.org/10.7910/DVN/VIE1H,248.0,,,,,,,,,,,,,,,,,
2,doi:10.7910/DVN/KKUJWW,https://doi.org/10.7910/DVN/KKUJWW,1.0,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,
3,doi:10.7910/DVN/5E6GBN,https://doi.org/10.7910/DVN/5E6GBN,1.2,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,
4,doi:10.7910/DVN/9MKISZ,https://doi.org/10.7910/DVN/9MKISZ,10.0,CC0 1.0,http://creativecommons.org/publicdomain/zero/1.0,,,,,,,,,,,,,,,


In [23]:
# Load the basic metadata export.
# - na_filter=False: empty cells are read as empty strings, not NaN.
# - the two date/time columns are parsed into datetimes.
# - dataset_version_number is passed through to_str so it stays text.
basicMetadataDF = pd.read_csv(
    'basic_metadata_hdv_2022-10-03.csv',
    sep=',',
    na_filter=False,
    parse_dates=[
        'dataset_publication_date',
        'dataset_version_create_time',
    ],
    converters={'dataset_version_number': to_str},
)

# Preview the first rows as the cell output.
basicMetadataDF.head()

Unnamed: 0,dataset_pid,dataset_pid_url,dataset_version_number,dataset_publication_date,dataset_version_create_time,dataset_version_state,publisher
0,doi:10.7910/DVN/5PRYPC,https://doi.org/10.7910/DVN/5PRYPC,4.0,2020-05-27,2020-06-17 23:49:50+00:00,RELEASED,Harvard Dataverse
1,doi:10.7910/DVN/VIE1H,https://doi.org/10.7910/DVN/VIE1H,248.0,2015-02-11,2015-06-18 19:49:12+00:00,RELEASED,Harvard Dataverse
2,doi:10.7910/DVN/KKUJWW,https://doi.org/10.7910/DVN/KKUJWW,1.0,2015-07-23,2015-07-15 00:57:41+00:00,RELEASED,Harvard Dataverse
3,doi:10.7910/DVN/5E6GBN,https://doi.org/10.7910/DVN/5E6GBN,1.2,2020-04-05,2020-04-05 21:32:37+00:00,RELEASED,Harvard Dataverse
4,doi:10.7910/DVN/9MKISZ,https://doi.org/10.7910/DVN/9MKISZ,10.0,2011-06-09,2013-01-29 23:42:10+00:00,RELEASED,Harvard Dataverse


In [24]:
# Check data

# Report, for licenseMetadataDF and basicMetadataDF, the number of rows
# (dataset versions) and the number of distinct dataset PIDs (datasets).
print('Number of dataset versions licenseMetadataDF: %s' % len(licenseMetadataDF))
print('Number of datasets in licenseMetadataDF: %s' % len(licenseMetadataDF['dataset_pid'].unique()))

print('\nNumber of dataset versions in basicMetadataDF: %s' % len(basicMetadataDF))
print('Number of datasets in basicMetadataDF: %s' % len(basicMetadataDF['dataset_pid'].unique()))

Number of dataset versions licenseMetadataDF: 130768
Number of datasets in licenseMetadataDF: 80278

Number of dataset versions in basicMetadataDF: 130768
Number of datasets in basicMetadataDF: 80278


In [25]:
# Make sure the two dataframes, licenseMetadataDF and basicMetadataDF, contain
# the same datasets. (Despite the "List" suffix, both names hold sets.)
datasetsInlicenseMetadataDFList = set(licenseMetadataDF['dataset_pid'].tolist())
datasetsInbasicMetadataDFList = set(basicMetadataDF['dataset_pid'].tolist())

print(f'Datasets in datasetsInlicenseMetadataDFList: {len(datasetsInlicenseMetadataDFList)}')
print(f'Datasets in basicMetadataDF: {len(datasetsInbasicMetadataDFList)}')

# Create list of dataset PIDs that exist in exactly one of the two tables.
# The symmetric difference (^) is required here: a plain `license - basic`
# would miss PIDs that appear only in basicMetadataDF and could report 0
# even when the tables disagree.
difference = list(datasetsInlicenseMetadataDFList ^ datasetsInbasicMetadataDFList)
print(f'Number of datasets in one table and not the other: {len(difference)}')


Datasets in datasetsInlicenseMetadataDFList: 80278
Datasets in basicMetadataDF: 80278
Number of datasets in one table and not the other: 0


In [30]:
# Merge the two dataframes on the columns that identify a dataset version.
# validate='one_to_one' makes pandas raise MergeError if a key combination is
# duplicated in either frame, instead of silently multiplying rows in the
# merged result. Note that how='inner' still drops rows without a match in
# the other frame.
licenseAndBasicMetadataDF = pd.merge(
    licenseMetadataDF, basicMetadataDF,
    how='inner',
    on=[
        'dataset_pid',
        'dataset_pid_url',
        'dataset_version_number'],
    validate='one_to_one')

In [32]:
# Report the number of rows (dataset versions) and the number of distinct
# dataset PIDs (datasets) in the merged dataframe.
print(f'Number of dataset versions in licenseAndBasicMetadataDF: {len(licenseAndBasicMetadataDF)}')
uniqueDatasetCount = len(licenseAndBasicMetadataDF['dataset_pid'].unique())
print(f'Number of datasets in licenseAndBasicMetadataDF: {uniqueDatasetCount}')

Number of dataset versions in licenseAndBasicMetadataDF: 130768
Number of datasets in licenseAndBasicMetadataDF: 80278


In [34]:
# Create dataframe, latestversion_licenseAndBasicMetadataDF, that contains the
# license and terms metadata of only the latest version of each dataset: for
# every dataset_pid, keep the row with the greatest dataset_version_create_time.
#
# idxmax() returns index *labels*, so the rows are selected with .loc.
# Positional .iloc would only pick the same rows while the index is the
# default 0..n-1 range, and would silently select wrong rows otherwise.
latestversion_licenseAndBasicMetadataDF = (
    licenseAndBasicMetadataDF
    .loc[licenseAndBasicMetadataDF
         .groupby('dataset_pid')['dataset_version_create_time']
         .idxmax()]
    .reset_index(drop=True))

# Replace any blank values (empty or whitespace-only strings) with NaN
latestversion_licenseAndBasicMetadataDF = latestversion_licenseAndBasicMetadataDF.replace(r'^\s*$', np.nan, regex=True)

print(f'Count of pids in latestversion_mergedDF: {len(latestversion_licenseAndBasicMetadataDF)}')


Count of pids in latestversion_mergedDF: 80278


In [35]:
# Let's make sure each column is the object type we want. Everything should be a string
# (shown as dtype "object") except for the two date columns. The summary also lists the
# non-null count per column, i.e. how many latest versions have a value in each field.
latestversion_licenseAndBasicMetadataDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80278 entries, 0 to 80277
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype              
---  ------                       --------------  -----              
 0   dataset_pid                  80278 non-null  object             
 1   dataset_pid_url              80278 non-null  object             
 2   dataset_version_number       80278 non-null  object             
 3   license_name                 35351 non-null  object             
 4   license_uri                  35351 non-null  object             
 5   terms_of_use                 44716 non-null  object             
 6   confidentiality_declaration  82 non-null     object             
 7   special_permissions          1301 non-null   object             
 8   restrictions                 1099 non-null   object             
 9   citation_requirements        2190 non-null   object             
 10  depositor_requirements       538 non-null    o

Count of license names: 34
