In [1]:
from functools import reduce
import pandas as pd

In [2]:
# Import CSV file that lists PIDs of all datasets and which Dataverse installation
# they're published in, removing the PIDs of datasets whose metadata could not be
# saved, i.e. dataverse_json_export_saved is FALSE
datasetPIDsDF = (pd
    .read_csv(
        'dataset_pids_from_most_known_dataverse_installations.csv',
        usecols=lambda x: x not in ['dataset_pid', 'dataverse_name'],
        sep=',', na_filter=False)
    .query('(dataverse_json_export_saved == True)')
    .drop(columns=['dataverse_json_export_saved'])
    .reset_index(drop=True, inplace=False)
 )

datasetPIDsDF.head()

Unnamed: 0,installation,dataset_pid_url,dataverse_alias
0,CIDACS,https://doi.org/10.57833/cidacs/WV4JWB,ProjZika
1,CIDACS,https://doi.org/10.57833/cidacs/TGAIVO,ProjZika
2,CIDACS,https://doi.org/10.57833/cidacs/0FZP7V,ProjZika
3,CIDACS,https://doi.org/10.57833/cidacs/5BMSIX,ProjGates0
4,CIDACS,https://doi.org/10.57833/cidacs/I56ZVA,ProjGates0


In [3]:
# Import basic metadata of all dataset versions
datasetVersionMetadataDF = pd.read_csv(
    'basic_metadata_2022.10.02-2022.10.03.csv',
    usecols=lambda x: x not in [
        'dataset_pid', 'dataset_publication_date',
        'dataset_version_state', 'publisher'],
    parse_dates=['dataset_version_create_time'],
    sep=',', na_filter=False)

datasetVersionMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time
0,https://doi.org/10.48370/OFD/DBJUEM,1.0,2022-05-25 10:17:50+00:00
1,https://doi.org/10.21410/7E4/4WG94W,2.1,2020-05-13 16:06:28+00:00
2,https://doi.org/10.7910/DVN/5PRYPC,4.0,2020-06-17 23:49:50+00:00
3,https://doi.org/10.17026/dans-2zm-dsmz,1.0,2022-01-27 20:15:47+00:00
4,https://doi.org/10.7910/DVN/VIE1H,248.0,2015-06-18 19:49:12+00:00


In [4]:
print(f'Count of datasets: {len(datasetPIDsDF)}')
print(f'Count of dataset versions: {len(datasetVersionMetadataDF)}')

Count of datasets: 340857
Count of dataset versions: 473910


In [5]:
# From the datasetVersionMetadataDF, lets use the version publication dates to get the PIDs and dataset version numbers of only the most recently published versions of each dataset. The resulting dataframe should contain the same number of rows as the datasetPIDsDF.

latestDatasetVersionsDF = (datasetVersionMetadataDF
    .iloc[
        datasetVersionMetadataDF
    .groupby('dataset_pid_url')['dataset_version_create_time']
    .agg(pd.Series.idxmax)]
    .reset_index(drop=True, inplace=False))

latestDatasetVersionsDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time
0,http://dx.doi.org/10.26193/00HBWG,2.0,2019-05-13 08:43:01+00:00
1,http://dx.doi.org/10.26193/01P0AI,2.0,2019-05-13 06:23:26+00:00
2,http://dx.doi.org/10.26193/04F7C1,2.0,2019-05-13 10:02:15+00:00
3,http://dx.doi.org/10.26193/07R31R,2.0,2019-05-13 06:50:29+00:00
4,http://dx.doi.org/10.26193/0AF6TZ,5.0,2022-02-02 23:01:26+00:00


In [6]:
print(f'Count of datasets: {len(datasetPIDsDF)}')
print(f'Count of rows in latestDatasetVersionsDF: {len(latestDatasetVersionsDF)}')

Count of datasets: 340857
Count of rows in latestDatasetVersionsDF: 340857


In [21]:
# Join the latestDatasetVersionsDF and the datasetPIDsDF to add the installation column,
# so we know which installations published each dataset

basicDatasetMetadataDF = (pd
    .merge(latestDatasetVersionsDF, datasetPIDsDF,
         how='inner',
         on=['dataset_pid_url'])
    .reset_index(drop=True, inplace=False))

# Make sure the count of rows is the same as the count of total datasets: 340,857
print(len(basicDatasetMetadataDF))

340857


In [8]:
basicDatasetMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,installation,dataverse_alias
0,http://dx.doi.org/10.26193/00HBWG,2.0,2019-05-13 08:43:01+00:00,ADA Dataverse,
1,http://dx.doi.org/10.26193/01P0AI,2.0,2019-05-13 06:23:26+00:00,ADA Dataverse,
2,http://dx.doi.org/10.26193/04F7C1,2.0,2019-05-13 10:02:15+00:00,ADA Dataverse,
3,http://dx.doi.org/10.26193/07R31R,2.0,2019-05-13 06:50:29+00:00,ADA Dataverse,
4,http://dx.doi.org/10.26193/0AF6TZ,5.0,2022-02-02 23:01:26+00:00,ADA Dataverse,


In [22]:
# Import geospatial metadata, dropping the dataset_pid column
geospatialMetadataDF = pd.read_csv(
    'geographic_bounding_box(geospatial)_2022.10.02-2022.10.03.csv',
    usecols=lambda x: x not in ['dataset_pid'],
    # sep=',', na_filter=False)
    sep=',')

print(f'Count of rows in geospatialMetadataDF: {len(geospatialMetadataDF)}')

Count of rows in geospatialMetadataDF: 145869


In [23]:
# Join geospatialMetadataDF with basicDatasetMetadataDF to retain metadata of
# each dataset's latest version

geospatialMetadataLatestVersionDF = (pd
    .merge(geospatialMetadataDF, basicDatasetMetadataDF,
          how='inner',
          on=['dataset_pid_url', 'dataset_version_number'])
    .drop(columns=[
        'dataset_version_create_time', 'installation',
        'dataverse_alias'])
    .reset_index(drop=True, inplace=False))

In [28]:
# Check the columns on the two dataframes basicDatasetMetadataDF, geospatialMetadataLatestVersionDF
# The daframes should have the 'dataset_pid_url' and 'dataset_version_number' columns

print('Columns in basicDatasetMetadataDF:')
for i in list(basicDatasetMetadataDF.columns):
    print(i)
print(f'\nColumns in grantInformationLatestVersionDF:')
for i in list(geospatialMetadataLatestVersionDF.columns):
    print(i)

Columns in basicDatasetMetadataDF:
dataset_version_create_time
installation
dataverse_alias

Columns in grantInformationLatestVersionDF:
westLongitude
eastLongitude
northLongitude
southLongitude


In [25]:
# Combine the basicDatasetMetadataDF, grantInformationLatestVersionDF, and contributorLatestVersionDF with a
# full outer join on dataset_pid_url and dataset_version_number columns
dataframes = [basicDatasetMetadataDF, geospatialMetadataLatestVersionDF]
indexList = ['dataset_pid_url', 'dataset_version_number']
for df in dataframes:
    df.set_index(indexList, inplace=True)

In [26]:
geospatialDatasetMetadataInDataverseInstallationsDF = reduce(lambda left, right: left.join(right, how='outer'), dataframes)
fundingDatasetMetadataInDataverseInstallationsDF = (geospatialDatasetMetadataInDataverseInstallationsDF.reset_index(drop=False, inplace=True))
geospatialDatasetMetadataInDataverseInstallationsDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,installation,dataverse_alias,westLongitude,eastLongitude,northLongitude,southLongitude
0,http://dx.doi.org/10.26193/00HBWG,2.0,2019-05-13 08:43:01+00:00,ADA Dataverse,,,,,
1,http://dx.doi.org/10.26193/01P0AI,2.0,2019-05-13 06:23:26+00:00,ADA Dataverse,,,,,
2,http://dx.doi.org/10.26193/04F7C1,2.0,2019-05-13 10:02:15+00:00,ADA Dataverse,,,,,
3,http://dx.doi.org/10.26193/07R31R,2.0,2019-05-13 06:50:29+00:00,ADA Dataverse,,,,,
4,http://dx.doi.org/10.26193/0AF6TZ,5.0,2022-02-02 23:01:26+00:00,ADA Dataverse,,,,,


In [27]:
# geospatialDatasetMetadataInDataverseInstallationsDF.to_csv('geospatialDatasetMetadataInDataverseInstallationsDF.csv', index=False)

In [29]:
# Retain only datasets that have values in any of the four bounding box fields
geospatialDatasetMetadataInDataverseInstallationsDF = (
    geospatialDatasetMetadataInDataverseInstallationsDF.query(
        '(westLongitude == westLongitude) or\
        (eastLongitude == eastLongitude) or\
        (northLongitude == northLongitude) or\
        (southLongitude == southLongitude)'
        )
        .reset_index(drop=True, inplace=False)
)
geospatialDatasetMetadataInDataverseInstallationsDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,installation,dataverse_alias,westLongitude,eastLongitude,northLongitude,southLongitude
0,https://doi.org/10.11588/data/10000,1.1,2017-04-06 07:16:00+00:00,HeiDATA,iwrgraphics,49.425272,49.397574,8.649282,8.720865
1,https://doi.org/10.11588/data/10039,2.2,2017-07-06 07:00:46+00:00,HeiDATA,healtheco,3.8667,,12.7333,
2,https://doi.org/10.11588/data/10044,1.1,2017-04-06 07:14:50+00:00,HeiDATA,iwrgraphics,49.425272,49.397574,8.649282,8.720865
3,https://doi.org/10.11588/data/10045,1.1,2017-04-06 07:15:31+00:00,HeiDATA,iwrgraphics,49.425272,49.397574,8.649282,8.720865
4,https://doi.org/10.11588/data/10046,1.1,2017-04-06 07:13:29+00:00,HeiDATA,iwrgraphics,49.425272,49.397574,8.649282,8.720865


In [30]:
print(f'Number of rows in latestDatasetVersionsDF: {len(geospatialDatasetMetadataInDataverseInstallationsDF)}')
print(f'Number of datasets in geospatialDatasetMetadataInDataverseInstallationsDF: {(len(pd.unique(geospatialDatasetMetadataInDataverseInstallationsDF["dataset_pid_url"])))}')

Number of rows in latestDatasetVersionsDF: 116054
Number of datasets in geospatialDatasetMetadataInDataverseInstallationsDF: 115444


In [75]:
datasetsWithInvalidBBMetadata = []
for index, row in geospatialDatasetMetadataInDataverseInstallationsDF.iterrows():
    invalidCount = 0
    datasetPid = row['dataset_pid_url']
    # westLongitude = row['westLongitude']
    # eastLongitude = row['eastLongitude']
    # northLongitude = row['northLongitude']
    # southLongitude = row['southLongitude']
    boxes = [
        row['westLongitude'],
        row['eastLongitude'],
        row['northLongitude'],
        row['southLongitude']
    ]
    for box in boxes:
        try:
            float(box) and box == box
        except Exception:
            invalidCount += 1
    if invalidCount > 0:
        datasetsWithInvalidBBMetadata.append(datasetPid)
        # print(f'{datasetPid} has an invalid bound box')
    # print(f'{eastLongitude}: {type(eastLongitude)}')
    # check_if_valid_bb(eastLongitude)


In [76]:
print(len(datasetsWithInvalidBBMetadata))

12016


In [77]:
print(datasetsWithInvalidBBMetadata[:10])

['https://doi.org/10.11588/data/LSG8TN', 'https://doi.org/10.15454/2TWG8B', 'https://doi.org/10.15454/CLYPS4', 'https://doi.org/10.15454/GYSGNR', 'https://doi.org/10.15454/JFVDNX', 'https://doi.org/10.15454/M2GI65', 'https://doi.org/10.15454/M9EOEW', 'https://doi.org/10.15454/SGO7EU', 'https://doi.org/10.15454/TJTV6N', 'https://doi.org/10.15454/YFY4AQ']
