In [2]:
from functools import reduce
import numpy as np
import os
import pandas as pd

In [3]:
os.chdir('..')
print(os.getcwd())

/Users/juliangautier/ux-research-data-analysis/notebooks


In [4]:
# Import CSV file that lists PIDs of all datasets and which Dataverse installation
# they're published in, removing the PIDs of datasets whose metadata could not be
# saved, i.e. dataverse_json_export_saved is FALSE
datasetPIDsDF = (pd
    .read_csv(
        './metadata/dataset_pids_from_most_known_dataverse_installations.csv',
        usecols=lambda x: x not in ['dataset_pid', 'dataverse_name'],
        sep=',', na_filter=False)
    .query('(dataverse_json_export_saved == True)')
    .drop(columns=['dataverse_json_export_saved'])
    .reset_index(drop=True, inplace=False)
 )

datasetPIDsDF.head()

Unnamed: 0,installation,dataset_pid_url,dataverse_alias
0,CIDACS,https://doi.org/10.57833/cidacs/WV4JWB,ProjZika
1,CIDACS,https://doi.org/10.57833/cidacs/TGAIVO,ProjZika
2,CIDACS,https://doi.org/10.57833/cidacs/0FZP7V,ProjZika
3,CIDACS,https://doi.org/10.57833/cidacs/5BMSIX,ProjGates0
4,CIDACS,https://doi.org/10.57833/cidacs/I56ZVA,ProjGates0


In [5]:
# Import basic metadata of all dataset versions
datasetVersionMetadataDF = pd.read_csv(
    './metadata/basic_metadata_2022.10.02-2022.10.03.csv',
    usecols=lambda x: x not in [
        'dataset_pid', 'dataset_publication_date',
        'dataset_version_state', 'publisher'],
    parse_dates=['dataset_version_create_time'],
    sep=',', na_filter=False)

datasetVersionMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time
0,https://doi.org/10.48370/OFD/DBJUEM,1.0,2022-05-25 10:17:50+00:00
1,https://doi.org/10.21410/7E4/4WG94W,2.1,2020-05-13 16:06:28+00:00
2,https://doi.org/10.7910/DVN/5PRYPC,4.0,2020-06-17 23:49:50+00:00
3,https://doi.org/10.17026/dans-2zm-dsmz,1.0,2022-01-27 20:15:47+00:00
4,https://doi.org/10.7910/DVN/VIE1H,248.0,2015-06-18 19:49:12+00:00


In [6]:
print(f'Count of datasets: {len(datasetPIDsDF)}')
print(f'Count of dataset versions: {len(datasetVersionMetadataDF)}')

Count of datasets: 340857
Count of dataset versions: 473910


In [7]:
# From the datasetVersionMetadataDF, lets use the version publication dates to get the PIDs and dataset version numbers of only the most recently published versions of each dataset. The resulting dataframe should contain the same number of rows as the datasetPIDsDF.

latestDatasetVersionsDF = (datasetVersionMetadataDF
    .iloc[
        datasetVersionMetadataDF
    .groupby('dataset_pid_url')['dataset_version_create_time']
    .agg(pd.Series.idxmax)]
    .reset_index(drop=True, inplace=False))

latestDatasetVersionsDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time
0,http://dx.doi.org/10.26193/00HBWG,2.0,2019-05-13 08:43:01+00:00
1,http://dx.doi.org/10.26193/01P0AI,2.0,2019-05-13 06:23:26+00:00
2,http://dx.doi.org/10.26193/04F7C1,2.0,2019-05-13 10:02:15+00:00
3,http://dx.doi.org/10.26193/07R31R,2.0,2019-05-13 06:50:29+00:00
4,http://dx.doi.org/10.26193/0AF6TZ,5.0,2022-02-02 23:01:26+00:00


In [8]:
print(f'Count of datasets: {len(datasetPIDsDF)}')
print(f'Count of rows in latestDatasetVersionsDF: {len(latestDatasetVersionsDF)}')

Count of datasets: 340857
Count of rows in latestDatasetVersionsDF: 340857


In [9]:
# Join the latestDatasetVersionsDF and the datasetPIDsDF to add the installation column,
# so we know which installations published each dataset

basicDatasetMetadataDF = (pd
    .merge(latestDatasetVersionsDF, datasetPIDsDF,
         how='inner',
         on=['dataset_pid_url'])
    .reset_index(drop=True, inplace=False))

# Make sure the count of rows is the same as the count of total datasets: 340,857
print(len(basicDatasetMetadataDF))

340857


In [10]:
basicDatasetMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,installation,dataverse_alias
0,http://dx.doi.org/10.26193/00HBWG,2.0,2019-05-13 08:43:01+00:00,ADA Dataverse,
1,http://dx.doi.org/10.26193/01P0AI,2.0,2019-05-13 06:23:26+00:00,ADA Dataverse,
2,http://dx.doi.org/10.26193/04F7C1,2.0,2019-05-13 10:02:15+00:00,ADA Dataverse,
3,http://dx.doi.org/10.26193/07R31R,2.0,2019-05-13 06:50:29+00:00,ADA Dataverse,
4,http://dx.doi.org/10.26193/0AF6TZ,5.0,2022-02-02 23:01:26+00:00,ADA Dataverse,


In [11]:
# Import geospatial metadata, dropping the dataset_pid column
geospatialMetadataDF = pd.read_csv(
    './metadata/geographic_bounding_box(geospatial)_2022.10.02-2022.10.03.csv',
    usecols=lambda x: x not in ['dataset_pid'],
    # sep=',', na_filter=False)
    sep=',')

print(f'Count of rows in geospatialMetadataDF: {len(geospatialMetadataDF)}')

Count of rows in geospatialMetadataDF: 145869


In [12]:
# Join geospatialMetadataDF with basicDatasetMetadataDF to retain metadata of
# each dataset's latest version

geospatialMetadataLatestVersionDF = (pd
    .merge(geospatialMetadataDF, basicDatasetMetadataDF,
          how='inner',
          on=['dataset_pid_url', 'dataset_version_number'])
    .drop(columns=[
        'dataset_version_create_time', 'installation',
        'dataverse_alias'])
    .reset_index(drop=True, inplace=False))

In [13]:
# Combine basicDatasetMetadataDF and grantInformationLatestVersionDF with a
# full outer join on dataset_pid_url and dataset_version_number columns
dataframes = [basicDatasetMetadataDF, geospatialMetadataLatestVersionDF]
indexList = ['dataset_pid_url', 'dataset_version_number']
for df in dataframes:
    df.set_index(indexList, inplace=True)

In [14]:
geospatialDatasetMetadataInDataverseInstallationsDF = reduce(lambda left, right: left.join(right, how='outer'), dataframes)
fundingDatasetMetadataInDataverseInstallationsDF = (geospatialDatasetMetadataInDataverseInstallationsDF.reset_index(drop=False, inplace=True))
geospatialDatasetMetadataInDataverseInstallationsDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,installation,dataverse_alias,westLongitude,eastLongitude,northLongitude,southLongitude
0,http://dx.doi.org/10.26193/00HBWG,2.0,2019-05-13 08:43:01+00:00,ADA Dataverse,,,,,
1,http://dx.doi.org/10.26193/01P0AI,2.0,2019-05-13 06:23:26+00:00,ADA Dataverse,,,,,
2,http://dx.doi.org/10.26193/04F7C1,2.0,2019-05-13 10:02:15+00:00,ADA Dataverse,,,,,
3,http://dx.doi.org/10.26193/07R31R,2.0,2019-05-13 06:50:29+00:00,ADA Dataverse,,,,,
4,http://dx.doi.org/10.26193/0AF6TZ,5.0,2022-02-02 23:01:26+00:00,ADA Dataverse,,,,,


In [15]:
# geospatialDatasetMetadataInDataverseInstallationsDF.to_csv('geospatialDatasetMetadataInDataverseInstallationsDF.csv', index=False)

In [16]:
# Retain only datasets that have values in any of the four bounding box fields
geospatialDatasetMetadataInDataverseInstallationsDF = (
    geospatialDatasetMetadataInDataverseInstallationsDF
        .query(
            '(westLongitude == westLongitude) or\
            (eastLongitude == eastLongitude) or\
            (northLongitude == northLongitude) or\
            (southLongitude == southLongitude)'
        )
        .reset_index(drop=True, inplace=False)
)
geospatialDatasetMetadataInDataverseInstallationsDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,installation,dataverse_alias,westLongitude,eastLongitude,northLongitude,southLongitude
0,https://doi.org/10.11588/data/10000,1.1,2017-04-06 07:16:00+00:00,HeiDATA,iwrgraphics,49.425272,49.397574,8.649282,8.720865
1,https://doi.org/10.11588/data/10039,2.2,2017-07-06 07:00:46+00:00,HeiDATA,healtheco,3.8667,,12.7333,
2,https://doi.org/10.11588/data/10044,1.1,2017-04-06 07:14:50+00:00,HeiDATA,iwrgraphics,49.425272,49.397574,8.649282,8.720865
3,https://doi.org/10.11588/data/10045,1.1,2017-04-06 07:15:31+00:00,HeiDATA,iwrgraphics,49.425272,49.397574,8.649282,8.720865
4,https://doi.org/10.11588/data/10046,1.1,2017-04-06 07:13:29+00:00,HeiDATA,iwrgraphics,49.425272,49.397574,8.649282,8.720865


In [17]:
print(f'Number of bounding boxes in latestDatasetVersionsDF: {len(geospatialDatasetMetadataInDataverseInstallationsDF)}')
print(f'Number of datasets in geospatialDatasetMetadataInDataverseInstallationsDF: {(len(pd.unique(geospatialDatasetMetadataInDataverseInstallationsDF["dataset_pid_url"])))}')

Number of bounding boxes in latestDatasetVersionsDF: 116054
Number of datasets in geospatialDatasetMetadataInDataverseInstallationsDF: 115444


In [18]:
# Create list of dataset PIDs with "invalid" bounding box metadata

datasetsWithInvalidBBMetadataList = []
for index, row in geospatialDatasetMetadataInDataverseInstallationsDF.iterrows():
    datasetPid = row['dataset_pid_url']
    boxes = [
        row['westLongitude'],
        row['eastLongitude'],
        row['northLongitude'],
        row['southLongitude']
    ]
    invalidCount = 0
    for box in boxes:
        try:
            value = float(box)
            if np.isnan(value):
                invalidCount += 1
        except Exception:
            invalidCount += 1
    if invalidCount > 0:
        datasetsWithInvalidBBMetadataList.append(datasetPid)

datasetsWithInvalidBBMetadataList = list(set(datasetsWithInvalidBBMetadataList))

In [19]:
print(len(datasetsWithInvalidBBMetadataList))

12332


In [20]:
print(datasetsWithInvalidBBMetadataList[:10])

['https://doi.org/10.48370/OFD/DENPIN', 'https://doi.org/10.48370/OFD/QSNSLR', 'https://doi.org/10.48370/OFD/3TP8N9', 'https://doi.org/10.48370/OFD/9RVBWV', 'https://doi.org/10.48370/OFD/W7J7LI', 'https://doi.org/10.48370/OFD/RZQFYZ', 'https://doi.org/10.48370/OFD/44D4VR', 'https://doi.org/10.48370/OFD/UKRIIM', 'https://doi.org/10.48370/OFD/84I1VG', 'https://doi.org/10.21223/P3/AZRDGH']


In [21]:
# Create dataframe with metadata of only datasets with "invalid" bounding box metadata
datasetsWithInvalidBBMetadataDF = (
    # Drop all but the needed columns
    geospatialDatasetMetadataInDataverseInstallationsDF
        # Remove all datasets except those in given list of collection aliases
        .query(
        'dataset_pid_url in @datasetsWithInvalidBBMetadataList')
)
datasetsWithInvalidBBMetadataDF.head()

Unnamed: 0,dataset_pid_url,dataset_version_number,dataset_version_create_time,installation,dataverse_alias,westLongitude,eastLongitude,northLongitude,southLongitude
1,https://doi.org/10.11588/data/10039,2.2,2017-07-06 07:00:46+00:00,HeiDATA,healtheco,3.8667,,12.7333,
12,https://doi.org/10.11588/data/LSG8TN,1.0,2020-09-01 13:38:28+00:00,HeiDATA,geomorph,"6°37'56.27""E","6°54'33.22""E","50°59'31.93""N","50°46'19.53""N"
24,https://doi.org/10.15454/2TWG8B,1.0,2021-05-28 09:16:27+00:00,Recherche Data Gouv,TEMPO,,2°49,48°80,
37,https://doi.org/10.15454/CLYPS4,2.0,2020-01-30 17:38:02+00:00,Recherche Data Gouv,TEMPO,,"4°52'41.69""E","43°54'59.63""N",
38,https://doi.org/10.15454/D4MJMJ,3.1,2022-01-14 10:22:28+00:00,Recherche Data Gouv,pheno_ueh,-0.5966699246002216,47.674806960947606,,


In [22]:
datasetsWithInvalidBBMetadataDF.to_csv('datasetsWithInvalidBBMetadataDF.csv', index=False)