In [6]:
import pandas as pd
from datetime import datetime
from dateutil import tz
from dateutil.parser import parse

In [7]:
# Load the CSV and, in the same chain, convert version_create_time to
# timezone-aware UTC datetimes. The actual timezone is irrelevant here: the
# column is only compared to find the most recently published version of each
# dataset. Values that cannot be parsed become NaT (errors='coerce').
datasetsDF = (
    pd.read_csv(
        'file_pids_in_dataverse_installations.csv',
        sep=',',
        na_filter=False)
    .assign(
        version_create_time=lambda frame: pd.to_datetime(
            frame['version_create_time'],
            utc=True,
            errors='coerce')))

In [10]:
# Summarise the loaded dataframe: row count, column dtypes and non-null counts
datasetsDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380830 entries, 0 to 380829
Data columns (total 5 columns):
 #   Column                  Non-Null Count   Dtype              
---  ------                  --------------   -----              
 0   installation_name       380830 non-null  object             
 1   dataset_pid_url         380830 non-null  object             
 2   dataset_version_number  380830 non-null  float64            
 3   version_create_time     380830 non-null  datetime64[ns, UTC]
 4   has_file_pids           380830 non-null  bool               
dtypes: bool(1), datetime64[ns, UTC](1), float64(1), object(2)
memory usage: 12.0+ MB


In [110]:
# Report how many distinct dataset PIDs appear in the dataframe.
# dropna=False keeps the count identical to len(pd.unique(...)).
distinctDatasetCount = datasetsDF['dataset_pid_url'].nunique(dropna=False)
print('Number of datasets in datasetsDF: %s' % distinctDatasetCount)

Number of datasets in datasetsDF: 268410


In [111]:
# Create a new dataframe with only the latest published version of each dataset.
# For every dataset_pid_url, idxmax() gives the index *label* of the row with the
# greatest version_create_time, so the rows are selected with label-based .loc.
# (Positional .iloc only gives the same rows while the index happens to be a
# default 0..n-1 RangeIndex.)
datasetsDF_latestVersion = (
    datasetsDF
    .loc[datasetsDF.groupby('dataset_pid_url')['version_create_time'].idxmax()]
    .reset_index(drop=True))

In [112]:
# Summarise the per-dataset dataframe; expect one row per distinct dataset PID
datasetsDF_latestVersion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268410 entries, 0 to 268409
Data columns (total 5 columns):
 #   Column                  Non-Null Count   Dtype              
---  ------                  --------------   -----              
 0   installation_name       268410 non-null  object             
 1   dataset_pid_url         268410 non-null  object             
 2   dataset_version_number  268410 non-null  float64            
 3   version_create_time     268410 non-null  datetime64[ns, UTC]
 4   has_file_pids           268410 non-null  bool               
dtypes: bool(1), datetime64[ns, UTC](1), float64(1), object(2)
memory usage: 8.4+ MB


For each installation, we want to know if its latest published dataset contains files that have file PIDs.

Let's get the count of installations.

In [114]:
# Report how many distinct installations are represented.
# dropna=False keeps the count identical to len(pd.unique(...)).
distinctInstallationCount = datasetsDF_latestVersion['installation_name'].nunique(dropna=False)
print('Number of installations in datasetsDF_latestVersion: %s' % distinctInstallationCount)

Number of installations in datasetsDF_latestVersion: 72


Let's create a new dataframe that lists the most recently published dataset for each installation.

In [115]:
# Create a new dataframe with only the most recently published dataset of each
# installation. For every installation_name, idxmax() gives the index *label* of
# the row with the greatest version_create_time, so the rows are selected with
# label-based .loc. (Positional .iloc only gives the same rows while the index
# happens to be a default 0..n-1 RangeIndex.)
latestDatasetPerInstallationDF = (
    datasetsDF_latestVersion
    .loc[datasetsDF_latestVersion.groupby('installation_name')['version_create_time'].idxmax()]
    .reset_index(drop=True))

In [116]:
# Summarise the per-installation dataframe; expect one row per installation
latestDatasetPerInstallationDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   installation_name       72 non-null     object             
 1   dataset_pid_url         72 non-null     object             
 2   dataset_version_number  72 non-null     float64            
 3   version_create_time     72 non-null     datetime64[ns, UTC]
 4   has_file_pids           72 non-null     bool               
dtypes: bool(1), datetime64[ns, UTC](1), float64(1), object(2)
memory usage: 2.4+ KB


In [125]:
# Export dataframe as a CSV to review
latestDatasetPerInstallationDF.to_csv('latestDatasetPerInstallationDF.csv', index=False)

# Preview the first rows. This must be the last expression in the cell:
# Jupyter only renders the value of a cell's final expression, so a head()
# call placed before other statements is silently discarded.
latestDatasetPerInstallationDF.head()

Let's count the installations whose most recently published dataset has file PIDs, and those whose most recently published dataset does not.

In [124]:
# One row per installation, so counting rows by the has_file_pids flag counts
# installations. Summing a boolean mask gives the number of matching rows.
filePidFlags = latestDatasetPerInstallationDF['has_file_pids']
countOfDatasetsWithFilePids = int(filePidFlags.eq(True).sum())
countOfDatasetsWithNoFilePids = int(filePidFlags.eq(False).sum())

print(f'Count of installations where latest dataset version has file PIDs: {countOfDatasetsWithFilePids}')
print(f'Count of installations where latest dataset version has no file PIDs: {countOfDatasetsWithNoFilePids}')


Count of installations where latest dataset version has file PIDs: 36
Count of installations where latest dataset version has no file PIDs: 36
