# More questions related to "Investigating_deposits_in_HDVs_Root_dataverse"

## How many accounts have created just one dataset in Root? How many have created more than one dataset in Root?

In [None]:
# Create new dataframe showing, for each number of datasets created in Root,
# how many accounts created exactly that many datasets

countOfDatasetsbyAccount = (
    publishedDepositsDF.query(
        'deposit_parent_dataverse == "1"\
        & deposit_type == "Dataset"\
        & deposit_createdate > "2018-10-31"\
        & deposit_createdate < "2020-11-01"')

    # Step 1: count the filtered rows (presumably one row per dataset deposit)
    # for each account. "subset" restricts the tally to the account ID column,
    # so no separate column selection is needed first
    .value_counts(subset=['user_account_id_of_deposit'])
    .to_frame(name='dataset_count')

    # Step 2: count how many accounts share each dataset_count value
    .value_counts(subset=['dataset_count'])
    .to_frame(name='count_of_accounts')

    # Order from the smallest dataset_count to the largest, then move
    # dataset_count out of the index and back into a regular column
    .sort_values(by='dataset_count')
    .reset_index()

    # Put count_of_accounts first and use it as the index
    [['count_of_accounts', 'dataset_count']]
    .set_index('count_of_accounts')
)


In [None]:
# Print the (rows, columns) shape of the summary table, then show up to its
# first 15 rows (the trailing expression is what the notebook cell displays)
print(countOfDatasetsbyAccount.shape)
countOfDatasetsbyAccount.head(15)


In the past two years (datasets created after 2018-10-31 and before 2020-11-01), 3,112 datasets were created in Root by 2,144 accounts:
- 1,684 accounts created only 1 dataset in Root. Together, those accounts created a little more than half (about 54%) of the 3,112 datasets.
- 460 accounts created 2 or more datasets in Root. Together, they created the other 1,428 datasets.

## How many and which types of accounts have created datasets in Root each day?

### Create and check filtered dataframe

In [None]:
# Create new dataframe with, for each creation date and account type, the
# number of distinct accounts that created at least one dataset in Root
# NOTE(review): this cell filters rawDataDF, while the per-account dataset
# count earlier in this notebook filters publishedDepositsDF - confirm that
# the difference is intended

countbyAccountType = (
    rawDataDF.query(
        'deposit_parent_dataverse == "1"\
        & deposit_type == "Dataset"\
        & deposit_createdate > "2018-10-31"\
        & deposit_createdate < "2020-11-01"')

    # Retain only these columns
    [['user_account_id_of_deposit', 'deposit_createdate', 'user_account_type']]

    # Collapse to one row per (date, account type, account). All three columns
    # are group keys, so count() leaves no data columns: the distinct
    # combinations live in the index and reset_index() turns them back into
    # columns. (Renaming or sorting at this stage would have no effect on the
    # final result, so neither is done here.)
    .groupby(['deposit_createdate', 'user_account_type', 'user_account_id_of_deposit'])
    .count()
    .reset_index()

    # Count the remaining account IDs per (date, account type). groupby sorts
    # by its keys by default, so rows come out ordered by date, then type
    .groupby(['deposit_createdate', 'user_account_type'])
    .count()
    .rename(columns={'user_account_id_of_deposit': 'count_of_accounts'})

    # Reset dataframe index (so that values in deposit_createdate repeat)
    .reset_index()
)


In [None]:
# Print the (rows, columns) shape of the per-day table, then show up to its
# first 5 rows (the trailing expression is what the notebook cell displays)
print(countbyAccountType.shape)
countbyAccountType.head(5)


## How often are spam and test dataverses published?

```
-- Lists every successful PublishDataverseCommand log record (publish date,
-- acting user, dataverse ID) whose dataverse ID also appears in a successful
-- DeleteDataverseCommand record run by one of the listed accounts against a
-- dataverse owned by "1 Harvard Dataverse".
-- NOTE(review): presumably the listed accounts are repository staff and their
-- deletions stand in for "spam and test" dataverses - confirm.
-- NOTE(review): the starttime filter applies only to the deletion records in
-- the subquery; publish records are not date-restricted - confirm intended.
select
    to_char(starttime, 'YYYY-MM-DD') as dataverse_publish_date,
    useridentifier as depositor_id,
    -- Digits that directly follow the first ':[' in info, cast to an integer.
    -- NOTE(review): '\d*' can match an empty string when ':[' is followed by a
    -- non-digit, and ''::int raises an error; '\d+' may be safer - confirm.
    substring(info from '(?<=\:\[)\d*')::int as dataverse_id
from actionlogrecord
where actionsubtype ilike '%PublishDataverseCommand%' and actionresult = 'OK'
    -- Exclude publish actions performed by the listed accounts themselves
    and useridentifier not in ('@merce', '@dataverseAdmin', '@sonia', '@Dwayne', '@juliangautier', '@kmika')
    and substring(info from '(?<=\:\[)\d*')::int in(
        -- IDs taken from the digits that follow 'doomed:[' in deletion records
        select 
            substring(info from '(?<=doomed\:\[)\d*')::int
        from actionlogrecord
        where
            actionsubtype ilike '%DeleteDataverseCommand%'
            and useridentifier in ('@merce', '@dataverseAdmin', '@sonia', '@Dwayne', '@juliangautier', '@kmika')
            and info ilike 'owner:[1 Harvard Dataverse] doomed%'
            and starttime > '2018-10-31'
            and actionresult = 'OK'
    )
```