# Generate UUID report on all public datasets
This notebook builds a table with one row per published dataset and adds two important fields:

* `has_uuids` - `False` if there are no UUIDs present in the UUID-API for this dataset, `True` otherwise.
* `number_of_uuids` - the number of UUIDs in the UUID-API database for this dataset.

In [15]:
import os
from datetime import datetime
from pathlib import Path
from warnings import warn as warning

import hubmapbags
import pandas as pd
from tqdm import tqdm

# Authentication token handed to the hubmapbags calls in this notebook.
# Read from the HUBMAP_TOKEN environment variable so a real token never has to
# be saved in the notebook; falls back to the placeholder when it is unset.
token = os.environ.get('HUBMAP_TOKEN', 'token')
# NOTE(review): `instance` is not passed to any hubmapbags call in the visible
# cells -- confirm whether the queries need it to actually target 'prod'.
instance = 'prod' #default instance is test

In [16]:
# get assay types
# The result is iterated by the report cell, which queries the datasets of one
# assay type at a time.
assay_names = hubmapbags.get_assay_types()

In [17]:
# Build the report: one DataFrame per assay type, concatenated once at the end
# (growing `report` with pd.concat inside the loop re-copies it on every pass).
frames = []
for assay_name in assay_names:
    print(assay_name)
    datasets = pd.DataFrame(hubmapbags.get_hubmap_ids( assay_name=assay_name, token=token ))

    if datasets.empty:
        continue

    # clean up: drop image pyramids and keep published datasets only.
    # .copy() detaches the filtered rows from the unfiltered frame, so adding
    # columns below cannot trigger SettingWithCopyWarning.
    keep = (datasets['data_type'] != 'image_pyramid') & (datasets['status'] == 'Published')
    datasets = datasets[keep].copy()

    if datasets.empty:
        # Nothing left for this assay type; do not feed empty frames to concat.
        continue

    # One UUID-API query per dataset. Iterating the column (which has a known
    # length) lets tqdm show real progress instead of a bare counter.
    counts = [
        hubmapbags.uuids.get_number_of_uuids( hubmap_id, token=token )
        for hubmap_id in tqdm(datasets['hubmap_id'], desc=assay_name)
    ]

    # Anything other than a count of 0 is reported as "has UUIDs".
    # NOTE(review): if get_number_of_uuids can return None on failure, such
    # rows are flagged True here -- confirm against the hubmapbags API.
    datasets['has_uuids'] = [count != 0 for count in counts]
    # dtype=object stores the counts exactly as returned by the API.
    datasets['number_of_uuids'] = pd.Series(counts, index=datasets.index, dtype=object)

    frames.append(datasets)

# An empty report (no columns) is produced when no dataset survived the filters.
report = pd.concat( frames ) if frames else pd.DataFrame()

AF


33it [00:10,  3.02it/s]


ATACseq-bulk


16it [00:04,  3.76it/s]


cell-dive


12it [00:03,  3.00it/s]


CODEX


93it [04:45,  3.07s/it]


CODEX2
DART-FISH


0it [00:00, ?it/s]


IMC2D


0it [00:00, ?it/s]


IMC3D


3it [00:01,  2.62it/s]


lc-ms_label-free
lc-ms_labeled
lc-ms-ms_label-free
lc-ms-ms_labeled
LC-MS-untargeted


0it [00:00, ?it/s]


Lightsheet


3it [00:00,  3.67it/s]


MALDI-IMS


26it [00:19,  1.32it/s]


MIBI


0it [00:00, ?it/s]


NanoDESI


0it [00:00, ?it/s]


NanoPOTS
MxIF
PAS


46it [00:12,  3.57it/s]


bulk-RNA


8it [00:02,  3.38it/s]


SNARE-ATACseq2


48it [00:14,  3.24it/s]


SNARE-RNAseq2


158it [00:46,  3.36it/s]


scRNAseq-10xGenomics-v2


4it [00:01,  3.11it/s]


scRNAseq-10xGenomics-v3


6it [00:01,  5.21it/s]


sciATACseq


21it [00:06,  3.10it/s]


sciRNAseq


12it [00:03,  3.05it/s]


seqFish


9it [00:03,  2.38it/s]


seqFish_pyramid
snATACseq


35it [00:11,  3.17it/s]


snRNAseq-10xGenomics-v2
snRNAseq-10xGenomics-v3


36it [00:07,  4.67it/s]


Slide-seq


37it [00:11,  3.09it/s]


Targeted-Shotgun-LC-MS
TMT-LC-MS


2it [00:00,  3.19it/s]


WGS


17it [00:05,  3.08it/s]


LC-MS


52it [00:18,  2.85it/s]


MS


60it [00:18,  3.18it/s]


LC-MS_bottom_up


0it [00:00, ?it/s]


MS_bottom_up


0it [00:00, ?it/s]


LC-MS_top_down


10it [00:02,  3.69it/s]


MS_top_down


10it [00:03,  3.28it/s]


In [18]:
# Write the report as a tab-separated file named after today's date
# (uuid-data-report/<YYYYMMDD>.tsv). `datetime` and `Path` come from the
# import cell at the top of the notebook.
now = datetime.now()

directory = 'uuid-data-report'

# exist_ok=True replaces the separate exists() check, so there is no gap
# between testing for the directory and creating it.
output_directory = Path(directory)
output_directory.mkdir(parents=True, exist_ok=True)
report.to_csv( output_directory / (now.strftime('%Y%m%d') + '.tsv'), sep='\t', index=False )