# Generate UUID report on all public primary protected datasets
This notebook builds a table with two important fields:

* `has_uuids` - `False` if there are no UUIDs present in the UUID-API for this dataset, `True` if at least one is present
* `number_of_uuids` - the number of UUIDs in the UUID-API db.

In [1]:
import hubmapbags
from datetime import datetime
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from warnings import warn as warning

# Authentication token passed to the hubmapbags calls in the cells below.
# The literal 'token' is a placeholder: substitute a real token before running,
# and do not commit the real value with the notebook.
token = 'token'
# NOTE(review): `instance` is not referenced by any cell visible in this notebook —
# confirm whether it should be passed to the hubmapbags calls.
instance = 'prod' #default instance is test

In [3]:
# Fetch the assay types from hubmapbags; the report cell below iterates over
# `assay_names`. The result is printed so the processed assay types are
# recorded in the notebook output.
assay_names = hubmapbags.get_assay_types()
print(assay_names)

['AF', 'ATACseq-bulk', 'cell-dive', 'CODEX', 'CODEX2', 'DART-FISH', 'IMC2D', 'IMC3D', 'lc-ms_label-free', 'lc-ms_labeled', 'lc-ms-ms_label-free', 'lc-ms-ms_labeled', 'LC-MS-untargeted', 'Lightsheet', 'MALDI-IMS', 'MIBI', 'NanoDESI', 'NanoPOTS', 'MxIF', 'PAS', 'bulk-RNA', 'SNARE-ATACseq2', 'SNARE-RNAseq2', 'scRNAseq-10xGenomics-v2', 'scRNAseq-10xGenomics-v3', 'sciATACseq', 'sciRNAseq', 'seqFish', 'seqFish_pyramid', 'snATACseq', 'snRNAseq-10xGenomics-v2', 'snRNAseq-10xGenomics-v3', 'Slide-seq', 'Targeted-Shotgun-LC-MS', 'TMT-LC-MS', 'WGS', 'LC-MS', 'MS', 'LC-MS_bottom_up', 'MS_bottom_up', 'LC-MS_top_down', 'MS_top_down']


In [None]:
# Build the UUID report: one filtered frame per assay type, combined once at the end.
#
# For every assay type, keep only the primary, published, protected datasets and
# record for each of them:
#   * number_of_uuids - value returned by hubmapbags.uuids.get_number_of_uuids
#   * has_uuids       - False when that number is 0, True otherwise
# Both fields stay None for rows where hubmapbags.apis.is_protected() is falsy.
frames = []

# Remembers the most recent frame that was filtered down to zero rows, so that
# `report` still carries the expected columns when no dataset qualifies at all.
empty_template = pd.DataFrame()

for assay_name in assay_names:
    print(assay_name)

    # Get all dataset ids for this assay name.
    datasets = pd.DataFrame(hubmapbags.get_hubmap_ids(assay_name=assay_name, token=token))

    if datasets.empty:
        continue

    # Clean up: a single combined mask instead of three chained filters.
    wanted = (
        (datasets['data_type'] != 'image_pyramid')  # only primary data
        & (datasets['status'] == 'Published')       # only published datasets
        & (datasets['is_protected'] == True)        # only protected, i.e. contains human sequences
    )
    # .copy() makes the filtered frame independent of the original, so the column
    # assignments below do not trigger SettingWithCopyWarning.
    datasets = datasets[wanted].copy()

    datasets['has_uuids'] = None
    datasets['number_of_uuids'] = None

    # total= lets tqdm render a real progress bar; iterrows() alone has no length.
    for index, datum in tqdm(datasets.iterrows(), total=len(datasets)):
        hubmap_id = datum['hubmap_id']

        # Rows that the API does not confirm as protected keep their None values.
        if not hubmapbags.apis.is_protected(hubmap_id, token=token):
            continue

        number_of_uuids = hubmapbags.uuids.get_number_of_uuids(hubmap_id, token=token)
        datasets.loc[index, 'number_of_uuids'] = number_of_uuids
        datasets.loc[index, 'has_uuids'] = bool(number_of_uuids != 0)

    if datasets.empty:
        empty_template = datasets
    else:
        frames.append(datasets)

# Concatenate once instead of growing `report` inside the loop.
report = pd.concat(frames) if frames else empty_template

In [None]:
from datetime import datetime

# Timestamp used to name the report file (YYYYMMDD).
now = datetime.now()

# Keep only the rows flagged as protected before writing.
report = report.loc[report['is_protected'] == True]

# Create the output directory on first use.
directory = 'uuid-protected-data-report'
output_dir = Path(directory)
if not output_dir.exists():
    output_dir.mkdir()

# Write the report as a tab-separated file named after today's date.
output_file = f"{directory}/{now.strftime('%Y%m%d')}.tsv"
report.to_csv(output_file, sep='\t', index=False)

In [4]:
import pandas as pd

# Load a previously written, tab-separated report for inspection.
filename = 'uuid-protected-data-report/20221121.tsv'
df = pd.read_csv(
    filename,
    sep='\t',
)

In [7]:
# Keep only the datasets whose has_uuids flag is False, then display them.
missing_uuids_mask = df['has_uuids'].eq(False)
df = df[missing_uuids_mask]
df

Unnamed: 0,uuid,hubmap_id,status,is_protected,data_type,group_name,has_uuids,number_of_uuids
99,9f08125dfdae4906881b62479471d65a,HBM775.RVQX.376,Published,True,SNARE-RNAseq2,University of California San Diego TMC,False,0
234,e2cecf936604e667aa8df81dccec1bc7,HBM599.GLRZ.888,Published,True,scRNAseq-10xGenomics-v3,TMC - University of Connecticut,False,0
235,11b3087c2796bc0694c0a8ff6bd189cd,HBM233.CCCX.767,Published,True,scRNAseq-10xGenomics-v3,TMC - University of Connecticut,False,0
236,740ca3fb637ca2a80bafd05eb8d3e72a,HBM837.FPWJ.865,Published,True,scRNAseq-10xGenomics-v3,TMC - University of Connecticut,False,0
237,035ba4d304de02aff6179dbc8adf8dc6,HBM743.FBJP.586,Published,True,scRNAseq-10xGenomics-v3,TMC - University of Connecticut,False,0
...,...,...,...,...,...,...,...,...
372,e1e4148e3951bc35222790dc1352f01c,HBM699.CHMK.457,Published,True,Slide-seq,Broad Institute RTI,False,0
373,2fc79a0ddab0e95d5935b3234f5cf372,HBM543.DWSW.978,Published,True,Slide-seq,Broad Institute RTI,False,0
374,d3e8350bdcc7a5851deb22045c3af31a,HBM348.FXGT.728,Published,True,Slide-seq,Broad Institute RTI,False,0
375,69b3d856a476114c1819287d0d688387,HBM969.VBPS.239,Published,True,Slide-seq,Broad Institute RTI,False,0


In [9]:
# HuBMAP ids of the datasets remaining in df, as a plain Python list.
df['hubmap_id'].tolist()

['HBM775.RVQX.376',
 'HBM599.GLRZ.888',
 'HBM233.CCCX.767',
 'HBM837.FPWJ.865',
 'HBM743.FBJP.586',
 'HBM938.GJXR.224',
 'HBM756.PWKH.456',
 'HBM655.MFTK.764',
 'HBM659.GSQR.225',
 'HBM558.BHPZ.328',
 'HBM233.XQZM.395',
 'HBM453.GWNF.247',
 'HBM987.BFBR.496',
 'HBM367.ZMBH.758',
 'HBM557.VZPM.253',
 'HBM949.PNXL.623',
 'HBM367.NSZK.788',
 'HBM889.DMLC.292',
 'HBM433.SPRB.778',
 'HBM243.MXBM.589',
 'HBM925.FQDP.328',
 'HBM684.SLGB.599',
 'HBM657.XWQQ.636',
 'HBM373.FZMG.625',
 'HBM379.PCLL.836',
 'HBM373.VTNH.683',
 'HBM879.DFQN.248',
 'HBM638.CDHV.585',
 'HBM477.KVFD.827',
 'HBM545.QLKW.543',
 'HBM247.JTNN.859',
 'HBM599.CXNC.464',
 'HBM322.TNGF.859',
 'HBM745.GCNN.553',
 'HBM439.LWSZ.467',
 'HBM346.LSFW.324',
 'HBM569.FMVR.429',
 'HBM892.VLVC.242',
 'HBM399.GZRJ.726',
 'HBM487.WJST.938',
 'HBM354.FMKQ.822',
 'HBM958.VZLG.297',
 'HBM324.MKDC.693',
 'HBM337.GJHZ.665',
 'HBM272.KQDJ.873',
 'HBM684.XVPK.336',
 'HBM489.LPCX.978',
 'HBM832.WWZH.575',
 'HBM754.SJMP.486',
 'HBM586.QXWD.492',
