# dbGaP - phs002249
## KULMAP - Human Kidney, Urinary Tract, and Lung Mapping Center
[Study Link](https://www.ncbi.nlm.nih.gov/projects/gapprev/gap/cgi-bin/study.cgi?study_id=phs002249.v2.p1)

The major goal of the Human Kidney, Urinary Tract, and Lung Mapping Center (KULMAP) is to generate multi-omic and spatially resolved molecular anatomical maps of the human bladder, ureter and kidneys (BUKMAP) and the lung airways and parenchyma (LAPMAP) at single-cell resolution. This entails sequencing of the transcriptomes and epigenomes of dissociated single cells in a massively parallel manner. These profiles will then inform highly multiplexed RNA in situ imaging for spatial mapping of hundreds of molecular targets in the tissue sections, at a subcellular resolution. These spatial molecular maps will serve as scaffolds for computational registration of cell types and the associated transcriptome/chromatin maps to the tissue space. The combination of sequencing single dissociated cells and multiplexed in situ mapping will allow the construction of detailed spatial maps for these large organs.

### PI
* Kun Zhang. University of California, San Diego, La Jolla, CA, USA.

In [None]:
import pandas as pd
from shutil import copytree
from shutil import rmtree
import pathlib
import json
import yaml
import hubmapbags
import pickle
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from warnings import warn as warning
from datetime import datetime
import warnings

# HuBMAP instance to query; for publishing only use the prod instance.
instance = "prod"

# API token; get it from ingest.hubmapconsortium.org and paste it here.
token = ""

## Preparation

In [None]:
# Timestamp used to name today's report file.
now = datetime.now()

# Working directories, created next to the notebook. exist_ok=True makes the
# creation idempotent without a separate (race-prone) existence check, and it
# fails loudly if a regular file is squatting on the directory name.
output_directory = 'data'
Path(output_directory).mkdir(exist_ok=True)

report_output_directory = 'uuid-protected-data-report'
Path(report_output_directory).mkdir(exist_ok=True)

# One report per calendar day: <report dir>/<YYYYMMDD>.tsv (kept as a str
# because later cells concatenate it into messages).
report_output_filename = f"{report_output_directory}/{now.strftime('%Y%m%d')}.tsv"
    
def is_primary( hubmap_id, instance='prod', token=None ):
    """Report whether the first ancestor record of a dataset is a Sample.

    Fetches the ancestors of ``hubmap_id`` through
    ``hubmapbags.apis.get_ancestors_info`` and inspects only the first
    record returned.

    Returns True when that record's ``entity_type`` is ``'Sample'``.
    Otherwise returns False; if the record carries an ``'error'`` entry,
    its value is emitted as a warning first.
    """
    ancestors = hubmapbags.apis.get_ancestors_info( hubmap_id, instance=instance, token=token )
    first_record = ancestors[0]

    if 'entity_type' in first_record.keys() and first_record['entity_type'] == 'Sample':
        return True

    # Not a Sample: surface any error message attached to the record.
    if 'error' in first_record:
        warning(first_record['error'])
    return False
    
def has_metadata( metadata ):
    """Return True if ``metadata['ingest_metadata']['metadata']`` is present.

    ``metadata`` is a dict-like dataset record. Only the presence of the two
    nested keys is checked, not the content stored under them.
    """
    if 'ingest_metadata' not in metadata.keys():
        return False
    return 'metadata' in metadata['ingest_metadata'].keys()

In [None]:
# Build today's report of published datasets (one row per dataset) unless it
# has already been written to disk, in which case it is simply reloaded.
if not Path(report_output_filename).exists():
    # get assay types
    assay_names = hubmapbags.apis.get_assay_types( token=token )

    report = pd.DataFrame()
    for assay_name in assay_names:
        print(assay_name)
        datasets = pd.DataFrame(hubmapbags.get_hubmap_ids( assay_name=assay_name, token=token ))

        if datasets.empty:
            continue

        # clean up: drop image pyramids and anything that is not published.
        # The explicit copy makes the .loc assignments below write to an
        # independent frame instead of a slice of the original.
        keep = (datasets['data_type'] != 'image_pyramid') & (datasets['status'] == 'Published')
        datasets = datasets[keep].copy()

        for index, datum in tqdm(datasets.iterrows()):
            datasets.loc[index, 'directory'] = hubmapbags.apis.get_directory( datum['hubmap_id'], instance='prod', token=token )
            metadata = hubmapbags.apis.get_dataset_info( datum['hubmap_id'], instance='prod', token=token )

            if has_metadata( metadata ):
                # has_metadata() already guarantees both nested keys exist.
                ingest_fields = metadata['ingest_metadata']['metadata']
                datasets.loc[index, 'has_metadata'] = True
                datasets.loc[index, 'assay_type'] = ingest_fields['assay_type']
                datasets.loc[index, 'assay_category'] = ingest_fields['assay_category']
            else:
                # Record explicit blanks so the assay columns selected below
                # exist even when no dataset of this assay carries metadata.
                datasets.loc[index, 'has_metadata'] = False
                datasets.loc[index, 'assay_type'] = None
                datasets.loc[index, 'assay_category'] = None

        report = pd.concat([report, datasets])

    report = report[['group_name','uuid','hubmap_id','status','is_protected','data_type','assay_type','assay_category','directory']]
    report.to_csv(report_output_filename, sep='\t', index=False)
    # Save a pickle next to the TSV (same name, .pkl suffix).
    report.to_pickle(Path(report_output_filename).with_suffix('.pkl'))
else:
    print('File found on disk. Loading ' + report_output_filename + '.')
    report = pd.read_csv(report_output_filename, sep='\t')

In [None]:
# Keep every dataset from the UCSD TMC, plus only the Slide-seq datasets
# contributed by the Broad Institute RTI.
from_ucsd_tmc = report['group_name'] == 'University of California San Diego TMC'
broad_slide_seq = (report['group_name'] == 'Broad Institute RTI') & (report['data_type'] == 'Slide-seq')
report = report[from_ucsd_tmc | broad_slide_seq]

In [None]:
warnings.filterwarnings('ignore')

# Build the dbGaP submission table (one row per dataset in `report`) and cache
# it as dataframe.tsv; on later runs the cached file is reloaded instead.
if not Path('dataframe.tsv').exists():
    columns = ['phs_accession','sample_ID','library_ID',
               'title','group_name','data_type','library_strategy',
               'library_source','library_selection',
               'library_layout','platform','instrument_model',
               'design_description','reference_genome_assembly',
               'alignment_software']

    phs_accession = 'phs002249'

    rows = []
    for index, datum in tqdm(report.iterrows()):
        metadata = hubmapbags.apis.get_dataset_info(datum['hubmap_id'], instance='prod', token=token)
        pmetadata = hubmapbags.apis.get_provenance_info(datum['hubmap_id'], instance='prod', token=token)

        organ = pmetadata['organ_type'][0]

        # Ingest-level metadata fields; an empty dict when the dataset has
        # none, so every lookup below falls back to its blank default.
        try:
            fields = metadata['ingest_metadata']['metadata']
        except (KeyError, TypeError):
            fields = {}
        if not isinstance(fields, dict):
            fields = {}

        # Library id is made unique per dataset by appending the HuBMAP id.
        try:
            library_ID = fields['library_id'] + '-' + datum['hubmap_id']
        except (KeyError, TypeError):
            library_ID = 'lib-' + datum['hubmap_id']

        instrument_model = fields.get('acquisition_instrument_model', '')
        platform = fields.get('acquisition_instrument_vendor', '')

        if 'library_layout' not in fields:
            library_layout = ''
        elif fields['library_layout'] == 'paired-end':
            library_layout = 'paired'
        else:
            library_layout = 'single'

        library_selection = 'other'

        # Every branch assigns library_strategy, title and library_source, so
        # a row can never pick up a value left over from the previous one.
        if datum['data_type'] == 'Slide-seq':
            library_strategy = 'RNA-Seq'
            title = 'Slide-seq of human ' + organ
            library_source = 'TRANSCRIPTOMIC'
        elif datum['data_type'] == 'snRNAseq':
            library_strategy = 'RNA-Seq'
            title = 'snRNAseq-10xGenomics-v3 of human ' + organ
            library_source = 'TRANSCRIPTOMIC'
        elif 'analyte_class' not in fields:
            library_strategy = ''
            title = ''
            library_source = ''
        elif fields['analyte_class'] == 'DNA':
            library_strategy = 'ATAC-seq'
            title = 'SNARE2-ATAC-seq of human ' + organ
            library_source = 'GENOMIC'
        else:
            library_strategy = 'RNA-Seq'
            title = 'SNARE2-RNA-Seq of human ' + organ
            library_source = 'TRANSCRIPTOMIC'

        if datum['data_type'] == 'Slide-seq':
            # Slide-seq rows require both fields: a missing one raises KeyError.
            library_construction_protocols_io_doi = fields['library_construction_protocols_io_doi']
            sequencing_reagent_kit = fields['sequencing_reagent_kit']
            design_description = f'The protocol and materials for the Slide-seq library construction process can be found in the following protocols.io protocol: dx.doi.org/{library_construction_protocols_io_doi}. The library was sequenced on the {platform} {instrument_model} system using the {sequencing_reagent_kit} kit.'
        else:
            # Other assays are best-effort: a missing field blanks the description.
            try:
                library_construction_protocols_io_doi = fields['library_construction_protocols_io_doi']
                sequencing_reagent_kit = fields['sequencing_reagent_kit']
                design_description = f'SNARE-seq2 was performed as outlined (Nature Protocols, DOI:10.1038/s41596-021-00507-3) and according to the following protocols.io protocol: dx.doi.org/{library_construction_protocols_io_doi}. The library was sequenced on the {platform} {instrument_model} system using the {sequencing_reagent_kit} kit.'
            except KeyError:
                design_description = ''

        rows.append({
            'phs_accession': phs_accession,
            'sample_ID': datum['hubmap_id'],
            'library_ID': library_ID,
            'title': title,
            'group_name': datum['group_name'],
            'data_type': datum['data_type'],
            'library_strategy': library_strategy,
            'library_source': library_source,
            'library_selection': library_selection,
            'library_layout': library_layout,
            'platform': platform,
            'instrument_model': instrument_model,
            'design_description': design_description,
            'reference_genome_assembly': None,
            'alignment_software': None,
        })

    # Build the frame in one step; DataFrame.append was removed in pandas 2.0.
    df = pd.DataFrame(rows, columns=columns)

    df.to_csv( 'dataframe.tsv', sep='\t', index=False )
else:
    df = pd.read_csv( 'dataframe.tsv', sep='\t' )

In [None]:
# Load the per-dataset file listing. The context manager closes the handle
# even if unpickling fails.
# NOTE(review): pickle can execute arbitrary code while loading; only use a
# files.pkl that was produced by a trusted run of this workflow.
with open('files.pkl', 'rb') as file:
    files = pickle.load(file)

In [None]:
# Add empty filetypeN / filenameN / checksumN columns for six file slots
# (N = 0..5), in that order for each slot.
for slot in range(6):
    for prefix in ('filetype', 'filename', 'checksum'):
        df[prefix + str(slot)] = None

In [None]:
# Number of entries recorded for each item in `files`.
numbers = [len(entry) for entry in files]

In [None]:
# Copy each row's file names and checksums from `files` into the numbered
# filetypeN / filenameN / checksumN columns. Only six slots (0..5) exist, so
# at most the first six entries are recorded; rows with fewer entries leave
# the remaining slots untouched instead of raising IndexError.
max_file_slots = 6
for index in df.index:
    entries = files[index]
    for slot, name in enumerate(list(entries.keys())[:max_file_slots]):
        df.loc[index, f'checksum{slot}'] = entries[name]
        # Recorded with a .gz suffix appended to the key from `files`.
        df.loc[index, f'filename{slot}'] = name + '.gz'
        df.loc[index, f'filetype{slot}'] = 'fastq'

In [None]:
# Write the finished table as a tab-separated file, without the index column.
submission_filename = 'phs002249.tsv'
df.to_csv(submission_filename, sep='\t', index=False)