In [None]:
import requests
import os
import pandas as pd

# Query the ENCODE portal search endpoint for ExperimentSeries objects whose
# assay title is "TF ChIP-seq", asking for JSON with `limit=all`.
search_url = ("https://www.encodeproject.org/search/"
              "?type=ExperimentSeries&assay_title=TF+ChIP-seq"
              "&format=json&limit=all")

# Credentials are taken from the environment when available.
# NOTE(review): the fallback key pair below is committed in source; it is
# kept only so existing runs behave exactly as before. Prefer exporting
# ENCODE_ACCESS_KEY / ENCODE_SECRET_KEY, then rotate and remove this pair.
encode_auth = (os.environ.get('ENCODE_ACCESS_KEY', 'HBZNC72F'),
               os.environ.get('ENCODE_SECRET_KEY', 'dxjvsz3pfvnaqoxw'))

# Sent with every request so the portal answers with JSON.
HEADERS = {'accept': 'application/json'}

# Decoded search response; the series records are read from its '@graph'
# list further down.
json_results = requests.get(search_url, headers=HEADERS,
                            auth=encode_auth).json()

# Field-name whitelists, one per ENCODE QC-metric '@type'. When a file's
# quality-metric object has the matching type, only the keys listed here are
# copied into the output table.

# Peak files: replication / reproducibility statistics.
ChipReplicationQualityMetrics = {
    'reproducible_peaks',
    'idr_cutoff',
    'rescue_ratio',
    'self_consistency_ratio',
    'reproducibility',
}

# Peak files: FRiP and peak-size distribution.
ChipPeakEnrichmentQualityMetrics = {
    'frip',
    'min_size', '25_pct', '50_pct', '75_pct', 'max_size',
    'mean',
}

# BAM files: cross-correlation and fingerprint statistics.
ChipAlignmentEnrichmentQualityMetrics = {
    'subsampled_reads',
    'estimated_fragment_len', 'corr_estimated_fragment_len',
    'phantom_peak', 'corr_phantom_peak',
    'argmin_corr', 'min_corr',
    'NSC', 'RSC',
    'auc', 'syn_auc',
    'x_intercept', 'syn_x_intercept',
    'elbow_pt', 'syn_elbow_pt',
    'jsd', 'syn_jsd',
    'pct_genome_enrich', 'diff_enrich', 'ch_div',
}

# BAM files: duplication and library-complexity statistics.
ChipLibraryQualityMetrics = {
    'unpaired_reads', 'paired_reads', 'unmapped_reads',
    'unpaired_duplicate_reads', 'paired_duplicate_reads',
    'paired_optical_duplicate_reads', 'pct_duplicate_reads',
    'total_fragments', 'distinct_fragments', 'positions_with_one_read',
    'NRF', 'PBC1', 'PBC2',
}

# BAM files: read-count / mapping statistics.
ChipAlignmentQualityMetrics = {
    'processing_stage',
    'total_reads', 'total_reads_qc_failed',
    'duplicate_reads', 'duplicate_reads_qc_failed',
    'mapped_reads', 'mapped_reads_qc_failed', 'pct_mapped_reads',
    'paired_reads', 'paired_reads_qc_failed',
    'read1', 'read1_qc_failed',
    'read2', 'read2_qc_failed',
    'properly_paired_reads', 'properly_paired_reads_qc_failed',
    'pct_properly_paired_reads',
    'with_itself', 'with_itself_qc_failed',
    'singletons', 'singletons_qc_failed', 'pct_singletons',
    'diff_chroms', 'diff_chroms_qc_failed',
}

# QC-metric '@type' -> set of fields to keep, for BAM files.
_BAM_QC_FIELDS = {
    'ChipAlignmentEnrichmentQualityMetric':
        ChipAlignmentEnrichmentQualityMetrics,
    'ChipLibraryQualityMetric': ChipLibraryQualityMetrics,
    'ChipAlignmentQualityMetric': ChipAlignmentQualityMetrics,
}

# QC-metric '@type' -> set of fields to keep, for narrowPeak files.
_PEAK_QC_FIELDS = {
    'ChipReplicationQualityMetric': ChipReplicationQualityMetrics,
    'ChipPeakEnrichmentQualityMetric': ChipPeakEnrichmentQualityMetrics,
}


def _fetch_json(url):
    """Return the decoded JSON body of a GET request to ``url``.

    Uses the module-level ``HEADERS`` and ``encode_auth``.
    """
    return requests.get(url, headers=HEADERS, auth=encode_auth).json()


def _encode4_file_accessions(experiment):
    """Return the file accessions of the experiment's ENCODE4 analysis.

    Looks at ``experiment['analysis_objects']`` and keeps the files of an
    analysis whose ``pipeline_award_rfas`` contains 'ENCODE4'. If several
    analyses match, the last one in the list wins. Returns an empty set
    when none matches.
    """
    accessions = set()
    for analysis in experiment['analysis_objects']:
        if 'ENCODE4' in analysis['pipeline_award_rfas']:
            # Each entry is split on '/' and the third piece is taken as the
            # accession (presumably paths shaped like '/files/<accession>/').
            accessions = {path.split('/')[2] for path in analysis['files']}
    return accessions


def _is_excluded(file_dict):
    """Return True for files that are archived or on the hg19 assembly."""
    return (file_dict['status'] == 'archived'
            or file_dict['assembly'] == 'hg19')


def _download_url(accession, extension):
    """Return the portal download URL for ``accession`` with ``extension``."""
    return "https://www.encodeproject.org/files/{0}/@@download/{0}.{1}".format(
        accession, extension)


def _select_qc_metrics(quality_metrics, fields_by_type):
    """Pick the whitelisted fields out of a file's quality-metric objects.

    ``quality_metrics`` is the list stored under a file's
    'quality_metrics' key; ``fields_by_type`` maps a metric '@type' name to
    the set of field names to keep. Only the first entry of each metric's
    '@type' list is consulted. Returns ``{type_name: {field: value}}``;
    metric objects of other types are ignored.
    """
    selected = {}
    for metric in quality_metrics:
        metric_type = metric['@type'][0]
        wanted = fields_by_type.get(metric_type)
        if wanted is not None:
            selected[metric_type] = {
                key: metric[key] for key in metric.keys() & wanted}
    return selected


def _new_file_group():
    """Return empty parallel lists describing one category of files."""
    return {'accessions': [], 'download_urls': [],
            'md5sums': [], 'qc_metrics': []}


def _new_bam_groups():
    """Return empty file groups keyed by the BAM output types we keep."""
    return {'unfiltered alignments': _new_file_group(),
            'alignments': _new_file_group()}


def _add_file(group, file_dict, extension, fields_by_type):
    """Append one file's accession, URL, md5sum and QC metrics to ``group``."""
    accession = file_dict['accession']
    qc_metrics = _select_qc_metrics(file_dict['quality_metrics'],
                                    fields_by_type)
    group['accessions'].append(accession)
    group['download_urls'].append(_download_url(accession, extension))
    group['md5sums'].append(file_dict['md5sum'])
    group['qc_metrics'].append(qc_metrics)


def _add_bam(groups_by_output_type, file_dict):
    """Record a BAM file under its output type; other types are ignored."""
    group = groups_by_output_type.get(file_dict['output_type'])
    if group is not None:
        _add_file(group, file_dict, 'bam', _BAM_QC_FIELDS)


def _group_columns(group):
    """Return the four lists of ``group`` in output-column order."""
    return [group['accessions'], group['download_urls'],
            group['md5sums'], group['qc_metrics']]


def _collect_experiment_files(experiment):
    """Sort an experiment's files into fastqs, peaks and BAMs.

    Returns ``(fastqs, fastqs_run_type, assembly, peaks, bams)`` where
    ``peaks`` is a file group for preferred-default narrowPeak files and
    ``bams`` maps 'alignments' / 'unfiltered alignments' to file groups.
    ``assembly`` is '' when no non-fastq file survives the filters.
    """
    encode4_files = _encode4_file_accessions(experiment)

    fastqs = []
    fastqs_run_type = []
    assembly = ''
    peaks = _new_file_group()
    bams = _new_bam_groups()

    for file_dict in experiment['files']:
        file_type = file_dict['file_type']

        # fastq files are handled before anything else because the later
        # filters read 'assembly', which fastq records do not carry.
        if file_type == 'fastq':
            fastqs.append(file_dict['accession'])
            fastqs_run_type.append(file_dict['run_type'])
            continue

        if file_dict['accession'] not in encode4_files:
            continue

        # Possibly redundant with the ENCODE4 filter above, kept as a guard.
        if _is_excluded(file_dict):
            continue

        # Overwritten by every surviving file; the last value is reported.
        assembly = file_dict['assembly']

        if file_type == 'bed narrowPeak' and \
                file_dict.get('preferred_default'):
            _add_file(peaks, file_dict, 'bed.gz', _PEAK_QC_FIELDS)
        elif file_type == 'bam':
            _add_bam(bams, file_dict)

    return fastqs, fastqs_run_type, assembly, peaks, bams


def _collect_control_bams(experiment):
    """Fetch every possible control of ``experiment`` and gather its BAMs.

    Returns file groups keyed by 'alignments' / 'unfiltered alignments',
    accumulated over all controls in ``experiment['possible_controls']``.
    """
    control_bams = _new_bam_groups()

    for control in experiment['possible_controls']:
        control_experiment = _fetch_json(
            "https://www.encodeproject.org/{}".format(control['accession']))
        control_encode4_files = _encode4_file_accessions(control_experiment)

        for control_file_dict in control_experiment['files']:
            if control_file_dict['accession'] not in control_encode4_files:
                continue
            if _is_excluded(control_file_dict):
                continue
            if control_file_dict['file_type'] == 'bam':
                _add_bam(control_bams, control_file_dict)

    return control_bams


# One output row per experiment of every experiment series returned by the
# initial search (`json_results`). Distinct names are used for each level of
# the traversal so that `json_results` is never re-bound while it is being
# iterated.
rows = []
for series in json_results['@graph']:
    accession_id = series['accession']

    series_experiments = _fetch_json(
        "https://www.encodeproject.org/search/"
        "?type=Experiment&"
        "related_series.@id=/experiment-series/{}/".format(accession_id))

    for experiment_hit in series_experiments['@graph']:
        exp_accession_id = experiment_hit['accession']

        experiment = _fetch_json(
            "https://www.encodeproject.org/{}".format(exp_accession_id))

        fastqs, fastqs_run_type, assembly, peaks, bams = \
            _collect_experiment_files(experiment)
        control_bams = _collect_control_bams(experiment)

        # 25 entries: 5 scalar/fastq fields followed by five groups of
        # (accessions, download urls, md5sums, qc metrics).
        row = ([accession_id, exp_accession_id, fastqs, fastqs_run_type,
                assembly]
               + _group_columns(peaks)
               + _group_columns(bams['unfiltered alignments'])
               + _group_columns(bams['alignments'])
               + _group_columns(control_bams['unfiltered alignments'])
               + _group_columns(control_bams['alignments']))

        # Progress line; flags experiments whose fastqs mix run types.
        unique_run_types = set(fastqs_run_type)
        if len(unique_run_types) > 1:
            print(accession_id, exp_accession_id, "run_types",
                  len(unique_run_types), "****** MULTIPLE RUN TYPES ******")
        else:
            print(accession_id, exp_accession_id, "run_types",
                  len(unique_run_types))

        rows.append(row)


# Column names, listed in the same order as the entries of each `row` list
# appended to `rows`.
columns = ['experiment_series', 'experiment', 'fastqs', 'fastqs_run_type',
           'assembly', 'preferred_default_bed_narrowPeak',
           'preferred_default_bed_narrowPeak_download_urls',
           'preferred_default_bed_narrowPeak_md5sums',
           'preferred_default_bed_narrowPeak_qc_metrics',
           'unfiltered_alignments', 'unfiltered_alignments_download_urls',
           'unfiltered_alignments_md5sums', 'unfiltered_alignments_qc_metrics',
           'alignments', 'alignments_download_urls', 'alignments_md5sums',
           'alignments_qc_metrics', 'control_unfiltered_alignments',
           'control_unfiltered_alignments_download_urls',
           'control_unfiltered_alignments_md5sums',
           'control_unfiltered_alignments_qc_metrics', 'control_alignments',
           'control_alignments_download_urls', 'control_alignments_md5sums',
           'control_alignments_qc_metrics']

# Label the columns on the frame itself rather than only in the written
# header: later code looks columns up by name (e.g.
# row['alignments_download_urls']), which fails on the default integer
# labels. Naming them here also keeps the export working when `rows` is empty.
df = pd.DataFrame(rows, columns=columns)

# Tab-separated despite the .csv extension; the header line is the column
# names above.
df.to_csv('TF_Atlas_CHIPSeq_ExperimentSeries_metadata_qc_metrics.csv',
          sep='\t', index=False, header=True)

In [None]:
# Build "<download url> <md5sum>" manifests from `df`, one text file per BAM
# category, meant to drive parallel bulk downloads.
#
# Experiment and control BAMs of the same category share one manifest.
alignments_urls = []
alignments_md5sums = []
unfiltered_alignments_urls = []
unfiltered_alignments_md5sums = []

for _, record in df.iterrows():
    # (destination list, source column), visited in this order for each row:
    # experiment columns first, then the matching control columns.
    for sink, column in (
            (unfiltered_alignments_urls,
             'unfiltered_alignments_download_urls'),
            (unfiltered_alignments_md5sums,
             'unfiltered_alignments_md5sums'),
            (alignments_urls, 'alignments_download_urls'),
            (alignments_md5sums, 'alignments_md5sums'),
            (unfiltered_alignments_urls,
             'control_unfiltered_alignments_download_urls'),
            (unfiltered_alignments_md5sums,
             'control_unfiltered_alignments_md5sums'),
            (alignments_urls, 'control_alignments_download_urls'),
            (alignments_md5sums, 'control_alignments_md5sums'),
    ):
        sink.extend(record[column])

# Filtered alignments: pair each url with its md5sum, one pair per line.
alignments_urls_md5sums = list(
    map(list, zip(alignments_urls, alignments_md5sums)))
alignments_urls_md5sums_df = pd.DataFrame(alignments_urls_md5sums)
alignments_urls_md5sums_df.to_csv(
    'alignments_bams_urls_md5sums.txt', sep=' ', index=False, header=None)

# Unfiltered alignments: same layout, separate file.
unfiltered_alignments_urls_md5sums = list(
    map(list, zip(unfiltered_alignments_urls, unfiltered_alignments_md5sums)))
unfiltered_alignments_urls_md5sums_df = pd.DataFrame(
    unfiltered_alignments_urls_md5sums)
unfiltered_alignments_urls_md5sums_df.to_csv(
    'unfiltered_alignments_bams_urls_md5sums.txt',
    sep=' ', index=False, header=None)

# Manifest of the preferred-default narrowPeak files (IDR thresholded
# peaks): one "<download url> <md5sum>" line per file, flattened over all
# rows of `df`.
preferred_default_urls = [
    url
    for _, record in df.iterrows()
    for url in record['preferred_default_bed_narrowPeak_download_urls']
]
preferred_default_md5sums = [
    md5sum
    for _, record in df.iterrows()
    for md5sum in record['preferred_default_bed_narrowPeak_md5sums']
]

preferred_default_urls_md5sums = list(
    map(list, zip(preferred_default_urls, preferred_default_md5sums)))

preferred_default_urls_md5sums_df = pd.DataFrame(
    preferred_default_urls_md5sums)
preferred_default_urls_md5sums_df.to_csv(
    'preferred_default_urls_md5sums.txt', sep=' ', index=False, header=None)
