---
title: Annotate ENCODE targets table with system, cell, and organ information
author: Sabrina Mi
date: 7/17/23
---

### Import library and read in targets dataframe

In [7]:
import pandas as pd
import requests, json

In [3]:
# Load the Basenji (cross2020) human targets table directly from GitHub
targets_txt = 'https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt'
# The file is tab-delimited, which is read_table's default separator
df_targets = pd.read_table(targets_txt)

### Query ENCODE database

In [12]:
# Ask the ENCODE server to answer with JSON
headers = {'accept': 'application/json'}

def get_json(accession_id, timeout=30):
    """Fetch the ENCODE record for a file accession and decode it from JSON.

    Parameters
    ----------
    accession_id : str
        Accession appended to the ENCODE ``/files/`` endpoint.
    timeout : float or tuple, optional
        Passed straight to ``requests.get`` so that a stalled connection
        raises instead of blocking forever. Defaults to 30 seconds.

    Returns
    -------
    The decoded JSON body of the response (presumably a dict for ENCODE
    records -- the caller treats it as one).

    Notes
    -----
    The HTTP status code is deliberately not checked: whatever JSON the
    server returns is handed back, and the caller decides what to do when
    the expected keys are absent.
    """
    # The lookup goes through /files/, so the record describes a file
    # object (not a biosample) identified by `accession_id`.
    url = 'https://www.encodeproject.org/files/' + accession_id + '/?format=json'

    # GET the object; without a timeout requests would wait indefinitely
    response = requests.get(url, headers=headers, timeout=timeout)

    # Decode the JSON body into Python objects
    return response.json()


### Extract fields

We want to query system, cell, organ, and developmental information.

In [60]:
# One list per annotation column; each ends up with one entry per target row.
# NOTE: syn_col is created as before but is never filled in this cell.
sys_col, cell_col, org_col, dev_col, syn_col = ([] for _ in range(5))

# Key inside `biosample_ontology` -> list that collects its per-target value
slim_columns = {
    'system_slims': sys_col,
    'cell_slims': cell_col,
    'organ_slims': org_col,
    'developmental_slims': dev_col,
}

# Iterate over every identifier instead of a hard-coded row count, so the
# lists always match the length of df_targets.
for i, accession_id in enumerate(df_targets['identifier']):
    encff = get_json(accession_id)

    # Records without a biosample ontology yield 'NA' in every column
    ontology = encff.get('biosample_ontology', {})

    for slim_key, column in slim_columns.items():
        # Each category is checked against its own list (the developmental
        # value used to be guarded by the organ list's length). A missing
        # key is treated the same as an empty list.
        slims = ontology.get(slim_key, [])
        column.append(', '.join(slims) if len(slims) > 0 else 'NA')

    # Lightweight progress indicator
    if i % 500 == 0:
        print(i)





0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000


In [61]:
# Show the collected annotations, one list per category
for annotation_values in (sys_col, cell_col, org_col, dev_col):
    print(annotation_values)

# Attach each list to the targets table under its column name
new_columns = {
    'system': sys_col,
    'cell': cell_col,
    'organ': org_col,
    'developmental': dev_col,
}
for column_name, annotation_values in new_columns.items():
    df_targets[column_name] = annotation_values

['central nervous system', 'central nervous system', 'NA', 'reproductive system', 'integumental system', 'integumental system', 'integumental system', 'respiratory system', 'integumental system', 'central nervous system', 'central nervous system', 'immune system', 'immune system', 'immune system', 'immune system', 'immune system', 'immune system', 'immune system', 'immune system', 'NA', 'NA', 'NA', 'circulatory system', 'excretory system', 'reproductive system', 'reproductive system', 'endocrine system, exocrine system, digestive system', 'endocrine system, exocrine system, digestive system', 'reproductive system', 'circulatory system, reproductive system', 'NA', 'integumental system', 'integumental system', 'immune system', 'immune system', 'immune system', 'reproductive system', 'reproductive system', 'exocrine system, integumental system', 'central nervous system', 'integumental system', 'immune system', 'integumental system', 'skeletal system', 'musculature', 'exocrine system, inte

In [66]:
# List the table's column labels to confirm the annotation columns are present
df_targets.columns

Index(['index', 'genome', 'identifier', 'file', 'clip', 'scale', 'sum_stat',
       'description', 'system', 'cell', 'organ', 'developmental'],
      dtype='object')