In [170]:
from urllib.parse import urlencode, quote_plus
import os.path as osp
import requests
import pandas as pd
import tqdm
%run env.py
%run src/lib.py
# Path of the CSV that the relation data is written to and later re-read from.
# REPO_DATA_DIR is not defined in this notebook (presumably provided by the
# %run of env.py above — TODO confirm).
data_file = osp.join(REPO_DATA_DIR, 'immunexpresso', 'data.csv')

### Import

#### Collect Cell Types 

In [142]:
def query(terms, timeout=60):
    """Search the ImmuneXpresso lexicon endpoint and return the hits as a frame.

    Parameters
    ----------
    terms : str
        Raw search expression, e.g. '"T cell"'. It is URL-encoded here, so
        pass it unescaped.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the ``response -> docs`` list in the JSON reply.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    """
    url_format = 'http://immuneexpresso.org/immport-immunexpresso/search/ix_lexicon/synonym-search-trim?rows=2000&bf=product(frequency,preferred)&q={}&_=1555322107128'
    # The template already carries the `q=` key, so only the value is encoded.
    # Encoding a {'q': terms} mapping here would emit a second `q=` and the
    # server would receive `q=q=...`.
    url = url_format.format(quote_plus(terms))
    r = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of on an opaque JSON decode error.
    r.raise_for_status()
    return pd.DataFrame(r.json()['response']['docs'])

In [143]:
# Search expressions used to seed the lexicon lookup; each one is passed to
# query() with its surrounding double quotes included.
queries = [
    '"T cell"', '"T-helper"', '"gamma delta"', '"Treg"', '"cytotoxic T"',
    '"regulatory T"', '"natural killer T"', '"NKT"', '"TFH"', '"memory T"',
]
# Run every search, stack the per-search frames, and collapse rows that repeat
# the same (name, preferred, synonym, term_id, type) combination.
df = (
    pd.concat([query(q) for q in queries])
    .drop_duplicates(subset=['name', 'preferred', 'synonym', 'term_id', 'type'])
)
df.head()

Unnamed: 0,frequency,id,name,preferred,score,synonym,term_id,type
0,379437,301,T cell,1,22879.916,T cell,CL_0000084,CELL
1,88514,9779,mature T cell,1,5337.337,mature T cell,CL_0002419,CELL
2,65426,2399,alpha-beta T cell,1,3945.3752,alpha-beta T cell,CL_0000789,CELL
3,63953,2431,mature alpha-beta T cell,1,3856.5688,mature alpha-beta T cell,CL_0000791,CELL
4,59986,1779,"CD4-positive, alpha-beta T cell",1,3617.401,"CD4-positive, alpha-beta T cell",CL_0000624,CELL


In [108]:
# Summarize column dtypes and non-null counts of the de-duplicated lexicon hits.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 193 entries, 0 to 6
Data columns (total 8 columns):
frequency    193 non-null int64
id           193 non-null object
name         193 non-null object
preferred    193 non-null int64
score        193 non-null float64
synonym      193 non-null object
term_id      193 non-null object
type         193 non-null object
dtypes: float64(1), int64(2), object(5)
memory usage: 13.6+ KB


In [109]:
# List the distinct synonym strings returned by the searches.
df['synonym'].unique()

array(['T cell', 'mature T cell', 'alpha-beta T cell',
       'mature alpha-beta T cell', 'CD4-positive, alpha-beta T cell',
       'CD8-positive, alpha-beta T cell', 'effector T cell',
       'cytotoxic T cell', 'regulatory T cell',
       'CD4-positive, CD25-positive, alpha-beta regulatory T cell',
       'helper T cell', 'CD8-positive, alpha-beta cytotoxic T cell',
       'immature T cell', 'CD4-positive helper T cell', 'naive T cell',
       'memory T cell', 'gamma-delta T cell',
       'CD4-positive, alpha-beta intraepithelial T cell',
       'mature NK T cell', 'alpha-beta intraepithelial T cell',
       'Anaplastic large cell lymphoma, T cell and Null cell type',
       'naive thymus-derived CD4-positive, alpha-beta T cell',
       'CD8-positive, alpha-beta memory T cell',
       'CD8-positive, alpha-beta regulatory T cell',
       'CD4-positive, alpha-beta memory T cell',
       'mature gamma-delta T cell', 'gamma-delta intraepithelial T cell',
       'dendritic epidermal T cel

In [150]:
# Count lexicon hits per value of the `type` column.
df['type'].value_counts()

CELL        158
CYTOKINE     25
DISEASE      10
Name: type, dtype: int64

#### Collect Relation Data

In [152]:
# XML request template used by get_ix_data: the single `{}` placeholder inside
# <cell> is filled with one cell type id via str.format, and the resulting text
# is posted as the `request` form field.
REQ_FMT="""<request type='data'>
<knowledgeSpace id='interactions'>
<filters>
<cell expand='true'>{}</cell>
<add_functions>true</add_functions>
</filters>
<paging start='0' limit='1000000' />
<sort by='numPapers' dir='desc'/>
</knowledgeSpace>
</request>
"""

# Example response (from post to http://immuneexpresso.org/immport-immunexpresso/rest/RESTServlet):
# {'status': {'statusCode': 1},
#  'totalNumElements': 299,
#  'start': 0,
#  'limit': 10,
#  'dataElements': [{'score': '0.08',
#    'numPapers': 88,
#    'enrichmentScore': '5.01',
#    'actor': 'cell',
#    'cellOntologyNodeId': 'CL_0000545',
#    'cellOntologyNodeLabel': 'T-helper 1 cell',
#    'cytokineOntologyNodeId': 'CID_83',
#    'cytokineOntologyNodeLabel': 'IFNG',
#    'verbCategory': 'Positive',
#    'functions': []}, 
#     ... ]
# }

def parse(res):
    """Flatten a decoded servlet response into a DataFrame of relation records.

    Each element of ``res['dataElements']`` becomes one row; the response keys
    are renamed to the shorter column names listed in ``fields`` below. When
    the response has no ``'dataElements'`` entry an empty DataFrame is returned.
    """
    # (output column, key in the response element), in output order
    fields = (
        ('cell_id', 'cellOntologyNodeId'),
        ('cell_label', 'cellOntologyNodeLabel'),
        ('cytokine_id', 'cytokineOntologyNodeId'),
        ('cytokine_label', 'cytokineOntologyNodeLabel'),
        ('category', 'verbCategory'),
        ('score', 'enrichmentScore'),
        ('num_papers', 'numPapers'),
        ('actor', 'actor'),
    )
    elements = res['dataElements'] if 'dataElements' in res else []
    records = [
        {column: element[key] for column, key in fields}
        for element in elements
    ]
    return pd.DataFrame(records)

def get_ix_data(cell_type_ids, max_failures=5, timeout=60):
    """Fetch relation records for each cell type id and stack them in one frame.

    One request built from ``REQ_FMT`` is posted per id; every JSON reply goes
    through ``parse`` and the non-empty results are concatenated.

    Parameters
    ----------
    cell_type_ids : iterable
        Ids substituted into the request template (e.g. 'CL_0000084').
    max_failures : int, optional
        Number of failing ids tolerated; the error raised by the failure that
        reaches this count is re-raised.
    timeout : float, optional
        Seconds to wait on each request before it counts as a failure.

    Returns
    -------
    pandas.DataFrame
        Rows from all successful, non-empty replies. Per-reply indexes are
        kept, so index values repeat. Empty when no id produced any rows.

    Raises
    ------
    Exception
        Whatever error the ``max_failures``-th failing id raised.
    """
    dfs = []
    n_fail = 0
    url = 'http://immuneexpresso.org/immport-immunexpresso/rest/RESTServlet'
    for cid in tqdm.tqdm_notebook(cell_type_ids):
        try:
            request = REQ_FMT.format(cid)
            res = requests.post(url, data={'request': request}, timeout=timeout)
            # Treat HTTP error statuses as failures rather than parsing them.
            res.raise_for_status()
            df = parse(res.json())
            if len(df) > 0:
                dfs.append(df)
        except Exception as e:
            # `Exception` rather than a bare except, so interrupting the kernel
            # stops the loop instead of being counted as a failed id.
            n_fail += 1
            print('Failure occurred for cell type id "{}" (ignoring): {!r}'.format(cid, e))
            if n_fail >= max_failures:
                print('Num failures exceeds max ({})'.format(max_failures))
                raise
    # pd.concat raises on an empty list (no ids given, or no id had relations).
    if not dfs:
        return pd.DataFrame()
    return pd.concat(dfs)

# For a quick single-id trial instead of the full run: get_ix_data(['CL_0000084'])
# Only lexicon hits whose type is CELL are sent to the relation service.
cell_type_ids = df.loc[df['type'] == 'CELL', 'term_id'].unique()
dfr = get_ix_data(cell_type_ids)
dfr.head()

HBox(children=(IntProgress(value=0, max=107), HTML(value='')))

Unnamed: 0,actor,category,cell_id,cell_label,cytokine_id,cytokine_label,num_papers,score
0,cell,Positive,CL_0000084,T cell,CID_83,IFNG,809,4.28
1,cell,Positive,CL_0000084,T cell,CID_110,IL2,701,5.3
2,cell,Positive,CL_0000625,"CD8-positive, alpha-beta T cell",CID_83,IFNG,338,8.88
3,cytokine,Positive,CL_0000084,T cell,CID_110,IL2,313,1.58
4,cytokine,Unknown,CL_0000084,T cell,CID_110,IL2,260,4.52


In [153]:
# Summarize column dtypes and non-null counts of the collected relation records.
dfr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6246 entries, 0 to 9
Data columns (total 8 columns):
actor             6246 non-null object
category          6246 non-null object
cell_id           6246 non-null object
cell_label        6246 non-null object
cytokine_id       6246 non-null object
cytokine_label    6246 non-null object
num_papers        6246 non-null int64
score             6246 non-null object
dtypes: int64(1), object(7)
memory usage: 439.2+ KB


In [160]:
# Save the collected relations so they can be reloaded from disk after a kernel
# restart instead of being fetched again.
# NOTE: this overwrites any existing file at data_file, including final results
# saved there.
dfr.to_csv(data_file, index=False)
data_file

'/Users/eczech/repos/hammer/t-cell-relation-extraction/pm_subtype_protein_relations/data/immunexpresso/data.csv'

### Matching

Match cell types and cytokines to those in this project:

In [162]:
# Reload the relation records from the CSV at data_file so this section can
# start from the file rather than from the in-memory fetch result.
dfr = pd.read_csv(data_file)
# Save record count to ensure that none are lost after transformation
n_init = len(dfr)
dfr.head()

Unnamed: 0,actor,category,cell_id,cell_label,cytokine_id,cytokine_label,num_papers,score
0,cell,Positive,CL_0000084,T cell,CID_83,IFNG,809,4.28
1,cell,Positive,CL_0000084,T cell,CID_110,IL2,701,5.3
2,cell,Positive,CL_0000625,"CD8-positive, alpha-beta T cell",CID_83,IFNG,338,8.88
3,cytokine,Positive,CL_0000084,T cell,CID_110,IL2,313,1.58
4,cytokine,Unknown,CL_0000084,T cell,CID_110,IL2,260,4.52


In [171]:
# Metadata frame for this project's cell types. get_entity_meta_data and
# CELL_TYPES are not defined in this notebook (presumably loaded by the %run of
# src/lib.py or env.py — TODO confirm).
df_ct = get_entity_meta_data(CELL_TYPES)
df_ct.head()

Unnamed: 0,sym,lbl,spid,src,extid,id
0,TN,TN,1,manual,,CT591AB86BE24D7A57
1,T-naïve,TN,1,manual,,CTE283259FDAF532CC
2,T naïve,TN,1,manual,,CTC08BA13585D27DA5
3,Tnaïve,TN,1,manual,,CTA34535F3D8CB4079
4,T-naive,TN,1,manual,,CT6D35F4BB1D4D94BF


In [172]:
# Metadata frame for this project's cytokines. get_entity_meta_data and
# CYTOKINES are not defined in this notebook (presumably loaded by the %run of
# src/lib.py or env.py — TODO confirm).
df_ck = get_entity_meta_data(CYTOKINES)
df_ck.head()

Unnamed: 0,id,src,sym,lbl,spid,extid
0,CK3618F8D5635EA31B,cameron,IL-1α,IL-1α,1,
1,CKA07F01C879D8D6C2,cameron,IL-1β,IL-1β,1,
2,CKA95B181F6273B58D,cameron,IL-1RA,IL-1RA,1,
3,CKBF6003C60D23BA0D,cameron,IL-18,IL-18,1,
4,CKD21834C7D75FD77F,cameron,IL-2,IL-2,1,


In [168]:
# Compare the number of distinct cell ids present in the relation data with the
# number of cell type ids that were queried.
len(dfr['cell_id'].unique()), len(cell_type_ids)

(51, 107)

In [29]:
# Distinct synonyms containing "helper" (lower-cased before matching).
# NOTE(review): this cell's execution count is lower than that of the cells
# that build `df`, so its saved output may come from an older `df` — re-run the
# notebook in order to confirm.
df[df['synonym'].str.lower().str.contains('helper')]['synonym'].unique()

array([], dtype=object)