In [1]:
from urllib.parse import urlencode, quote_plus
import os.path as osp
import requests
import pandas as pd
import numpy as np
import tqdm
from tcre.env import *
from tcre import lib
# Output locations for this notebook.
# NOTE(review): SUPERVISION_DATA_DIR is not imported by name; presumably it comes from
# the wildcard import of tcre.env — confirm.
# TEMP_FILE: raw relation records, written after the import step and re-read for matching.
TEMP_FILE = osp.join(SUPERVISION_DATA_DIR, 'immunexpresso', 'import.csv')
# DATA_FILE: final export of the records with project reference ids attached.
DATA_FILE = osp.join(SUPERVISION_DATA_DIR, 'immunexpresso', 'data.csv')

### Import

#### Collect Cell Types 

In [2]:
def query(terms, timeout=60):
    """Search the immuneXpresso lexicon and return the matching documents.

    Parameters
    ----------
    terms : str
        Search expression; it is URL-encoded before being placed in the request URL.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``response.docs`` in the JSON reply.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    url_format = 'http://immuneexpresso.org/immport-immunexpresso/search/ix_lexicon/synonym-search-trim?rows=2000&bf=product(frequency,preferred)&q={}&_=1555322107128'
    # NOTE(review): urlencode already emits "q=<terms>", so the final URL contains
    # "q=q=<terms>". Left unchanged because the recorded results were obtained with
    # this exact URL — confirm whether the doubled "q=" is intended.
    encoded = urlencode({'q': terms}, quote_via=quote_plus)
    url = url_format.format(encoded)
    r = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of tripping over an unexpected response body
    r.raise_for_status()
    return pd.DataFrame(r.json()['response']['docs'])

In [3]:
# Spot-check the lexicon search with a single quoted phrase
query('"effector T"')

Unnamed: 0,frequency,id,name,preferred,score,synonym,term_id,type
0,29222,4109,effector T cell,1,1608.1456,effector T cell,CL_0000911,CELL
1,1,8835,innate effector T cell,1,1.710521,innate effector T cell,CL_0002127,CELL
2,1,4053,"CD8-positive, alpha-beta cytokine secreting ef...",1,1.296634,"CD8-positive, alpha-beta cytokine secreting ef...",CL_0000908,CELL


In [4]:
# Lexicon search expressions used to collect candidate terms
queries = [
    '"T cell" OR (T AND cell)',
    '"T-helper"',
    '"gamma delta"',
    '"Treg"',
    '"cytotoxic T"',
    '"regulatory T"',
    '"natural killer T"',
    '"NKT"',
    '"TFH"',
    '"memory T"',
    '"effector T"',
    '"effector memory"',
    '"central memory"',
]
# Run every search, stack the result frames and keep one row per distinct term record
dedup_cols = ['name', 'preferred', 'synonym', 'term_id', 'type']
df = pd.concat(list(map(query, queries))).drop_duplicates(subset=dedup_cols)
df.head()

Unnamed: 0,frequency,id,name,preferred,score,synonym,term_id,type
0,379437,301,T cell,1,16194.714,T cell,CL_0000084,CELL
1,88514,9779,mature T cell,1,3778.1338,mature T cell,CL_0002419,CELL
2,65426,2399,alpha-beta T cell,1,2792.9727,alpha-beta T cell,CL_0000789,CELL
3,63953,2431,mature alpha-beta T cell,1,2730.1199,mature alpha-beta T cell,CL_0000791,CELL
4,59986,1779,"CD4-positive, alpha-beta T cell",1,2560.8486,"CD4-positive, alpha-beta T cell",CL_0000624,CELL


In [5]:
# Column types and non-null counts of the combined search results
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 195 entries, 0 to 2
Data columns (total 8 columns):
frequency    195 non-null int64
id           195 non-null object
name         195 non-null object
preferred    195 non-null int64
score        195 non-null float64
synonym      195 non-null object
term_id      195 non-null object
type         195 non-null object
dtypes: float64(1), int64(2), object(5)
memory usage: 13.7+ KB


In [6]:
# Distinct synonyms returned across all searches
df['synonym'].unique()

array(['T cell', 'mature T cell', 'alpha-beta T cell',
       'mature alpha-beta T cell', 'CD4-positive, alpha-beta T cell',
       'CD8-positive, alpha-beta T cell', 'effector T cell',
       'cytotoxic T cell', 'regulatory T cell',
       'CD4-positive, CD25-positive, alpha-beta regulatory T cell',
       'helper T cell', 'CD8-positive, alpha-beta cytotoxic T cell',
       'immature T cell', 'CD4-positive helper T cell', 'naive T cell',
       'memory T cell', 'gamma-delta T cell',
       'CD4-positive, alpha-beta intraepithelial T cell',
       'mature NK T cell', 'alpha-beta intraepithelial T cell',
       'Anaplastic large cell lymphoma, T cell and Null cell type',
       'naive thymus-derived CD4-positive, alpha-beta T cell',
       'CD8-positive, alpha-beta memory T cell',
       'T follicular helper cell',
       'CD8-positive, alpha-beta regulatory T cell',
       'CD4-positive, alpha-beta memory T cell',
       'mature gamma-delta T cell', 'gamma-delta intraepithelial T cell'

In [7]:
# Number of result rows per entity type
df['type'].value_counts()

CELL        160
CYTOKINE     25
DISEASE      10
Name: type, dtype: int64

#### Collect Relation Data

In [203]:
# Request body template posted to the immuneXpresso REST servlet by get_ix_data.
# The single `{}` placeholder is filled with one cell ontology id via str.format.
REQ_FMT="""<request type='data'>
<knowledgeSpace id='interactions'>
<filters>
<cell expand='true'>{}</cell>
<add_functions>true</add_functions>
</filters>
<paging start='0' limit='1000000' />
<sort by='numPapers' dir='desc'/>
</knowledgeSpace>
</request>
"""

# Example response (from post to http://immuneexpresso.org/immport-immunexpresso/rest/RESTServlet):
# {'status': {'statusCode': 1},
#  'totalNumElements': 299,
#  'start': 0,
#  'limit': 10,
#  'dataElements': [{'score': '0.08',
#    'numPapers': 88,
#    'enrichmentScore': '5.01',
#    'actor': 'cell',
#    'cellOntologyNodeId': 'CL_0000545',
#    'cellOntologyNodeLabel': 'T-helper 1 cell',
#    'cytokineOntologyNodeId': 'CID_83',
#    'cytokineOntologyNodeLabel': 'IFNG',
#    'verbCategory': 'Positive',
#    'functions': []}, 
#     ... ]
# }

def parse(res):
    """Flatten a decoded REST response into a DataFrame of relation records.

    Each entry of ``res['dataElements']`` becomes one row; a response without that
    key yields an empty DataFrame. A missing field inside an entry raises KeyError.
    """
    # (output column, key in the response element), in output order
    fields = (
        ('cell_id', 'cellOntologyNodeId'),
        ('cell_label', 'cellOntologyNodeLabel'),
        ('cytokine_id', 'cytokineOntologyNodeId'),
        ('cytokine_label', 'cytokineOntologyNodeLabel'),
        ('category', 'verbCategory'),
        ('score', 'enrichmentScore'),
        ('num_papers', 'numPapers'),
        ('actor', 'actor'),
    )
    elements = res['dataElements'] if 'dataElements' in res else []
    records = [
        {column: element[key] for column, key in fields}
        for element in elements
    ]
    return pd.DataFrame(records)

def get_ix_data(cell_type_ids, max_failures=5):
    """Fetch relation records from immuneXpresso for each given cell type id.

    Parameters
    ----------
    cell_type_ids : iterable of str
        Ids substituted one at a time into ``REQ_FMT``.
    max_failures : int, optional
        Number of failed requests tolerated before giving up.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the parsed, non-empty responses; an empty DataFrame when
        no request produced any record.

    Raises
    ------
    Exception
        The error of the request that brings the failure count to ``max_failures``
        is re-raised unchanged.
    """
    url = 'http://immuneexpresso.org/immport-immunexpresso/rest/RESTServlet'
    frames = []
    n_fail = 0
    for cid in tqdm.tqdm_notebook(cell_type_ids):
        try:
            res = requests.post(url, data={'request': REQ_FMT.format(cid)})
            # Treat HTTP error statuses as failures rather than parsing their body
            res.raise_for_status()
            df = parse(res.json())
        except Exception:
            # `Exception` (not a bare except) so that Ctrl-C still interrupts the loop
            n_fail += 1
            print('Failure occurred for cell type id "{}" (ignoring)'.format(cid))
            if n_fail >= max_failures:
                print('Num failures exceeds max ({})'.format(max_failures))
                raise
            continue
        if len(df) > 0:
            frames.append(df)
    # pd.concat raises on an empty list, so handle the "nothing collected" case explicitly
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

# Fetch relations for every distinct term id of type CELL found by the lexicon search
is_cell = df['type'] == 'CELL'
cell_type_ids = df.loc[is_cell, 'term_id'].unique()
dfi = get_ix_data(cell_type_ids).drop_duplicates()
dfi.head()

HBox(children=(IntProgress(value=0, max=108), HTML(value='')))

Unnamed: 0,actor,category,cell_id,cell_label,cytokine_id,cytokine_label,num_papers,score
0,cell,Positive,CL_0000084,T cell,CID_83,IFNG,809,4.28
1,cell,Positive,CL_0000084,T cell,CID_110,IL2,701,5.3
2,cell,Positive,CL_0000625,"CD8-positive, alpha-beta T cell",CID_83,IFNG,338,8.88
3,cytokine,Positive,CL_0000084,T cell,CID_110,IL2,313,1.58
4,cytokine,Unknown,CL_0000084,T cell,CID_110,IL2,260,4.52


In [204]:
# Column types and non-null counts of the collected relation records
dfi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6246 entries, 0 to 9
Data columns (total 8 columns):
actor             6246 non-null object
category          6246 non-null object
cell_id           6246 non-null object
cell_label        6246 non-null object
cytokine_id       6246 non-null object
cytokine_label    6246 non-null object
num_papers        6246 non-null int64
score             6246 non-null object
dtypes: int64(1), object(7)
memory usage: 439.2+ KB


In [32]:
# Persist the collected records so the matching steps can restart from this file
# without repeating the requests above
dfi.to_csv(TEMP_FILE, index=False)
TEMP_FILE

'/lab/repos/t-cell-relation-extraction/data/supervision/immunexpresso/import.csv'

### Matching

Match the immuneXpresso cell types and cytokines to the corresponding entities defined in this project:

#### Load Project Meta Data

In [34]:
# Reload the saved import, de-duplicate it and drop the reference-id columns if present
dfr = (
    pd.read_csv(TEMP_FILE)
    .drop_duplicates()
    .drop(columns=['cell_ref_id', 'cytokine_ref_id'], errors='ignore')
)
# Remember the record count so the export step can verify that none were lost
n_init = len(dfr)
dfr.head()

Unnamed: 0,actor,category,cell_id,cell_label,cytokine_id,cytokine_label,num_papers,score
0,cell,Positive,CL_0000084,T cell,CID_83,IFNG,809,4.28
1,cell,Positive,CL_0000084,T cell,CID_110,IL2,701,5.3
2,cell,Positive,CL_0000625,"CD8-positive, alpha-beta T cell",CID_83,IFNG,338,8.88
3,cytokine,Positive,CL_0000084,T cell,CID_110,IL2,313,1.58
4,cytokine,Unknown,CL_0000084,T cell,CID_110,IL2,260,4.52


In [35]:
# Cell type metadata from the project library; the `sym`, `lbl` and `prefid`
# columns are the ones used by the matching cells below
df_ct = lib.get_entity_meta_data(lib.CELL_TYPES)
df_ct.head()

Unnamed: 0,depth,extid,id,lbl,parent,root,spid,src,sym,priority,prefid,enabled
0,4,CL:0001047,CT312FEE66DC3CC142,"CD4-positive, CD25-positive, CCR4-positive, al...",,CL:0000084,1,cl,CCR+ Treg,50,CT05DC4A067F2BDFCB,True
1,3,CL:0002426,CT5847C8E0924693BF,"CD11b-positive, CD27-positive natural killer",,CL:0000623,1,cl,"CD11b-positive, CD27-positive natural killer",50,CT5847C8E0924693BF,True
2,2,CL:0000939,CTDEA6CA6446B38258,"CD16-positive, CD56-dim natural killer",,CL:0000623,1,cl,CD16+CD56+ NK,50,CTDBB7C729B7F40970,True
3,2,CL:0000938,CTBC1262C129A2D683,"CD16-negative, CD56-bright natural killer",,CL:0000623,1,cl,CD16-CD56bright NK,50,CT2751E190DAECAF00,True
4,2,CL:0000938,CT9761754DA14E9898,"CD16-negative, CD56-bright natural killer",,CL:0000623,1,cl,"CD16-negative, CD56-bright NK",50,CT2751E190DAECAF00,True


In [36]:
# First 50 distinct cell type labels known to the project
df_ct['lbl'].unique()[:50]

array(['CD4-positive, CD25-positive, CCR4-positive, alpha-beta regulatory T',
       'CD11b-positive, CD27-positive natural killer',
       'CD16-positive, CD56-dim natural killer',
       'CD16-negative, CD56-bright natural killer',
       'CD2-positive, CD5-positive, CD44-positive alpha-beta intraepithelial T',
       'CD24-positive, CD4 single-positive thymocyte',
       'CD24-positive, CD8 single-positive thymocyte',
       'CD25-positive, CD27-positive immature gamma-delta T',
       'CD27-high, CD11b-high natural killer',
       'CD27-high, CD11b-low natural killer',
       'CD27-low, CD11b-high natural killer',
       'CD27-low, CD11b-low immature natural killer',
       'CD27-negative gamma-delta T', 'CD27-positive gamma-delta T',
       'mature T',
       'CD4-intermediate, CD8-positive double-positive thymocyte',
       'CD4-negative CD8-negative gamma-delta intraepithelial T',
       'CD4-negative, CD8-negative type I NK T',
       'CD4-negative, CD8-negative type I NK T cel

In [37]:
# Cytokine metadata from the project library; the `sym`, `lbl` and `prefid`
# columns are the ones used by the matching cells below
df_ck = lib.get_entity_meta_data(lib.CYTOKINES)
df_ck.head()

Unnamed: 0,id,src,sym,lbl,spid,extid,priority,prefid,enabled
0,CKC13858A2B1F14B6A,ckr,FGF2,FGF2,1,CID_61,50,CKC4486E15652210FC,True
1,CKE1C384AC6D4F253C,ckr,G0S19-3,CCL3L2,1,CID_4,50,CKA949C45055283102,True
2,CK02F6DA228C64A2DE,ckr,IL 27,IL-27α,1,CID_120,50,CKC78D5F9D7031D9FD,True
3,CKFB6F75446966C774,ckr,LD78gamma,CCL3L2,1,CID_4,50,CKA949C45055283102,True
4,CK4FE9053AF4462C6F,ckr,SCYA3L2,CCL3L2,1,CID_4,50,CKA949C45055283102,True


In [38]:
# Distinct cytokine labels known to the project
df_ck['lbl'].unique()

array(['FGF2', 'CCL3L2', 'IL-27α', 'TGFα', 'CXCL8', 'IFN-1', 'IFN-AR1',
       'IFN-AR2', 'IFN-CDw119', 'IFN-GR2', 'IFN-IL-18', 'IFN-IL-28A',
       'IFN-IL-28B', 'IFN-IL-28a', 'IFN-IL-28b', 'IFN-IL-29', 'IFN-IL-6',
       'IFN-LR1', 'IFN-α', 'IFN-α1', 'IFN-α10', 'IFN-α13', 'IFN-α14',
       'IFN-α16', 'IFN-α17', 'IFN-α2', 'IFN-α21', 'IFN-α4', 'IFN-α5',
       'IFN-α6', 'IFN-α7', 'IFN-α8', 'IFN-β', 'IFN-β1', 'IFN-ε', 'IFN-γ',
       'IFN-κ', 'IFN-ω', 'IL-1', 'IL-1 theta', 'IL-10', 'IL-10RA',
       'IL-10RB', 'IL-11', 'IL-11RA', 'IL-12', 'IL-12A', 'IL-12B',
       'IL-12RB2', 'IL-12α', 'IL-13', 'IL-14', 'IL-15', 'IL-15Ra',
       'IL-16', 'IL-17', 'IL-17A', 'IL-17B', 'IL-17C', 'IL-17D', 'IL-17F',
       'IL-17a', 'IL-17b', 'IL-17c', 'IL-17d', 'IL-17f', 'IL-18',
       'IL-18BP', 'IL-18R1', 'IL-19', 'IL-1A', 'IL-1B', 'IL-1F10',
       'IL-1RA', 'IL-1α', 'IL-1β', 'IL-1δ', 'IL-1ε', 'IL-2', 'IL-20',
       'IL-20Ra', 'IL-20Rb', 'IL-21', 'IL-21R', 'IL-22', 'IL-22RA1',
       'IL-22RA2', 'IL

#### Match Cell Types

In [66]:
# Map immuneXpresso cell type labels to internal symbols.
# Each key must appear only once: a repeated key in a dict literal silently keeps
# just the last value.
m_ct = {
    'CD4-positive helper T cell': 'Th',
    'naive T cell': 'TN',
    'naive thymus-derived CD4-positive, alpha-beta T cell': 'TN',
    'regulatory T cell': 'Treg',
    'CD4-positive, CD25-positive, alpha-beta regulatory T cell': 'Treg',
    'CD8-positive, CD28-negative, alpha-beta regulatory T cell': 'Treg',
    'CD8-positive, CXCR3-positive, alpha-beta regulatory T cell': 'Treg',
    'induced T-regulatory cell': 'iTreg',
    'natural T-regulatory cell': 'nTreg',
    'CD8-positive, alpha-beta regulatory T cell': 'Treg',
    'CD4-positive, alpha-beta cytotoxic T cell': 'Tc',
    'CD8-positive, alpha-beta cytotoxic T cell': 'Tc',
    'cytotoxic T cell': 'Tc',
    # This label used to be listed twice ('effector T' here, 'TEFF' further down);
    # only 'TEFF' ever took effect, so that value is kept.
    # NOTE(review): the recorded output of this notebook lists 'effector T cell' among
    # the labels without a reference id, so 'TEFF' may not be a known project symbol —
    # verify whether 'effector T' was the intended target.
    'effector T cell': 'TEFF',
    'mature T cell': 'mature T',
    'pro-T cell': 'DN2 immature T',
    'early T lineage precursor': 'ThP',
    'CD4-positive, alpha-beta intraepithelial T cell': 'IEL',
    'CD8-alpha-beta-positive, alpha-beta intraepithelial T cell': 'IEL',
    'intraepithelial lymphocyte': 'IEL',
    'CD4-positive, alpha-beta memory T cell': 'TMEM',
    'CD8-positive, alpha-beta memory T cell': 'TMEM',
    'memory T cell': 'TMEM',
    'T follicular helper cell': 'Tfh',
    'T-helper 1 cell': 'Th1',
    'T-helper 17 cell': 'Th17',
    'T-helper 2 cell': 'Th2',
    'T-helper 22 cell': 'Th22',
    'T-helper 9 cell': 'Th9',
    'Tc1 cell': 'Tc1',
    'Tc17 cell': 'Tc17',
    'Tr1 cell': 'Treg1',
    'dendritic epidermal T cell': 'DETC',
    'CD8-positive, alpha-beta thymocyte': 'Thymocyte',
    'DN4 thymocyte': 'Thymocyte',
    'fetal thymocyte': 'Thymocyte',
    'double negative thymocyte': 'Thymocyte',
    'double-positive, alpha-beta thymocyte': 'Thymocyte',
    'mature CD4 single-positive thymocyte': 'Thymocyte',
    'thymocyte': 'Thymocyte',
    'effector memory CD4-positive, alpha-beta T cell': 'TEM',
    'effector memory CD8-positive, alpha-beta T cell': 'TEM',
    'gamma-delta T cell': 'γδT',
    'helper T cell': 'Th',
    'mature NK T cell': 'NKT'
}

In [67]:
# Show the first rows of the cell type metadata again
df_ct.head()

Unnamed: 0,depth,extid,id,lbl,parent,root,spid,src,sym,priority,prefid,enabled
0,4,CL:0001047,CT312FEE66DC3CC142,"CD4-positive, CD25-positive, CCR4-positive, al...",,CL:0000084,1,cl,CCR+ Treg,50,CT05DC4A067F2BDFCB,True
1,3,CL:0002426,CT5847C8E0924693BF,"CD11b-positive, CD27-positive natural killer",,CL:0000623,1,cl,"CD11b-positive, CD27-positive natural killer",50,CT5847C8E0924693BF,True
2,2,CL:0000939,CTDEA6CA6446B38258,"CD16-positive, CD56-dim natural killer",,CL:0000623,1,cl,CD16+CD56+ NK,50,CTDBB7C729B7F40970,True
3,2,CL:0000938,CTBC1262C129A2D683,"CD16-negative, CD56-bright natural killer",,CL:0000623,1,cl,CD16-CD56bright NK,50,CT2751E190DAECAF00,True
4,2,CL:0000938,CT9761754DA14E9898,"CD16-negative, CD56-bright natural killer",,CL:0000623,1,cl,"CD16-negative, CD56-bright NK",50,CT2751E190DAECAF00,True


In [41]:
#np.sort(dfr['cell_label'].unique())

In [42]:
#np.sort(df_ct['lbl'].unique())

In [68]:
# Symbols must be unique, otherwise the symbol -> preferred id lookup would be ambiguous
assert df_ct['sym'].is_unique
m_prefid = dict(zip(df_ct['sym'], df_ct['prefid']))
# Two-step translation: iX cell type label -> known symbol -> preferred id
cell_symbols = dfr['cell_label'].map(m_ct)
dfr['cell_ref_id'] = cell_symbols.map(m_prefid)
dfr.dropna(subset=['cell_ref_id']).head()

Unnamed: 0,actor,category,cell_id,cell_label,cytokine_id,cytokine_label,num_papers,score,cell_ref_id,cytokine_ref_id
13,cell,Positive,CL_0000545,T-helper 1 cell,CID_83,IFNG,88,5.01,CT4EC75AEE8E7F5A03,CKEF3883BE9F74024F
21,cytokine,Positive,CL_0000792,"CD4-positive, CD25-positive, alpha-beta regula...",CID_110,IL2,60,3.12,CT3FE60C3FE8E2D0E6,CKA3BA887803CF8CE7
22,cell,Positive,CL_0000546,T-helper 2 cell,CID_131,IL4,59,15.88,CT3CFB613D270E6C62,CKF0F9E60D7241467D
26,cell,Positive,CL_0000792,"CD4-positive, CD25-positive, alpha-beta regula...",CID_93,IL10,57,7.93,CT3FE60C3FE8E2D0E6,CK8F843985B9BF76B8
27,cytokine,Positive,CL_0000545,T-helper 1 cell,CID_95,IL12,56,11.77,CT4EC75AEE8E7F5A03,CKDE858D3F63FD2E03


In [69]:
# Non-null counts show how many records received a cell reference id
dfr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1245 entries, 0 to 6045
Data columns (total 10 columns):
actor              1245 non-null object
category           1245 non-null object
cell_id            1245 non-null object
cell_label         1245 non-null object
cytokine_id        1245 non-null object
cytokine_label     1245 non-null object
num_papers         1245 non-null int64
score              1245 non-null float64
cell_ref_id        685 non-null object
cytokine_ref_id    1245 non-null object
dtypes: float64(1), int64(1), object(8)
memory usage: 107.0+ KB


In [70]:
# Cell type labels that did not resolve to a project reference id
no_cell_ref = dfr['cell_ref_id'].isnull()
dfr.loc[no_cell_ref, 'cell_label'].unique()

array(['T cell', 'CD8-positive, alpha-beta T cell',
       'CD4-positive, alpha-beta T cell', 'effector T cell',
       'plasmacytoid dendritic cell',
       'thymic plasmacytoid dendritic cell',
       'CD11c-low plasmacytoid dendritic cell'], dtype=object)

#### Match Cytokines

In [45]:
# cts = df_ck.groupby(['src', 'sym'])['lbl'].nunique()
# cts[cts > 1]

In [71]:
# Look up how one symbol is recorded in the cytokine metadata
# (the commented lines are alternative lookups)
#df_ck[df_ck['sym'] == 'IFN-L2']
#df_ck[df_ck['sym'] == 'LT-α']
#df_ck[df_ck['sym'] == 'Monocyte-CSF']
df_ck[df_ck['sym'] == 'IL-1RN']
#df_ck[df_ck['lbl'] == 'CXCL7']

Unnamed: 0,id,src,sym,lbl,spid,extid,priority,prefid,enabled
2916,CKE72725973F3BAC8F,ckr,IL-1RN,IL-1RA,1,CID_86,50,CK5898940A9125FE21,True


In [72]:
# Map iX cytokine labels to project cytokine labels.
# m_ck starts with manual overrides; the loop below adds every other label.
m_ck = {'Monocyte-CSF': 'CSF1'}
dfr_ck = dfr['cytokine_label'].unique()
prj_ck = df_ck['lbl'].unique()
# Set copy of the project labels for direct membership tests
prj_ck_set = set(prj_ck)
# Upper-cased symbol -> label lookup used for case-insensitive matching
prj_ck_m = df_ck.groupby('sym')['lbl'].min().to_dict()
prj_ck_m = {k.upper():v for k, v in prj_ck_m.items()}

unmapped_cytokines = []
for ck in dfr_ck:
    if ck in m_ck:
        continue
    # iX writes interleukins without a hyphen (e.g. "IL2"); project labels use "IL-2"
    sym = 'IL-' + ck[2:] if ck.startswith('IL') else ck
    # The lookup keys are upper-cased, so the probe has to be upper-cased as well
    sym_upper = sym.upper()
    if sym in prj_ck_set:
        m_ck[ck] = sym
    elif sym_upper in prj_ck_m:
        m_ck[ck] = prj_ck_m[sym_upper]
    else:
        unmapped_cytokines.append(ck)

# Every iX cytokine must be resolvable before reference ids are assigned
assert len(unmapped_cytokines) == 0, 'Found the following unmapped cytokines:\n{}'.format(sorted(unmapped_cytokines))

In [73]:
#np.sort(dfr['cytokine_label'].unique())
#np.sort(df_ck['lbl'].unique())
#u'\N{GREEK SMALL LETTER GAMMA}'
#df_ck[df_ck['lbl'] == 'IFN-α']
#df_ck[df_ck['lbl'] == 'IFN-γ']

In [74]:
# Symbols must be unique, otherwise the symbol -> preferred id lookup would be ambiguous
assert df_ck['sym'].is_unique
m_prefid = dict(zip(df_ck['sym'], df_ck['prefid']))
# Two-step translation: iX cytokine label -> known symbol -> preferred id
cytokine_symbols = dfr['cytokine_label'].map(m_ck)
dfr['cytokine_ref_id'] = cytokine_symbols.map(m_prefid)
# Unlike cell types, every cytokine is required to resolve
assert dfr['cytokine_ref_id'].notnull().all()
dfr.head()

Unnamed: 0,actor,category,cell_id,cell_label,cytokine_id,cytokine_label,num_papers,score,cell_ref_id,cytokine_ref_id
0,cell,Positive,CL_0000084,T cell,CID_83,IFNG,809,4.28,,CKEF3883BE9F74024F
1,cell,Positive,CL_0000084,T cell,CID_110,IL2,701,5.3,,CKA3BA887803CF8CE7
2,cell,Positive,CL_0000625,"CD8-positive, alpha-beta T cell",CID_83,IFNG,338,8.88,,CKEF3883BE9F74024F
3,cytokine,Positive,CL_0000084,T cell,CID_110,IL2,313,1.58,,CKA3BA887803CF8CE7
4,cytokine,Unknown,CL_0000084,T cell,CID_110,IL2,260,4.52,,CKA3BA887803CF8CE7


In [75]:
# Column types and non-null counts after both reference id columns were added
dfr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1245 entries, 0 to 6045
Data columns (total 10 columns):
actor              1245 non-null object
category           1245 non-null object
cell_id            1245 non-null object
cell_label         1245 non-null object
cytokine_id        1245 non-null object
cytokine_label     1245 non-null object
num_papers         1245 non-null int64
score              1245 non-null float64
cell_ref_id        685 non-null object
cytokine_ref_id    1245 non-null object
dtypes: float64(1), int64(1), object(8)
memory usage: 107.0+ KB


## EDA

In [76]:
# Number of records per actor value
dfr['actor'].value_counts()

cytokine    1054
cell         191
Name: actor, dtype: int64

In [77]:
# Number of records per (actor, category) combination
dfr.groupby(['actor', 'category']).size()

actor     category
cell      Positive    191
cytokine  Negative    271
          Positive    422
          Unknown     361
dtype: int64

In [78]:
# Positive relations whose actor is the cytokine, most-cited first
is_cytokine_actor = dfr['actor'] == 'cytokine'
is_positive = dfr['category'] == 'Positive'
dft = dfr[is_cytokine_actor & is_positive].sort_values('num_papers', ascending=False)
# Top entries for T-helper 2 cells
dft[dft['cell_label'] == 'T-helper 2 cell'].head(15)

Unnamed: 0,actor,category,cell_id,cell_label,cytokine_id,cytokine_label,num_papers,score,cell_ref_id,cytokine_ref_id
58,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_131,IL4,31,5.47,CT3CFB613D270E6C62,CKF0F9E60D7241467D
153,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_128,IL33,11,13.53,CT3CFB613D270E6C62,CK621B34672BE37D14
184,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_93,IL10,9,1.47,CT3CFB613D270E6C62,CK8F843985B9BF76B8
277,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_83,IFNG,5,0.33,CT3CFB613D270E6C62,CKEF3883BE9F74024F
310,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_118,IL25,5,30.31,CT3CFB613D270E6C62,CK52B8D587BB3E5BB9
387,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_108,IL18,4,3.91,CT3CFB613D270E6C62,CK3F77E7B46E9E491A
495,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_133,IL6,3,0.44,CT3CFB613D270E6C62,CKF8C95AB8BB638BAC
456,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_20,CCL2,3,1.15,CT3CFB613D270E6C62,CK73ED9A7EF785943D
406,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_95,IL12,3,0.91,CT3CFB613D270E6C62,CKDE858D3F63FD2E03
413,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_162,TGFB,3,0.48,CT3CFB613D270E6C62,CKFD4CA0B2B4BC3AE4


## Export

In [79]:
# The matching steps only add columns, so the row count must equal the count
# recorded right after loading
assert len(dfr) == n_init
# Write the matched records to the final data file (without the index column)
dfr.to_csv(DATA_FILE, index=False)
DATA_FILE

'/lab/repos/t-cell-relation-extraction/data/supervision/immunexpresso/data.csv'