In [1]:
from urllib.parse import urlencode, quote_plus
import os.path as osp
import requests
import pandas as pd
import numpy as np
import tqdm
# Execute the project scripts in this namespace. SUPERVISION_DATA_DIR, CELL_TYPES,
# CYTOKINES and get_entity_meta_data are not defined in this notebook and are
# presumably provided by these scripts.
%run env.py
%run src/lib.py
# Intermediate file: raw immuneXpresso relations as downloaded (reloaded by the matching section)
TEMP_FILE = osp.join(SUPERVISION_DATA_DIR, 'immunexpresso', 'import.csv')
# Final file: relations annotated with project cell type / cytokine reference IDs
DATA_FILE = osp.join(SUPERVISION_DATA_DIR, 'immunexpresso', 'data.csv')

### Import

#### Collect Cell Types 

In [188]:
def query(terms):
    """Search the immuneXpresso lexicon for entries matching a query expression.

    Args:
        terms: Query expression sent as the value of the ``q`` URL parameter,
            e.g. '"effector T"' or '"T cell" OR (T AND cell)'.

    Returns:
        pandas.DataFrame built from the ``response.docs`` entry of the JSON reply.

    Raises:
        requests.HTTPError: If the service answers with an error status code.
    """
    url_format = 'http://immuneexpresso.org/immport-immunexpresso/search/ix_lexicon/synonym-search-trim?rows=2000&bf=product(frequency,preferred)&q={}&_=1555322107128'
    # The template already contains "q=", so only the VALUE is encoded here.
    # urlencode({'q': terms}) would produce "q=<value>" and yield "...&q=q=<value>".
    encoded_terms = quote_plus(terms)
    url = url_format.format(encoded_terms)
    r = requests.get(url)
    # Surface HTTP errors directly instead of failing later while decoding the body
    r.raise_for_status()
    return pd.DataFrame(r.json()['response']['docs'])

In [194]:
# Spot-check a single phrase query against the lexicon search
query('"effector T"')

'CD8-positive, alpha-beta cytokine secreting effector T cell'

In [199]:
# Lexicon search expressions, one request per entry
queries = [
    '"T cell" OR (T AND cell)',
    '"T-helper"', '"gamma delta"', '"Treg"',
    '"cytotoxic T"', '"regulatory T"',
    '"natural killer T"', '"NKT"', '"TFH"',
    '"memory T"', '"effector T"',
    '"effector memory"', '"central memory"',
]
# Stack the per-query results, then keep one row per distinct lexicon entry
# (the same entry can be returned by several queries)
df = (
    pd.concat(list(map(query, queries)))
    .drop_duplicates(subset=['name', 'preferred', 'synonym', 'term_id', 'type'])
)
df.head()

Unnamed: 0,frequency,id,name,preferred,score,synonym,term_id,type
0,379437,301,T cell,1,16194.714,T cell,CL_0000084,CELL
1,88514,9779,mature T cell,1,3778.1338,mature T cell,CL_0002419,CELL
2,65426,2399,alpha-beta T cell,1,2792.9727,alpha-beta T cell,CL_0000789,CELL
3,63953,2431,mature alpha-beta T cell,1,2730.1199,mature alpha-beta T cell,CL_0000791,CELL
4,59986,1779,"CD4-positive, alpha-beta T cell",1,2560.8486,"CD4-positive, alpha-beta T cell",CL_0000624,CELL


In [200]:
# Column dtypes and non-null counts of the de-duplicated lexicon matches
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 195 entries, 0 to 2
Data columns (total 8 columns):
frequency    195 non-null int64
id           195 non-null object
name         195 non-null object
preferred    195 non-null int64
score        195 non-null float64
synonym      195 non-null object
term_id      195 non-null object
type         195 non-null object
dtypes: float64(1), int64(2), object(5)
memory usage: 13.7+ KB


In [201]:
# Distinct synonyms returned across all lexicon queries
df['synonym'].unique()

array(['T cell', 'mature T cell', 'alpha-beta T cell',
       'mature alpha-beta T cell', 'CD4-positive, alpha-beta T cell',
       'CD8-positive, alpha-beta T cell', 'effector T cell',
       'cytotoxic T cell', 'regulatory T cell',
       'CD4-positive, CD25-positive, alpha-beta regulatory T cell',
       'helper T cell', 'CD8-positive, alpha-beta cytotoxic T cell',
       'immature T cell', 'CD4-positive helper T cell', 'naive T cell',
       'memory T cell', 'gamma-delta T cell',
       'CD4-positive, alpha-beta intraepithelial T cell',
       'mature NK T cell', 'alpha-beta intraepithelial T cell',
       'Anaplastic large cell lymphoma, T cell and Null cell type',
       'naive thymus-derived CD4-positive, alpha-beta T cell',
       'CD8-positive, alpha-beta memory T cell',
       'T follicular helper cell',
       'CD8-positive, alpha-beta regulatory T cell',
       'CD4-positive, alpha-beta memory T cell',
       'mature gamma-delta T cell', 'gamma-delta intraepithelial T cell'

In [202]:
# Number of matched lexicon entries per entity type
df['type'].value_counts()

CELL        160
CYTOKINE     25
DISEASE      10
Name: type, dtype: int64

#### Collect Relation Data

In [203]:
# XML request template posted to the immuneXpresso REST servlet by get_ix_data.
# The single `{}` placeholder (inside <cell>) is filled with one cell type ID per request.
REQ_FMT="""<request type='data'>
<knowledgeSpace id='interactions'>
<filters>
<cell expand='true'>{}</cell>
<add_functions>true</add_functions>
</filters>
<paging start='0' limit='1000000' />
<sort by='numPapers' dir='desc'/>
</knowledgeSpace>
</request>
"""

# Example response (from post to http://immuneexpresso.org/immport-immunexpresso/rest/RESTServlet):
# {'status': {'statusCode': 1},
#  'totalNumElements': 299,
#  'start': 0,
#  'limit': 10,
#  'dataElements': [{'score': '0.08',
#    'numPapers': 88,
#    'enrichmentScore': '5.01',
#    'actor': 'cell',
#    'cellOntologyNodeId': 'CL_0000545',
#    'cellOntologyNodeLabel': 'T-helper 1 cell',
#    'cytokineOntologyNodeId': 'CID_83',
#    'cytokineOntologyNodeLabel': 'IFNG',
#    'verbCategory': 'Positive',
#    'functions': []}, 
#     ... ]
# }

def parse(res):
    """Flatten a decoded immuneXpresso interactions response into a DataFrame.

    Args:
        res: Decoded JSON response. Records are read from its 'dataElements'
            entry when that key is present.

    Returns:
        pandas.DataFrame with one row per data element and the columns cell_id,
        cell_label, cytokine_id, cytokine_label, category, score, num_papers and
        actor. An empty DataFrame is returned when 'dataElements' is absent.

    Raises:
        KeyError: If a data element lacks one of the expected keys.
    """
    # (output column, key in each response element), in output order
    field_map = (
        ('cell_id', 'cellOntologyNodeId'),
        ('cell_label', 'cellOntologyNodeLabel'),
        ('cytokine_id', 'cytokineOntologyNodeId'),
        ('cytokine_label', 'cytokineOntologyNodeLabel'),
        ('category', 'verbCategory'),
        ('score', 'enrichmentScore'),
        ('num_papers', 'numPapers'),
        ('actor', 'actor'),
    )
    elements = res['dataElements'] if 'dataElements' in res else []
    rows = [{column: element[key] for column, key in field_map} for element in elements]
    return pd.DataFrame(rows)

def get_ix_data(cell_type_ids, max_failures=5):
    """Download immuneXpresso interaction records for a collection of cell types.

    One request (REQ_FMT filled with the ID) is posted per cell type ID; the
    responses are converted with parse() and all non-empty results are stacked.

    Args:
        cell_type_ids: Iterable of cell type IDs (e.g. 'CL_0000084').
        max_failures: Number of failed IDs tolerated. The exception of the
            failure that reaches this count is re-raised.

    Returns:
        pandas.DataFrame with the concatenated records (per-request indexes are
        kept, so index labels repeat). An empty DataFrame is returned when no
        request produced any record.

    Raises:
        Exception: The error of the last failed request once max_failures is reached.
    """
    dfs = []
    n_fail = 0
    url = 'http://immuneexpresso.org/immport-immunexpresso/rest/RESTServlet'
    for cid in tqdm.tqdm_notebook(cell_type_ids):
        try:
            request = REQ_FMT.format(cid)
            res = requests.post(url, data={'request': request})
            # Count HTTP error responses as failures instead of parsing their body
            res.raise_for_status()
            df = parse(res.json())
            if len(df) > 0:
                dfs.append(df)
        # `Exception` rather than a bare except, so a kernel interrupt (KeyboardInterrupt)
        # still aborts the download instead of being counted as a failed ID
        except Exception as e:
            n_fail += 1
            print('Failure occurred for cell type id "{}" (ignoring): {!r}'.format(cid, e))
            if n_fail >= max_failures:
                print('Num failures exceeds max ({})'.format(max_failures))
                raise
    # pd.concat raises ValueError when given an empty list
    if not dfs:
        return pd.DataFrame()
    return pd.concat(dfs)

# Single-ID smoke test (disabled): get_ix_data(['CL_0000084'])
# Only lexicon entries typed as CELL are used to request interactions
is_cell = df['type'] == 'CELL'
cell_type_ids = df.loc[is_cell, 'term_id'].unique()
dfi = get_ix_data(cell_type_ids).drop_duplicates()
dfi.head()

HBox(children=(IntProgress(value=0, max=108), HTML(value='')))

Unnamed: 0,actor,category,cell_id,cell_label,cytokine_id,cytokine_label,num_papers,score
0,cell,Positive,CL_0000084,T cell,CID_83,IFNG,809,4.28
1,cell,Positive,CL_0000084,T cell,CID_110,IL2,701,5.3
2,cell,Positive,CL_0000625,"CD8-positive, alpha-beta T cell",CID_83,IFNG,338,8.88
3,cytokine,Positive,CL_0000084,T cell,CID_110,IL2,313,1.58
4,cytokine,Unknown,CL_0000084,T cell,CID_110,IL2,260,4.52


In [204]:
# Column dtypes and non-null counts of the downloaded relations
dfi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6246 entries, 0 to 9
Data columns (total 8 columns):
actor             6246 non-null object
category          6246 non-null object
cell_id           6246 non-null object
cell_label        6246 non-null object
cytokine_id       6246 non-null object
cytokine_label    6246 non-null object
num_papers        6246 non-null int64
score             6246 non-null object
dtypes: int64(1), object(7)
memory usage: 439.2+ KB


In [205]:
# Save the raw download so the matching section can reload it after a kernel restart.
# TEMP_FILE ('import.csv') is a different path than DATA_FILE ('data.csv'), so this
# does not overwrite the final results.
dfi.to_csv(TEMP_FILE, index=False)
TEMP_FILE

'/Users/eczech/repos/hammer/t-cell-relation-extraction/pm_subtype_protein_relations/data/immunexpresso/data.csv'

### Matching

Match cell types and cytokines to those in this project:

#### Load Project Meta Data

In [2]:
# Reload the saved relations and drop exact duplicate rows
dfr = pd.read_csv(TEMP_FILE)
dfr = dfr.drop_duplicates()
# Baseline row count; checked again before export to ensure no records were lost
n_init = dfr.shape[0]
dfr.head()

Unnamed: 0,actor,category,cell_id,cell_label,cytokine_id,cytokine_label,num_papers,score,cell_ref_id,cytokine_ref_id
0,cell,Positive,CL_0000084,T cell,CID_83,IFNG,809,4.28,,CK1B71668FDDECE3CF
1,cell,Positive,CL_0000084,T cell,CID_110,IL2,701,5.3,,CK4D687600656CECF1
2,cell,Positive,CL_0000625,"CD8-positive, alpha-beta T cell",CID_83,IFNG,338,8.88,,CK1B71668FDDECE3CF
3,cytokine,Positive,CL_0000084,T cell,CID_110,IL2,313,1.58,,CK4D687600656CECF1
4,cytokine,Unknown,CL_0000084,T cell,CID_110,IL2,260,4.52,,CK4D687600656CECF1


In [3]:
# Project cell type metadata. get_entity_meta_data and CELL_TYPES are not defined in
# this notebook (presumably provided by the %run scripts in the first cell).
df_ct = get_entity_meta_data(CELL_TYPES)
df_ct.head()

Unnamed: 0,sym,lbl,spid,src,extid,id,prefid
0,DETC,DETC,1,manual,,CT231343DE336492F1,CT231343DE336492F1
1,DETCs,DETC,1,manual,,CT4CF1E0FFE26A395F,CT231343DE336492F1
2,Dendritic-epidermal-T,DETC,1,manual,,CTAD8B4B3EE6ECB6BB,CT231343DE336492F1
3,Dendritic epidermal T,DETC,1,manual,,CT728A6AA916C71387,CT231343DE336492F1
4,DendriticepidermalT,DETC,1,manual,,CTE89D7A3B2E8FAACE,CT231343DE336492F1


In [4]:
# Internal cell type labels available as mapping targets
df_ct['lbl'].unique()

array(['DETC', 'IEL', 'MAIT', 'NKT', 'TCM', 'TEFF', 'TEM', 'TEMRA',
       'TMEM', 'TN', 'Tc', 'Tc0', 'Tc1', 'Tc17', 'Tc2', 'Tc22', 'Tc3',
       'Tc9', 'Tfh', 'Tfh0like', 'Tfh17like', 'Tfh1like', 'Tfh22like',
       'Tfh2like', 'Tfh3like', 'Tfh9like', 'Tfreg', 'Th', 'Th0', 'Th1',
       'Th17', 'Th2', 'Th22', 'Th3', 'Th9', 'ThP', 'Thymocyte', 'Treg',
       'Treg1', 'Treg17', 'Trm', 'Tscm', 'Tsupp', 'iTreg', 'nTreg',
       'pTreg', 'γδT', 'γδT-17', 'γδT-TCS1', 'γδT-Vγ1', 'γδT-Vγ4',
       'γδT-Vγ9', 'γδT-Vγ9Vδ2', 'γδT-Vδ1', 'γδT-Vδ2'], dtype=object)

In [5]:
# Project cytokine metadata. get_entity_meta_data and CYTOKINES are not defined in
# this notebook (presumably provided by the %run scripts in the first cell).
df_ck = get_entity_meta_data(CYTOKINES)
df_ck.head()

Unnamed: 0,id,src,sym,lbl,spid,extid,prefid
0,CK16DB62C0EE32FCFC,manual,4-1BB,4-1BB,1,,CK16DB62C0EE32FCFC
1,CKD6A1F8500CF5BB6C,manual,TNFSF9,4-1BB,1,,CK16DB62C0EE32FCFC
2,CK66480A09B13AD79D,cameron,4-1BBL,4-1BBL,1,,CK66480A09B13AD79D
3,CK3C8B23133684A426,cameron,APRIL,APRIL,1,,CK3C8B23133684A426
4,CKA1A67F50E3346A8F,cameron,TALL-2,APRIL,1,,CK3C8B23133684A426


In [6]:
# Internal cytokine labels available as mapping targets
df_ck['lbl'].unique()

array(['4-1BB', '4-1BBL', 'APRIL', 'CCL1', 'CCL11', 'CCL13', 'CCL14',
       'CCL15', 'CCL16', 'CCL17', 'CCL18', 'CCL19', 'CCL2', 'CCL20',
       'CCL21', 'CCL22', 'CCL23', 'CCL24', 'CCL25', 'CCL26', 'CCL27',
       'CCL28', 'CCL3', 'CCL3L1', 'CCL3L3', 'CCL4', 'CCL4L1', 'CCL4L2',
       'CCL5', 'CCL6', 'CCL7', 'CCL8', 'CCL9', 'CD153', 'CD154', 'CD178',
       'CD258', 'CD40LG', 'CD70', 'CKLF', 'CSF1', 'CSF2', 'CSF3',
       'CX3CL1', 'CXCL1', 'CXCL10', 'CXCL11', 'CXCL12', 'CXCL13',
       'CXCL14', 'CXCL16', 'CXCL17', 'CXCL2', 'CXCL3', 'CXCL4', 'CXCL5',
       'CXCL6', 'CXCL7', 'CXCL8', 'CXCL9', 'EBI3', 'EGF', 'EPO', 'FGF1',
       'FGF2', 'FP248', 'G-CSF', 'GDF15', 'GITRL', 'GM-CSF', 'IFN-1',
       'IFN-IL-28A', 'IFN-IL-28B', 'IFN-IL-6', 'IFN-α', 'IFN-α1',
       'IFN-α2', 'IFN-α6', 'IFN-β', 'IFN-β1', 'IFN-γ', 'IL-1', 'IL-10',
       'IL-11', 'IL-12', 'IL-12A', 'IL-12B', 'IL-12α', 'IL-13', 'IL-14',
       'IL-15', 'IL-16', 'IL-17', 'IL-17A', 'IL-17B', 'IL-17C', 'IL-17D',
       'IL-1

#### Match Cell Types

In [7]:
# Map immuneXpresso cell type labels (values of dfr['cell_label']) to internal
# labels (values of df_ct['lbl']). Several immuneXpresso labels may share one
# internal label. Labels that are not listed here (e.g. the generic 'T cell')
# get no cell_ref_id when this dict is applied with Series.map.
m_ct = {
    'CD4-positive helper T cell': 'Th',
    'naive T cell': 'TN',
    'naive thymus-derived CD4-positive, alpha-beta T cell': 'TN',
    'regulatory T cell': 'Treg',
    'CD4-positive, CD25-positive, alpha-beta regulatory T cell': 'Treg',
    'CD8-positive, CD28-negative, alpha-beta regulatory T cell': 'Treg',
    'CD8-positive, CXCR3-positive, alpha-beta regulatory T cell': 'Treg',
    'induced T-regulatory cell': 'iTreg',
    'natural T-regulatory cell': 'nTreg',
    'CD8-positive, alpha-beta regulatory T cell': 'Treg',
    'CD4-positive, alpha-beta cytotoxic T cell': 'Tc',
    'CD8-positive, alpha-beta cytotoxic T cell': 'Tc',
    'cytotoxic T cell': 'Tc',
    'CD4-positive, alpha-beta intraepithelial T cell': 'IEL',
    'CD8-alpha-beta-positive, alpha-beta intraepithelial T cell': 'IEL',
    'intraepithelial lymphocyte': 'IEL',
    'CD4-positive, alpha-beta memory T cell': 'TMEM',
    'CD8-positive, alpha-beta memory T cell': 'TMEM',
    'memory T cell': 'TMEM',
    'T follicular helper cell': 'Tfh',
    'T-helper 1 cell': 'Th1',
    'T-helper 17 cell': 'Th17',
    'T-helper 2 cell': 'Th2',
    'T-helper 22 cell': 'Th22',
    'T-helper 9 cell': 'Th9',
    'Tc1 cell': 'Tc1',
    'Tc17 cell': 'Tc17',
    'Tr1 cell': 'Treg1',
    'dendritic epidermal T cell': 'DETC',
    'CD8-positive, alpha-beta thymocyte': 'Thymocyte',
    'DN4 thymocyte': 'Thymocyte',
    'fetal thymocyte': 'Thymocyte',
    'double negative thymocyte': 'Thymocyte',
    'double-positive, alpha-beta thymocyte': 'Thymocyte',
    'mature CD4 single-positive thymocyte': 'Thymocyte',
    'thymocyte': 'Thymocyte',
    'effector T cell': 'TEFF',
    'effector memory CD4-positive, alpha-beta T cell': 'TEM',
    'effector memory CD8-positive, alpha-beta T cell': 'TEM',
    'gamma-delta T cell': 'γδT',
    'helper T cell': 'Th',
    'mature NK T cell': 'NKT'
}

In [8]:
# immuneXpresso cell labels present in the relations (the possible keys of m_ct)
np.sort(dfr['cell_label'].unique())

array(['CD11c-low plasmacytoid dendritic cell',
       'CD4-positive helper T cell',
       'CD4-positive, CD25-positive, alpha-beta regulatory T cell',
       'CD4-positive, alpha-beta T cell',
       'CD4-positive, alpha-beta cytotoxic T cell',
       'CD4-positive, alpha-beta intraepithelial T cell',
       'CD4-positive, alpha-beta memory T cell',
       'CD8-alpha-beta-positive, alpha-beta intraepithelial T cell',
       'CD8-positive, CD28-negative, alpha-beta regulatory T cell',
       'CD8-positive, CXCR3-positive, alpha-beta regulatory T cell',
       'CD8-positive, alpha-beta T cell',
       'CD8-positive, alpha-beta cytotoxic T cell',
       'CD8-positive, alpha-beta memory T cell',
       'CD8-positive, alpha-beta regulatory T cell',
       'CD8-positive, alpha-beta thymocyte', 'DN4 thymocyte', 'T cell',
       'T follicular helper cell', 'T-helper 1 cell', 'T-helper 17 cell',
       'T-helper 2 cell', 'T-helper 22 cell', 'T-helper 9 cell',
       'Tc1 cell', 'Tc17 cell', '

In [9]:
# Internal cell type labels, sorted (the possible values of m_ct)
np.sort(df_ct['lbl'].unique())

array(['DETC', 'IEL', 'MAIT', 'NKT', 'TCM', 'TEFF', 'TEM', 'TEMRA',
       'TMEM', 'TN', 'Tc', 'Tc0', 'Tc1', 'Tc17', 'Tc2', 'Tc22', 'Tc3',
       'Tc9', 'Tfh', 'Tfh0like', 'Tfh17like', 'Tfh1like', 'Tfh22like',
       'Tfh2like', 'Tfh3like', 'Tfh9like', 'Tfreg', 'Th', 'Th0', 'Th1',
       'Th17', 'Th2', 'Th22', 'Th3', 'Th9', 'ThP', 'Thymocyte', 'Treg',
       'Treg1', 'Treg17', 'Trm', 'Tscm', 'Tsupp', 'iTreg', 'nTreg',
       'pTreg', 'γδT', 'γδT-17', 'γδT-TCS1', 'γδT-Vγ1', 'γδT-Vγ4',
       'γδT-Vγ9', 'γδT-Vγ9Vδ2', 'γδT-Vδ1', 'γδT-Vδ2'], dtype=object)

In [10]:
# Internal label -> internal ID dict, built from the rows whose own ID is the preferred ID
ct_preferred = df_ct[df_ct['id'] == df_ct['prefid']]
ct_label_to_id = ct_preferred.set_index('lbl')['id'].to_dict()
# immuneXpresso label -> internal label -> internal ID (null when either lookup misses)
ct_internal_labels = dfr['cell_label'].map(m_ct)
dfr['cell_ref_id'] = ct_internal_labels.map(ct_label_to_id)
dfr.loc[dfr['cell_ref_id'].notnull()].head()

Unnamed: 0,actor,category,cell_id,cell_label,cytokine_id,cytokine_label,num_papers,score,cell_ref_id,cytokine_ref_id
13,cell,Positive,CL_0000545,T-helper 1 cell,CID_83,IFNG,88,5.01,CTC3A8C3CBC245616A,CK1B71668FDDECE3CF
21,cytokine,Positive,CL_0000792,"CD4-positive, CD25-positive, alpha-beta regula...",CID_110,IL2,60,3.12,CTB574584AD019ABB8,CK4D687600656CECF1
22,cell,Positive,CL_0000546,T-helper 2 cell,CID_131,IL4,59,15.88,CTE7B12DC660323A0E,CKAD8334A7AD4B4D2F
26,cell,Positive,CL_0000792,"CD4-positive, CD25-positive, alpha-beta regula...",CID_93,IL10,57,7.93,CTB574584AD019ABB8,CK3A3EC71E6A0C53F0
27,cytokine,Positive,CL_0000545,T-helper 1 cell,CID_95,IL12,56,11.77,CTC3A8C3CBC245616A,CK04FD0805168B608B


In [11]:
# Non-null count of cell_ref_id shows how many relations were matched to a project cell type
dfr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1245 entries, 0 to 6045
Data columns (total 10 columns):
actor              1245 non-null object
category           1245 non-null object
cell_id            1245 non-null object
cell_label         1245 non-null object
cytokine_id        1245 non-null object
cytokine_label     1245 non-null object
num_papers         1245 non-null int64
score              1245 non-null float64
cell_ref_id        713 non-null object
cytokine_ref_id    1245 non-null object
dtypes: float64(1), int64(1), object(8)
memory usage: 107.0+ KB


#### Match Cytokines

In [12]:
# cts = df_ck.groupby(['src', 'sym'])['lbl'].nunique()
# cts[cts > 1]

In [13]:
# Ad-hoc inspection of individual project symbols/labels; the commented lines are
# alternative lookups (presumably used while resolving unmatched cytokines)
#df_ck[df_ck['sym'] == 'IFN-L2']
#df_ck[df_ck['sym'] == 'LT-α']
#df_ck[df_ck['sym'] == 'Monocyte-CSF']
df_ck[df_ck['sym'] == 'IL-1RN']
#df_ck[df_ck['lbl'] == 'CXCL7']

Unnamed: 0,id,src,sym,lbl,spid,extid,prefid
1013,CKA31BE3DE2FF31BDC,manual,IL-1RN,IL-1RA,1,,CKA95B181F6273B58D


In [14]:
# immuneXpresso cytokine label -> internal cytokine label.
# Seeded with manual overrides; the loop below adds every label it can resolve.
m_ck = {'Monocyte-CSF': 'CSF1'}
dfr_ck = dfr['cytokine_label'].unique()
prj_ck = df_ck['lbl'].unique()
# Set for membership tests (avoids an element-wise array comparison per label)
prj_ck_lbls = set(prj_ck)
# Upper-cased project symbol -> project label (smallest label when a symbol has several)
prj_ck_m = df_ck.groupby('sym')['lbl'].min().to_dict()
prj_ck_m = {k.upper():v for k, v in prj_ck_m.items()}

unmapped_cytokines = []
for ck in dfr_ck:
    if ck in m_ck:
        continue
    sym = ck
    # immuneXpresso writes interleukins without a hyphen (e.g. IL2); project labels use IL-2
    if ck.startswith('IL'):
        sym = 'IL-' + ck[2:]
    # The keys of prj_ck_m are upper-cased, so the lookup key has to be upper-cased
    # too; otherwise a label containing lower-case letters can never match a symbol
    sym_key = sym.upper()
    if sym in prj_ck_lbls:
        m_ck[ck] = sym
    elif sym_key in prj_ck_m:
        m_ck[ck] = prj_ck_m[sym_key]
    else:
        unmapped_cytokines.append(ck)

assert len(unmapped_cytokines) == 0, 'Found the following unmapped cytokines:\n{}'.format(sorted(unmapped_cytokines))

In [15]:
#np.sort(dfr['cytokine_label'].unique())
#np.sort(df_ck['lbl'].unique())
#u'\N{GREEK SMALL LETTER GAMMA}'
#df_ck[df_ck['lbl'] == 'IFN-α']
#df_ck[df_ck['lbl'] == 'IFN-γ']

In [16]:
# Internal label -> internal ID dict, built from the rows whose own ID is the preferred ID
ck_preferred = df_ck[df_ck['id'] == df_ck['prefid']]
ck_label_to_id = ck_preferred.set_index('lbl')['id'].to_dict()
# immuneXpresso label -> internal label -> internal ID
ck_internal_labels = dfr['cytokine_label'].map(m_ck)
dfr['cytokine_ref_id'] = ck_internal_labels.map(ck_label_to_id)
# Unlike cell types, every cytokine is required to resolve to a project ID
assert dfr['cytokine_ref_id'].notnull().all()
dfr.head()

Unnamed: 0,actor,category,cell_id,cell_label,cytokine_id,cytokine_label,num_papers,score,cell_ref_id,cytokine_ref_id
0,cell,Positive,CL_0000084,T cell,CID_83,IFNG,809,4.28,,CK128EC732B281BF60
1,cell,Positive,CL_0000084,T cell,CID_110,IL2,701,5.3,,CKD21834C7D75FD77F
2,cell,Positive,CL_0000625,"CD8-positive, alpha-beta T cell",CID_83,IFNG,338,8.88,,CK128EC732B281BF60
3,cytokine,Positive,CL_0000084,T cell,CID_110,IL2,313,1.58,,CKD21834C7D75FD77F
4,cytokine,Unknown,CL_0000084,T cell,CID_110,IL2,260,4.52,,CKD21834C7D75FD77F


In [17]:
# Column dtypes and non-null counts after both reference ID columns were assigned
dfr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1245 entries, 0 to 6045
Data columns (total 10 columns):
actor              1245 non-null object
category           1245 non-null object
cell_id            1245 non-null object
cell_label         1245 non-null object
cytokine_id        1245 non-null object
cytokine_label     1245 non-null object
num_papers         1245 non-null int64
score              1245 non-null float64
cell_ref_id        713 non-null object
cytokine_ref_id    1245 non-null object
dtypes: float64(1), int64(1), object(8)
memory usage: 107.0+ KB


## EDA

In [18]:
# Inspect every project synonym row registered under the label 'IL-12'
df_ck[df_ck['lbl'] == 'IL-12']

Unnamed: 0,id,src,sym,lbl,spid,extid,prefid
712,CKB4EB2D2CC8BBB93D,cameron,IL-12,IL-12,1,,CKB4EB2D2CC8BBB93D
713,CK0AB4E70A099B813E,cameron,NK cell stimulatory factor,IL-12,1,,CKB4EB2D2CC8BBB93D
714,CKCD91D98FD7E182B5,cameron,IL12,IL-12,1,,CKB4EB2D2CC8BBB93D
715,CK04FD0805168B608B,transform,IL-12,IL-12,1,,CKB4EB2D2CC8BBB93D
716,CK7A4CEF7FAC120B47,transform,IL12,IL-12,1,,CKB4EB2D2CC8BBB93D
717,CKBE08F217CBE82494,transform,(IL)12,IL-12,1,,CKB4EB2D2CC8BBB93D
718,CK7ADA236A5F75EDDC,transform,(IL) 12,IL-12,1,,CKB4EB2D2CC8BBB93D
719,CKC5FE39E4D3160194,transform,(IL)-12,IL-12,1,,CKB4EB2D2CC8BBB93D
720,CKD7424A364DAB5EFC,transform,interleukin 12,IL-12,1,,CKB4EB2D2CC8BBB93D
721,CKFE618FC9B3BD04C5,transform,interleukin-12,IL-12,1,,CKB4EB2D2CC8BBB93D


In [19]:
# Number of relations per value of the actor column
dfr['actor'].value_counts()

cytokine    1054
cell         191
Name: actor, dtype: int64

In [20]:
# Number of relations per (actor, category) combination
dfr.groupby(['actor', 'category']).size()

actor     category
cell      Positive    191
cytokine  Negative    271
          Positive    422
          Unknown     361
dtype: int64

In [23]:
# Relations with actor == 'cytokine' and category == 'Positive', ordered by paper count (descending)
actor_is_cytokine = dfr['actor'] == 'cytokine'
category_is_positive = dfr['category'] == 'Positive'
dft = dfr.loc[actor_is_cytokine & category_is_positive].sort_values('num_papers', ascending=False)
# Alternative without the actor filter:
#dft = dfr[(dfr['category'] == 'Positive')].sort_values('num_papers', ascending=False)
dft.loc[dft['cell_label'] == 'T-helper 2 cell'].head(15)

Unnamed: 0,actor,category,cell_id,cell_label,cytokine_id,cytokine_label,num_papers,score,cell_ref_id,cytokine_ref_id
58,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_131,IL4,31,5.47,CTE7B12DC660323A0E,CK6FE135B3F86FB707
153,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_128,IL33,11,13.53,CTE7B12DC660323A0E,CK3B4A795DF4150CF4
184,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_93,IL10,9,1.47,CTE7B12DC660323A0E,CKC5CC1A269C01EC48
277,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_83,IFNG,5,0.33,CTE7B12DC660323A0E,CK128EC732B281BF60
310,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_118,IL25,5,30.31,CTE7B12DC660323A0E,CK1A3D9956EEABBF99
387,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_108,IL18,4,3.91,CTE7B12DC660323A0E,CKBF6003C60D23BA0D
495,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_133,IL6,3,0.44,CTE7B12DC660323A0E,CK0CBC35CD2AB7D661
456,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_20,CCL2,3,1.15,CTE7B12DC660323A0E,CKCDB807A64FAA7FA0
406,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_95,IL12,3,0.91,CTE7B12DC660323A0E,CKB4EB2D2CC8BBB93D
413,cytokine,Positive,CL_0000546,T-helper 2 cell,CID_162,TGFB,3,0.48,CTE7B12DC660323A0E,CKFD4CA0B2B4BC3AE4


## Export

In [22]:
# The matching steps only add columns, so the row count must equal the one recorded at load time
n_final = len(dfr)
assert n_final == n_init
dfr.to_csv(DATA_FILE, index=False)
DATA_FILE

'/Users/eczech/repos/hammer/t-cell-relation-extraction/pm_subtype_protein_relations/data/supervision/immunexpresso/data.csv'