## Cell Type Integration

In [4]:
import os.path as osp
import pandas as pd
import numpy as np
from tcre import meta
from tcre import lib
from tcre.env import *
from tcre.lib import SPECIES_HUMAN_ID

### Load Cell Ontology

In [8]:
# Read the raw Cell Ontology table; judging by its columns (id, label, syn, syn_typ)
# it holds one row per term synonym — TODO confirm against whatever produces this file
path = osp.join(META_DATA_DIR, 'raw', 'cl.raw.csv')
df = pd.read_csv(path)
# Print column names, non-null counts and dtypes as a quick sanity check
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 632 entries, 0 to 631
Data columns (total 6 columns):
desc          632 non-null object
id            632 non-null object
label         632 non-null object
syn           632 non-null object
syn_typ       632 non-null object
syn_typ_id    632 non-null int64
dtypes: int64(1), object(5)
memory usage: 29.7+ KB


In [17]:
# Make sure that each group of synonyms has exactly one record whose synonym equals the label.
# The matches are counted per label with a vectorised comparison (no groupby().apply), and
# the assertion message lists the labels that violate the rule so a failure is actionable.
label_matches = (df['syn'] == df['label']).groupby(df['label']).sum()
assert (label_matches == 1).all(), (
    'labels without exactly one synonym equal to the label: {}'
    .format(label_matches[label_matches != 1].index.tolist())
)

In [67]:
def search(term, df):
    """Find ontology terms that have any synonym containing ``term``.

    The match is a case-insensitive, literal substring test on the ``syn``
    column. Every label group that contains a matching ``id`` is selected, and
    of those groups only the rows with ``syn_typ == 'label'`` are returned.

    Args:
        term: Substring to look for.
        df: Frame with ``id``, ``label``, ``syn`` and ``syn_typ`` columns.

    Returns:
        The matching rows of ``df`` with their original index, order and
        columns; an empty frame with the same columns when nothing matches.
    """
    term = term.lower()
    # na=False counts a missing synonym as "no match" instead of putting NaN in the mask
    hits = df['syn'].str.lower().str.contains(term, regex=False, na=False)
    ids = df.loc[hits, 'id'].unique()
    # Boolean masks replace groupby().apply(... else None): that form produced a frame
    # without columns when no group matched, so the 'syn_typ' lookup below it failed
    labels = df.loc[df['id'].isin(ids), 'label'].dropna().unique()
    return df[df['label'].isin(labels) & (df['syn_typ'] == 'label')]
        
# Let text columns show up to 200 characters so the ontology descriptions stay readable
pd.options.display.max_colwidth = 200
# Example lookup: label rows of every term that has a synonym containing "gamma"
search(term='gamma', df=df)

Unnamed: 0,desc,id,label,syn,syn_typ,syn_typ_id
13,"A type II NK T cell that has been recently activated, secretes interferon-gamma, and has the phenotype CD69-positive and downregulated NK markers.",CL:0000932,type II NK T cell secreting interferon-gamma,type II NK T cell secreting interferon-gamma,label,5.0
91,A gamma-delta T cell that has a mature phenotype. These cells can be found in tissues and circulation where they express unique TCR repertoire depending on their location.,CL:0000800,mature gamma-delta T cell,mature gamma-delta T cell,label,5.0
97,"A thymocyte that has a T cell receptor consisting of a gamma chain containing Vgamma2 segment, and a delta chain. This cell type is CD4- negative, CD8-negative and CD24-negative. This cell-type is...",CL:0002407,mature Vgamma2-positive thymocyte,mature Vgamma2-positive thymocyte,label,5.0
98,"A thymocyte that has a T cell receptor consisting of a gamma chain containing Vgamma2 segment, and a delta chain. This cell type is CD4- negative, CD8-negative and CD24-negative. This cell-type is...",CL:0002403,mature Vgamma2-positive fetal thymocyte,mature Vgamma2-positive fetal thymocyte,label,5.0
99,"A thymocyte that has a T cell receptor consisting of a gamma chain containing Vgamma2 segment, and a delta chain. This cell type is CD4- negative, CD8-negative and CD24-negative. This cell-type is...",CL:0002409,mature Vgamma2-negative thymocyte,mature Vgamma2-negative thymocyte,label,5.0
100,"A Vgamma1.1-positive, Vdelta6.3-positive thymocyte that is CD24-negative.",CL:0002416,"mature Vgamma1.1-positive, Vdelta6.3-positive thymocyte","mature Vgamma1.1-positive, Vdelta6.3-positive thymocyte",label,5.0
101,"A Vgamma1.1-positive, Vdelta6.3-negative thymocyte that is CD24-negative.",CL:0002413,"mature Vgamma1.1-positive, Vdelta6.3-negative thymocyte","mature Vgamma1.1-positive, Vdelta6.3-negative thymocyte",label,5.0
140,A gamma-delta T cell that has an immature phenotype.,CL:0000799,immature gamma-delta T cell,immature gamma-delta T cell,label,5.0
146,"A double negative post-natal thymocyte that has a T cell receptor consisting of a gamma chain containing a Vgamma2 segment, and a delta chain. This cell type is CD4- negative, CD8-negative and CD2...",CL:0002406,immature Vgamma2-positive thymocyte,immature Vgamma2-positive thymocyte,label,5.0
147,"A double negative thymocyte that has a T cell receptor consisting of a gamma chain containing a Vgamma2 segment, and a delta chain. This cell type is CD4- negative, CD8-negative and CD24-positive ...",CL:0002378,immature Vgamma2-positive fetal thymocyte,immature Vgamma2-positive fetal thymocyte,label,5.0


### Manual Entries

In [None]:
# Cell types with no matching Cell Ontology term found (entered below without a CL id):
# - T-helper precursor
# - Tissue resident memory
# - Stem memory
# - Treg17
# - Follicular regulatory
# - Peripheral Treg

In [42]:
SRC = 'manual'

# Numeric suffixes used to generate the Tc*, Th* and Tfh*-like subtype families
SUBTYPE_NUMBERS = [0, 1, 2, 3, 9, 17, 22]

# Manually curated T cell types. Each record uses one of two layouts:
#   (label, aliases[, parent_label])
#   (label, ontology_id, aliases[, parent_label])
# where ontology_id is a `CL:...` id, or None when no ontology term was found.
# NOTE(review): both layouts are mixed in this list, so any code consuming it has to
# tell them apart (the second element is either the alias list or the ontology id).
# The misspelled "Cytolitic" aliases are kept for backward compatibility next to the
# correct "Cytolytic" spellings.
cell_types = [
    #('TEFF', ['T-eff', 'T-effector', 'eff-T', 'effector-T']),
    ('TN',    'CL:0000898', ['T-naïve', 'naïve-T']),
    ('TMEM',  'CL:0000813', ['T-mem', 'memory-T', 'mem-T']),
    ('TEMRA', 'CL:0001062', ['T-emra', 'emra-T']),
    ('TCM',   'CL:0000904', ['T-cm', 'cm-T', 'central-memory-T']),
    ('TEM',   'CL:0000905', ['T-em', 'em-T', 'effector-memory-T']),
    ('IEL',   'CL:0002496', ['IELs', 'Intraepithelial-lymphocyte', 'Intraepithelial-lymphocytes']),
    ('DETC',  'CL:0000916', ['DETCs', 'Dendritic-epidermal-T']),
    ('Thymocyte', 'CL:0000893', ['thymocyte', 'thymocytes']),
    ('Th',   'CL:0000912', ['T-Helper', 'Helper-T']),
    ('Tc',   'CL:0000910', [
        'T-Cytotoxic', 'Cytotoxic-T', 'Cytolitic-T', 'T-Cytolitic', 'Cytolytic-T', 'T-Cytolytic', 'CTL', 'CTLs'
        ]),
    ('ThP',  None,         ['Th-Precursor', 'Precursor-Th']),
    ('Trm',  None,         ['T-rm', 'T-resident-memory', 'resident-memory-T']),
    ('Tscm', None,         ['T-SCM', 'T-SC', 'stem-memory-T', 'T-stem-memory', 'memory-stem-T', 'T-memory-stem']),
    ('NKT',   'CL:0000814', ['natural-killer-T', 'T-natural-killer', 'INKT', 'TRNKT', 'CNKT', 'NK/T']),
    ('MAIT',  'CL:0000940', ['Mucosal-associated invariant T']),
    ('Treg',  'CL:0000792', ['T-reg', 'T-regs', 'T-regulatory', 'regulatory-T']),
    ('Tsupp', 'CL:0000792', ['Tsuppressor', 'Ts/c', 'Ts/Tc','T-suppressor/cytotoxic'], 'Treg'),
    ('Treg1', 'CL:0000901', ['T-reg1', 'T-reg-1', 'Treg/Th1', 'Treg-Th1', 'Tr-1'], 'Treg'),
    ('Treg17', None,        ['T-reg17', 'T-reg-17', 'Treg/Th17', 'Treg-Th17', 'Tr-17'], 'Treg'),
    ('Tfh',   'CL:0002038', ['T-FH', 'T-follicular-helper', 'follicular-helper-T', 'fhT', 'T-follicular'], 'Th'),
    ('Tfreg', None,         ['Tf-reg', 'T-follicular-regulatory', 'follicular-regulatory-T', 'follicular-Treg', 'fTreg'], 'Treg'),
    ('iTreg', 'CL:0000902', [
        'i-Treg', 'induced-Treg', 'inducible-Treg', 'induced-T-reg', 'inducible-T-reg',
        'induced T-regulatory', 'induced-T-regulatory', 'induced regulatory-T', 'induced-regulatory-T',
        'inducible T-regulatory', 'inducible-T-regulatory', 'inducible regulatory-T', 'inducible-regulatory-T',
        ], 'Treg'),
    ('nTreg', 'CL:0000903', [
        'n-Treg', 'natural-Treg', 'natural-T-reg',
        'natural T-regulatory', 'natural-T-regulatory', 'natural regulatory-T', 'natural-regulatory-T'
        ], 'Treg'),
    ('pTreg', None, [
        'p-Treg', 'peripheral-Treg', 'peripheral-T-reg',
        'peripheral T-regulatory', 'peripheral-T-regulatory', 'peripheral regulatory-T', 'peripheral-regulatory-T',
        'peripherally-induced-Treg'
        ], 'Treg'),
    ('γδT', 'CL:0000798', ['Gamma-Delta-T', 'T-Gamma-Delta', 'Gamma/Delta-T', 'γ/δ-T', 'γδ-T', 'gd-T', 'Tgd', 'g/dT', 'Tg/d']),
    ('γδT-Vδ1', ['VDelta1-T', 'Vδ1-T'], 'γδT'),
    ('γδT-Vδ2', ['VDelta2-T', 'Vδ2-T'], 'γδT'),
    ('γδT-Vγ1', ['VGamma1-T', 'Vγ1-T'], 'γδT'),
    ('γδT-Vγ4', ['VGamma4-T', 'Vγ4-T'], 'γδT'),
    ('γδT-Vγ9', ['VGamma9-T', 'Vγ9-T'], 'γδT'),
    ('γδT-Vγ9Vδ2', ['T-VGamma9-Delta2', 'VGamma9-Delta2-T', 'Vγ9Vδ2T'], 'γδT'),
    ('γδT-17', ['γδ17T', 'T-Gamma-Delta17', 'Gamma-Delta17-T'], 'γδT'),
    ('γδT-TCS1', ['TCS1'], 'γδT')
] + [
    # Cytotoxic subtypes Tc0, Tc1, ... with Tc as parent
    ('Tc' + str(i), [
        'Tc-' + str(i), 'T-cytolitic-' + str(i), 'T cytolitic-' + str(i),
        'T-cytolytic-' + str(i), 'T cytolytic-' + str(i),
        'T-cytotoxic-' + str(i), 'T cytotoxic-' + str(i)
        ], 'Tc')
    for i in SUBTYPE_NUMBERS
] + [
    # Helper subtypes Th0, Th1, ... with Th as parent
    ('Th' + str(i), [
        'Th-' + str(i), 'T-helper-' + str(i), 'T-helper ' + str(i), 'T helper-' + str(i), 'T helper cell-' + str(i),
        'T helper cell (Th)-' + str(i), 'T helper (Th)-' + str(i)
        ], 'Th')
    for i in SUBTYPE_NUMBERS
] + [
    # Follicular helper subtypes Tfh0like, Tfh1like, ... with Tfh as parent
    ('Tfh{}like'.format(i), ['Tfh-{}-like'.format(i), 'Tfh-{}like'.format(i), 'Tfh{}-like'.format(i), 'Tfh{}'.format(i)], 'Tfh')
    for i in SUBTYPE_NUMBERS
]

In [43]:
import unidecode


def _parse_cell_type(record):
    """Split a ``cell_types`` record into ``(label, ext_id, aliases, parent)``.

    Two layouts are accepted: ``(label, aliases[, parent])`` and
    ``(label, ext_id, aliases[, parent])``, where ``ext_id`` is an ontology id
    string or ``None``. They are told apart by whether the element after the
    label is the alias list. ``ext_id`` and ``parent`` are ``None`` when absent.
    """
    assert 2 <= len(record) <= 4, record
    fields = list(record)
    lbl = fields.pop(0)
    ext_id = None if isinstance(fields[0], list) else fields.pop(0)
    assert fields and isinstance(fields[0], list), record
    aliases = fields.pop(0)
    assert len(fields) <= 1, record
    parent = fields[0] if fields else None
    return lbl, ext_id, aliases, parent


# Expand every alias into its spelling variants: as written, passed through unidecode,
# and each of those with hyphens replaced by spaces or removed altogether
records = []
for r in cell_types:
    # NOTE(review): the ontology id and the parent label are parsed but not exported;
    # `extid` is still written as None below, as before — TODO confirm whether they
    # should be carried into the output
    lbl, _ext_id, alias, _parent = _parse_cell_type(r)
    records.append((lbl, lbl))
    for a1 in alias:
        for a2 in [a1, unidecode.unidecode(a1)]:
            records.append((a2, lbl))
            records.append((a2.replace('-', ' '), lbl))
            records.append((a2.replace('-', ''), lbl))
df = pd.DataFrame(records, columns=['sym', 'lbl']).drop_duplicates()

# Add -cell[s] / -lymphocyte[s] to the end of each symbol ending in "T" to cover cases
# where the spans are not broken into separate tokens (e.g. "follicular regulatory T-cells")
df = pd.concat([df] + [
    df[df['sym'].str.endswith('T')].assign(sym=lambda d: d['sym'] + suffix)
    for suffix in ['-cell', '-cells', '-lymphocyte', '-lymphocytes']
])

df = df.assign(spid=SPECIES_HUMAN_ID, src=SRC, extid=None)
# NOTE(review): get_ids, ID_TYP_CT and add_preferred_ids are not imported by name in this
# notebook; presumably they come from the `tcre.env` star import — TODO confirm
df['id'] = get_ids(df, ID_TYP_CT)
df = df.drop_duplicates()
df = add_preferred_ids(df)
df.head()

Unnamed: 0,sym,lbl,spid,src,extid,id,prefid
108,DETC,DETC,1,manual,,CT231343DE336492F1,CT231343DE336492F1
109,DETCs,DETC,1,manual,,CT4CF1E0FFE26A395F,CT231343DE336492F1
115,Dendritic-epidermal-T,DETC,1,manual,,CTAD8B4B3EE6ECB6BB,CT231343DE336492F1
116,Dendritic epidermal T,DETC,1,manual,,CT728A6AA916C71387,CT231343DE336492F1
117,DendriticepidermalT,DETC,1,manual,,CTE89D7A3B2E8FAACE,CT231343DE336492F1


In [44]:
# Spot-check: first 15 symbols whose lower-cased text contains "help"
is_helper_sym = df['sym'].str.lower().str.contains('help', regex=False)
df[is_helper_sym].head(15)

Unnamed: 0,sym,lbl,spid,src,extid,id,prefid
404,T-follicular-helper,Tfh,1,manual,,CT521FD23411A69F7B,CTE9F6070561C95355
405,T follicular helper,Tfh,1,manual,,CT5C03730892B0B7E0,CTE9F6070561C95355
406,Tfollicularhelper,Tfh,1,manual,,CT10F75C923DC5D960,CTE9F6070561C95355
410,follicular-helper-T,Tfh,1,manual,,CT8A54A7929F3074FF,CTE9F6070561C95355
411,follicular helper T,Tfh,1,manual,,CT48DDD25612FEC28B,CTE9F6070561C95355
412,follicularhelperT,Tfh,1,manual,,CT92FD47DCB488F926,CTE9F6070561C95355
135,T-Helper,Th,1,manual,,CT58AC91B99EE259CE,CTB7DFCED683E6F0A9
136,T Helper,Th,1,manual,,CTCC4CF8EAB76DFC68,CTB7DFCED683E6F0A9
137,THelper,Th,1,manual,,CT0107866BBCD97E04,CTB7DFCED683E6F0A9
141,Helper-T,Th,1,manual,,CT068AEEF38840649E,CTB7DFCED683E6F0A9


### Export

In [45]:
# Write the symbol table to <META_DATA_DIR>/<CELL_TYPES>.csv, leaving out the index column
file_name = CELL_TYPES + '.csv'
path = osp.join(META_DATA_DIR, file_name)
df.to_csv(path_or_buf=path, index=False)
# Echo the destination as the cell output
path

'/Users/eczech/repos/hammer/t-cell-relation-extraction/pm_subtype_protein_relations/data/meta/cell_types.csv'