## Metadata Integration

In [2]:
import os.path as osp
import pandas as pd
import numpy as np
from tcre import meta
from tcre.env import *
from tcre.lib import SPECIES_HUMAN_ID

## Cell Types

In [5]:
# Load the raw cell type table stored under META_DATA_DIR/raw and print a
# column/dtype summary for a quick sanity check.
# NOTE(review): 'cl' presumably stands for Cell Ontology — confirm the provenance
# of this file. `df` and `path` are rebound by later cells in this notebook.
path = osp.join(META_DATA_DIR, 'raw', 'cl.raw.csv')
df = pd.read_csv(path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 632 entries, 0 to 631
Data columns (total 6 columns):
desc          632 non-null object
id            632 non-null object
label         632 non-null object
syn           632 non-null object
syn_typ       632 non-null object
syn_typ_id    632 non-null int64
dtypes: int64(1), object(5)
memory usage: 29.7+ KB


### Cell Ontology

### Manual Entries

In [42]:
# Source tag recorded on every row derived from the manual entries below
SRC = 'manual'

# Numeric subtype suffixes shared by the generated Tc*, Th* and Tfh*like entries
_SUBTYPE_IDS = [0, 1, 2, 3, 9, 17, 22]

# Manually curated T cell types.  Each entry is a tuple of
#   (label, aliases)          for a top-level type, or
#   (label, aliases, parent)  where `parent` is the label of another entry.
# Aliases are written with hyphens as token separators.
# The misspelled "cytolitic" aliases are kept on purpose (existing symbols are
# preserved); the correctly spelled "cytolytic" forms are listed alongside them.
cell_types = [
    # Disabled entry (reason not recorded):
    #('TEFF', ['T-eff', 'T-effector', 'eff-T', 'effector-T']),
    ('TN', ['T-naïve', 'naïve-T']),
    ('TMEM', ['T-mem', 'memory-T', 'mem-T']),
    ('TEMRA', ['T-emra', 'emra-T']),
    ('TCM', ['T-cm', 'cm-T', 'central-memory-T']),
    ('TEM', ['T-em', 'em-T', 'effector-memory-T']),
    ('IEL', ['IELs', 'Intraepithelial-lymphocyte', 'Intraepithelial-lymphocytes']),
    ('DETC', ['DETCs', 'Dendritic-epidermal-T']),
    ('Thymocyte', ['thymocyte', 'thymocytes']),
    ('Th', ['T-Helper', 'Helper-T']),
    ('Tc', [
        'T-Cytotoxic', 'Cytotoxic-T', 'Cytolitic-T', 'T-Cytolitic', 'CTL', 'CTLs',
        'Cytolytic-T', 'T-Cytolytic'
        ]),
    ('ThP', ['Th-Precursor', 'Precursor-Th']),
    ('Trm', ['T-rm', 'T-resident-memory', 'resident-memory-T']),
    ('Tscm', ['T-SCM', 'T-SC', 'stem-memory-T', 'T-stem-memory', 'memory-stem-T', 'T-memory-stem']),
    ('NKT', ['natural-killer-T', 'T-natural-killer', 'INKT', 'TRNKT', 'CNKT', 'NK/T']),
    ('MAIT', ['Mucosal-associated invariant T']),
    ('Treg', ['T-reg', 'T-regs', 'T-regulatory', 'regulatory-T']),
    ('Tsupp', ['Tsuppressor', 'Ts/c', 'Ts/Tc','T-suppressor/cytotoxic'], 'Treg'),
    ('Treg1', ['T-reg1', 'T-reg-1', 'Treg/Th1', 'Treg-Th1', 'Tr-1'], 'Treg'),
    ('Treg17', ['T-reg17', 'T-reg-17', 'Treg/Th17', 'Treg-Th17', 'Tr-17'], 'Treg'),
    ('Tfh', ['T-FH', 'T-follicular-helper', 'follicular-helper-T', 'fhT', 'T-follicular'], 'Th'),
    ('Tfreg', ['Tf-reg', 'T-follicular-regulatory', 'follicular-regulatory-T', 'follicular-Treg', 'fTreg'], 'Treg'),
    ('iTreg', [
        'i-Treg', 'induced-Treg', 'inducible-Treg', 'induced-T-reg', 'inducible-T-reg',
        'induced T-regulatory', 'induced-T-regulatory', 'induced regulatory-T', 'induced-regulatory-T',
        'inducible T-regulatory', 'inducible-T-regulatory', 'inducible regulatory-T', 'inducible-regulatory-T',
        ], 'Treg'),
    ('nTreg', [
        'n-Treg', 'natural-Treg', 'natural-T-reg',
        'natural T-regulatory', 'natural-T-regulatory', 'natural regulatory-T', 'natural-regulatory-T'
        ], 'Treg'),
    ('pTreg', [
        'p-Treg', 'peripheral-Treg', 'peripheral-T-reg',
        'peripheral T-regulatory', 'peripheral-T-regulatory', 'peripheral regulatory-T', 'peripheral-regulatory-T',
        'peripherally-induced-Treg'
        ], 'Treg'),
    ('γδT', ['Gamma-Delta-T', 'T-Gamma-Delta', 'Gamma/Delta-T', 'γ/δ-T', 'γδ-T', 'gd-T', 'Tgd', 'g/dT', 'Tg/d']),
    ('γδT-Vδ1', ['VDelta1-T', 'Vδ1-T'], 'γδT'),
    ('γδT-Vδ2', ['VDelta2-T', 'Vδ2-T'], 'γδT'),
    ('γδT-Vγ1', ['VGamma1-T', 'Vγ1-T'], 'γδT'),
    ('γδT-Vγ4', ['VGamma4-T', 'Vγ4-T'], 'γδT'),
    ('γδT-Vγ9', ['VGamma9-T', 'Vγ9-T'], 'γδT'),
    ('γδT-Vγ9Vδ2', ['T-VGamma9-Delta2', 'VGamma9-Delta2-T', 'Vγ9Vδ2T'], 'γδT'),
    ('γδT-17', ['γδ17T', 'T-Gamma-Delta17', 'Gamma-Delta17-T'], 'γδT'),
    ('γδT-TCS1', ['TCS1'], 'γδT')
] + [
    # Numbered cytotoxic subtypes (Tc0, Tc1, ...), children of 'Tc'
    ('Tc' + str(i), [
        'Tc-' + str(i), 'T-cytolitic-' + str(i), 'T cytolitic-' + str(i),
        'T-cytotoxic-' + str(i), 'T cytotoxic-' + str(i),
        'T-cytolytic-' + str(i), 'T cytolytic-' + str(i)
        ], 'Tc')
    for i in _SUBTYPE_IDS
] + [
    # Numbered helper subtypes (Th0, Th1, ...), children of 'Th'
    ('Th' + str(i), [
        'Th-' + str(i), 'T-helper-' + str(i), 'T-helper ' + str(i), 'T helper-' + str(i), 'T helper cell-' + str(i),
        'T helper cell (Th)-' + str(i), 'T helper (Th)-' + str(i)
        ], 'Th')
    for i in _SUBTYPE_IDS
] + [
    # Numbered follicular-helper-like subtypes (Tfh1like, ...), children of 'Tfh'
    ('Tfh{}like'.format(i), ['Tfh-{}-like'.format(i), 'Tfh-{}like'.format(i), 'Tfh{}-like'.format(i), 'Tfh{}'.format(i)], 'Tfh')
    for i in _SUBTYPE_IDS
]

In [43]:
import unidecode
# Expand each manual entry into (symbol, label) rows.  For every alias three
# spellings are emitted: as written, with hyphens replaced by spaces, and with
# hyphens removed.  Each alias is also passed through unidecode so that a
# transliterated spelling is included next to the original one.
records = []
for entry in cell_types:
    # Entries are (label, aliases) or (label, aliases, parent); the optional
    # parent label is not carried into this table.
    assert len(entry) in [2, 3]
    lbl, aliases = entry[:2]
    # The label itself is always a valid symbol
    records.append((lbl, lbl))
    for alias in aliases:
        for variant in [alias, unidecode.unidecode(alias)]:
            records.append((variant, lbl))
            records.append((variant.replace('-', ' '), lbl))
            records.append((variant.replace('-', ''), lbl))
symbols = pd.DataFrame(records, columns=['sym', 'lbl']).drop_duplicates()

# Add -cell[s] / -lymphocyte[s] to the end of each symbol ending in "T" to cover
# cases where the spans are not broken into separate tokens
# (e.g. "follicular regulatory T-cells")
ends_with_t = symbols[symbols['sym'].str.endswith('T')]
symbols = pd.concat([symbols] + [
    ends_with_t.assign(sym=lambda frame: frame['sym'] + suffix)
    for suffix in ['-cell', '-cells', '-lymphocyte', '-lymphocytes']
])

# Attach species/source metadata, then assign ids; the id assignment happens
# before the final de-duplication, matching the original statement order.
df = symbols.assign(spid=SPECIES_HUMAN_ID, src=SRC, extid=None)
df['id'] = get_ids(df, ID_TYP_CT)
df = df.drop_duplicates()
df = add_preferred_ids(df)
df.head()

Unnamed: 0,sym,lbl,spid,src,extid,id,prefid
108,DETC,DETC,1,manual,,CT231343DE336492F1,CT231343DE336492F1
109,DETCs,DETC,1,manual,,CT4CF1E0FFE26A395F,CT231343DE336492F1
115,Dendritic-epidermal-T,DETC,1,manual,,CTAD8B4B3EE6ECB6BB,CT231343DE336492F1
116,Dendritic epidermal T,DETC,1,manual,,CT728A6AA916C71387,CT231343DE336492F1
117,DendriticepidermalT,DETC,1,manual,,CTE89D7A3B2E8FAACE,CT231343DE336492F1


In [44]:
# Spot check: show the first symbols whose lower-cased text contains "help"
helper_mask = df['sym'].str.lower().str.match('.*help.*')
df[helper_mask].head(15)

Unnamed: 0,sym,lbl,spid,src,extid,id,prefid
404,T-follicular-helper,Tfh,1,manual,,CT521FD23411A69F7B,CTE9F6070561C95355
405,T follicular helper,Tfh,1,manual,,CT5C03730892B0B7E0,CTE9F6070561C95355
406,Tfollicularhelper,Tfh,1,manual,,CT10F75C923DC5D960,CTE9F6070561C95355
410,follicular-helper-T,Tfh,1,manual,,CT8A54A7929F3074FF,CTE9F6070561C95355
411,follicular helper T,Tfh,1,manual,,CT48DDD25612FEC28B,CTE9F6070561C95355
412,follicularhelperT,Tfh,1,manual,,CT92FD47DCB488F926,CTE9F6070561C95355
135,T-Helper,Th,1,manual,,CT58AC91B99EE259CE,CTB7DFCED683E6F0A9
136,T Helper,Th,1,manual,,CTCC4CF8EAB76DFC68,CTB7DFCED683E6F0A9
137,THelper,Th,1,manual,,CT0107866BBCD97E04,CTB7DFCED683E6F0A9
141,Helper-T,Th,1,manual,,CT068AEEF38840649E,CTB7DFCED683E6F0A9


In [45]:
# Persist the manual cell type symbol table as CSV under META_DATA_DIR, named
# after the CELL_TYPES constant; the DataFrame index is not written.
path = osp.join(META_DATA_DIR, CELL_TYPES + '.csv')
df.to_csv(path, index=False)
# Display the output location as the cell result
path

'/Users/eczech/repos/hammer/t-cell-relation-extraction/pm_subtype_protein_relations/data/meta/cell_types.csv'