## Metadata Integration

In [1]:
import os.path as osp
import pandas as pd
import numpy as np
%run env.py
%run src/lib.py
%run src/meta.py
%matplotlib inline

## Transcription Factors

#### Load from Lambert, et al. 2016

In [47]:
SRC = 'lambert'
# Load export of tab "Table S1. Related to Figure 1B-Table 1.csv" from Document S1 in
# https://doi.org/10.1016%2Fj.cell.2018.01.029 (containing *human* transcription factors)
df = pd.read_csv(osp.join(META_DATA_DIR, 'raw', 'transcription_factors.lambert.csv'), header=[0,1])
df = df[df['Is TF?'].iloc[:,0] == 'Yes']
df = df[[('Gene Information', 'ID'), ('Unnamed: 1_level_0', 'Name')]]
df.columns = ['extid', 'lbl']
df = df.assign(sym=df['lbl'].values, spid=SPECIES_HUMAN_ID, src=SRC)
df['spid'] = SPECIES_HUMAN_ID
# Ensure number of records from spreadsheet export matches https://en.wikipedia.org/wiki/List_of_human_transcription_factors
assert len(df) == 1639, \
    'Data frame does not have expected row count 1639 '\
    '(count should match https://en.wikipedia.org/wiki/List_of_human_transcription_factors)'
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1639 entries, 0 to 2763
Data columns (total 5 columns):
extid    1639 non-null object
lbl      1639 non-null object
sym      1639 non-null object
spid     1639 non-null int64
src      1639 non-null object
dtypes: int64(1), object(4)
memory usage: 76.8+ KB


In [48]:
df.head()

Unnamed: 0,extid,lbl,sym,spid,src
0,ENSG00000137203,TFAP2A,TFAP2A,1,lambert
1,ENSG00000008196,TFAP2B,TFAP2B,1,lambert
2,ENSG00000087510,TFAP2C,TFAP2C,1,lambert
3,ENSG00000008197,TFAP2D,TFAP2D,1,lambert
4,ENSG00000116819,TFAP2E,TFAP2E,1,lambert


In [49]:
# Use MyGene to get aliases for genes above (by ensembl id, not symbol/name)
dfa = mg.getgenes(
    ids=df['extid'].unique(),
    scopes=["symbol", "retired", "name", "alias"],
    fields='symbol,name,taxid,ensembl.gene,alias', 
    as_dataframe=True
)
# The query term is set as the index in results so use that to get the ensembl id
# rather than the `ensenbl.gene` field since this is inexplicably empty sometimes
# even when the query term as an ensembl id and all other fields are valid (e.g.
# ENSG00000232040 --> gives symbol ZBED9 and valid aliases but empty gene id field)
dfa['extid'] = dfa.index
dfa = mygene_prep(dfa)
dfa = dfa.assign(spid=SPECIES_HUMAN_ID, src=SRC)
dfa = dfa[dfa['lbl'].notnull() & dfa['sym'].notnull()]
assert dfa['extid'].notnull().all()
dfa.info()

querying 1-1000...done.
querying 1001-1639...done.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7370 entries, 0 to 7380
Data columns (total 5 columns):
sym      7370 non-null object
lbl      7370 non-null object
extid    7370 non-null object
spid     7370 non-null int64
src      7370 non-null object
dtypes: int64(1), object(4)
memory usage: 345.5+ KB


In [50]:
dfm = pd.concat([df, dfa[df.columns]])
# As the mygene results may contain aliases matching the names
# in the lambert spreadsheet and the `src` is the same, duplicates
# should be removed 
dfm = dfm.drop_duplicates(subset=['sym', 'lbl', 'spid'])
dfm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7378 entries, 0 to 7380
Data columns (total 5 columns):
extid    7378 non-null object
lbl      7378 non-null object
sym      7378 non-null object
spid     7378 non-null int64
src      7378 non-null object
dtypes: int64(1), object(4)
memory usage: 345.8+ KB


In [45]:
# # Remove larger symbols that are still too ambiguous (normally found via tagging error analysis)
# rm_sym = ['GENESIS', 'MINOR']
# rm_sym = [v.upper() for v in rm_sym]
# mask = dfm['sym'].str.upper().isin(rm_sym)
# assert set(rm_sym) == set(dfm[mask]['sym'].str.upper().unique())
# print('Removing {} records with excessively short symbols: {}'.format(mask.sum(), rm_sym))
# dfm = dfm[~mask]
# dfm.info()

In [51]:
df_lambert = dfm.copy()

In [37]:
#df_lambert[df_lambert['sym'].str.lower().str.contains('t-bet')]

#### Load Manual Entries

In [52]:
SRC = 'manual'
df = pd.read_csv(osp.join(META_DATA_DIR, 'raw', 'transcription_factors.manual.csv'), sep=',')
df = df.assign(src=SRC, extid=None)
df.head()

Unnamed: 0,sym,lbl,spid,src,extid
0,RORC,RORC,1,manual,
1,RORγ,RORC,1,manual,
2,RORγt,RORC,1,manual,
3,RORγ1,RORC,1,manual,
4,RORγ2,RORC,1,manual,


In [53]:
df_manual = df.copy()

#### Merge and Export

In [64]:
df = merge([df_lambert, df_manual], ID_TYP_TF).drop_duplicates()
df = add_preferred_ids(df)
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7391 entries, 1072 to 6156
Data columns (total 7 columns):
id        7391 non-null object
src       7391 non-null object
sym       7391 non-null object
lbl       7391 non-null object
spid      7391 non-null int64
extid     7378 non-null object
prefid    7391 non-null object
dtypes: int64(1), object(6)
memory usage: 461.9+ KB
None


Unnamed: 0,id,src,sym,lbl,spid,extid,prefid
1072,TFC491EFF7A86A1701,lambert,AC008770.2,AC008770.2,1,ENSG00000267179,TFC491EFF7A86A1701
232,TF366ACF42CE4CCEBB,lambert,AC008770.3,AC008770.3,1,ENSG00000267179,TF366ACF42CE4CCEBB
160,TF9837F24D5ADF39CB,lambert,AC023509.3,AC023509.3,1,ENSG00000267281,TF9837F24D5ADF39CB
233,TFA91BACA6A8187757,lambert,AC092835.1,AC092835.1,1,ENSG00000233757,TFA91BACA6A8187757
234,TF25E2A8B125012DB6,lambert,AC138696.1,AC138696.1,1,ENSG00000264668,TF25E2A8B125012DB6


In [65]:
# Remove excessively short symbols or larger symbols that are still too 
# ambiguous (normally found via tagging error analysis)
rm_sym = list(df[(df['sym'].str.len() < MIN_TF_SYM_LEN) & (df['src'] != 'manual')]['sym'].unique())
rm_sym.extend(['GENESIS', 'MINOR', 'OUT'])
rm_sym = [v.upper() for v in rm_sym]

mask = df['sym'].str.upper().isin(rm_sym)
assert set(rm_sym) == set(df[mask]['sym'].str.upper().unique())
print('Removing {} records with ambiguous symbols: {}'.format(mask.sum(), sorted(rm_sym)))
df = df[~mask]
df.info()

Removing 456 records with ambiguous symbols: ['AA', 'AFX', 'AHC', 'AHR', 'AHX', 'AI4', 'AIM', 'AIO', 'AIS', 'AKA', 'AMS', 'AN', 'AN2', 'ANF', 'AP1', 'AP2', 'AP4', 'AR', 'AR1', 'AR7', 'AR8', 'ARA', 'ARX', 'B1F', 'BAR', 'BBX', 'BCH', 'BCS', 'BDE', 'BDP', 'BEN', 'BF1', 'BF2', 'BFT', 'BHC', 'BNC', 'BOM', 'BP1', 'BSX', 'BTD', 'BXR', 'BZP', 'CAA', 'CAR', 'CBF', 'CCD', 'CCF', 'CDP', 'CF5', 'CHA', 'CHN', 'CIC', 'CIZ', 'CJS', 'CPF', 'CPX', 'CRD', 'CRS', 'CRX', 'CSL', 'CSO', 'CST', 'CSX', 'CTF', 'CTM', 'CUX', 'D9', 'DB1', 'DBP', 'DGS', 'DMO', 'DOD', 'DOM', 'DP1', 'DP2', 'DP4', 'DR1', 'DSS', 'DUB', 'E2A', 'E47', 'E4F', 'EBF', 'EC2', 'EFC', 'EFG', 'EHF', 'ELP', 'EN1', 'EN2', 'EOS', 'ER', 'ERA', 'ERB', 'ERF', 'ERG', 'ERM', 'ERP', 'ERT', 'ESR', 'ESX', 'ETF', 'EZF', 'EZI', 'F11', 'FEV', 'FEZ', 'FHX', 'FIK', 'FIP', 'FND', 'FOG', 'FOS', 'FPP', 'FRA', 'FRU', 'FTF', 'FXR', 'G10', 'G13', 'G17', 'GAX', 'GBF', 'GCR', 'GEF', 'GENESIS', 'GF1', 'GLI', 'GR', 'GRL', 'GSC', 'GSF', 'GTD', 'GTX', 'GUD', 'H6', 'H6L'

In [66]:
df['sym'].str.len().clip(0, 15).value_counts().sort_index()

4     1312
5     1668
6     1466
7      524
8      170
9       94
10      65
11      43
12      33
13      19
14      45
15    1496
Name: sym, dtype: int64

In [67]:
df[df['sym'].str.lower().str.contains('gfi')]

Unnamed: 0,id,src,sym,lbl,spid,extid,prefid
1200,TFD6425765B0AF5303,lambert,NGFI-A,EGR1,1,ENSG00000120738,TF9C933C7F3601C428
1218,TF6AD173F0E42A506C,lambert,NGFI-C,EGR4,1,ENSG00000135625,TF193163E659BF0520
1219,TFA77682C38E2A0D4F,lambert,NGFIC,EGR4,1,ENSG00000135625,TF193163E659BF0520
265,TF1FEE043B45D9654E,lambert,GFI1,GFI1,1,ENSG00000162676,TF1FEE043B45D9654E
1242,TFE156A51136C217BC,lambert,GFI-1,GFI1,1,ENSG00000162676,TF1FEE043B45D9654E
1243,TFD513B64FD7B6A5DF,lambert,GFI1A,GFI1,1,ENSG00000162676,TF1FEE043B45D9654E
266,TF06386BF18D80F1C5,lambert,GFI1B,GFI1B,1,ENSG00000165702,TF06386BF18D80F1C5
6382,TF03C9CAC32D70E497,lambert,NGFIB,NR4A1,1,ENSG00000123358,TF139F6699C1A7EEBB


In [68]:
path = osp.join(META_DATA_DIR, TRANSCRIPTION_FACTORS + '.csv')
df.to_csv(path, index=False)
path

'/Users/eczech/repos/hammer/t-cell-relation-extraction/pm_subtype_protein_relations/data/meta/transcription_factors.csv'

## Cell Types

In [42]:
SRC = 'manual'
cell_types = [
    #('TEFF', ['T-eff', 'T-effector', 'eff-T', 'effector-T']),
    ('TN', ['T-naïve', 'naïve-T']),
    ('TMEM', ['T-mem', 'memory-T', 'mem-T']),
    ('TEMRA', ['T-emra', 'emra-T']),
    ('TCM', ['T-cm', 'cm-T', 'central-memory-T']),
    ('TEM', ['T-em', 'em-T', 'effector-memory-T', 'effector-memory-T']),
    ('IEL', ['IELs', 'Intraepithelial-lymphocyte', 'Intraepithelial-lymphocytes']),
    ('DETC', ['DETCs', 'Dendritic-epidermal-T']),
    ('Thymocyte', ['thymocyte', 'thymocytes']),
    ('Th', ['T-Helper', 'Helper-T']),
    ('Tc', ['T-Cytotoxic', 'Cytotoxic-T', 'Cytolitic-T', 'T-Cytolitic']),
    ('ThP', ['Th-Precursor', 'Precursor-Th']),
    ('Trm', ['T-rm', 'T-resident-memory', 'resident-memory-T']),
    ('Tscm', ['T-SCM', 'T-SC', 'stem-memory-T', 'T-stem-memory', 'memory-stem-T', 'T-memory-stem']),
    ('NKT', ['natural-killer-T', 'T-natural-killer', 'INKT', 'TRNKT', 'CNKT', 'NK/T']),
    ('MAIT', ['Mucosal-associated invariant T']),
    ('Treg', ['T-reg', 'T-regs', 'T-regulatory', 'regulatory-T']),
    ('Tsupp', ['Tsuppressor', 'Ts/c', 'Ts/Tc','T-suppressor/cytotoxic'], 'Treg'),
    ('Treg1', ['T-reg1', 'T-reg-1', 'Treg/Th1', 'Treg-Th1', 'Tr-1'], 'Treg'),
    ('Treg17', ['T-reg17', 'T-reg-17', 'Treg/Th17', 'Treg-Th17', 'Tr-17'], 'Treg'),
    ('Tfh', ['T-FH', 'T-follicular-helper', 'follicular-helper-T', 'fhT'], 'Th'),
    ('Tfreg', ['Tf-reg', 'T-follicular-regulatory', 'follicular-regulatory-T', 'follicular-Treg', 'fTreg'], 'Treg'),
    ('iTreg', [
        'i-Treg', 'induced-Treg', 'inducible-Treg', 'induced-T-reg', 'inducible-T-reg',
        'induced T-regulatory', 'induced-T-regulatory', 'induced regulatory-T', 'induced-regulatory-T',
        'inducible T-regulatory', 'inducible-T-regulatory', 'inducible regulatory-T', 'inducible-regulatory-T',
        ], 'Treg'),
    ('nTreg', [
        'n-Treg', 'natural-Treg', 'natural-T-reg', 
        'natural T-regulatory', 'natural-T-regulatory', 'natural regulatory-T', 'natural-regulatory-T'
        ], 'Treg'),
    ('pTreg', [
        'p-Treg', 'peripheral-Treg', 'peripheral-T-reg', 
        'peripheral T-regulatory', 'peripheral-T-regulatory', 'peripheral regulatory-T', 'peripheral-regulatory-T',
        'peripherally-induced-Treg'
        ], 'Treg'),
    ('γδT', ['Gamma-Delta-T', 'T-Gamma-Delta', 'Gamma/Delta-T', 'γ/δ-T', 'γδ-T', 'gd-T', 'Tgd', 'g/dT', 'Tg/d']),
    ('γδT-Vδ1', ['VDelta1-T', 'Vδ1-T'], 'γδT'),
    ('γδT-Vδ2', ['VDelta2-T', 'Vδ2-T'], 'γδT'),
    ('γδT-Vγ1', ['VGamma1-T', 'Vγ1-T'], 'γδT'),
    ('γδT-Vγ4', ['VGamma4-T', 'Vγ4-T'], 'γδT'),
    ('γδT-Vγ9', ['VGamma9-T', 'Vγ9-T'], 'γδT'),
    ('γδT-Vγ9Vδ2', ['T-VGamma9-Delta2', 'VGamma9-Delta2-T', 'Vγ9Vδ2T'], 'γδT'),
    ('γδT-17', ['γδ17T', 'T-Gamma-Delta17', 'Gamma-Delta17-T'], 'γδT'),
    ('γδT-TCS1', ['TCS1'], 'γδT')
] + [
    ('Tc' + str(i), ['Tc-' + str(i), 'T-cytolitic-' + str(i), 'T cytolitic-' + str(i), 'T-cytotoxic-' + str(i), 'T cytotoxic-' + str(i)], 'Tc') 
    for i in [0, 1, 2, 3, 9, 17, 22]
] + [
    ('Th' + str(i), [
        'Th-' + str(i), 'T-helper-' + str(i), 'T-helper ' + str(i), 'T helper-' + str(i), 'T helper cell-' + str(i),
        'T helper cell (Th)-' + str(i), 'T helper (Th)-' + str(i)
        ], 'Th')
    for i in [0, 1, 2, 3, 9, 17, 22]
] + [
    ('Tfh{}like'.format(i), ['Tfh-{}-like'.format(i), 'Tfh-{}like'.format(i), 'Tfh{}-like'.format(i)], 'Tfh') 
    for i in [0, 1, 2, 3, 9, 17, 22]
]

In [43]:
import unidecode
df = []
for r in cell_types:
    assert len(r) in [2, 3]
    lbl, alias = r[:2]
    parent = None if len(r) == 2 else r[2]
    df.append((lbl, lbl))
    for a1 in alias:
        for a2 in [a1, unidecode.unidecode(a1)]:
            df.append((a2, lbl))
            df.append((a2.replace('-', ' '), lbl))
            df.append((a2.replace('-', ''), lbl))
df = pd.DataFrame(df, columns=['sym', 'lbl']).drop_duplicates()
df = df.assign(spid=SPECIES_HUMAN_ID, src=SRC, extid=None)
df['id'] = get_ids(df, ID_TYP_CT)
df = df.drop_duplicates()
df = add_preferred_ids(df)
df.head()

Unnamed: 0,sym,lbl,spid,src,extid,id,prefid
108,DETC,DETC,1,manual,,CT231343DE336492F1,CT231343DE336492F1
109,DETCs,DETC,1,manual,,CT4CF1E0FFE26A395F,CT231343DE336492F1
115,Dendritic-epidermal-T,DETC,1,manual,,CTAD8B4B3EE6ECB6BB,CT231343DE336492F1
116,Dendritic epidermal T,DETC,1,manual,,CT728A6AA916C71387,CT231343DE336492F1
117,DendriticepidermalT,DETC,1,manual,,CTE89D7A3B2E8FAACE,CT231343DE336492F1


In [44]:
df[df['sym'].str.lower().str.match('.*help.*')].head(15)

Unnamed: 0,sym,lbl,spid,src,extid,id,prefid
404,T-follicular-helper,Tfh,1,manual,,CT521FD23411A69F7B,CTE9F6070561C95355
405,T follicular helper,Tfh,1,manual,,CT5C03730892B0B7E0,CTE9F6070561C95355
406,Tfollicularhelper,Tfh,1,manual,,CT10F75C923DC5D960,CTE9F6070561C95355
410,follicular-helper-T,Tfh,1,manual,,CT8A54A7929F3074FF,CTE9F6070561C95355
411,follicular helper T,Tfh,1,manual,,CT48DDD25612FEC28B,CTE9F6070561C95355
412,follicularhelperT,Tfh,1,manual,,CT92FD47DCB488F926,CTE9F6070561C95355
135,T-Helper,Th,1,manual,,CT58AC91B99EE259CE,CTB7DFCED683E6F0A9
136,T Helper,Th,1,manual,,CTCC4CF8EAB76DFC68,CTB7DFCED683E6F0A9
137,THelper,Th,1,manual,,CT0107866BBCD97E04,CTB7DFCED683E6F0A9
141,Helper-T,Th,1,manual,,CT068AEEF38840649E,CTB7DFCED683E6F0A9


In [45]:
path = osp.join(META_DATA_DIR, CELL_TYPES + '.csv')
df.to_csv(path, index=False)
path

'/Users/eczech/repos/hammer/t-cell-relation-extraction/pm_subtype_protein_relations/data/meta/cell_types.csv'