## Transcription Factor Integration

In [1]:
import os.path as osp
import pandas as pd
import numpy as np
from tcre import meta
from tcre import lib
from tcre.meta import ID_TYP_TF, MIN_TF_SYM_LEN
from tcre.lib import TRANSCRIPTION_FACTORS, SPECIES_HUMAN_ID
from tcre.env import *

#### Load from Lambert, et al. 2018

In [2]:
SRC = 'lambert'
# Load export of tab "Table S1. Related to Figure 1B-Table 1.csv" from Document S1 in
# https://doi.org/10.1016/j.cell.2018.01.029 (containing *human* transcription factors).
# The export is read with a two-row header, so columns are addressed as (level 0, level 1) tuples.
df = pd.read_csv(osp.join(META_DATA_DIR, 'raw', 'transcription_factors.lambert.csv'), header=[0,1])
# Keep only rows whose first column under the top-level 'Is TF?' header equals 'Yes'
df = df[df['Is TF?'].iloc[:,0] == 'Yes']
# Reduce to the gene id and gene name columns and flatten the column names
df = df[[('Gene Information', 'ID'), ('Unnamed: 1_level_0', 'Name')]]
df.columns = ['extid', 'lbl']
# The gene name is used as both label and symbol; species and source are
# constant for every row of this table (set once here, no second assignment needed)
df = df.assign(sym=df['lbl'].values, spid=SPECIES_HUMAN_ID, src=SRC)
# Ensure number of records from spreadsheet export matches https://en.wikipedia.org/wiki/List_of_human_transcription_factors
assert len(df) == 1639, \
    'Data frame does not have expected row count 1639 '\
    '(count should match https://en.wikipedia.org/wiki/List_of_human_transcription_factors)'
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1639 entries, 0 to 2763
Data columns (total 5 columns):
extid    1639 non-null object
lbl      1639 non-null object
sym      1639 non-null object
spid     1639 non-null int64
src      1639 non-null object
dtypes: int64(1), object(4)
memory usage: 76.8+ KB


In [3]:
# Preview the first rows of the filtered Lambert records
df.head()

Unnamed: 0,extid,lbl,sym,spid,src
0,ENSG00000137203,TFAP2A,TFAP2A,1,lambert
1,ENSG00000008196,TFAP2B,TFAP2B,1,lambert
2,ENSG00000087510,TFAP2C,TFAP2C,1,lambert
3,ENSG00000008197,TFAP2D,TFAP2D,1,lambert
4,ENSG00000116819,TFAP2E,TFAP2E,1,lambert


In [4]:
# Use MyGene to get aliases for genes above (by ensembl id, not symbol/name)
# NOTE(review): if `meta.mg` is a mygene client, `scopes` is documented for
# `querymany` rather than `getgenes` -- confirm it has any effect on this id lookup
dfa = meta.mg.getgenes(
    ids=df['extid'].unique(),
    scopes=["symbol", "retired", "name", "alias"],
    fields='symbol,name,taxid,ensembl.gene,alias', 
    as_dataframe=True
)
# The query term is set as the index in results so use that to get the ensembl id
# rather than the `ensembl.gene` field since this is inexplicably empty sometimes
# even when the query term is an ensembl id and all other fields are valid (e.g.
# ENSG00000232040 --> gives symbol ZBED9 and valid aliases but empty gene id field)
dfa['extid'] = dfa.index
# Project helper that reshapes the MyGene result; the columns `sym` and `lbl`
# are expected to exist afterwards (they are filtered on below)
dfa = meta.mygene_prep(dfa)
dfa = dfa.assign(spid=SPECIES_HUMAN_ID, src=SRC)
# Drop rows lacking either a label or a symbol
dfa = dfa[dfa['lbl'].notnull() & dfa['sym'].notnull()]
# Every remaining row must still carry the ensembl id it was queried with
assert dfa['extid'].notnull().all()
dfa.info()

querying 1-1000...done.
querying 1001-1639...done.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7380 entries, 0 to 7390
Data columns (total 5 columns):
sym      7380 non-null object
lbl      7380 non-null object
extid    7380 non-null object
spid     7380 non-null int64
src      7380 non-null object
dtypes: int64(1), object(4)
memory usage: 345.9+ KB


In [5]:
# Stack the Lambert records on top of the MyGene alias records (aligned to
# the Lambert column order). Both frames carry the same `src`, and MyGene
# aliases may repeat names already present in the Lambert spreadsheet, so
# rows sharing (sym, lbl, spid) are collapsed to a single record.
dfm = (
    pd.concat([df, dfa[df.columns]])
    .drop_duplicates(subset=['sym', 'lbl', 'spid'])
)
dfm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7388 entries, 0 to 7390
Data columns (total 5 columns):
extid    7388 non-null object
lbl      7388 non-null object
sym      7388 non-null object
spid     7388 non-null int64
src      7388 non-null object
dtypes: int64(1), object(4)
memory usage: 346.3+ KB


In [6]:
# Keep an independent copy of the combined Lambert + MyGene records for the merge step
df_lambert = dfm.copy()

In [7]:
#df_lambert[df_lambert['sym'].str.lower().str.contains('t-bet')]

#### Load Manual Entries

In [8]:
SRC = 'manual'
# Read the hand-curated entries and tag them with their source; these rows
# have no external identifier, so `extid` is explicitly set to None
df = (
    pd.read_csv(osp.join(META_DATA_DIR, 'raw', 'transcription_factors.manual.csv'), sep=',')
    .assign(src=SRC, extid=None)
)
df.head()

Unnamed: 0,sym,lbl,spid,src,extid
0,RORC,RORC,1,manual,
1,RORγ,RORC,1,manual,
2,RORγt,RORC,1,manual,
3,RORγ1,RORC,1,manual,
4,RORγ2,RORC,1,manual,


In [9]:
# Keep an independent copy of the manual entries for the merge step
df_manual = df.copy()

### Merge 

In [10]:
# Numeric priority per source name, passed to `meta.add_source_priority` in the merge cell.
# NOTE(review): it looks like the higher value takes precedence when the same
# symbol comes from several sources -- confirm against `meta.add_preferred_ids`
SRC_PRIORITY = {
    'lambert': 50, 
    'manual': 20
}

In [11]:
# Combine both sources into a single table keyed by the TF id type, dropping
# fully identical rows, then attach per-source priority and preferred ids
df = meta.merge([df_lambert, df_manual], ID_TYP_TF).drop_duplicates()
df = meta.add_source_priority(df, SRC_PRIORITY)
df = meta.add_preferred_ids(df)
# `DataFrame.info()` prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which also emitted a stray "None")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7406 entries, 6167 to 5786
Data columns (total 9 columns):
id          7406 non-null object
src         7406 non-null object
sym         7406 non-null object
lbl         7406 non-null object
spid        7406 non-null int64
extid       7388 non-null object
priority    7406 non-null int64
prefid      7406 non-null object
enabled     7406 non-null bool
dtypes: bool(1), int64(2), object(6)
memory usage: 528.0+ KB
None


Unnamed: 0,id,src,sym,lbl,spid,extid,priority,prefid,enabled
6167,TF36194BB02DFF6779,lambert,11orf9,MYRF,1,ENSG00000124920,50,TF727377B0E281FAA1,True
4626,TFE41E4684206987BC,lambert,12CC4,FOXP1,1,ENSG00000114861,50,TF3BD26C8F2295B0D0,True
4109,TFC45919BBF21EF770,lambert,2410002I16Rik,CXXC1,1,ENSG00000154832,50,TFD4C80059467A52F0,True
6136,TF7F70A3FBA005F34E,lambert,2A-DUB,MYSM1,1,ENSG00000162601,50,TFB861950DDE3DF397,True
6137,TF6978CB4134B87D9B,lambert,2ADUB,MYSM1,1,ENSG00000162601,50,TFB861950DDE3DF397,True


### Filter

In [12]:
# Remove excessively short symbols or larger symbols that are still too 
# ambiguous (normally found via tagging error analysis)
# Only non-manual records nominate symbols for removal ...
is_short = df['sym'].str.len() < MIN_TF_SYM_LEN
is_manual = df['src'] == 'manual'
# De-duplicate AFTER upper-casing so symbols differing only by case are listed once
rm_sym = sorted(set(df.loc[is_short & ~is_manual, 'sym'].str.upper()))

# ... but the removal itself is case-insensitive and applies to every source
mask = df['sym'].str.upper().isin(rm_sym)
assert set(rm_sym) == set(df[mask]['sym'].str.upper().unique())
print('Removing {} records with ambiguous symbols: {}'.format(mask.sum(), sorted(rm_sym)))
df = df[~mask]
df.info()

Removing 454 records with ambiguous symbols: ['AA', 'AFX', 'AHC', 'AHR', 'AHX', 'AI4', 'AIM', 'AIO', 'AIS', 'AKA', 'AMS', 'AN', 'AN2', 'ANF', 'AP1', 'AP2', 'AP4', 'AR', 'AR1', 'AR7', 'AR8', 'ARA', 'ARX', 'B1F', 'BAR', 'BBX', 'BCH', 'BCS', 'BDE', 'BDP', 'BEN', 'BF1', 'BF2', 'BFT', 'BHC', 'BNC', 'BOM', 'BP1', 'BSX', 'BTD', 'BXR', 'BZP', 'CAA', 'CAR', 'CBF', 'CCD', 'CCF', 'CDP', 'CF5', 'CHA', 'CHN', 'CIC', 'CIZ', 'CJS', 'CPF', 'CPX', 'CRD', 'CRS', 'CRX', 'CSL', 'CSO', 'CST', 'CSX', 'CTF', 'CTM', 'CUX', 'D9', 'DB1', 'DBP', 'DGS', 'DMO', 'DOD', 'DOM', 'DP1', 'DP2', 'DP4', 'DR1', 'DSS', 'DUB', 'E2A', 'E47', 'E4F', 'EBF', 'EC2', 'EFC', 'EFG', 'EHF', 'ELP', 'EN1', 'EN2', 'EOS', 'ER', 'ERA', 'ERB', 'ERF', 'ERG', 'ERM', 'ERP', 'ERT', 'ESR', 'ESX', 'ETF', 'EZF', 'EZI', 'F11', 'FEV', 'FEZ', 'FHX', 'FIK', 'FIP', 'FND', 'FOG', 'FOS', 'FPP', 'FRA', 'FRU', 'FTF', 'FXR', 'G10', 'G13', 'G17', 'GAX', 'GBF', 'GCR', 'GEF', 'GF1', 'GLI', 'GR', 'GRL', 'GSC', 'GSF', 'GTD', 'GTX', 'GUD', 'H6', 'H6L', 'HAP', 'H

In [13]:
# Apply static blacklist: the filters registered for the TRANSCRIPTION_FACTORS
# entity type are fetched from `lib` and applied to the symbol table
df = meta.apply_symbol_filters(df, lib.get_entity_meta_filters(TRANSCRIPTION_FACTORS))
df.info()

Removing 3 symbols in pre-defined filters: ['Genesis' 'MINOR' 'PRISM']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6949 entries, 6167 to 5786
Data columns (total 9 columns):
id          6949 non-null object
src         6949 non-null object
sym         6949 non-null object
lbl         6949 non-null object
spid        6949 non-null int64
extid       6931 non-null object
priority    6949 non-null int64
prefid      6949 non-null object
enabled     6949 non-null bool
dtypes: bool(1), int64(2), object(6)
memory usage: 495.4+ KB


### Summarize

In [14]:
# Record counts per source, split by the `enabled` flag (absent combinations shown as 0)
df.groupby(['src', 'enabled']).size().unstack().fillna(0)

enabled,False,True
src,Unnamed: 1_level_1,Unnamed: 2_level_1
lambert,105,6826
manual,6,12


In [15]:
# Distribution of symbol lengths; lengths are capped at 15, so the last bucket
# aggregates every symbol with 15 or more characters
df['sym'].str.len().clip(0, 15).value_counts().sort_index()

4     1313
5     1676
6     1468
7      520
8      164
9       85
10      55
11      43
12      33
13      19
14      44
15    1529
Name: sym, dtype: int64

In [16]:
# Spot check: records whose symbol contains 'foxp3', ignoring case
df[df['sym'].str.lower().str.contains('foxp3')]

Unnamed: 0,id,src,sym,lbl,spid,extid,priority,prefid,enabled
1185,TF54F4218B9B674FCF,lambert,FOXP3,FOXP3,1,ENSG00000049768,50,TF54F4218B9B674FCF,True
13,TF0697C638336DAAB9,manual,FOXP3,FOXP3,1,,20,TF54F4218B9B674FCF,False
14,TF9B6AD040299F00D3,manual,FoxP3,FOXP3,1,,20,TF54F4218B9B674FCF,True
15,TF67F3D442E67CA0A3,manual,Foxp3,FOXP3,1,,20,TF54F4218B9B674FCF,True
16,TF742A1822CFE15888,manual,foxP3,FOXP3,1,,20,TF54F4218B9B674FCF,True
17,TF0F340536F1731D52,manual,foxp3,FOXP3,1,,20,TF54F4218B9B674FCF,True


In [17]:
# Spot check: records whose symbol contains 'gfi', ignoring case
df[df['sym'].str.lower().str.contains('gfi')]

Unnamed: 0,id,src,sym,lbl,spid,extid,priority,prefid,enabled
1242,TFE156A51136C217BC,lambert,GFI-1,GFI1,1,ENSG00000162676,50,TF1FEE043B45D9654E,True
265,TF1FEE043B45D9654E,lambert,GFI1,GFI1,1,ENSG00000162676,50,TF1FEE043B45D9654E,True
1243,TFD513B64FD7B6A5DF,lambert,GFI1A,GFI1,1,ENSG00000162676,50,TF1FEE043B45D9654E,True
266,TF06386BF18D80F1C5,lambert,GFI1B,GFI1B,1,ENSG00000165702,50,TF06386BF18D80F1C5,True
1200,TFD6425765B0AF5303,lambert,NGFI-A,EGR1,1,ENSG00000120738,50,TF9C933C7F3601C428,True
1218,TF6AD173F0E42A506C,lambert,NGFI-C,EGR4,1,ENSG00000135625,50,TF193163E659BF0520,True
6390,TF03C9CAC32D70E497,lambert,NGFIB,NR4A1,1,ENSG00000123358,50,TF139F6699C1A7EEBB,True
1219,TFA77682C38E2A0D4F,lambert,NGFIC,EGR4,1,ENSG00000135625,50,TF193163E659BF0520,True


### Export

In [18]:
# Every exported record must have a distinct id; on failure, report a sample
# of the offending ids instead of a bare AssertionError
assert df['id'].is_unique, \
    'Duplicate ids found: {}'.format(list(df.loc[df['id'].duplicated(), 'id'].unique()[:10]))

In [19]:
# Write the final table to <META_DATA_DIR>/<TRANSCRIPTION_FACTORS>.csv without
# the index column, then display the destination path
path = osp.join(META_DATA_DIR, TRANSCRIPTION_FACTORS + '.csv')
df.to_csv(path, index=False)
path

'/lab/repos/t-cell-relation-extraction/data/meta/transcription_factors.csv'