## Metadata Integration

In [3]:
import os.path as osp
import pandas as pd
import numpy as np
from tcre import lib
from tcre.lib import SPECIES_HUMAN_ID
from tcre import meta
from tcre.env import *

## Transcription Factors

#### Load from Lambert, et al. 2018

In [4]:
SRC = 'lambert'
# Load export of tab "Table S1. Related to Figure 1B-Table 1.csv" from Document S1 in
# https://doi.org/10.1016/j.cell.2018.01.029 (containing *human* transcription factors).
# The export has a two-row header, so the columns are read as a 2-level MultiIndex.
df = pd.read_csv(osp.join(META_DATA_DIR, 'raw', 'transcription_factors.lambert.csv'), header=[0,1])
# Keep only rows flagged as transcription factors; 'Is TF?' is a top-level header,
# so the flag is taken from the first column beneath it
df = df[df['Is TF?'].iloc[:,0] == 'Yes']
# Retain only the gene id and gene name columns and flatten their names
df = df[[('Gene Information', 'ID'), ('Unnamed: 1_level_0', 'Name')]]
df.columns = ['extid', 'lbl']
# The gene name is used as both label and symbol; species and source are constant
# for every row (a second, redundant `spid` assignment was removed here)
df = df.assign(sym=df['lbl'].values, spid=SPECIES_HUMAN_ID, src=SRC)
# Ensure number of records from spreadsheet export matches https://en.wikipedia.org/wiki/List_of_human_transcription_factors
assert len(df) == 1639, \
    'Data frame does not have expected row count 1639 '\
    '(count should match https://en.wikipedia.org/wiki/List_of_human_transcription_factors)'
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1639 entries, 0 to 2763
Data columns (total 5 columns):
extid    1639 non-null object
lbl      1639 non-null object
sym      1639 non-null object
spid     1639 non-null int64
src      1639 non-null object
dtypes: int64(1), object(4)
memory usage: 76.8+ KB


In [5]:
# Preview the first rows of the filtered Lambert table
df.head(n=5)

Unnamed: 0,extid,lbl,sym,spid,src
0,ENSG00000137203,TFAP2A,TFAP2A,1,lambert
1,ENSG00000008196,TFAP2B,TFAP2B,1,lambert
2,ENSG00000087510,TFAP2C,TFAP2C,1,lambert
3,ENSG00000008197,TFAP2D,TFAP2D,1,lambert
4,ENSG00000116819,TFAP2E,TFAP2E,1,lambert


In [7]:
# Look up aliases for the genes above through MyGene, keyed by ensembl id
# (not by symbol/name)
dfa = (
    meta.mg.getgenes(
        ids=df['extid'].unique(),
        scopes=["symbol", "retired", "name", "alias"],
        fields='symbol,name,taxid,ensembl.gene,alias', 
        as_dataframe=True
    )
    # The query term is set as the index in the results, so take the ensembl id
    # from there rather than from the `ensembl.gene` field, which is inexplicably
    # empty sometimes even when the query term is an ensembl id and all other
    # fields are valid (e.g. ENSG00000232040 --> gives symbol ZBED9 and valid
    # aliases but an empty gene id field)
    .assign(extid=lambda res: res.index)
    # NOTE(review): mygene_prep is defined outside this notebook; presumably it
    # reshapes the result into the sym/lbl layout used below -- confirm
    .pipe(mygene_prep)
    .assign(spid=SPECIES_HUMAN_ID, src=SRC)
    # Discard rows that lack either a label or a symbol
    .dropna(subset=['lbl', 'sym'])
)
assert dfa['extid'].notnull().all()
dfa.info()

querying 1-1000...

ConnectionError: HTTPConnectionPool(host='mygene.info', port=80): Max retries exceeded with url: /v3/gene/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc0da927c18>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))

In [50]:
# Stack the spreadsheet records on top of the MyGene records (restricted to the
# same columns, in the same order). The MyGene results may contain aliases that
# match the names in the lambert spreadsheet and the `src` is the same, so
# duplicates are removed, keeping the first occurrence.
dfm = (
    pd.concat([df, dfa[df.columns]])
    .drop_duplicates(subset=['sym', 'lbl', 'spid'], keep='first')
)
dfm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7378 entries, 0 to 7380
Data columns (total 5 columns):
extid    7378 non-null object
lbl      7378 non-null object
sym      7378 non-null object
spid     7378 non-null int64
src      7378 non-null object
dtypes: int64(1), object(4)
memory usage: 345.8+ KB


In [51]:
# Snapshot the Lambert + alias records under a stable name, since `df`/`dfm`
# are reused by later cells
df_lambert = dfm.copy(deep=True)

In [37]:
#df_lambert[df_lambert['sym'].str.lower().str.contains('t-bet')]

#### Load Manual Entries

In [52]:
SRC = 'manual'
# Read the hand-curated symbol/label pairs and tag them with their source;
# manual entries carry no external id
df = (
    pd.read_csv(
        osp.join(META_DATA_DIR, 'raw', 'transcription_factors.manual.csv'),
        sep=','
    )
    .assign(src=SRC, extid=None)
)
df.head()

Unnamed: 0,sym,lbl,spid,src,extid
0,RORC,RORC,1,manual,
1,RORγ,RORC,1,manual,
2,RORγt,RORC,1,manual,
3,RORγ1,RORC,1,manual,
4,RORγ2,RORC,1,manual,


In [53]:
# Snapshot the manual records under a stable name, since `df` is reassigned
# by the merge cell
df_manual = df.copy(deep=True)

#### Merge and Export

In [64]:
# Combine the Lambert and manual records into one table and drop exact duplicate rows.
# NOTE(review): `merge`, `add_preferred_ids` and `ID_TYP_TF` are defined outside this
# notebook; the result gains `id` and `prefid` columns -- confirm which helper adds which
df = merge([df_lambert, df_manual], ID_TYP_TF).drop_duplicates()
df = add_preferred_ids(df)
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7391 entries, 1072 to 6156
Data columns (total 7 columns):
id        7391 non-null object
src       7391 non-null object
sym       7391 non-null object
lbl       7391 non-null object
spid      7391 non-null int64
extid     7378 non-null object
prefid    7391 non-null object
dtypes: int64(1), object(6)
memory usage: 461.9+ KB
None


Unnamed: 0,id,src,sym,lbl,spid,extid,prefid
1072,TFC491EFF7A86A1701,lambert,AC008770.2,AC008770.2,1,ENSG00000267179,TFC491EFF7A86A1701
232,TF366ACF42CE4CCEBB,lambert,AC008770.3,AC008770.3,1,ENSG00000267179,TF366ACF42CE4CCEBB
160,TF9837F24D5ADF39CB,lambert,AC023509.3,AC023509.3,1,ENSG00000267281,TF9837F24D5ADF39CB
233,TFA91BACA6A8187757,lambert,AC092835.1,AC092835.1,1,ENSG00000233757,TFA91BACA6A8187757
234,TF25E2A8B125012DB6,lambert,AC138696.1,AC138696.1,1,ENSG00000264668,TF25E2A8B125012DB6


In [65]:
# Drop symbols that are excessively short, plus a few longer ones that are still
# too ambiguous (normally found via tagging error analysis). Manually curated
# records are exempt from the length rule.
rm_sym = [
    sym.upper()
    for sym in (
        df.loc[(df['src'] != 'manual') & (df['sym'].str.len() < MIN_TF_SYM_LEN), 'sym']
        .unique()
        .tolist()
        + ['GENESIS', 'MINOR', 'OUT']
    )
]

# Match case-insensitively so every casing of a flagged symbol is removed
mask = df['sym'].str.upper().isin(rm_sym)
# Every symbol slated for removal must actually occur in the table
assert set(df.loc[mask, 'sym'].str.upper()) == set(rm_sym)
print('Removing {} records with ambiguous symbols: {}'.format(mask.sum(), sorted(rm_sym)))
df = df.loc[~mask]
df.info()

Removing 456 records with ambiguous symbols: ['AA', 'AFX', 'AHC', 'AHR', 'AHX', 'AI4', 'AIM', 'AIO', 'AIS', 'AKA', 'AMS', 'AN', 'AN2', 'ANF', 'AP1', 'AP2', 'AP4', 'AR', 'AR1', 'AR7', 'AR8', 'ARA', 'ARX', 'B1F', 'BAR', 'BBX', 'BCH', 'BCS', 'BDE', 'BDP', 'BEN', 'BF1', 'BF2', 'BFT', 'BHC', 'BNC', 'BOM', 'BP1', 'BSX', 'BTD', 'BXR', 'BZP', 'CAA', 'CAR', 'CBF', 'CCD', 'CCF', 'CDP', 'CF5', 'CHA', 'CHN', 'CIC', 'CIZ', 'CJS', 'CPF', 'CPX', 'CRD', 'CRS', 'CRX', 'CSL', 'CSO', 'CST', 'CSX', 'CTF', 'CTM', 'CUX', 'D9', 'DB1', 'DBP', 'DGS', 'DMO', 'DOD', 'DOM', 'DP1', 'DP2', 'DP4', 'DR1', 'DSS', 'DUB', 'E2A', 'E47', 'E4F', 'EBF', 'EC2', 'EFC', 'EFG', 'EHF', 'ELP', 'EN1', 'EN2', 'EOS', 'ER', 'ERA', 'ERB', 'ERF', 'ERG', 'ERM', 'ERP', 'ERT', 'ESR', 'ESX', 'ETF', 'EZF', 'EZI', 'F11', 'FEV', 'FEZ', 'FHX', 'FIK', 'FIP', 'FND', 'FOG', 'FOS', 'FPP', 'FRA', 'FRU', 'FTF', 'FXR', 'G10', 'G13', 'G17', 'GAX', 'GBF', 'GCR', 'GEF', 'GENESIS', 'GF1', 'GLI', 'GR', 'GRL', 'GSC', 'GSF', 'GTD', 'GTX', 'GUD', 'H6', 'H6L'

In [66]:
# Distribution of symbol lengths, with everything of length 15 or more pooled
# into the final bucket
(
    df['sym']
    .str.len()
    .clip(lower=0, upper=15)
    .value_counts()
    .sort_index()
)

4     1312
5     1668
6     1466
7      524
8      170
9       94
10      65
11      43
12      33
13      19
14      45
15    1496
Name: sym, dtype: int64

In [67]:
# Spot check: list every record whose symbol contains "gfi", ignoring case
df.loc[df['sym'].str.lower().str.contains('gfi')]

Unnamed: 0,id,src,sym,lbl,spid,extid,prefid
1200,TFD6425765B0AF5303,lambert,NGFI-A,EGR1,1,ENSG00000120738,TF9C933C7F3601C428
1218,TF6AD173F0E42A506C,lambert,NGFI-C,EGR4,1,ENSG00000135625,TF193163E659BF0520
1219,TFA77682C38E2A0D4F,lambert,NGFIC,EGR4,1,ENSG00000135625,TF193163E659BF0520
265,TF1FEE043B45D9654E,lambert,GFI1,GFI1,1,ENSG00000162676,TF1FEE043B45D9654E
1242,TFE156A51136C217BC,lambert,GFI-1,GFI1,1,ENSG00000162676,TF1FEE043B45D9654E
1243,TFD513B64FD7B6A5DF,lambert,GFI1A,GFI1,1,ENSG00000162676,TF1FEE043B45D9654E
266,TF06386BF18D80F1C5,lambert,GFI1B,GFI1B,1,ENSG00000165702,TF06386BF18D80F1C5
6382,TF03C9CAC32D70E497,lambert,NGFIB,NR4A1,1,ENSG00000123358,TF139F6699C1A7EEBB


In [68]:
# Write the curated table (without the index) to
# <META_DATA_DIR>/<TRANSCRIPTION_FACTORS>.csv and display where it was written
path = osp.join(META_DATA_DIR, '{}.csv'.format(TRANSCRIPTION_FACTORS))
df.to_csv(path, index=False)
path

'/Users/eczech/repos/hammer/t-cell-relation-extraction/pm_subtype_protein_relations/data/meta/transcription_factors.csv'