In [1]:
%run utils.py
%matplotlib inline
import os.path as osp
import pandas as pd
import numpy as np
import tqdm
import mygene

In [2]:
mg = mygene.MyGeneInfo()

In [3]:
df_pr = pd.read_csv(osp.join(DATA_DIR, 'pubmed_abstract_proteins_resolved.csv'))
df_pr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61175 entries, 0 to 61174
Data columns (total 8 columns):
id            61175 non-null int64
start         61175 non-null int64
end           61175 non-null int64
value         61175 non-null object
class         61175 non-null object
value_norm    61175 non-null object
value_lbl     61175 non-null object
value_sym     17228 non-null object
dtypes: int64(3), object(5)
memory usage: 3.7+ MB


In [4]:
# Use normalized raw value instead of HGNC-resolved value_lbl
qfld = 'value_norm'

## Build Queries

In [5]:
from unidecode import unidecode

def normalize_to_query(pr):
    # Normalize greeks and super/subscripts (TNFα -> TNFa, CX₃CR1 -> CX3CR1)
    # Quotes and ticks will result bad http requests
    return unidecode(pr).replace("'", "").replace('"', '')

df_query = pd.DataFrame([
    {'query': normalize_to_query(pr), qfld: pr}
    for pr in df_pr[qfld].unique()
])
df_query.head()

Unnamed: 0,query,value_norm
0,FCGRIIB,FCGRIIB
1,IL2,IL2
2,FOXP3,FOXP3
3,CD4,CD4
4,TCR,TCR


In [6]:
# Show proteins that normalized to the same query terms (which isn't a problem)
cts = df_query.groupby('query').size()
df_query[df_query['query'].isin(cts[cts > 1].index.values)].sort_values('query')

Unnamed: 0,query,value_norm
484,CD3E,CD3Ε
1087,CD3E,CD3Ɛ
2584,CX3CL1,CX₃CL1
2125,CX3CL1,CX3CL1
2586,CX3CR1,CX₃CR1
342,CX3CR1,CX3CR1
3402,DP2,DP₂
2556,DP2,DP2
3845,FOXP3,FOXP³
2,FOXP3,FOXP3


In [7]:
batch_size = 250
q_pr = df_query['query'].unique()
q_pr_n = len(q_pr)
q_pr = np.array_split(q_pr, q_pr_n//batch_size)
print(
    'Number of queries total = {}\nNumber of query batches = {}\nBatch size counts:\n{}'
    .format(q_pr_n, len(q_pr), pd.Series([len(b) for b in q_pr]).value_counts())
)

Number of queries total = 5244
Number of query batches = 20
Batch size counts:
262    16
263     4
dtype: int64


## Run Queries

In [8]:
# See full list: http://docs.mygene.info/en/latest/doc/data.html#species
SPECIES = {9606: 'human', 10090: 'mouse'}

def query(terms):
    df = mg.querymany(
        terms,
        scopes=["symbol", "retired", "name", "alias"],
        fields='symbol,name,taxid,ensembl.gene,alias', 
        species='human,mouse', 
        ensemblonly=True,
        as_dataframe=True
    )
    if 'taxid' in df:
        df['species'] = df['taxid'].apply(lambda v: None if pd.isnull(v) else SPECIES.get(int(v), 'unknown'))
    return df

df_res = []
for batch in tqdm.tqdm(q_pr):
    df_res.append(query(list(batch)))
    
df_res = pd.concat(df_res)

  0%|          | 0/20 [00:00<?, ?it/s]

querying 1-263...done.


  5%|▌         | 1/20 [00:02<00:44,  2.36s/it]

Finished.
153 input query terms found dup hits:
	[('IL2', 4), ('FOXP3', 5), ('CD4', 10), ('TCR', 4), ('SATB1', 4), ('CD14', 3), ('HLA', 10), ('PD1', 
102 input query terms found no hit:
	['FCGRIIB', 'PROGRAMMEDDEATH1', 'TFH2', 'TFH17', 'TFH1', 'LGLS', 'MHCI', 'APRCA', 'IMMUNOGLOBULING2A
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-263...done.


 10%|█         | 2/20 [00:04<00:41,  2.33s/it]

Finished.
100 input query terms found dup hits:
	[('TF', 10), ('AKT', 10), ('GARP', 10), ('CD73', 2), ('TCRa', 6), ('TCRb', 10), ('GMCSF', 2), ('INOS
145 input query terms found no hit:
	['CCTYPE5', 'WB', 'HUT102', 'IFC', 'HIV1', 'GRANU', 'GROWTHFACTOR', 'ADGM', 'TARGET6', 'ESAT6', 'RV0
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-263...done.


 15%|█▌        | 3/20 [00:08<00:49,  2.94s/it]

Finished.
101 input query terms found dup hits:
	[('SEMA3A', 2), ('REGULATOR', 10), ('CD24', 9), ('CD27', 3), ('MDA5', 2), ('STAT1', 2), ('IL27', 3),
149 input query terms found no hit:
	['RIGILIKE', 'RLR', 'CHB', 'CD4AND', 'TGM', 'COMPLEMENTCONTROLPROTEINSUPER', 'NUCLEARHORMONE', 'HIST
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-263...done.


 20%|██        | 4/20 [00:11<00:42,  2.68s/it]

Finished.
95 input query terms found dup hits:
	[('IL2RA', 2), ('SYK', 2), ('MYD88', 2), ('P38', 10), ('JNK', 2), ('SUPER', 4), ('HDAC', 4), ('HDAC6
155 input query terms found no hit:
	['IL4HS2', 'IL4CNS2', 'OROXYLINA', 'SMAD3P', 'P38P', 'JNKP', 'ERK12', 'TPATH2', 'MHCCLASS', 'IFNgAND
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-262...done.


 25%|██▌       | 5/20 [00:13<00:37,  2.52s/it]

Finished.
101 input query terms found dup hits:
	[('GONADOTROPIN', 10), ('HCG', 5), ('FOLR4', 2), ('ITGB8', 2), ('PGLYRP1', 2), ('IL1RL1', 2), ('ITGA
147 input query terms found no hit:
	['COLLAGENII', 'HUMANCHORIONICGONADOTROPIN', 'GALECTIN10', 'EOE', 'IL17ANEG', 'ASSOCIATEDINVARIANT',
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-262...done.


 30%|███       | 6/20 [00:15<00:34,  2.50s/it]

Finished.
80 input query terms found dup hits:
	[('3P', 5), ('b1', 10), ('CB', 2), ('MOG', 2), ('CD47', 2), ('GITRL', 2), ('QA1', 3), ('KLF2', 2), (
167 input query terms found no hit:
	['CD11CMHC', 'CD11CLANGERINMHC', 'TYPE1', 'CANNABINOID1AND2', 'AEA', 'RORgTFACTOR', 'GLUCOCORTICOIDI
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-262...done.


 35%|███▌      | 7/20 [00:19<00:35,  2.76s/it]

Finished.
90 input query terms found dup hits:
	[('CD134L', 2), ('TNFRSF4', 2), ('TNFSF4', 2), ('SIT', 2), ('CD152', 2), ('CCR10', 5), ('IL20', 2), 
149 input query terms found no hit:
	['OX40OX40L', 'INDOLEAMINE23DIOXYGENASE1', 'BOXPROTEINP3', 'TRECS', 'SIRT1MTOR', 'RUNTRELATEDTRANSCR
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-262...done.


 40%|████      | 8/20 [00:21<00:30,  2.58s/it]

Finished.
74 input query terms found dup hits:
	[('LTbR', 2), ('LYMPHOTOXIN', 6), ('CEBPb', 4), ('ITK', 2), ('MST1', 4), ('~92', 10), ('DRB1', 4), (
172 input query terms found no hit:
	['LYMPHOTOXINab', 'LTab', 'PHOSPHOSTAT5', 'BINDINGPROTEIN', 'IL6C', 'KINASEITK', 'IL2INDUCIBLETCELLK
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-262...done.


 45%|████▌     | 9/20 [00:23<00:27,  2.49s/it]

Finished.
99 input query terms found dup hits:
	[('NFAT1', 2), ('NFAT2', 2), ('NFATC2', 2), ('NFATC1', 2), ('P53', 10), ('KIR3DL2', 2), ('TPC', 3), 
146 input query terms found no hit:
	['IL_10', 'IL_4', 'MYELINASSOCIATEDAGS', 'NFAT1CD4', 'DIOXINLIKECOMPOUNDS', 'LATENTTGFb', 'CASPASE8'
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-262...done.


 50%|█████     | 10/20 [00:25<00:23,  2.33s/it]

Finished.
74 input query terms found dup hits:
	[('MBP', 5), ('IFNAR1', 2), ('ACC1', 3), ('MALT1', 2), ('PARACASPASE', 2), ('LECTIN', 10), ('IGF1', 
168 input query terms found no hit:
	['CLASSIBMAJORHISTOCOMPATIBILITY', 'FOXP3MRNA', 'SCCS', 'FOXP3EXPRESSION', 'ACETYLCOACARBOXYLASE1', 
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-262...done.


 55%|█████▌    | 11/20 [00:29<00:25,  2.81s/it]

Finished.
69 input query terms found dup hits:
	[('CAMK4', 2), ('TYROSINASE', 9), ('LPL', 3), ('RAG2', 2), ('IA', 9), ('CDM', 3), ('MUC4', 2), ('IRA
179 input query terms found no hit:
	['GIT2aPIX', 'CAMK4DEPENDENT', 'CREMa', 'CALCIUMCALMODULINDEPENDENTPROTEINKINASEIV', 'CAMPRESPONSEEL
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-262...done.


 60%|██████    | 12/20 [00:31<00:20,  2.57s/it]

Finished.
83 input query terms found dup hits:
	[('HSF1', 2), ('HSP70', 10), ('HBZ', 2), ('BST2', 3), ('ISG15', 2), ('OAS2', 2), ('AVP', 7), ('FMS',
167 input query terms found no hit:
	['SHOCKFACTOR1', '17AAG', 'ALLERGENSIT', 'HTLV1', 'BASICLEUCINEZIPPERFACTOR', 'AVPS', 'NTERMINALKINA
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-262...done.


 65%|██████▌   | 13/20 [00:33<00:16,  2.39s/it]

Finished.
72 input query terms found dup hits:
	[('VEGFR2', 2), ('SLAMF1', 2), ('CD205', 3), ('ISLET', 10), ('GAPDH', 3), ('ZBTB16', 2), ('POLYMORPH
178 input query terms found no hit:
	['TKI', 'VEGFRS', 'SIGAD', 'IDCM', 'CLASSIORCLASSIIHLA', 'PLATELETDERIVEDGROWTHFACTOR', 'PDGF', 'NAD
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-262...done.


 70%|███████   | 14/20 [00:35<00:13,  2.27s/it]

Finished.
69 input query terms found dup hits:
	[('CELLS', 10), ('IRX2', 2), ('MANNOSE', 10), ('TICAM1', 2), ('TRIF', 6), ('CDK6', 2), ('B7.2', 2), 
176 input query terms found no hit:
	['EAHL', 'CD8086DEPENDENT', '86', 'TCCR7', 'TNFRELATEDAPOPTOSISINDUCING', 'GZMBPROGRAMMEDDEATH12', '
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-262...done.


 75%|███████▌  | 15/20 [00:38<00:13,  2.66s/it]

Finished.
59 input query terms found dup hits:
	[('CATHELICIDIN', 2), ('VAP1', 2), ('CLIP', 7), ('PRB', 2), ('GDA', 5), ('NOS2', 6), ('PGD2', 3), ('
184 input query terms found no hit:
	['CATIONICHOSTDEFENSEPEPTIDE', 'GRANZYMEAANDB', 'CLEVER1', 'FEEL1', 'STABILIN1', 'VASCULARADHESIONPR
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-262...done.


 80%|████████  | 16/20 [00:40<00:09,  2.45s/it]

Finished.
45 input query terms found dup hits:
	[('MKI67', 10), ('MAJOR', 10), ('TIEG1', 2), ('DESMOGLEIN', 10), ('MIR223', 3), ('POLYMERASE', 10), 
202 input query terms found no hit:
	['CD4IFN', 'bIFN', 'ESO1B', 'FCRL3T', 'HLADR]', 'PROTEIN1a', 'GRANULOCYTECHEMOTACTICPROTEIN2', 'C57B
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-262...done.


 85%|████████▌ | 17/20 [00:42<00:07,  2.36s/it]

Finished.
61 input query terms found dup hits:
	[('LGALS1', 2), ('LGALS3', 2), ('SLAM', 10), ('TGFA', 4), ('PLCg2', 2), ('P28', 8), ('HY', 2), ('NFA
180 input query terms found no hit:
	['IDODCS', 'SIGNALLINGLYMPHOCYTICACTIVATIONMOLECULE', 'IFNgANDTYPE2', 'CLTA4', 'OFTH2', 'SCD30', 'CD
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-262...done.


 90%|█████████ | 18/20 [00:45<00:04,  2.25s/it]

Finished.
61 input query terms found dup hits:
	[('BMI1', 5), ('TGFb]', 10), ('VLA1', 2), ('RD6', 2), ('JANUS', 10), ('SPRED1', 2), ('RELAXIN', 10),
188 input query terms found no hit:
	['DCIL12P70', 'TYPEIANDIIIFN', 'TREGULATORYCELLTYPE', 'ANGIOGENICGROWTHFACTOR', 'HPV16E6', 'ESCC', '
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-262...done.


 95%|█████████▌| 19/20 [00:49<00:02,  2.85s/it]

Finished.
61 input query terms found dup hits:
	[('IFNb1', 2), ('MCP3', 2), ('MIP3b', 2), ('Ab', 10), ('Ab1', 4), ('PDE4B', 2), ('PDE4A', 2), ('H1R'
187 input query terms found no hit:
	['AIG', 'TXA23', 'TXA51', 'HTYR', 'MTYR', 'A2.1', 'NONSELF', 'CTLASSOCIATEDAG4', 'CD3SFV', 'ALLERGEN
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
querying 1-262...done.


100%|██████████| 20/20 [00:51<00:00,  2.57s/it]

Finished.
45 input query terms found dup hits:
	[('CARDIOLIPIN', 2), ('ID', 8), ('ABPA', 2), ('JUN', 10), ('NKRP1A', 2), ('CAK', 4), ('PHOSPHODIESTE
194 input query terms found no hit:
	['MELANOMAASSOCIATEDPROTEIN', 'OVALBUMINPEPTIDE', 'BSAB', 'THROMBOPLASTIN', 'APTT', 'P38aMITOGENACTI
Pass "returnall=True" to return complete lists of duplicate or missing query terms.





In [9]:
df_res.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10509 entries, FCGRIIB to CYA
Data columns (total 10 columns):
_id             7174 non-null object
_score          7174 non-null float64
alias           6028 non-null object
ensembl         259 non-null object
ensembl.gene    5762 non-null object
name            7174 non-null object
notfound        3335 non-null object
symbol          7174 non-null object
taxid           7174 non-null float64
species         7174 non-null object
dtypes: float64(2), object(8)
memory usage: 903.1+ KB


In [10]:
df_res.head()

Unnamed: 0_level_0,_id,_score,alias,ensembl,ensembl.gene,name,notfound,symbol,taxid,species
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
FCGRIIB,,,,,,,True,,,
IL2,3558.0,91.829506,"[IL-2, TCGF, lymphokine]",,ENSG00000109471,interleukin 2,,IL2,9606.0,human
IL2,16183.0,75.850266,Il-2,,ENSMUSG00000027720,interleukin 2,,Il2,10090.0,mouse
IL2,3702.0,12.175253,"[EMT, LPFS1, LYK, PSCTK2]",,ENSG00000113263,IL2 inducible T cell kinase,,ITK,9606.0,human
IL2,16428.0,10.497415,"[Emt, Tcsk, Tsk]",,ENSMUSG00000020395,IL2 inducible T cell kinase,,Itk,10090.0,mouse


In [11]:
df_qlkp = df_query.set_index('query')
df_qlkp.index.is_unique

False

In [12]:
df_qlkp.head()

Unnamed: 0_level_0,value_norm
query,Unnamed: 1_level_1
FCGRIIB,FCGRIIB
IL2,IL2
FOXP3,FOXP3
CD4,CD4
TCR,TCR


In [27]:
# Merge results by query string back to query frame containing original protein symbol
df_res_pr = pd.merge(
    df_res.reset_index(), 
    df_qlkp.reset_index(),
    on='query',
    how='left'
)

# Reorganize some field names
df_res_pr = df_res_pr.rename(columns={
    '_id': 'id', 
    '_score': 'score', 
    'ensembl.gene': 'ensemblgene',
    'symbol': 'sym'
})

# Remove results with no match
df_res_pr = df_res_pr[df_res_pr['id'].notnull()]
df_res_pr = df_res_pr[['id', 'score', 'sym', 'name', 'ensemblgene', 'species', qfld]]

df_res_pr['ensemblgene'] = df_res_pr['ensemblgene'].str.strip()

# Ensure that remaining records all have a reference to pubmed protein name
assert df_res_pr[qfld].notnull().all()

df_res_pr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7202 entries, 1 to 10538
Data columns (total 7 columns):
id             7202 non-null object
score          7202 non-null float64
sym            7202 non-null object
name           7202 non-null object
ensemblgene    5788 non-null object
species        7202 non-null object
value_norm     7202 non-null object
dtypes: float64(1), object(6)
memory usage: 450.1+ KB


In [28]:
df_res_pr.head()

Unnamed: 0,id,score,sym,name,ensemblgene,species,value_norm
1,3558,91.829506,IL2,interleukin 2,ENSG00000109471,human,IL2
2,16183,75.850266,Il2,interleukin 2,ENSMUSG00000027720,mouse,IL2
3,3702,12.175253,ITK,IL2 inducible T cell kinase,ENSG00000113263,human,IL2
4,16428,10.497415,Itk,IL2 inducible T cell kinase,ENSMUSG00000020395,mouse,IL2
5,50943,89.72368,FOXP3,forkhead box P3,ENSG00000049768,human,FOXP3


In [29]:
# Validate that all species results were resolved
assert df_res_pr['species'].isin(SPECIES.values()).all()
df_res_pr['species'].value_counts()

human    4306
mouse    2896
Name: species, dtype: int64

## Add transcription factor flag

In [31]:
# This is small enough that it doesn't need a separate annotation notebook
df_tf = pd.read_csv(osp.join(DATA_DIR, 'human_tfs.csv'))
df_tf['ensemblid'] = df_tf['ensemblid'].str.strip()
df_res_pr['is_tf'] = df_res_pr['ensemblgene'].isin(df_tf['ensemblid'].unique())
df_res_pr['is_tf'].value_counts()

False    6849
True      353
Name: is_tf, dtype: int64

## Export

In [35]:
# For consistency with other annotation notebooks, prefix field originating
# from pubmed protein table
df_exp = df_res_pr.rename(columns={qfld: 'pr_' + qfld})
df_exp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7202 entries, 1 to 10538
Data columns (total 8 columns):
id               7202 non-null object
score            7202 non-null float64
sym              7202 non-null object
name             7202 non-null object
ensemblgene      5788 non-null object
species          7202 non-null object
pr_value_norm    7202 non-null object
is_tf            7202 non-null bool
dtypes: bool(1), float64(1), object(6)
memory usage: 457.2+ KB


In [33]:
path = osp.join(DATA_DIR, 'pubmed_mygene_protein_annotations.csv')
df_res_pr.to_csv(path, index=False)
path

'data/pubmed_mygene_protein_annotations.csv'