In [1]:
import pronto
import os.path as osp
import re
import os
import pandas as pd
import numpy as np
from tcre.env import *

In [8]:
# Download pr.owl from http://purl.obolibrary.org/obo/pr.owl
# Parse the downloaded file, expected at <DATA_DIR>/meta/pro/pr.owl
# (DATA_DIR is not defined in this notebook; presumably it is provided by
# `from tcre.env import *` -- confirm)
onto = pronto.Ontology(osp.join(DATA_DIR, 'meta', 'pro', 'pr.owl'))

In [32]:
# Number of terms in the parsed ontology and the type of the container holding them
len(onto.terms), type(onto.terms)

(316242, collections.OrderedDict)

In [33]:
# Count how often each namespace value occurs across every term in the ontology
pd.Series([
    namespace
    for term in onto.terms.values()
    for namespace in term.other['namespace']
]).value_counts()

protein               216442
gene                   98908
ncbi_taxonomy            580
PSI-MOD                  120
cellular_component       115
chebi_ontology            36
molecular_function        25
sequence                   7
bfo                        5
biological_process         3
obi                        1
dtype: int64

In [44]:
# Root "Protein" term (see https://www.ebi.ac.uk/ols/ontologies/pr/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FPR_000000001)
parent = onto['PR:000000001']

# Root term first, followed by everything `rchildren()` yields for it
terms = [parent, *parent.rchildren()]
len(terms)

213413

In [45]:
# Namespace frequency restricted to the selected subtree (`terms`)
pd.Series([
    namespace
    for term in terms
    for namespace in term.other['namespace']
]).value_counts()

protein    213413
dtype: int64

In [43]:
# Check that ids are always present and always unique:
# * first element: counts of the "id is None" flag (only `False` is expected)
# * second element: the highest number of occurrences of any single id (1 means unique)
# The second series holds one id per term, so the maximum count really measures duplication.
pd.Series([t.id is None for t in terms]).value_counts(), pd.Series([t.id for t in terms]).value_counts().max()

(False    213413
 dtype: int64, 1)

In [56]:
# Show fields for an example term
# NOTE: `t` stays bound at notebook level after this cell; the later cell that
# lists `t.parents` reads it.
t = terms[230]
# NOTE(review): `t.other` is listed twice, so the same dict is displayed twice --
# confirm whether a different attribute was intended for one of the two slots.
t.id, t.name, t.desc, t.other, t.other, t.synonyms, list(t.synonyms)[0].desc

('PR:000001836',
 'CD7 molecule',
 Description('A protein that is a translation product of the human CD7 gene or a 1:1 ortholog thereof.', ['PRO:WCB']),
 {'hasExactSynonym': ['CD7',
   'GP40',
   'T-cell antigen CD7',
   'T-cell leukemia antigen',
   'T-cell surface antigen Leu-9',
   'TP41'],
  'comment': ['Category=gene. Requested by=CL.'],
  'xref': ['PIRSF:PIRSF038791'],
  'namespace': ['protein']},
 {'hasExactSynonym': ['CD7',
   'GP40',
   'T-cell antigen CD7',
   'T-cell leukemia antigen',
   'T-cell surface antigen Leu-9',
   'TP41'],
  'comment': ['Category=gene. Requested by=CL.'],
  'xref': ['PIRSF:PIRSF038791'],
  'namespace': ['protein']},
 {<Synonym: "CD7" EXACT []>,
  <Synonym: "GP40" EXACT []>,
  <Synonym: "T-cell antigen CD7" EXACT []>,
  <Synonym: "T-cell leukemia antigen" EXACT []>,
  <Synonym: "T-cell surface antigen Leu-9" EXACT []>,
  <Synonym: "TP41" EXACT []>},
 'T-cell antigen CD7')

In [49]:
# Frequency of each synonym scope across all selected terms
syn_types = pd.Series([
    synonym.scope
    for term in terms
    for synonym in term.synonyms
]).value_counts()
syn_types

EXACT      436863
RELATED    149152
BROAD        4763
NARROW       1316
dtype: int64

In [59]:
# First five example synonyms for each scope, ordered by scope
(
    pd.DataFrame(
        [
            (term.id, term.name, synonym.scope, synonym.desc)
            for term in terms
            for synonym in term.synonyms
        ],
        columns=['id', 'nm', 'sc', 'desc'],
    )
    .groupby('sc')
    .head(5)
    .sort_values('sc')
)

Unnamed: 0,id,nm,sc,desc
1,PR:000000003,HLH DNA-binding protein inhibitor,BROAD,bHLH clade D
3,PR:000000003,HLH DNA-binding protein inhibitor,BROAD,bHLH class V
25,PR:000000020,Myc protein,BROAD,bHLH class III
26,PR:000000020,Myc protein,BROAD,bHLH clade B
37,PR:000000735,staphylococcal enterotoxin A,BROAD,ET-A
2,PR:000000003,HLH DNA-binding protein inhibitor,EXACT,DNA-binding protein inhibitor ID
4,PR:000000005,TGF-beta receptor type-2,EXACT,TGF-beta receptor type II
5,PR:000000005,TGF-beta receptor type-2,EXACT,TbetaR-II
6,PR:000000005,TGF-beta receptor type-2,EXACT,TGFR-2
7,PR:000000005,TGF-beta receptor type-2,EXACT,transforming growth factor-beta receptor type II


In [48]:
# "Category=gene. Requested by=CL" --> gene
category_regex = re.compile(r'(?<=Category=)[a-z-]+(?=\.)')

def get_category(t):
    """Return the category named in the first comment of term `t`.

    Only the first entry of `t.other['comment']` is inspected. The category is
    the run of lower-case letters/hyphens that follows "Category=" and is
    immediately followed by a period.

    Returns:
        The category string, or None when the term has no comment or the
        first comment carries no such marker.
    """
    other = t.other
    if 'comment' not in other or not other['comment']:
        return None
    found = category_regex.search(other['comment'][0])
    return found.group(0) if found else None

# Lookup from term id to its extracted category (None when nothing was extracted)
category_map = dict((term.id, get_category(term)) for term in terms)
# Every term must contribute its own key, i.e. no two terms share an id
assert len(category_map) == len(terms)

In [61]:
# Ids of the parents of the example term `t`
[p.id for p in t.parents]

['PR:000000001']

In [68]:
# Synonym type priorities: a higher value is preferred when the same synonym
# string is attached to several records (used as a descending sort key below)
PRIORITY = {'label': 5, 'exact': 4, 'narrow': 3, 'related': 2, 'broad': 1}
# Target species: the keys select which species are kept, the values rank them
# when one synonym string occurs for several species (higher value preferred)
SPECIES = {'any': 3, 'human': 2, 'mouse': 1}

def get_row(t, syn, syn_typ):
    """Build one synonym record (a dict) for term `t`.

    Args:
        t: term providing `id`, `name`, `parents` and `other['namespace']`.
        syn: synonym object providing `desc` and `scope`.
        syn_typ: value to store as the synonym type; when falsy, the
            lower-cased `syn.scope` is stored instead.

    Returns:
        dict with the keys id, namespace, label, syn, category, parent,
        ancestors and syn_typ.
    """
    # NOTE(review): the list is built from `t.parents`; confirm whether that
    # covers all ancestors or only the direct parents of the term.
    ancestor_ids = [anc.id for anc in t.parents]

    # "parent" is the first listed ancestor whose category is "gene"
    # (e.g. "interferon gamma isoform 1 (human)" gets parent "interferon gamma")
    gene_parent = next(
        (aid for aid in ancestor_ids if category_map.get(aid) == 'gene'),
        None,
    )

    return {
        'id': t.id,
        'namespace': t.other['namespace'][0],
        'label': t.name,
        'syn': syn.desc,
        'category': category_map.get(t.id),
        'parent': gene_parent,
        'ancestors': ancestor_ids,
        'syn_typ': syn_typ or syn.scope.lower(),
    }

def get_frame(syn_fn, syn_typ=None):
    """Collect one record per (term, synonym) pair into a DataFrame.

    Args:
        syn_fn: callable that receives a term and returns an iterable of
            synonym objects for it.
        syn_typ: passed through to `get_row` as the synonym type override.

    Returns:
        DataFrame with one row per synonym produced by `syn_fn` over `terms`.
    """
    rows = []
    for term in terms:
        for synonym in syn_fn(term):
            rows.append(get_row(term, synonym, syn_typ))
    return pd.DataFrame(rows)
            
# Stack two sets of records: the synonyms declared on each term, and the
# term name itself wrapped as a synonym and tagged with type 'label'
dfa = pd.concat([
    get_frame(syn_fn=lambda term: term.synonyms),
    get_frame(syn_fn=lambda term: [pronto.Synonym(term.name)], syn_typ='label'),
])
dfa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 805507 entries, 0 to 213412
Data columns (total 8 columns):
ancestors    805507 non-null object
category     805409 non-null object
id           805507 non-null object
label        805507 non-null object
namespace    805507 non-null object
parent       331586 non-null object
syn          805507 non-null object
syn_typ      805507 non-null object
dtypes: object(8)
memory usage: 55.3+ MB


In [84]:
# Every column must be fully populated, apart from the two where gaps are expected:
# parent - null when the entry has no ancestor in the "gene" category
# category - null when the category regex extracts nothing
#    (not sure why such an important field is jammed into a string description rather than made into a formal property within the ontology)
for c in dfa.columns.difference(['parent', 'category']):
    assert not dfa[c].isnull().any(), f'Found null values for field {c}'

In [85]:
# Preview the first rows of the combined synonym/label table
dfa.head()

Unnamed: 0,ancestors,category,id,label,namespace,parent,syn,syn_typ,species
0,[PR:000000001],family,PR:000000003,HLH DNA-binding protein inhibitor,protein,,ID protein,related,any
1,[PR:000000001],family,PR:000000003,HLH DNA-binding protein inhibitor,protein,,bHLH clade D,broad,any
2,[PR:000000001],family,PR:000000003,HLH DNA-binding protein inhibitor,protein,,DNA-binding protein inhibitor ID,exact,any
3,[PR:000000001],family,PR:000000003,HLH DNA-binding protein inhibitor,protein,,bHLH class V,broad,any
4,[PR:000000001],gene,PR:000000005,TGF-beta receptor type-2,protein,,TGF-beta receptor type II,exact,any


In [86]:
# Number of records per synonym type
dfa['syn_typ'].value_counts()

exact      436863
label      213413
related    149152
broad        4763
narrow       1316
Name: syn_typ, dtype: int64

In [87]:
# Number of records per extracted category (null categories are not counted)
dfa['category'].value_counts()

organism-gene            476217
organism-sequence        156723
gene                     119254
organism-modification     23980
sequence                  16958
modification              10510
family                      963
organism-seqgroup           561
organism-family              85
union                        65
organism-genegroup           54
complex                      20
organism-complex             19
Name: category, dtype: int64

In [88]:
# Extract species from labels (i.e. names): "suppressor of kinetochore protein 1 (yeast)" --> yeast
# * Only rows whose category contains "organism" take the extracted value
# * The pattern captures lower-case letters/spaces enclosed in parentheses at the very end of the stripped label
# * This adds the `species` column to `dfa` in place
# NOTE(review): `str.contains` produces missing values for rows with a null category --
# confirm how `np.where` treats those rows (passing `na=False` would make it explicit)
dfa['species'] = np.where(
    dfa['category'].str.contains('organism'), 
    dfa['label'].str.strip().str.extract(r'(?<=\()([a-z ]+?)(?=\)$)', expand=False),
    None
)
# Replace unknown species classifications with "any", as these are more generic 
# ontology elements that could apply to any organism (but not necessarily all of them)
dfa['species'] = dfa['species'].fillna('any')
dfa['species'].value_counts()

any                    324251
human                  206231
mouse                  122978
rat                     52025
yeast                   29749
fruit fly               21994
worm                    20540
zebrafish               15163
chicken                 10671
cow                       797
pig                       272
rabbit                    238
dog                       125
frog                      115
guinea pig                110
spinach                    45
sheep                      40
rice                       31
opium poppy                25
maize                      25
wheat                      20
turkey                     12
green monkey               12
horse                       9
honeybee                    9
cat                         6
horseradish                 5
kidney bean                 5
great scarlet poppy         2
cobra                       2
Name: species, dtype: int64

In [89]:
# Create a set of all proteins whose IDs are either explicitly associated with target
# organisms or are ancestors of such proteins

def get_target_ids(df):
    """Return the distinct ids found in `df['id']` or in any `df['ancestors']` list.

    Args:
        df: frame with an `id` column and an `ancestors` column holding an
            iterable of ids per row.

    Returns:
        list of unique ids (in no particular order).
    """
    unique_ids = set(df['id'].tolist())
    for ancestor_ids in df['ancestors']:
        unique_ids.update(ancestor_ids)
    return list(unique_ids)
    
# Ids (plus listed ancestors) of all records belonging to one of the target species
target_ids = get_target_ids(dfa.loc[dfa['species'].isin(list(SPECIES))])
len(target_ids)

174347

In [144]:
# Build a regular expression for removing species in synonyms;
# e.g. "IL4-induced protein 1 (mouse)" --> "IL4-induced protein 1"
# * Note that the species strings are matched explicitly as there are often parenthetical elements in labels
# * The template is a raw string so that `\(` and `\)` reach the regex engine as written, instead of
#   relying on Python passing unrecognised string escapes through (which triggers a warning)
species_regex = r' \(({})\)$'.format('|'.join(dfa['species'].unique()))
species_regex

' \\((any|yeast|human|chicken|rat|mouse|zebrafish|worm|fruit fly|green monkey|frog|pig|cow|dog|sheep|maize|wheat|horse|rabbit|guinea pig|kidney bean|horseradish|turkey|cobra|great scarlet poppy|opium poppy|spinach|honeybee|rice|cat)\\)$'

In [146]:
# Filter to records and subtrees relevant to target species
dff = (
    dfa
    # Restrict to species-specific sub-trees noting that a couple
    # ancestors end up being for off-target species (hence the 
    # second filter on species again)
    .pipe(lambda df: df[df['id'].isin(target_ids)])
    .pipe(lambda df: df[df['species'].isin(list(SPECIES.keys()))])
    # Remove species in synonym strings and drop any resulting duplicates within the same term id
    # * `regex=True` is explicit because `species_regex` is a pattern; without it, pandas versions
    #   that default to literal matching would leave the species suffixes in place
    .assign(syn=lambda df: df['syn'].str.strip().str.replace(species_regex, '', regex=True).str.strip())
    .drop_duplicates(['id', 'label', 'syn', 'syn_typ', 'species'])
    .drop('ancestors', axis=1)
)
dff.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 653228 entries, 0 to 213412
Data columns (total 8 columns):
category     653130 non-null object
id           653228 non-null object
label        653228 non-null object
namespace    653228 non-null object
parent       255340 non-null object
syn          653228 non-null object
syn_typ      653228 non-null object
species      653228 non-null object
dtypes: object(8)
memory usage: 44.9+ MB


In [147]:
# Number of remaining records per target species
dff['species'].value_counts()

any      324251
human    206072
mouse    122905
Name: species, dtype: int64

In [148]:
# For each species and synonym type, count how many synonym strings are shared
# by 1, 2, 3, ... distinct term ids.
# This should be useful for deciding which synonyms are too broad to be helpful
(
    dff
    .groupby(['syn', 'syn_typ', 'species'])['id']
    .nunique()
    .rename('n_ids')
    .reset_index()
    .groupby(['species', 'syn_typ', 'n_ids'])
    .size()
    .unstack()
    .fillna(0)
    .astype(int)
)

Unnamed: 0_level_0,n_ids,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,21,23,35,39
species,syn_typ,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
any,broad,599,622,94,49,26,13,3,0,5,3,2,2,1,0,0,1,0,1,0,1
any,exact,164335,260,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
any,label,78101,23,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
any,narrow,250,6,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
any,related,68215,4111,357,81,25,12,4,1,2,1,0,0,5,0,2,0,0,0,0,0
human,broad,573,94,20,13,6,3,3,1,1,1,0,0,1,1,0,0,1,0,1,0
human,exact,118967,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
human,label,60728,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
human,narrow,961,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
human,related,21790,1063,111,18,3,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0


In [149]:
# Resolve synonyms that occur on several records of the same species by synonym
# source priority (label = highest, exact = next highest, ..., broad = lowest)
# * when candidates share species and synonym type, the highest id is kept
dfd = (
    dff
    .assign(priority=lambda frame: frame['syn_typ'].map(PRIORITY))
    # Descending sort puts the preferred record first within each (species, syn)
    # pair; NA values are placed last by default (i.e. lowest)
    .sort_values(by=['species', 'syn', 'priority', 'id'], ascending=False)
    .groupby(by=['species', 'syn'], group_keys=False)
    .head(n=1)
)
# Exactly one record may remain per (species, syn) pair
assert dfd.groupby(['species', 'syn']).size().max() == 1
dfd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 640587 entries, 444955 to 182118
Data columns (total 9 columns):
category     640489 non-null object
id           640587 non-null object
label        640587 non-null object
namespace    640587 non-null object
parent       251235 non-null object
syn          640587 non-null object
syn_typ      640587 non-null object
species      640587 non-null object
priority     640587 non-null int64
dtypes: int64(1), object(8)
memory usage: 48.9+ MB


In [153]:
# Records per species after resolving synonyms within each species
dfd['species'].value_counts()

any      314794
human    204020
mouse    121773
Name: species, dtype: int64

In [154]:
# Records per priority value; a missing priority would be reported as -1
dfd['priority'].fillna(-1).value_counts()

4    355667
5    174313
2    107475
1      1884
3      1248
Name: priority, dtype: int64

In [155]:
# Finally, make synonyms unique by choosing based on species preference
# * This may not be desirable in the future, as it will, for example, make "IL12A"
# always point to the IL12A record for humans rather than for humans or mice

def assign_species_id(df, species_map=None):
    """Return a copy of `df` with a `species_id` column mapped from `species`.

    Args:
        df: frame with a `species` column.
        species_map: mapping from species name to numeric rank; defaults to
            the notebook-level `SPECIES` mapping.

    Returns:
        A new DataFrame with the added `species_id` column. `df` itself is
        left untouched, so the frame passed in (e.g. via `.pipe`) does not
        silently gain a column as a side effect.

    Raises:
        AssertionError: if any species value has no entry in the mapping.
    """
    if species_map is None:
        species_map = SPECIES
    species_id = df['species'].map(species_map)
    assert species_id.notnull().all(), 'Found species without an assigned species_id'
    return df.assign(species_id=species_id)

dfe = (
    assign_species_id(dfd)
    # Descending sort puts the preferred species first for each synonym;
    # NA values are placed last by default (i.e. lowest)
    .sort_values(by=['syn', 'species_id', 'id'], ascending=False)
    .groupby(by=['syn'], group_keys=False)
    .head(n=1)
)
# Each synonym string must now point to exactly one record
assert dfe.groupby('syn').size().max() == 1
assert dfe['species_id'].notnull().all()
dfe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 503426 entries, 56926 to 182118
Data columns (total 10 columns):
category      503328 non-null object
id            503426 non-null object
label         503426 non-null object
namespace     503426 non-null object
parent        131835 non-null object
syn           503426 non-null object
syn_typ       503426 non-null object
species       503426 non-null object
priority      503426 non-null int64
species_id    503426 non-null int64
dtypes: int64(2), object(8)
memory usage: 42.2+ MB


In [156]:
# Spot-check the final table: records whose synonym starts with "IL4"
dfe[dfe['syn'].str.contains('^IL4')]
# Alternative spot checks (exact synonym matches), kept for manual use:
#dfe[dfe['syn'] == 'IL12A']
#dfe[dfe['syn'] == 'TGFB']

Unnamed: 0,category,id,label,namespace,parent,syn,syn_typ,species,priority,species_id
1240,gene,PR:000001866,interleukin-4 receptor subunit alpha,protein,,IL4RA,related,any,2,3
99846,modification,PR:000018552,"interleukin-4 receptor subunit alpha, signal p...",protein,PR:000001866,IL4R/SigPep-,exact,any,4,3
99847,modification,PR:000018553,interleukin-4 receptor subunit alpha proteolyt...,protein,PR:000001866,IL4R/ClvPrd,exact,any,4,3
1237,gene,PR:000001866,interleukin-4 receptor subunit alpha,protein,,IL4R,exact,any,4,3
204387,modification,PR:000020307,"L-amino-acid oxidase, signal peptide removed form",protein,PR:000009005,IL4I1/SigPep-,exact,any,4,3
25385,gene,PR:000009005,L-amino-acid oxidase,protein,,IL4I1,exact,any,4,3
571099,organism-gene,PR:Q6UX52,protein IL-40 (human),protein,,IL40,related,human,2,2
93962,modification,PR:000018452,"interleukin-4, signal peptide removed form",protein,PR:000001391,IL4/SigPep-,exact,any,4,3
25386,gene,PR:000009005,L-amino-acid oxidase,protein,,IL4-induced protein 1,exact,any,4,3
856,gene,PR:000001391,interleukin-4,protein,,IL4,exact,any,4,3


In [157]:
# Write the final synonym table as a gzip-compressed CSV
# * compression is requested explicitly so the file content always matches the ".gz"
#   suffix rather than depending on the pandas default for `compression`
path = osp.join(META_DATA_DIR, 'raw', 'pro.raw.csv.gz')
dfe.to_csv(path, index=False, compression='gzip')
path

'/lab/repos/t-cell-relation-extraction/data/meta/raw/pro.raw.csv.gz'