In [1]:
import pronto
import os.path as osp
import re
import os
import pandas as pd
import numpy as np
%run env.py

In [2]:
# Download cl.owl from http://purl.obolibrary.org/obo/cl.owl
onto = pronto.Ontology(osp.join(os.environ['DATA_DIR'], 'meta', 'cl', 'cl.owl'))

In [8]:
# Get term for "T Cell"
# See: https://www.ebi.ac.uk/ols/ontologies/cl/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FCL_0000084
parent = onto['CL:0000084']
parent

<CL:0000084: T cell>

In [9]:
len(list(parent.rchildren()))

135

In [10]:
parent.rchildren()

[<CL:0000789: alpha-beta T cell>,
 <CL:0000798: gamma-delta T cell>,
 <CL:0002419: mature T cell>,
 <CL:0002420: immature T cell>,
 <CL:0000790: immature alpha-beta T cell>,
 <CL:0000791: mature alpha-beta T cell>,
 <CL:0000809: double-positive, alpha-beta thymocyte>,
 <CL:0000810: CD4-positive, alpha-beta thymocyte>,
 <CL:0000811: CD8-positive, alpha-beta thymocyte>,
 <CL:0000914: immature NK T cell>,
 <CL:0002427: resting double-positive thymocyte>,
 <CL:0002428: double-positive blast>,
 <CL:0002429: CD69-positive double-positive thymocyte>,
 <CL:0002430: CD4-intermediate, CD8-positive double-positive thymocyte>,
 <CL:0002431: CD4-positive, CD8-intermediate double-positive thymocyte>,
 <CL:0002432: CD24-positive, CD4 single-positive thymocyte>,
 <CL:0002433: CD69-positive, CD4-positive single-positive thymocyte>,
 <CL:0002434: CD24-positive, CD8 single-positive thymocyte>,
 <CL:0002435: CD69-positive, CD8-positive single-positive thymocyte>,
 <CL:0002039: immature NK T cell stage I>,

In [12]:
terms = [parent] + list(parent.rchildren())
len(terms)

136

In [24]:
pd.Series([
    s.scope
    for t in terms
    for s in t.synonyms
]).value_counts()

EXACT      447
BROAD       58
RELATED     18
NARROW       3
dtype: int64

In [None]:
def get_row(t, s):
    dict(
        id=t.id, label=t.name, desc=str(t.desc),
        syn=s if isinstance(s, str) else s.desc,
        syn_typ='label' if isinstance(s, str) else s.scope.lower(),
        syn_typ_v=None if isinstance(s, str) else s.scope.lower(),
    )

def get_frame(syn_fn, syn_typ):    
    return pd.DataFrame(

        for t in terms
        for s in list(t.synonyms) + [pronto.Synonym(t.name)]
    )

In [13]:
t = onto['CL:0000899']
t

<CL:0000899: T-helper 17 cell>

In [19]:
t.id, t.name, t.synonyms, str(t.desc)

('CL:0000899',
 'T-helper 17 cell',
 {<Synonym: "IL-17-producing CD4+ T helper" EXACT []>,
  <Synonym: "T helper cells type 17" EXACT []>,
  <Synonym: "T(H)-17 cell" EXACT []>,
  <Synonym: "Th17 CD4+ T cell" EXACT []>,
  <Synonym: "Th17 T cell" EXACT []>,
  <Synonym: "Th17 T lymphocyte" EXACT []>,
  <Synonym: "Th17 T-cell" EXACT []>,
  <Synonym: "Th17 T-lymphocyte" EXACT []>,
  <Synonym: "Th17 cell" EXACT []>,
  <Synonym: "helper T cell type 17" EXACT []>},
 'CD4-positive, alpha-beta T cell with the phenotype RORgamma-t-positive, CXCR3-negative, CCR6-positive, and capable of producing IL-17.')

In [17]:
s = list(t.synonyms)[0]

In [22]:
s.scope, s.syn_type, s.desc

('EXACT', None, str)

In [None]:
dict()
for t in terms
dict(
        id=cid, namespace=namespace, 
        name=c.name, label=label, syn=syn,
        category=category_map.get(cid), 
        parent=parent, ancestors=ancestors,
        syn_typ=syn_typ
    )

In [25]:
category_regex = re.compile(r'(?<=Category=)[a-z-]+(?=\.)')

def get_class_category(c):
    if c.comment:
        cats = category_regex.findall(c.comment[0])
        return cats[0] if cats else None
    return None

def get_class_id(c):
    if hasattr(c, 'id'):
        return c.id[0]
    return None

# Built map from ID -> Category to be used for faster ancestor category lookups
category_map = {
    get_class_id(c): get_class_category(c) 
    for c in onto.classes() if get_class_id(c) is not None
}
len(category_map)

316242

In [26]:
def get_row(c, label, syn, syn_typ):
    # Extract from strings like:
    # "get_ontology("http://purl.obolibrary.org/obo/pr.owl#").get_namespace("http://purl.obolibrary.org/obo/")"
    namespace = str(c.namespace).split('/')[-2]
    
    # Get ancestor ID list
    ancestors = [get_class_id(a) for a in c.ancestors() if get_class_id(a) is not None]
    
    # Assign "parent" as first ancestor in "gene" category 
    # (e.g. "interferon gamma isoform 1 (human)" gets parent "interferon gamma")
    parent = [aid for aid in ancestors if category_map.get(aid) == 'gene']
    parent = parent[0] if parent else None
    
    cid = get_class_id(c)
    
    return dict(
        id=cid, namespace=namespace, 
        name=c.name, label=label, syn=syn,
        category=category_map.get(cid), 
        parent=parent, ancestors=ancestors,
        syn_typ=syn_typ
    )

def get_frame(syn_typ, syn_fn):
    return pd.DataFrame([
        get_row(c, l, s, syn_typ)
        for c in onto.classes()
        for l in (c.label or [None])
        for s in syn_fn(c)
    ])
    
SPECIES = {'any': 3, 'human': 2, 'mouse': 1}
SYN_TYPS = {'label': 3, 'exact': 2, 'related': 1}

df = pd.concat([
    get_frame('exact', lambda c: c.hasExactSynonym or [None]),
    get_frame('related', lambda c: c.hasRelatedSynonym or [None]),
    get_frame('label', lambda c: c.label or [None])
])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1236944 entries, 0 to 316241
Data columns (total 9 columns):
ancestors    1236944 non-null object
category     1226853 non-null object
id           1236944 non-null object
label        1236944 non-null object
name         1236944 non-null object
namespace    1236944 non-null object
parent       717720 non-null object
syn          903995 non-null object
syn_typ      1236944 non-null object
dtypes: object(9)
memory usage: 94.4+ MB


In [27]:
df['category'].value_counts()

organism-gene            502458
external                 296956
organism-sequence        226870
gene                     126820
organism-modification     29120
sequence                  24836
modification              15282
family                     1630
organism-complex           1299
complex                     695
organism-seqgroup           623
organism-family             103
organism-genegroup           81
union                        80
Name: category, dtype: int64

In [28]:
df['species'] = np.where(
    df['category'].str.contains('organism'), 
    df['label'].str.strip().str.extract(r'(?<=\()([a-z ]+?)(?=\)$)', expand=False),
    None
)
# Replace unknown species classifications with "any", as these are more generic 
# ontology elements that could apply to any organism (but not necessarily all of them)
df['species'] = df['species'].fillna('any')
df['species'].value_counts()

any                    665569
human                  251844
mouse                  148564
rat                     61209
yeast                   30591
fruit fly               24521
worm                    23722
zebrafish               16528
chicken                 12221
cow                       912
pig                       303
rabbit                    261
dog                       140
frog                      139
guinea pig                129
spinach                    54
sheep                      41
rice                       35
maize                      30
opium poppy                27
wheat                      24
green monkey               16
turkey                     15
horse                      12
honeybee                   11
kidney bean                 7
horseradish                 7
cat                         6
great scarlet poppy         3
cobra                       3
Name: species, dtype: int64

In [29]:
df['namespace'].value_counts()

obo        956824
cgi-bin     77194
gene        65364
marker      50457
locus       47496
result      15375
reports     11751
view         9834
cgnc         1659
id            990
Name: namespace, dtype: int64

In [30]:
df['syn_typ'].value_counts()

exact      540266
related    380436
label      316242
Name: syn_typ, dtype: int64

In [31]:
# Create a set of all proteins with either IDs explicitly associated with target 
# organisms are that are ancestors of such proteins

def get_target_ids(df):
    return list(set(df['id'].tolist() + [v for a in df['ancestors'] for v in a]))
    
target_ids = get_target_ids(df[df['species'].isin(list(SPECIES.keys()))])
len(target_ids)

277154

In [32]:
# Filter to OBO records and subtrees relevant to target species
species_regex = '|'.join('\({}\)'.format(v) for v in SPECIES.values())
dff = (
    df
    .pipe(lambda df: df[df['namespace'] == 'obo'])
    # Restrict to species-specific sub-trees noting that a couple
    # ancestors end up being for off-target species (hence the 
    # second filter on species again)
    .pipe(lambda df: df[df['id'].isin(target_ids)])
    .pipe(lambda df: df[df['species'].isin(list(SPECIES.keys()))])
    .pipe(lambda df: df[df['syn'].notnull()])
    # Remove substrings like "(human)" from synonyms
    .assign(syn=lambda df: df['syn'].str.replace(species_regex, '').str.strip())
    .drop_duplicates(['id', 'label', 'name', 'syn', 'syn_typ', 'species'])
    .drop('ancestors', axis=1)
)
dff.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 659126 entries, 2 to 316241
Data columns (total 9 columns):
category     654656 non-null object
id           659126 non-null object
label        659126 non-null object
name         659126 non-null object
namespace    659126 non-null object
parent       516227 non-null object
syn          659126 non-null object
syn_typ      659126 non-null object
species      659126 non-null object
dtypes: object(9)
memory usage: 50.3+ MB


In [33]:
dff.groupby(['syn', 'syn_typ', 'species'])['id'].nunique().rename('n_ids').reset_index()\
    .groupby(['syn_typ', 'species', 'n_ids']).size()

syn_typ  species  n_ids
exact    any      1        165438
                  2           262
                  3             8
         human    1        119153
                  2             3
         mouse    1         72195
                  2            19
label    any      1         87178
                  2            33
                  3             2
         human    1         60960
                  2             3
         mouse    1         35556
                  2             4
related  any      1         68456
                  2          4125
                  3           358
                  4            80
                  5            24
                  6            13
                  7             4
                  8             1
                  9             2
                  10            1
                  13            5
                  15            2
         human    1         22136
                  2          1031
                  3     

In [34]:
# When a synonym exists for multiple records within same species, resolve by using
# synonym source preference (equal to label = highest, exact synonym = next, related synonym = lowest)
# * when a synonym has multiple records for the same species and synonym type, use the highest id
dfd = (
    dff
    .assign(syn_typ_id=lambda df: df['syn_typ'].map(SYN_TYPS))
    # Note that NA values will are last by default (i.e. lowest)
    .sort_values(['species', 'syn', 'syn_typ_id', 'id'], ascending=False)
    .groupby(['species', 'syn'], group_keys=False).head(1)
)
assert dfd.groupby(['species', 'syn']).size().max() == 1
dfd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 649105 entries, 537614 to 295741
Data columns (total 10 columns):
category      644657 non-null object
id            649105 non-null object
label         649105 non-null object
name          649105 non-null object
namespace     649105 non-null object
parent        508057 non-null object
syn           649105 non-null object
syn_typ       649105 non-null object
species       649105 non-null object
syn_typ_id    649105 non-null int64
dtypes: int64(1), object(9)
memory usage: 54.5+ MB


In [35]:
dfd['species'].value_counts()

any      323965
human    203233
mouse    121907
Name: species, dtype: int64

In [36]:
# Finally, resolve make synonyms unique by choosing based on species preference
# * This may not be desirable in the future as it will do something like make "IL12A"
# always point the IL12A record for humans rather than humans or mice

def assign_species_id(df):
    df['species_id'] = df['species'].map(SPECIES)
    assert df['species_id'].notnull().all()
    return df

dfe = (
    dfd
    .pipe(assign_species_id)
    # Note that NA values will are last by default (i.e. lowest)
    .sort_values(['syn', 'species_id', 'id'], ascending=False)
    .groupby(['syn'], group_keys=False).head(1)
)
assert dfe.groupby('syn').size().max() == 1
assert dfe['species_id'].notnull().all()
dfe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 629165 entries, 537614 to 295741
Data columns (total 11 columns):
category      624717 non-null object
id            629165 non-null object
label         629165 non-null object
name          629165 non-null object
namespace     629165 non-null object
parent        488344 non-null object
syn           629165 non-null object
syn_typ       629165 non-null object
species       629165 non-null object
syn_typ_id    629165 non-null int64
species_id    629165 non-null int64
dtypes: int64(2), object(9)
memory usage: 57.6+ MB


In [40]:
dfe[dfe['syn'].str.contains('^IL2')]
#dfe[dfe['syn'] == 'IL12A']
#dfe[dfe['syn'] == 'TGFB']

Unnamed: 0,category,id,label,name,namespace,parent,syn,syn_typ,species,syn_typ_id,species_id
127151,modification,PR:000018417,"cytokine receptor common subunit gamma, signal...",PR_000018417,obo,PR:000001340,IL2RG/SigPep-,exact,any,2,3
70983,gene,PR:000001340,cytokine receptor common subunit gamma,PR_000001340,obo,PR:000001340,IL2RG,exact,any,2,3
127180,modification,PR:000018446,"interleukin-2 receptor subunit beta, signal pe...",PR_000018446,obo,PR:000001381,IL2RB/SigPep-,exact,any,2,3
71197,gene,PR:000001381,interleukin-2 receptor subunit beta,PR_000001381,obo,PR:000001381,IL2RB,exact,any,2,3
127178,modification,PR:000018444,"interleukin-2 receptor subunit alpha, signal p...",PR_000018444,obo,PR:000001380,IL2RA/SigPep-,exact,any,2,3
71191,gene,PR:000001380,interleukin-2 receptor subunit alpha,PR_000001380,obo,PR:000001380,IL2RA,exact,any,2,3
69450,gene,PR:000001476,interleukin-29,PR_000001476,obo,PR:000001476,IL29,related,any,1,3
80576,gene,PR:000009003,interleukin-28 receptor alpha chain,PR_000009003,obo,PR:000009003,IL28RA,related,any,1,3
69441,gene,PR:000001470,interleukin-28B,PR_000001470,obo,PR:000001470,IL28C,related,any,1,3
69440,gene,PR:000001470,interleukin-28B,PR_000001470,obo,PR:000001470,IL28B,related,any,1,3


In [42]:
path = osp.join(os.environ['META_DATA_DIR'], 'raw', 'pro.raw.csv')
dfe.to_csv(path, index=False)
path

'/Users/eczech/repos/hammer/t-cell-relation-extraction/pm_subtype_protein_relations/data/meta/raw/pro.raw.csv'