In [1]:
import pronto
import os.path as osp
import re
import os
import pandas as pd
import numpy as np
from tcre.env import *

In [2]:
# The Cell Ontology file has to be fetched beforehand from
# http://purl.obolibrary.org/obo/cl.owl and stored at $DATA_DIR/meta/cl/cl.owl
cl_owl_path = osp.join(os.environ['DATA_DIR'], 'meta', 'cl', 'cl.owl')
onto = pronto.Ontology(cl_owl_path)

In [3]:
# Root terms used below: "T cell" (CL:0000084) and "natural killer cell" (CL:0000623); see:
# https://www.ebi.ac.uk/ols/ontologies/cl/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FCL_0000084
# https://www.ebi.ac.uk/ols/ontologies/cl/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FCL_0000623
parents = [onto[term_id] for term_id in ('CL:0000084', 'CL:0000623')]
parents

[<CL:0000084: T cell>, <CL:0000623: natural killer cell>]

In [4]:
def _dist(terms, target, ct, max_dist):
    """Walk up the parent links level by level until ``target`` is found.

    Args:
        terms: Terms forming the current level (objects exposing ``id`` and ``parents``).
        target: Term to search for; compared by ``id``.
        ct: Distance already travelled to reach ``terms``.
        max_dist: Largest distance that may be returned.

    Returns:
        The distance at which ``target`` is first encountered, or None if it is
        not reachable through parent links within ``max_dist``.
    """
    frontier = list(terms)
    # Track visited ids so terms reachable through several paths are expanded once;
    # this keeps the frontier small and guarantees termination even on a cycle
    seen = {t.id for t in frontier}
    while frontier:
        if any(t.id == target.id for t in frontier):
            return ct
        # Stop *before* expanding past the limit so no distance > max_dist is returned
        if ct >= max_dist:
            return None
        next_level = []
        for t in frontier:
            for p in t.parents:
                if p.id not in seen:
                    seen.add(p.id)
                    next_level.append(p)
        frontier = next_level
        ct += 1
    return None

def distance_to(term, target, max_dist=100):
    """Get difference in depth between a term and a target term.

    The distance is the smallest number of parent hops needed to get from
    ``term`` up to ``target`` (0 when they are the same term).

    Args:
        term: Starting term.
        target: Ancestor term to measure the distance to.
        max_dist: Maximum number of hops to search.

    Returns:
        The hop count, or None if ``target`` is not an ancestor of ``term``
        within ``max_dist`` hops (i.e. not in the same subtree).
    """
    return _dist([term], target, 0, max_dist)

In [5]:
# Count the terms yielded by rchildren() for each root term
# (presumably all recursive descendants -- confirm against the pronto docs)
[len(list(t.rchildren())) for t in parents]

[135, 27]

In [6]:
# Pair every term with the root it was collected under; each root is listed
# first as a member of its own group
terms = [
    (term, root)
    for root in parents
    for term in [root, *root.rchildren()]
]
len(terms)

164

In [7]:
# Tally the scope of every synonym attached to the collected terms
synonym_scopes = [syn.scope for term, _ in terms for syn in term.synonyms]
pd.Series(synonym_scopes).value_counts()

EXACT      481
BROAD       63
RELATED     21
NARROW       3
dtype: int64

In [8]:
def get_row(t, p, s, syn_typ):
    """Build one record for a (term, root term, synonym) combination.

    Args:
        t: Term the synonym belongs to.
        p: Root term that ``t`` was collected under.
        s: Synonym object; its ``desc`` is used as the synonym text.
        syn_typ: Explicit synonym type; when falsy, the lower-cased
            ``s.scope`` is used instead.

    Returns:
        dict with keys id, root, label, desc, depth, syn and syn_typ.
    """
    row = {
        'id': t.id,
        'root': p.id,
        'label': t.name,
        'desc': str(t.desc),
        # Relative distance between the term and its root term
        'depth': distance_to(t, p),
        'syn': s.desc,
    }
    if syn_typ:
        row['syn_typ'] = syn_typ
    else:
        row['syn_typ'] = s.scope.lower()
    return row

def get_frame(syn_fn, syn_typ=None):
    """Assemble a DataFrame with one row per synonym of every collected term.

    Args:
        syn_fn: Callable taking a term and returning an iterable of synonyms.
        syn_typ: Optional synonym type forced onto every row (passed to ``get_row``).

    Returns:
        pandas.DataFrame built from the ``get_row`` records, iterating the
        module-level ``terms`` list of (term, root) pairs.
    """
    rows = []
    for term, root in terms:
        for syn in syn_fn(term):
            rows.append(get_row(term, root, syn, syn_typ))
    return pd.DataFrame(rows)
            
# Rows for the declared synonyms, followed by one extra row per term that
# treats the primary label itself as a synonym of type 'label'
synonym_rows = get_frame(lambda term: term.synonyms)
label_rows = get_frame(lambda term: [pronto.Synonym(term.name)], syn_typ='label')
dfa = pd.concat([synonym_rows, label_rows])
dfa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 732 entries, 0 to 163
Data columns (total 7 columns):
depth      732 non-null int64
desc       732 non-null object
id         732 non-null object
label      732 non-null object
root       732 non-null object
syn        732 non-null object
syn_typ    732 non-null object
dtypes: int64(1), object(6)
memory usage: 45.8+ KB


In [9]:
# Distribution of rows by distance between the term and its root term
dfa['depth'].value_counts()

4    331
3    184
5    105
2     76
1     26
0     10
Name: depth, dtype: int64

In [10]:
# Distribution of rows by synonym type (ontology scope, or 'label' for primary names)
dfa['syn_typ'].value_counts()

exact      481
label      164
broad       63
related     21
narrow       3
Name: syn_typ, dtype: int64

In [11]:
# Number of rows collected under each root term
dfa['root'].value_counts()

CL:0000084    662
CL:0000623     70
Name: root, dtype: int64

In [12]:
# For each synonym string, count how many distinct term ids it is attached to,
# then show how often each count occurs
cts = dfa.groupby('syn')['id'].nunique()
cts.value_counts()

1    684
2      8
3      6
7      2
Name: id, dtype: int64

In [13]:
# Synonym strings that are shared by more than one term id
cts[cts > 1]

syn
IEL                           7
NKT.44+.NK1.1-.Th             2
cytotoxic T cell              2
cytotoxic T lymphocyte        2
cytotoxic T-cell              2
cytotoxic T-lymphocyte        2
early cortical thymocyte      2
immature T cell               3
immature T-cell               2
intraepithelial lymphocyte    7
mature T cell                 2
p-NK                          3
suppressor T cell             3
suppressor T lymphocyte       3
suppressor T-cell             3
suppressor T-lymphocyte       3
Name: id, dtype: int64

In [14]:
# Resolve conflicting tokens using synonym type preference (higher value is preferred)
SYN_TYPES = {'label': 5, 'exact': 4, 'narrow': 3, 'related': 2, 'broad': 1}

def assign_syn_typ_id(df):
    """Return a copy of ``df`` with a numeric ``syn_typ_id`` priority column.

    The input frame is left untouched so that re-running the cell (or inspecting
    the source frame afterwards) does not depend on hidden in-place mutation.

    Args:
        df: DataFrame with a ``syn_typ`` column whose values are keys of ``SYN_TYPES``.

    Returns:
        New DataFrame with ``syn_typ_id`` set to the ``SYN_TYPES`` value for each row.

    Raises:
        AssertionError: If any ``syn_typ`` value has no entry in ``SYN_TYPES``.
    """
    syn_typ_id = df['syn_typ'].map(SYN_TYPES)
    unknown = df.loc[syn_typ_id.isnull(), 'syn_typ'].unique().tolist()
    assert not unknown, 'Unrecognized synonym types: {}'.format(unknown)
    return df.assign(syn_typ_id=syn_typ_id)

# Rank rows so that, within each synonym string, the most preferred synonym
# type comes first and the largest term id breaks remaining ties
ranked = dfa.pipe(assign_syn_typ_id).sort_values(
    by=['syn', 'syn_typ_id', 'id'],
    # Note that NA values are placed last by default (i.e. lowest)
    ascending=False,
)
# Keep only the top-ranked row for every synonym string
dfr = ranked.groupby('syn', group_keys=False).head(1)
assert dfr.groupby('syn').size().max() == 1
dfr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 700 entries, 413 to 186
Data columns (total 8 columns):
depth         700 non-null int64
desc          700 non-null object
id            700 non-null object
label         700 non-null object
root          700 non-null object
syn           700 non-null object
syn_typ       700 non-null object
syn_typ_id    700 non-null int64
dtypes: int64(2), object(6)
memory usage: 49.2+ KB


In [15]:
# Ensure that labels and ids map one-to-one: every label has a single id
# and every id has a single label
assert all(
    dfr.groupby(key)[value].nunique().max() == 1
    for key, value in [('label', 'id'), ('id', 'label')]
)

In [16]:
# Spot check: resolved rows whose synonym text contains "17"
dfr[dfr['syn'].str.contains('17')]

Unnamed: 0,depth,desc,id,label,root,syn,syn_typ,syn_typ_id
118,4,"A CD4-positive, alpha-beta T cell that has the...",CL:0001051,"CD4-positive, CXCR3-negative, CCR6-negative, a...",CL:0000084,non-Th1/Th17 CD4+ T cell,broad,1
268,4,"A CD8-positive, alpha-beta T cell that has the...",CL:0001052,"CD8-positive, CXCR3-negative, CCR6-negative, a...",CL:0000084,non-Tc1/Tc17 CD8+ T cell,broad,1
145,5,"CD4-positive, alpha-beta T cell with the pheno...",CL:0000899,T-helper 17 cell,CL:0000084,helper T cell type 17,exact,4
433,3,A circulating gamma-delta T cell that expresse...,CL:0002125,CD27-negative gamma-delta T cell,CL:0000084,gammadelta-17 cells,exact,4
140,5,"CD4-positive, alpha-beta T cell with the pheno...",CL:0000899,T-helper 17 cell,CL:0000084,Th17 cell,exact,4
144,5,"CD4-positive, alpha-beta T cell with the pheno...",CL:0000899,T-helper 17 cell,CL:0000084,Th17 T-lymphocyte,exact,4
143,5,"CD4-positive, alpha-beta T cell with the pheno...",CL:0000899,T-helper 17 cell,CL:0000084,Th17 T-cell,exact,4
147,5,"CD4-positive, alpha-beta T cell with the pheno...",CL:0000899,T-helper 17 cell,CL:0000084,Th17 T lymphocyte,exact,4
142,5,"CD4-positive, alpha-beta T cell with the pheno...",CL:0000899,T-helper 17 cell,CL:0000084,Th17 T cell,exact,4
146,5,"CD4-positive, alpha-beta T cell with the pheno...",CL:0000899,T-helper 17 cell,CL:0000084,Th17 CD4+ T cell,exact,4


In [17]:
# Spot check: resolved rows under the natural killer cell root (CL:0000623)
# whose synonym text contains "NK"
dfr[dfr['syn'].str.contains('NK') & (dfr['root'] == 'CL:0000623')]

Unnamed: 0,depth,desc,id,label,root,syn,syn_typ,syn_typ_id
537,2,An immature natural killer cell that is NK1.1-...,CL:0002345,"CD27-low, CD11b-low immature natural killer cell",CL:0000623,preNK.BM,broad,1
534,1,Cell committed to natural killer cell lineage ...,CL:0000937,pre-natural killer cell,CL:0000623,pre-NK cell,exact,4
536,2,A natural killer cell that is developmentally ...,CL:0002344,"CD56-negative, CD161-positive immature natural...",CL:0000623,p-NK,related,2
532,1,A natural killer cell that is developmentally ...,CL:0000824,mature natural killer cell,CL:0000623,mature NK cell,exact,4
529,1,A natural killer cell that is developmentally ...,CL:0000823,immature natural killer cell,CL:0000623,immature NK cell,exact,4
551,2,A natural killer cell subset that is found in ...,CL:0002343,decidual natural killer cell,CL:0000623,decidual NK cell,exact,4
550,2,A natural killer cell subset that is found in ...,CL:0002343,decidual natural killer cell,CL:0000623,dNK cell,exact,4
547,2,A mature natural killer cell that has the phen...,CL:0000939,"CD16-positive, CD56-dim natural killer cell",CL:0000623,cytotoxic CD56-dim NK cell,exact,4
150,3,A NK1.1-positive T cell that is NKGA2-positive.,CL:0002439,NKGA2-positive natural killer cell,CL:0000623,NKGA2-positive natural killer cell,label,5
148,2,A mature NK cell that is NK1.1-positive.,CL:0002438,NK1.1-positive natural killer cell,CL:0000623,NK1.1-positive natural killer cell,label,5


### Export

In [18]:
# Write the resolved synonym table as CSV; the target directory is created
# first so the export does not fail when the 'raw' folder is missing
path = osp.join(META_DATA_DIR, 'raw', 'cl.raw.csv')
os.makedirs(osp.dirname(path), exist_ok=True)
dfr.to_csv(path, index=False)
path

'/lab/repos/t-cell-relation-extraction/data/meta/raw/cl.raw.csv'