In [1]:
import pronto
import os.path as osp
import re
import os
import pandas as pd
import numpy as np
%run env.py

In [2]:
# Download cl.owl from http://purl.obolibrary.org/obo/cl.owl
onto = pronto.Ontology(osp.join(os.environ['DATA_DIR'], 'meta', 'cl', 'cl.owl'))

In [8]:
# Get term for "T Cell"
# See: https://www.ebi.ac.uk/ols/ontologies/cl/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FCL_0000084
parent = onto['CL:0000084']
parent

<CL:0000084: T cell>

In [9]:
len(list(parent.rchildren()))

135

In [12]:
terms = [parent] + list(parent.rchildren())
len(terms)

136

In [24]:
pd.Series([
    s.scope
    for t in terms
    for s in t.synonyms
]).value_counts()

EXACT      447
BROAD       58
RELATED     18
NARROW       3
dtype: int64

In [29]:
def get_row(t, s, syn_typ):
    return dict(
        id=t.id, label=t.name, desc=str(t.desc),
        syn=s.desc,
        syn_typ=syn_typ if syn_typ else s.scope.lower()
    )

def get_frame(syn_fn, syn_typ=None):    
    return pd.DataFrame([
        get_row(t, s, syn_typ)
        for t in terms
        for s in syn_fn(t)
    ])
            
dfa = pd.concat([
    get_frame(lambda t: t.synonyms),
    get_frame(lambda t: [pronto.Synonym(t.name)], 'label')
])
dfa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 662 entries, 0 to 135
Data columns (total 5 columns):
desc       662 non-null object
id         662 non-null object
label      662 non-null object
syn        662 non-null object
syn_typ    662 non-null object
dtypes: object(5)
memory usage: 31.0+ KB


In [30]:
dfa['syn_typ'].value_counts()

exact      447
label      136
broad       58
related     18
narrow       3
Name: syn_typ, dtype: int64

In [32]:
cts = dfa.groupby('syn')['id'].nunique()
cts.value_counts()

1    617
2      8
3      5
7      2
Name: id, dtype: int64

In [33]:
cts[cts > 1]

syn
IEL                           7
NKT.44+.NK1.1-.Th             2
cytotoxic T cell              2
cytotoxic T lymphocyte        2
cytotoxic T-cell              2
cytotoxic T-lymphocyte        2
early cortical thymocyte      2
immature T cell               3
immature T-cell               2
intraepithelial lymphocyte    7
mature T cell                 2
suppressor T cell             3
suppressor T lymphocyte       3
suppressor T-cell             3
suppressor T-lymphocyte       3
Name: id, dtype: int64

In [35]:
# Resolve conflicting tokens using synonym type preference
SYN_TYPES = {'label': 5, 'exact': 4, 'narrow': 3, 'related': 2, 'broad': 1}

def assign_syn_typ_id(df):
    df['syn_typ_id'] = df['syn_typ'].map(SYN_TYPES)
    assert df['syn_typ_id'].notnull().all()
    return df

dfr = (
    dfa
    .pipe(assign_syn_typ_id)
    # Note that NA values will are last by default (i.e. lowest)
    .sort_values(['syn', 'syn_typ_id', 'id'], ascending=False)
    .groupby(['syn'], group_keys=False).head(1)
)
assert dfr.groupby(['syn']).size().max() == 1
dfr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 632 entries, 412 to 186
Data columns (total 6 columns):
desc          632 non-null object
id            632 non-null object
label         632 non-null object
syn           632 non-null object
syn_typ       632 non-null object
syn_typ_id    632 non-null int64
dtypes: int64(1), object(5)
memory usage: 34.6+ KB


In [40]:
# Ensure that labels and ids are mutually exclusive
assert dfr.groupby('label')['id'].nunique().max() == 1
assert dfr.groupby('id')['label'].nunique().max() == 1

In [36]:
dfr[dfr['syn'].str.contains('17')]

Unnamed: 0,desc,id,label,syn,syn_typ,syn_typ_id
119,"A CD4-positive, alpha-beta T cell that has the...",CL:0001051,"CD4-positive, CXCR3-negative, CCR6-negative, a...",non-Th1/Th17 CD4+ T cell,broad,1
268,"A CD8-positive, alpha-beta T cell that has the...",CL:0001052,"CD8-positive, CXCR3-negative, CCR6-negative, a...",non-Tc1/Tc17 CD8+ T cell,broad,1
147,"CD4-positive, alpha-beta T cell with the pheno...",CL:0000899,T-helper 17 cell,helper T cell type 17,exact,4
433,A circulating gamma-delta T cell that expresse...,CL:0002125,CD27-negative gamma-delta T cell,gammadelta-17 cells,exact,4
141,"CD4-positive, alpha-beta T cell with the pheno...",CL:0000899,T-helper 17 cell,Th17 cell,exact,4
142,"CD4-positive, alpha-beta T cell with the pheno...",CL:0000899,T-helper 17 cell,Th17 T-lymphocyte,exact,4
146,"CD4-positive, alpha-beta T cell with the pheno...",CL:0000899,T-helper 17 cell,Th17 T-cell,exact,4
144,"CD4-positive, alpha-beta T cell with the pheno...",CL:0000899,T-helper 17 cell,Th17 T lymphocyte,exact,4
140,"CD4-positive, alpha-beta T cell with the pheno...",CL:0000899,T-helper 17 cell,Th17 T cell,exact,4
138,"CD4-positive, alpha-beta T cell with the pheno...",CL:0000899,T-helper 17 cell,Th17 CD4+ T cell,exact,4


In [41]:
path = osp.join(os.environ['META_DATA_DIR'], 'raw', 'cl.raw.csv')
dfr.to_csv(path, index=False)
path

'/Users/eczech/repos/hammer/t-cell-relation-extraction/pm_subtype_protein_relations/data/meta/raw/cl.raw.csv'