In [3]:
%run utils.py
%matplotlib inline
import os.path as osp
import re
import tqdm
import spacy
import pandas as pd
nlp = spacy.load("en_scispacy_core_web_sm")
fix_jupyter_spacy_config()

In [4]:
output_file = 'pubmed_abstracts.csv'
df = pd.read_csv(osp.join(DATA_DIR, output_file))
df = df[df['abstract'].notnull() & df['title'].notnull()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46580 entries, 0 to 52173
Data columns (total 5 columns):
abstract    46580 non-null object
date        17852 non-null object
id          46580 non-null int64
terms       46580 non-null object
title       46580 non-null object
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


In [37]:
# *Potentially useful function for recursive subtree search
# def find_node(tokens, p):
#     for token in tokens:
#         if p(token):
#             return token
#     else:
#         children = [t for token in tokens for t in list(token.children)]
#         if not children:
#             return None
#         return find_node(children, p)

def prep(text):
    text = re.sub('\\<(sup|sub)\\>([a-zA-Z+-]*)\\</(sup|sub)>', '\\2 ', text)
    text = re.sub('[tT]\\-([cC]ell[s]*|[lL]ymphocyte[s]*)', 'T \\1', text)
    return text

def clean(term):
    term = term.strip()
    if term.endswith('s'):
        term = term[:-1]
    term = re.sub('[\\(\\)\\-\\+]', '', term).upper().strip()
    term = term.replace('CELLS', '')
    term = term.replace('CELL', '')
    return term

def get_t_types(text, details=False):
    doc = nlp(prep(text))
    res = []
    for token in doc:
        if token.text.startswith('T') or token.text.endswith('T'): 
            if token.dep_ in ['appos', 'amod', 'conj', 'nmod', 'nounmod']:
                if token.head.lemma_ in ['cell', 'lymphocyte']:
                    type_text = clean(token.text)
                    if type_text != 'T':
                        res.append((type_text, token.idx, len(token.text)))
    df = pd.DataFrame(res, columns=['type', 'offset', 'len'])
    if details:
        return df
    cts = df['type'].value_counts()
    cts.index.name = 'type'
    cts.name = 'count'
    return cts.reset_index()

In [38]:
#token = list(nlp(prep('hi there person')))[2]
#get_t_types('A cell (TRM)', details=True)

In [49]:
#text = df.iloc[1]['abstract']
#text = prep('Mucosal associated (MAIT) cells')
#text = df[df['id'] == 12634786].iloc[0]['abstract']
text = "Tissue-resident memory T (Trm) cells form a heterogeneous population that provides localized protection against pathogens. Here, we identify CD49a as a marker that differentiates CD8+ Trm cells on a compartmental and functional basis. In human skin epithelia, CD8+CD49a+ Trm cells produced interferon-γ, whereas CD8+CD49a- Trm cells produced interleukin-17 (IL-17)"
#print(prep(text))
print(text)
get_t_types(text, details=True)

Tissue-resident memory T (Trm) cells form a heterogeneous population that provides localized protection against pathogens. Here, we identify CD49a as a marker that differentiates CD8+ Trm cells on a compartmental and functional basis. In human skin epithelia, CD8+CD49a+ Trm cells produced interferon-γ, whereas CD8+CD49a- Trm cells produced interleukin-17 (IL-17)


Unnamed: 0,type,offset,len
0,TRM,26,3


In [12]:
len(df)

46580

In [14]:
dfcts = pd.concat([
    get_t_types(prep(r['title'] + '.  ' + r['abstract'])).assign(id=r['id'])
    #for i, r in df.loc[df['abstract'].notnull()].sample(5000, random_state=1).iterrows()
    for i, r in tqdm.tqdm(df.iterrows())
])
dfcts = dfcts.reset_index(drop=True)
dfcts.head()

46580it [31:21, 24.76it/s]


Unnamed: 0,type,count,id
0,TH,1,30456753
1,TREG,4,30560927
2,T,1,30560927
3,T,1,30325558
4,TFH,1,30325558


In [68]:
dfcts['type'].value_counts().sort_values().tail(50)

TRESP                  11
TC17                   12
TH1TYPE                12
TGFBETA                14
TH1POLARIZED           14
TSUPPRESSOR            14
TH1LIKE                14
TOTAL                  15
TAL                    16
TCON                   16
TREG/TH17              17
TGFΒ                   17
TRM                    17
TAXSPECIFIC            17
T3                     18
TBET                   19
TCONV                  22
TCM                    23
TOLEROGENIC            25
TH9                    27
ΓΔT                    29
NONT                   30
TUMORINFILTRATING      30
TFR                    33
TH22                   34
TC1                    34
TEM                    36
CD4T                   43
TCR                    45
TYPE                   45
TREGULATORY            49
TREGS                  52
TC                     55
INKT                   56
T8                     61
TR                     62
T4                     67
TEFF                   79
TR1         

In [None]:
# TC17 - CD8(+)T cells producing IL17 (markers: CD161)
# TC1 - CD8(+)T cells producing IFN-gamma
# TC2 - CD8(+)T cells producing IL-4 and IL-5
# TH{0,1,2,17,22}
# TH{0,1,2,17}LIKE 
# TRM - Tissue resident memory cell (markers: CD69)
# TFR - Follicular regulatory T cells (markers: CXCR5, Bcl-6, PD-1, and ICOS)
# TSCM (TSCS) - T stem cell
# TH3 - TGF-beta negative CD4 cells
# TR1 - T-reg type 1: able to secrete IL-10 in response to antigen (markers: CD49b, TIM-3, PD-1, TIGIT, LAG-3, and ICOS)
# TCS1 - Gamma/delta subtype (TCS1); may see increase after transplants
# THP - T-helper precursor cells (related: IL-2, 4, 12, 13, CD40/CD40L)
# TLAK - IL-2 activated t cells (markers: CD16)
# TCM / TM 
# TN / 
# TEFFS / TEFF / TE
# TEMRA 
# TCRGAMMADELTA / TCRGAMMA
# Subsets of GD cells: 
# main difference: Vδ1 vs Vδ2
# Vδ1 γδ-T cells with different Vγ elements account for the majority of mucosal-associated lymphoid tissue γδ-T cells
# Vδ2γ9-T vs In contrast, γδ-T cells bearing the Vδ2 gene with the co-expression of the Vγ9 chain (Vγ9Vδ2-T cells) are abundant in the peripheral blood and lymphoid organs of most healthy individuals
# 

# OKT - Ortho-Kung T cell (just an old broad term for lyphocytes)
# PBT - Peripheral blood T cells
# TRESP - Responder T cells (seems to be anything CD25-)
# TREST?
# TCON


# Currently there are six subtypes of Tregs including CD4+CD25+ naturally occurring (N-Tregs), 
# inducible naïve CD4+CD25- T cells (TR1), TR1 memory phenotype, T-helper type 3 (TH3), 
# CD4-CD25+DX5+ natural killer T cells (TRNKT), and CD4-CD25+CD8+ cytotoxic T cells (TRCTC)

In [210]:
# Remove "TYPE", split on '/' and match pieces
m_typ = {
    'TN': (['TN'], 'Naive', 2),
    'TEMRA': (['TEMRA'], 'EMRA', 2),
    'TCM': (['TCM'], 'Central Memory', 2),
    'TEFF' : (['TEFF', 'TEFFS', 'TEFFECTORS'], 'Effector', 2),
    'TEM': (['TEM', 'TEMS'], 'Effector Memory', 2),
    'TC0': (['TC0'], 'Tc0', 1),
    'TC1': (['TC1'], 'Tc1', 1),
    'TC2': (['TC2'], 'Tc2', 1),
    'TC9': (['TC9'], 'Tc9', 1),
    'TC17': (['TC17'], 'Tc17', 1),
    'TH': (['TH', 'THELPER'], 'Th', 2),
    'THP': (['THP'], 'Th-Precursor', 1),
    'TH0': (['TH0'], 'Th0', 1),
    'TH1': (['TH1'], 'Th1', 1),
    'TH2': (['TH2'], 'Th2', 1),
    'TH3': (['TH3'], 'Th3', 1),
    'TH9': (['TH9'], 'Th9', 1),
    'TH17': (['TH17'], 'Th17', 1),
    'TH22': (['TH22'], 'Th22', 1),
    'TFH': (['TFH', 'TFHLIKE'], 'Tfh', 2),
    'TFREG': (['TFREG', 'TFR', 'TFRLIKE'], 'Follicular Treg', 1),
    'TFH0LIKE': (['TH0LIKE', 'THELPER0LIKE'], 'Th0-like Tfh', 1),
    'TFH1LIKE': (['TH1LIKE', 'THELPER1LIKE'], 'Th1-like Tfh', 1),
    'TFH2LIKE': (['TH2LIKE', 'THELPER2LIKE'], 'Th2-like Tfh', 1),
    'TFH17LIKE': (['TH17LIKE', 'THELPER17LIKE'], 'Th17-like Tfh', 1),
    'TRM': (['TRM', 'TTRM'], 'Tissue Resident Memory', 1),
    'TSCM': (['TSCM'], 'Stem Memory', 1),
    'TREG': (['TREG', 'TREGS', 'TREGULATORY'], 'Treg', 2),
    # This is an old term for Tregs
    'TSUPP': (['TSUPPRESSOR', 'TS/C', 'TSUPPRESSOR/CYTOTOXIC', 'TSUPPRESSORCYTOTOXIC', 'TS/TC'], 'T Suppresor', 2),
    'TREG1': (['TREG1', 'TR1', 'TR1POLARIZED'], 'Type 1 Treg', 1), # Inducible naive
    'TREG17': (['TREG17', 'TREG/TH17', 'TREGS/TH17', 'TREGSTH17', 'TREGTH17'], 'Type 17 Treg', 1),
    'NKT': (['NKT', 'NK/T', 'INKT', 'TRNKT', 'CNKT'], 'Natural Killer T', 1),
    'MAIT': (['MAIT'], 'Mucosal-associated Invariant T', 1),
    'γδT': ([
        'TCRGAMMADELTA', 'TGAMMADELTA', 'TCRGAMMA', 'TGAMMA', 
        'TCRDELTA', 'TDELTA', 'TCRGAMMADELTABEARING', 'ΓΔT', 'GAMMA/DELTAT'], 'γδT', 2),
    'γδT-Vδ1': (['VDELTA1T', 'VΔ1T', 'TCRVΔ1', 'TCRVΔ1'], 'γδT-Vδ1', 1),
    'γδT-Vδ2': (['VDELTA2T', 'VΔ2T', 'TCRVΔ2', 'TCRVΔ2'], 'γδT-Vδ2', 1),
    'γδT-Vγ9': (['TCRVΓ9', 'TCRVGAMMA9'], 'γδT-Vγ9', 1),
    'γδT-Vγ9Vδ2': (['VΓ9VΔ2T', 'VΔ2T', 'TCRVΔ2', 'TCRVΔ2'], 'γδT-Vγ9Vδ2', 1),
    'γδT-17': (['TCRΓΔ17', 'TGAMMADELTA17'], 'γδT17', 1),
    'γδT-TCS1': (['TCS1'], 'γδT-TCS1', 1),
    
    # Broader classifications
    'TRESP': (['TRESP', 'TRESPS'], 'Responsive T Cell', 3),
    'TREST': (['TREST', 'TRESTING'], 'Resting T Cell', 3),
    'OKT': (['OKT', 'OKT8T'], 'Ortho-Kung T Cell', 3),
    'TCONV': (['TCONV', 'TCON'], 'Non-regulatory T Cell', 3),
    'TLAK': (['TLAK', 'LAKT', 'TCK'], 'Cytokine Activated T Cell', 3)
}

In [211]:
dfcts[dfcts['id'] == 30119214]

Unnamed: 0,type,count,id
28,TREG,1,30119214


In [212]:
dfcts[dfcts['type'].str.contains('TH')]['type'].value_counts().head(50).tail(50)
#dfcts[dfcts['type'].str.contains('TH') & dfcts['type'].str.contains('LIKE')]['type'].value_counts().head(50).tail(50)
#dfcts[dfcts['type'].str.contains('Γ') | dfcts['type'].str.contains('Δ') | dfcts['type'].str.contains('GAMMA') | dfcts['type'].str.contains('DELTA')]['type'].value_counts().head(50).tail(50)

TH1                  561
TH17                 533
TH                   508
TH2                  300
THELPER              232
TH22                  34
TH9                   27
TREG/TH17             17
TH1POLARIZED          14
TH1LIKE               14
TH1TYPE               12
TH0                   11
TH3                    9
TH2TYPE                6
THYMIC                 6
TH1/TH17               5
TH2LIKE                5
TH1/TH2                5
THELPER1               5
TH17/TREG              5
THP                    4
TH0LIKE                4
THELPER/INDUCER        4
TH1/TH2/TH17           4
TH17POLARIZED          3
TH17/TH1               3
TH2SKEWED              3
TH17DERIVED            3
TH0/TH2                3
THELPER17              3
TH2POLARIZED           3
THYMUSDERIVED          3
THEOPHYLLINE           2
THC                    2
TH/I                   2
TREGS/TH17             2
TH2INDUCING            2
TH1SUBSET              1
TH1EFFECTOR            1
TH1/TH17POLARIZED      1


In [172]:
#df[df['id'].isin(dfcts[dfcts['type'] == 'TS/C']['id'])]['id'].iloc[0]

In [200]:
df[df['id'].isin(dfcts[dfcts['type'] == 'TH']['id'])]['abstract'].iloc[1]

'Dasatinib induces lymphocytosis of large granular lymphocytes (LGLs) in a proportion of patients with chronic myelogenous leukemia (CML), and is associated with better clinical outcomes. LGLs consist of cytotoxic T lymphocytes and natural killer cells; however, the context and phenotypic/functional features of each type of LGL are unknown. To better define features of these LGLs, we investigated lymphocytosis in CML patients treated with dasatinib. D57-positive and CD4-positive type I T-helper (Th) cells (CD57+\u2009Th cells) rarely occur in CML patients without lymphocytosis and in healthy individuals; however, a substantial increase in the proportion of CD57+\u2009Th cells was observed in CML patients treated with dasatinib. In addition, these cells showed appreciable levels of cytocidal activity via cytotoxic degranulation. Analysis of T-cell receptor α and β sequences showed a skewed T-cell repertoire in the CD57+\u2009Th cells. Furthermore, patients with LGLs and CD57+\u2009Th ly

## Type Normalization

In [201]:
dfcts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11524 entries, 0 to 11523
Data columns (total 3 columns):
type     11524 non-null object
count    11524 non-null int64
id       11524 non-null int64
dtypes: int64(2), object(1)
memory usage: 270.2+ KB


In [205]:
typ_uniq = dfcts['type'].unique()
m_typ_norm = {}

def get_match(typ):
    for k, v in m_typ.items():
        cands = v[0]
        if typ in cands:
            return k
    return None

for typ_ent in typ_uniq:
    typ = typ_ent.replace('TYPE', '')
    match = get_match(typ)
    if match is None:
        for t in typ.split('/'):
            match = get_match(t)
            if match is not None:
                break
    if match is not None:
        cands, nm, lvl = m_typ[match]
        m_typ_norm[typ_ent] = (match, nm, lvl)
    else:
        m_typ_norm[typ_ent] = (None, None, None)

In [207]:
dfr = dfcts.copy()
dfr['type_key'] = [e[0] for e in dfr['type'].map(m_typ_norm)]
dfr['type_lbl'] = [e[1] for e in dfr['type'].map(m_typ_norm)]
dfr['type_lvl'] = [e[2] for e in dfr['type'].map(m_typ_norm)]
dfr.head(15)

Unnamed: 0,type,count,id,type_key,type_lbl,type_lvl
0,TH,1,30456753,TH,Th,2.0
1,TREG,4,30560927,TREG,Treg,2.0
2,T,1,30560927,,,
3,T,1,30325558,,,
4,TFH,1,30325558,TFH,Tfh,2.0
5,TH,1,30151740,TH,Th,2.0
6,THELPER,1,30151740,TH,Th,2.0
7,TREG,1,29996743,TREG,Treg,2.0
8,MAIT,3,29470597,MAIT,Mucosal-associated Invariant T,1.0
9,T,1,29470597,,,


## Export

In [208]:
dfr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11524 entries, 0 to 11523
Data columns (total 6 columns):
type        11524 non-null object
count       11524 non-null int64
id          11524 non-null int64
type_key    7045 non-null object
type_lbl    7045 non-null object
type_lvl    7045 non-null float64
dtypes: float64(1), int64(2), object(3)
memory usage: 540.3+ KB


In [209]:
path = osp.join(DATA_DIR, 'pubmed_abstract_tcell_types.csv')
dfr.to_csv(path, index=False)
path

'/Users/eczech/tmp/nlp/data/pubmed_abstract_tcell_types.csv'