In [180]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import word2vec
import re
import plotly
import plotly.plotly as py
from plotly import graph_objs as go

In [None]:
# Initialise plotly's offline notebook mode; the scatter plot near the end of this
# notebook is displayed with plotly.offline.iplot.
plotly.offline.init_notebook_mode()

## Load Cell Type Tags

These tags combine white-listed T cell types with noun phrases tagged by a JNLPA-trained NER model, extracted from a 10k-document PMC corpus.

In [311]:
# Read the entity tag table for corpus_01 (presumably one row per tagged text span,
# judging by the start/end offset and `text` columns listed by df.info()).
# NOTE(review): absolute, user-specific path -- the notebook will not run elsewhere as-is.
df = pd.read_csv('/Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_01/tags.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4007030 entries, 0 to 4007029
Data columns (total 10 columns):
id            object
type          object
ent_id        object
ent_lbl       object
ent_prefid    object
start_chr     int64
end_chr       int64
start_wrd     int64
end_wrd       int64
text          object
dtypes: int64(4), object(6)
memory usage: 305.7+ MB


In [312]:
# Matches "Th1", "TH17", "Th-2", ...: a capital T, then h/H, an optional hyphen and 1-2 digits.
# Raw string so that `\d` is passed to the regex engine rather than being treated as a
# (deprecated) invalid string escape.
pattern = re.compile(r'[T][hH][-]?\d{1,2}')

def accept(r):
    """Return True if the row's ``text`` looks like a T cell phrase.

    A phrase is accepted when it contains one of a fixed set of case-sensitive
    marker substrings (e.g. "T cell", "T-helper") or matches the module-level
    ``pattern`` (Th1 / TH17 / Th-2 style names).

    Parameters
    ----------
    r : mapping or pandas.Series
        Row-like object with a ``text`` entry.

    Returns
    -------
    bool
        True if the text is accepted; False otherwise, including when the
        text is not a string (e.g. NaN for a missing value).
    """
    t = r['text']
    # Missing values are floats (NaN); they cannot match and would otherwise raise on `in`
    if not isinstance(t, str):
        return False
    markers = ('T cell', 'T-cell', 'T lymphocyte', 'T-lymphocyte', 'TH-', ' T ', 'T helper', 'T-helper')
    return any(m in t for m in markers) or pattern.search(t) is not None

# Keep only cell type tags whose text looks like a T cell phrase (see `accept`).
# NOTE: 'CELL_TYPE' is a substring match, so rows typed IMMUNE_CELL_TYPE (white-listed
# cell types) are kept alongside CELL_TYPE rows (NER model); the two sources are then
# relabelled WHITE-LIST and JNLPA-NER respectively.
# NOTE(review): this reassigns `df`, and the relabelled types no longer contain
# 'CELL_TYPE', so re-running this cell without re-running the load cell first will
# not reproduce the same frame.
df = (
    df 
    .pipe(lambda df: df[df['type'].str.contains('CELL_TYPE')])
    .pipe(lambda df: df[df.apply(accept, axis=1)])
    .assign(type=lambda df: df['type'].map(dict(CELL_TYPE='JNLPA-NER', IMMUNE_CELL_TYPE='WHITE-LIST')))
)

df.head()

Unnamed: 0,id,type,ent_id,ent_lbl,ent_prefid,start_chr,end_chr,start_wrd,end_wrd,text
10,PMC5704053,JNLPA-NER,,,,511,529,94,99,naïve CD4+ T cells
43,PMC5704053,JNLPA-NER,,,,2761,2773,488,492,CD8+ T cells
45,PMC5704053,JNLPA-NER,,,,2893,2905,512,516,CD8+ T cells
73,PMC5704053,JNLPA-NER,,,,4346,4370,791,799,naïve CD4+ CD25− T cells
160,PMC5704053,JNLPA-NER,,,,10311,10329,1956,1961,CD4+ CD25− T cells


In [341]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 358790 entries, 10 to 4007021
Data columns (total 10 columns):
id            358790 non-null object
type          358790 non-null object
ent_id        124519 non-null object
ent_lbl       124519 non-null object
ent_prefid    124519 non-null object
start_chr     358790 non-null int64
end_chr       358790 non-null int64
start_wrd     358790 non-null int64
end_wrd       358790 non-null int64
text          358790 non-null object
dtypes: int64(4), object(6)
memory usage: 30.1+ MB


In [313]:
# Breakdown by source
df['type'].value_counts()

JNLPA-NER     234271
WHITE-LIST    124519
Name: type, dtype: int64

In [342]:
# Num unique cell type noun phrases
df['text'].nunique()

30728

In [316]:
# Sort cell type phrases by frequency and show horizontal concatenation by rank slice
# (50 items per slice, starting at various positions within ranked list)
def ct(rng, grp):
    """Return one rank slice of the phrase frequency table.

    Parameters
    ----------
    rng : tuple of int
        ``(start, stop)`` positions passed to ``slice`` over the phrases of the
        notebook-level ``df`` ranked by descending frequency (0-based, stop exclusive).
    grp : str
        Label used as the top level of the returned frame's column MultiIndex.

    Returns
    -------
    pandas.DataFrame
        Columns ``(grp, 'text')`` and ``(grp, 'count')`` with a fresh 0..n-1 index,
        so that several slices can be concatenated side by side.
    """
    counts = (
        df['text'].value_counts().rename('count').rename_axis('text', axis=0)
        .sort_values(ascending=False).iloc[slice(*rng)].reset_index()
    )
    # Assign the two-level header directly instead of set_axis(..., inplace=False):
    # the `inplace` argument of set_axis was removed in pandas 2.0.
    counts.columns = pd.MultiIndex.from_tuples([(grp, c) for c in counts.columns])
    return counts

# Each slice covers ranks i .. i+49 (positions i-1 .. i+48), so the label is inclusive
pd.concat([ct((i - 1, i + 49), '#%s-%s' % (i, i + 49)) for i in [1, 500, 1000, 5000]], axis=1)

Unnamed: 0_level_0,#1-51,#1-51,#500-550,#500-550,#1000-1050,#1000-1050,#5000-5050,#5000-5050
Unnamed: 0_level_1,text,count,text,count,text,count,text,count
0,T cells,45886,CD4+ and CD8+ T-lymphocytes,30,contaminating T cells,15,circulating Aβ-reactive T cells,2
1,Th17,41559,freshly isolated T cells,30,IL-17A-producing CD4+ T cells,15,young CD8+ T cells,2
2,Th1,32949,primary CD8+ T cells,30,anti-tumor CD8+ T cell,15,cytokine-producing CART cells,2
3,Th2,27526,CD56+ T cells,30,live T cells,15,mucosal T-cells,2
4,CD4+ T cells,21958,activated CD4+ T-cells,30,CD4+ to CD8+ T cells,15,encephalitogenic CD4+ T-cells,2
5,CD8+ T cells,13059,CD4+ T cell types,30,CD3-positive T cells,15,immune response T cells,2
6,CD4 T cells,4985,Naive CD8+ T cells,30,human CD8+ T lymphocytes,15,B-cell and T-cell subsets,2
7,T lymphocytes,4964,circulating T-cells,30,Thy1.1+OT-1+CD8+ T cells,14,proliferating T cell populations,2
8,TH17,4212,activated T-lymphocytes,30,TGF-β1−/− T cells,14,autoreactive immature T lymphocytes,2
9,T helper,4209,CMV-specific T-cells,30,gated CD8+ T cells,14,CD8+ or CD4+ T lymphocytes,2


In [345]:
# Sanity check: the Th/Tc/Tr pattern used for the NER comparison below should match "T-helper 17"
# NOTE(review): ' lympo' looks like a typo for ' lympho' (as in "lymphocyte") -- confirm
re.search(r'^(?:[tT][hHcCr]|[tT][- ][hH]elper )\d{1,2}(?: cell| lympo)?', 'T-helper 17')

<_sre.SRE_Match object; span=(0, 11), match='T-helper 17'>

In [351]:
# Compare the most common white-listed cell types to those from the NER tagger,
# looking specifically for the very common "Th\d{1,2}" pattern or minor variants
# NOTE: The "Th\d{1,2}" pattern is very frequently missed by the NER model

def get_cts(typ, pattern):
    """Return the 50 most frequent phrases of one tag source that match a regex.

    Parameters
    ----------
    typ : str
        Value of the ``type`` column to select from the notebook-level ``df``.
    pattern : str
        Regular expression handed to ``Series.str.contains`` on the ``text`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``'<typ>:text'`` and ``'<typ>:count'``, sorted by ascending count
        and limited to the last (most frequent) 50 rows.
    """
    of_type = df[df['type'] == typ]
    phrases = of_type.loc[of_type['text'].str.contains(pattern), 'text']
    counts = (
        phrases.value_counts()
        .rename('count')
        .rename_axis('text', axis=0)
        .sort_values()
        .tail(50)
        .reset_index()
    )
    return counts.add_prefix(typ + ':')
pd.concat([
    # Match to TH17 | Tr1 | Tc0 | T helper 17 etc.
    get_cts('JNLPA-NER', r'^(?:[tT][hHcCr]|[tT][- ][hH]elper )\d{1,2}(?: cell| lympo)?'), 
    get_cts('WHITE-LIST', '.*')
], axis=1)

Unnamed: 0,JNLPA-NER:text,JNLPA-NER:count,WHITE-LIST:text,WHITE-LIST:count
0,Th1/Th17 cell,3,T helper-17,7
1,Th1/Th17 subsets,3,T helper1,7
2,Th1/Th17 lymphocytes,4,TH3,7
3,TH1/TH2 cells,4,peripheral T regulatory,7
4,Th1Th17/Th1* cells,4,inducible T regulatory,8
5,Th17.1 cells,4,T helper 22,8
6,Th17/Th1 hybrid cells,4,T helper-2,9
7,Th1/Th2/Th17 cells,4,Treg/Th1,9
8,Th1/Th17 effector cells,4,T helper 9,10
9,Th17/Treg cell populations,4,T helper-1,11


In [340]:
# Look for cell types with long strings of CD* designations
# (three or more consecutive "CD" + 1-3 digits markers, each followed by '+' or '-')
# NOTE(review): only the ASCII hyphen is matched; some phrases in this corpus use the
# Unicode minus sign (e.g. "CD25−"), which ends a run -- confirm whether that is intended
df[df['text'].str.contains(r'(?:CD\d{1,3}[\+-]){3,}')]['text'].value_counts().sort_values().tail(50)

CD3+CD4+CD25+Foxp3+ T cell                                             1
Tumor environment CD4+CD25-CD69+Foxp3-LAP+ T cells                     1
CD3+CD4-CD8- T cell                                                    1
CD4+CD25+CD44+ T cells                                                 1
CD45+CD3+CD4+ helper T-cells                                           1
CD3+CD4+CD25+Forp3+ T cells                                            1
CD4+CD25-CD127+ effector T-cells                                       1
CD45+CD3+CD8+ T cells                                                  1
CD3+CD4-CD8- double-negative (DN) T cells                              1
CD4+CD45RA+CD45RO−CD62L+CCR7+CD127+CD27+CD28+CD95+CD122+ T cells       1
CD4+CD25+CD127+ T effector cells                                       1
CD45+CD3+CD4+ T cells                                                  1
CD28-CD57-CD8+ T-cells                                                 1
CD4+CD25+CD73+ T cells                             

## Load Embeddings

In [None]:
# See: https://github.com/danielfrg/word2vec
model = word2vec.load('/Users/eczech/Downloads/PubMed-and-PMC-w2v.bin')



In [352]:
# Number of words in vocab:
len(model.vocab)

4087446

In [179]:
# Check to see if multi-protein modifiers are present in vocab
words = pd.Series(model.vocab)
# Raw string so that `\+` reaches the regex engine as an escaped literal plus sign
# instead of being treated as a (deprecated) invalid string escape
words[words.str.contains(r'CD4\+')]

2312                                          CD4+
18996                                    CD4+CD25+
36648                                    CD4+CD25-
46619                                     CD4+CD8+
50250                                    CD4+/CD8+
54278                                        CD4+T
66209                              CD4+CD25+Foxp3+
71137                                     CD3+CD4+
77722                                     CD4+CD8-
77855                                   CD4+Foxp3+
84450                                 CD4+CD25high
90325                                    CD25+CD4+
98198                              CD4+CD25+FoxP3+
102586                                      CD4+8+
126543                                      CD4+8-
128745                                 CD4+CD45RO+
136560                                 CD4+CD45RA+
136968                                CD4+CD28null
140567                                  CD4+FoxP3+
146239                         

## Clustering

In [356]:
# Pull list of cell type phrases by frequency (only the most frequent phrases will be analyzed)
# Result: one row per distinct phrase with columns `text` and `count`, most frequent first
cts = df.groupby('text').size().sort_values(ascending=False).rename('count').reset_index()
cts.head()

Unnamed: 0,text,count
0,T cells,45886
1,Th17,41559
2,Th1,32949
3,Th2,27526
4,CD4+ T cells,21958


In [410]:
def get_vecs(df, model, use_spacy=False, progress=True):
    """Build one summed word-embedding vector per phrase.

    Each row's ``text`` is split into tokens, tokens that are not in
    ``model.vocab`` are dropped, and the embedding vectors of the remaining
    tokens are summed. Rows with no in-vocabulary token are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``text`` column. All column values of a retained row are
        copied into the index of the result.
    model : object
        Must expose ``vocab`` (sequence of words) and ``vectors`` (indexable by
        the position of a word in ``vocab``).
    use_spacy : bool, default False
        Tokenize with the spaCy ``en_core_sci_md`` tokenizer instead of
        whitespace splitting.
    progress : bool, default True
        Show a tqdm progress bar while iterating. tqdm is only imported when
        this is True.

    Returns
    -------
    pandas.DataFrame
        One row per retained phrase and one column per embedding dimension.
        The index is a MultiIndex whose first level ``label`` is the string
        form of the in-vocabulary token list, followed by one level per column
        of ``df`` (``text`` and ``count`` for the frame used in this notebook).
        The frame is empty, with the same index levels, when nothing matches.
    """
    if use_spacy:
        import spacy
        nlp = spacy.load('en_core_sci_md')

        def tokenize(text):
            return [str(t) for t in nlp.tokenizer(text)]
    else:
        def tokenize(text):
            return text.split()

    # Map vocab word -> position of its vector in model.vectors
    lkp = {word: pos for pos, word in enumerate(model.vocab)}

    rows = df.iterrows()
    if progress:
        # Imported here because the notebook never imports tqdm at the top level;
        # tqdm.auto picks the notebook widget when one is available
        from tqdm.auto import tqdm
        rows = tqdm(rows, total=len(df))

    mtxt, mvec = [], []
    for _, r in rows:
        # Keep only the tokens that have an embedding vector
        present = [w for w in tokenize(r['text']) if w in lkp]

        # Ignore cell types having no tokens with embedding vectors
        if not present:
            continue

        # Sum embeddings across present tokens. The label is built from plain
        # Python strings so it reads "['T', 'cells']" regardless of NumPy version.
        mtxt.append((str(present),) + tuple(r.values))
        mvec.append(np.stack([model.vectors[lkp[w]] for w in present]).sum(axis=0))

    # Passing names up front lets from_tuples build an empty index when no row was kept
    names = ['label'] + list(df.columns)
    return pd.DataFrame(mvec, index=pd.MultiIndex.from_tuples(mtxt, names=names))

dfv = get_vecs(cts, model, use_spacy=False)
#dfv = get_vecs(cts, model, use_spacy=True)

HBox(children=(IntProgress(value=0, max=30728), HTML(value='')))

In [411]:
dfv.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
label,text,count,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
"['T', 'cells']",T cells,45886,0.046443,-0.034749,0.092937,0.029665,0.149286,0.250088,-0.059849,0.199854,-0.15388,-0.06005,...,-0.312609,-0.006381,0.022755,0.311247,0.055689,0.199118,-0.17019,0.194637,0.117738,-0.028728
['Th17'],Th17,41559,-0.045065,0.091974,-0.057747,0.006647,0.047733,0.160625,-0.024835,0.052163,-0.105156,-0.054238,...,-0.054173,-0.057372,0.037496,0.106478,0.063646,0.040446,0.001464,0.100441,-0.030139,0.087788
['Th1'],Th1,32949,0.017503,0.097283,-0.099464,0.020219,0.023538,0.16844,-0.023904,0.108156,-0.061502,-0.017315,...,-0.023351,-0.059457,0.026301,0.050174,0.059468,-0.01995,-0.084066,0.107506,-0.032287,0.049623


In [412]:
import umap
from numba import NumbaPerformanceWarning
import warnings

# Project the summed phrase embeddings to 2-D with UMAP (cosine metric, fixed random_state).
# NumbaPerformanceWarning is silenced for the duration of the fit.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=NumbaPerformanceWarning)
    decomp = umap.UMAP(n_components=2, metric='cosine', n_neighbors=15, random_state=1)
    # Only the first 10,000 rows of dfv are embedded (dfv is built from the
    # frequency-sorted `cts`, so presumably these are the most frequent phrases)
    X = dfv.head(10000)
    Y = decomp.fit_transform(X.values)

In [429]:
from scipy.stats import gaussian_kde
# Marker size: log10 of the phrase count, rescaled so the maximum is 20 and clipped to [6, 20]
size = np.log10(X.index.get_level_values('count')) 
size = np.clip(20 * (size / size.max()), 6, 20)
# Marker colour: Gaussian kernel density estimate of the 2-D projection, evaluated at each point
kde = gaussian_kde(Y.T)(Y.T)
trace = go.Scatter(
    x = Y[:, 0],
    y = Y[:, 1],
    # Hover text; index tuples are (label, text, count), newlines are stripped from the phrase
    text = [
        'Text: {}<br>Vocab: {}<br>N: {}'.format(i[1].replace('\n', ''), i[0], i[2]) 
        for i in X.index
    ],
    marker = dict(size=size, line=dict(width=0), color=kde, colorscale='Portland'),
    mode = 'markers'
)
layout = go.Layout(hovermode='closest')
fig = go.Figure(data=[trace], layout=layout)
plotly.offline.iplot(fig) 
# When ready: 
# plotly.tools.set_credentials_file(username='eczech', api_key='########')
# py.iplot(fig, filename='word2vec_umap_v2')

### Notable T Cell Noun Phrases

- myelin-reactive encephalitogenic CD4+ (T helper [Th]) cells
- circulating autoreactive T cells
- unprimed rat T-cell
- OVA-specific CD4+ and CD8+ congenically labeled Thy1.1+ T cells
- hepatic and splenic CD69+CD8+ T cells 
- double-positive (DP) T cells
- patient-derived T-cells
- naïve CD8+ T (CD8+CD45RA+CCR7highCD45RO−CD56-CD57−) cells
- HBV-specific CD8 T cells
- DO11.10 T cells
- dividing T cells
- CD4+2D2-IEL-THIGH cells
- polyclonal T cells
- quiescent T cells 
- Antigen-specific T cells
- infiltrating T-cells
- normal CD4+ T cells
- Primary T cells
- Hector T cells  (Hector is a kind of mouse line)
- purified CD4+ T cells 

# Graveyard

In [None]:
# def get_vectors(model, words):
#     lkp = pd.Series(np.arange(len(model.vocab)), index=model.vocab)
#     idx = lkp.loc[words]
#     if idx.isnull().any():
#         raise ValueError('Terms "{}" not found'.format(idx[idx.isnull()].index.values))
#     return model.vectors[idx.values]
#
# def get_augmented_vectors(model, words, combos):
#     vec = get_vectors(model, words)
#     aug = []
#     for c in combos:
#         idx = np.where(np.isin(model.vocab, c))[0]
#         if len(idx) != 2:
#             raise ValueError('Combination "{}" not found'.format(c))
#         aug.append(model.vectors[idx].sum(axis=0))
#     vec = np.concatenate((vec, np.stack(aug, axis=0)))
#     return vec
#
# twd = [
#     'CD4', 'CD4-', 'CD4+', 'CD8', 'CD8-', 'CD8+', 'CD3', 'CD3-', 'CD3+', 'CD25', 'CD25+', 'CD25-',
#     'CD19', 'CD19+', 'CD20', 'CD20+', 'CD27', 'CD27+', 'CD38', 'CD38+', 'CD24', 'CD24+', 'CD10', 'CD10+',
#     'unactivated', 'HIV-specific', 'TIM-1+', 'activated', 'T-bet+', 'CD4+CD28-',
#     'CD3+CD4+', 'CD4+CD8-', 'CD4+CD25-'
# ]
# tcb = [('CD3+', 'CD4+'), ('CD4+', 'CD8-'), ('CD4+', 'CD25-')]
# #tvc = get_vectors(model, twd)
# tvc = get_augmented_vectors(model, twd, tcb)
# tvc.shape, len(twd), len(tcb)

# from sklearn.metrics.pairwise import cosine_similarity
# cols = twd + ['{} + {}'.format(*c) for c in tcb]
# sns.clustermap(pd.DataFrame(cosine_similarity(tvc), index=cols, columns=cols), cmap='Spectral')