In [1]:
# conda env scibert
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.plotly as py
import seaborn as sns
import torch
from plotly import graph_objs as go
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# Input locations. The defaults are the original hard-coded local paths; set
# the environment variables named below to run the notebook on another machine
# without editing this cell.
# tags_path: CSV file that is loaded with pd.read_csv.
tags_path = os.environ.get(
    'TCELL_TAGS_PATH',
    '/Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_01/tags.csv',
)
# model_path: directory handed to BertModel / BertTokenizer.from_pretrained.
model_path = os.environ.get(
    'SCIBERT_MODEL_PATH',
    '/Users/eczech/tmp/scibert/scibert_scivocab_uncased',
)

In [2]:
# Load the tag table from the CSV at `tags_path` into a DataFrame.
df = pd.read_csv(tags_path)
# Print row count, column dtypes and memory usage as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4007030 entries, 0 to 4007029
Data columns (total 10 columns):
id            object
type          object
ent_id        object
ent_lbl       object
ent_prefid    object
start_chr     int64
end_chr       int64
start_wrd     int64
end_wrd       int64
text          object
dtypes: int64(4), object(6)
memory usage: 305.7+ MB


In [3]:
# Tokenizer and encoder weights are both read from the directory `model_path`.
tokenizer = BertTokenizer.from_pretrained(model_path)
# `eval()` returns the module itself, so loading and switching to evaluation
# mode can be written as a single expression.
model = BertModel.from_pretrained(model_path).eval()

# # Tokenized input
# text = "[CLS] Th17 differentiation is induced by IL-6. [SEP]"
# tokenized_text = tokenizer.tokenize(text)
# indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# segments_ids = [1 for i in range(len(tokenized_text))]
# tokens_tensor = torch.tensor([indexed_tokens])
# segments_tensors = torch.tensor([segments_ids])

In [33]:
def get_subtoken_embeddings(model, sentence):
    """Embed each sub-token of ``sentence`` by summing the last four encoder layers.

    The text is tokenized with the notebook-level ``tokenizer`` exactly as
    given; no special tokens are added here, so callers that want ``[CLS]`` /
    ``[SEP]`` markers must write them into ``sentence`` themselves.

    Parameters
    ----------
    model : callable
        Called as ``model(tokens_tensor, segments_tensors)`` and expected to
        return a pair whose first item is a sequence of per-layer tensors that
        ``torch.stack`` can combine (assumed to be shaped
        ``(1, n_tokens, hidden)`` each -- TODO confirm for other models).
    sentence : str
        Raw text to tokenize.

    Returns
    -------
    tokenized_text : list
        Sub-tokens returned by ``tokenizer.tokenize(sentence)``.
    token_embeddings : torch.Tensor
        Sum of the last four layers with the batch axis removed, i.e. one row
        per sub-token. The token axis is kept even when the sentence produces
        a single sub-token.
    """
    tokenized_text = tokenizer.tokenize(sentence)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # NOTE(review): every token is assigned segment id 1. BERT inputs with a
    # single sentence conventionally use id 0 -- confirm this is intended
    # before changing it, since it affects the resulting embeddings.
    segments_ids = [1] * len(tokenized_text)
    # Wrap in an outer list to add the batch axis (batch size 1).
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)
    # Sum over the last four layers, then drop ONLY the batch axis. An
    # unqualified squeeze would also remove the token axis for a one-token
    # sentence and return a 1-D vector instead of a (1, hidden) matrix.
    summed_layers = torch.stack(encoded_layers)[-4:].sum(dim=0)
    token_embeddings = summed_layers.squeeze(0)
    return tokenized_text, token_embeddings

def get_word_embedding(sentence, token_embeddings, word):
    """Placeholder that is not implemented yet.

    Accepts ``sentence``, ``token_embeddings`` and ``word`` but ignores all
    three and returns ``None``.

    TODO: presumably meant to combine the rows of ``token_embeddings`` that
    belong to ``word`` into a single vector -- confirm the intended pooling
    before implementing.
    """
    return None

In [42]:
# Inspect how the tokenizer splits a long surface-marker phenotype string.
# In the output, `##` marks continuation pieces. The non-ASCII minus sign after
# "CD45RO" is not split off the way the ASCII "+" signs are, so it shows up as
# a continuation piece ('##−') fused with the pieces of the following marker.
tokenizer.tokenize('CD4+CD45RA+CD45RO−CD62L+CCR7+CD127+CD27+CD28+CD95+CD122+ T cells')

['cd',
 '##4',
 '+',
 'cd',
 '##45',
 '##ra',
 '+',
 'cd',
 '##45',
 '##ro',
 '##−',
 '##cd',
 '##62',
 '##l',
 '+',
 'ccr',
 '##7',
 '+',
 'cd',
 '##127',
 '+',
 'cd',
 '##27',
 '+',
 'cd',
 '##28',
 '+',
 'cd',
 '##95',
 '+',
 'cd',
 '##122',
 '+',
 't',
 'cells']

In [37]:
# Example sentence with the [CLS] / [SEP] markers written out by hand, because
# get_subtoken_embeddings tokenizes the text exactly as given.
text = '[CLS] Furthermore, Th2 cells involved in allergic airway disease models express CCR4, and CCR4+ T cells from asthmatic patients are a predominant source of Th2 cytokines [SEP]'
# Call the helper by the name it is defined under (`get_subtoken_embeddings`);
# no `get_token_embeddings` is defined in the cells above, so the old call
# raised NameError on a fresh kernel.
tokenized_text, token_embeddings = get_subtoken_embeddings(model, text)

In [38]:
# Display the sub-tokens produced for the example sentence.
tokenized_text

['[CLS]',
 'furthermore',
 ',',
 'th',
 '##2',
 'cells',
 'involved',
 'in',
 'allergic',
 'airway',
 'disease',
 'models',
 'express',
 'ccr',
 '##4',
 ',',
 'and',
 'ccr',
 '##4',
 '+',
 't',
 'cells',
 'from',
 'asthma',
 '##tic',
 'patients',
 'are',
 'a',
 'predominant',
 'source',
 'of',
 'th',
 '##2',
 'cytokines',
 '[SEP]']

In [4]:
# Manual walk-through of the forward pass on a short example sentence.
# The inputs are built inside this cell so that it no longer depends on
# `tokens_tensor` / `segments_tensors` left over in the kernel from code that
# has since been commented out.
example_text = "[CLS] Th17 differentiation is induced by IL-6. [SEP]"
example_tokens = tokenizer.tokenize(example_text)
tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(example_tokens)])
segments_tensors = torch.tensor([[1] * len(example_tokens)])

# Predict hidden states features for each layer
# dimensions of `encoded_layers`
# The layer number (12 layers)
# The batch number (1 sentence)
# The word / token number (one per entry of `example_tokens`)
# The hidden unit / feature number (768 features)
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)

In [31]:
# Sum the last four encoder layers, then drop only the leading batch axis so
# that a single-token input would still keep its token axis (an unqualified
# squeeze would collapse it as well).
token_embeddings = torch.stack(encoded_layers)[-4:].sum(dim=0).squeeze(0)
token_embeddings.shape

torch.Size([12, 768])