In [1]:
import os
import os.path as osp
import pandas as pd
import numpy as np
from snorkel import SnorkelSession
from tcre.env import *
from tcre.supervision import *
from tcre.modeling import utils
%matplotlib inline
# Create the Snorkel session object; the candidate query in the next cell goes through it.
session = SnorkelSession()
# NOTE(review): get_candidate_classes is not imported by name -- presumably it comes
# from one of the star imports above (tcre.env / tcre.supervision); confirm.
classes = get_candidate_classes()

In [2]:
from snorkel.models import Candidate, GoldLabel
# Select the relation type under study, then pull every candidate of that
# type that belongs to the dev split.
candidate_class = classes.inducing_cytokine
cands = (
    session
    .query(candidate_class.subclass)
    .filter(candidate_class.subclass.split == SPLIT_DEV)
    .all()
)
# Cell output: number of dev candidates retrieved.
len(cands)

673

In [3]:
# Allow up to 250 characters per cell so the marked-up sentences are readable.
pd.set_option('display.max_colwidth', 250)
# Build one row per candidate; the four markers wrap the two entity spans in the text.
df = utils.get_candidate_df(
    cands,
    entity_markers=['[[', ']]', '<<', '>>'],
)
df.head()

Unnamed: 0,cid,label,text
0,30167,1,"[[ IL-12 ]] induces not only Ifng expression1 but also T - bet , which promotes the survival and proliferation of differentiating << TH1 >> cells ."
1,30168,0,"IL-12 induces not only [[ Ifng ]] expression1 but also T - bet , which promotes the survival and proliferation of differentiating << TH1 >> cells ."
2,30169,1,"In mice , [[ TGFβ ]] together with IL6 can activate antigen - responsive naïve CD4 + T cells to develop into << Th17 >> cells [ 39 ] ."
3,30170,1,"In mice , TGFβ together with [[ IL6 ]] can activate antigen - responsive naïve CD4 + T cells to develop into << Th17 >> cells [ 39 ] ."
4,30171,0,"Several findings suggest that during the initiation of a << Th1 >> response , [[ IL-12 ]] is produced particularly by macrophages in response to certain microbial antigens , while NK cells are the main source of IFN-γ in response to IL-12 ( 7 , 1..."


In [39]:
# Persist only the text/label columns (in that order, no index) for the torchtext experiments below.
df.to_csv('/tmp/data.csv', columns=['text', 'label'], index=False)

In [41]:
from torchtext import data as txd
# Field describing the sentence text column; lower=True requests lowercasing.
# NOTE(review): the word2vec vocabulary attached to this field later is looked up
# with the mixed-case token 'UNK' -- confirm lowercasing is compatible with it.
TEXT = txd.Field(lower=True)
# Field describing the label column; sequential=False declares it a single value
# rather than a token sequence.
LABEL = txd.Field(sequential=False)

In [42]:
from torchtext import vocab

In [45]:
# Display the word2vec model path constant. It is not defined in this notebook;
# presumably it is provided by one of the tcre star imports -- confirm.
W2V_MODEL_01

'/lab/data/word2vec/PubMed-and-PMC-w2v.bin'

In [44]:
from torchtext.vocab import Vectors
# Load the pre-trained vectors through torchtext, with /lab/data/word2vec as the cache directory.
# NOTE(review): `vectors` is not referenced again in the visible cells.
vectors = Vectors(name=W2V_MODEL_01, cache='/lab/data/word2vec')

In [57]:
from collections import Counter

In [58]:
# Number of distinct tokens in the word2vec vocabulary (repeated entries collapse).
len(set(featurizer.model.vocab))

4087443

In [59]:
# Raw vocabulary length; unlike a distinct count, this includes any repeated tokens.
len(featurizer.model.vocab)

4087446

In [61]:
# Look up the token stored at index 1 of the word2vec model.
featurizer.model.word(1)

'the'

In [62]:
# Inverse lookup: index of the token 'the' in the word2vec model.
featurizer.model.ix('the')

1

In [73]:
# Peek at the first three vocabulary entries to see how the index space starts.
featurizer.model.vocab[:3]

array(['</s>', 'the', ','], dtype='<U78')

In [110]:
import torch
from torchtext.vocab import Vocab
from collections import defaultdict, Counter
    
class W2VVocab(Vocab):
    """torchtext ``Vocab`` whose index space mirrors a pre-trained word2vec model.

    Layout of ``itos`` / ``vectors``: the special tokens come first (all-zero
    embedding rows), followed by the model's vocabulary in its original order.
    Every model index is therefore shifted by ``len(specials)``.

    Args:
        model: word2vec model exposing ``vocab`` (sequence of tokens),
            ``vectors`` (2-D array, one row per token) and ``ix(word)``
            (token -> row index in the model).
        specials: tokens to prepend ahead of the model vocabulary.
        unk_init: accepted for signature compatibility; currently unused.

    Notes:
        Tokens missing from ``stoi`` resolve to the row of the model's own
        ``'UNK'`` token, so the model is assumed to contain that token
        (``model.ix('UNK')`` is called unconditionally).
        If ``model.vocab`` contains repeated tokens, the last occurrence wins
        in ``stoi``.
    """

    def __init__(self, model, specials=['<pad>'], unk_init=np.zeros):
        super().__init__(Counter())
        # Copy so the shared default list is never aliased, and so any
        # iterable of specials is accepted.
        specials = list(specials)
        n_specials = len(specials)
        self.itos = specials + list(model.vocab)

        # `model.ix` answers in the model's index space; shift it by the number
        # of prepended specials so it points at the 'UNK' row of `itos`/`vectors`
        # (the unshifted value would select a different token's embedding).
        unk_index = n_specials + model.ix('UNK')

        def get_unk_index():
            return unk_index

        self.stoi = defaultdict(get_unk_index)
        self.stoi.update({w: i for i, w in enumerate(self.itos)})

        # Zero rows for the specials, then the pre-trained vectors unchanged.
        self.vectors = torch.cat([
            torch.FloatTensor(np.zeros((n_specials, model.vectors.shape[1]))),
            torch.FloatTensor(model.vectors)
        ], dim=0)

In [111]:
# Attach the word2vec-backed vocabulary to the TEXT field.
# NOTE(review): `featurizer` is only assigned in a later cell
# (features.get_spacy_w2v_featurizer()), so this cell fails on a fresh
# top-to-bottom run.
TEXT.vocab = W2VVocab(featurizer.model)

In [None]:
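# Sketch (not executed): how a model would initialise its embedding layer from
# the pre-trained vectors; emb_dim must match the width of vocab.vectors.
# vocab = TEXT.vocab
# self.embed = nn.Embedding(len(vocab), emb_dim)
# self.embed.weight.data.copy_(vocab.vectors)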

In [None]:
# Load the CSV exported earlier in this notebook (header row, columns: text, label)
# as a torchtext dataset. The fields are listed in column order and reuse the
# TEXT/LABEL definitions from above instead of fresh, unconfigured Field objects.
pos = txd.TabularDataset(
    path='/tmp/data.csv',
    format='csv',
    skip_header=True,
    fields=[('text', TEXT),
            ('label', LABEL)])

In [4]:
from tcre.modeling import features

In [8]:
# Reload tcre.modeling.features so edits to features.py take effect without a kernel restart.
# NOTE(review): `imp` is deprecated in favour of `importlib` (and removed in
# Python 3.12); left as-is here because a later cell also calls `imp.reload`.
import imp
imp.reload(features)

<module 'tcre.modeling.features' from '/lab/repos/t-cell-relation-extraction/src/tcre/modeling/features.py'>

In [9]:
# Build the featurizer used throughout the notebook: other cells read its
# `.model` (word2vec vocabulary/vectors) and call its `.indices(text)` method.
featurizer = features.get_spacy_w2v_featurizer()

In [10]:
def get_training_data(df, featurizer):
    """Convert a candidate frame into index sequences and binary labels.

    Args:
        df: DataFrame with a ``text`` column (sentence strings) and a ``label``
            column whose values must be 0 or 1.
        featurizer: object exposing ``indices(text)`` that returns an
            ``(indices, tokens)`` pair; only ``indices`` is kept.

    Returns:
        ``(X, Y)`` numpy arrays aligned row-for-row with ``df``. ``X`` is a
        regular 2-D array when every sequence has the same length; otherwise it
        is a 1-D object array holding one index sequence per row.

    Raises:
        AssertionError: if a label is neither 0 nor 1.
    """
    X_train, Y_train = [], []
    # Iterate the two columns directly: avoids building a Series per row and
    # the dtype upcasting that iterrows can introduce.
    for text, label in zip(df['text'], df['label']):
        # Validate before featurizing/appending so a bad row is rejected up front.
        assert label in [0, 1], 'Unexpected label: {!r}'.format(label)
        indices, _ = featurizer.indices(text)
        X_train.append(indices)
        Y_train.append(label)

    if len({len(seq) for seq in X_train}) > 1:
        # Sentences of different lengths: np.array() on a ragged list raises
        # ValueError on NumPy >= 1.24, so fill an object array explicitly.
        X = np.empty(len(X_train), dtype=object)
        for i, seq in enumerate(X_train):
            X[i] = seq
    else:
        X = np.array(X_train)
    return X, np.array(Y_train)

In [11]:
from sklearn.model_selection import train_test_split
# Featurize every candidate, then make a stratified 70/30 split over row indices
# so X and Y stay aligned.
X, Y = get_training_data(df, featurizer)
# Fixed random_state: without it the split (and every number derived from it)
# changes on each kernel restart.
idx_train, idx_test = train_test_split(
    np.arange(len(X)), test_size=.3, stratify=Y, random_state=1)
X_train, Y_train = X[idx_train], Y[idx_train]
X_test, Y_test = X[idx_test], Y[idx_test]

In [31]:
# Reload tcre.modeling.models so edits to models.py take effect without a kernel
# restart. The module is imported here because no earlier cell binds `models`,
# and importlib replaces the deprecated `imp` module.
import importlib
from tcre.modeling import models
importlib.reload(models)

<module 'tcre.modeling.models' from '/lab/repos/t-cell-relation-extraction/src/tcre/modeling/models.py'>

In [36]:
from tcre.modeling import models
# Earlier, smaller configuration kept for reference together with the note from that run:
#lstm = models.W2VLSTM(featurizer).build(hidden_dim=50, lr=.01, dropout=0) # min loss ~.11 at 50 epochs
# Current configuration: same learning rate and dropout, hidden size raised to 250.
lstm = models.W2VLSTM(featurizer).build(hidden_dim=250, lr=.01, dropout=0)

In [37]:
# Fit on the training split only (25 epochs, batches of 32, fixed seed).
# NOTE(review): dev_ckpt=False presumably disables dev-set checkpointing --
# confirm against the train() implementation.
lstm.train(X_train, Y_train, n_epochs=25, batch_size=32, seed=1, dev_ckpt=False)

[W2VLSTM] Training model
[W2VLSTM] n_train=471  #epochs=25  batch size=32
[W2VLSTM] Epoch 1 (4.56s)	Average loss=0.648659
[W2VLSTM] Epoch 2 (9.27s)	Average loss=0.492220
[W2VLSTM] Epoch 3 (13.71s)	Average loss=0.482761
[W2VLSTM] Epoch 4 (18.41s)	Average loss=0.460722
[W2VLSTM] Epoch 5 (22.99s)	Average loss=0.441662
[W2VLSTM] Epoch 6 (27.72s)	Average loss=0.433644
[W2VLSTM] Epoch 7 (32.41s)	Average loss=0.435663
[W2VLSTM] Epoch 8 (37.00s)	Average loss=0.543179
[W2VLSTM] Epoch 9 (41.68s)	Average loss=0.484761
[W2VLSTM] Epoch 10 (46.18s)	Average loss=0.495525
[W2VLSTM] Epoch 11 (50.72s)	Average loss=0.469161
[W2VLSTM] Epoch 12 (55.38s)	Average loss=0.451379
[W2VLSTM] Epoch 13 (60.04s)	Average loss=0.436876
[W2VLSTM] Epoch 14 (64.75s)	Average loss=0.412195
[W2VLSTM] Epoch 15 (69.38s)	Average loss=0.404069
[W2VLSTM] Epoch 16 (73.91s)	Average loss=0.403772
[W2VLSTM] Epoch 17 (78.61s)	Average loss=0.397979
[W2VLSTM] Epoch 18 (83.22s)	Average loss=0.393517
[W2VLSTM] Epoch 19 (87.89s)	Average l