In [1]:
#!git clone https://github.com/jforjohn/drug_embeddings
!pip install seqeval
!pip install git+https://www.github.com/keras-team/keras-contrib.git

Collecting seqeval
  Downloading https://files.pythonhosted.org/packages/55/dd/3bf1c646c310daabae47fceb84ea9ab66df7f518a31a89955290d82b8100/seqeval-0.0.10-py3-none-any.whl
Installing collected packages: seqeval
Successfully installed seqeval-0.0.10
Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-nvfzmhhw
  Running command git clone -q https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-nvfzmhhw
Building wheels for collected packages: keras-contrib
  Building wheel for keras-contrib (setup.py) ... [?25l[?25hdone
  Stored in directory: /tmp/pip-ephem-wheel-cache-wmj7w1gi/wheels/11/27/c8/4ed56de7b55f4f61244e2dc6ef3cdbaff2692527a2ce6502ba
Successfully built keras-contrib
Installing collected packages: keras-contrib
Successfully installed keras-contrib-2.0.8


In [136]:
!python /content/gdrive/My\ Drive/drug_embeddings/src/MainLauncher.py

Mounted at /content/gdrive


In [13]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
from os import chdir; chdir('/content/gdrive/My Drive/drug_embeddings/src')
import sys; sys.path.append('../src')

Mounted at /content/gdrive


In [14]:

from input_output.parser import Parser
from input_output.writer import Writer
from input_output.load_config import load_config_file
from preprocessing.tokenizer import tokenize
from preprocessing.transformations import removeEmptyRows
from preprocessing.transformations import CRF_get_tag
from structs import DrugEntity
from models.dl import architecture
from models.dl import Metrics
from models.dl import embedding_weights

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.optimizers import RMSprop, Adam
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras_contrib import metrics, losses
from keras.models import load_model
from keras.utils import plot_model
from seqeval.metrics import f1_score, classification_report
import tensorflow as tf
#tf.reset_default_graph()

from time import time
import pandas as pd
import numpy as np
from os import path
from os import mkdir
import keras
print(keras.__version__)

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 1000)

def preprocess_steps(base_folder):
  df = Parser('../'+ base_folder).call()
  #df['sentence'] = df['sentence'].apply(sentClean)#.apply(lambda x: tokens2sent(x, removals=False))
  df['tokens'] = df['sentence'].apply(tokenize)
  df['crf_tags'] = df[['tokens', 'parsed_drugs']].apply(CRF_get_tag, axis=1)
  return df
    
config = load_config_file('config', './')
config_data = config['data']
config_preprocess = config['preprocessing']
config_arch = config['arch']
config_training = config['training']

output_dir = config_data['output_dir']
pretrained_emb_dir = config_data['pretrained_emb_dir']
try:
    # Create target Directory
    mkdir(output_dir)
except FileExistsError:
    print("Directory " , output_dir ,  " already exists")
    
train_base_folder = config_data.get('train_dir')
test_base_folder = config_data.get('test_dir')

df_train = preprocess_steps(train_base_folder)
df_test = preprocess_steps(test_base_folder)

emb_dim = config_preprocess['EMB_DIM']
max_len = df_train['tokens'].apply(len).max()
  
words = df_train['tokens'].apply(
    lambda el_lst: pd.Series([el['text'] for el in el_lst])).stack().unique().tolist()
words.append("ENDPAD")

word2idx = {w: i + 1 for i, w in enumerate(words)}

tags = df_train['crf_tags'].apply(lambda el_lst: pd.Series(el_lst)).stack().unique()
tag2idx = {t: i for i, t in enumerate(tags)}
idx2tag = dict(map(reversed, tag2idx.items()))

n_words = len(words)
n_tags = len(tags)

# Train
X_train = [[word2idx[w['text']] for w in s] for s in df_train['tokens']]
X_train = pad_sequences(maxlen=max_len, sequences=X_train, padding="post", value=word2idx['ENDPAD'])

y_train = [[tag2idx[t] for t in s] for s in df_train['crf_tags']]
y_train_pad = pad_sequences(maxlen=max_len, sequences=y_train, padding="post", value=tag2idx["O"])
y_train = [to_categorical(i, num_classes=n_tags) for i in y_train_pad]

# Test
X_test = [[word2idx.get(w['text'], 0) for w in s] for s in df_test['tokens']]
X_test = pad_sequences(maxlen=max_len, sequences=X_test, padding="post", value=0)

y_test = [[tag2idx[t] for t in s] for s in df_test['crf_tags']]
y_test = pad_sequences(maxlen=max_len, sequences=y_test, padding="post", value=tag2idx["O"])
y_test = [to_categorical(i, num_classes=n_tags) for i in y_test]


Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
2.2.4
Directory  model1  already exists


In [3]:
weights = embedding_weights(
            X_train, y_train_pad,
            n_words, n_tags,
            max_len, emb_dim,
            output_dir,
            pretrained_emb_dir,
            config_training['EMB_EPOCHS'],
            'simple')

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 165, 20)           134160    
_________________________________________________________________
flatten_1 (Flatten)          (None, 3300)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 165)               544665    
Total params: 678,825
Trainable params: 678,825
Non-trainable params: 0
_________________________________________________________________
None
Instructions for updating:
Use tf.cast instead.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24

In [47]:
np.array(weights).shape
len(word2idx)

6707

In [0]:
from gensim.models.callbacks import CallbackAny2Vec
class EpochSaver(CallbackAny2Vec):
  '''Callback to save model after each epoch and show training parameters '''
  '''def __init__(self, savedir):
      self.savedir = savedir
      self.epoch = 0

      os.makedirs(self.savedir, exist_ok=True)
  '''
  def on_epoch_end(self, model):
      print("Model loss:", model.get_latest_training_loss())



In [0]:
idx2word = dict(map(reversed, word2idx.items()))
def reverseX(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            #p_i = np.argmax(p)
            out_i.append(idx2word[p])
        out.append(out_i)
    return out
x2_train = reverseX(X_train)

In [39]:
from gensim.models import Word2Vec
w2v_model = Word2Vec(min_count=1,
                     window=2,
                     size=20,
                     #sample=6e-5, 
                     alpha=0.5,
                     min_alpha=0.001, 
                     #min_alpha=0.0007, 
                     negative=0,
                     compute_loss=True,
                     sg=0)
tokens = df_train['tokens'].apply(lambda x: [el['text'] for el in x])
w2v_model.build_vocab(x2_train)
w2v_model.train(x2_train, total_examples=w2v_model.corpus_count, epochs=5, start_alpha=0.5,end_alpha=0.5,compute_loss=True,callbacks=[EpochSaver()])

Model loss: 0.0
Model loss: 0.0
Model loss: 0.0
Model loss: 0.0
Model loss: 0.0


(698537, 4681875)

0.0

In [3]:
len(w2v_model.wv.vocab)
w2v_model.save('/content/emb_model.bin')
w = Word2Vec.load('/content/emb_model.bin')


gensim.models.word2vec.Word2Vec

In [0]:
import gensim
w = gensim.models.KeyedVectors.load_word2vec_format('/content/gdrive/My Drive/drug_embeddings/resources/glove/wikipedia-pubmed-and-PMC-w2v.bin', binary=True)

In [0]:
from sklearn.model_selection import train_test_split
x_tr, x_tst, y_tr, y_tst = train_test_split(
            X_train , np.array(y_train), test_size=0.1)

In [4]:
from keras.layers import Embedding, Flatten, Dense
from keras.models import Sequential
#emb_type = config['emb_type']
#neurons_rnn = config['neurons_rnn']
#neurons_dense = config['neurons_dense']
#rec_drop = config['rec_drop']
#impl = config['impl']
#if emb_type == 'simple':
model = Sequential()
model.add(Embedding(input_dim=n_words+1,
                    output_dim=emb_dim,
                    input_length=max_len))
model.add(Flatten())
model.add(Dense(max_len, activation='sigmoid'))

print(model.summary())
            
model.compile(optimizer='adam', loss='mse', metrics=['acc'])

cbacks = []
cbacks.append(metrics)
tensorboard = TensorBoard(log_dir="{}".format(time()))
cbacks.append(tensorboard)


h = model.fit(X_train, y_train_pad,
          #validation_data=(x_tst,y_tst),
          epochs=3,
          #callbacks=cbacks,
          verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 165, 20)           134160    
_________________________________________________________________
flatten_2 (Flatten)          (None, 3300)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 165)               544665    
Total params: 678,825
Trainable params: 678,825
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3
