In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import pandas as pd
import tensorflow as tf
import numpy as np
import os
import time
import cProfile
import tensorflow.contrib.eager as tfe

# Turn on eager execution: the cells below iterate over tf.data datasets in
# plain Python loops and call `.numpy()` on the resulting tensors.
tfe.enable_eager_execution()

In [2]:
# Load only the "sentence" column of the CSV (path is relative to the
# notebook's working directory).
df = pd.read_csv("diseasesSentences2.csv", usecols=["sentence"])

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,sentence
0,bipolar disorder RESEMBLES panic disorder PALL...
1,bipolar disorder RESEMBLES panic disorder PALL...
2,bipolar disorder RESEMBLES panic disorder PALL...
3,bipolar disorder RESEMBLES panic disorder PALL...
4,bipolar disorder RESEMBLES panic disorder PALL...


In [4]:
# Build the training corpus: all sentences joined with ", " into one string,
# then upper-cased so the vocabulary contains no lower-case letters.
text = df.sentence.str.cat(sep=', ').upper()

In [5]:
# Sorted list of the distinct characters that occur in the corpus.
vocab = sorted(set(text))

In [6]:
# Number of distinct characters (this becomes the model's output width).
len(vocab)

53

In [7]:
# Two lookup tables over the sorted vocabulary:
#   idx2char -- array whose position i holds the i-th character
#   char2idx -- dict giving each character its position in `vocab`
idx2char = np.array(vocab)
char2idx = dict(zip(vocab, range(len(vocab))))

# The whole corpus encoded as an array of character ids.
text_as_int = np.array([char2idx[c] for c in text])

In [8]:
# Show the first 13 characters next to their integer ids as a sanity check.
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))

'BIPOLAR DISOR' ---- characters mapped to int ---- > [23 30 37 36 33 22 39  0 25 30 40 36 39]


In [9]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
# Number of (seq_length + 1)-character chunks that fit in the corpus; the extra
# character is there because each chunk is later split into an input and a
# target shifted by one.
# NOTE(review): not referenced again in the cells shown here.
examples_per_epoch = len(text)//(seq_length+1)

# Create training examples / targets: a dataset that yields one character id
# at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [10]:
# Decode the first five elements back to characters to confirm the dataset
# yields the corpus in order.
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

Instructions for updating:
Colocations handled automatically by placer.
B
I
P
O
L


In [11]:
# Group consecutive character ids into chunks of seq_length + 1; the trailing
# chunk is dropped if it is incomplete.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode the first five chunks back to text for inspection.
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

'BIPOLAR DISORDER RESEMBLES PANIC DISORDER PALLIATED_BY FLUOXETINE CAUSES DYSARTHRIA, BIPOLAR DISORDER'
' RESEMBLES PANIC DISORDER PALLIATED_BY FLUOXETINE CAUSES THROMBOCYTOPENIA, BIPOLAR DISORDER RESEMBLES'
' PANIC DISORDER PALLIATED_BY FLUOXETINE CAUSES MENTAL DISABILITY, BIPOLAR DISORDER RESEMBLES PANIC DI'
'SORDER PALLIATED_BY FLUOXETINE CAUSES HYPOXIA, BIPOLAR DISORDER RESEMBLES PANIC DISORDER PALLIATED_BY'
' FLUOXETINE CAUSES HYPERPLASIA, BIPOLAR DISORDER RESEMBLES PANIC DISORDER PALLIATED_BY FLUOXETINE CAU'


In [12]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i].
    """
    return chunk[:-1], chunk[1:]

# Apply the split to every chunk: the dataset now yields (input, target) pairs.
dataset = sequences.map(split_input_target)

In [13]:
# Decode the first (input, target) pair; the target should read as the input
# shifted one character to the left.
for input_example, target_example in  dataset.take(1):
    print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'BIPOLAR DISORDER RESEMBLES PANIC DISORDER PALLIATED_BY FLUOXETINE CAUSES DYSARTHRIA, BIPOLAR DISORDE'
Target data: 'IPOLAR DISORDER RESEMBLES PANIC DISORDER PALLIATED_BY FLUOXETINE CAUSES DYSARTHRIA, BIPOLAR DISORDER'


In [14]:
# Walk through the first five time steps of that pair: at each step the model
# receives `input_idx` and is expected to predict `target_idx`.
# Relies on `input_example` / `target_example` left over from the previous cell.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 23 ('B')
  expected output: 30 ('I')
Step    1
  input: 30 ('I')
  expected output: 37 ('P')
Step    2
  input: 37 ('P')
  expected output: 36 ('O')
Step    3
  input: 36 ('O')
  expected output: 33 ('L')
Step    4
  input: 33 ('L')
  expected output: 22 ('A')


In [15]:
# Number of (input, target) sequences per training batch
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Shuffle the sequences, then group them into batches of BATCH_SIZE, dropping
# a final incomplete batch.
# NOTE(review): this rebinds `dataset`, so the cell is not idempotent --
# running it a second time would batch the already-batched dataset.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Display the dataset to check the element shapes and dtypes.
dataset

<DatasetV1Adapter shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [16]:
# Length of the vocabulary in chars (input size of the embedding layer and
# output size of the final dense layer in build_model)
vocab_size = len(vocab)

# The embedding dimension: size of the vector each character id is mapped to
embedding_dim = 256

# Number of RNN units in the GRU layer
rnn_units = 1024

In [17]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level network: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: number of distinct character ids; used as the embedding
            input size and as the width of the final dense layer.
        embedding_dim: size of the vector each character id is embedded into.
        rnn_units: number of units in the GRU layer.
        batch_size: fixed batch size baked into the input shape (the GRU is
            stateful); the sequence length is left unspecified.

    Returns:
        An uncompiled `tf.keras.Sequential` model that emits `vocab_size`
        values for every time step.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [18]:
# Training model: same hyperparameters as configured above, with the batch
# dimension fixed to the training batch size.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=BATCH_SIZE)

In [19]:
# Run one batch through the untrained model to confirm the output shape.
# The loop variables are reused by later cells.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 53) # (batch_size, sequence_length, vocab_size)


In [20]:
# Layer-by-layer overview of the training model and its parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           13568     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3935232   
_________________________________________________________________
dense (Dense)                (64, None, 53)            54325     
Total params: 4,003,125
Trainable params: 4,003,125
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Sample one character id per time step from the model output for the first
# sequence of the batch (the outputs are treated as logits).
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array of ids.
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [22]:
# Decode the first input sequence and the ids sampled from the untrained
# model; at this point the predictions are expected to look random.
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 'EMBLES PANIC DISORDER PALLIATED_BY CLOMIPRAMINE CAUSES DEPRESSED LEVEL OF CONSCIOUSNESS, BIPOLAR DIS'

Next Char Predictions: 
 "Α4,33A5NR_+ΑKQ-XD2K>9YHDRJ:ND I5OBV2ÏN((GH'MUT>PIO(VXVZ63RHFKCTLY.PP:L)&U1.,7 K(I,(B83E)3 SU8SE>)7(B"


In [23]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    `from_logits=True` tells Keras that `logits` are unnormalised scores
    rather than probabilities.
    """
    cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True)
    return cross_entropy

# Evaluate the loss on the example batch from the untrained model and print
# its mean as a baseline value.
example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 53)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       3.9700818


In [24]:
# Configure training: Adam optimizer with the logits-based loss defined above.
model.compile(optimizer='adam', loss=loss)

In [25]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; "{epoch}" is a placeholder filled in with the
# epoch number (the recorded run ends with "ckpt_30")
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback passed to model.fit; stores only the weights, not the full model,
# so the architecture must be rebuilt in code before loading.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [26]:
# Number of training epochs passed to model.fit.
EPOCHS=30

In [27]:
# Batches drawn per epoch. An "epoch" here is a fixed number of steps, not a
# full pass over the data.
STEPS_PER_EPOCH = 10

# Training requests EPOCHS * STEPS_PER_EPOCH batches in total. `.repeat()`
# restarts (and reshuffles) the finite dataset whenever it is exhausted, so
# training cannot run out of data part-way through, however small the corpus.
history = model.fit(
    dataset.repeat(),
    epochs=EPOCHS,
    callbacks=[checkpoint_callback],
    steps_per_epoch=STEPS_PER_EPOCH)

Epoch 1/30


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "



Consider using a TensorFlow optimizer from `tf.train`.
Instructions for updating:
Use tf.train.CheckpointManager to manage checkpoints rather than manually editing the Checkpoint proto.
Epoch 2/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 3/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 4/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 5/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 6/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 7/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 8/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 9/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 10/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 11/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 12/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 13/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch

Epoch 16/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 17/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 18/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 19/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 20/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 21/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 22/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 23/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 24/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 25/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 26/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 27/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 28/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 29/30

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 30/30

Consider using a TensorFlow optimiz

In [27]:
# Path prefix of the most recent checkpoint written during training.
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_30'

In [28]:
# Rebuild the same architecture with batch_size=1 for text generation; the
# training model has its batch size fixed at BATCH_SIZE.
# NOTE: this rebinds `model`; the training model is no longer reachable by
# that name afterwards.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the trained weights from the latest checkpoint.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Build with a batch of one sequence of unspecified length.
model.build(tf.TensorShape([1, None]))

In [29]:
# Same layers as the training model, now with a batch dimension of 1.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            13568     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3935232   
_________________________________________________________________
dense_1 (Dense)              (1, None, 53)             54325     
Total params: 4,003,125
Trainable params: 4,003,125
Non-trainable params: 0
_________________________________________________________________


In [63]:
def generate_text(model, start_string, num_generate=20, temperature=1.0):
    """Sample a continuation of `start_string` from the trained character model.

    The prompt is fed to the model in one go; after that, each sampled
    character is fed back as the next single-character input. The model is
    stateful, so its recurrent state carries the context between calls.

    Args:
        model: Keras model built with batch size 1. It is called on a
            (1, length) batch of character ids and its states are reset
            before generation starts.
        start_string: non-empty prompt. Every character must be a key of the
            module-level `char2idx` mapping.
        num_generate: number of characters to append to the prompt.
        temperature: positive divisor applied to the logits before sampling.
            Low temperatures result in more predictable text, higher
            temperatures in more surprising text.

    Returns:
        `start_string` followed by the `num_generate` sampled characters.

    Raises:
        ValueError: if `start_string` is empty or `temperature` is not
            positive.
        KeyError: if `start_string` contains a character that is not in
            `char2idx`.
    """
    # Validate cheap preconditions first so failures are explicit instead of
    # surfacing as shape errors or inf/NaN logits inside TensorFlow.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive, got {}".format(temperature))

    unknown_chars = sorted(set(start_string) - set(char2idx))
    if unknown_chars:
        raise KeyError(
            "start_string contains characters outside the training "
            "vocabulary: {}".format(unknown_chars))

    # Converting our start string to numbers (vectorizing), then adding a
    # leading batch dimension of size 1
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Characters sampled so far
    text_generated = []

    # Here batch size == 1. Clear any state left over from a previous call.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # scale the logits, then sample from the resulting categorical
        # distribution; only the sample for the last time step is kept
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # We pass the predicted character as the next input to the model;
        # the previous context lives in the model's recurrent state
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])
    return (start_string + ''.join(text_generated))

In [85]:
# Sampling is random, so each run of this cell can produce a different
# continuation.
# NOTE(review): execution count is out of order with the neighbouring cells;
# re-run the notebook top to bottom before trusting the recorded output.
print(generate_text(model, start_string=u"SCHIZOPHRENIA PALLIATED"))

SCHIZOPHRENIA PALLIATED_BY RISPERIDONE CAUS


In [40]:
# Same prompt as the previous cell; a different continuation is expected
# because sampling is random.
# NOTE(review): execution count is out of order -- the recorded output may
# come from an earlier version of generate_text.
print(generate_text(model, start_string=u"SCHIZOPHRENIA PALLIATED"))

SCHIZOPHRENIA PALLIATED_BY FLUVOXAMINE


In [33]:
# NOTE(review): the recorded output below starts with "... PALLIATES", not with
# this prompt ("... PALLIATED"). generate_text always returns the prompt
# followed by the sampled text, so that output is stale -- re-run this cell.
print(generate_text(model, start_string=u"SCHIZOPHRENIA PALLIATED"))

SCHIZOPHRENIA PALLIATES_CPD ARIPIPRAZOLE CAUSES_CCSE CARDE PRESSION, BIPO


In [37]:
# Variant prompt ending in "PALLIATES".
# NOTE(review): the recorded output is longer than prompt + 20 characters, so
# it was presumably produced with a different num_generate -- verify by
# re-running.
print(generate_text(model, start_string=u"SCHIZOPHRENIA PALLIATES"))

SCHIZOPHRENIA PALLIATES_CPD CITALOPRAM CAUSES_CCSE BREACT DISORDER, BIPOL


In [104]:
# Prompt that starts from a drug name instead of a disease.
# NOTE(review): execution count is out of order and the recorded output is
# longer than prompt + 20 characters; re-run to refresh it.
print(generate_text(model, start_string=u"PAROXETINE PALLIATES"))

PAROXETINE PALLIATES_CHG INTRASICAMPDELATIVE, BIPOLAR DISORDER RESEMBL


In [32]:
# Same prompt as the previous cell; sampling is random, so the continuation
# differs between runs.
# NOTE(review): execution count is out of order; the recorded output may be
# stale.
print(generate_text(model, start_string=u"PAROXETINE PALLIATES"))

PAROXETINE PALLIATES_CPD BIPOLAR DISORDER BINDS_CBL COFSCITALIPE, BIPO
