#### Place the code that collects all the data from vcare in the cell below (it must define `x_rep_raw` and `y_rep`, which the tokenization cell uses)

In [None]:
#
#
#

### Import packages

In [1]:
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from keras.layers import RepeatVector, Dense, Activation, Lambda, Embedding, Flatten
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.models import load_model, Model, Sequential
import keras.backend as K
import numpy as np
import matplotlib.pyplot as plt
from keras.preprocessing import sequence
from keras import layers
from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os

Using TensorFlow backend.


### Import functions

In [None]:
#import keras_metrics
def recall(y_true, y_pred):
    """Batch-wise recall metric.

    Counts the true positives and the actual positives in the current
    batch (after clipping to [0, 1] and rounding) and returns their ratio,
    i.e. how many of the relevant items were selected. The value is a
    per-batch figure, not a global one.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_positives = K.round(K.clip(y_true, 0, 1))
    # K.epsilon() keeps the division defined when the batch has no positives.
    return K.sum(hits) / (K.sum(actual_positives) + K.epsilon())

def precision(y_true, y_pred):
    """Batch-wise precision metric.

    Counts the true positives and the predicted positives in the current
    batch (after clipping to [0, 1] and rounding) and returns their ratio,
    i.e. how many of the selected items are relevant. The value is a
    per-batch figure, not a global one.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    # K.epsilon() keeps the division defined when nothing is predicted positive.
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_score(y_true, y_pred):
    """Batch-wise F1 score.

    Combines the batch-wise precision and recall into their harmonic mean,
    2 * p * r / (p + r). Like the two metrics it is built from, the value
    is a per-batch figure, not a global one.
    """
    prec = precision(y_true, y_pred)
    rec = recall(y_true, y_pred)
    # K.epsilon() keeps the division defined when precision and recall are 0.
    return (2 * prec * rec) / (prec + rec + K.epsilon())


### Parameters setup
There are 2607 medical records as of now. Their length is variable: the longest is 3931 words long, and on average they are 801 words long. More than 98% of them are shorter than 2000 words.
We have a word2vec dictionary of 27610 entries with 256 features each.

In [None]:
# Dataset size and split: 70% training, 15% validation, remainder for test.
# Adjust the fractions to split the dataset as you wish.
total_samples = 2607
training_samples = int(0.7 * total_samples)
validation_samples = int(0.15 * total_samples)
test_samples = total_samples - (training_samples + validation_samples)

# Text preprocessing limits.
maxlen = 1000  # cutoff for a medical record's length
max_words = 7000  # how many words to take into account

### Tokenization

In [None]:
# renaming data fetched from the database
data = x_rep_raw
del x_rep_raw
labels = y_rep 
del y_rep
#########################################################################################################################


word_index = tokenizer.word_index
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=maxlen) #Pads sequences to the same length.
labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]
x_test = data[total_samples-test_samples:total_samples]
y_test = labels[total_samples-test_samples:total_samples]

### Parsing and preparing the word2vec word-embeddings file

In [None]:
# Parse the word2vec text file into a {word: vector} dictionary.
w2v_dir = '/home/v_charvet/workspace/data'
embeddings_index = {}
# The context manager closes the file even if parsing a line fails.
with open(os.path.join(w2v_dir, 'w2v_reports_256.vec'),
          encoding='utf-8', errors='ignore') as f:
    f.readline()  # skip the first line: number of words, number of features
    for line in f:
        values = line.split()
        if not values:
            # Ignore blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

# Prepare the word2vec word-embeddings matrix: row i holds the vector of the
# word whose tokenizer index is i. Words without a pretrained vector, and
# indices >= max_words, are left as all-zero rows / skipped.
embedding_dim = 256  # number of features per word (a 128-feature option exists too)
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

### Model
Replace the next cell with any of the uploaded models and adjust the output file names accordingly.

In [None]:
# Embedding -> dropout -> bidirectional GRU -> single sigmoid output unit.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    layers.Dropout(0.5),
    layers.Bidirectional(layers.GRU(32)),  # 32, 64, 100 would be best
    layers.Dense(1, activation='sigmoid'),
])
model.summary()

# Load the pretrained vectors into the embedding layer and freeze it so
# that training does not update them.
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

# Optimizer
opt = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, decay=0.01)

model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[recall])
history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_val, y_val),
)

#optional:
#model.save_weights('pre_trained_glove_model.h5')

### Visualization

In [None]:
# Per-epoch training curves recorded by model.fit.
rec = history.history['recall']
val_rec = history.history['val_recall']
loss = history.history['loss']
val_loss = history.history['val_loss']
# One x-axis tick per recorded epoch, starting at 1.
epochs = range(1, len(rec) + 1)

# Recall curves.
plt.figure()
plt.plot(epochs, rec, 'bo', label='Training rec')
plt.plot(epochs, val_rec, 'b', label='Validation rec')
plt.title('Training and validation rec')
plt.legend()
# Save only after the legend is drawn so it appears in the image.
plt.savefig('DITEP_modelname_rec.png')  # Choose output image name

# Loss curves.
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.savefig('DITEP_modelname_loss.png')  # Choose output image name

### Evaluation
Customize what you want to retain of the model.

In [None]:
# Evaluate on the held-out test set; with one metric configured the result
# is [loss, recall].
scores = model.evaluate(x_test, y_test)
print('Test loss:', scores[0])
print('Test recall', scores[1])

# Append the per-epoch validation recall, taken from the training history,
# after a marker line so the output file keeps both in one place.
scores.extend(['val_rec:'])
scores.extend(history.history['val_recall'])

# Write one value per line; the context manager closes the file on error too.
with open('out_DITEP_modelname.txt', 'w') as f:  # Choose output .txt name
    f.writelines(str(item) + '\n' for item in scores)