In [2]:
import os
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import keras.backend.tensorflow_backend as K
# NOTE(review): this is a bare attribute reference, not a call -- no session is
# created or registered here; the cell merely echoes the function object.
K.set_session

<function keras.backend.tensorflow_backend.set_session(session)>

In [28]:
### INPUT ###
# Read the table holding the sequence representations together with their
# labels (extended-substring encoding, seq2vec + TFIDF).
seq_repres = pd.read_csv(
    'ext_substr_w5_d100_seq2vec-TFIDF.csv'
)

# Report (rows, columns) of the loaded table as a quick sanity check.
print(seq_repres.shape)

(10000, 104)


In [32]:
### MAIN PART ###
# Model input: the row names (DataFrame index) of the table as int64 ids.
X = seq_repres.index.values.astype('int64')

# Embedding initializer: columns 4..103 of the table as a float64 matrix,
# one row per entry of X.
embedding_matrix = seq_repres.iloc[:, 4:104].values.astype('float64')

# Binary label vector: 1 where the 'label' column equals 'hotspot', else 0.
labels = np.where(
    seq_repres['label'].values.astype('object') == 'hotspot',
    1,
    0,
)


In [111]:
# build DAN (will terribly overfit)

embeddingDim = 100


def build_and_compile_model():
    """Assemble and compile the embedding-based binary classifier.

    Reads the module-level ``X`` (number of embedding rows),
    ``embedding_matrix`` (initial embedding weights) and ``embeddingDim``
    (embedding width). The Keras backend session is cleared first, so each
    call starts from a fresh graph.

    Architecture, in order: a length-1 integer input, a trainable embedding
    initialised from ``embedding_matrix``, Dense(32, tanh) ->
    BatchNormalization -> Dropout(0.3), Dense(8, relu) ->
    BatchNormalization, and a single sigmoid output unit.

    Returns:
        The model, compiled with binary cross-entropy loss, an Adagrad
        optimizer with default settings, and the 'accuracy' metric.
    """
    tf.keras.backend.clear_session()

    # input layer: one integer id per sample
    ids_in = keras.Input(shape=(1,), name='input_layer')

    # embedding lookup, initialised with the seq2vec representation
    embedded = layers.Embedding(
        input_dim=len(X),
        output_dim=embeddingDim,
        input_length=1,
        weights=[embedding_matrix],
        trainable=True,
        name='embedding',
    )(ids_in)

    # first hidden stage: dense -> batch norm -> dropout
    hidden = layers.Dense(32, activation='tanh')(embedded)
    hidden = layers.BatchNormalization(trainable=True)(hidden)
    hidden = layers.Dropout(0.3)(hidden)

    # second hidden stage: dense -> batch norm
    hidden = layers.Dense(8, activation='relu')(hidden)
    hidden = layers.BatchNormalization(trainable=True)(hidden)

    # single sigmoid unit for the binary decision
    prob = layers.Dense(1, activation='sigmoid')(hidden)

    # wire up and compile the model
    model = keras.Model(inputs=ids_in, outputs=prob)
    optimizer = keras.optimizers.Adagrad()
    model.compile(
        loss=keras.losses.BinaryCrossentropy(),
        optimizer=optimizer,
        metrics=['accuracy'],
    )

    return model

In [112]:
# Build a freshly compiled model and print its layer-by-layer summary.
model = build_and_compile_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     (None, 1)                 0         
_________________________________________________________________
embedding (Embedding)        (None, 1, 100)            1000000   
_________________________________________________________________
dense (Dense)                (None, 1, 32)             3232      
_________________________________________________________________
batch_normalization_v1 (Batc (None, 1, 32)             128       
_________________________________________________________________
dropout (Dropout)            (None, 1, 32)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1, 8)              264       
_________________________________________________________________
batch_normalization_v1_1 (Ba (None, 1, 8)              32        
__________

In [113]:
# fit model
# Inputs and targets must be passed separately: a list given as `x` is read
# as "one array per model input", so `[X, labels]` would feed the labels in as
# a second input and leave the model without any targets.
# The labels are reshaped to the model's per-sample output shape (taken from
# `model.output_shape`) so that predictions and targets line up element-wise.
fit = model.fit(X,
                labels.reshape((-1,) + tuple(model.output_shape[1:])),
                epochs = 30,
                verbose = 1)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [115]:
# save weights and metrics
# Create the output directory up front; without it both writes below would
# fail only after the training run has already completed.
os.makedirs('DAN', exist_ok=True)
model.save_weights('DAN/weights_ext_seq2vec-TFIDF.h5')

# Collect the training history as (metric name, recorded values) pairs.
name = list(fit.history.keys())
val = [fit.history[key] for key in name]

# One CSV row per metric: its name, then the list of values recorded for it.
m = pd.DataFrame(list(zip(name, val)))
m.to_csv('DAN/metrics_ext_seq2vec-TFIDF.csv', header=False, index=False)

