# Libraries, Paths and Configuration

In [1]:
import sys
# Add the project's source directory to the import path.
# Presumably this is where the project modules imported further down
# (utils, controllers, features, extractor, inspector, scores) live - TODO confirm.
sys.path.append('../../src')

In [2]:
# --- Experiment configuration -------------------------------------------

# Model / training
N_STATES = 16        # number of units in the LSTM layer
BATCH_SIZE = 10      # batch size handed to the model and to the state extractor
N_EPOCHS = 5         # intended number of training epochs
CACHED = False       # False: train and save the model; True: load the saved model

# NOTE(review): the four settings below are not read by any cell visible in
# this notebook; they are kept as-is in case other code relies on them.
N_SENTENCES = 50000
TEST_RATIO = 0.1
WINDOW_SIZE = 16
SAMPLES_VERIFY = 8

In [3]:
import datetime
import os

# Per-run output directory, named "Boolean-" followed by the current
# date and time as rendered by str(datetime).
FOLDER_OUT = 'Boolean-%s' % datetime.datetime.today()

# Create the directory only when nothing exists at that path yet.
if not os.path.exists(FOLDER_OUT):
    os.makedirs(FOLDER_OUT)

In [4]:
import numpy as np
import keras
# Seed NumPy's global random generator so that NumPy-based randomness
# (e.g. the row permutation applied to the dataset further down) is
# repeatable from run to run.
# NOTE(review): only NumPy is seeded; no seed for the Keras backend is set
# in the visible cells, so training itself may still vary between runs.
np.random.seed(55555)

Using TensorFlow backend.


In [5]:
%load_ext rpy2.ipython

## Load and set up dataset

In [6]:
import pandas as pd
import utils.preprocess as pre
reload(pre)

# Read the dataset. The 'sequence' column is loaded with dtype=object so
# pandas leaves its values unparsed (presumably they are strings made of
# the symbols listed in char2int - see the printed vocabulary).
input_file = 'boolean.csv'
df = pd.read_csv(input_file, dtype={'sequence': object})

# Half of the rows are used for training, the remainder for testing.
test_split = 0.5
train_size = int(len(df) * (1 - test_split))

# Build the symbol <-> integer vocabulary from all sequences joined together.
_, char2int, int2char = pre.encode(''.join(df['sequence']))

# Shift every code up by one so that code 0 becomes free, then assign
# code 0 to the extra symbol 'X' (0 is the value used for padding later).
char2int = {symbol: code + 1 for symbol, code in char2int.items()}
int2char = {code + 1: symbol for code, symbol in int2char.items()}
char2int['X'] = 0
int2char[0] = 'X'

# Replace each sequence by its array of integer codes, then shuffle the rows.
df['sequence'] = df['sequence'].apply(
    lambda seq: np.array([char2int[ch] for ch in seq]))
df = df.reindex(np.random.permutation(df.index))

# Split the shuffled rows into the training part and the test part.
sequences = df['sequence'].values
targets = df['target'].values
X_train = np.array(sequences[:train_size])
y_train = np.array(targets[:train_size])
X_test = np.array(sequences[train_size:])
y_test = np.array(targets[train_size:])

# Sanity output: sample of the encoded training sequences ...
print(X_train.shape)
print(X_train[:10])

# ... whether every training sequence has an odd length ...
print(all(seq.size % 2 == 1 for seq in X_train))

# ... the longest training sequence ...
print(max(seq.size for seq in X_train))

# ... and the labels together with the final vocabulary.
print(y_train.shape)
print(y_train)
print(min(y_train), max(y_train))
print(char2int)

Encoding
Total vocabulary len_sequence:  4
(5000,)
[array([3, 4, 2, 1, 2, 4, 3, 1, 3, 1, 3, 4, 2, 4, 2, 4, 3])
 array([3, 1, 2, 1, 2, 4, 2, 1, 3, 4, 3, 4, 3, 4, 3, 4, 2, 4, 3, 1, 2, 1, 3,
       1, 3, 4, 3, 4, 3, 4, 3, 1, 2, 4, 3, 1, 2, 1, 3])
 array([3, 1, 2, 1, 3, 4, 2])
 array([3, 4, 2, 1, 2, 4, 2, 1, 3, 4, 2, 1, 3, 1, 3, 4, 3, 4, 2, 4, 2, 4, 3,
       1, 2])
 array([3, 1, 3, 1, 2, 4, 2, 4, 2]) array([3, 4, 3, 1, 3, 4, 2, 4, 3])
 array([2, 4, 2, 1, 3, 4, 2, 1, 3, 4, 2, 4, 2, 4, 3, 4, 3, 4, 3, 1, 3, 4, 3,
       4, 3, 1, 3, 4, 3, 4, 2, 1, 2, 1, 2])
 array([2, 1, 2, 4, 2, 4, 2, 4, 3, 1, 3, 4, 3])
 array([2, 1, 2, 4, 2, 4, 3, 4, 3, 1, 3, 1, 2, 1, 3, 1, 3, 4, 3, 4, 3, 4, 2,
       1, 2, 4, 3, 1, 2, 4, 2, 4, 3, 1, 2, 1, 3, 1, 3])
 array([3, 4, 3, 1, 2, 1, 3, 1, 2, 1, 2, 4, 2, 1, 3])]
True
41
(5000,)
[1 1 0 ..., 0 0 1]
(0, 1)
{'1': 3, '0': 2, '|': 4, 'X': 0, '&': 1}


In [7]:
# Bring every input sequence to one common length (padding short ones,
# truncating any that would be longer than max_length).
from keras.preprocessing import sequence

max_length = 50

# Show a sample of the test sequences as they look before padding.
print("X_test before padding : {} ".format(X_test[:10]))

X_train = sequence.pad_sequences(X_train, maxlen=max_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_length)

# Same sample after padding; every row now has max_length entries.
print("X_test after padding : {} ".format(X_test[:10]))


X_test before padding : [ array([2, 4, 2, 4, 3, 4, 3, 1, 2, 4, 3, 4, 2, 1, 3, 4, 3, 1, 3, 4, 2, 4, 2,
       4, 3, 4, 2, 1, 2, 4, 2, 4, 2, 4, 3])
 array([3, 4, 3, 1, 2, 4, 3, 1, 3, 1, 2, 1, 3, 4, 2, 4, 2, 4, 3, 4, 2, 4, 2,
       1, 3, 1, 2, 1, 3, 1, 3, 4, 2, 1, 3, 4, 2])
 array([2, 1, 2, 4, 3, 1, 2, 1, 2, 4, 3, 4, 2, 4, 2, 1, 2, 1, 3, 4, 3, 1, 2,
       1, 3, 1, 2])
 array([3, 1, 2, 1, 2, 1, 3])
 array([3, 1, 2, 4, 3, 4, 3, 1, 3, 1, 3, 4, 3, 1, 3, 4, 2, 1, 3, 4, 2])
 array([3, 4, 3, 1, 3, 4, 2, 1, 2, 1, 3, 4, 2, 1, 2, 1, 2])
 array([2, 4, 3, 1, 2, 4, 3, 1, 3, 4, 3, 1, 3, 1, 2, 4, 3, 1, 3, 1, 2, 1, 3,
       4, 3, 1, 2, 4, 2, 4, 3, 1, 3])
 array([2, 1, 2, 4, 2, 4, 2, 1, 3, 4, 3, 4, 3, 1, 3, 1, 2, 1, 2, 1, 3, 1, 2,
       1, 2, 4, 2, 1, 2, 1, 3, 4, 3])
 array([3, 4, 2, 1, 3, 4, 3, 1, 2, 1, 3, 4, 3, 4, 2, 1, 2, 1, 3])
 array([2, 1, 2, 4, 3, 1, 2, 4, 3, 4, 3, 1, 3, 4, 3, 1, 3, 1, 2, 1, 3, 4, 3,
       1, 2, 1, 3, 4, 2, 4, 2, 1, 2, 1, 2, 4, 3, 1, 3])] 
X_test after padding : [[0 0 0 0 0 0 

In [8]:
# One-hot encode the padded integer sequences into arrays of shape
# (n_sequences, max_length, n_symbols).
#
# * num_classes is passed explicitly so the one-hot width always equals the
#   vocabulary size, even if the highest code happens to be absent from one
#   of the two splits (to_categorical would otherwise infer the width from
#   the data and the reshape below would fail).
# * reshape(-1, ...) derives the number of sequences from the data instead
#   of hard-coding it, so the cell keeps working when the dataset size or
#   the train/test split changes.
n_symbols = len(char2int)  # vocabulary size, including the padding symbol 'X'

X_train_flat = X_train.flatten()
X_train = keras.utils.to_categorical(X_train_flat, num_classes=n_symbols)
X_train = X_train.reshape(-1, max_length, n_symbols)

# X_test_flat is kept as a separate name: it is used again later to decode
# the test set back into characters.
X_test_flat = X_test.flatten()
X_test = keras.utils.to_categorical(X_test_flat, num_classes=n_symbols)
X_test = X_test.reshape(-1, max_length, n_symbols)

# y_train / y_test are intentionally left as flat 0/1 vectors (not one-hot
# encoded): the model ends in a single sigmoid unit trained with
# binary cross-entropy.

In [9]:
# Report the final array shapes of both splits.
# Single-argument print(...) calls are used, as in the other cells of this
# notebook; they produce the same text under Python 2 and Python 3.
print("Training data:")
print("X: {}".format(X_train.shape))
print("y: {}".format(y_train.shape))

print("Test data:")
print("X: {}".format(X_test.shape))
print("y: {}".format(y_test.shape))

Training data:
X: (5000, 50, 5)
y: (5000,)
Test data:
X: (5000, 50, 5)
y: (5000,)


In [10]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM,SimpleRNN
from keras.layers import Lambda
from keras import regularizers

from controllers.mylstm_legacy import MYLSTM


from keras import optimizers

# NOTE(review): this cell used to construct
#     keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
# and immediately discard the result, so it never influenced training: the
# model has always been compiled with the string 'adam' (default settings).
# The no-op statement was removed to avoid suggesting otherwise. If a
# non-default learning rate is wanted, create the optimizer instance and
# pass it as `optimizer=` to model.compile() below.

# Shape of one input sample: (timesteps, one-hot width).
in_dim = X_train.shape[1:]

model = Sequential()
# The LSTM returns its output at every timestep (return_sequences=True).
# batch_size is fixed here, so the model expects batches of exactly
# BATCH_SIZE samples.
model.add(LSTM(N_STATES, return_sequences=True,
                         stateful=False,
                         batch_size=BATCH_SIZE,
                         input_shape=in_dim))
# Keep only the output of the last timestep for classification.
model.add(Lambda(lambda x: x[:,-1, :]))
# Single sigmoid unit: binary classification of the sequence.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [11]:
import os

from keras.models import load_model
from controllers.mylstm_legacy import MYLSTM

# Single definition of where the trained model is cached on disk.
MODEL_PATH = 'models/boolean.h5'

if not CACHED:
    # Make sure the target directory exists *before* training, so that a
    # missing folder cannot make model.save() fail after the fit has finished.
    model_dir = os.path.dirname(MODEL_PATH)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    model.summary()
    # The number of epochs comes from the configuration cell (N_EPOCHS)
    # instead of a second, hard-coded literal.
    model.fit(X_train, y_train,
              batch_size=BATCH_SIZE,
              epochs=N_EPOCHS,
              verbose=1,
              shuffle=False)
    model.save(MODEL_PATH)
else:
    # Reuse the model saved by an earlier run.
    model = load_model(MODEL_PATH)
        

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (10, 50, 16)              1408      
_________________________________________________________________
lambda_1 (Lambda)            (10, 16)                  0         
_________________________________________________________________
dense_1 (Dense)              (10, 1)                   17        
Total params: 1,425
Trainable params: 1,425
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
# Final evaluation of the model
model.reset_states()
scores = model.evaluate(X_test, y_test, verbose=0, batch_size=BATCH_SIZE)

import pickle
pickle.dump((X_test, y_test), open("test_data_boolean.pkl", "wb"))

print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 98.54%


In [13]:
import datetime
import os

# Save an additional copy of the model whose file name carries the current
# date and time, i.e. "boolean_<str(datetime)>.h5".
snapshot_name = 'boolean_{}.h5'.format(datetime.datetime.today())
model.save(snapshot_name)

## TODO: Extract Features

In [14]:
import features as feat
reload(feat)

# Decode the flattened (padded) test set back into one long list of symbols.
X_test_sequence = [int2char[code] for code in X_test_flat]

# Feature built from the vocabulary; judging by its name it tracks the
# states of a boolean-expression FSM - TODO confirm in features.py.
bool_fsm_states = feat.bool_fsm_states(char2int)

# List of features handed to the FeatureFrame.
features = [bool_fsm_states]

In [15]:
reload(feat)

# Evaluate every feature in `features` over the decoded test sequence.
feature_frame_x = feat.FeatureFrame(features, X_test_sequence)
feature_frame_x.extract()

# Show the feature names and the first rows of the feature matrix.
# Single-argument print(...) calls are used, as in the other cells of this
# notebook; they produce the same text under Python 2 and Python 3.
print('Features for test sequence:')
print(feature_frame_x.names)
print(feature_frame_x.values[:15,:])

Generating feature scores
Running feature 0 out of 1
250000
Added features ['S0', 'S1', 'S2', 'S3', 'S4', '...']
Tidying...
Computed feature matrix, with shape: (250000, 7)
Snippet of the features
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0|1&0|0|0
['S0', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6']
[[ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  

## Extract Hidden States

In [16]:
import extractor
reload(extractor)

# Attach the extractor to layer id 0 of the model and collect that layer's
# activations for the whole test set.
ex = extractor.Extractor(model, [0])
states = ex.get_states(X_test, batch_size=BATCH_SIZE, unshuffle=True)

# Layer structure and offsets, needed by the inspector later on.
# ("get_offets" is the method name as spelled in the extractor module.)
nn_config = ex.get_structure()
nn_offsets = ex.get_offets()

import pickle
# Persist the extracted states. The file is opened in a `with` block so it
# is flushed and closed even if pickling fails.
with open("states_boolean.pkl", "wb") as states_file:
    pickle.dump(states, states_file)

# Single-argument print(...) calls are used, as in the other cells of this
# notebook; they produce the same text under Python 2 and Python 3.
print('states shape: {}'.format(states.shape))
print('')
print('config: {}'.format(nn_config))
print('offets: {}'.format(nn_offsets))

BEWARE _ ONLY SUPPORTS CONSECUTIVE LAYER IDS STARTING AT 0
Creates spy models
... for id 0 : <keras.layers.recurrent.LSTM object at 0x11b5e2c50>
Gets the activations for the hidden states
Gets structure
Gets offets
Gets structure
states shape: (5000, 800)

config: [('<keras.layers.recurrent.LSTM object at 0x11b5e2c50>', 50, 16)]
offets: {(0, 27): 27, (0, 47): 47, (0, 20): 20, (0, 14): 14, (0, 7): 7, (0, 49): 49, (0, 43): 43, (0, 16): 16, (0, 10): 10, (0, 36): 36, (0, 3): 3, (0, 28): 28, (0, 32): 32, (0, 21): 21, (0, 15): 15, (0, 24): 24, (0, 44): 44, (0, 17): 17, (0, 11): 11, (0, 37): 37, (0, 4): 4, (0, 40): 40, (0, 29): 29, (0, 33): 33, (0, 22): 22, (0, 0): 0, (0, 25): 25, (0, 45): 45, (0, 18): 18, (0, 12): 12, (0, 38): 38, (0, 5): 5, (0, 41): 41, (0, 30): 30, (0, 8): 8, (0, 34): 34, (0, 23): 23, (0, 1): 1, (0, 26): 26, (0, 46): 46, (0, 19): 19, (0, 13): 13, (0, 39): 39, (0, 6): 6, (0, 48): 48, (0, 42): 42, (0, 31): 31, (0, 9): 9, (0, 35): 35, (0, 2): 2}


## Plot activations TODO

## Inspect: correlation between hidden states and features

In [17]:
#states_resized = np.zeros((feature_frame_x.values.shape[0], N_STATES))

#for s in range(N_STATES):
#    states_resized[:,s] = np.hstack( [ states[:, t+s] for t in range(0, states.shape[1], N_STATES)])



In [18]:
# NOTE: importing the `scores` module rebinds the name `scores`, which until
# now held the result of model.evaluate().
import scores
import inspector as ip

# Score the extracted hidden states against the feature matrix using the
# Correlation scorer.
#
# NOTE(review): in the recorded run this cell fails with
#     ValueError: Feature has 249951 rows, while states has 5000 rows
# According to the printed dimensions the feature matrix has 250000 rows
# while `states` has shape (5000, 800). Presumably the features are
# per-character (5000 sequences x 50 timesteps) and the states are
# per-sequence (50 timesteps x 16 units per row), so the two inputs need to be
# brought to a matching row layout before inspect() can succeed - TODO confirm
# the layout expected by inspector.Inspector.
insp = ip.Inspector(nn_config, nn_offsets)
mi_scores, names = insp.inspect(states, feature_frame_x, scores.Correlation())

Computing attribution scores
Feture matrix dimensions: (250000, 7)
States dimensions: (5000, 800)
Computing score for feature 0: S0
Layer 0
Timestep 0
Scoring neurons 0 to 15


ValueError: Feature has 249951 rows, while states has 5000 rows