In [2]:
from sys import argv
import numpy as np
import json
from collections import defaultdict
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed, concatenate, Embedding
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.models import model_from_json
from keras.utils import to_categorical
import random

In [3]:
import pickle

In [4]:
# tuning
params = {
    'activation': ['relu', 'elu', 'sigmoid'],
    'batch_size': [100],
    'epochs': [20],
    'lr': [0.01, 0.04],
    'momentum': [0.95],
    'hidden_neurons': [100, 200, 400, 800, 1200],
    'loss': ['mean_squared_logarithmic_error', 'huber', 'mean_absolute_percentage_error', 'mean_absolute_error', 'mean_squared_error']
}

In [3]:
EPOCH_COUNT = 10
WINDOW_SIZES = (16, 3)
LOOKAHEAD = 1
TRAIN_TEST_SPLIT = 0.8

KB = 1024
LONG_JUMP_THRESHOLD = 256 * KB

In [2]:
from pandas import read_csv, DataFrame
df = read_csv('df')

In [2]:
def prepare_data(source):
    global d
    # Input loading and preprocessing:
    # - load data and get the read locations
    # - convert into samples by sliding a window
    # - prepare decoder input
    # - split into input and output

    with open(source) as f:
        data = json.loads(f.read())

    filereads = defaultdict(list)

    for op in data["operations"]:
        if op["type"] != "read":
            continue

        path, loc, size = op["path"], op["offset"], op["size"]
        filereads[path].append((loc, size))

    some_file = list(filereads.keys())[0]
    x = np.array(filereads[some_file])
    ins, outs = [], []

    # Split reads into samples:
    for i in range(len(x) - WINDOW_SIZES[0] - WINDOW_SIZES[1] + 1):
        # NOTE: just offset, not size
        ins.append(x[i:(i+WINDOW_SIZES[0]), :1].tolist())
        outs.append(x[(i+WINDOW_SIZES[0]):(i+WINDOW_SIZES[0]+WINDOW_SIZES[1]), :1].tolist())
        
    # shuffle
    io = [np.array(ins), np.array(outs)]
    io = np.concatenate(io, axis=1)
    # shuffles along the first axis
    np.random.shuffle(io)
    # recover
    ins = io[:, :WINDOW_SIZES[0], 0]
    outs = io[:, WINDOW_SIZES[0]:, 0]
    
    # convert ins to ins and outs to jumps
    def make_rel(w):
        rels = w[:, 1:] - w[:, :-1]
        return rels
    
    print(ins.shape)
    ins = make_rel(ins)
    outs = make_rel(outs)
    
    # remove uncommon jumps
    from collections import Counter
    ic = Counter(ins.reshape((-1)))
    ic = ic.most_common(96)
    ins2 = np.zeros(ins.shape)
    ins2[:, :] = 0.2
    outs2 = np.zeros(outs.shape)
    outs2[:, :] = 0.2
    
    # restore common jumps to ins2 and outs2:
    ic = [v for v, _ in ic]
    ic = {v: i for i, v in enumerate(ic)}
    d = ic
    for v, i in ic.items():
        ins2[ins == v] = i
        outs2[outs == v] = i
    ins = ins2
    outs = outs2
    
    # prepare decoder target
    out_dec = np.roll(outs, -LOOKAHEAD, axis=1)
    out_dec = out_dec[:, :-1]# = WINDOW_SIZES[0]+1
    in_dec = outs[:, :-1]
    
    in_enc = to_categorical(ins)
    in_dec = to_categorical(in_dec)
    out_dec = to_categorical(out_dec)
    
    tkns = in_enc.shape[-1]
    in_dec = in_dec.reshape((in_dec.shape[0], 1, in_dec.shape[-1]))
    out_dec = out_dec.reshape((out_dec.shape[0], 1, out_dec.shape[-1]))

    # Train/test split
    j = int(len(ins)*TRAIN_TEST_SPLIT)
    X_train = {'enc': in_enc[:j], 'dec': in_dec[:j]}
    Y_train = out_dec[:j]
    X_test = {'enc': in_enc[j:], 'dec': in_dec[j:]}
    Y_test = out_dec[j:]

    return (X_train, Y_train, X_test, Y_test)

In [27]:
data = prepare_data(data_filename)
X_train, Y_train, X_test, Y_test = data

(4078, 16)


In [1]:
print(X_train['enc'].shape)
print(X_train['dec'].shape)
print(Y_train.shape)
print(X_test['enc'].shape)
print(X_test['dec'].shape)
print(Y_test.shape)

NameError: name 'X_train' is not defined

In [29]:
X_train['dec'][:, :-1] == Y_train[:, 1:]


array([], shape=(3262, 0, 96), dtype=bool)

In [379]:
#### stara wersja create model poniej

In [380]:
def create_model_old(X_train, Y_train, params):
    n_hidden = params['hidden_neurons']
    token_count = X_train['enc'].shape[-1]

    n_timesteps, n_features, n_outputs = X_train['abs'].shape[1], X_train['abs'].shape[2], Y_train.shape[1]
    # n_features == 1

    rel_input = Input(shape=(n_timesteps), name='abs')
    emb_input = Embedding(token_count, 16)(rel_input)
    lstm_in = LSTM(rel_input.shape[1])(emb_input)
        
    rel_input = Input(shape=(n_timesteps, n_features), name='abs')
    lstm_in = LSTM(rel_input.shape[1])(rel_input)

    abs_input = Input(shape=(n_timesteps), name='rel')
    
    lstm_in = LSTM(16)(emb_input)
    dense_in = Dense(abs_input.shape[1])(abs_input)

    concatenated = concatenate([lstm_in, dense_in], axis=-1)
    rv = RepeatVector(Y_train.shape[1])(concatenated)
    lstm_out = LSTM(n_hidden, return_sequences=True)(rv)
    td_wrapped = TimeDistributed(Dense(Y_train.shape[2], activation=params['activation']))(lstm_in)

    model = Model({'rel': rel_input, 'abs': abs_input}, td_wrapped)
    
    opt = SGD(lr=params['lr'], momentum=params['momentum'])
    model.compile(optimizer='rmsprop', loss=params['loss'], metrics=['mape', 'acc'])
    return model

def seq2seq_model(X_train, Y_train, X_test, Y_test, params, verbose=0):

    model = create_model(X_train, Y_train, params)
    
    if verbose != 0:
        model.summary()
    
    out = model.fit(X_train, Y_train,
        validation_data=(X_test, Y_test),
        epochs=params['epochs'],
        verbose=verbose,
        batch_size=params['batch_size'])
    return out, model

def load():
    global model
    model = load_model(MODEL_LOCATION)
                     

In [None]:
#### nowe create model poniej

In [31]:
def create_model(X_train, Y_train, params):
    token_count = X_train['enc'].shape[-1]
    latent_dim = params['hidden_neurons']
    
    enc_input = Input(shape=(None, token_count), name='enc')
    _, state_h, state_c = LSTM(latent_dim, return_state=True)(enc_input)
    encoder_states = [state_h, state_c]

    dec_input = Input(shape=(None, token_count), name='dec')
    decoder_outputs, _, _ = LSTM(latent_dim, return_sequences=True, return_state=True)(dec_input,
                                     initial_state=encoder_states)
    
    decoder_dense = Dense(token_count, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model({'enc': enc_input, 'dec': dec_input}, decoder_outputs)
    
    model.compile(optimizer='rmsprop', loss=params['loss'], metrics=['accuracy'])
    return model

def seq2seq_model(X_train, Y_train, X_test, Y_test, params, verbose=0):

    model = create_model(X_train, Y_train, params)
    
    if verbose != 0:
        model.summary()
    
    out = model.fit(X_train, Y_train,
        validation_data=(X_test, Y_test),
        epochs=params['epochs'],
        verbose=verbose,
        batch_size=params['batch_size'])
    return out, model

def load():
    global model
    model = load_model(MODEL_LOCATION)
                     

In [39]:
params = {
    'activation': ['softmax'],
    'batch_size': [100],
    'epochs': [25],
    'lr': [0.01],
    'momentum': [0.95],
    'hidden_neurons': [400],
    'loss': ['categorical_crossentropy']
}
anyparams = { k: random.choice(v) for k, v in params.items() }
anyparams

{'activation': 'softmax',
 'batch_size': 100,
 'epochs': 25,
 'lr': 0.01,
 'momentum': 0.95,
 'hidden_neurons': 400,
 'loss': 'categorical_crossentropy'}

In [33]:
# dbg train
data = prepare_data(data_filename)
X_train, Y_train, X_test, Y_test = data
fit, model = seq2seq_model(X_train, Y_train, X_test, Y_test, anyparams, verbose=1)

(4078, 16)
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
enc (InputLayer)                [(None, None, 96)]   0                                            
__________________________________________________________________________________________________
dec (InputLayer)                [(None, None, 96)]   0                                            
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, 400), (None, 795200      enc[0][0]                        
__________________________________________________________________________________________________
lstm_3 (LSTM)                   [(None, None, 400),  795200      dec[0][0]                        
                                                                 lstm_2[0][1]    

In [34]:
def test(model, data):
    X_train, Y_train, X_test, Y_test = data

    q = model.predict(X_test)
    return np.mean(np.argmax(q[:, :, :], axis=-1) == np.argmax(Y_test[:, :, :], axis=-1))
test(model, data)

0.19730392156862744

In [35]:
results = model.evaluate(X_test, Y_test, batch_size=128, verbose=0)
print("test loss, test acc:", results[1])

test loss, test acc: 0.1973039209842682


In [36]:
f = data_filename.split('/')[-1].split('.')[0]
f
model.save("saved/xd" + f + ".h5")
with open("saved/xd" + f + '.pickle', 'wb') as f:
    pickle.dump(d, f)

In [37]:
files = []
for opens in [64, 256, 1024]:
    for seqlen in [8, 16, 24]:
        for seqcount in [16, 64, 256]:
            f = f"../../patterns/sequences_{opens}_{seqlen}_{seqcount}.json"
            files.append(f)

In [38]:
for f in files:
    X_train, Y_train, X_test, Y_test = prepare_data(f)
    fit, model = seq2seq_model(X_train, Y_train, X_test, Y_test, anyparams, verbose=0)
    results = model.evaluate(X_test, Y_test, batch_size=128, verbose=0)
    print(f)
    print("acc:", results[1])

FileNotFoundError: [Errno 2] No such file or directory: '../../patterns/sequences_64_8_16.json'

In [425]:
params = {
    'activation': ['softmax'],
    'batch_size': [200],
    'epochs': [20],
    'lr': [0.02],
    'momentum': [0.95],
    'hidden_neurons': [400],
    'loss': ['categorical_crossentropy']
}
anyparams = { k: random.choice(v) for k, v in params.items() }
anyparams

{'activation': 'softmax',
 'batch_size': 200,
 'epochs': 20,
 'lr': 0.02,
 'momentum': 0.95,
 'hidden_neurons': 400,
 'loss': 'categorical_crossentropy'}

In [427]:
def cycle(pattern):
    data = prepare_data(pattern)
    X_train, Y_train, X_test, Y_test = data
    fit, model = seq2seq_model(X_train, Y_train, X_test, Y_test, anyparams, verbose=0)
    
    result = test(model, data)
    
    f = pattern.split('/')[-1].split('.')[0]
    model.save("saved/" + f + ".h5")
    with open("saved/" + f + '.pickle', 'wb') as f:
        pickle.dump(d, f)
    
    print(pattern, result)

In [430]:
st = """
sequences_64_24_64
sequences_256_24_64
""".split()

"""
sequences_64_24_64
sequences_256_24_64


sequences_1024_8_16

sequences_1024_16_64
sequences_1024_24_256

sequences_64_8_16
sequences_64_16_64
sequences_64_24_256

sequences_256_8_16
sequences_256_16_64
sequences_256_24_256"""

patterns = list(map( pattern_filename, st ))


In [431]:
for p in patterns:
    cycle(p)

(1518, 16)
../../patterns/sequences_64_24_64.json 0.3618421052631579
(6126, 16)
../../patterns/sequences_256_24_64.json 0.33931484502446985
