In [1]:
from sys import argv
import os
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"

import numpy as np
import json
from collections import defaultdict
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed, Concatenate, Embedding, Dropout
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.models import model_from_json
from tensorflow.keras.utils import to_categorical
import random
import pickle
from pandas import read_csv, DataFrame
from sklearn.model_selection import train_test_split
print("GPUs Available: ", tf.config.list_physical_devices('GPU'))
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)
tf.debugging.set_log_device_placement(True)

GPUs Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [5]:
KB = 1024

In [7]:
dfc = read_csv('df_clusters')
offset = dfc.offset.to_numpy()

In [8]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5)
clusters = kmeans.fit_predict(dfc[['offset']])
offsets = dfc.offset

In [9]:
seq_length = 32
samples = len(dfc)
x_c = np.zeros((samples)).astype(dtype=np.int32)
x_o = np.zeros((samples, seq_length)).astype(dtype=np.int32)
y_o = np.zeros((samples)).astype(dtype=np.int32)

for s in range(100): # range(samples):
    idx = random.randint(0, len(dfc)-seq_length-1)

    last = offsets[idx+seq_length-1]
    nxt = offsets[idx+seq_length]
    diff = nxt - last + 7

    if diff >= 0.0 and diff < 16:
        y_o[s] = diff
    else:
        y_o[s] = 16
    x_o[s] = offsets[idx:idx+seq_length]
    x_c[s] = clusters[idx]

In [10]:
cluster_input = keras.Input(shape=(1,), name="cluster")

offs_input = keras.Input(shape=(seq_length,), name="offset")
offs_emb = Embedding(x_o.max()+1, 16)(offs_input)
offs_ft = LSTM(128)(offs_emb)

merged = Concatenate()([cluster_input, offs_ft])
drop = Dropout(0.2, input_shape=(None, 160))(merged)

offs_output = Dense(1, name="out_offset")(drop)

model = keras.Model(
    inputs=[cluster_input, offs_input],
    outputs=[offs_output],
)

model.compile(loss='mse', optimizer='adam')

In [19]:
with tf.device('/GPU:0'):
    history = model.fit(
        {"cluster": x_c, "offset": x_o},
        {"out_offset": y_o},
        epochs=24,
        batch_size=1024,
        validation_split=0.1,
        verbose=1,
    )

Epoch 1/24
Epoch 2/24
Epoch 3/24
Epoch 4/24
Epoch 5/24
Epoch 6/24
Epoch 7/24
Epoch 8/24
Epoch 9/24
Epoch 10/24
Epoch 11/24
Epoch 12/24
Epoch 13/24
Epoch 14/24
Epoch 15/24
Epoch 16/24
Epoch 17/24
Epoch 18/24
Epoch 19/24
Epoch 20/24
Epoch 21/24
Epoch 22/24
Epoch 23/24
Epoch 24/24


# 1 lstm + embedding adresów

In [10]:
dff = df.copy()
dff.filename = df.filename.map(lambda x: int(x.split('/')[-1]))
filenames = dff.filename.to_numpy().astype(dtype=np.int32)
offsets = dff.offset.to_numpy().astype(dtype=np.int32)

In [11]:
samples = len(df)
files = 50
seq_length = 32

x_f = np.zeros((samples, seq_length)).astype(dtype=np.int32)
x_o = np.zeros((samples, seq_length)).astype(dtype=np.int32)
y_f = np.zeros((samples)).astype(dtype=np.int32)
y_o = np.zeros((samples)).astype(dtype=np.int32)

ok = np.zeros((samples)).astype(dtype=np.int32)

for s in range(samples):
    idx = random.randint(0, len(dff)-seq_length-1)

    last = offsets[idx+seq_length-1]
    nxt = offsets[idx+seq_length]
    diff = nxt - last + 7

    y_o[s] = diff
    x_f[s] = filenames[idx:idx+seq_length] -1
    y_f[s] = filenames[idx+seq_length] -1
    x_o[s] = offsets[idx:idx+seq_length]

In [12]:
samples = x_o.shape[0]
input_shape = (samples, seq_length, 2)
output_shape = (samples, 2)

In [13]:
file_input = keras.Input(shape=(seq_length,), name="file")
file_emb = Embedding(50, 4)(file_input)
file_ft = LSTM(16)(file_emb)

offs_input = keras.Input(shape=(seq_length,), name="offset")
offs_emb = Embedding(1024, 64)(offs_input)
offs_ft = LSTM(64)(offs_emb)

merged = Concatenate()([file_ft, offs_ft])

# lstm = LSTM(300, dropout=0.3, recurrent_dropout=0.3)(merged)

file_output = Dense(50, name="out_file", activation='softmax')(merged)
offs_output = Dense(512, name="out_offset", activation='softmax')(merged)

model = keras.Model(
    inputs=[file_input, offs_input],
    outputs=[file_output, offs_output],
)

model.compile(loss='mse', optimizer='adam')



In [14]:
history = model.fit(
    {"file": x_f, "offset": x_o},
    {"out_file": y_f, "out_offset": y_o},
    epochs=24,
    batch_size=64,
    validation_split=0.1,
    verbose=1,
)

Epoch 1/24
Epoch 2/24
Epoch 3/24
Epoch 4/24
Epoch 5/24
Epoch 6/24
Epoch 7/24
Epoch 8/24
Epoch 9/24
Epoch 10/24
Epoch 11/24
Epoch 12/24
Epoch 13/24
Epoch 14/24
Epoch 15/24
Epoch 16/24
Epoch 17/24
Epoch 18/24
Epoch 19/24
Epoch 20/24
Epoch 21/24
Epoch 22/24
Epoch 23/24
Epoch 24/24


In [569]:
x_f[0,:].shape

(32,)

In [570]:
f, o = model.predict({"file": x_f[0:, :], "offset": x_o[0:, :]})


In [571]:
sum(np.around(o[:, -1]) == y_o) / y_o.size

0.1498035130912362

In [572]:
sum(np.around(f[:, -1]) == y_f) / y_f.size

0.9703612518346669