In [1]:
from __future__ import division, unicode_literals, print_function, absolute_import
import numpy as np
import tensorflow as tf
import pandas as pd
from crflayer import CRF
from tensorflow.keras.callbacks import Callback
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn_crfsuite import metrics
import warnings
import time
import os
import func
import prepare_train_with_set as prepare
warnings.filterwarnings('ignore')

# Parameter

In [2]:
# Number of Sets to process; the Set cells below iterate over range(set_total).
set_total = 1
DEBUG = False        # When True, cells and helpers print intermediate values
path_max_len = 30    # Padded length of one path token sequence (model input width)
path_emb_size = 5    # Output dimension of the path Embedding layer

con_max_len = 50    # Padded length of one content token sequence (model input width)
con_emb_size = 5    # Output dimension of the content Embedding layer

EPOCHS = 10000        # Upper bound on training epochs (early-stopping callbacks usually end sooner)
conv_num = 5        # Number of filters in each Conv2D layer
#max_num = 206       # How many nodes should pad
UNTIL_LOSS = 0.001    # Stop training once the monitored loss drops below this value
opt = tf.keras.optimizers.Adam(learning_rate=0.0005) # Optimizer instance passed to model.compile()
NO_IMPROVE = 3     # EarlyStopping patience: epochs without loss improvement before stopping
# Root data directory; every input and output path below is built from it.
current_path = os.path.join(os.path.expanduser("~"), "jupyter", "Sequence_Labeling_Wrapper_Verification", "data")

# GPU limit

In [3]:
def gpu_limit(num):
    '''
    Make one physical GPU visible to TensorFlow and enable memory growth.

    num: index into the list returned by list_physical_devices('GPU').

    Does nothing when no GPU is found. Raises IndexError when GPUs exist but
    `num` is not a valid index into that list.
    '''
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if not gpus:
        return
    # Fail with an explicit message instead of a bare "list index out of range".
    if not -len(gpus) <= num < len(gpus):
        raise IndexError("gpu_limit: GPU index %d is not available, %d physical GPU(s) found" % (num, len(gpus)))
    try:
        # Restrict TensorFlow to the selected GPU only
        tf.config.experimental.set_visible_devices(gpus[num], 'GPU')
        # Memory growth is switched on for every physical GPU, not just the visible one
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)

# Tokenizer

Use a tokenizer to convert words into integer encodings for the embedding layer.

In [4]:
def tokenizer():
    '''
    Create two independent Keras Tokenizer instances with no vocabulary cap.

    Returns (tokenizer_path, tokenizer_content).
    '''
    make_tokenizer = tf.keras.preprocessing.text.Tokenizer
    return make_tokenizer(num_words=None), make_tokenizer(num_words=None)

In [5]:
def process_training(set_total, current_path, tokenizer_path, tokenizer_content, path_len, con_len):
    '''
    Generate the train/test data and turn the training data into model inputs.

    Returns a 7-tuple:
    (test_data, feature_train, word_train, label_train, max_label_train, max_num, col_set_dict)
    where max_num is the larger of the train and test values reported by
    func.load_data_num, and col_set_dict is Set_dict with keys and values swapped.
    '''
    train_data, Set_dict = prepare.train_file_generate(set_total, current_path)
    test_data = prepare.test_file_generate(current_path)

    # With the flag set to True load_data_num returns a pair, with False a single value.
    train_nodes, max_label_train = func.load_data_num(train_data, True)
    test_nodes = func.load_data_num(test_data, False)
    max_num = max(train_nodes, test_nodes)

    # Invert the mapping returned alongside the training data.
    col_set_dict = {value: key for key, value in Set_dict.items()}

    # The fourth element returned by cnn_process_data is not needed here.
    feature_train, word_train, label_train, _ = func.cnn_process_data(
        train_data, tokenizer_path, tokenizer_content, path_len, con_len)

    return test_data, feature_train, word_train, label_train, max_label_train, max_num, col_set_dict

In [6]:
class EarlyStoppingByLossVal(tf.keras.callbacks.Callback):
    '''
    Stop training as soon as the monitored metric drops below a threshold.

    monitor: key looked up in the `logs` dict at the end of every epoch.
    value:   threshold; training stops when logs[monitor] < value.
             The default is bound to UNTIL_LOSS when the class is defined.
    verbose: when > 0, print a message on the epoch that triggers the stop.
    '''
    def __init__(self, monitor='loss', value=UNTIL_LOSS, verbose=0):
        # Name this class in super(): super(Callback, self) starts the lookup
        # *after* Callback in the MRO, so Callback.__init__ would never run.
        super(EarlyStoppingByLossVal, self).__init__()
        self.monitor = monitor
        self.value = value
        self.verbose = verbose

    def on_epoch_end(self, epoch, logs=None):
        # Avoid a shared mutable default and tolerate logs=None.
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            warnings.warn("Early stopping requires %s available!" % self.monitor, RuntimeWarning)
            # Nothing to compare against; comparing None with a float raises TypeError.
            return

        if current < self.value:
            if self.verbose > 0:
                print("Epoch %05d: early stopping THR" % epoch)
            self.model.stop_training = True

# Model

In [7]:
def full_model(max_num, max_label):
    '''
    Build the node classifier that uses the 6 numeric features plus the path
    and content token sequences.

    max_num:   number of nodes per page; inputs are regrouped into pages of
               this size, so the batch fed to the model must be a multiple of it.
    max_label: the softmax layer has max_label+1 units.

    Reads path_word_size / con_word_size and the size parameters
    (path_max_len, path_emb_size, con_max_len, con_emb_size, conv_num) from
    module scope. Returns an uncompiled tf.keras.Model whose inputs are
    [feature, path, content].
    '''
    layers = tf.keras.layers
    path_width = path_max_len*path_emb_size    # flattened path embedding per node
    con_width = con_max_len*con_emb_size       # flattened content embedding per node

    path_input = tf.keras.Input(shape=(path_max_len,), name='Path_emb_input')
    content_input = tf.keras.Input(shape=(con_max_len,), name='Content_emb_input')
    feature_input = tf.keras.Input(shape=(6,), name='Feature_input')

    path_tokens = layers.Embedding(path_word_size+1, path_emb_size)(path_input)
    content_tokens = layers.Embedding(con_word_size+1, con_emb_size)(content_input)

    # Regroup the per-node rows into pages of max_num nodes.
    feature_pages = tf.reshape(feature_input, [-1, max_num, 6])
    path_pages = tf.reshape(path_tokens, [-1, max_num, path_width, 1])
    content_pages = tf.reshape(content_tokens, [-1, max_num, con_width, 1])

    # Kernel covers 3 neighbouring nodes and the whole embedding width; the stride
    # equals that width, so each node yields conv_num values.
    path_conv = layers.Conv2D(conv_num, (3, path_width), (1, path_width), padding='same')(path_pages)
    content_conv = layers.Conv2D(conv_num, (3, con_width), (1, con_width), padding='same')(content_pages)

    path_nodes = tf.reshape(path_conv, [-1, max_num, conv_num])
    content_nodes = tf.reshape(content_conv, [-1, max_num, conv_num])

    merged = layers.concatenate([feature_pages, path_nodes, content_nodes], -1)

    # Back to one row per node for the dense classifier.
    per_node = tf.reshape(merged, [-1, 6 + conv_num*2])
    hidden = layers.Dense(max_label+200, activation='tanh')(per_node)
    probabilities = layers.Dense(max_label+1, activation='softmax')(hidden)

    return tf.keras.Model(inputs=[feature_input, path_input, content_input], outputs=probabilities)

def model_word_only(max_num, max_label):
    '''
    Build the node classifier that uses only the path and content token
    sequences.

    The 6-value feature input is still declared and listed as a model input,
    so the input signature matches full_model, but it is not connected to the
    output.

    max_num:   number of nodes per page; the batch fed to the model must be a
               multiple of it.
    max_label: the softmax layer has max_label+1 units.

    Returns an uncompiled tf.keras.Model whose inputs are [feature, path, content].
    '''
    layers = tf.keras.layers
    path_width = path_max_len*path_emb_size    # flattened path embedding per node
    con_width = con_max_len*con_emb_size       # flattened content embedding per node

    path_input = tf.keras.Input(shape=(path_max_len,), name='Path_emb_input')
    content_input = tf.keras.Input(shape=(con_max_len,), name='Content_emb_input')
    feature_input = tf.keras.Input(shape=(6,), name='Feature_input')

    path_tokens = layers.Embedding(path_word_size+1, path_emb_size)(path_input)
    content_tokens = layers.Embedding(con_word_size+1, con_emb_size)(content_input)

    # Regroup the per-node rows into pages of max_num nodes.
    path_pages = tf.reshape(path_tokens, [-1, max_num, path_width, 1])
    content_pages = tf.reshape(content_tokens, [-1, max_num, con_width, 1])

    # Kernel covers 3 neighbouring nodes and the whole embedding width.
    path_conv = layers.Conv2D(conv_num, (3, path_width), (1, path_width), padding='same')(path_pages)
    content_conv = layers.Conv2D(conv_num, (3, con_width), (1, con_width), padding='same')(content_pages)

    path_nodes = tf.reshape(path_conv, [-1, max_num, conv_num])
    content_nodes = tf.reshape(content_conv, [-1, max_num, conv_num])

    merged = layers.concatenate([path_nodes, content_nodes], -1)

    # Back to one row per node for the dense classifier.
    per_node = tf.reshape(merged, [-1, conv_num*2])
    hidden = layers.Dense(max_label+200, activation='tanh')(per_node)
    probabilities = layers.Dense(max_label+1, activation='softmax')(hidden)

    return tf.keras.Model(inputs=[feature_input, path_input, content_input], outputs=probabilities)

In [8]:
def model_compile(history):
    '''
    Compile the module-level `model` and build the callback list for training.

    history: a callback instance, placed first in the returned list.

    Reads `model`, `opt`, NO_IMPROVE and UNTIL_LOSS from module scope, so
    `model` must be assigned before this is called.

    Returns [history, EarlyStopping(patience=NO_IMPROVE), EarlyStoppingByLossVal(UNTIL_LOSS)].
    '''
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=opt,
        metrics=['accuracy']
    )
    # summary() prints the table itself and returns None; wrapping it in print()
    # added a stray "None" line to the output.
    model.summary()
    stop_when_no_improve = tf.keras.callbacks.EarlyStopping(monitor='loss', mode='min', min_delta=0, patience = NO_IMPROVE, restore_best_weights=True)
    until_loss = EarlyStoppingByLossVal(monitor='loss', value=UNTIL_LOSS, verbose=1)
    callbacks = [history, stop_when_no_improve, until_loss]
    return callbacks

# Function definitions

In [9]:
def train(model, X_train, path, content, y, callbacks):
    '''
    Fit `model` on [X_train, path, content] against `y` and time the call.

    Uses the module-level EPOCHS and BATCH_SIZE.
    Returns (model, seconds spent inside model.fit).
    '''
    began = time.time()
    fit_inputs = [X_train, path, content]
    model.fit(fit_inputs, y, epochs=EPOCHS, callbacks=callbacks, use_multiprocessing=True, batch_size=BATCH_SIZE)
    return model, time.time()-began

In [10]:
def predict(model, feature, path, content):
    '''
    Run model.predict on [feature, path, content] and time the call.

    Uses the module-level VAL_BATCH_SIZE.
    Returns (predictions, seconds spent inside model.predict).
    '''
    began = time.time()
    predict_inputs = [feature, path, content]
    predictions = model.predict(predict_inputs, batch_size=VAL_BATCH_SIZE)
    return predictions, time.time()-began

In [11]:
def save_model(model):
    '''
    Save `model` and the two module-level tokenizers under <current_path>/cnn/data.

    Writes model.h5, tokenizer_path.pickle and tokenizer_content.pickle.
    The target directory must already exist.
    '''
    import pickle
    data_dir = os.path.join(current_path, "cnn", "data")

    def _dump(obj, filename):
        # Pickle one object into the data directory.
        with open(os.path.join(data_dir, filename), "wb") as handle:
            pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

    model.save(os.path.join(data_dir, "model.h5"))
    _dump(tokenizer_path, "tokenizer_path.pickle")
    _dump(tokenizer_content, "tokenizer_content.pickle")

In [12]:
def load_model():
    '''
    Load the saved model and both pickled tokenizers from <current_path>/cnn/data.

    Returns (model, tokenizer_path, tokenizer_content, path_word_size, con_word_size),
    where each word size is len(<tokenizer>.index_docs).
    '''
    import pickle
    data_dir = os.path.join(current_path, "cnn", "data")

    def _unpickle(filename):
        # NOTE: pickle.load executes code embedded in the file; only load files this project wrote.
        with open(os.path.join(data_dir, filename), 'rb') as handle:
            return pickle.load(handle)

    model = tf.keras.models.load_model(os.path.join(data_dir, "model.h5"))
    model.summary()

    tokenizer_path = _unpickle("tokenizer_path.pickle")
    tokenizer_content = _unpickle("tokenizer_content.pickle")

    path_word_size = len(tokenizer_path.index_docs)
    con_word_size = len(tokenizer_content.index_docs)
    return model, tokenizer_path, tokenizer_content, path_word_size, con_word_size

In [15]:
def get_result(predictions, max_num):
    '''
    Turn per-node class scores into per-page lists of predicted labels.

    predictions: sequence of score rows, one row per node, pages laid out
                 back to back with max_num rows each.
    max_num:     number of nodes per page.

    Returns a list with one entry per complete page; each entry is the list of
    argmax labels of that page's max_num rows. Rows beyond the last complete
    page are ignored.
    '''
    page_total = int(len(predictions)/max_num)
    return [
        [np.argmax(predictions[page*max_num + node]) for node in range(max_num)]
        for page in range(page_total)
    ]

In [13]:
if __name__ == "__main__":
    # Run-level settings (these re-assign the values from the parameter cell).
    set_total = 1          # How many Sets
    model_name = "cnn"
    current_path = os.path.join(os.path.expanduser("~"), "jupyter", "Sequence_Labeling_Wrapper_Verification", "data")

    # GPU: make only the physical GPU at index 1 visible
    gpu_limit(1)

    # Tokenizer
    tokenizer_path, tokenizer_content = tokenizer()

    # Process training file
    test_data, X_train, word_train, y_train, max_label_train, max_num, col_set_dict = process_training(
        set_total, current_path, tokenizer_path, tokenizer_content, path_max_len, con_max_len)

    # One batch is exactly one page of max_num nodes; the models reshape their
    # inputs to [-1, max_num, ...], so the batch must be a multiple of max_num.
    BATCH_SIZE = max_num      # batch size
    VAL_BATCH_SIZE = max_num  # Validation batch size
    # full_model() reads these two names from module scope.
    path_word_size = len(tokenizer_path.index_docs)
    con_word_size = len(tokenizer_content.index_docs)
    page_num = int(len(y_train)/max_num)

    # Define model (model_compile() compiles the module-level `model`)
    model = full_model(max_num, max_label_train)
    history = func.LossHistory()
    callables = model_compile(history)

    # Start training
    model, t = train(model, X_train, word_train[0], word_train[1], y_train, callables)

    # Graph
    #history.loss_plot('epoch')

    # Load test feature
    X_test, word_test, y_test, _ = func.cnn_process_data(test_data, tokenizer_path, tokenizer_content, path_max_len, con_max_len)

    # Start testing
    pred, ts = predict(model, X_test, word_test[0], word_test[1])

    # Process output
    result = get_result(pred, max_num)
    col_type = func.get_col_type(current_path)
    Set_data = func.predict_output(set_total, current_path, model_name, col_type, result, max_label_train, col_set_dict)
    # The feature arrays are bound here as X_train / X_test; `feature_train` and
    # `feature_test` were never defined in this scope and raised NameError.
    # NOTE(review): `set_func` is not imported anywhere in the visible notebook,
    # so these two calls still fail until its import is added to the import cell.
    set_train_data, set_train_count = set_func.Set_train_file_generate(set_total, current_path, model_name, X_train, max_num)
    set_test_data, set_test_count = set_func.Set_test_file_generate(set_total, current_path, model_name, Set_data, X_test, max_num)
    page_c = len(result)

    # Process set

    # Process time
    

2 Physical GPUs, 1 Logical GPU
Train Table Opening:/home/rick/jupyter/Sequence_Labeling_Wrapper_Verification/data/data/TableA.txt

Train file find coltype
 --------------------------------------------------------------------------------
Test Table Opening:/home/rick/jupyter/Sequence_Labeling_Wrapper_Verification/data/data/GA.txt

Test file find coltype
 --------------------------------------------------------------------------------
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Path_emb_input (InputLayer)     [(None, 30)]         0                                            
__________________________________________________________________________________________________
Content_emb_input (InputLayer)  [(None, 50)]         0                                            
_____________________________________________________

Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Epoch 61/10000
Epoch 62/10000
Epoch 63/10000
Epoch 64/10000
Epoch 65/10000
Epoch 66/10000
Epoch 67/10000
Epoch 68/10000
Epoch 69/10000
Epoch 70/10000
Epoch 71/10000
Epoch 72/10000
Epoch 73/10000
Epoch 74/10000
Epoch 75/10000
Epoch 76/10000
Epoch 77/10000
Epoch 78/10000
Epoch 79/10000
Epoch 80/10000
Epoch 81/10000
Epoch 82/10000
Epoch 83/10000
Epoch 84/10000
Epoch 85/10000
Epoch 86/10000
Epoch 87/10000
Epoch 88/10000
Epoch 89/10000
Epoch 90/10000
Epoch 91/10000
Epoch 92/10000
Epoch 93/10000
Epoch 94/10000
Epoch 95/10000
Epoch 96/10000
Epoch 97/10000
Epoch 98/10000
Epoch 99/10000
Epoch 100/10000
Epoch 101/10000
Epoch 102/10000
Epoch 1

Epoch 112/10000
Epoch 113/10000
Epoch 114/10000
Epoch 115/10000
Epoch 116/10000
Epoch 117/10000
Epoch 118/10000
Epoch 119/10000
Epoch 120/10000
Epoch 121/10000
Epoch 122/10000
Epoch 123/10000
Epoch 124/10000
Epoch 125/10000
Epoch 126/10000
Epoch 127/10000
Epoch 128/10000
Epoch 129/10000
Epoch 130/10000
Epoch 131/10000
Epoch 132/10000
Epoch 133/10000
Epoch 134/10000
Epoch 135/10000
Epoch 136/10000
Epoch 137/10000
Epoch 138/10000
Epoch 139/10000
Epoch 140/10000
Epoch 141/10000
Epoch 142/10000
Epoch 143/10000
Epoch 144/10000
Epoch 145/10000
Epoch 146/10000
Epoch 147/10000
Epoch 148/10000
Epoch 149/10000
Epoch 150/10000
Epoch 151/10000
Epoch 152/10000
Epoch 153/10000
Epoch 154/10000
Epoch 155/10000
Epoch 156/10000
Epoch 157/10000
Epoch 158/10000
Epoch 159/10000
Epoch 160/10000
int32 int32 int32
(1980, 37)


# Create Set Train File

Generate the training file for the Set model from the DCADE Set table.

In [None]:
# Build one "<Set>_train_raw.csv" per Set from the Set table and record how
# many nodes each page contributed (set_data_count[set][page]).
# NOTE(review): feature_train_1..6, path_train and content_train are only
# assigned in the "Run ALL Set" cell further down, so on a fresh kernel this
# cell raises NameError. Confirm where these arrays are meant to come from.
set_data_count = []
if set_total > 0:
    for set_t in range(set_total):
        set_path = os.path.join(current_path, "data", "Set-" + str(set_t+1) + ".txt")
        with open(set_path, "r") as set_file:
            set_tmp = []
            output_name = os.path.join(current_path, "cnn", "set", "Set-" + str(set_t+1) + "_train_raw.csv")
            if DEBUG:
                print("Generating:" + output_name + "\n")
            # `with` closes the CSV even when a malformed row raises part-way through.
            with open(output_name, "w") as output:
                output.write("Leafnode\tPTypeSet\tTypeSet\tContentid\tPathid\tSimseqid\tPath\tContent\tLabel\n")
                # Skip ahead to the header row whose first field is "ColType".
                line = set_file.readline()
                slot = line.rstrip("\n").split("\t")
                while slot[0] != "ColType":
                    if line == "":
                        # readline() returns "" forever at EOF; without this check the loop never ends.
                        raise ValueError("No ColType row found in " + set_path)
                    line = set_file.readline()
                    slot = line.rstrip("\n").split("\t")
                with open(os.path.join(current_path, "cnn", "set", "Set-"+ str(set_t+1) + "_coltype.txt"), "w") as col_file:
                    col_file.write(str(slot[1:]))
                line = set_file.readline() # First line of data
                page_num = 0
                count = 0
                while line != "":
                    slot = line.rstrip("\n").split("\t")
                    # First field is "<set>-<page>-<index>"
                    data_info = slot[0].split("-")
                    # A change of page number closes the running count of the previous page.
                    if page_num != int(data_info[1]):
                        set_tmp.append(count)
                        count = 0
                    set_num = int(data_info[0])
                    page_num = int(data_info[1])
                    if DEBUG:
                        print(str(data_info[0])+"-"+str(data_info[1])+"-"+str(data_info[2]))
                    idx = 1
                    # Drop empty and single-space cells; the rest are node indices within the page.
                    sub_list = [item for item in slot[1:] if item not in ("", " ")]
                    for element in sub_list:
                        count += 1
                        if DEBUG:
                            print(element)
                        element = int(element)
                        # Row of this node in the flat, page-padded training arrays.
                        row = page_num*max_num+element
                        for feature_col in (feature_train_1, feature_train_2, feature_train_3,
                                            feature_train_4, feature_train_5, feature_train_6):
                            output.write(str(feature_col[row])+"\t")
                        output.write(str(list(path_train[row])))
                        output.write("\t")
                        output.write(str(list(content_train[row])))
                        output.write("\t")
                        # Label is the 1-based position of the node within its row of the Set table.
                        output.write(str(idx) + "\n")
                        if DEBUG:
                            print(feature_train_1[row])
                        idx += 1
                    line = set_file.readline()
                set_tmp.append(count)
        set_data_count.append(set_tmp)

In [None]:
# Persist the per-Set, per-page node counts collected while writing the train CSVs.
if set_total > 0:
    with open(os.path.join(current_path, "cnn", "set", "set_train_count.txt"), "w") as file:
        # Stored as the list's str() form; the Set-loading cell reads it back with eval().
        file.write(str(set_data_count))
        if DEBUG:
            print(set_data_count)

# Create Set Test file

Generate the test file from the node data that the model predicted to be in a Set.

In [None]:
# Write one "<Set>_ytest_raw.csv" per Set from the predicted Set membership and
# record how many nodes each page contributed (set_data_count[set][page]).
set_data_count = []
if set_total > 0:
    for set_t in range(set_total):
        set_tmp = []
        with open(os.path.join(current_path, "cnn", "set", "Set-" + str(set_t+1) + "_ytest_raw.csv"), "w") as set_file:
            set_file.write("Leafnode\tPTypeSet\tTypeSet\tContentid\tPathid\tSimseqid\tPath\tContent\tLabel\n")
            for pages in tqdm(range(len(Set_data))):
                count = 0
                for node in Set_data[pages][set_t]:
                    count += 1
                    # Row of this node in the flat, page-padded test arrays.
                    row = pages*max_num+node
                    for feature_col in (feature_test_1, feature_test_2, feature_test_3,
                                        feature_test_4, feature_test_5, feature_test_6):
                        set_file.write(str(feature_col[row])+"\t")
                    set_file.write(str(list(path_test[row])))
                    set_file.write("\t")
                    set_file.write(str(list(content_test[row])))
                    set_file.write("\t")
                    # Every test row is written with label 0.
                    set_file.write("0\n")
                set_tmp.append(count)
        set_data_count.append(set_tmp)

In [None]:
# Persist the test-side page counts and the two vocabulary sizes for the Set stage.
if set_total > 0:
    with open(os.path.join(current_path, "cnn", "set", "set_test_count.txt"), "w") as file:
        # Stored as the list's str() form; the Set-loading cell reads it back with eval().
        file.write(str(set_data_count))
        if DEBUG:
            print(set_data_count)
    with open(os.path.join(current_path, "cnn", "set", "word_size.txt"), "w") as file:
        # Two lines: path vocabulary size first, then content vocabulary size.
        file.write(str(path_word_size)+"\n")
        file.write(str(con_word_size))

In [None]:
# Number of entries in `result` (get_result() produces one entry per page).
page_c = len(result)

In [None]:
# Evaluate `model` on the training arrays and report loss / accuracy.
# NOTE(review): the input list has eight entries (path, content, six feature
# columns), which matches the per-Set models built in the "Run ALL Set" cell;
# full_model() takes three inputs [feature, path, content]. path_train,
# content_train, feature_train_1..6 and label_train are also only assigned in
# that later cell. Confirm which model this cell is meant to evaluate.
model_loss, model_acc = model.evaluate([path_train, content_train, feature_train_1, feature_train_2, feature_train_3, feature_train_4, feature_train_5, feature_train_6], label_train, batch_size=BATCH_SIZE)
print("\n\nLoss {}, Acc {}".format(model_loss, model_acc))

In [None]:
# Debug aid: show the shape of the label array.
if DEBUG:
    print(label_train.shape)

# Set parameter

In [None]:
# Hyper-parameters for the per-Set models. These re-assign the module-level
# names from the first parameter cell, so every cell run after this one sees
# the values below.
path_max_len = 30    # Padded length of one path token sequence
path_emb_size = 10    # Output dimension of the path Embedding layer

con_max_len = 50    # Padded length of one content token sequence
con_emb_size = 10    # Output dimension of the content Embedding layer

feature_emb_size = 5  # Output dimension of each per-feature Embedding layer

EPOCHS = 5000        # Upper bound on training epochs
conv_num = 20        # Number of filters in each Conv2D layer
UNTIL_LOSS = 0.001    # Stop training once the monitored loss drops below this value
opt = tf.keras.optimizers.Adam(learning_rate=0.001) # Optimizer instance passed to model.compile()
NO_IMPROVE = 50     # EarlyStopping patience: epochs without loss improvement before stopping

# Set function definitions

In [None]:
def max_num_set(set_data_count, set_total):
    '''
    Return, for each Set, the largest per-page node count.

    set_data_count: one list of per-page counts per Set.
    set_total:      number of Sets; the result has this many entries.

    Sets with no entry in set_data_count, or with an empty count list, get 0.
    '''
    max_set = [0] * set_total
    for idx, counts in enumerate(set_data_count):
        # max() raises ValueError on an empty sequence; a Set without pages keeps 0.
        max_set[idx] = max(counts) if len(counts) else 0
    return max_set

def feature_padding_set(df, set_count, set_num):
    '''
    Lay out one feature column page by page, padding every page with 9999 up
    to max_set[set_num-1] entries.

    df:        indexable column, read at consecutive integer keys 0, 1, 2, ...
    set_count: per-Set lists of per-page node counts.
    set_num:   1-based Set number.

    Reads the module-level `max_set`. Returns a flat list.
    '''
    padded = []
    cursor = 0
    for set_len in set_count[set_num-1]:
        indices = range(cursor, cursor + set_len)
        padded.extend(df[i] for i in indices)
        cursor += len(indices)
        # A page already at (or above) the maximum gets no padding.
        padded.extend(9999 for _ in range(max_set[set_num-1] - set_len))
    return padded

def emb_padding_set(df, set_count, set_num, pad_len):
    '''
    Lay out one token-sequence column page by page, padding every page with
    all-zero rows of length pad_len up to max_set[set_num-1] rows.

    df:        indexable column of strings, read at consecutive integer keys;
               each string is evaluated back into a Python object.
    set_count: per-Set lists of per-page node counts.
    set_num:   1-based Set number.

    Reads the module-level `max_set` and `DEBUG`. Returns a list of rows.
    '''
    zero_row = [0] * pad_len
    rows = []
    cursor = 0
    for set_len in set_count[set_num-1]:
        indices = range(cursor, cursor + set_len)
        for i in indices:
            # NOTE(review): eval() executes whatever the cell contains; only feed it
            # CSV files written by this notebook.
            rows.append(eval(df[i]))
        cursor += len(indices)
        # Every padding entry is the same zero_row object.
        rows.extend(zero_row for _ in range(max_set[set_num-1] - set_len))
    if DEBUG:
        print(cursor)
    return rows

def one_of_n(ans, total):
    '''
    One-hot encode `ans` as a list of int(total) floats.

    Position i holds 1.0 when ans == i and 0.0 otherwise, so an `ans` outside
    0..int(total)-1 yields all zeros.
    '''
    return [1.0 if ans == i else 0.0 for i in range(int(total))]

def label_padding_set(df, set_count, set_num):
    '''
    Lay out the label column page by page, padding every page with 0 up to
    max_set[set_num-1] entries.

    df:        indexable column, read at consecutive integer keys 0, 1, 2, ...
    set_count: per-Set lists of per-page node counts.
    set_num:   1-based Set number.

    Reads the module-level `max_set`. Returns a flat list.
    '''
    padded = []
    cursor = 0
    for set_len in set_count[set_num-1]:
        indices = range(cursor, cursor + set_len)
        padded.extend(df[i] for i in indices)
        cursor += len(indices)
        # A page already at (or above) the maximum gets no padding.
        padded.extend(0 for _ in range(max_set[set_num-1] - set_len))
    return padded

def to_train_array_set(df, set_count, set_num):
    '''
    Convert one Set's table into the padded numpy arrays fed to the Set model.

    df:        mapping with the columns Leafnode, PTypeSet, TypeSet, Contentid,
               Pathid, Simseqid, Path, Content and Label.
    set_count: per-Set lists of per-page node counts.
    set_num:   1-based Set number.

    Reads the module-level `max_set`, `path_max_len` and `con_max_len`.
    Returns a 9-tuple: six flat feature arrays (in the column order above),
    then path [rows, path_max_len], content [rows, con_max_len] and
    label [rows, 1], where rows = number of pages * max_set[set_num-1].
    '''
    feature_columns = ('Leafnode', 'PTypeSet', 'TypeSet', 'Contentid', 'Pathid', 'Simseqid')
    features = [
        np.array(feature_padding_set(df[name], set_count, set_num)).flatten()
        for name in feature_columns
    ]

    row_total = len(set_count[set_num-1])*max_set[set_num-1]

    path = np.reshape(
        np.array(emb_padding_set(df['Path'], set_count, set_num, path_max_len)),
        [row_total, path_max_len])
    content = np.reshape(
        np.array(emb_padding_set(df['Content'], set_count, set_num, con_max_len)),
        [row_total, con_max_len])
    label = np.reshape(
        np.array(label_padding_set(df['Label'], set_count, set_num)),
        [row_total, 1])

    return tuple(features) + (path, content, label)

In [None]:
# Reload the intermediate files of the first stage and derive, per Set, the
# page width (max_set) used for padding by the helper functions above.
# NOTE(review): every value here is parsed with eval(), which executes whatever
# the file contains; only run this on files written by this notebook.
if set_total > 0:
    Set_data = []
    set_train_count = []
    set_test_count = []
    with open(os.path.join(current_path, "cnn", "set", "Set_data.txt"), "r") as set_file:
        Set_data = eval(set_file.readline())
    with open(os.path.join(current_path, "cnn", "set", "set_train_count.txt"), "r") as set_file:
        set_train_count = eval(set_file.readline())
    with open(os.path.join(current_path, "cnn", "set", "set_test_count.txt"), "r") as set_file:
        set_test_count = eval(set_file.readline())
    with open(os.path.join(current_path, "cnn", "set", "word_size.txt"), "r") as file:
        # Line 1: path vocabulary size, line 2: content vocabulary size.
        path_word_size = eval(file.readline())
        con_word_size = eval(file.readline())
    # Largest per-page node count of each Set, on the train and the test side.
    max_num_train = max_num_set(set_train_count, set_total)
    max_num_test = max_num_set(set_test_count, set_total)
    # max_set[i] is the larger of the two, so train and test pages share one width.
    max_set = []
    for i in range(len(max_num_train)):
        max_set.append(max(max_num_train[i], max_num_test[i]))

# Run ALL Set

Loop over all the Sets for training and testing.

In [None]:
if set_total > 0:    
    for num in range(set_total):
        set_num = num + 1
        # Load Train file & Test file
        df = get_df(os.path.join(current_path, "cnn", "set", "Set-" + str(set_num) + "_train_raw.csv"))
        max_num = max_set[set_num-1]
        max_label = max(df['Label'])
        BATCH_SIZE = max_num      # Training bath size
        VAL_BATCH_SIZE = max_num  # Validation batch size
        feature_train_1, feature_train_2, feature_train_3, feature_train_4, feature_train_5, feature_train_6, path_train, content_train, label_train = to_train_array_set(df, set_train_count, set_num)
        
        # Design Model
        def get_model():
            '''
            Build the per-Set classifier that uses the path and content token
            sequences plus one embedding per numeric feature.

            Reads max_num, max_label, the vocabulary sizes and the size
            parameters from the enclosing scope. Returns an uncompiled
            tf.keras.Model whose inputs are [path, content, feature 1..6].
            '''
            layers = tf.keras.layers
            path_width = path_max_len*path_emb_size    # flattened path embedding per node
            con_width = con_max_len*con_emb_size       # flattened content embedding per node
            feature_names = ('Feature_input1', 'Feature_input2', 'Feature_input3',
                             'Feature_input4', 'Feature_input5', 'Feature_input6')

            path_input = tf.keras.Input(shape=(path_max_len,), name='Path_emb_input')
            content_input = tf.keras.Input(shape=(con_max_len,), name='Content_emb_input')
            feature_inputs = [tf.keras.Input(shape=(1,), name=name) for name in feature_names]

            if DEBUG:
                print(path_input.shape)

            path_tokens = layers.Embedding(path_word_size+1, path_emb_size)(path_input)
            content_tokens = layers.Embedding(con_word_size+1, con_emb_size)(content_input)

            # One 10000-entry embedding table per feature column.
            feature_embs = [layers.Embedding(10000, feature_emb_size)(inp) for inp in feature_inputs]

            # Regroup the per-node rows into pages of max_num nodes, then add a channel axis.
            path_pages = tf.reshape(path_tokens, [-1, max_num, path_width])
            content_pages = tf.reshape(content_tokens, [-1, max_num, con_width])

            path_pages = tf.expand_dims(path_pages, -1)
            content_pages = tf.expand_dims(content_pages, -1)

            # Kernel covers 3 neighbouring nodes and the whole embedding width.
            path_feature = layers.Conv2D(conv_num, kernel_size=(3,  path_width), strides=(1, path_width), name='Conv_for_Path_emb', padding='same')(path_pages)
            content_feature = layers.Conv2D(conv_num, kernel_size=(3, con_width), strides=(1, con_width), name='Conv_for_Content_emb', padding='same')(content_pages)

            # Back to one row per node.
            path_vec = tf.reshape(path_feature, [-1, conv_num])
            content_vec = tf.reshape(content_feature, [-1, conv_num])
            feature_vecs = [tf.reshape(emb, [-1, feature_emb_size]) for emb in feature_embs]

            combine = layers.concatenate([path_vec, content_vec] + feature_vecs, -1)
            hidden = layers.Dense(max_label+200, activation='tanh')(combine)
            output = layers.Dense(max_label+1, activation='softmax')(hidden)

            return tf.keras.Model(inputs=[path_input, content_input] + feature_inputs, outputs=output)
        
        def model_word_only():
            '''
            Build the per-Set classifier that uses only the path and content
            token sequences.

            The six feature inputs are still declared and listed as model
            inputs, so the input signature matches get_model, but they are not
            connected to the output.

            NOTE: this re-binds the name of the module-level
            model_word_only(max_num, max_label) defined earlier in the notebook.

            Reads max_num, max_label, the vocabulary sizes and the size
            parameters from the enclosing scope. Returns an uncompiled
            tf.keras.Model whose inputs are [path, content, feature 1..6].
            '''
            layers = tf.keras.layers
            path_width = path_max_len*path_emb_size    # flattened path embedding per node
            con_width = con_max_len*con_emb_size       # flattened content embedding per node
            feature_names = ('Feature_input1', 'Feature_input2', 'Feature_input3',
                             'Feature_input4', 'Feature_input5', 'Feature_input6')

            path_input = tf.keras.Input(shape=(path_max_len,), name='Path_emb_input')
            content_input = tf.keras.Input(shape=(con_max_len,), name='Content_emb_input')
            feature_inputs = [tf.keras.Input(shape=(1,), name=name) for name in feature_names]

            if DEBUG:
                print(path_input.shape)

            path_tokens = layers.Embedding(path_word_size+1, path_emb_size)(path_input)
            content_tokens = layers.Embedding(con_word_size+1, con_emb_size)(content_input)

            # Regroup the per-node rows into pages of max_num nodes, then add a channel axis.
            path_pages = tf.reshape(path_tokens, [-1, max_num, path_width])
            content_pages = tf.reshape(content_tokens, [-1, max_num, con_width])

            path_pages = tf.expand_dims(path_pages, -1)
            content_pages = tf.expand_dims(content_pages, -1)

            # Kernel covers 3 neighbouring nodes and the whole embedding width.
            path_feature = layers.Conv2D(conv_num, kernel_size=(3,  path_width), strides=(1, path_width), name='Conv_for_Path_emb', padding='same')(path_pages)
            content_feature = layers.Conv2D(conv_num, kernel_size=(3, con_width), strides=(1, con_width), name='Conv_for_Content_emb', padding='same')(content_pages)

            # Back to one row per node.
            path_vec = tf.reshape(path_feature, [-1, conv_num])
            content_vec = tf.reshape(content_feature, [-1, conv_num])

            combine = layers.concatenate([path_vec, content_vec], -1)
            hidden = layers.Dense(max_label+200, activation='tanh')(combine)
            output = layers.Dense(max_label+1, activation='softmax')(hidden)

            return tf.keras.Model(inputs=[path_input, content_input] + feature_inputs, outputs=output)
        
        # Model
        model = model_word_only()
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=opt,
            metrics=['accuracy']
        )
        print(model.summary())
        history = LossHistory()
        stop_when_no_improve = tf.keras.callbacks.EarlyStopping(monitor='loss', mode='min', min_delta=0, patience = NO_IMPROVE, restore_best_weights=True)
        until_loss = EarlyStoppingByLossVal(monitor='loss', value=UNTIL_LOSS, verbose=1)
        callbacks = [history, stop_when_no_improve, until_loss]
        
        # Train
        start = time.time()
        model.fit([path_train, content_train, feature_train_1, feature_train_2, feature_train_3, feature_train_4, feature_train_5, feature_train_6], label_train, epochs=EPOCHS, callbacks=callbacks, use_multiprocessing=True, batch_size=BATCH_SIZE)
        t += time.time()-start
        
        # Save model
        model.save(os.path.join(current_path, "cnn", "set", "set-" + str(set_num) + "_model.h5"))
        del model
        
        #Load model
        model = tf.keras.models.load_model(os.path.join(current_path, "cnn", "set", "set-" + str(set_num) + "_model.h5"))
        '''model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=opt,
            metrics=['accuracy']
        )'''
        #model.load_weights("./cnn/set/set-"+str(set_num)+"_cnn-mlp.h5")
        
        # Load Test file
        # Read the raw test CSV of the current set and convert it into the six
        # feature arrays, the path/content inputs and the labels, in the same
        # order the model inputs are listed in the predict() call below.
        df = get_df(os.path.join(current_path, "cnn", "set", "Set-" + str(set_num) + "_ytest_raw.csv"))
        feature_test_1, feature_test_2, feature_test_3, feature_test_4, feature_test_5, feature_test_6, path_test, content_test, label_test = to_train_array_set(df, set_test_count, set_num)
        
        # Load word size
        # The first two lines of word_size.txt are evaluated as Python expressions.
        # NOTE(review): eval() executes whatever the file contains; if the lines
        # are plain numeric literals, int() or ast.literal_eval() would be the
        # safer choice - confirm the file format before changing.
        # Neither value is read again within this part of the cell.
        with open(os.path.join(current_path, "cnn", "set", "word_size.txt"), "r") as file:
            path_word_size = eval(file.readline())
            con_word_size = eval(file.readline())
            
        # Prediction
        # Only the predict() call is timed; the elapsed wall-clock seconds are
        # added to the running total `ts`.
        ts_start = time.time()
        predictions = model.predict([path_test, content_test, feature_test_1, feature_test_2, feature_test_3, feature_test_4, feature_test_5, feature_test_6], batch_size=VAL_BATCH_SIZE)
        ts += time.time()-ts_start
        
        # Get result
        # Fold the flat prediction rows back into pages: every test page owns
        # `max_num` consecutive rows, and each row is reduced to the index of
        # its highest-scoring class.
        page_total = len(set_test_count[set_num-1])
        result = [
            [np.argmax(predictions[page_idx * max_num + node_idx]) for node_idx in range(max_num)]
            for page_idx in range(page_total)
        ]
        # Number of prediction rows consumed, as the original running counter ended up.
        count = page_total * max_num
        
        # Read Col
        # Load the column types of the current set: the first line of
        # Set-<n>_coltype.txt is evaluated as a Python expression and the result
        # becomes `col_type`, which the output step iterates to write the header.
        # NOTE(review): eval() executes whatever the file contains; if the line
        # is a plain list literal, ast.literal_eval() would be the safer choice -
        # confirm the file format before changing.
        col_type = []
        with open(os.path.join(current_path, "cnn", "set", "Set-" + str(set_num) + "_coltype.txt"), "r") as file:
            tmp = file.readline()
            slot = eval(tmp)
            col_type = slot
            
        # Output
        # Write the predictions of the current set as a tab-separated file: one
        # header line with the column types, then, page after page, rows that
        # line up the nodes predicted for each label. Rows are emitted for a
        # page until every column from 1 upward is exhausted (always at least
        # one row per page); column 0 is consumed but never written.
        Set = []
        with open(os.path.join(current_path, "cnn", "set", "set-" + str(set_num) + ".csv"), "w") as file: # Create prediction file
            for col in col_type: # loop to write the Col type
                file.write(col + "\t")
                if DEBUG:
                    print(col + "\t", end='')
            if DEBUG:
                print("")
            file.write("\n")
            current_pos = 1  # kept as in the original; not read in this section
            for page in tqdm(range(len(result))): # Loop each page
                # Set[page][label] -> node indices of that page predicted as `label`,
                # in node order.
                p_tmp = [
                    [node for node, r in enumerate(result[page]) if r == cols]
                    for cols in range(max_label+1)
                ]
                Set.append(p_tmp)
            # Work on an independent copy: list.copy() is shallow, so consuming
            # entries from a shallow copy would also empty the lists kept in `Set`.
            Set_tmp = [[list(nodes) for nodes in page_cols] for page_cols in Set]
            for page in range(len(Set_tmp)):
                # col[c] turns True once column c has nothing left to emit.
                # Column 0 starts out True, so it never keeps a page open.
                col = [False] * len(Set_tmp[page])
                col[0] = True
                empty = False
                while(not empty):
                    for cols in range(len(Set_tmp[page])):
                        if len(Set_tmp[page][cols]) == 0:
                            col[cols] = True
                            if cols != 0:
                                # Exhausted column: keep the cell, leave it blank
                                if DEBUG:
                                    print("\t", end="")
                                file.write("\t")
                        else:
                            # Value stored in feature_test_1 at the node's flat index
                            n = str(int(feature_test_1[page*max_num+Set_tmp[page][cols][0]]))
                            if cols != 0:
                                if DEBUG:
                                    print(n+"\t", end="")
                                file.write(n+"\t")
                            del Set_tmp[page][cols][0]
                            if len(Set_tmp[page][cols]) == 0:
                                col[cols] = True
                    # Checking once per finished row is enough: only the state
                    # after the last column decides whether another row follows.
                    empty = all(col)
                    if DEBUG:
                        print("\n", end="")
                    file.write("\n")

In [None]:
# Report the accumulated timings and persist them to time_cnn.txt:
# total training time `t`, total prediction time `ts`, and prediction time
# divided by `page_c` (defined earlier in the notebook).
# Guard the division so an empty run still produces a complete report
# instead of aborting half-way through.
per_page = float(ts)/page_c if page_c else float("nan")

print("\ntrain time:"+str(t))
print("test time:"+str(ts))
print("per page:"+ str(per_page)+"\n")

# The context manager closes the file even if one of the writes fails.
with open(os.path.join(current_path, "cnn", "data", "time_cnn.txt"),"w") as timef:
    timef.write("train:"+str(t)+"\n")
    timef.write("test:"+str(ts)+"\n")
    timef.write("per page:"+ str(per_page)+"\n")