In [1]:
from __future__ import division, unicode_literals, print_function, absolute_import
import numpy as np
import tensorflow as tf
import pandas as pd
from crflayer import CRF
from tensorflow.keras.callbacks import Callback
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn_crfsuite import metrics
import warnings
import time
warnings.filterwarnings('ignore')

# Parameter

In [2]:
set_total = 0        # Number of Set tables; the Set-specific cells only run when this is > 0
OOM_Split = 1        # Each page of max_num nodes is reshaped into OOM_Split chunks of max_num/OOM_Split nodes
BATCH_SIZE = 1      # Training batch size
VAL_BATCH_SIZE = 1  # Batch size passed to model.predict
# (BATCH_SIZE is also the batch size used for model.evaluate on the training data)

DEBUG = False        # When True, the notebook prints intermediate shapes and values
path_max_len = 30    # Path token sequences are padded/truncated to this length
path_emb_size = 5    # Embedding size for Path tokens

con_max_len = 50    # Content token sequences are padded/truncated to this length
con_emb_size = 5    # Embedding size for Content tokens

feature_emb_size = 3  # Embedding size for each of the six id features

EPOCHS = 10000        # Maximum number of training epochs (callbacks normally stop earlier)
conv_num = 5        # Number of filters in the Path/Content convolution layers
#max_num = 206       # How many nodes should pad
UNTIL_LOSS = 0.001    # Training stops once the monitored loss drops below this value
opt = tf.keras.optimizers.Adam(learning_rate=0.004) # Optimizer and learning rate
NO_IMPROVE = 2     # EarlyStopping patience: epochs without loss improvement before stopping

# GPU limit

In [3]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to a single GPU. The second device (index 1) is preferred;
  # on a machine with only one GPU fall back to it instead of raising IndexError,
  # which the RuntimeError handler below would not catch.
  try:
    target_gpu = gpus[1] if len(gpus) > 1 else gpus[0]
    tf.config.experimental.set_visible_devices(target_gpu, 'GPU')
    # Allocate GPU memory on demand instead of reserving it all up front.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
  except RuntimeError as e:
    # Visible devices must be set before GPUs have been initialized
    print(e)

2 Physical GPUs, 1 Logical GPU


# Tokenizer

Use Keras tokenizers to convert the words in the Path and Content columns into integer encodings for the embedding layers.

In [4]:
# One tokenizer per text column; num_words=None keeps every word seen during fitting.
tokenizer_path = tf.keras.preprocessing.text.Tokenizer(num_words=None)
tokenizer_content = tf.keras.preprocessing.text.Tokenizer(num_words=None)

# Function define

In [5]:
def node_num(data):
    '''
    Derive per-page node counts from a flat sequence of leaf-node indices.

    A new page starts wherever the index falls back to 0 (except at the very
    first position), so the size of the page that just ended is the previous
    index plus one.

    Returns:
        num_list: number of nodes of each page, in order.
        index_list: position in `data` of the last node of each page
            (running sum of num_list minus one).
    '''
    total = len(data)
    # Every 0 after the first position closes the page that precedes it.
    num_list = [data[pos - 1] + 1 for pos in range(1, total) if data[pos] == 0]
    # The final page is closed by the end of the data.
    num_list.append(data[total - 1] + 1)

    index_list = []
    last_position = -1
    for page_size in num_list:
        last_position = last_position + page_size
        index_list.append(last_position)
    return num_list, index_list


def node_data(data, num, pad_to=None, pad_value=99999):
    '''
    Split a flat per-node feature sequence into pages and pad each page.

    Args:
        data: flat, position-indexable sequence of feature values for all pages.
        num: number of nodes of each page, in the same order as `data`.
        pad_to: target number of nodes per page. Defaults to the notebook-level
            `max_num` so existing two-argument calls behave as before.
        pad_value: value appended to pages with fewer than `pad_to` nodes
            (99999 by default, not zero).

    Returns:
        A list with one list per page. Pages that already have `pad_to` or
        more nodes are returned unpadded.
    '''
    if pad_to is None:
        pad_to = max_num
    output = []
    cursor = 0
    for page_num in num:
        page_len = int(page_num)
        page_values = [data[cursor + offset] for offset in range(page_len)]
        cursor += page_len
        # range() of a non-positive number is empty, so full pages get no padding.
        page_values.extend(pad_value for _ in range(int(pad_to) - page_len))
        output.append(page_values)
    return output

def label_padding(data, num, pad_to=None):
    '''
    Split a flat per-node label sequence into pages and pad each page with 0.

    Args:
        data: flat, position-indexable sequence of labels for all pages.
        num: number of nodes of each page, in the same order as `data`.
        pad_to: target number of nodes per page. Defaults to the notebook-level
            `max_num` so existing two-argument calls behave as before.

    Returns:
        A list with one list of labels per page; pages shorter than `pad_to`
        are filled up with label 0.
    '''
    if pad_to is None:
        pad_to = max_num
    output = []
    cursor = 0
    for page_num in num:
        page_len = int(page_num)
        page_labels = [data[cursor + offset] for offset in range(page_len)]
        cursor += page_len
        # Pad label with 0; full pages get no padding because the range is empty.
        page_labels.extend(0 for _ in range(int(pad_to) - page_len))
        output.append(page_labels)
    return output


def node_emb(data, num, pad_len, pad_to=None):
    '''
    Split flat per-node token sequences into pages and pad each page.

    Args:
        data: flat, position-indexable sequence with one token sequence per node.
        num: number of nodes of each page, in the same order as `data`.
        pad_len: length of the all-zero row appended for each missing node.
        pad_to: target number of nodes per page. Defaults to the notebook-level
            `max_num` so existing three-argument calls behave as before.

    Returns:
        A list with one list of rows per page; pages shorter than `pad_to`
        are filled up with rows of `pad_len` zeros (0.0).
    '''
    if pad_to is None:
        pad_to = max_num
    # A single zero row is reused for every padded position.
    zero_row = [0.0] * pad_len
    output = []
    cursor = 0
    for page_num in num:
        page_len = int(page_num)
        page_rows = [data[cursor + offset] for offset in range(page_len)]
        cursor += page_len
        page_rows.extend(zero_row for _ in range(int(pad_to) - page_len))
        output.append(page_rows)
    return output

def get_df(path):
    '''
    Load a tab-separated file from `path` into a pandas DataFrame.
    '''
    return pd.read_csv(path, sep='\t')
    

def load_data_csv(df):
    '''
    Convert a raw page/node dataframe into padded numpy arrays for the model.

    The dataframe must provide the columns Path, Content, Leafnode, PTypeSet,
    TypeSet, Contentid, Pathid, Simseqid and Label. The function relies on the
    notebook-level tokenizers (already fitted), `max_num`, `max_label`,
    `OOM_Split`, `path_max_len` and `con_max_len`.

    Side effect: df['Content'] is overwritten in place with the cleaned,
    string-typed text.

    Returns:
        Six feature arrays of shape (-1, max_num/OOM_Split), the Path array
        (-1, max_num/OOM_Split, path_max_len), the Content array
        (-1, max_num/OOM_Split, con_max_len), the one-hot label array
        (-1, max_num/OOM_Split, max_label+1) and the largest label value
        found in df['Label'].
    '''
    path_encoded = tokenizer_path.texts_to_sequences(df['Path'])
    # The pattern is a regular expression. regex=True must be explicit because
    # pandas >= 2.0 treats the pattern as a literal string by default, which
    # would silently leave the punctuation in place.
    df['Content'] = df['Content'].str.replace(r'/|\.|\?|:|=|,|<|>|&|@|\+|-|#|~|\|', ' ', regex=True)
    df['Content'] = df['Content'].astype(str)
    content_encoded = tokenizer_content.texts_to_sequences(df['Content'])
    path_pad = tf.keras.preprocessing.sequence.pad_sequences(path_encoded, path_max_len, padding='post')
    content_pad = tf.keras.preprocessing.sequence.pad_sequences(content_encoded, con_max_len, padding='post')
    if DEBUG:
        print(path_pad.shape)
        print(content_pad.shape)

    # Group the flat node rows into pages and pad every page to max_num nodes.
    num, index = node_num(df['Leafnode'])
    path = np.array(node_emb(path_pad, num, path_max_len))
    content = np.array(node_emb(content_pad, num, con_max_len))
    if DEBUG:
        print(path.shape)
        print(content.shape)

    feature_columns = ['Leafnode', 'PTypeSet', 'TypeSet', 'Contentid', 'Pathid', 'Simseqid']
    features = [np.array(node_data(df[column], num)) for column in feature_columns]

    label_array = np.array(label_padding(df['Label'], num))
    m_label = df['Label'].max()

    # One-hot encode the labels page by page; class 0 is the empty/padding label.
    page_count = len(label_array)
    class_count = max_label + 1
    class_ids = np.arange(class_count)
    label = []
    for pages in tqdm(range(page_count)): # Loop each page
        page_labels = np.asarray(label_array[pages])
        label.append((page_labels[:, None] == class_ids[None, :]).astype(float))
    label = np.reshape(np.array(label), [page_count, max_num, class_count])
    path_arr = np.reshape(path, [page_count, max_num, path_max_len])
    content_arr = np.reshape(content, [page_count, max_num, con_max_len])

    # OOM part: cut every page into OOM_Split chunks of equal length.
    chunk_len = int(max_num/OOM_Split)
    features = [np.reshape(feature, [-1, chunk_len]) for feature in features]
    label = np.reshape(label, [-1, chunk_len, class_count])
    path_arr = np.reshape(path_arr, [-1, chunk_len, path_max_len])
    content_arr = np.reshape(content_arr, [-1, chunk_len, con_max_len])

    return tuple(features) + (path_arr, content_arr, label, m_label)


def load_data_num(path, istrain):
    '''
    Read a tab-separated data file and report its largest page size.

    Returns (max nodes per page, max label) when `istrain` is true,
    otherwise only the max nodes per page.
    '''
    frame = pd.read_csv(path, sep='\t')
    nodes_per_page, _ = node_num(frame['Leafnode'])
    if istrain:
        top_label = frame['Label'].max()
        return max(nodes_per_page), top_label
    return max(nodes_per_page)


class LossHistory(tf.keras.callbacks.Callback):
    '''
    Record loss/accuracy per batch and per epoch and plot them afterwards.

    Each of `losses`, `accuracy`, `val_loss` and `val_acc` is a dict with the
    keys 'batch' and 'epoch'; a value is None when Keras did not report that
    metric (e.g. validation metrics when no validation data is used).
    '''
    def on_train_begin(self, logs=None):
        self.losses = {'batch':[], 'epoch':[]}
        self.accuracy = {'batch':[], 'epoch':[]}
        self.val_loss = {'batch':[], 'epoch':[]}
        self.val_acc = {'batch':[], 'epoch':[]}

    def _record(self, kind, logs):
        # Append the metrics of one batch/epoch; `logs` may be None.
        logs = logs or {}
        self.losses[kind].append(logs.get('loss'))
        self.accuracy[kind].append(logs.get('accuracy'))
        self.val_loss[kind].append(logs.get('val_loss'))
        self.val_acc[kind].append(logs.get('val_accuracy'))

    def on_batch_end(self, batch, logs=None):
        self._record('batch', logs)

    def on_epoch_end(self, batch, logs=None):
        self._record('epoch', logs)

    @staticmethod
    def _plot_series(iters, values, style, label):
        # Skip curves for which no value was ever recorded.
        if any(value is not None for value in values):
            plt.plot(iters, values, style, label=label)

    def loss_plot(self, loss_type):
        '''
        Draw accuracy (figure 1) and loss (figure 2) curves.

        Args:
            loss_type: 'batch' or 'epoch', selecting which history to plot.
        '''
        iters = range(len(self.losses[loss_type]))
        plt.figure(1)
        self._plot_series(iters, self.accuracy[loss_type], 'r', 'train acc')
        self._plot_series(iters, self.val_acc[loss_type], 'b', 'val acc')
        plt.grid(True)
        plt.xlabel(loss_type)
        plt.ylabel('acc')
        # The labels above are only shown once a legend is drawn.
        plt.legend(loc='best')

        plt.figure(2)
        self._plot_series(iters, self.losses[loss_type], 'r', 'train loss')
        self._plot_series(iters, self.val_loss[loss_type], 'b', 'val loss')
        plt.grid(True)
        plt.xlabel(loss_type)
        plt.ylabel('loss')
        plt.legend(loc='best')

        plt.show()

# Train until loss Callback

In [6]:
class EarlyStoppingByLossVal(tf.keras.callbacks.Callback):
    '''
    Stop training as soon as the monitored metric drops below a threshold.

    Args:
        monitor: key of the metric in the epoch logs (default 'loss').
        value: threshold; training stops when the metric is lower than this.
        verbose: when > 0, print a message on stopping.
    '''
    def __init__(self, monitor='loss', value=UNTIL_LOSS, verbose=0):
        # super(Callback, self) would skip Callback.__init__ and leave the
        # base-class attributes uninitialised.
        super(EarlyStoppingByLossVal, self).__init__()
        self.monitor = monitor
        self.value = value
        self.verbose = verbose

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            warnings.warn("Early stopping requires %s available!" % self.monitor, RuntimeWarning)
            # Nothing to compare against; comparing None with a float would raise.
            return

        if current < self.value:
            if self.verbose > 0:
                print("Epoch %05d: early stopping THR" % epoch)
            self.model.stop_training = True

# Check max_num in train

Read the training and test files to find the largest number of nodes on any page; every page is later padded to this length.

In [7]:
# Largest page size (and largest label, for the training file) in each data file.
max_num_train, max_label_train = load_data_num("./data/train_raw.csv", True)
max_num_test = load_data_num("./data/ytest_raw.csv", False)
# Pad every page to the larger of the two so train and test share one shape.
max_num = max(max_num_train, max_num_test)
if max_num%OOM_Split != 0: # Round max_num up to the next multiple of OOM_Split so pages split evenly.
    max_num += OOM_Split - max_num%OOM_Split
if DEBUG:
    print(max_num_train)
    print(max_num_test)
    print(max_num)

# Load Set index File

Open the `Set_idx.txt` file, produced by the training-file generation step, to find out which labels in the training file are Sets.

In [8]:
# col_set_dict is the inverse of the mapping stored in Set_idx.txt (values become keys).
# It stays empty when there are no Sets (set_total == 0).
col_set_dict={}
if set_total > 0:
    Set_dict = {}
    with open("./data/Set_idx.txt", "r") as set_file:
        # NOTE(review): eval() executes whatever the file contains; only use with
        # trusted files. If the line is always a plain dict literal,
        # ast.literal_eval would be a safer replacement - confirm the file format.
        Set_dict = eval(set_file.readline())
    col_set_dict = dict(map(reversed, Set_dict.items()))
    if DEBUG:
        print(Set_dict)

# Load Train File

Read the training file and fit the tokenizers on the Path and Content columns to obtain the word encodings.

In [9]:
# max_label is read as a global by load_data_csv and get_model; it fixes the
# number of output classes (max_label + 1, class 0 being the padding label).
max_label = max_label_train
df = get_df("./data/train_raw.csv")
# Fit the vocabularies on the training text only.
tokenizer_path.fit_on_texts(df['Path'])
tokenizer_content.fit_on_texts(df['Content'].astype(str))
# One index_docs entry per word index seen while fitting; used to size the embedding layers.
path_word_size = len(tokenizer_path.index_docs)
con_word_size = len(tokenizer_content.index_docs)
# out_train receives the largest label value found in the training dataframe.
feature_train_1, feature_train_2, feature_train_3, feature_train_4, feature_train_5, feature_train_6, path_train, content_train, label_train, out_train = load_data_csv(df)
# The same CRF layer instance supplies the model output, the loss and the accuracy metric.
crf = CRF(False)

100%|██████████| 30/30 [00:00<00:00, 27372.01it/s]


In [10]:
# Debug output: training array shapes and vocabulary sizes.
if DEBUG:
    print(feature_train_1.shape)
    print(label_train.shape)
    print(path_word_size)
    print(con_word_size)

# Design Model

In [11]:
def get_model():
    '''
    Model definition for our experiments using tensorflow keras.

    Builds a node-labelling model with eight inputs (Path tokens, Content
    tokens and six id features), all covering max_num/OOM_Split nodes per
    sample. Path/Content tokens are embedded and convolved, the id features
    are embedded, everything is concatenated per node, projected to
    max_label+1 scores and passed through the module-level `crf` layer.

    Relies on notebook-level globals (max_num, OOM_Split, max_label,
    path_word_size, con_word_size, the *_emb_size / *_max_len settings,
    conv_num and crf).

    Returns:
        An uncompiled tf.keras.Model whose output is reshaped to
        (-1, max_num/OOM_Split, max_label+1).
    '''
    path_input = tf.keras.Input(shape=(int(max_num/OOM_Split), path_max_len), name='Path_emb_input')
    content_input = tf.keras.Input(shape=(int(max_num/OOM_Split), con_max_len), name='Content_emb_input')
    feature_input_1 = tf.keras.Input(shape=(int(max_num/OOM_Split),), name='Feature_input1')
    feature_input_2 = tf.keras.Input(shape=(int(max_num/OOM_Split),), name='Feature_input2')
    feature_input_3 = tf.keras.Input(shape=(int(max_num/OOM_Split),), name='Feature_input3')
    feature_input_4 = tf.keras.Input(shape=(int(max_num/OOM_Split),), name='Feature_input4')
    feature_input_5 = tf.keras.Input(shape=(int(max_num/OOM_Split),), name='Feature_input5')
    feature_input_6 = tf.keras.Input(shape=(int(max_num/OOM_Split),), name='Feature_input6')
    
    # Flatten (nodes, tokens) into one long token sequence per sample for the embedding lookup.
    path_f = tf.keras.layers.Flatten()(path_input)
    content_f = tf.keras.layers.Flatten()(content_input)
    
    # +1 on the vocabulary size leaves room for index 0, which is used as padding.
    path_emb = tf.keras.layers.Embedding(path_word_size+1, path_emb_size)(path_f)
    content_emb = tf.keras.layers.Embedding(con_word_size+1, con_emb_size)(content_f)
    # 100000 entries cover the 99999 padding value written by node_data.
    f_1_emb = tf.keras.layers.Embedding(100000, feature_emb_size)(feature_input_1)
    f_2_emb = tf.keras.layers.Embedding(100000, feature_emb_size)(feature_input_2)
    f_3_emb = tf.keras.layers.Embedding(100000, feature_emb_size)(feature_input_3)
    f_4_emb = tf.keras.layers.Embedding(100000, feature_emb_size)(feature_input_4)
    f_5_emb = tf.keras.layers.Embedding(100000, feature_emb_size)(feature_input_5)
    f_6_emb = tf.keras.layers.Embedding(100000, feature_emb_size)(feature_input_6)
    
    # Regroup the embedded tokens so each node is one row holding all of its token embeddings.
    path_emb = tf.reshape(path_emb, [-1, int(max_num/OOM_Split), path_max_len*path_emb_size])
    content_emb = tf.reshape(content_emb, [-1, int(max_num/OOM_Split), con_max_len*con_emb_size])
    
    # Add a trailing channel axis as expected by Conv2D.
    path_emb = tf.expand_dims(path_emb, -1)
    content_emb = tf.expand_dims(content_emb, -1)
    
    # The kernel spans 3 nodes and the full row width, and the horizontal stride equals
    # the row width, so each node is reduced to conv_num features.
    path_feature = tf.keras.layers.Conv2D(conv_num, kernel_size=(3,  path_max_len*path_emb_size), strides=(1, path_max_len*path_emb_size), name='Conv_for_Path_emb', padding='same')(path_emb)
    content_feature = tf.keras.layers.Conv2D(conv_num, kernel_size=(3, con_max_len*con_emb_size), strides=(1, con_max_len*con_emb_size), name='Conv_for_Content_emb', padding='same')(content_emb)
    
    # From here on every row is a single node (batch and node axes are merged).
    path = tf.reshape(path_feature, [-1, conv_num])
    content = tf.reshape(content_feature, [-1, conv_num])
    
    f_1_emb = tf.reshape(f_1_emb, [-1, feature_emb_size])
    f_2_emb = tf.reshape(f_2_emb, [-1, feature_emb_size])
    f_3_emb = tf.reshape(f_3_emb, [-1, feature_emb_size])
    f_4_emb = tf.reshape(f_4_emb, [-1, feature_emb_size])
    f_5_emb = tf.reshape(f_5_emb, [-1, feature_emb_size])
    f_6_emb = tf.reshape(f_6_emb, [-1, feature_emb_size])

    combine = tf.keras.layers.concatenate([path, content, f_1_emb, f_2_emb, f_3_emb, f_4_emb, f_5_emb, f_6_emb], -1)
    d = combine
    # Per-node class scores, then restore the (sample, node, class) layout for the CRF.
    d = tf.keras.layers.Dense(max_label+1)(d)
    d = tf.reshape(d, [-1, int(max_num/OOM_Split), max_label+1])
    output = crf(d)
    output = tf.reshape(output, [-1, int(max_num/OOM_Split), max_label+1])
    model = tf.keras.Model(inputs=[path_input, content_input, feature_input_1, feature_input_2, feature_input_3, feature_input_4, feature_input_5, feature_input_6], outputs=output)

    return model

# Model

Compile model with parameters (loss, optimizer and metrics), and set up the early stop callbacks.

In [12]:
model = get_model()
model.compile(
    loss=crf.loss,
    optimizer=opt,
    metrics=[crf.accuracy]
)
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()
history = LossHistory()
# Stop when the training loss has not improved for NO_IMPROVE epochs ...
stop_when_no_improve = tf.keras.callbacks.EarlyStopping(monitor='loss', mode='min', min_delta=0, patience = NO_IMPROVE, restore_best_weights=True)
# ... or as soon as it falls below UNTIL_LOSS.
until_loss = EarlyStoppingByLossVal(monitor='loss', value=UNTIL_LOSS, verbose=1)
callbacks = [history, stop_when_no_improve, until_loss]
t = 0  # training time in seconds, filled in by the training cell

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Path_emb_input (InputLayer)     [(None, 10, 30)]     0                                            
__________________________________________________________________________________________________
Content_emb_input (InputLayer)  [(None, 10, 50)]     0                                            
__________________________________________________________________________________________________
flatten (Flatten)               (None, 300)          0           Path_emb_input[0][0]             
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 500)          0           Content_emb_input[0][0]          
______________________________________________________________________________________________

# Train

Start training and recording the time consumption.

In [13]:
# Train on the full training set and measure wall-clock training time in seconds.
start = time.time()
# NOTE(review): use_multiprocessing is documented for generator / Sequence inputs; with
# in-memory numpy arrays it presumably has no effect - confirm before relying on it.
model.fit([path_train, content_train, feature_train_1, feature_train_2, feature_train_3, feature_train_4, feature_train_5, feature_train_6], label_train, epochs=EPOCHS, callbacks=callbacks, use_multiprocessing=True, batch_size=BATCH_SIZE)
t = time.time()-start

Train on 30 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000


# Graph

Show the training accuracy-epochs and loss-epochs graph. 

In [14]:
#history.loss_plot('epoch')

# Save Model

Save tokenizer data and model for future use.

In [15]:
import pickle
#model.save_weights("./crf/data/cnn-crf.h5")
# Persist both fitted tokenizers so the same word encodings can be restored later.
with open("./crf/data/tokenizer_path.pickle", "wb") as handle:
    pickle.dump(tokenizer_path, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open("./crf/data/tokenizer_content.pickle", "wb") as handle:
    pickle.dump(tokenizer_content, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Only the weights are saved; the architecture is rebuilt with get_model() when loading.
model.save_weights("./crf/data/model.h5")
# Drop the in-memory model; the next cell reloads it from disk.
del model

# Load Model

Load model and tokenizer back from file.

In [16]:
import pickle
# The string literal below is a disabled attempt to load the full model; it is never executed.
"""model = tf.keras.models.load_model("./crf/data/model.h5", custom_objects={'CRF':CRF, 'loss':crf.loss, 'metrics':[crf.accuracy], 'optimizer':opt})
model.summary()"""
# Rebuild the architecture, compile it as for training, then restore the saved weights.
model = get_model()
model.compile(
    loss=crf.loss,
    optimizer=opt,
    metrics=[crf.accuracy]
)
model.load_weights("./crf/data/model.h5")
# Restore the fitted tokenizers.
# NOTE(review): pickle.load can execute arbitrary code; only load pickles written by this notebook.
with open('./crf/data/tokenizer_path.pickle', 'rb') as handle:
    tokenizer_path = pickle.load(handle)
with open('./crf/data/tokenizer_content.pickle', 'rb') as handle:
    tokenizer_content = pickle.load(handle)

# Load Test file

In [17]:
# Load the test file with the same preprocessing as the training data.
df = get_df("./data/ytest_raw.csv")
# a (one-hot labels) and b (largest label in the file) are returned by load_data_csv but not used afterwards.
feature_test_1, feature_test_2, feature_test_3, feature_test_4, feature_test_5, feature_test_6, path_test, content_test, a, b = load_data_csv(df)

100%|██████████| 30/30 [00:00<00:00, 22869.71it/s]


In [18]:
# Debug output: test array shapes and the vocabulary sizes computed so far.
if DEBUG:
    print(max_num)
    print(feature_test_1.shape)
    print(path_test.shape)
    print(path_word_size)
    print(con_word_size)
# Recompute the vocabulary sizes from the tokenizers that were reloaded from disk.
path_word_size = len(tokenizer_path.index_docs)
con_word_size = len(tokenizer_content.index_docs)

# Prediction

Predict on test file and record the testing time.

In [19]:
# Predict on the test set and measure wall-clock prediction time in seconds.
ts_start = time.time()
predictions = model.predict([path_test, content_test, feature_test_1, feature_test_2, feature_test_3, feature_test_4, feature_test_5, feature_test_6], batch_size=VAL_BATCH_SIZE)
ts = time.time()-ts_start
# Merge the OOM_Split chunks back so each row is one full page of max_num nodes.
predictions = np.reshape(predictions, [-1, max_num, max_label+1])

In [20]:
# Debug output: first test node's inputs and the shape of the prediction array.
if DEBUG:
    print(feature_test_1[0][0])
    print(path_test[0][0])
    print(content_test[0][0])
    print(predictions.shape)

# Output & Turn predict back to label

Pick maximum argument label as prediction and save in result list.

In [21]:
# result[page][node] is the index of the highest-scoring label for that node.
result = [
    [np.argmax(predictions[page_idx][node_idx]) for node_idx in range(max_num)]
    for page_idx in range(predictions.shape[0])
]

# Read Column Type

Read the column types from the `ColType` row of `TableA.txt`; they become the header row of the prediction file.

In [22]:
col_type = []
with open("./data/TableA.txt", "r") as file:
    # Scan for the row whose first tab-separated field is "ColType".
    # Iterating over the file ends at EOF, whereas polling readline() in a
    # while loop would spin forever if the row were missing.
    for line in file:
        slot = line.rstrip("\n").split("\t")
        if slot[0] == "ColType":
            col_type = slot[1:]
            break
    else:
        raise ValueError("No 'ColType' row found in ./data/TableA.txt")
if DEBUG:
    print(col_type)

# File prediction output

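Write the predictions to `./crf/data/predictions.csv`: a header row with the column types, then one row per page with one tab-separated field per label.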

In [23]:
# Set_data[page] collects, for every Set label, the list of node indices predicted as that label.
Set_data = []
with open("./crf/data/predictions.csv", "w") as file: # Create prediction file
    for col in col_type: # loop to write the Col type
        file.write(col + "\t")
        if DEBUG:
            print(col + "\t", end='')
    file.write("\n")
    for page in tqdm(range(predictions.shape[0])): # Loop each page
        sets = []
        # NOTE(review): label_train.shape[2] is max_label+1, so this range also visits
        # label max_label+1, which argmax can never produce; it adds one empty
        # trailing field per row - confirm this is intended.
        for label in range(label_train.shape[2] + 1): # Loop whole label
            if DEBUG:
                print("Label: " + str(label))
            if label == 0:
                # Label 0 is the padding/empty label and gets no field.
                continue
            empty = True
            isset = False
            data = []
            for node in range(predictions.shape[1]):
                if result[page][node] == label:
                    # Separate multiple node indices of a non-Set field with a space.
                    if empty == False and not isset:
                        if DEBUG:
                            print(" ", end='')
                        file.write(" ")
                    empty = False
                    if label in col_set_dict.keys() and set_total > 0: # That col is a Set
                        # Set members are collected, not written; the field gets a Set reference below.
                        isset = True
                        data.append(node)
                        if DEBUG:
                            print("Append:" + str(node))
                    else:
                        if DEBUG:
                            print(str(node), end='')
                        file.write(str(node))
            if label in col_set_dict.keys() and set_total > 0: # That col is a Set
                # Write "<set id>-<page>" as the field value for a Set column.
                if DEBUG:
                    print(str(col_set_dict[label])+"-"+str(page), end='')
                file.write(str(col_set_dict[label])+"-"+str(page))
                sets.append(data)
            if DEBUG:
                print("\t", end='')
            file.write("\t")
        if DEBUG:
            print("")
        file.write("\n")
        if DEBUG:
            # Prints the node list of the last label processed for this page.
            print(data)
        Set_data.append(sets)
print("Done!")

100%|██████████| 30/30 [00:00<00:00, 47411.12it/s]

Done!





In [24]:
# Evaluate the reloaded model on the training data (not the test data) as a sanity check.
model_loss, model_acc = model.evaluate([path_train, content_train, feature_train_1, feature_train_2, feature_train_3, feature_train_4, feature_train_5, feature_train_6], label_train, batch_size=BATCH_SIZE)
print("\n\nLoss {}, Acc {}".format(model_loss, model_acc))



Loss 0.0008725484212239583, Acc 1.0


In [25]:
# Debug output: the Set lookup built from Set_idx.txt (empty when set_total == 0).
if DEBUG:
    print(col_set_dict)

# Create Set data output for test

Write out the nodes that the model predicted as belonging to each Set.

In [26]:
# Store the predicted Set members as the text representation of the nested list.
if set_total > 0:
    with open("./crf/set/Set_data.txt", "w") as set_train_file:
        serialized_sets = str(Set_data)
        set_train_file.write(serialized_sets)
        if DEBUG:
            print(serialized_sets)

# Create Set Train File

Generate train file for Set Model from DCADE Set Table.

In [27]:
if DEBUG:
    print(feature_train_1.shape)

# Undo the OOM_Split chunking: afterwards every row of the training arrays
# is one full page of max_num nodes.
(feature_train_1, feature_train_2, feature_train_3,
 feature_train_4, feature_train_5, feature_train_6) = [
    np.reshape(chunked, [-1, max_num])
    for chunked in (feature_train_1, feature_train_2, feature_train_3,
                    feature_train_4, feature_train_5, feature_train_6)
]
label_train = np.reshape(label_train, [-1, max_num, max_label+1])
path_train = np.reshape(path_train, [-1, max_num, path_max_len])
content_train = np.reshape(content_train, [-1, max_num, con_max_len])

In [28]:
# set_data_count[s] lists, per page, how many rows were written for Set s+1.
set_data_count = []
if set_total > 0:
    # The six per-node feature matrices, in the column order of the header row.
    train_features = (feature_train_1, feature_train_2, feature_train_3,
                      feature_train_4, feature_train_5, feature_train_6)
    for set_t in range(set_total):
        with open("./data/Set-"+ str(set_t+1) +".txt", "r") as set_file:
            set_tmp = []
            output_name = "./set/Set-"+ str(set_t+1) +"_train_raw.csv"
            if DEBUG:
                print("Generating:" + output_name + "\n")
            # A context manager closes the output file even if a row fails to parse.
            with open(output_name, "w") as output:
                output.write("Leafnode\tPTypeSet\tTypeSet\tContentid\tPathid\tSimseqid\tPath\tContent\tLabel\n")
                # Skip ahead to the "ColType" row; fail loudly at EOF instead of looping forever.
                line = set_file.readline()
                slot = line.rstrip("\n").split("\t")
                while(slot[0]!="ColType"):
                    line = set_file.readline()
                    if line == "":
                        raise ValueError("No 'ColType' row found in ./data/Set-"+ str(set_t+1) +".txt")
                    slot = line.rstrip("\n").split("\t")
                with open("./crf/set/Set-"+ str(set_t+1) +"_coltype.txt", "w") as col_file:
                    col_file.write(str(slot[1:]))
                line = set_file.readline() # First line of data
                page_num = 0
                count = 0
                while(line != ""):
                    slot = line.rstrip("\n").split("\t")
                    # The first field is "<set>-<page>-<row>"; only the page number is needed here.
                    data_info = slot[0].split("-")
                    if(page_num != int(data_info[1])):
                        # A new page starts: store the row count of the previous page.
                        set_tmp.append(count)
                        count = 0
                    page_num = int(data_info[1])
                    if DEBUG:
                        print(str(data_info[0])+"-"+str(data_info[1])+"-"+str(data_info[2]))
                    idx = 1
                    # Remaining fields are node indices; drop empty and blank cells.
                    sub_list = [cell for cell in slot[1:] if cell != "" and cell != " "]
                    for element in sub_list:
                        count += 1
                        if DEBUG:
                            print(element)
                        element = int(element)
                        for feature in train_features:
                            output.write(str(feature[page_num][element])+"\t")
                        output.write(str(list(path_train[page_num][element])))
                        output.write("\t")
                        output.write(str(list(content_train[page_num][element])))
                        output.write("\t")
                        # The label is the 1-based column position within the Set row.
                        output.write(str(idx) + "\n")
                        if DEBUG:
                            print(feature_train_1[page_num][element])
                        idx += 1
                    line = set_file.readline()
                set_tmp.append(count)
        set_data_count.append(set_tmp)

In [29]:
# Save the per-page row counts of the Set training files; they are read back with eval() later.
if set_total > 0:
    with open("./crf/set/set_train_count.txt", "w") as file:
        file.write(str(set_data_count))

# Create Set Test file

Generate the Set test files from the nodes that the model predicted as belonging to each Set.

In [30]:
# set_data_count[s] lists, per page, how many rows were written for Set s+1.
set_data_count = []
if set_total > 0:
    # Set_data holds (page, node) indices of the predictions made on the TEST
    # pages, so the rows must come from the test arrays, not the training ones.
    # Reshape to full pages of max_num nodes to match the prediction layout.
    test_features = [
        np.reshape(feature, [-1, max_num])
        for feature in (feature_test_1, feature_test_2, feature_test_3,
                        feature_test_4, feature_test_5, feature_test_6)
    ]
    test_paths = np.reshape(path_test, [-1, max_num, path_max_len])
    test_contents = np.reshape(content_test, [-1, max_num, con_max_len])
    for set_t in range(set_total):
        set_tmp = []
        with open("./crf/set/Set-"+ str(set_t+1) +"_ytest_raw.csv", "w") as set_file:
            set_file.write("Leafnode\tPTypeSet\tTypeSet\tContentid\tPathid\tSimseqid\tPath\tContent\tLabel\n")
            co = 0  # total rows written for this Set (debug only)
            for pages in tqdm(range(len(Set_data))):
                count = 0
                for node in Set_data[pages][set_t]:
                    co += 1
                    count += 1
                    for feature in test_features:
                        set_file.write(str(feature[pages][node]))
                        set_file.write("\t")
                    set_file.write(str(list(test_paths[pages][node])))
                    set_file.write("\t")
                    set_file.write(str(list(test_contents[pages][node])))
                    set_file.write("\t")
                    # The true label is unknown at test time; 0 is written as a placeholder.
                    set_file.write(str(0) + "\n")
                set_tmp.append(count)
            if DEBUG:
                print(co)
        set_data_count.append(set_tmp)

In [31]:
# Debug output: per-page row counts of the Set test files.
if DEBUG:
    print(set_data_count)

In [32]:
# Save the per-page row counts of the Set test files and the two vocabulary sizes
# (path size on the first line, content size on the second).
if set_total > 0:
    with open("./crf/set/set_test_count.txt", "w") as file:
        file.write(str(set_data_count))
    with open("./crf/set/word_size.txt", "w") as file:
        file.write(str(path_word_size)+"\n")
        file.write(str(con_word_size))

In [33]:
# Number of pages for which predictions were produced.
page_c = len(result)

# Set Parameter

In [34]:
# Hyper-parameters for the Set model; these overwrite the values set at the top of the notebook.
path_max_len = 30    # Path token sequences are padded to this length
path_emb_size = 10    # Embedding size for Path tokens

con_max_len = 50    # Content token sequences are padded to this length
con_emb_size = 10    # Embedding size for Content tokens

feature_emb_size = 5  # Embedding size for each id feature

EPOCHS = 10000        # Maximum number of training epochs
conv_num = 20        # Number of filters in the convolution layers
UNTIL_LOSS = 0.01    # Training stops once the loss drops below this value
opt = tf.keras.optimizers.Adam(learning_rate=0.001) # Optimizer and learning rate
NO_IMPROVE = 50     # Epochs without loss improvement before stopping

# Function define

In [35]:
def max_num_set(set_data_count, set_total):
    '''
    Return, for each Set, the largest per-page row count.

    The result has `set_total` entries; positions without a matching entry
    in `set_data_count` stay 0.
    '''
    max_set = [0] * set_total
    for position, page_counts in enumerate(set_data_count):
        max_set[position] = max(page_counts)
    return max_set

def feature_padding_set(df, set_count, set_num):
    '''
    Group a flat feature column into pages for Set `set_num` (1-based) and
    pad every page with 9999 up to max_set[set_num-1] entries.

    `set_count[set_num-1]` holds the number of rows of each page.
    '''
    feature = []
    cursor = 0
    for set_len in set_count[set_num-1]:
        page_values = [df[cursor + offset] for offset in range(set_len)]
        cursor += set_len
        # An empty range leaves pages of full length unpadded.
        page_values.extend(9999 for _ in range(max_set[set_num-1] - set_len))
        feature.append(page_values)
    return feature

def emb_padding_set(df, set_count, set_num, pad_len):
    '''
    Parse the stored token lists of Set `set_num` (1-based) and pad every
    page with all-zero rows up to max_set[set_num-1] rows.

    Returns a flat list of rows (pages are not nested); the caller reshapes it.
    '''
    zero_row = [0 for _ in range(pad_len)]
    emb = []
    cursor = 0
    for set_len in set_count[set_num-1]:
        for _ in range(set_len):
            # NOTE(review): eval() executes the cell text; the column is expected to hold
            # list representations written by this notebook - do not use on untrusted files.
            emb.append(eval(df[cursor]))
            cursor += 1
        # The same zero row object is reused for every padded position.
        emb.extend(zero_row for _ in range(max_set[set_num-1] - set_len))
    return emb

def one_of_n(ans, total):
    '''
    One-hot encode `ans` as a list of int(total) floats: 1.0 at position
    `ans`, 0.0 elsewhere (all zeros when `ans` is out of range).
    '''
    return [1.0 if ans == position else 0.0 for position in range(int(total))]

def label_padding_set(df, set_count, set_num):
    '''
    One-hot encode the labels of Set `set_num` (1-based) and pad every page
    with the one-hot vector of label 0 up to max_set[set_num-1] rows.

    Returns a flat list of one-hot rows; the caller reshapes it.
    '''
    class_count = max_label+1
    empty_label = one_of_n(0, class_count)
    label = []
    cursor = 0
    for set_len in set_count[set_num-1]:
        for _ in range(set_len):
            label.append(one_of_n(df[cursor], max_label+1))
            cursor += 1
        # The same padding vector object is reused for every padded position.
        label.extend(empty_label for _ in range(max_set[set_num-1] - set_len))
    return label

def to_train_array_set(df, set_count, set_num):
    """Build the padded NumPy inputs and one-hot labels for one set.

    Args:
        df: table with the columns 'Leafnode', 'PTypeSet', 'TypeSet',
            'Contentid', 'Pathid', 'Simseqid', 'Path', 'Content' and 'Label'.
        set_count: per-set lists holding the number of nodes of each page.
        set_num: 1-based set number.

    Returns:
        A 9-tuple: the six padded feature arrays (in the column order listed
        above), then ``path``, ``content`` and ``label``, each reshaped to
        ``[pages, max_set[set_num-1], width]`` with widths ``path_max_len``,
        ``con_max_len`` and ``int(max_label+1)`` respectively.
    """
    scalar_columns = ['Leafnode', 'PTypeSet', 'TypeSet', 'Contentid', 'Pathid', 'Simseqid']
    scalar_features = []
    for column in scalar_columns:
        scalar_features.append(np.array(feature_padding_set(df[column], set_count, set_num)))

    def as_page_tensor(flat_rows, width):
        # Flat list of per-node rows -> [pages, nodes per page, width].
        return np.reshape(np.array(flat_rows), [len(set_count[set_num-1]), max_set[set_num-1], width])

    path = as_page_tensor(emb_padding_set(df['Path'], set_count, set_num, path_max_len), path_max_len)
    content = as_page_tensor(emb_padding_set(df['Content'], set_count, set_num, con_max_len), con_max_len)
    label = as_page_tensor(label_padding_set(df['Label'], set_count, set_num), int(max_label+1))

    return tuple(scalar_features) + (path, content, label)

# Read Set data

In [36]:
# Load the per-set bookkeeping files. The whole cell is skipped when set_total is 0.
if set_total > 0:
    Set_data = []
    set_train_count = []
    set_test_count = []
    # NOTE(review): the first line of each file is parsed with eval(), which
    # executes arbitrary expressions -- only safe if these files are trusted.
    with open("./crf/set/Set_data.txt", "r") as set_file:
        Set_data = eval(set_file.readline())
    with open("./crf/set/set_train_count.txt", "r") as set_file:
        set_train_count = eval(set_file.readline())
    with open("./crf/set/set_test_count.txt", "r") as set_file:
        set_test_count = eval(set_file.readline())
    # First line -> path_word_size, second line -> con_word_size.
    with open("./crf/set/word_size.txt", "r") as file:
        path_word_size = eval(file.readline())
        con_word_size = eval(file.readline())
    # Largest count per set, computed separately for the train and test counts.
    max_num_train = max_num_set(set_train_count, set_total)
    max_num_test = max_num_set(set_test_count, set_total)
    # max_set[i] is the larger of the train and test maxima of set i; the
    # *_padding_set helpers read it as a global to decide how far to pad.
    max_set = []
    for i in range(len(max_num_train)):
        max_set.append(max(max_num_train[i], max_num_test[i]))

# Run ALL

Loop over all the sets; for each one, train a model, run it on the test data, and write the predictions to a file.

In [37]:
# For every set: build the padded training arrays, train a CNN + CRF model,
# predict on the set's test data and write the predictions to a tab-separated file.
# NOTE(review): this cell uses names it does not define itself (get_df, LossHistory,
# EarlyStoppingByLossVal, max_set, set_train_count, set_test_count, path_word_size,
# con_word_size) and updates `t` and `ts` with `+=`, so they must already exist.
if set_total > 0:
    for num in range(set_total):
        set_num = num + 1  # sets are numbered from 1 in file names
        # NOTE(review): the training CSV is read from "./set/" while every other
        # path in this cell is under "./crf/set/" -- confirm this is intended.
        df = get_df("./set/Set-"+str(set_num)+"_train_raw.csv")
        max_num = max_set[set_num-1]  # padded number of nodes per page for this set
        # Largest label value in the training data; max_label+1 is used as the number of classes.
        max_label = max(df['Label'])
        feature_train_1, feature_train_2, feature_train_3, feature_train_4, feature_train_5, feature_train_6, path_train, content_train, label_train = to_train_array_set(df, set_train_count, set_num)
        # NOTE(review): meaning of the `False` argument is defined by crflayer.CRF -- not visible here.
        crf = CRF(False)
        BATCH_SIZE = max_num      # Training batch size
        VAL_BATCH_SIZE = max_num  # Validation batch size
        
        def get_model():
            """Build the Keras model for the current set.

            Reads max_num, max_label, crf and the size constants from the
            enclosing scope. Inputs are the path and content token matrices
            (max_num x path_max_len / con_max_len) and six per-node feature
            vectors of length max_num. Output is the CRF layer applied to
            per-node scores of width max_label+1.
            """
            path_input = tf.keras.Input(shape=(max_num, path_max_len), name='Path_emb_input')
            content_input = tf.keras.Input(shape=(max_num, con_max_len), name='Content_emb_input')
            feature_input_1 = tf.keras.Input(shape=(max_num,), name='Feature_input1')
            feature_input_2 = tf.keras.Input(shape=(max_num,), name='Feature_input2')
            feature_input_3 = tf.keras.Input(shape=(max_num,), name='Feature_input3')
            feature_input_4 = tf.keras.Input(shape=(max_num,), name='Feature_input4')
            feature_input_5 = tf.keras.Input(shape=(max_num,), name='Feature_input5')
            feature_input_6 = tf.keras.Input(shape=(max_num,), name='Feature_input6')

            # Flatten the token matrices so every token id is embedded individually.
            path_f = tf.keras.layers.Flatten()(path_input)
            content_f = tf.keras.layers.Flatten()(content_input)

            path_emb = tf.keras.layers.Embedding(path_word_size+1, path_emb_size)(path_f)
            content_emb = tf.keras.layers.Embedding(con_word_size+1, con_emb_size)(content_f)
            # Each of the six features gets its own embedding table with 100000 rows.
            f_1_emb = tf.keras.layers.Embedding(100000, feature_emb_size)(feature_input_1)
            f_2_emb = tf.keras.layers.Embedding(100000, feature_emb_size)(feature_input_2)
            f_3_emb = tf.keras.layers.Embedding(100000, feature_emb_size)(feature_input_3)
            f_4_emb = tf.keras.layers.Embedding(100000, feature_emb_size)(feature_input_4)
            f_5_emb = tf.keras.layers.Embedding(100000, feature_emb_size)(feature_input_5)
            f_6_emb = tf.keras.layers.Embedding(100000, feature_emb_size)(feature_input_6)

            # Regroup the token embeddings so each node has one row holding all of its tokens.
            path_emb = tf.reshape(path_emb, [-1, max_num, path_max_len*path_emb_size])
            content_emb = tf.reshape(content_emb, [-1, max_num, con_max_len*con_emb_size])

            # Add a trailing channel axis for Conv2D.
            path_emb = tf.expand_dims(path_emb, -1)
            content_emb = tf.expand_dims(content_emb, -1)

            # Kernel spans 3 nodes and the full row width; the stride along the
            # width equals the row width.
            path_feature = tf.keras.layers.Conv2D(conv_num, kernel_size=(3,  path_max_len*path_emb_size), strides=(1, path_max_len*path_emb_size), name='Conv_for_Path_emb', padding='same')(path_emb)
            content_feature = tf.keras.layers.Conv2D(conv_num, kernel_size=(3, con_max_len*con_emb_size), strides=(1, con_max_len*con_emb_size), name='Conv_for_Content_emb', padding='same')(content_emb)

            # Collapse to rows of conv_num values (presumably one row per node -- TODO confirm).
            path = tf.reshape(path_feature, [-1, conv_num])
            content = tf.reshape(content_feature, [-1, conv_num])

            f_1_emb = tf.reshape(f_1_emb, [-1, feature_emb_size])
            f_2_emb = tf.reshape(f_2_emb, [-1, feature_emb_size])
            f_3_emb = tf.reshape(f_3_emb, [-1, feature_emb_size])
            f_4_emb = tf.reshape(f_4_emb, [-1, feature_emb_size])
            f_5_emb = tf.reshape(f_5_emb, [-1, feature_emb_size])
            f_6_emb = tf.reshape(f_6_emb, [-1, feature_emb_size])

            combine = tf.keras.layers.concatenate([path, content, f_1_emb, f_2_emb, f_3_emb, f_4_emb, f_5_emb, f_6_emb], -1)
            d = combine
            # Per-row class scores, regrouped into pages before the CRF layer.
            d = tf.keras.layers.Dense(max_label+1)(d)
            d = tf.reshape(d, [-1, max_num, max_label+1])
            output = crf(d)
            model = tf.keras.Model(inputs=[path_input, content_input, feature_input_1, feature_input_2, feature_input_3, feature_input_4, feature_input_5, feature_input_6], outputs=output)

            return model
        
        model = get_model()
        # NOTE(review): the single module-level `opt` instance is reused for every
        # set's model -- confirm the optimizer supports being shared across models.
        model.compile(
            loss=crf.loss,
            optimizer=opt,
            metrics=[crf.accuracy]
        )
        history = LossHistory()
        # Stop after NO_IMPROVE epochs without a lower training loss and restore the best weights.
        stop_when_no_improve = tf.keras.callbacks.EarlyStopping(monitor='loss', mode='min', min_delta=0, patience = NO_IMPROVE, restore_best_weights=True)
        until_loss = EarlyStoppingByLossVal(monitor='loss', value=UNTIL_LOSS, verbose=1)
        callbacks = [history, stop_when_no_improve, until_loss]
        
        # Accumulate wall-clock training time across sets into `t`.
        start = time.time()
        model.fit([path_train, content_train, feature_train_1, feature_train_2, feature_train_3, feature_train_4, feature_train_5, feature_train_6], label_train, epochs=EPOCHS, callbacks=callbacks, use_multiprocessing=True, batch_size=BATCH_SIZE)
        t += time.time()-start
        
        # Loss and accuracy are measured on the training data itself.
        model_loss, model_acc = model.evaluate([path_train, content_train, feature_train_1, feature_train_2, feature_train_3, feature_train_4, feature_train_5, feature_train_6], label_train, batch_size=BATCH_SIZE)
        print("\n\nLoss {}, Acc {}".format(model_loss, model_acc))
        # The string literal below is disabled save/reload code; as a bare
        # expression it has no effect at runtime.
        """model.save("./crf/set/set-"+str(set_num)+"_model.h5")
        del model
        
        model = tf.keras.models.load_model("./crf/set/set-"+str(set_num)+"_model.h5")
        '''model.compile(
            loss=crf.loss,
            optimizer=opt,
            metrics=[crf.accuracy]
        )
        model.load_weights("./crf/set/set-"+str(set_num)+"_cnn-crf.h5")'''"""
        
        # `df` is rebound to the test data from here on.
        df = get_df("./crf/set/Set-"+str(set_num)+"_ytest_raw.csv")
        feature_test_1, feature_test_2, feature_test_3, feature_test_4, feature_test_5, feature_test_6, path_test, content_test, label_test = to_train_array_set(df, set_test_count, set_num)
        
        # Re-reads the vocabulary sizes; the model above has already been built at this point.
        with open("./crf/set/word_size.txt", "r") as file:
            path_word_size = eval(file.readline())
            con_word_size = eval(file.readline())
        # Accumulate wall-clock prediction time across sets into `ts`.
        ts_start = time.time()
        predictions = model.predict([path_test, content_test, feature_test_1, feature_test_2, feature_test_3, feature_test_4, feature_test_5, feature_test_6], batch_size=VAL_BATCH_SIZE)
        ts += time.time()-ts_start
        
        # result[page][node] = index of the highest-scoring label for that node.
        result = []
        for page in range(predictions.shape[0]):
            tmp = []
            for node in range(max_num):
                tmp.append(np.argmax(predictions[page][node]))
            result.append(tmp)
            
        # Column names for the output file, stored as an eval-able expression on the first line.
        col_type = []
        with open("./crf/set/Set-"+str(set_num)+"_coltype.txt", "r") as file:
            tmp = file.readline()
            slot = eval(tmp)
            col_type = slot
        Set = []
        with open("./crf/set/set-"+str(set_num)+".csv", "w") as file: # Create prediction file
            for col in col_type: # Write the column names as a tab-separated header
                file.write(col + "\t")
                if DEBUG:
                    print(col + "\t", end='')
            if DEBUG:
                print("")
            file.write("\n")
            current_pos = 1  # not read anywhere in this cell
            # Set[page][label] = indices of the nodes predicted with that label, in node order.
            for page in tqdm(range(len(result))): # Loop over each page
                p_tmp = []
                for cols in range(max_label+1):
                    c_tmp = []
                    for node in range(len(result[page])):
                        r = result[page][node]
                        if r == cols:
                            c_tmp.append(node)
                    p_tmp.append(c_tmp)
                Set.append(p_tmp)
            # Shallow copy: the inner lists are shared, so the deletions below also empty `Set`.
            Set_tmp = Set.copy()
            for page in range(len(Set_tmp)):
                empty = False
                # col[k] becomes True once label column k has no nodes left to write.
                col = []
                for i in range(len(Set_tmp[page])):
                    col.append(False)
                # Column 0 is never written and is treated as exhausted from the start.
                col[0] = True
                # Each pass of the while loop writes one output line, taking the
                # next remaining node of every label column.
                while(not empty):
                    for cols in range(len(Set_tmp[page])):
                        if len(Set_tmp[page][cols]) == 0:
                            col[cols] = True
                            if cols != 0:
                                if DEBUG:
                                    print("\t", end="")
                                file.write("\t")
                        else:
                            # Value written is the node's 'Leafnode' feature, as an integer string.
                            n = str(int(feature_test_1[page][Set_tmp[page][cols][0]]))
                            if cols != 0:
                                if DEBUG:
                                    print(n+"\t", end="")
                                file.write(n+"\t")
                            del Set_tmp[page][cols][0]
                            if len(Set_tmp[page][cols]) == 0:
                                col[cols] = True
                        # Recomputed after every column; only the value left after
                        # the last column decides whether the while loop ends.
                        empty = True
                        for i in col:
                            if i == False:
                                empty = False
                                break
                    if DEBUG:
                        print("\n", end="")
                    file.write("\n")

In [38]:
# Print the accumulated timings and store them in ./crf/data/time_crf.txt.
# `t` (training seconds), `ts` (prediction seconds) and `page_c` (divisor for the
# per-page time) are produced by earlier cells.
# The context manager guarantees the file is closed even if the per-page
# division raises (for example when page_c is 0).
with open("./crf/data/time_crf.txt","w") as timef:
    print("\ntrain time:"+str(t))
    timef.write("train:"+str(t)+"\n")
    print("test time:"+str(ts))
    per_page = float(ts)/page_c  # computed once, used for both the print and the file
    print("per page:"+ str(per_page)+"\n")
    timef.write("test:"+str(ts)+"\n")
    timef.write("per page:"+ str(per_page)+"\n")


train time:11.723287582397461
test time:0.25821900367736816
per page:0.008607300122578938

