# PRNSMRework

### Import Packages

In [1]:
import os

# Log level: Default:0 (ALL), 1 (INFO), 2 (WARNING), 3 (ERROR)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
import logging
import random
import sys
from urllib.parse import parse_qs, parse_qsl, urlparse, urlsplit, unquote
from itertools import islice
from typing import Any
import re

import numpy as np
import pandas as pd
import parsel
from sklearn_crfsuite.metrics import (flat_classification_report,
                                      sequence_accuracy_score)

from nltk import word_tokenize

# Import autopager
sys.path.insert(0, '..')
from autopager import AUTOPAGER_LIMITS
from autopager.htmlutils import (get_link_href, get_link_text,
                                 get_selector_root,
                                 get_text_around_selector_list)
from autopager.model import _elem_attr, _num_tokens_feature
from autopager.parserutils import (MyHTMLParser, TagParser, compare_tag,
                                   draw_scaled_page, get_first_tag,
                                   position_check)
from autopager.storage import Storage
from autopager.utils import (get_domain, ngrams, ngrams_wb, normalize,
                             normalize_whitespaces, replace_digits, tokenize)

# Shared parser instances. `parser` is handed to get_first_tag() inside
# link_to_features(); `tagParser` is not referenced in the visible cells.
parser = MyHTMLParser()
tagParser = TagParser()

In [81]:
import tensorflow as tf
from tensorflow import keras
from tensorflow_addons.layers.crf import CRF
import tensorflow_text as tf_text
from keras import backend as K
# mixed_precision = tf.keras.mixed_precision
# policy = mixed_precision.Policy('mixed_float16')
# mixed_precision.set_global_policy(policy)

# Short aliases for the Keras / tf.data symbols used throughout the notebook.
layers = tf.keras.layers
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
Model = tf.keras.Model
Dataset = tf.data.Dataset
Dense = tf.keras.layers.Dense
Input = tf.keras.layers.Input
Bidirectional = tf.keras.layers.Bidirectional
LSTM = tf.keras.layers.LSTM
Embedding = tf.keras.layers.Embedding
Masking = tf.keras.layers.Masking
Concatenate = tf.keras.layers.Concatenate
AveragePooling2D = tf.keras.layers.AveragePooling2D
MaxPooling2D = tf.keras.layers.MaxPooling2D
MaxPooling3D = tf.keras.layers.MaxPooling3D
Conv2D = tf.keras.layers.Conv2D
Conv1D = tf.keras.layers.Conv1D
Reshape = tf.keras.layers.Reshape
Attention = tf.keras.layers.Attention
GlobalAveragePooling1D = tf.keras.layers.GlobalAveragePooling1D

# Seed TensorFlow, Python's `random` and NumPy so runs are repeatable.
tf.random.set_seed(0)
random.seed(0)
np.random.seed(0)

In [3]:
from ipywidgets import IntProgress
from IPython.display import display

## Define constants

In [4]:
# NOTE(review): presumably the maximum number of nodes per page sequence; it is
# not referenced in the visible cells — confirm it is still needed.
MAX_PAGE_SEQ = 512

## Set GPU

In [5]:
# TODO: Use argparse
# Index of the GPU that both TensorFlow and PyTorch are restricted to below.
USED_GPU = 0

Check GPU availability

In [6]:
# TensorFlow: make only GPU number USED_GPU visible and enable memory growth.
gpus = tf.config.experimental.list_physical_devices('GPU')

if not gpus:
    print("Tensorflow: No GPUs visible")
elif USED_GPU >= len(gpus):
    # Guard against an IndexError when the configured index does not exist.
    print(f"Tensorflow: USED_GPU={USED_GPU} is out of range, only {len(gpus)} GPU(s) found")
else:
    try:
        # Restrict TensorFlow to the selected GPU.
        tf.config.experimental.set_visible_devices(gpus[USED_GPU], 'GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices / memory growth must be set before GPUs have been
        # initialized (e.g. when this cell is re-run in a live kernel).
        print(e)

# PyTorch: select the same GPU index.
import torch

if torch.cuda.is_available() and torch.cuda.device_count() > USED_GPU:
    torch.cuda.set_device(USED_GPU)
else:
    print("PyTorch: No GPUs visible")

1 Physical GPUs, 1 Logical GPU


## Load data

In [7]:
# Dataset accessor from autopager.storage; the next cell reads records and
# (X, y) pairs from it.
storage = Storage()

Current test file:  ['en', 'zh', 'ko', 'ja', 'de', 'ru', 'test', 'event']


In [8]:
# Page URLs of the selected records; only used for the summary printed below.
urls = [rec['Page URL'] for rec in storage.iter_records(language='en',contain_button = True, file_type='T')]
# X_raw: <Selector xpath='.//a|.//button' data='<a href="https://www.oneplus.com"><im...'>,
# The a and button that is not yet extracted
X_raw: list[parsel.selector.SelectorList]
# y holds one label sequence per page, e.g.
# ['O',  'PAGE', 'O', 'PAGE', 'PAGE', 'PAGE', 'PAGE', 'PAGE', 'O', 'PAGE', 'NEXT', 'O']
y: list[list[str]]
X_raw, y, page_positions = storage.get_Xy(language='en', contain_button = True,  contain_position=True,file_type='T', scaled_page='normal')
print("pages: {}  domains: {}".format(len(urls), len({get_domain(url) for url in urls})))

pages: 164  domains: 55


# Feature extraction

### Token feature and Tag feature

In [9]:
# XXX: these functions should be copy-pasted from autopager/model.py

def _as_list(generator, limit=None) -> list:
    """
    >>> _as_list(ngrams_wb("text", 2, 2), 0)
    []
    >>> _as_list(ngrams_wb("text", 2, 2), 2)
    ['te', 'ex']
    >>> _as_list(ngrams_wb("text", 2, 2))
    ['te', 'ex', 'xt']
    """
    return list(generator if limit is None else islice(generator, 0, limit))

def feat_to_tokens(feat, tokenizer):
    """Tokenize a feature with ``tokenizer.tokenize``.

    Args:
        feat: a string, or a list of strings that is joined with single
            spaces before tokenization.
        tokenizer: any object exposing ``tokenize(str)``.

    Returns:
        Whatever ``tokenizer.tokenize`` returns for the (joined) string.
    """
    # isinstance (rather than an exact type comparison) also accepts list
    # subclasses.
    if isinstance(feat, list):
        feat = ' '.join(feat)
    return tokenizer.tokenize(feat)

def num_token_feature_to_class(number):
    """One-hot encode a token-count feature into four buckets.

    '=0', '=1' and '=2' map to positions 0, 1 and 2; every other value maps
    to the last position.
    """
    known = ('=0', '=1', '=2')
    slot = known.index(number) if number in known else len(known)
    return [1 if position == slot else 0 for position in range(4)]

def link_to_features(link: parsel.Selector):
    """Extract the features of a single link/button node.

    Returns:
        A two-element list ``[token_feature, non_token_feature]``.
        ``token_feature`` is a dict with the keys 'text-exact', 'url',
        'parent-tag', 'class' and 'text'. ``non_token_feature`` is a list of
        eight 0/1 flags in the order: isdigit, isalpha, has-href,
        path-has-page, path-has-pageXX, path-has-number, href-has-year,
        class-has-disabled.
    """
    # Get text content of the link, otherwise alt or img.
    # Normalize multiple white space to one and to lowercase.
    text = normalize(get_link_text(link))
    href = get_link_href(link)
    if href is None:
        href = ""
    p = urlsplit(href)
    # Retrieve the line of the first opening tag of the parent element.
    parent = get_first_tag(parser, link.xpath('..').extract()[0])

    # Classes of link itself and all its children.
    # It is common to have e.g. span elements with fontawesome
    # arrow icon classes inside <a> links.
    self_and_children_classes = ' '.join(link.xpath(".//@class").extract())
    parent_classes = ' '.join(link.xpath('../@class').extract())
    css_classes = normalize(parent_classes + ' ' + self_and_children_classes)

    token_feature = {
        'text-exact': replace_digits(text.strip()[:100].strip()),
        # <scheme>://<netloc>/<path>?<query>#<fragment>
        'url': href,
        'parent-tag': parent,
        'class': _as_list(ngrams_wb(css_classes, 4, 5),
                          AUTOPAGER_LIMITS.max_css_features),
        'text': _as_list(ngrams_wb(replace_digits(text), 2, 5),
                         AUTOPAGER_LIMITS.max_text_features),
    }

    path_lower = p.path.lower()
    tag_feature = {
        'isdigit': int(text.isdigit()),
        'isalpha': int(text.isalpha()),
        'has-href': int(href != ""),
        'path-has-page': int('page' in path_lower),
        'path-has-pageXX': int(re.search(r'[/-](?:p|page\w?)/?\d+', path_lower) is not None),
        'path-has-number': int(any(part.isdigit() for part in p.path.split('/'))),
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        'href-has-year': int(re.search(r'20\d\d', href) is not None),
        'class-has-disabled': int('disabled' in css_classes),
    }
    # Every flag is a scalar, so the dict values (in insertion order) are the
    # feature vector.
    non_token_feature = list(tag_feature.values())

    return [token_feature, non_token_feature]


def page_to_features(xseq):
    """Build the per-node features of one page.

    Runs link_to_features() on every node of ``xseq`` and adds a 'text-full'
    entry to each node's token features: the normalized text before the node,
    the node's own 'text-exact' and the normalized text after it, joined by
    commas.
    """
    features = list(map(link_to_features, xseq))
    surrounding = get_text_around_selector_list(xseq, max_length=15)
    for (token_feature, _flags), (before, after) in zip(features, surrounding):
        token_feature['text-full'] = ','.join(
            (normalize(before), token_feature['text-exact'], normalize(after))
        )
    return features

In [10]:
def get_token_tag_features_from_chunks(chunks):
    """Extract token and tag features for every page in ``chunks``.

    Args:
        chunks: iterable of pages, each accepted by page_to_features().

    Returns:
        ``(token_features, tag_features)``: per page, a list of token-feature
        dicts and a NumPy array of the binary tag flags.

    Raises:
        Exception: if a page fails; the message carries the page index and
            the original error is attached as ``__cause__``.
    """
    token_features = []
    tag_features = []
    for idx, page in enumerate(chunks):
        try:
            feat_list = page_to_features(page)
            token_features.append([node[0] for node in feat_list])
            tag_features.append(np.array([node[1] for node in feat_list]))
        except Exception as exc:
            # `except Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate untouched.
            raise Exception(f"Error occured on {idx}") from exc
    return token_features, tag_features

In [11]:
# One list per page, holding one token-feature dict per node.
token_features: list[list[dict[str, Any]]]
token_features, tag_features = get_token_tag_features_from_chunks(X_raw)
# ['text-exact', 'url', 'parent-tag', 'class', 'text', 'text-full']
token_feature_titles: list[str] = list(token_features[0][0].keys())

### Prepare parent tag feature
One-hot encoding over the 30 most frequent parent tags.

In [12]:
def sparse_representation_with_map(tag, data_map):
    """One-hot encode ``tag`` against ``data_map``.

    ``data_map`` is a sequence of ``(tag_name, count)`` pairs. The result has
    one slot per pair, with a 1 at the first pair whose name equals ``tag``;
    it is all zeros when the tag is not in the map.
    """
    one_hot = [0] * len(data_map)
    first_hit = next(
        (pos for pos, entry in enumerate(data_map) if tag == entry[0]),
        None,
    )
    if first_hit is not None:
        one_hot[first_hit] = 1
    return one_hot

In [13]:
def get_ptags_vector(token_features, data_map_for_ptag):
    """One-hot encode the 'parent-tag' of every node.

    Returns one NumPy array per page; each row is the encoding of one node
    against ``data_map_for_ptag``.
    """
    return [
        np.array([
            sparse_representation_with_map(node['parent-tag'], data_map_for_ptag)
            for node in page
        ])
        for page in token_features
    ]

In [14]:
# Count how often each parent tag occurs across all training nodes.
top_parent_tags = {}

for page in token_features:
    for node in page:
        p_tag = node['parent-tag']
        top_parent_tags[p_tag] = top_parent_tags.get(p_tag, 0) + 1

# Most frequent first; equal counts keep their first-seen order (stable sort).
sorted_parent_tags = sorted(
    top_parent_tags.items(), key=lambda item: item[1], reverse=True
)

# The 30 most frequent parent tags form the one-hot vocabulary.
data_map_for_ptag = sorted_parent_tags[:30]
ptags_vector = get_ptags_vector(token_features, data_map_for_ptag)

### Get Class, Query tokens by tokenizer

In [15]:
class TagTokenizer:
    """Map string tokens to integer ids.

    Id 0 is reserved for "[PAD]" and id 1 for "[UNK]"; the keys of
    ``tag_name_count`` receive the following ids in iteration order.
    """

    def __init__(self, tag_name_count=None):
        vocab = {"[PAD]": 0, "[UNK]": 1}
        names = () if tag_name_count is None else tag_name_count.keys()
        for name in names:
            vocab[name] = len(vocab)
        self.map = vocab

    def tokenize(self, word):
        """Return the id of ``word``, or a list of ids when ``word`` is a list.

        Tokens missing from the vocabulary map to the "[UNK]" id.
        """
        lookup = self.map
        if not isinstance(word, list):
            return lookup[word] if word in lookup else lookup["[UNK]"]
        return [
            lookup[item] if item in lookup else lookup["[UNK]"]
            for item in word
        ]

    def get_size(self):
        """Return the vocabulary size, including the two special tokens."""
        return len(self.map)


#### Prepare class and query features

In [16]:
# Occurrence count of every CSS-class n-gram found in the training pages.
# NOTE(review): despite the name, no top-1000 cut-off is applied here; every
# n-gram seen receives its own id in the tokenizer.
top_thousand_class = {}

for page in token_features:
    for node in page:
        for class_name in node['class']:
            top_thousand_class[class_name] = top_thousand_class.get(class_name, 0) + 1

class_tokenizer = TagTokenizer(top_thousand_class)

### Get pre-trained sentence(text content) embedding

Load Pre-trained Laser model and embedding

In [17]:
from LaserSentenceModel import LaserSentenceModel
# Instantiate the LASER sentence encoder and embed one word as a smoke test;
# the displayed shape is the embedding size.
laser = LaserSentenceModel()
laser.getSentenceVector('hello').shape

(1024,)

In [18]:
def word_to_vector(word_list, word_vector_method = None):
    """Embed ``word_list`` with the selected method.

    Args:
        word_list: for 'FastText', a single word or a list of words (a list
            is averaged into one vector; an empty list gives a zero vector).
            For 'Laser', the value is passed to ``laser.getSentenceVector``.
        word_vector_method: 'FastText' or 'Laser'.

    Returns:
        The embedding vector, or None when no method or an unrecognized
        method is given.
    """
    if word_vector_method is None:
        print("Need to specified a method.")
        return None

    if word_vector_method == 'FastText':
        # Build the FastText wrapper once and reuse it: this function is called
        # per node, and constructing the model on every call is presumably
        # expensive.
        ft = getattr(word_to_vector, "_fasttext_model", None)
        if ft is None:
            from FastTextModel import FastTextModel
            ft = FastTextModel()
            word_to_vector._fasttext_model = ft

        if isinstance(word_list, list):
            if len(word_list) == 0:
                return np.zeros(ft.getModel().get_dimension())
            return np.mean([ft.getWordVector(word) for word in word_list], axis=0)
        return ft.getWordVector(word_list)

    if word_vector_method == 'Laser':
        return laser.getSentenceVector(word_list)

    return None

In [19]:
def pages_to_word_vector_from_keylist(word_vector_method, token_features, word_to_vec_list):
    """Embed the selected token features of every node, page by page.

    For each node, the values whose key is in ``word_to_vec_list`` are
    embedded with word_to_vector() and concatenated into one vector. A
    progress widget is shown while pages are processed.

    Returns:
        A list with one NumPy array of node vectors per page.
    """
    print(f"Transform key {word_to_vec_list} to word_vector ... useing {word_vector_method}")
    progress = IntProgress(max=len(token_features))
    progress.description = '(Init)'
    progress.value = 0
    display(progress)

    pages_vector = []
    for page_no, page in enumerate(token_features, start=1):
        progress.description = f"Task: {page_no}"
        progress.value = page_no
        node_vectors = [
            np.concatenate(
                [
                    # For 'text-full' the value is a str, embedded as a sentence.
                    word_to_vector(value, word_vector_method)
                    for key, value in node.items()
                    if key in word_to_vec_list
                ],
                axis=0,
            )
            for node in page
        ]
        pages_vector.append(np.array(node_vectors))

    progress.description = '(Done)'
    return pages_vector

In [20]:
# Cache file for the LASER embeddings of the training pages' 'text-full' feature.
LASER_TRAIN_CACHE = 'embedding/train/LaserEmb_full.npy'

if os.path.isfile(LASER_TRAIN_CACHE):
    # NOTE: allow_pickle=True can execute code embedded in the file; only load
    # caches that were produced by this notebook.
    laser_full_tokens_emb = np.load(LASER_TRAIN_CACHE, allow_pickle=True)
else:
    _page_embeddings = pages_to_word_vector_from_keylist('Laser', token_features, ['text-full'])
    # Pages contain different numbers of nodes, so store one array per slot of
    # an object array instead of letting NumPy build a ragged array. This also
    # gives the variable the same type as the cached branch.
    laser_full_tokens_emb = np.empty(len(_page_embeddings), dtype=object)
    for _page_idx, _page_emb in enumerate(_page_embeddings):
        laser_full_tokens_emb[_page_idx] = _page_emb
    os.makedirs(os.path.dirname(LASER_TRAIN_CACHE), exist_ok=True)
    np.save(LASER_TRAIN_CACHE, laser_full_tokens_emb)

## Padding to fixed size and prepare for training inputs

In [21]:
def prepare_input_ids(page_tokens, max_len):
    """Convert every node's 'class' n-grams into exactly ``max_len`` ids.

    Ids come from the module-level ``class_tokenizer``; shorter lists are
    right-padded with 0 and longer ones are truncated.

    Returns:
        A list with one NumPy array per page (one row per node).
    """
    def _fixed_length(ids):
        return (ids + [0] * (max_len - len(ids)))[:max_len]

    return [
        np.array([
            _fixed_length(class_tokenizer.tokenize(node['class']))
            for node in page
        ])
        for page in page_tokens
    ]

In [22]:
# Alias of tag_features; the visible cells use train_tag_x for the model input
# instead.
train_tag_info_list = tag_features #features which only have tag true/false information

In [24]:
# Number of class-token ids kept per node (see prepare_input_ids).
max_len = 256
train_attr_x = laser_full_tokens_emb
train_ptag: list = ptags_vector
pages_class: list[np.ndarray] = prepare_input_ids(token_features, max_len=max_len)
# Raw href strings per page; they are vectorized inside the model.
# NOTE(review): the URL vectorizers are adapted on unquote(url), while these
# URLs are passed through unchanged — confirm the mismatch is intended.
pages_url: list[np.ndarray] = [np.array([node['url'] for node in page], dtype=object) for page in token_features]
train_tag_x = tag_features


In [25]:
# Model inputs, in the order get_custom_emb_model() declares them:
# text embedding, parent-tag one-hot, class ids, URL strings, tag flags.
train_composite_with_token = [train_attr_x, train_ptag, pages_class, pages_url, train_tag_x]

In [26]:
# Label vocabulary and its two lookup directions.
labels = ["O", "PREV", "PAGE", "NEXT"]
idx2tag = dict(enumerate(labels))
tag2idx = {tag: position for position, tag in idx2tag.items()}
num_tags = len(labels)


In [27]:
# Convert each page's label strings into an array of integer ids.
# NOTE(review): tag2idx.get() yields None for a label outside `labels` —
# confirm that cannot happen in the data.
train_y = [np.array([tag2idx.get(l) for l in lab]) for lab in y]

In [28]:
# Sanity check: shape of each input feature for the first page.
for inputs in train_composite_with_token:
    print(inputs[0].shape)

(303, 1024)
(303, 30)
(303, 256)
(303,)
(303, 8)


## Build model

Fit the word-level and character-level URL tokenizers

In [29]:
# Sequence lengths (tokens kept per URL)
MAX_URL_CHAR_LEN = 1024
MAX_URL_WORD_LEN = 128

# Vocabulary sizes
MAX_URL_CHAR_TOKEN = 256
MAX_URL_WORD_TOKEN = 20000


def word_spliter(url: tf.Tensor):
    """Split URL strings into word tokens.

    Splits on '/', '&', '?', '#', '.', '://', '=', '-' and space using
    ``tf_text.regex_split``.
    """
    # May be .lower()?
    delimiter_pattern = r"\/|&|\?|#|\.|://|=|-|[\ ]"
    return tf_text.regex_split(url, delimiter_pattern)


def char_spliter(url: tf.Tensor):
    """Split URL strings into their individual UTF-8 characters."""
    return tf.strings.unicode_split(url, "UTF-8")
    # return tf.strings.bytes_split(url)


# Character-level URL vectorizer: integer ids, fixed to MAX_URL_CHAR_LEN.
char_vectorize_layer = tf.keras.layers.TextVectorization(
    standardize="lower",  # Do not remove punctuation!!
    split=char_spliter,
    max_tokens=MAX_URL_CHAR_TOKEN,
    output_mode="int",
    output_sequence_length=MAX_URL_CHAR_LEN,
)

# Word-level URL vectorizer: integer ids, fixed to MAX_URL_WORD_LEN.
word_vectorize_layer = tf.keras.layers.TextVectorization(
    standardize="lower",  # Do not remove punctuation!!
    split=word_spliter,
    max_tokens=MAX_URL_WORD_TOKEN,
    output_mode="int",
    output_sequence_length=MAX_URL_WORD_LEN,
)

# Smaller batch?
# Build both vocabularies from every training URL, percent-decoded with
# unquote().
url_dataset = Dataset.from_tensor_slices([unquote(node['url']) for page in token_features for node in page])
char_vectorize_layer.adapt(url_dataset.batch(64))
word_vectorize_layer.adapt(url_dataset.batch(64))

# pages_url_char = [[char_vectorize_layer(node['url']) for node in page] for page in token_features[:1]]
# pages_url_word: list[np.ndarray] = [np.array([word_vectorize_layer(node['url']) for node in page], dtype=str) for page in token_features]

In [82]:
def get_custom_emb_model(use_crf=True, embedding_size=32, hidden_size=300):
    """Build the per-page sequence-labelling model (features -> BiLSTM -> CRF).

    The model takes five inputs per page, in this order: text embedding,
    parent-tag one-hot, class-token ids, URL strings and binary tag flags.
    Class ids go through a Conv1D; URLs are vectorized at character and word
    level, convolved with several filter sizes, max-pooled and fused by dense
    layers. All features are concatenated and fed to a bidirectional LSTM
    followed by a CRF layer.

    Args:
        use_crf: currently unused; the CRF layer is always added.
        embedding_size: output dimension of the URL char/word Embedding layers.
        hidden_size: total BiLSTM width (each direction gets hidden_size // 2).

    Returns:
        ``(model, loss_fn)`` where ``loss_fn`` is ``crf.get_loss``.

    NOTE(review): ``input_url`` is rebound to the Reshape output and that
    tensor (not the original Input) is passed to ``Model(...)`` — confirm the
    model accepts the URL batches produced by the dataset.
    """
    HIDDEN_UNITS = hidden_size
    NUM_CLASS = num_tags
    EMBEDDING_SIZE = embedding_size
    CLS_EMB_SIZE = 256
    CONV_FILTERS = 64 # Recommend 256
    CLASS_CONV_FILTERS = 8

    # Since the tensor may be ragged, we use None to represent the dimension of sequence length
    laser_emb_shape = (None, 1024)  # Laser embedding
    tag_info_shape = (None, 8)  # tag_features from autopager
    class_emb_shape = (None, 256)  # Class and Query embedding Shape
    ptag_emb_shape = (None, 30)  # Parent tag embedding shape
    # NOTE: page_embbed_shape is not used anywhere below.
    page_embbed_shape = (-1, EMBEDDING_SIZE)

    # Kernel heights used by the URL char and word convolutions.
    filter_sizes = [3, 4, 5, 6]

    input_ft_embedding = Input(
        shape=laser_emb_shape, name="input_ft_embeddings"
    )
    input_ptag_emb = Input(shape=(ptag_emb_shape), name="input_ptag_embedding")
    input_class = Input(shape=class_emb_shape, name="input_class")

    ##
    input_url = Input(shape=(None,), dtype=tf.string, name="input_url")
    # Adds a trailing axis of size 1; from here on `input_url` refers to the
    # reshaped tensor.
    input_url = Reshape([-1, 1])(input_url)
    ##

    input_tag_information = Input(
        shape=(tag_info_shape), name="input_tag_information"
    )

    ### CLASS EMBEDDING ###
    # TODO: Find proper Conv2D size and Embedding size
    class_emb = input_class
    class_emb = K.expand_dims(class_emb, axis=-1)
    # [batch_size, seq_len, 256, 1]
    # The kernel spans all CLS_EMB_SIZE positions, so each node is reduced to
    # CLASS_CONV_FILTERS values.
    class_emb = Conv1D(
        filters=CLASS_CONV_FILTERS,
        kernel_size=[CLS_EMB_SIZE],
        strides=1,
        padding="valid",
        data_format="channels_last",
        activation="relu",
        name='Conv_class'
    )(class_emb)
    class_emb = Reshape((-1, CLASS_CONV_FILTERS), name="class_emb_out")(class_emb)

    ##### URL EMBEDDING #####
    ### CHAR CONVOLUTION LAYER ###
    # url_char_emb = Reshape(target_shape=(MAX_URL_CHAR_LEN,))(input_url)
    # print(K.int_shape(input_url))
    url_char_emb = char_vectorize_layer(input_url)
    # print(K.int_shape(url_char_emb))
    # url_char_emb = K.squeeze(url_char_emb, axis = -1)
    url_char_emb = Embedding(
        input_dim=MAX_URL_CHAR_TOKEN,
        output_dim=EMBEDDING_SIZE,
        input_length=MAX_URL_CHAR_LEN,
        mask_zero=True,
    )(url_char_emb)
    # print(K.int_shape(url_char_emb))
    url_char_emb = K.expand_dims(url_char_emb, axis=-1)

    # One Conv2D + MaxPooling3D pair per filter size; the pooling window covers
    # the whole convolved length.
    pooled_char_x = []
    for i, filter_size in enumerate(filter_sizes):
        # print("Before conv", K.int_shape(url_char_emb))
        url_char_emb_conved = Conv2D(
            filters=CONV_FILTERS,
            kernel_size=[filter_size, EMBEDDING_SIZE],
            strides=(1, 1),
            padding="valid",
            data_format="channels_last",
            activation="relu",
            name=f'Conv_char_{i}'
        )(url_char_emb)
        # print("Before pool", K.int_shape(url_char_emb_conved))
        url_char_emb_pooled = MaxPooling3D(
            pool_size=(1, MAX_URL_CHAR_LEN - filter_size + 1, 1),
            strides=(1, 1, 1),
            padding="valid",
            data_format="channels_last",
            name=f'MaxPooling_char_{i}'
        )(url_char_emb_conved)
        # print("After pool", K.int_shape(url_char_emb_pooled))
        pooled_char_x.append(url_char_emb_pooled)

    num_filters_total = CONV_FILTERS * len(filter_sizes)
    # print(K.int_shape(pooled_char_x[0]))

    url_char_emb = Concatenate(axis=2, name='Concat_pooled_char')(pooled_char_x)
    url_char_emb = Reshape(target_shape=(-1, num_filters_total))(url_char_emb)

    char_output = Dense(units=512, activation="relu")(
        url_char_emb
    )  # (num_filters_total, 512)

    #### WORD CONVOLUTION LAYER ###
    # Same structure as the char branch, using the word-level vectorizer.
    url_word_emb = word_vectorize_layer(input_url)
    # print(K.int_shape(url_word_emb))
    # url_char_emb = K.squeeze(url_word_emb, axis = -1)
    url_word_emb = Embedding(
        input_dim=MAX_URL_WORD_TOKEN,
        output_dim=EMBEDDING_SIZE,
        input_length=MAX_URL_WORD_LEN,
        mask_zero=True,
    )(url_word_emb)
    url_word_emb = K.expand_dims(url_word_emb, axis=-1)
    pooled_word_x = []
    for i, filter_size in enumerate(filter_sizes):
        # [filter_height, filter_width, in_channels, out_channels]
        # filter_shape = [filter_size, embedding_size, 1, 256]
        url_word_emb_conved = Conv2D(
            filters=CONV_FILTERS,
            kernel_size=[filter_size, EMBEDDING_SIZE],
            padding="valid",
            data_format="channels_last",
            activation="relu",
            name=f'Conv_word_{i}'
        )(url_word_emb)
        url_word_emb_pooled = MaxPooling3D(
            pool_size=(1, MAX_URL_WORD_LEN - filter_size + 1, 1),
            strides=(1, 1, 1),
            padding="valid",
            data_format="channels_last",
            name=f'MaxPooling_word_{i}'
        )(url_word_emb_conved)
        pooled_word_x.append(url_word_emb_pooled)

    # Recomputed here with the same value as in the char branch.
    num_filters_total = CONV_FILTERS * len(filter_sizes)
    url_word_emb = Concatenate(axis=2, name='Concat_pooled_word')(pooled_word_x)
    url_word_emb = Reshape(target_shape=(-1, num_filters_total))(url_word_emb)
    # url_word_emb = Dropout(.2)(url_word_emb)
    word_output = Dense(units=512, activation="relu")(
        url_word_emb
    )  # (num_filters_total)

    ############################### CONCAT WORD AND CHAR BRANCH ############################
    conv_output = Concatenate(axis=2, name='Concat_word_char')([word_output, char_output])
    conv_output = Dense(units=512, activation="relu")(conv_output)
    conv_output = Dense(units=256, activation="relu")(conv_output)
    url_emb = Dense(units=128, activation="relu")(conv_output)

    # ft_FFN = Dense(units=512, activation="relu", name="ft_FFN_01")(
    #     input_ft_embedding
    # )
    # ft_FFN = Dense(units=256, activation="relu", name="ft_FFN_02")(ft_FFN)
    # ft_FFN = Dense(units=128, activation="relu", name="ft_FFN_out")(ft_FFN)

    # FFN for ptag
    # ptag_FFN = Dense(units = 128, activation = 'relu', name="ptag_FFN_01")(input_ptag_vector)
    # ptag_FFN = Dense(units = 64, activation = 'relu', name="ptag_FFN_out")(ptag_FFN)

    ############################### LSTM+CRF ############################
    # Concatenate every per-node feature along the last axis.
    merged = Concatenate(name='Concat_all_freatures')(
        [
            input_ft_embedding,
            input_ptag_emb,
            class_emb,
            url_emb,
            input_tag_information,
        ]

    )
    # NOTE: `model` first holds the BiLSTM output tensor and is rebound to the
    # Keras Model further down.
    model = Bidirectional(LSTM(units=HIDDEN_UNITS // 2, return_sequences=True))(
        merged
    )

    crf = CRF(NUM_CLASS, name="crf_layer")
    out = crf(model)
    loss_fn = crf.get_loss
    # [train_attr_x, train_ptag, pages_class, pages_url, train_tag_x]
    model = Model(
        [
            input_ft_embedding,
            input_ptag_emb,
            input_class,
            input_url,
            input_tag_information,
        ],
        out,
    )

    return model, loss_fn


## Split data into train/val set

In [31]:
# NOTE(review): the visible dataset cells call data_list_to_dataset() without
# batch_size, so this constant is not used there — confirm.
BATCH_SIZE = 1

In [32]:
def composite_train_test_split(composite_x, y, number):
    """Hold out the last ``number`` samples of every feature for validation.

    Args:
        composite_x: list of per-feature sequences, all indexed by sample.
        y: label sequence aligned with the features.
        number: non-negative count of trailing samples for validation.
            ``0`` keeps everything in the training split.

    Returns:
        ``(x_train, y_train, x_val, y_val)``; the x values are lists with one
        entry per feature.
    """
    def _split(seq):
        # An explicit cut index avoids the `seq[:-0]` pitfall, which would
        # yield an empty training split when number == 0.
        cut = max(len(seq) - number, 0)
        return seq[:cut], seq[cut:]

    x_parts = [_split(data) for data in composite_x]
    x_train = [train_part for train_part, _ in x_parts]
    x_val = [val_part for _, val_part in x_parts]
    y_train, y_val = _split(y)
    return x_train, y_train, x_val, y_val

def list_to_dataSet(data, dataType):
    """Wrap a Python sequence in a tf.data.Dataset yielding one item at a time.

    A generator-based dataset is used, so items may differ in length.
    """
    # return tf.convert_to_tensor(data)
    def _items():
        return iter(data)

    return Dataset.from_generator(_items, dataType)

def data_list_to_dataset(x, y, isValidation = False, batch_size = 1):
    """Zip the feature lists and labels into a batched ``(features, label)`` dataset.

    Each feature's element type is taken from the dtype of its first sample.
    Training data is shuffled (buffer size 1024) before batching; validation
    data keeps its order.
    """
    feature_sets = tuple(
        list_to_dataSet(feature, feature[0].dtype) for feature in x
    )
    label_set = list_to_dataSet(y, tf.int32)
    paired = Dataset.zip((Dataset.zip(feature_sets), label_set))
    if isValidation:
        return paired.batch(batch_size)
    return paired.shuffle(buffer_size=1024).batch(batch_size)

In [33]:
# Hold out the last 20 pages for validation.
x_train, y_train, x_val, y_val = composite_train_test_split(train_composite_with_token, train_y, 20)

In [34]:
def composite_list_to_dataset(x, batch_size = 1):
    """Zip several float32 feature lists sample-wise and batch the result.

    x: [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]
    output: [[1, 5, 9], [2, 6, 10], [3, 7, 11], [4, 8, 12]]
    batch_size = 2
    output: [[[1, 5, 9], [2, 6, 10]], [[3, 7, 11], [4, 8, 12]]]
    """
    per_feature = [list_to_dataSet(data, tf.float32) for data in x]
    zipped = Dataset.zip(tuple(per_feature))
    return zipped.batch(batch_size)

In [35]:
# Build the datasets with the default batch size of 1 (one page per batch);
# only the training set is shuffled.
train_dataset = data_list_to_dataset(x_train, y_train, isValidation=False)
val_dataset = data_list_to_dataset(x_val, y_val, isValidation=True)

In [36]:
# Debug: shape of the URL feature (index 3) of the first training page, and an
# iterator for inspecting batches in the next cell.
print(x_train[3][0].shape)
it = iter(train_dataset)

(303,)


In [37]:
# Debug: each next() call consumes a batch, so the two lines below look at two
# different batches.
print(type(next(it)))
print(next(it)[0][3].shape)

<class 'tuple'>
(1, 82)


## Define Custom Training

In [38]:
# Calculate training/val f1-score
from sklearn.metrics import classification_report
from collections import Counter


def calculate_pages_metric(y_true_pages, y_predict_pages):
    """Compute per-page F1 for label 2 (PAGE) and label 3 (NEXT).

    Args:
        y_true_pages: iterable of true label-id sequences, one per page.
        y_predict_pages: iterable of predicted label-id sequences, aligned
            with ``y_true_pages``.

    Returns:
        ``(pages_f1, nexts_f1, avg_f1)``: three lists with one entry per
        non-empty page; ``avg_f1`` is the mean of the two class scores.
    """
    pages_f1 = []
    nexts_f1 = []
    avg_f1 = []
    for y_true, y_predict in zip(y_true_pages, y_predict_pages):
        if len(y_true) == 0:
            # Skip only the empty page; later pages must still be scored.
            continue
        # Passing `labels` guarantees that "2" and "3" are in the report even
        # when a class is absent from this page.
        report = classification_report(
            y_true, y_predict, labels=[2, 3], output_dict=True, zero_division=1
        )
        PAGE = report["2"]["f1-score"]
        NEXT = report["3"]["f1-score"]
        pages_f1.append(PAGE)
        nexts_f1.append(NEXT)
        avg_f1.append((PAGE + NEXT) / 2)
    return pages_f1, nexts_f1, avg_f1


def calculate_page_metric(y_true, y_predict):
    """Return one F1 summary value for a single page.

    Uses the mean of the PAGE (2) and NEXT (3) F1 scores when both labels
    occur in ``y_true``, the score of whichever one occurs when only one
    does, and the F1 of label 0 when neither occurs.
    """
    # Zero-division cases are handled by the branches below.
    report = classification_report(
        y_true, y_predict, labels=[0, 2, 3], output_dict=True, zero_division=0
    )
    other_f1, page_f1, next_f1 = (
        report[key]["f1-score"] for key in ("0", "2", "3")
    )
    has_page = 2 in y_true
    has_next = 3 in y_true
    if has_page and has_next:
        return (page_f1 + next_f1) / 2
    if has_page:
        return page_f1
    if has_next:
        return next_f1
    return other_f1


Test prediction on a single batch

In [39]:
# for (batch_x, batch_y) in train_dataset.take(1):
#     batch_predict_y = model(batch_x).numpy()
#     batch_true_y = batch_y.numpy()
#     print(batch_true_y)
#     print(batch_predict_y)
#     print(calculate_page_metric(batch_true_y[0], batch_predict_y[0]))
#     print(classification_report(batch_true_y[0], batch_predict_y[0]))

## Training

In [40]:
def train_on_epoch(epochs, model, optimizer, train_dataset, val_dataset, best_model_method = 'f1-score'):
    """Train ``model`` with a custom loop and restore the best weights.

    After every epoch the validation loss and F1 (calculate_page_metric on
    the first sample of each batch) are averaged over ``val_dataset``. The
    weights with the lowest loss and with the highest F1 are remembered.

    NOTE: the loss is computed with the module-level ``loss_fn``, which must
    be defined before this function is called.

    Args:
        epochs: number of epochs to run.
        model: Keras model to train (updated in place).
        optimizer: optimizer exposing ``apply_gradients``.
        train_dataset: iterable of ``(x_batch, y_batch)`` for training.
        val_dataset: iterable of ``(x_batch, y_batch)`` for validation.
        best_model_method: 'loss' or 'f1-score'; selects which remembered
            weights are loaded at the end. Any other value leaves the final
            weights untouched.

    Returns:
        ``(model, avg_epoch_result)`` where ``avg_epoch_result`` has the keys
        "epoch_losses" and "epoch_f1s" (one average per epoch).
    """
    import time

    best_weights = None
    best_f1_weights = None
    best = np.inf  # np.Inf was removed in NumPy 2.0
    best_f1 = 0
    avg_epoch_losses = []
    avg_epoch_f1s = []
    for epoch in range(epochs):
        print("\nStart of epoch %d" % (epoch,))
        start_time = time.time()

        # Iterate over the batches of the dataset.
        for x_batch_train, y_batch_train in train_dataset:
            with tf.GradientTape() as tape:
                logits = model(x_batch_train, training=True)
                loss_value = loss_fn(y_batch_train, logits)
            grads = tape.gradient(loss_value, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))

        # TODO: Tensorboard

        # Run a validation loop at the end of each epoch.
        val_losses = []
        val_f1s = []
        for x_batch_val, y_batch_val in val_dataset:
            val_logits = model(x_batch_val, training=False)
            val_loss_value = loss_fn(y_batch_val, val_logits)
            val_avg_f1 = calculate_page_metric(y_batch_val.numpy()[0], val_logits.numpy()[0])
            val_losses.append(val_loss_value)
            val_f1s.append(val_avg_f1)
        average_val_loss = np.average(val_losses)
        average_val_f1 = np.average(val_f1s)
        avg_epoch_losses.append(average_val_loss)
        avg_epoch_f1s.append(average_val_f1)
        if average_val_loss < best:
            best_weights = model.get_weights()
            best = average_val_loss
        if average_val_f1 > best_f1:
            best_f1_weights = model.get_weights()
            best_f1 = average_val_f1
        print("Validation loss: %.4f" % (float(average_val_loss),))
        print("Validation F1: %.4f" % (float(average_val_f1),))
        print("Time taken: %.2fs" % (time.time() - start_time))
    print(f"Best loss: {best}, Best F1: {best_f1}")
    print(f"Training finish, load best weights. {best_model_method}")

    remembered = {'loss': best_weights, 'f1-score': best_f1_weights}
    if best_model_method in remembered:
        chosen_weights = remembered[best_model_method]
        if chosen_weights is not None:
            model.set_weights(chosen_weights)
        else:
            # No epoch beat the initial best value (e.g. F1 stayed at 0 or no
            # epoch ran); set_weights(None) would crash.
            print("No improving epoch was recorded; keeping the current weights.")
    avg_epoch_result = {"epoch_losses": avg_epoch_losses, "epoch_f1s": avg_epoch_f1s}
    return model, avg_epoch_result

In [41]:
def learning_curve(epochs, model, optimizer, train_dataset, val_dataset):
    """Train ``model`` and report the train/validation F1 of the best epoch.

    The best epoch is the one with the highest average validation F1; the
    average training F1 of that same epoch is returned alongside it.

    NOTE: the loss is computed with the module-level ``loss_fn``, which must
    be defined before this function is called.

    Args:
        epochs: number of epochs to run.
        model: Keras model to train (updated in place).
        optimizer: optimizer exposing ``apply_gradients``.
        train_dataset: iterable of ``(x_batch, y_batch)`` for training.
        val_dataset: iterable of ``(x_batch, y_batch)`` for validation.

    Returns:
        ``(best_train_f1, best_val_f1)``; both are 0 if no epoch improved.
    """
    best_f1 = 0
    best_train = 0

    for _ in range(epochs):
        train_f1s = []
        # Iterate over the batches of the dataset.
        for x_batch_train, y_batch_train in train_dataset:
            with tf.GradientTape() as tape:
                logits = model(x_batch_train, training=True)
                loss_value = loss_fn(y_batch_train, logits)
            # The metric is not part of the differentiated computation, so it
            # is evaluated outside the tape.
            train_f1s.append(
                calculate_page_metric(y_batch_train.numpy()[0], logits.numpy()[0])
            )
            grads = tape.gradient(loss_value, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))

        average_train_f1 = np.average(train_f1s)

        # Run a validation loop at the end of each epoch. Only F1 is needed
        # here, so the validation loss is not computed.
        val_f1s = []
        for x_batch_val, y_batch_val in val_dataset:
            val_logits = model(x_batch_val, training=False)
            val_f1s.append(
                calculate_page_metric(y_batch_val.numpy()[0], val_logits.numpy()[0])
            )

        average_val_f1 = np.average(val_f1s)

        if average_val_f1 > best_f1:
            best_f1 = average_val_f1
            best_train = average_train_f1

    print(f"Best train f1: {best_train}, Best val f1: {best_f1}")

    return best_train, best_f1

## Test data evaluation

In [42]:
# Transfer distribution to corresponding label
def label_distribution_to_label(predict_y):
    """Turn per-node label distributions into label indices.

    A 3-D input (pages x nodes x classes) becomes a list of lists holding the
    index of the first maximum for every node. Any other rank is returned
    unchanged.
    """
    if len(predict_y.shape) != 3:
        return predict_y

    def _first_argmax(scores):
        values = scores.tolist()
        return values.index(max(values))

    return [
        [_first_argmax(node_scores) for node_scores in page]
        for page in predict_y
    ]

In [43]:
# Prepare for testing inputs
def prepare_for_testing(test_X_raw, test_y_raw, emb_cache_path='embedding/train/LaserEmb_full.npy'): #ft-bert -no chunks
    """Build the composite model input and the label array for a test set.

    Args:
        test_X_raw: Raw test pages, forwarded to
            ``get_token_tag_features_from_chunks``.
        test_y_raw: Per-page label sequences.
        emb_cache_path: ``.npy`` file used to cache the 'Laser' embeddings.
            When the file exists it is loaded instead of recomputing; when it
            does not, the embeddings are computed and written there. Pass
            ``None`` to always recompute and never touch the disk.
            NOTE(review): the default path is shared by every call (and lives
            under ``embedding/train``), so an existing file is reused even if
            it was produced for a different set of pages. Pass a path that is
            specific to the data being evaluated, or ``None``.

    Returns:
        Tuple ``(test_composite_input, y_test)`` where the first element is
        the list ``[embeddings, parent-tag vectors, class ids, query ids,
        tag features]`` and the second is ``test_y_raw`` as a NumPy array.
    """
    test_token_features, test_tag_features = get_token_tag_features_from_chunks(test_X_raw)

    # Frequency of every 'parent-tag' value over all nodes of all pages.
    parent_tag_counts = {}
    for page in test_token_features:
        for node in page:
            p_tag = node['parent-tag']
            parent_tag_counts[p_tag] = parent_tag_counts.get(p_tag, 0) + 1
    sorted_parent_tags = sorted(parent_tag_counts.items(), key=lambda x: x[1], reverse=True)

    # Only the 30 most frequent parent tags are handed to get_ptags_vector.
    test_ptags_vector = get_ptags_vector(test_token_features, sorted_parent_tags[:30])

    if emb_cache_path is not None and os.path.isfile(emb_cache_path):
        # allow_pickle=True unpickles arbitrary objects: only load cache
        # files that this project wrote itself.
        test_ft_emb = np.load(emb_cache_path, allow_pickle=True)
    else:
        # Embed the *test* features computed above and cache that very
        # result (the previous code referenced names not defined here).
        test_ft_emb = pages_to_word_vector_from_keylist('Laser', test_token_features, ['text-full'])
        if emb_cache_path is not None:
            cache_dir = os.path.dirname(emb_cache_path)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            np.save(emb_cache_path, test_ft_emb)

    test_tag_info_list = test_tag_features
    ## Tokens prepare
    test_pages_class, test_pages_query = prepare_input_ids(test_token_features, max_len)
    ## X_test_input
    test_composite_input = [test_ft_emb, test_ptags_vector, test_pages_class, test_pages_query, test_tag_info_list]

    ## y_test_input
    try:
        y_test = np.asarray(test_y_raw)
    except ValueError:
        # NumPy >= 1.24 refuses to build an array from pages of different
        # lengths; fall back to a 1-D object array holding one label
        # sequence per page (what older NumPy versions produced).
        y_test = np.empty(len(test_y_raw), dtype=object)
        for page_idx, page_labels in enumerate(test_y_raw):
            y_test[page_idx] = page_labels

    return test_composite_input, y_test

In [44]:
def node_level_score(y_pred, y_true):
    """Collect PAGE/NEXT precision, recall and F1 from the flat report.

    Args:
        y_pred: Predicted label sequences.
        y_true: Reference label sequences.

    Returns:
        Dict with the keys ``page_prec``, ``page_rec``, ``page_f1``,
        ``next_prec``, ``next_rec`` and ``next_f1`` copied from the
        ``flat_classification_report`` output for the two labels.
    """
    reports = flat_classification_report(
        y_true, y_pred, labels=["PAGE", "NEXT"], digits=3, output_dict=True
    )

    # (output key, label in the report, metric name in the report)
    layout = (
        ("page_prec", "PAGE", "precision"),
        ("page_rec", "PAGE", "recall"),
        ("page_f1", "PAGE", "f1-score"),
        ("next_prec", "NEXT", "precision"),
        ("next_rec", "NEXT", "recall"),
        ("next_f1", "NEXT", "f1-score"),
    )
    record = {key: reports[label][metric] for key, label, metric in layout}
    print("Finish page ")
    return record


def page_level_score(y_pred, y_true):
    """Average the per-page PAGE/NEXT precision, recall and F1.

    A classification report is computed separately for every page whose
    reference labels contain at least one of "NEXT", "PAGE" or "PREV"; the
    six metrics are then averaged over those pages.

    Args:
        y_pred: Predicted label sequence for each page.
        y_true: Reference label sequence for each page, aligned with
            ``y_pred``.

    Returns:
        Dict with the keys ``page_prec``, ``page_rec``, ``page_f1``,
        ``next_prec``, ``next_rec`` and ``next_f1``. Every value is ``0.0``
        when no page carries a pagination label (this used to raise
        ``ZeroDivisionError``).

    NOTE(review): ``classification_report`` is not among the imports visible
    at the top of the notebook; confirm it is imported before this cell runs
    (the ``zero_division`` argument suggests ``sklearn.metrics``).
    """
    # (output key, label in the report, metric name in the report)
    layout = (
        ("page_prec", "PAGE", "precision"),
        ("page_rec", "PAGE", "recall"),
        ("page_f1", "PAGE", "f1-score"),
        ("next_prec", "NEXT", "precision"),
        ("next_rec", "NEXT", "recall"),
        ("next_f1", "NEXT", "f1-score"),
    )
    totals = {key: 0 for key, _, _ in layout}
    size = 0
    for page_pred, page_true in zip(y_pred, y_true):
        # Open question from the original author: are false positives being
        # missed? Pages without any pagination label are skipped entirely,
        # so predictions made on them never influence the averages.
        if (
            "NEXT" not in page_true
            and "PAGE" not in page_true
            and "PREV" not in page_true
        ):
            continue
        size += 1
        reports = classification_report(
            page_true,
            page_pred,
            labels=["PAGE", "NEXT"],
            digits=3,
            output_dict=True,
            zero_division=0,
        )
        for key, label, metric in layout:
            totals[key] += reports[label][metric]

    if size == 0:
        # Nothing to average over: report zeros instead of dividing by zero.
        record = {key: 0.0 for key in totals}
    else:
        record = {key: total / size for key, total in totals.items()}
    print("Finish page ")
    return record


In [45]:
def evaluate_from_batch(model, x, y, evaluate_labels, multiTask=False):
    """Predict every test page, print the reports and return a summary score.

    Args:
        model: Callable applied to one dataset element at a time. The
            prediction is read from ``model(batch)[0]`` (or
            ``model(batch)[0][0]`` when ``multiTask`` is true) via
            ``.numpy()``.
        x: Composite input passed to ``composite_list_to_dataset``.
        y: Reference label sequences, indexable by page position.
        evaluate_labels: Accepted for interface compatibility; not used by
            this function (the reported labels are fixed to PAGE and NEXT).
        multiTask: Select the first output of a multi-output model.

    Returns:
        The mean of the node-level PAGE F1 (flat report) and the page-level
        NEXT F1 (``page_level_score``).
    """
    print("Start predicting test data ...")
    test_page_dataset = composite_list_to_dataset(x)
    predicted_y = []
    for pageIdx, batch_x_test in enumerate(test_page_dataset):
        if len(y[pageIdx]) == 0:
            # Page without labels: the model is not run for it.
            batch_predict_y = np.array([])
        else:
            outputs = model(batch_x_test)
            head = outputs[0][0] if multiTask else outputs[0]
            batch_predict_y = head.numpy()

        if len(batch_predict_y.shape) != 1:
            # Not already a 1-D sequence of label ids: take, per row, the
            # index of the highest score (first index on ties).
            rows = [row.tolist() for row in batch_predict_y]
            batch_predict_y = [row.index(max(row)) for row in rows]
        predicted_y.append(batch_predict_y)
    print("Start evaluating test data ...")
    # Kept as plain nested lists: pages differ in length, and NumPy >= 1.24
    # raises ValueError when asked to build an array from ragged sequences.
    predict_y = [[idx2tag.get(lab) for lab in page] for page in predicted_y]

    print("Node level classification report:")
    micro_report = flat_classification_report(
        y, predict_y, digits=3, labels=["PAGE", "NEXT"]
    )
    print(micro_report)
    micro_report_dict = flat_classification_report(
        y, predict_y, digits=3, labels=["PAGE", "NEXT"], output_dict=True
    )
    print(
        "Page level macro (For caculation methology please refer to the docs)"
    )
    macro_report = page_level_score(predict_y, y)
    print(macro_report)

    # NOTE(review): the original expression was
    #   (0.5 * (A + B) + 0.5 * (A + B)) / 2
    # with A = node-level PAGE F1 and B = page-level NEXT F1, i.e. the same
    # term twice, which reduces to (A + B) / 2. Confirm whether the two
    # halves were meant to combine different metrics.
    return (micro_report_dict["PAGE"]["f1-score"] + macro_report["next_f1"]) / 2


In [46]:
def evaluate_model(model, target="all"):
    """Evaluate ``model`` on the test set of one or several languages.

    Args:
        model: Model handed to ``evaluate_from_batch``.
        target: ``"all"`` evaluates the built-in language list; any other
            value is used as the single language to evaluate.

    Returns:
        The mean of the per-language scores returned by
        ``evaluate_from_batch`` (for a single target this is simply that
        language's score). Previously only the score of the last language in
        the list was returned, silently discarding the others.
    """
    # test_languages = storage.get_all_test_languages()
    test_languages = ["en", "de", "ru", "zh", "ja", "ko"]
    if target != "all":
        test_languages = [target]

    scores = []
    for language in test_languages:
        print("Testing language: ", language)
        test_urls = [
            rec["Page URL"]
            for rec in storage.iter_test_records_by_language(language=language)
        ]
        test_X_raw, test_y = storage.get_test_Xy_by_language(language=language)
        print(
            "pages: {}  domains: {}".format(
                len(test_urls), len({get_domain(url) for url in test_urls})
            )
        )
        _test_x, _test_y = prepare_for_testing(test_X_raw, test_y)
        scores.append(
            evaluate_from_batch(model, _test_x, _test_y, ["PAGE", "NEXT"])
        )
        print("===================================")

    if not scores:
        # Mirrors the original initial value when nothing was evaluated.
        return 0
    return sum(scores) / len(scores)


In [47]:
def calculate_macro_avg(reports):
    """Mean of the 'macro avg' F1 score over all reports.

    Args:
        reports: Mapping whose values are report dicts exposing
            ``['macro avg']['f1-score']``; the keys are not used.

    Returns:
        The accumulated F1 divided by ``len(reports)``. An empty mapping
        raises ``ZeroDivisionError``.
    """
    f1_total = 0
    for language_report in reports.values():
        f1_total = f1_total + language_report['macro avg']['f1-score']
    return f1_total / len(reports)

## Page/Node level evaluation

In [48]:
def get_test_data(type=None, scaled_page='normal'):
    """Load test pages, labels and page positions from ``storage``.

    Args:
        type: ``'NORMAL'`` or ``'EVENT_SOURCE'`` loads only that test file;
            any other non-``None`` value loads both and concatenates them
            (NORMAL first). ``None`` prints a hint and loads nothing.
        scaled_page: Forwarded unchanged to ``storage.get_test_Xy``.

    Returns:
        Tuple ``(X, y, positions)``, or ``(None, None, None)`` when ``type``
        is ``None``.

    Side effects:
        Sets ``storage.test_file`` to each source that is loaded; it keeps
        the value of the last one.
    """
    if type is None:
        print("Please assign type of test_data")
        return (None, None, None)

    def _load(source):
        # Point storage at the requested file, then read URLs and data.
        storage.test_file = source
        urls = [rec['Page URL'] for rec in storage.iter_test_records(exclude_en=None)]
        pages, labels, positions = storage.get_test_Xy(
            validate=False, contain_position=True, scaled_page=scaled_page, exclude_en=None
        )
        domains = {get_domain(url) for url in urls}
        print("pages: {}  domains: {}".format(len(urls), len(domains)))
        return pages, labels, positions

    if type == 'NORMAL':
        return _load('NORMAL')
    if type == 'EVENT_SOURCE':
        return _load('EVENT_SOURCE')

    # Any other type: both sources, NORMAL first.
    normal_X, normal_y, normal_positions = _load('NORMAL')
    event_X, event_y, event_positions = _load('EVENT_SOURCE')
    return (
        normal_X + event_X,
        normal_y + event_y,
        normal_positions + event_positions,
    )

def evaluate_test(test_X_raw, test_y, model):
    """Prepare the given test data and run the batch evaluation on it.

    Sets ``storage.test_file`` to ``'EVENT_SOURCE'``, converts the raw data
    with ``prepare_for_testing`` and evaluates the PAGE/NEXT labels through
    ``evaluate_from_batch``. The score computed there is discarded.

    Returns:
        None.
    """
    storage.test_file = 'EVENT_SOURCE'
    prepared_x, prepared_y = prepare_for_testing(test_X_raw, test_y)
    evaluate_from_batch(model, prepared_x, prepared_y, ['PAGE', 'NEXT'])
    return None

# Load only the 'EVENT_SOURCE' test file: raw pages, labels and page positions.
# (get_test_data also prints the page/domain counts shown below.)
test_X_raw, test_y, test_page_positions = get_test_data('EVENT_SOURCE')

pages: 100  domains: 53


## Train and Evaluate model

In [83]:
# Build the model and its loss function with the CRF option switched on,
# embedding_size=32 and hidden_size=300.
# NOTE(review): this cell's execution count (83) is higher than that of the
# optimizer cell below (51) -- re-run the notebook top to bottom to confirm
# the cells still work in document order.
model, loss_fn = get_custom_emb_model(use_crf=True, embedding_size = 32, hidden_size = 300)

In [None]:
# Show the architecture summary of the model built above.
model.summary()

In [51]:
# Adam optimizer with its default settings (no arguments are passed).
optimizer = keras.optimizers.Adam()

In [84]:
# Train the model on train_dataset / val_dataset; the leading 2 is presumably
# the number of epochs -- TODO confirm against train_on_epoch's signature.
# NOTE(review): the recorded output of this cell ends in a GPU
# ResourceExhaustedError (OOM inside layer 'Conv_char_3'), so the run saved in
# this notebook did not finish training.
model, avg_epoch_result = train_on_epoch(2, model, optimizer, train_dataset, val_dataset)


Start of epoch 0


ResourceExhaustedError: Exception encountered when calling layer 'Conv_char_3' (type Conv2D).

{{function_node __wrapped__Conv2D_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[2379,64,1019,1] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Conv2D]

Call arguments received by layer 'Conv_char_3' (type Conv2D):
  • inputs=tf.Tensor(shape=(1, 2379, 1024, 32, 1), dtype=float32)

In [None]:
# Evaluate on the single 'event' target: evaluate_model uses any target other
# than "all" as the only test language.
score = evaluate_model(model, target='event')

In [None]:
# Load a previously saved model from saved_model/0416 without compiling it.
# This rebinds `model`, replacing the model built/trained in the cells above.
model = tf.keras.models.load_model("saved_model/0416", compile=False)

In [None]:
# test_X_raw, test_y = storage.get_test_Xy_by_language(language="event")
# _test_x, _test_y = prepare_for_testing(test_X_raw, test_y)
# evaluate_from_batch(model, _test_x, _test_y, ["PAGE", "NEXT"])

In [None]:
# Save the current `model` under a directory named after the current local
# time, formatted as month-day, dash, hour-minute (e.g. "0416-1530").
from datetime import datetime
time_stamp = datetime.now().strftime("%m%d-%H%M")

model.save(f'saved_model/{time_stamp}')