In [2]:
from model import create_VST_model
import tensorflow as tf
from transformers import BertTokenizer
import os
import numpy as np
import re
import cv2
from keras.mixed_precision import policy
from keras.utils import tf_inspect
import pandas as pd


Successfully load the BertTokenizer


In [3]:
# --- Regular expressions used while cleaning the texture (source text) data ---
pattern = "[A-Z]"
pattern1 = '["\\[\\]\\\\]'
pattern2 = "[*.+!$#&,;{}()':=/<>%-]"
pattern3 = '[_]'

# --- Basic numeric parameters ---
max_len, max_words = 100, 1000
training_samples, validation_samples = 147, 63

# --- Per-sample containers, filled in by the preprocessing functions ---
# every kind of data, structure information, texture information
data_set, data_structure, data_texture = {}, {}, {}
# token ids, position ids and segment ids of each text
data_token, data_position, data_segment = {}, {}, {}
# cleaned text of each sample, and the picture array of each sample
string_content, data_picture = {}, {}

# names of the processed files, and the picture arrays in load order
file_name, data_image = [], []

# --- Experimental part: sample lists (originally for random shuffling) ---
all_data, train_data, test_data = [], [], []

# --- Model inputs, one entry per selected sample ---
structure, image, token, segment = [], [], [], []


In [4]:
# Define the basic bert class
class BertConfig(object):
    """Plain container for the BERT hyper-parameters.

    Every option can be overridden through a keyword argument of the same
    name; options that are not supplied fall back to the defaults listed in
    ``__init__``. Keyword arguments that do not match an option are ignored.
    """

    def __init__(self, **kwargs):
        super().__init__()
        # Option name -> default value; the insertion order is the order in
        # which the attributes are created on the instance.
        defaults = {
            'vocab_size': 30000,
            'type_vocab_size': 300,
            'hidden_size': 768,
            'num_hidden_layers': 12,
            'num_attention_heads': 12,
            'intermediate_size': 3072,
            'hidden_activation': 'gelu',
            'hidden_dropout_rate': 0.1,
            'attention_dropout_rate': 0.1,
            'max_position_embeddings': 200,
            'max_sequence_length': 200,
        }
        for option, fallback in defaults.items():
            setattr(self, option, kwargs.pop(option, fallback))


class BertEmbedding(tf.keras.layers.Layer):
    """BERT input embedding: token + token-type + position embeddings.

    The three embeddings are summed, layer-normalised and passed through
    dropout. With ``mode='linear'`` the token embedding matrix is reused as an
    output projection instead (see ``call``).
    """

    def __init__(self, config, **kwargs):
        """Create the embedding tables and the normalisation/dropout sub-layers.

        Args:
            config: object exposing ``vocab_size``, ``hidden_size``,
                ``type_vocab_size``, ``max_position_embeddings`` and
                ``hidden_dropout_rate`` (for example a ``BertConfig``).
            **kwargs: accepted but not forwarded to the base layer; the layer
                name is always ``'BertEmbedding'``.
        """
        super().__init__(name='BertEmbedding')
        self.vocab_size = config.vocab_size
        self.hidden_size = config.hidden_size
        # The weight name is passed by keyword so the call does not depend on
        # the positional parameter order of `add_weight`.
        self.token_embedding = self.add_weight(
            name='weight',
            shape=[self.vocab_size, self.hidden_size],
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        )
        self.type_vocab_size = config.type_vocab_size

        self.position_embedding = tf.keras.layers.Embedding(
            config.max_position_embeddings,
            config.hidden_size,
            embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            name='position_embedding'
        )
        self.token_type_embedding = tf.keras.layers.Embedding(
            config.type_vocab_size,
            config.hidden_size,
            embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            name='token_type_embedding'
        )
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name='LayerNorm')
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_rate)

    def build(self, input_shape):
        """Run the base-class build inside the ``bert_embeddings`` name scope."""
        with tf.name_scope('bert_embeddings'):
            super().build(input_shape)

    def call(self, inputs, training=False, mode='embedding'):
        """Embed the inputs, or project hidden states back onto the vocabulary.

        Args:
            inputs: for ``mode='linear'`` a tensor that is multiplied with the
                transposed token embedding matrix; otherwise a pair
                ``(input_ids, token_type_ids)`` where ``token_type_ids`` may
                be ``None`` (treated as all zeros).
            training: forwarded to the dropout layer.
            mode: ``'linear'`` selects the output projection; any other value
                selects the embedding path.

        Returns:
            The projected tensor (``mode='linear'``) or the summed,
            normalised embeddings after dropout.
        """
        # used for masked lm
        if mode == 'linear':
            return tf.matmul(inputs, self.token_embedding, transpose_b=True)

        input_ids, token_type_ids = inputs
        input_ids = tf.cast(input_ids, dtype=tf.int32)

        # Prefer the static sequence length; fall back to the dynamic shape so
        # inputs with an unknown length do not fail in `tf.range`.
        seq_len = input_ids.shape[1]
        if seq_len is None:
            seq_len = tf.shape(input_ids)[1]
        position_ids = tf.range(seq_len, dtype=tf.int32)[tf.newaxis, :]

        if token_type_ids is None:
            # `zeros_like` also works when the batch dimension is unknown,
            # unlike `tf.fill` with a static shape list containing None.
            token_type_ids = tf.zeros_like(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        token_type_embeddings = self.token_type_embedding(token_type_ids)
        token_embeddings = tf.gather(self.token_embedding, input_ids)

        # position_embeddings has a leading axis of size 1 and broadcasts
        # over the batch.
        embeddings = token_embeddings + token_type_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings, training=training)
        return embeddings

    def get_config(self):
        """Return a serialisable dictionary describing this layer.

        The dictionary holds ``name``, ``trainable`` and ``dtype``, plus
        ``batch_input_shape`` and ``dynamic`` when applicable.

        NOTE(review): the ``config`` argument required by ``__init__`` is not
        part of the returned dictionary, so the layer cannot be rebuilt from
        this config alone. The body appears to be adapted from the generic
        Keras ``Layer.get_config`` — confirm before relying on it for model
        serialisation.

        Returns:
            Python dictionary.
        """
        all_args = tf_inspect.getfullargspec(self.__init__).args
        config = {
            'name': self.name,
            'trainable': self.trainable,
        }
        if hasattr(self, '_batch_input_shape'):
            config['batch_input_shape'] = self._batch_input_shape
        config['dtype'] = policy.serialize(self._dtype_policy)
        if hasattr(self, 'dynamic'):
            # Only include `dynamic` in the `config` if it is `True`
            if self.dynamic:
                config['dynamic'] = self.dynamic
            elif 'dynamic' in all_args:
                all_args.remove('dynamic')
        expected_args = config.keys()
        # Finds all arguments in the `__init__` that are not in the config:
        extra_args = [arg for arg in all_args if arg not in expected_args]
        # Raise only when `__init__` has extra arguments AND this method
        # carries the `_is_default` marker attribute.
        if len(extra_args) > 1 and hasattr(self.get_config, '_is_default'):
            raise NotImplementedError('Layer %s has arguments in `__init__` and '
                                      'therefore must override `get_config`.' %
                                      self.__class__.__name__)
        return config


# Load the pretrained tokenizer once at module level; the texture
# preprocessing function below uses it to turn cleaned text into token ids.
# NOTE(review): `from_pretrained` presumably needs network access or a local
# cache for this checkpoint name — confirm for offline runs.
tokenizer = BertTokenizer.from_pretrained('google/bert_uncased_L-12_H-768_A-12')
print('Successfully load the BertTokenizer')

Successfully load the BertTokenizer


In [5]:
# Build the network with the project-local factory imported from `model`.
model = create_VST_model()

Metal device set to: Apple M1 Max


  super().__init__(name, **kwargs)


In [6]:
# Restore the weights from the HDF5 checkpoint; the path is relative to the
# directory the notebook is run from.
model.load_weights('../Experimental output/VST_BEST.hdf5')

In [7]:
# Input directories of the three modalities (structure, texture, image),
# relative to the directory the notebook is run from.
structure_dir, texture_dir, picture_dir = (
    '../OUR_Dataset/Processed Dataset/Structure',
    '../OUR_Dataset/Processed Dataset/Texture',
    '../OUR_Dataset/Processed Dataset/Image',
)

In [8]:
def preprocess_new_structure_data(structure_dir, max_elements=305):
    """Read every structure file in ``structure_dir`` into an integer matrix.

    Each non-hidden file is parsed line by line: a line is a whitespace
    separated list of integers, of which at most ``max_elements`` are kept.
    The sample name (file name up to its first ``.``) is appended to the
    module-level ``file_name`` list and the resulting array is stored in the
    module-level ``data_structure`` dictionary under that name.

    Args:
        structure_dir: directory containing the structure files.
        max_elements: maximum number of values kept per line (default 305).

    Returns:
        The module-level ``data_structure`` dictionary.

    Raises:
        ValueError: if a field cannot be parsed as an integer.
    """
    for f_name in os.listdir(structure_dir):
        # Skip hidden entries before opening anything, so no handle is
        # created (and leaked) for files that are ignored anyway.
        if f_name.startswith('.'):
            continue

        sample_id = f_name.split('.')[0]
        file_name.append(sample_id)

        rows = []
        # `with` guarantees the file is closed even if parsing fails.
        with open(os.path.join(structure_dir, f_name), errors='ignore') as f:
            for line in f:
                # split() without arguments tolerates repeated spaces, tabs
                # and trailing newlines, none of which yield empty fields.
                fields = line.split()
                if not fields:
                    # Blank line: nothing to parse.
                    continue
                rows.append(np.asarray([int(value) for value in fields[:max_elements]]))

        data_structure[sample_id] = np.asarray(rows)
    return data_structure

In [9]:
def preprocess_new_texture_data(texture_dir):
    """Clean every ``.txt`` file in ``texture_dir`` and derive its model inputs.

    For each file the text is split into words (camelCase boundaries, the
    characters matched by ``pattern1``/``pattern2`` and underscores act as
    separators). Single alphabetic characters are dropped. The results are
    written to module-level dictionaries keyed by the file name up to its
    first ``.``:

    * ``string_content``: the kept words, each preceded by one space;
    * ``data_position``: position ids, exactly ``max_len`` entries;
    * ``data_segment``: the line index of each kept word, padded to
      ``max_len`` with the total number of lines;
    * ``data_token``: tokenizer ids, truncated/zero-padded to ``max_len``.

    Token ids are (re)computed for every entry in ``string_content``, not only
    for the files read by this call.

    NOTE(review): position/segment ids are counted per whitespace-separated
    word, while token ids come from ``tokenizer.tokenize``; the two sequences
    may therefore not line up element by element — confirm this is intended.

    Args:
        texture_dir: directory containing the ``.txt`` files.
    """
    for f_name in os.listdir(texture_dir):
        if f_name[-4:] != ".txt":
            continue

        sample_id = f_name.split('.')[0]
        kept_words = []
        list_position = []
        list_segment = []
        segment_id = 0  # index of the line currently being read
        count = 0       # number of kept words so far (may exceed max_len)

        # `with` guarantees the file is closed even if cleaning fails.
        with open(os.path.join(texture_dir, f_name), errors='ignore') as f:
            for content in f:
                # Split camelCase identifiers: "fooBar" -> "foo Bar".
                content = re.sub(r"([a-z]+)([A-Z]+)", r"\1 \2", content)
                # Surround quotes/brackets/backslashes and punctuation with
                # spaces so that they become separate words.
                content = re.sub(pattern1, r" \g<0> ", content)
                content = re.sub(pattern2, r" \g<0> ", content)
                # Underscores act as plain word separators.
                content = re.sub(pattern3, " ", content)
                for item in content.split():
                    if len(item) > 1 or not item.isalpha():
                        kept_words.append(item)
                        if count < max_len:
                            # While below max_len the position id equals the
                            # running word count.
                            list_position.append(count)
                            list_segment.append(segment_id)
                        count += 1
                segment_id += 1

        # Pad both id lists up to max_len.
        while count < max_len:
            list_segment.append(segment_id)
            list_position.append(count)
            count += 1

        string_content[sample_id] = ''.join(' ' + word for word in kept_words)
        data_position[sample_id] = list_position
        data_segment[sample_id] = list_segment

    for sample, text in string_content.items():
        list_token = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        list_token = list_token[:max_len]
        # Zero-pad short sequences; a non-positive count adds nothing.
        list_token += [0] * (max_len - len(list_token))
        data_token[sample] = list_token



In [10]:
def preprocess_new_picture_data(picture_dir):
    """Load, resize and rescale every image in ``picture_dir``.

    Non-hidden files whose extension is ``.jpg``, ``.jpeg`` or ``.png``
    (compared case-insensitively) are read with OpenCV, resized to 128x128
    and divided by 255.0. Each result is stored in the module-level
    ``data_picture`` dictionary under the file name up to its first ``.`` and
    also appended to the module-level ``data_image`` list.

    Args:
        picture_dir: directory containing the image files.

    Raises:
        ValueError: if a matching file cannot be decoded as an image.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    for f_name in os.listdir(picture_dir):
        if f_name.startswith('.'):
            continue
        # Compare the real extension: a fixed 4-character suffix can never
        # equal the 5-character '.jpeg'.
        if os.path.splitext(f_name)[1].lower() not in image_extensions:
            continue

        img_path = os.path.join(picture_dir, f_name)
        img_data = cv2.imread(img_path)
        if img_data is None:
            # imread signals an unreadable file by returning None rather
            # than raising; fail here with a message that names the file.
            raise ValueError('Could not read image file: %s' % img_path)

        img_data = cv2.resize(img_data, (128, 128))
        scaled = img_data / 255.0
        data_picture[f_name.split('.')[0]] = scaled
        data_image.append(scaled)



In [11]:
def prepare_data_for_prediction(max_samples=69):
    """Select samples and collect their per-modality inputs for prediction.

    The first ``max_samples`` names of the module-level ``file_name`` list are
    appended to ``all_data``; for each of them the structure, picture, token
    and segment entries are appended to the module-level ``structure``,
    ``image``, ``token`` and ``segment`` lists, in the same order.

    Only the names added by this call are processed, so the four input lists
    stay aligned with ``all_data`` when the function is called more than once.

    Args:
        max_samples: maximum number of samples to take from ``file_name``
            (default 69); ``None`` takes all of them.

    Raises:
        ValueError: if ``max_samples`` is negative.
        KeyError: if a selected sample is missing from any modality. Nothing
            is appended in that case.
    """
    if max_samples is not None and max_samples < 0:
        raise ValueError('max_samples must be non-negative or None')

    selected = file_name[:max_samples]

    # Validate first so that a missing modality cannot leave the global lists
    # partially filled and out of step with each other.
    modalities = (data_structure, data_picture, data_token, data_segment)
    missing = [name for name in selected
               if any(name not in modality for modality in modalities)]
    if missing:
        raise KeyError('samples missing from at least one modality: %s' % missing)

    all_data.extend(selected)
    for item in selected:
        structure.append(data_structure[item])
        image.append(data_picture[item])
        token.append(data_token[item])
        segment.append(data_segment[item])

In [12]:
# Run the three loaders first: each one fills its module-level dictionaries
# (the structure loader also fills `file_name`).
preprocess_new_structure_data(structure_dir)
preprocess_new_texture_data(texture_dir)
preprocess_new_picture_data(picture_dir)
# Must run last: it looks up every name from `file_name` in the dictionaries
# populated by the three calls above.
prepare_data_for_prediction()


In [13]:
# format the data
structure = np.asarray(structure,  dtype=np.float32)
image = np.asarray(image,  dtype=np.float32)
token = np.asarray(token,  dtype=np.float32)
segment = np.asarray(segment,  dtype=np.float32)

In [14]:
# Inputs are passed as a list in the order structure, token, segment, image.
# NOTE(review): this order presumably has to match the input order of the
# model returned by `create_VST_model` — confirm against that function.
data_to_predict = [structure, token, segment, image]
result = model.predict(data_to_predict)

2023-06-02 17:24:23.020290: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




In [15]:
# Flatten the model output to one float per sample. Converting explicitly
# avoids relying on the implicit size-1-array -> scalar conversion, which
# NumPy has deprecated.
predictions_column = np.asarray(result, dtype=float).reshape(-1).tolist()
if len(predictions_column) > len(all_data):
    raise ValueError(
        'Got %d prediction values for %d samples; expected one value per sample.'
        % (len(predictions_column), len(all_data))
    )
# Predictions follow the order of `all_data`, so the names are paired by
# position.
file_names_column = list(all_data[:len(predictions_column)])

data = {
    'file_Name': file_names_column,
    'readability_score': predictions_column
}

prediction_df = pd.DataFrame(data)
prediction_df['readability_score'] = prediction_df['readability_score'].astype(float)
# Round the score to the nearest integer value to obtain the 0/1 label.
prediction_df['readability'] = prediction_df['readability_score'].round()

In [16]:
# Display the prediction table as the cell's output.
prediction_df

Unnamed: 0,file_Name,readability_score,readability
0,27,0.115989,0.0
1,1,0.017252,0.0
2,15,0.993372,1.0
3,8,0.179276,0.0
4,32,0.0,0.0
5,42,0.206883,0.0
6,19,0.999957,1.0
7,47,0.907458,1.0
8,37,0.06961,0.0
9,4,0.765248,1.0
