In [1]:
import pickle
import time
import keras
import time

from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing import sequence
from keras.preprocessing import text
from keras import layers

import tensorflow as tf

Parameters

In [2]:
# Saved Keras model that is loaded below and used to turn place descriptions into vectors.
model_path = '../models/tc_model_2_1681463923_tt_1681463733.h5'
# Pickled tokenizer used to convert description text into integer sequences.
tokenizer_path = '../models/tokenizer_1681463733.pickle'
# Pickled detailed places data (the input of this notebook).
detailed_data_path = '../data/places_krakow_detailed_1680895219.pickle'
# City name; becomes part of the output file name.
city_name = 'krakow'
# Unix timestamp (whole seconds) taken when this cell runs; becomes part of the output file name.
creation_time = int(time.time())
# Length that token sequences are padded/truncated to before being fed to the model.
maxlen = 500

Custom layers used by the transformer model

In [3]:
class TransformerBlock(layers.Layer):
    """Transformer block: multi-head self-attention followed by a feed-forward network.

    Each of the two stages is followed by dropout, a residual connection and
    layer normalization.

    Args:
        embed_dim: used as `key_dim` of the attention layer and as the number of
            units of the second feed-forward dense layer. The block output is
            added to the block input, so it should match the input's last
            dimension.
        num_heads: number of attention heads.
        ff_dim: number of units of the first (hidden) feed-forward dense layer.
        ff_act: activation of the hidden feed-forward dense layer.
        ff_reg: kernel regularizer for both feed-forward dense layers.
        ff_d: rate of the two dropout layers applied after the attention and
            after the feed-forward network.
        mh_reg: kernel regularizer of the attention layer.
        mh_d: dropout rate inside the attention layer.
        norm_eps: epsilon of both layer normalizations.
        **kwargs: forwarded to `layers.Layer`.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, ff_act='relu', ff_reg=None, ff_d=0.25,
                 mh_reg=None, mh_d=0.1, norm_eps=1e-6, **kwargs):
        super().__init__(**kwargs)

        # NOTE: sublayers are created in a fixed order on purpose; keep it so the
        # weights of previously saved models still line up.

        # multi head attention
        self.att = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim,
            kernel_regularizer=mh_reg,
            dropout=mh_d
        )

        # feed forward network
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation=ff_act, kernel_regularizer=ff_reg),
            layers.Dense(embed_dim, kernel_regularizer=ff_reg)
        ])

        # layer normalizations
        self.layernorm1 = layers.LayerNormalization(epsilon=norm_eps)
        self.layernorm2 = layers.LayerNormalization(epsilon=norm_eps)

        # dropout layers
        self.dropout1 = layers.Dropout(ff_d)
        self.dropout2 = layers.Dropout(ff_d)

        # remember constructor arguments for serialization (see get_config)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.ff_act = ff_act
        self.ff_reg = ff_reg
        self.ff_d = ff_d
        self.mh_reg = mh_reg
        self.mh_d = mh_d
        self.norm_eps = norm_eps

    def get_config(self):
        """Return the base layer config extended with this block's constructor arguments."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "ff_act": self.ff_act,
            # NOTE(review): the regularizers are stored exactly as they were passed in;
            # if they are objects rather than None, confirm they serialize as expected.
            "ff_reg": self.ff_reg,
            "ff_d": self.ff_d,
            "mh_reg": self.mh_reg,
            "mh_d": self.mh_d,
            "norm_eps": self.norm_eps
        })
        return config

    def call(self, inputs, training=None):
        """Apply the block to `inputs`.

        Args:
            inputs: tensor used as both query and value of the attention layer.
            training: forwarded to the two dropout layers. Defaults to None so
                the layer can also be called without the argument.

        Returns:
            The layer-normalized sum of the attention stage output and the
            feed-forward network output.
        """
        # self-attention: the inputs attend to themselves
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        # first residual connection + normalization
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        # second residual connection + normalization
        return self.layernorm2(out1 + ffn_output)

In [4]:
class TokenAndPositionEmbedding(layers.Layer):
    """Layer that sums a token embedding and a position embedding.

    Args:
        maxlen: `input_dim` of the position embedding.
        vocab_size: `input_dim` of the token embedding.
        embed_dim: `output_dim` of both embeddings.
        embed_reg: embeddings regularizer applied to both embeddings.
        **kwargs: forwarded to `layers.Layer`.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, embed_reg=None, **kwargs):
        super().__init__(**kwargs)

        # constructor arguments, kept so get_config can report them
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.embed_reg = embed_reg

        # token embedding is created first, position embedding second
        self.token_emb = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim, embeddings_regularizer=embed_reg
        )
        self.pos_emb = layers.Embedding(
            input_dim=maxlen, output_dim=embed_dim, embeddings_regularizer=embed_reg
        )

    def call(self, x):
        """Embed the tokens in `x` and add the embedding of each position index."""
        # position indices 0 .. (size of the last axis of x) - 1
        sequence_length = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=sequence_length, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""
        config = super().get_config()
        config["maxlen"] = self.maxlen
        config["vocab_size"] = self.vocab_size
        config["embed_dim"] = self.embed_dim
        config["embed_reg"] = self.embed_reg
        return config

### Add the vector column to detailed data

This function loads the neural network model, the tokenizer, and the detailed places data.

In [5]:
def load(model_path, tokenizer_path, detailed_data_path):
    """Read the tokenizer, the places data and the Keras model from disk.

    NOTE: the tokenizer and the places data are read with `pickle.load`, which
    can execute arbitrary code while loading; only point this at trusted files.

    Args:
        model_path: path of the saved Keras model.
        tokenizer_path: path of the pickled tokenizer.
        detailed_data_path: path of the pickled places data.

    Returns:
        Tuple `(model, tokenizer, data)`.
    """
    def _unpickle(path):
        # open in binary mode and deserialize a single object
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    tokenizer = _unpickle(tokenizer_path)
    data = _unpickle(detailed_data_path)

    # the saved model contains these custom layers, so Keras needs their classes to rebuild it
    custom_layers = {
        "TokenAndPositionEmbedding": TokenAndPositionEmbedding,
        "TransformerBlock": TransformerBlock,
    }
    model = keras.models.load_model(filepath=model_path, custom_objects=custom_layers)

    return model, tokenizer, data

These functions build a text description of each place (the default one concatenates the place's types, summary, and reviews).

In [6]:
def default_describer(place):
    """Build a description of `place` from its types, summary and reviews.

    Reads `place[2]` (types), `place[8]` (summary, may be None) and `place[9]`
    (reviews). Underscores in types become spaces; newlines and backslashes are
    removed from reviews. Every part is followed by a single space, and finally
    each pair of consecutive spaces is replaced by one space (a single pass, so
    longer runs of spaces are shortened but not fully collapsed).
    """
    parts = [type_name.replace('_', ' ') for type_name in place[2]]

    summary = place[8]
    if summary is not None:
        parts.append(summary)

    parts.extend(text.replace('\n', '').replace('\\', '') for text in place[9])

    joined = ''.join(part + ' ' for part in parts)
    return joined.replace('  ', ' ')

In [7]:
def summary_reviews_describer(place):
    """Build a description of `place` from its summary and reviews only.

    Reads `place[8]` (summary, may be None) and `place[9]` (reviews). Newlines
    and backslashes are removed from reviews. Every part is followed by a single
    space, and finally each pair of consecutive spaces is replaced by one space
    (a single pass, so longer runs of spaces are shortened but not fully
    collapsed).
    """
    summary = place[8]
    head = '' if summary is None else summary + ' '

    tail = ''.join(
        text.replace('\n', '').replace('\\', '') + ' '
        for text in place[9]
    )

    return (head + tail).replace('  ', ' ')

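This function appends to each place the vector that the model produces for its description; places whose description is shorter than the length threshold are left out of the result.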

In [8]:
def vectorize_data(data, model, tokenizer, describer=default_describer, description_length_threshold=400,
                   sequence_length=None):
    """Append the model's output vector to every place with a long enough description.

    The input places are not modified: each returned row is a new list made of
    the original place fields followed by the vector, so re-running this
    function on the same `data` gives the same result.

    Args:
        data: iterable of places (sequences of fields accepted by `describer`).
        model: object with a `predict(x, verbose=0)` method; the first row of
            its output is used as the place vector.
        tokenizer: object with a `texts_to_sequences` method.
        describer: function that turns a place into its text description.
        description_length_threshold: minimum description length (in characters)
            for a place to be kept.
        sequence_length: length the token sequence is padded/truncated to.
            Defaults to the notebook-level `maxlen` setting.

    Returns:
        List of rows, one per kept place, each ending with that place's vector.
    """
    if sequence_length is None:
        # fall back to the value from the parameters cell
        sequence_length = maxlen

    result = []
    for place in data:
        description = describer(place)
        if len(description) < description_length_threshold:
            # too little text to describe the place; skip it
            continue

        sequences = tokenizer.texts_to_sequences([description])
        x = pad_sequences(sequences, maxlen=sequence_length)
        # predict on a batch of one and take its only row
        vector = model.predict(x, verbose=0)[0]
        # build a new row instead of appending to `place`, so `data` stays untouched
        result.append([*place, vector])

    return result

Use the functions defined above to load the model, the tokenizer, and the information about places.

In [9]:
# Load the model, tokenizer and places data from the paths set in the parameters cell.
model, tokenizer, data = load(model_path, tokenizer_path, detailed_data_path)

Vectorize loaded data

In [10]:
# Descriptions here are built from summary and reviews only (place types are not included).
data_vectorized = vectorize_data(data, model, tokenizer, describer=summary_reviews_describer)

Save the vectorized data

In [11]:
# Write the vectorized places to a pickle file named after the city and the creation timestamp.
with open(f'../data/places_{city_name}_vectorized_{creation_time}.pickle', 'wb') as file:
    pickle.dump(data_vectorized, file, protocol=pickle.HIGHEST_PROTOCOL)