In [1]:
import pickle
import time
import keras
import time

from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing import sequence
from keras.preprocessing import text

Parameters

In [2]:
# Model file later handed to keras.models.load_model by load().
model_path = '../models/tc_model_1_1680891414_tt_1680888903.h5'
# Pickle file from which load() reads the tokenizer.
tokenizer_path = '../models/tokenizer_1680888903.pickle'
# Pickle file from which load() reads the detailed places data.
detailed_data_path = '../data/places_krakow_detailed_1680895219.pickle'
# City name; becomes part of the output file name when the result is saved.
city_name = 'krakow'
# Current Unix time truncated to whole seconds; also part of the output file name.
creation_time = int(time.time())

The function loads the neural network model, the tokenizer, and the detailed places data

### Add the vector column to detailed data

In [3]:
def load(model_path, tokenizer_path, detailed_data_path):
    """Load the model, the pickled tokenizer and the pickled places data.

    Args:
        model_path: Path passed to ``keras.models.load_model``.
        tokenizer_path: Path of the pickle file containing the tokenizer.
        detailed_data_path: Path of the pickle file containing the places data.

    Returns:
        A ``(model, tokenizer, data)`` tuple.
    """
    def _unpickle(path):
        # NOTE: pickle.load can execute arbitrary code; only use trusted files.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    tokenizer = _unpickle(tokenizer_path)
    data = _unpickle(detailed_data_path)
    model = keras.models.load_model(model_path)

    return model, tokenizer, data

The function builds a text description of a single place (the default implementation concatenates the place's types, summary, and reviews)

In [4]:
def default_describer(place):
    """Build a single text description of a place.

    The description is made of, in order: every entry of ``place[2]`` with
    underscores turned into spaces, ``place[8]`` when it is not ``None``, and
    every entry of ``place[9]`` with newlines and backslashes removed. Each
    piece is followed by one space, and a single left-to-right pass then
    replaces every double space with a single space.

    Args:
        place: Indexable record; positions 2, 8 and 9 are read.

    Returns:
        The description string (it keeps a trailing space when non-empty).
    """
    types, summary, reviews = place[2], place[8], place[9]

    pieces = [kind.replace('_', ' ') for kind in types]
    if summary is not None:
        pieces.append(summary)
    pieces.extend(text.replace('\n', '').replace('\\', '') for text in reviews)

    joined = ''.join(piece + ' ' for piece in pieces)
    return joined.replace('  ', ' ')

The function appends an additional column to each place: the vector that the model predicts from the place's description

In [5]:
def vectorize_data(data, model, tokenizer, describer=default_describer):
    """Return the places with the model's output vector appended to each one.

    Every place is converted to text by ``describer``, tokenized with
    ``tokenizer.texts_to_sequences``, padded to the model's input length and
    passed through ``model.predict``. The predicted vector becomes the new
    last element of the place.

    Args:
        data: Iterable of places (sequences of fields). It is not modified.
        model: Model whose first layer exposes ``input_length`` and which
            provides ``predict``.
        tokenizer: Object providing ``texts_to_sequences``.
        describer: Callable mapping a place to its text description.

    Returns:
        A new list containing one new list per place: the original fields
        followed by the predicted vector. Empty input yields an empty list.
    """
    maxlen = model.layers[0].input_length
    places = list(data)
    if not places:
        # Nothing to vectorize; also avoids calling predict on an empty batch.
        return []

    descriptions = [describer(place) for place in places]
    sequences = tokenizer.texts_to_sequences(descriptions)
    x = pad_sequences(sequences, maxlen=maxlen)
    # One batched call instead of one predict() per place: predict() is built
    # for batch processing and is slow when invoked inside a Python loop.
    vectors = model.predict(x, verbose=0)

    # Build new rows instead of appending to the caller's objects, so running
    # this twice on the same data does not add a second vector to each place.
    return [list(place) + [vector] for place, vector in zip(places, vectors)]

Use the functions defined above to load the model, the tokenizer, and the information about places

In [6]:
# Load the three artifacts from the paths configured in the parameters cell.
model, tokenizer, data = load(model_path, tokenizer_path, detailed_data_path)

Metal device set to: Apple M1


2023-04-07 21:58:30.826376: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-04-07 21:58:30.826469: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Vectorize loaded data

In [7]:
# Uses the default describer (default_describer) since none is passed.
data_vectorized = vectorize_data(data, model, tokenizer)

2023-04-07 21:58:31.315471: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-04-07 21:58:31.539228: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-04-07 21:58:31.609851: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-04-07 21:58:34.206199: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Save the vectorized data

In [8]:
# The output file name embeds city_name and creation_time, so each run
# (with a different timestamp) writes to a separate file.
with open(f'../data/places_{city_name}_vectorized_{creation_time}.pickle', 'wb') as file:
    pickle.dump(data_vectorized, file, protocol=pickle.HIGHEST_PROTOCOL)