In [1]:
import pickle
import time
import keras
import time

from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing import sequence
from keras.preprocessing import text

Parameters

In [2]:
# Input artifacts: trained classifier, its fitted tokenizer, and the detailed places dump.
model_path = "../models/tc_model_1_1681229441_tt_1681228271.h5"
tokenizer_path = "../models/tokenizer_1681228271.pickle"
detailed_data_path = "../data/places_krakow_detailed_1680895219.pickle"

# City label that is embedded in the output file name.
city_name = "krakow"

# Unix timestamp (whole seconds) taken when this cell runs; used to tag the output file.
creation_time = int(time.time())

The function below loads the neural network model, the tokenizer, and the detailed places data.

### Add the vector column to detailed data

In [3]:
def load(model_path, tokenizer_path, detailed_data_path):
    """Load the model, the tokenizer and the detailed places data from disk.

    Args:
        model_path: path passed to ``keras.models.load_model``.
        tokenizer_path: path of the pickled tokenizer.
        detailed_data_path: path of the pickled places data.

    Returns:
        A ``(model, tokenizer, data)`` tuple.

    Note:
        ``pickle.load`` can execute arbitrary code while deserializing, so
        both pickle paths must point to trusted files only.
    """
    def _unpickle(path):
        # Read a single pickled object from a binary file.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    # Same order as before: tokenizer, then places data, then the model.
    tokenizer = _unpickle(tokenizer_path)
    places = _unpickle(detailed_data_path)
    network = keras.models.load_model(model_path)

    return network, tokenizer, places

The functions below build a text description of each place (the default one concatenates the place's types, summary and reviews).

In [4]:
def default_describer(place):
    """Build a text description from a place's types, summary and reviews.

    Reads ``place[2]`` (iterable of type strings), ``place[8]`` (summary
    string or ``None``) and ``place[9]`` (iterable of review strings).
    Underscores in types become spaces; newlines and backslashes are
    stripped from reviews. Every fragment is followed by a single space.

    Returns:
        The concatenated description after one pass of replacing double
        spaces with single spaces (longer runs are only shortened, and the
        trailing space is kept).
    """
    fragments = [kind.replace('_', ' ') + ' ' for kind in place[2]]

    summary = place[8]
    if summary is not None:
        fragments.append(summary + ' ')

    fragments.extend(
        text.replace('\n', '').replace('\\', '') + ' ' for text in place[9]
    )

    return ''.join(fragments).replace('  ', ' ')

In [5]:
def summary_reviews_describer(place):
    """Build a text description from a place's summary and reviews only.

    Reads ``place[8]`` (summary string or ``None``) and ``place[9]``
    (iterable of review strings). Newlines and backslashes are stripped
    from reviews, and every fragment is followed by a single space.

    Returns:
        The concatenated description after one pass of replacing double
        spaces with single spaces (the trailing space is kept).
    """
    pieces = []

    if place[8] is not None:
        pieces.append(place[8] + ' ')

    for review in place[9]:
        cleaned = review.replace('\n', '').replace('\\', '')
        pieces.append(cleaned + ' ')

    return ''.join(pieces).replace('  ', ' ')

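The function below adds an additional column to each place: the vector predicted by the model from the place's description. Places whose description is shorter than the length threshold are skipped.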

In [6]:
def vectorize_data(data, model, tokenizer, describer=default_describer, description_length_threshold=400):
    """Attach a model-predicted vector to every place with a long enough description.

    Args:
        data: iterable of place records (each an iterable of fields). The
            records are not modified.
        model: object exposing ``layers[0].input_length`` and
            ``predict(x, verbose=0)``.
        tokenizer: object exposing ``texts_to_sequences``.
        describer: callable that maps a place record to its description.
        description_length_threshold: minimum ``len(description)`` a place
            needs in order to be vectorized; shorter ones are skipped.

    Returns:
        A list of new lists, one per kept place, each holding the fields of
        the original record followed by the predicted vector as the last item.
    """
    result = []
    # NOTE(review): assumes the model's first layer defines `input_length`
    # (e.g. an Embedding layer) — confirm for other architectures.
    maxlen = model.layers[0].input_length
    for place in data:
        description = describer(place)
        if len(description) < description_length_threshold:
            continue

        sequences = tokenizer.texts_to_sequences([description])
        x = pad_sequences(sequences, maxlen=maxlen)
        vector = model.predict(x, verbose=0)[0]
        # Build a new record rather than appending to `place` in place:
        # mutating the caller's data made re-running this step on the same
        # `data` append a second vector to every record.
        result.append([*place, vector])

    return result

Use the functions defined above to load the model, the tokenizer and the information about places.

In [7]:
# Load every artifact configured in the parameters cell.
model, tokenizer, data = load(
    model_path=model_path,
    tokenizer_path=tokenizer_path,
    detailed_data_path=detailed_data_path,
)

Vectorize loaded data

In [8]:
# Describe places by summary and reviews only (types are left out of the text).
data_vectorized = vectorize_data(
    data=data,
    model=model,
    tokenizer=tokenizer,
    describer=summary_reviews_describer,
)

Save the vectorized data

In [9]:
# The output file name carries the city and the timestamp taken in the parameters cell.
with open(f'../data/places_{city_name}_vectorized_{creation_time}.pickle', mode='wb') as file:
    pickle.dump(data_vectorized, file, pickle.HIGHEST_PROTOCOL)