In [12]:
import json
import os
import pickle
import time

import requests
import tensorflow as tf
import keras

from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing import sequence
from keras.preprocessing import text

Set the parameters used to get the best-fitting results from Google Maps.

In [2]:
# variables needed for downloading the data
# The Google Maps API key is read from the environment so that it is never
# stored in (or committed with) the notebook. Set it before starting Jupyter:
#     export GOOGLE_MAPS_API_KEY=<your key>
# NOTE: any key that was ever hard-coded in this notebook should be treated
# as leaked and rotated.
secret_api_key = os.environ.get('GOOGLE_MAPS_API_KEY', '')
if not secret_api_key:
    print('WARNING: GOOGLE_MAPS_API_KEY is not set; requests to Google Maps will be rejected.')
types = [
    'art_gallery', 'atm', 'bar', 'beauty_salon', 'book_store', 'bus_station',
    'cafe', 'car_wash', 'casino', 'cemetery', 'church', 'city_hall',
    'clothing_store', 'florist', 'gas_station', 'gym', 'jewelry_store', 'zoo',
    'university', 'travel_agency', 'store', 'spa', 'restaurant', 'post_office',
    'police', 'parking', 'park', 'night_club', 'museum', 'library',
]  # list that contains types of searched places
data = {}  # maps each place_id to a tuple: (name, types, lat, lng)
center_lat, center_lng = 50.0615868, 19.9373393  # krakow old town
parameters = {
    'location': f'{center_lat},{center_lng}',  # "lat,lng" of the search centre
    'radius': 5000,  # 5 km
    'language': 'en',  # the result language
    'key': secret_api_key
}
city_name = 'krakow'  # used in the name of the output file

# variables needed for vectorization
model_path = '../models/tc_model_1_1680701562.h5'
tokenizer_path = '../models/tokenizer_1680783215.pickle'

### Get ids of places around specified point

This function sends the nearby-search request to Google Maps.

In [3]:
def execute_request(parameters, timeout=10):
    """Send a Nearby Search request to the Google Maps Places API.

    Args:
        parameters: dict of query-string parameters for the request.
        timeout: seconds to wait for the server before giving up. Without a
            timeout `requests` waits indefinitely, which would hang the
            notebook kernel on a stalled connection.

    Returns:
        The `requests.Response` returned by the API.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    return requests.get(
        'https://maps.googleapis.com/maps/api/place/nearbysearch/json',
        params=parameters,
        timeout=timeout,
    )

This function processes the response by extracting the needed data.

In [4]:
def extract_data(response):
    """Extract the places contained in a Nearby Search response.

    Args:
        response: object whose `.json()` returns the decoded API payload.

    Returns:
        dict mapping each place_id to a tuple `(name, types, lat, lng)`.

    Raises:
        RuntimeError: if the payload carries a `status` other than `OK` or
            `ZERO_RESULTS`. Otherwise a rejected request (for example an
            invalid key) would silently look like "no places found".
    """
    payload = response.json()

    # A payload without a 'status' field is treated as successful so that
    # minimal payloads containing only 'results' keep working.
    status = payload.get('status', 'OK')
    if status not in ('OK', 'ZERO_RESULTS'):
        raise RuntimeError(
            f"Places request failed with status {status}: "
            f"{payload.get('error_message', 'no error message')}"
        )

    places = {}
    for place in payload.get('results', []):
        location = place['geometry']['location']
        places[place['place_id']] = (
            place['name'],
            place.get('types', []),  # a place without types gets an empty list
            location['lat'],
            location['lng'],
        )
    return places

The loop that gathers the data

In [5]:
# Query the API once per place type and merge the answers into `data`.
for t in types:
    # Build a per-request copy instead of writing 'type' into the shared
    # `parameters` dict, so no stale search type is left behind afterwards.
    response = execute_request({**parameters, 'type': t})
    response_data = extract_data(response)
    for key, value in response_data.items():
        # The same place can be returned for several types; keep the first hit.
        data.setdefault(key, value)
    # NOTE(review): only one request is made per type, so at most one page of
    # results is collected for each category - confirm this is sufficient.
    print(f"Response data length for category {t}: {len(response_data)}")

print("Total data:", len(data))

Response data length for category art_gallery: 20
Response data length for category atm: 20
Response data length for category bar: 20
Response data length for category beauty_salon: 20
Response data length for category book_store: 20
Response data length for category bus_station: 11
Response data length for category cafe: 20
Response data length for category car_wash: 20
Response data length for category casino: 6
Response data length for category cemetery: 20
Response data length for category church: 20
Response data length for category city_hall: 5
Response data length for category clothing_store: 20
Response data length for category florist: 20
Response data length for category gas_station: 9
Response data length for category gym: 20
Response data length for category jewelry_store: 20
Response data length for category zoo: 2
Response data length for category university: 20
Response data length for category travel_agency: 20
Response data length for category store: 20
Response data l

Convert dict to list

In [6]:
# Flatten the place_id -> (name, types, lat, lng) mapping into 5-element tuples.
result = [
    (place_id, name, place_types, lat, lng)
    for place_id, (name, place_types, lat, lng) in data.items()
]

### Get details about gathered places and create proper vector for each sample

This function sends the request for the details of a single place.

In [7]:
def execute_details_request(place_id, timeout=10):
    """Send a Place Details request for one place.

    Only the fields used later in the notebook are requested: editorial
    summary, reviews, website, formatted address and the Google Maps URL.

    Args:
        place_id: identifier of the place to look up.
        timeout: seconds to wait for the server before giving up. Without a
            timeout `requests` waits indefinitely, which would hang the
            notebook kernel on a stalled connection.

    Returns:
        The `requests.Response` returned by the API.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    parameters = {
        'place_id': place_id,
        'key': secret_api_key,
        'language': 'en',
        'fields': 'editorial_summary,reviews,website,formatted_address,url'
    }
    return requests.get(
        'https://maps.googleapis.com/maps/api/place/details/json',
        params=parameters,
        timeout=timeout,
    )

Read the detailed data about each found place and create a vector that describes it

In [8]:
# Collect, for every gathered place, a 10-element tuple:
# (id, name, types, lat, lng, address, website, google url, summary, reviews)
full_data = []
for r in result:
    place_id, place_name, place_types, place_lat, place_lng = r

    response = execute_details_request(place_id)
    # If the answer carries no 'result' entry, fall back to an empty dict so
    # the place is kept (without details) instead of aborting the whole run.
    response_data = response.json().get('result', {})

    # Every detail field is optional; missing ones stay None / empty.
    place_summary = response_data.get('editorial_summary', {}).get('overview')
    place_reviews = [
        review['text']
        for review in response_data.get('reviews', [])
        if 'text' in review
    ]
    place_website = response_data.get('website')
    place_address = response_data.get('formatted_address')
    place_url = response_data.get('url')

    full_data.append((
        place_id, place_name, place_types,
        place_lat, place_lng, place_address,
        place_website, place_url, place_summary,
        place_reviews
    ))

### Vectorize gathered data with the given model

This function loads the model and the tokenizer.

In [9]:
def load_model_tokenizer(model_path, tokenizer_path):
    """Load the pickled tokenizer and the saved Keras model from disk.

    Args:
        model_path: path of the saved Keras model file.
        tokenizer_path: path of the pickled tokenizer.

    Returns:
        A `(model, tokenizer)` tuple.

    NOTE: unpickling executes code stored in the file, so `tokenizer_path`
    must only ever point to a trusted file.
    """
    with open(tokenizer_path, 'rb') as tokenizer_file:
        loaded_tokenizer = pickle.load(tokenizer_file)

    loaded_model = keras.models.load_model(model_path)

    return loaded_model, loaded_tokenizer

This function compresses the data into a list of the following tuples: (id, name, address, latitude, longitude, web page, google page, description, interest vector)

In [10]:
def vectorize_places(places, model, tokenizer):
    """Build a text description for each place and run it through the model.

    Args:
        places: iterable of tuples laid out as
            (id, name, types, lat, lng, address, website, google url,
            summary, reviews).
        model: model whose first layer exposes `input_length` and which
            provides `predict`.
        tokenizer: object providing `texts_to_sequences`.

    Returns:
        List of tuples
        (id, name, address, lat, lng, web page, google page, description,
        vector). Places that have neither a summary nor any review are
        skipped.
    """
    result = []
    maxlen = model.layers[0].input_length
    for place in places:
        (p_id, p_name, p_types, p_ltn, p_lng, p_address,
         p_web_page, p_google_page, p_summary, p_reviews) = place[:10]
        p_reviews = p_reviews or []

        # Skip places without any free text. The summary lives at index 8 and
        # the reviews at index 9; the previous check looked at indices 9 and
        # 10, so it never fired and such places were never filtered out.
        if p_summary is None and not p_reviews:
            continue

        # Description = categories, then the summary, then all reviews, each
        # followed by a single space.
        description = ''.join(
            category.replace('_', ' ') + ' ' for category in p_types
        )
        if p_summary is not None:
            description += p_summary + ' '
        for review in p_reviews:
            description += review.replace('\n', '').replace('\\', '') + ' '

        description = description.replace('  ', ' ')
        sequences = tokenizer.texts_to_sequences([description])
        x = pad_sequences(sequences, maxlen=maxlen)
        p_vector = model.predict(x, verbose=0)[0]

        result.append((
            p_id, p_name, p_address, p_ltn, p_lng,
            p_web_page, p_google_page, description, p_vector
        ))

    return result

Load model and calculate vector for each place

In [13]:
# Load the trained model together with its tokenizer, then turn every
# gathered place into a tuple that ends with the model's output vector.
model, tokenizer = load_model_tokenizer(
    model_path=model_path,
    tokenizer_path=tokenizer_path,
)
full_data_vectorized = vectorize_places(
    places=full_data,
    model=model,
    tokenizer=tokenizer,
)

Metal device set to: Apple M1


2023-04-07 17:43:29.000774: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-04-07 17:43:29.000903: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2023-04-07 17:43:29.902798: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-04-07 17:43:30.272794: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-04-07 17:43:30.570191: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-04-07 17:43:35.997178: I tensorflow/core/grappler/o

Save vectorized places

In [15]:
# The file name embeds the city and the current Unix timestamp, so each run
# writes a new file instead of overwriting an earlier one.
output_path = f'../data/places_{city_name}_vectorized_{int(time.time())}'
with open(output_path, 'wb') as file:
    pickle.dump(
        full_data_vectorized,
        file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )