## Notebook with the code used to migrate the existing data, collected in MongoDB while writing the thesis, into the new database

In [3]:
from pymongo import ASCENDING, GEOSPHERE, MongoClient
import pandas as pd
from alive_progress import alive_bar
from shapely.geometry import Point, mapping
import shapely
import json
from os import listdir
from os.path import isfile, join
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map
import geopandas as gpd

In [4]:
from db_connector.models import Area, TrainLabel, Vector

In [5]:
# Connect to the MongoDB server running on this machine (default port 27017).
client = MongoClient('mongodb://localhost:27017/')

In [6]:
# Handle to the `osmDataDB` database that is the source of this migration.
db = client.osmDataDB

In [7]:
# Collection with one document per city (fields seen below: city_id, city, osm_areas_ids, geometry).
coll_cities = db.cities

## Migrating cities into the SQLite database

In [8]:
# Materialise every accepted city document so it can be iterated more than once.
cities = list(coll_cities.find({'accepted': True}))

In [9]:
# Display the first accepted city document to inspect its fields.
cities[0]

{'_id': ObjectId('608c8e85d9325f7b4aa354ee'),
 'city_id': 1,
 'city': 'Antwerpen',
 'osm_areas_ids': [59518],
 'geometry': {'type': 'Polygon',
  'coordinates': [[[4.2175769, 51.3738851],
    [4.2182598, 51.3722094],
    [4.2199279, 51.3683223],
    [4.220477, 51.3673435],
    [4.2220663, 51.3645103],
    [4.2267461, 51.3597586],
    [4.2325255, 51.3575718],
    [4.242049, 51.3539671],
    [4.2594513, 51.3463655],
    [4.2630991, 51.3445695],
    [4.2658028, 51.3431487],
    [4.2672153, 51.3421246],
    [4.2682919, 51.3412454],
    [4.2697939, 51.3393687],
    [4.2712101, 51.3370899],
    [4.2726263, 51.3338993],
    [4.273313, 51.3317542],
    [4.2735705, 51.3303867],
    [4.2737851, 51.3281342],
    [4.2736992, 51.3249966],
    [4.273313, 51.3199814],
    [4.2730555, 51.3152607],
    [4.2733559, 51.3129002],
    [4.2739138, 51.3108882],
    [4.2754588, 51.3081251],
    [4.2789411, 51.3031754],
    [4.2808661, 51.3012835],
    [4.2833981, 51.2999955],
    [4.2853288, 51.299298],
    [4

In [10]:
# Store one Area row per accepted city: its name, the centroid of its
# boundary as (lon, lat) and the original GeoJSON geometry.
for city in cities:
    shp = shapely.geometry.shape(city['geometry'])
    centroid = shp.centroid

    area = Area()
    area.name, area.shape = city['city'], city['geometry']
    area.lon, area.lat = centroid.x, centroid.y
    area.save()

## Seeding the database with vectors

In [11]:
from h3 import h3
import math

In [12]:
# Source collections in MongoDB: `relations` and `stationsInCities`.
# Station documents are queried by `city_id` and carry a `geometry` field (see getHexes).
coll_relations = db.relations
coll_stations = db.stationsInCities

In [13]:
# H3 resolution used for every hexagon generated in this notebook.
RESOLUTION = 11

# Edge length (in metres) of a cell at that resolution.
edge_size = h3.edge_length(RESOLUTION, unit='m')
# Apothem of a regular hexagon with that edge, so 2 * h is the spacing
# between the centres of two adjacent cells.
h = edge_size * math.sqrt(3) / 2
# Number of hexagon rings needed to span 2000 (presumably metres - the same
# unit as `h`) around a cell.
NEIGHBORS = k_neighbours = math.ceil(2000 / (2 * h))
print(edge_size, h, k_neighbours)

24.910561 21.57317864852189 47


In [14]:
def getHexes(city_id, city_name=None):
    """Collect the H3 cells around every station of one city.

    Parameters
    ----------
    city_id : value of the `city_id` field used to select station documents
        from `coll_stations`.
    city_name : str, optional
        Label shown in the progress bar. When omitted it is looked up in
        `coll_cities` by `city_id` (falling back to the id itself), so the
        function no longer depends on a global loop variable named `city`.

    Returns
    -------
    (station_indexes, all_indexes) : tuple of sets
        `station_indexes` holds the cell containing each station;
        `all_indexes` holds every cell returned by `h3.hex_range` with
        `NEIGHBORS` rings around those cells (presumably including the
        station cells themselves - confirm against the h3 version in use).
    """
    if city_name is None:
        city_doc = coll_cities.find_one({'city_id': city_id}, {'_id': 0, 'city': 1})
        city_name = city_doc['city'] if city_doc and 'city' in city_doc else str(city_id)

    station_indexes = set()
    all_indexes = set()
    # Materialise the cursor so tqdm knows the total number of stations.
    station_docs = list(coll_stations.find({'city_id': city_id}))
    for st in tqdm(station_docs, desc=f"{city_name} - Loading station hexes"):
        pt = shapely.geometry.shape(st['geometry'])
        # geo_to_h3 receives the point's y then x (latitude first).
        station_hex = h3.geo_to_h3(pt.y, pt.x, RESOLUTION)
        station_indexes.add(station_hex)
        all_indexes.update(h3.hex_range(station_hex, NEIGHBORS))
    return station_indexes, all_indexes

In [15]:
def check_hex(h):
    """Keep an H3 cell only if it touches the current city boundary.

    The boundary is read from the module-level `shp`, which must be set to
    the city's shapely geometry before this function is called.

    Returns a dict with the cell id (`hex_id`) and its shapely polygon
    (`geometry`) when the cell intersects `shp`, otherwise None.
    """
    cell_polygons = h3.h3_set_to_multi_polygon([h], geo_json=True)
    rings = [ring for cell_polygon in cell_polygons for ring in cell_polygon]
    # Only the first ring is used; its first vertex is appended to close it.
    first_ring = rings[0]
    polygon = shapely.geometry.Polygon(first_ring + [first_ring[0]])
    return {'hex_id': h, 'geometry': polygon} if shp.intersects(polygon) else None

In [16]:
# For every accepted city, gather the cells around its stations and keep only
# those that intersect the city boundary.
hexes_dict = {}
for city in coll_cities.find({'accepted': True}):
    city_name = city['city']
    # check_hex reads the module-level `shp`, so it has to be rebound per city.
    shp = shapely.geometry.shape(city['geometry'])
    # NOTE(review): the buffered boundary is computed but not used by the visible code.
    buffered_polygon = shp.buffer(0.005)

    city_hexes = hexes_dict[city_name] = {}
    stations, non_stations = getHexes(city['city_id'])
    hexes_shp = process_map(
        check_hex,
        non_stations,
        max_workers=8,
        desc=f"{city_name} - Filtering hexes",
        chunksize=1,
    )
    city_hexes['stations'] = stations
    city_hexes['non_stations'] = {hit['hex_id'] for hit in hexes_shp if hit is not None}
    print(city_name, len(city_hexes['stations']), len(city_hexes['non_stations']))

Antwerpen - Loading station hexes: 100%|██████████| 298/298 [00:01<00:00, 171.52it/s]
Antwerpen - Filtering hexes: 100%|██████████| 75461/75461 [00:13<00:00, 5560.75it/s]
Barcelona - Loading station hexes:   5%|▍         | 25/502 [00:00<00:01, 248.05it/s]

Antwerpen 292 56967


Barcelona - Loading station hexes: 100%|██████████| 502/502 [00:02<00:00, 197.04it/s]
Barcelona - Filtering hexes: 100%|██████████| 67580/67580 [00:12<00:00, 5311.89it/s]
Berlin - Loading station hexes:   0%|          | 0/305 [00:00<?, ?it/s]

Barcelona 497 42436


Berlin - Loading station hexes: 100%|██████████| 305/305 [00:01<00:00, 199.95it/s]
Berlin - Filtering hexes: 100%|██████████| 122989/122989 [00:26<00:00, 4615.27it/s]


Berlin 305 122989


Bern - Loading station hexes: 100%|██████████| 193/193 [00:00<00:00, 201.18it/s]
Bern - Filtering hexes: 100%|██████████| 53474/53474 [00:25<00:00, 2112.15it/s]
Bordeaux - Loading station hexes:  12%|█▏        | 22/183 [00:00<00:00, 211.74it/s]

Bern 193 53474


Bordeaux - Loading station hexes: 100%|██████████| 183/183 [00:00<00:00, 196.76it/s]
Bordeaux - Filtering hexes: 100%|██████████| 143284/143284 [00:28<00:00, 4969.22it/s]
Brno - Loading station hexes:  14%|█▍        | 21/148 [00:00<00:00, 205.50it/s]

Bordeaux 183 143284


Brno - Loading station hexes: 100%|██████████| 148/148 [00:00<00:00, 202.77it/s]
Brno - Filtering hexes: 100%|██████████| 75349/75349 [00:14<00:00, 5097.15it/s]
Bruxelles - Loading station hexes:   0%|          | 0/349 [00:00<?, ?it/s]

Brno 143 70193


Bruxelles - Loading station hexes: 100%|██████████| 349/349 [00:01<00:00, 190.40it/s]
Bruxelles - Filtering hexes: 100%|██████████| 108928/108928 [00:19<00:00, 5469.11it/s]
Budapest - Loading station hexes:  13%|█▎        | 20/159 [00:00<00:00, 192.60it/s]

Bruxelles 349 83719


Budapest - Loading station hexes: 100%|██████████| 159/159 [00:00<00:00, 201.04it/s]
Budapest - Filtering hexes: 100%|██████████| 49779/49779 [00:09<00:00, 5251.13it/s]
Cardiff - Loading station hexes:  16%|█▌        | 16/100 [00:00<00:00, 156.92it/s]

Budapest 159 49779


Cardiff - Loading station hexes: 100%|██████████| 100/100 [00:00<00:00, 166.66it/s]
Cardiff - Filtering hexes: 100%|██████████| 70147/70147 [00:13<00:00, 5323.99it/s]


Cardiff 100 59720


Dublin - Loading station hexes: 100%|██████████| 195/195 [00:00<00:00, 205.40it/s]
Dublin - Filtering hexes: 100%|██████████| 74365/74365 [00:39<00:00, 1885.07it/s]
Gothenburg - Loading station hexes:  19%|█▉        | 21/112 [00:00<00:00, 200.49it/s]

Dublin 189 64830


Gothenburg - Loading station hexes: 100%|██████████| 112/112 [00:00<00:00, 197.09it/s]
Gothenburg - Filtering hexes: 100%|██████████| 59217/59217 [00:11<00:00, 5140.13it/s]
Helsinki - Loading station hexes:   6%|▌         | 20/352 [00:00<00:01, 192.45it/s]

Gothenburg 112 59217


Helsinki - Loading station hexes: 100%|██████████| 352/352 [00:01<00:00, 192.53it/s]
Helsinki - Filtering hexes: 100%|██████████| 152386/152386 [00:28<00:00, 5339.68it/s]
Kyiv - Loading station hexes:  17%|█▋        | 20/118 [00:00<00:00, 199.38it/s]

Helsinki 352 152386


Kyiv - Loading station hexes: 100%|██████████| 118/118 [00:00<00:00, 195.40it/s]
Kyiv - Filtering hexes: 100%|██████████| 165223/165223 [00:30<00:00, 5378.71it/s]


Kyiv 117 158731


London - Loading station hexes: 100%|██████████| 789/789 [00:03<00:00, 200.36it/s]
London - Filtering hexes: 100%|██████████| 110230/110230 [00:36<00:00, 3023.77it/s]


London 787 110230


Lyon - Loading station hexes: 100%|██████████| 422/422 [00:02<00:00, 198.07it/s]
Lyon - Filtering hexes: 100%|██████████| 121011/121011 [00:44<00:00, 2717.85it/s]
Madrid - Loading station hexes:   0%|          | 0/264 [00:00<?, ?it/s]

Lyon 421 121011


Madrid - Loading station hexes: 100%|██████████| 264/264 [00:01<00:00, 200.42it/s]
Madrid - Filtering hexes: 100%|██████████| 58055/58055 [00:11<00:00, 5163.03it/s]


Madrid 260 58055


Marseille - Loading station hexes: 100%|██████████| 126/126 [00:00<00:00, 196.24it/s]
Marseille - Filtering hexes: 100%|██████████| 39410/39410 [00:46<00:00, 842.00it/s]
Milan - Loading station hexes:   6%|▌         | 19/318 [00:00<00:01, 186.14it/s]

Marseille 126 29552


Milan - Loading station hexes: 100%|██████████| 318/318 [00:01<00:00, 187.04it/s]
Milan - Filtering hexes: 100%|██████████| 70801/70801 [00:13<00:00, 5259.04it/s]


Milan 317 62612


Moscow - Loading station hexes: 100%|██████████| 654/654 [00:03<00:00, 194.86it/s]
Moscow - Filtering hexes: 100%|██████████| 398512/398512 [02:17<00:00, 2894.41it/s]


Moscow 654 378242


Munich - Loading station hexes: 100%|██████████| 299/299 [00:01<00:00, 199.47it/s]
Munich - Filtering hexes: 100%|██████████| 313092/313092 [01:06<00:00, 4715.10it/s]


Munich 299 294611


Nantes - Loading station hexes: 100%|██████████| 121/121 [00:00<00:00, 192.76it/s]
Nantes - Filtering hexes: 100%|██████████| 36759/36759 [00:19<00:00, 1874.02it/s]
Oslo - Loading station hexes:   0%|          | 0/246 [00:00<?, ?it/s]

Nantes 121 36759


Oslo - Loading station hexes: 100%|██████████| 246/246 [00:01<00:00, 190.34it/s]
Oslo - Filtering hexes: 100%|██████████| 56041/56041 [00:11<00:00, 5053.40it/s]


Oslo 244 53820


Ostrava - Loading station hexes: 100%|██████████| 337/337 [00:01<00:00, 190.63it/s]
Ostrava - Filtering hexes: 100%|██████████| 159722/159722 [00:47<00:00, 3339.76it/s]


Ostrava 337 135588


Paris - Loading station hexes: 100%|██████████| 1399/1399 [00:06<00:00, 206.07it/s]
Paris - Filtering hexes: 100%|██████████| 264684/264684 [03:06<00:00, 1416.23it/s]
Poznań - Loading station hexes:  10%|▉         | 17/174 [00:00<00:00, 163.13it/s]

Paris 1399 264684


Poznań - Loading station hexes: 100%|██████████| 174/174 [00:00<00:00, 193.02it/s]
Poznań - Filtering hexes: 100%|██████████| 117298/117298 [00:21<00:00, 5445.72it/s]


Poznań 174 105131


Prague - Loading station hexes: 100%|██████████| 398/398 [00:02<00:00, 197.87it/s]
Prague - Filtering hexes: 100%|██████████| 152614/152614 [01:03<00:00, 2390.62it/s]
Seville - Loading station hexes:   0%|          | 0/258 [00:00<?, ?it/s]

Prague 397 151321


Seville - Loading station hexes: 100%|██████████| 258/258 [00:01<00:00, 198.95it/s]
Seville - Filtering hexes: 100%|██████████| 60138/60138 [00:10<00:00, 5482.19it/s]
Toulouse - Loading station hexes:   7%|▋         | 21/284 [00:00<00:01, 201.30it/s]

Seville 253 48232


Toulouse - Loading station hexes: 100%|██████████| 284/284 [00:01<00:00, 190.45it/s]
Toulouse - Filtering hexes: 100%|██████████| 65273/65273 [00:12<00:00, 5174.47it/s]
Valencia - Loading station hexes:   0%|          | 0/276 [00:00<?, ?it/s]

Toulouse 282 52452


Valencia - Loading station hexes: 100%|██████████| 276/276 [00:01<00:00, 181.83it/s]
Valencia - Filtering hexes: 100%|██████████| 51625/51625 [00:10<00:00, 4984.00it/s]
Vienna - Loading station hexes:  17%|█▋        | 20/120 [00:00<00:00, 196.98it/s]

Valencia 276 33274


Vienna - Loading station hexes: 100%|██████████| 120/120 [00:00<00:00, 184.05it/s]
Vienna - Filtering hexes: 100%|██████████| 49047/49047 [00:09<00:00, 5187.78it/s]
Warszawa - Loading station hexes:   5%|▌         | 19/347 [00:00<00:01, 184.59it/s]

Vienna 120 49047


Warszawa - Loading station hexes: 100%|██████████| 347/347 [00:01<00:00, 189.78it/s]
Warszawa - Filtering hexes: 100%|██████████| 181767/181767 [00:35<00:00, 5062.95it/s]
Wrocław - Loading station hexes:   0%|          | 0/215 [00:00<?, ?it/s]

Warszawa 339 173437


Wrocław - Loading station hexes: 100%|██████████| 215/215 [00:01<00:00, 183.94it/s]
Wrocław - Filtering hexes: 100%|██████████| 143532/143532 [00:27<00:00, 5312.84it/s]
Zaragoza - Loading station hexes:  15%|█▍        | 19/129 [00:00<00:00, 185.82it/s]

Wrocław 215 126558


Zaragoza - Loading station hexes: 100%|██████████| 129/129 [00:00<00:00, 194.88it/s]
Zaragoza - Filtering hexes: 100%|██████████| 32227/32227 [00:06<00:00, 5137.57it/s]
Zurich - Loading station hexes:  10%|█         | 17/170 [00:00<00:00, 165.84it/s]

Zaragoza 129 32227


Zurich - Loading station hexes: 100%|██████████| 170/170 [00:00<00:00, 187.66it/s]
Zurich - Filtering hexes: 100%|██████████| 70203/70203 [00:20<00:00, 3431.49it/s]


Zurich 170 70203


In [17]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
class BaseCountCategoryEmbedding():
    """Count-based embedding of an H3 cell from the categories of the objects inside it.

    `fit` builds a CountVectorizer vocabulary from the distinct `category`
    values of the `relationsFiltered` collection; `transform` returns, for one
    H3 cell, the token counts of the categories of all documents whose
    geometry intersects the cell.
    """

    _MONGO_URI = 'mongodb://localhost:27017/'

    def __init__(self):
        client = MongoClient(self._MONGO_URI)
        db = client.osmDataDB
        self.coll_cities = db.cities
        self.coll_relations_filtered = db.relationsFiltered
        # Client used by `_transform`; created lazily, once per process.
        self._client = None
        self._client_pid = None

    def fit(self):
        """Fit the vectorizer on the distinct category names stored in MongoDB."""
        categories = self.coll_relations_filtered.find({}, {'_id': 0, 'category': 1}).distinct('category')
        self.vectorizer = CountVectorizer()
        self.vectorizer.fit(categories)

    def _relations_collection(self):
        """Return `relationsFiltered` through a client owned by the current process.

        `transform` is called from worker processes in this notebook, so a
        client created in the parent must not be shared. Instead of opening
        (and never closing) a new connection for every cell, one client is
        created per process id and reused for all later calls.
        """
        import os

        pid = os.getpid()
        if getattr(self, '_client', None) is None or getattr(self, '_client_pid', None) != pid:
            self._client = MongoClient(self._MONGO_URI)
            self._client_pid = pid
        return self._client.osmDataDB.relationsFiltered

    def _transform(self, hex):
        """Build the 1-D count vector for the H3 cell id `hex`."""
        polygons = h3.h3_set_to_multi_polygon([hex], geo_json=True)
        # Flatten polygons into loops and keep the first one, closed explicitly.
        outlines = [loop for polygon in polygons for loop in polygon]
        polyline = [outline + [outline[0]] for outline in outlines][0]
        polygon = shapely.geometry.Polygon(polyline)

        relations = self._relations_collection().find(
            {
                "geometry": {
                    "$geoIntersects": {
                        "$geometry": shapely.geometry.mapping(polygon)
                    }
                }
            },
            {
                '_id': 0,
                'category': 1
            },
        )
        categories = [r['category'] for r in relations]
        if categories:
            functions = ' '.join(categories)
            vector = self.vectorizer.transform([functions]).toarray().reshape((-1,))
        else:
            # Nothing intersects the cell: all-zero vector of vocabulary size.
            # `vocabulary_` is used because `get_feature_names()` was removed
            # from newer scikit-learn releases.
            vector = np.zeros(shape=(len(self.vectorizer.vocabulary_),))
        return np.array(vector).reshape((-1,))

    def transform(self, hex):
        """Return the category-count vector of the H3 cell `hex` as a flat numpy array."""
        vector = self._transform(hex)
        return np.array(vector).reshape((-1,))

In [19]:
# Create the embedder and fit its vocabulary on the distinct categories in MongoDB.
embedder = BaseCountCategoryEmbedding()
embedder.fit()

In [20]:
from db_connector.db_definition import db as sqlite_db

In [21]:
def parse_hex(h):
    """Embed one H3 cell id with the module-level `embedder`.

    Defined as a plain top-level function so it can be handed to
    `process_map`.
    """
    vector = embedder.transform(h)
    return vector

In [22]:
# For every city: embed each kept cell in parallel, then store the vector and
# its station label inside a single SQLite transaction.
for city, hexes in hexes_dict.items():
    city_obj = Area.get(Area.name == city)
    h3_indexes = list(hexes['non_stations'])
    # Kept as a set so the per-row `in` check below is a hash lookup rather
    # than a scan over every station cell.
    stations = set(hexes['stations'])
    vectors = process_map(parse_hex, h3_indexes, max_workers=8, desc=f"{city} - Vectorizing hexes", chunksize = 1)
    with sqlite_db.atomic():
        # `hex_id` instead of `hex` so the builtin is not shadowed in the notebook namespace.
        for hex_id, vector in tqdm(zip(h3_indexes, vectors), desc=f"{city} - Saving vectors", total=len(h3_indexes)):
            tv_obj = {
                'area': city_obj,
                'hex_id': hex_id
            }
            # One integer column per vector component: v0, v1, ...
            tv_obj.update({f'v{i}': int(v) for i, v in enumerate(vector)})
            label_obj = {
                'area': city_obj,
                'has_station': hex_id in stations,
                'hex_id': hex_id
            }
            Vector.insert(**tv_obj).execute()
            TrainLabel.insert(**label_obj).execute()

Antwerpen - Vectorizing hexes: 100%|██████████| 56967/56967 [01:18<00:00, 727.17it/s]
Antwerpen - Saving vectors: 100%|██████████| 56967/56967 [00:29<00:00, 1929.66it/s]
Barcelona - Vectorizing hexes: 100%|██████████| 42436/42436 [01:47<00:00, 394.58it/s]
Barcelona - Saving vectors: 100%|██████████| 42436/42436 [00:21<00:00, 1961.76it/s]
Berlin - Vectorizing hexes: 100%|██████████| 122989/122989 [02:44<00:00, 747.43it/s]
Berlin - Saving vectors: 100%|██████████| 122989/122989 [01:14<00:00, 1649.58it/s]
Bern - Vectorizing hexes: 100%|██████████| 53474/53474 [01:11<00:00, 746.83it/s]
Bern - Saving vectors: 100%|██████████| 53474/53474 [00:27<00:00, 1939.03it/s]
Bordeaux - Vectorizing hexes: 100%|██████████| 143284/143284 [03:13<00:00, 740.59it/s]
Bordeaux - Saving vectors: 100%|██████████| 143284/143284 [01:27<00:00, 1631.64it/s]
Brno - Vectorizing hexes: 100%|██████████| 70193/70193 [01:30<00:00, 775.86it/s]
Brno - Saving vectors: 100%|██████████| 70193/70193 [00:38<00:00, 1843.31it/s]
