In [1]:
# IPython autoreload in mode 2: imported modules are reloaded before each cell
# runs, so edits to the imported helper modules take effect without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pymongo import ASCENDING, GEOSPHERE, MongoClient
import pandas as pd
from alive_progress import alive_bar
from shapely.geometry import Point, mapping
from keplergl import KeplerGl
import shapely
import json
from os import listdir
from os.path import isfile, join
from tqdm import tqdm
import geopandas as gpd
from h3 import h3
import math
import sklearn
import numpy as np
import time

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
# from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score

In [4]:
from neighbour_embedding_methods import AverageDiminishingNeighbourEmbedding, AverageDiminishingSquqredNeighbourEmbedding, AverageNeighbourEmbedding, ConcatenateNeighbourEmbedding
from custom_distance_metric import DistanceMetric
from embedding_methods import BaseTfIdfCategoryEmbedding

In [5]:
# --- Experiment grid -------------------------------------------------------
# Multiplier applied to the number of station hexes to decide how many
# non-station hexes are sampled (see generate_xy).
INBALANCE_RATIOS = [1, 2, 3, 5]

# Values matched against the `resolution` field of the hex documents
# (presumably H3 resolutions -- TODO confirm).
RESOLUTIONS = [9, 10, 11]

# Neighbour settings evaluated per resolution; each value is passed as the
# `neighbours` argument to the neighbour embedder.
NEIGHBORS = {
    9: list(range(0, 4)),       # 0, 1, 2, 3
    10: list(range(0, 11, 2)),  # 0, 2, 4, 6, 8, 10
    11: list(range(0, 26, 5)),  # 0, 5, 10, 15, 20, 25
}

# Classifier classes; each is instantiated with its default arguments.
CLASSIFIERS = [
    KNeighborsClassifier,
    SVC,
    RandomForestClassifier,
    AdaBoostClassifier,
]

# Strategies for combining a hex embedding with those of its neighbours.
NEIGHBORS_EMBEDDING_CLASSES = [
    ConcatenateNeighbourEmbedding,
    AverageNeighbourEmbedding,
    AverageDiminishingNeighbourEmbedding,
    AverageDiminishingSquqredNeighbourEmbedding,
]

# Per-hex embedding classes (fitted per city in generate_xy).
EMBEDDING_CLASSES = [BaseTfIdfCategoryEmbedding]

In [6]:
# Handles to the local MongoDB instance and the collections used below.
# Subscript access is equivalent to attribute access in PyMongo.
client = MongoClient('mongodb://localhost:27017/')
db = client['osmDataDB']
coll_cities = db['cities']
coll_hexes_filtered = db['hexesInCitiesFiltered']
coll_relations_filtered = db['relationsFiltered']

In [7]:
# Fetch id and name of every accepted city; Mongo's own `_id` is projected out.
cities = list(
    coll_cities.find(
        {'accepted': True},
        {'_id': 0, 'city_id': 1, 'city': 1},
    )
)
cities

[{'city_id': 1, 'city': 'Antwerpen'},
 {'city_id': 2, 'city': 'Barcelona'},
 {'city_id': 3, 'city': 'Berlin'},
 {'city_id': 4, 'city': 'Bern'},
 {'city_id': 5, 'city': 'Bordeaux'},
 {'city_id': 7, 'city': 'Brno'},
 {'city_id': 8, 'city': 'Bruxelles'},
 {'city_id': 10, 'city': 'Budapest'},
 {'city_id': 11, 'city': 'Cardiff'},
 {'city_id': 17, 'city': 'Dublin'},
 {'city_id': 24, 'city': 'Gothenburg'},
 {'city_id': 25, 'city': 'Helsinki'},
 {'city_id': 26, 'city': 'Kyiv'},
 {'city_id': 30, 'city': 'London'},
 {'city_id': 32, 'city': 'Lyon'},
 {'city_id': 33, 'city': 'Madrid'},
 {'city_id': 35, 'city': 'Marseille'},
 {'city_id': 36, 'city': 'Milan'},
 {'city_id': 37, 'city': 'Moscow'},
 {'city_id': 38, 'city': 'Munich'},
 {'city_id': 39, 'city': 'Nantes'},
 {'city_id': 41, 'city': 'Oslo'},
 {'city_id': 42, 'city': 'Ostrava'},
 {'city_id': 45, 'city': 'Paris'},
 {'city_id': 46, 'city': 'Poznań'},
 {'city_id': 47, 'city': 'Prague'},
 {'city_id': 49, 'city': 'Seville'},
 {'city_id': 52, 'city

In [8]:
# Enumerate every experiment configuration:
# city x resolution x inbalance ratio x embedding x neighbours x neighbour embedding.
combinations = []
for city in cities:
    for resolution in RESOLUTIONS:
        for inbalance_ratio in INBALANCE_RATIOS:
            for embedding_cls in EMBEDDING_CLASSES:
                for neighbours in NEIGHBORS[resolution]:
                    # For neighbours == 0 a single entry using
                    # ConcatenateNeighbourEmbedding is generated; otherwise one
                    # entry per neighbour-embedding class.
                    if neighbours != 0:
                        candidate_classes = NEIGHBORS_EMBEDDING_CLASSES
                    else:
                        candidate_classes = [ConcatenateNeighbourEmbedding]

                    for neighbour_embedding_cls in candidate_classes:
                        combinations.append({
                            'city': city,
                            'resolution': resolution,
                            'inbalance_ratio': inbalance_ratio,
                            'embedding_cls': embedding_cls,
                            'neighbours': neighbours,
                            'neighbour_embedding_cls': neighbour_embedding_cls
                        })
print(len(combinations))

7480


In [9]:
def generate_xy(city_id, resolution, inbalance_ratio, neighbours, neighbour_embedding_cls, embedding_cls):
    """Build a feature matrix and labels for one city at one resolution.

    Every hex of the city flagged ``has_station`` is a positive example.
    Negatives are drawn with MongoDB's ``$sample`` from the non-station hexes,
    requesting ``inbalance_ratio`` times as many documents as there are
    positives (fewer are used if the collection returns fewer).

    Args:
        city_id: Value matched against the ``city_id`` field of the hexes.
        resolution: Value matched against the ``resolution`` field.
        inbalance_ratio: Negatives-per-positive multiplier for the sample size.
        neighbours: Passed through to ``embedder.get_embedding``.
        neighbour_embedding_cls: Class instantiated (no arguments) to embed
            each hex via ``get_embedding(hex, neighbours, relation_embedder)``.
        embedding_cls: Class instantiated (no arguments) and fitted with
            ``{'city_id': city_id}``; handed to the neighbour embedder.

    Returns:
        Tuple ``(vectors, y, hex_id_list)``: the embeddings as a NumPy array
        with NaNs replaced by 0, the labels (1 = station, 0 = no station) and
        the ``hex_id`` of every row, all in the same order (stations first).
    """
    relation_embedder = embedding_cls()
    relation_embedder.fit({ 'city_id': city_id })

    all_stations = list(
        coll_hexes_filtered.find({ 'city_id': city_id, 'has_station': True, 'resolution': resolution })
    )
    stations_length = len(all_stations)

    non_stations = list(coll_hexes_filtered.aggregate([
        { '$match': { 'city_id': city_id, 'has_station': False, 'resolution': resolution } },
        { '$sample': { 'size': stations_length * inbalance_ratio } }
    ]))

    # Stations first, then sampled non-stations; y and hex_id_list follow this order.
    selected_hexes = all_stations + non_stations
    hex_id_list = [h['hex_id'] for h in selected_hexes]
    y = np.array([1] * stations_length + [0] * len(non_stations))

    embedder = neighbour_embedding_cls()
    vectors = np.array(
        [embedder.get_embedding(h, neighbours, relation_embedder) for h in selected_hexes]
    )

    try:
        vectors[np.isnan(vectors)] = 0
    except TypeError:
        # np.isnan rejects non-numeric (e.g. object-dtype) arrays. Keep the
        # original best-effort behaviour: show the offending data and return
        # it unchanged. Only TypeError is caught, so KeyboardInterrupt and
        # unrelated errors are no longer swallowed.
        print(vectors)
    return vectors, y, hex_id_list

In [11]:
def _split_labels_and_ids(label_id_pairs):
    """Unzip ``(label, hex_id)`` pairs into a label array and a list of hex ids."""
    labels, ids = zip(*label_id_pairs)
    return np.array(list(labels)), list(ids)


def iterate_combination(t):
    """Evaluate every classifier for one parameter combination and save a CSV.

    For each of 10 outer iterations a fresh dataset is generated and 20% of it
    is held out as a validation split. The remaining data is split 75/25 into
    train/test 10 times; every class in ``CLASSIFIERS`` is fitted on each train
    split and scored (accuracy, F1) on the matching test split. Per iteration
    and classifier two rows are recorded:

    * ``dataset_type == 'test'``: mean accuracy / F1 over the 10 inner splits;
    * ``dataset_type == 'validation'``: scores of the classifier with the best
      test F1 *of this iteration*, evaluated on the held-out validation split.

    Args:
        t: ``(params, idx)`` tuple. ``params`` is one entry of ``combinations``;
            ``idx`` is its index, used in the progress-bar label and, if
            needed, to disambiguate the output file name.

    Returns:
        None. Results are written to ``results/result_<city>_<timestamp>.csv``
        (with ``_<idx>`` appended if that file already exists).
    """
    params, idx = t
    results = []
    city = params['city']
    resolution = params['resolution']
    inbalance_ratio = params['inbalance_ratio']
    embedding_cls = params['embedding_cls']
    neighbours = params['neighbours']
    neighbour_embedding_cls = params['neighbour_embedding_cls']
    # Only needed by the custom metric, which is currently commented out below.
    custom_metric = DistanceMetric()

    outer_iterations = 10  # fresh dataset + validation split per iteration
    inner_splits = 10      # train/test re-splits per outer iteration

    # Fields shared by every result row; the key order fixes the CSV column order.
    row_base = {
        'city': city['city'],
        'resolution': resolution,
        'inbalance_ratio': inbalance_ratio,
        'embedding_cls': embedding_cls.__name__,
        'neighbours': neighbours,
        'neighbour_embedding_cls': neighbour_embedding_cls.__name__,
    }

    desc = f'[{idx}] {city["city"]} Res: {resolution} InbR: {inbalance_ratio} EmbCls: {embedding_cls.__name__} NeighEmbCls: {neighbour_embedding_cls.__name__} Neigh: {neighbours}'

    with tqdm(total=outer_iterations * inner_splits * len(CLASSIFIERS), desc=desc, lock_args=(False,)) as pbar:
        for iteration in range(outer_iterations):
            x, y, hex_ids = generate_xy(city['city_id'], resolution, inbalance_ratio, neighbours, neighbour_embedding_cls, embedding_cls)

            # Labels travel together with their hex ids so both are split consistently.
            y_hex_zip = list(zip(y, hex_ids))

            X_tmp, X_validation, Y_hex_id_tmp, Y_hex_id_validation = train_test_split(x, y_hex_zip, test_size=0.2, stratify=y)

            X_validation = np.array(X_validation)
            Y_validation, hex_ids_validation = _split_labels_and_ids(Y_hex_id_validation)

            y_tmp = [pair[0] for pair in Y_hex_id_tmp]

            clf_values = {}
            # Reset per iteration: a classifier kept from an earlier iteration
            # was fitted on a different sample that may contain hexes of this
            # iteration's validation split, which would leak into the
            # validation scores.
            best_clf_dict = {}

            for _ in range(inner_splits):
                X_train, X_test, Y_hex_id_train, Y_hex_id_test = train_test_split(X_tmp, Y_hex_id_tmp, test_size=0.25, stratify=y_tmp)
                X_train = np.array(X_train)
                Y_train, hex_ids_train = _split_labels_and_ids(Y_hex_id_train)
                X_test = np.array(X_test)
                Y_test, hex_ids_test = _split_labels_and_ids(Y_hex_id_test)

                for clf_cls in CLASSIFIERS:
                    clf_name = clf_cls.__name__
                    clf = clf_cls()
                    try:
                        clf.fit(X_train, Y_train)
                    except Exception:
                        # Dump diagnostics about the training data, then re-raise.
                        print(X_train)
                        print(X_train.shape)
                        print(np.any(np.isnan(X_train)))
                        print(np.all(np.isfinite(X_train)))
                        raise

                    try:
                        y_pred = clf.predict(X_test)
                    except Exception:
                        # Dump diagnostics about the test data, then re-raise.
                        print(X_test)
                        print(X_test.shape)
                        print(np.any(np.isnan(X_test)))
                        print(np.all(np.isfinite(X_test)))
                        raise

                    acc = accuracy_score(y_pred=y_pred, y_true=Y_test)
                    f1 = f1_score(y_pred=y_pred, y_true=Y_test)
                    # custom_metric_value = custom_metric.calculate(Y_test, y_pred, hex_ids_test)
                    scores = clf_values.setdefault(clf_name, {'accuracy': [], 'f1': []})
                    scores['accuracy'].append(acc)
                    scores['f1'].append(f1)
                    if clf_name not in best_clf_dict or f1 > best_clf_dict[clf_name][0]:
                        best_clf_dict[clf_name] = (f1, clf)
                    pbar.update(1)

            # Mean test scores over the inner splits, one row per classifier.
            for k, v in clf_values.items():
                results.append({
                    **row_base,
                    'classfier_cls': k,
                    'iteration': iteration + 1,
                    'dataset_type': 'test',
                    'accuracy': np.average(v['accuracy']),
                    'f1_score': np.average(v['f1']),
                    # 'custom_metric': custom_metric_value
                })

            # Validation scores of this iteration's best classifier per class.
            for k, v in best_clf_dict.items():
                y_pred = v[1].predict(X_validation)
                acc = accuracy_score(y_pred=y_pred, y_true=Y_validation)
                f1 = f1_score(y_pred=y_pred, y_true=Y_validation)
                # custom_metric_value = custom_metric.calculate(Y_validation, y_pred, hex_ids_validation)
                results.append({
                    **row_base,
                    'classfier_cls': k,
                    'iteration': iteration + 1,
                    'dataset_type': 'validation',
                    'accuracy': acc,
                    'f1_score': f1,
                    # 'custom_metric': custom_metric_value
                })

    results_df = pd.DataFrame(results)
    base_path = f'results/result_{city["city"]}_{time.strftime("%Y%m%d-%H%M%S")}'
    try:
        # mode='x' creates the file exclusively, so two workers finishing the
        # same city within the same second cannot overwrite each other.
        results_df.to_csv(f'{base_path}.csv', mode='x')
    except FileExistsError:
        # Name already taken: fall back to a name made unique by the
        # combination index.
        results_df.to_csv(f'{base_path}_{idx}.csv', mode='x')

In [148]:
# skip = 77
# for idx, combination in enumerate(combinations):
#     if idx < skip:
#         continue
#     iterate_combination((combination, idx))

[77] Antwerpen Res: 10 InbR: 2 EmbCls: BaseTfIdfCategoryEmbedding NeighEmbCls: AverageDiminishingSquqredNeighbourEmbedding Neigh: 2:  70%|██████▉   | 279/400 [01:04<00:10, 11.91it/s]None neighbours!
[77] Antwerpen Res: 10 InbR: 2 EmbCls: BaseTfIdfCategoryEmbedding NeighEmbCls: AverageDiminishingSquqredNeighbourEmbedding Neigh: 2: 100%|██████████| 400/400 [01:27<00:00,  4.60it/s]
[78] Antwerpen Res: 10 InbR: 2 EmbCls: BaseTfIdfCategoryEmbedding NeighEmbCls: ConcatenateNeighbourEmbedding Neigh: 4:  90%|████████▉ | 359/400 [02:47<00:03, 11.51it/s]None neighbours!
[78] Antwerpen Res: 10 InbR: 2 EmbCls: BaseTfIdfCategoryEmbedding NeighEmbCls: ConcatenateNeighbourEmbedding Neigh: 4: 100%|██████████| 400/400 [03:01<00:00,  2.20it/s]
[79] Antwerpen Res: 10 InbR: 2 EmbCls: BaseTfIdfCategoryEmbedding NeighEmbCls: AverageNeighbourEmbedding Neigh: 4: 100%|██████████| 400/400 [02:01<00:00,  3.30it/s]
[80] Antwerpen Res: 10 InbR: 2 EmbCls: BaseTfIdfCategoryEmbedding NeighEmbCls: AverageDiminishingNe

KeyboardInterrupt: 

In [12]:
from functools import partial
from itertools import chain

from tqdm.contrib.concurrent import process_map

# Resume point: combinations with an index below `skip` are left out.
skip = 88

# (combination, index) tuples in the shape iterate_combination expects.
pairs = []
for idx, combination in enumerate(combinations):
    if idx >= skip:
        pairs.append((combination, idx))

# process_map(iterate_combination, pairs, max_workers=6)

In [13]:
from __future__ import print_function

import sys
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from multiprocessing import Pool, RLock, freeze_support
from random import random
from threading import RLock as TRLock
from time import sleep

from tqdm.auto import tqdm, trange
from tqdm.contrib.concurrent import process_map, thread_map

PY2 = sys.version_info[:1] <= (2,)

# Install one thread-based lock on tqdm and hand the same lock to every worker
# through the pool initializer.
tqdm.set_lock(TRLock())
pool_args = {}
if not PY2:
    pool_args.update(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),))

with ThreadPoolExecutor(**pool_args) as p:
    # Keep the futures: an exception raised inside a worker is stored on its
    # future and would otherwise never be seen.
    submitted = [(pair[1], p.submit(iterate_combination, pair)) for pair in pairs]

# Leaving the `with` block waits for all workers, so every future is done here.
# Failures are reported without aborting the other combinations.
failed_combinations = [
    (combination_idx, future.exception())
    for combination_idx, future in submitted
    if future.exception() is not None
]
for combination_idx, error in failed_combinations:
    print(f'[{combination_idx}] failed: {error!r}')


[93] Antwerpen Res: 10 InbR: 2 EmbCls: BaseTfIdfCategoryEmbedding NeighEmbCls: AverageDiminishingSquqredNeighbourEmbedding Neigh: 10:   0%|          | 0/400 [00:00<?, ?it/s][A

[97] Antwerpen Res: 10 InbR: 3 EmbCls: BaseTfIdfCategoryEmbedding NeighEmbCls: AverageDiminishingNeighbourEmbedding Neigh: 2:   0%|          | 0/400 [00:00<?, ?it/s][A[A









[92] Antwerpen Res: 10 InbR: 2 EmbCls: BaseTfIdfCategoryEmbedding NeighEmbCls: AverageDiminishingNeighbourEmbedding Neigh: 10:   0%|          | 0/400 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A










[89] Antwerpen Res: 10 InbR: 2 EmbCls: BaseTfIdfCategoryEmbedding NeighEmbCls: AverageDiminishingSquqredNeighbourEmbedding Neigh: 8:   0%|          | 0/400 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A







[94] Antwerpen Res: 10 InbR: 3 EmbCls: BaseTfIdfCategoryEmbedding NeighEmbCls: ConcatenateNeighbourEmbedding Neigh: 0:   0%|          | 1/400 [00:07<48:59,  7.37s/it][A[A[A[A[A[A[A[A







[94] Antwerpen Res: 10