In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
from pymongo import ASCENDING, GEOSPHERE, MongoClient
import pandas as pd
from alive_progress import alive_bar
from shapely.geometry import Point, mapping
from keplergl import KeplerGl
import shapely
import json
from os import listdir
from os.path import isfile, join
from tqdm import tqdm
import geopandas as gpd
from h3 import h3
import math
import sklearn
import numpy as np

In [4]:
client = MongoClient('mongodb://localhost:27017/')
db = client.osmDataDB
coll_cities = db.cities
coll_hexes_filtered = db.hexesInCitiesFiltered
coll_relations_filtered = db.relationsFiltered

In [5]:
d = coll_relations_filtered.delete_many({ 'amenity': 'bicycle_rental' })
print(d.deleted_count, " documents deleted") 

8639  documents deleted


In [5]:
CURRENT_CITY = 'Wrocław'
CURRENT_RESOLUTION = 9
NEIGHBOURS = 0

In [6]:
city_id = coll_cities.find_one({'city': CURRENT_CITY})['city_id']
city_id

59

In [12]:
categories = coll_relations_filtered.find({'city_id': city_id}, {'_id':0, 'category': 1}).distinct('category')

['aerialway',
 'airports',
 'buildings',
 'culture_art_entertainment',
 'education',
 'emergency',
 'finances',
 'healthcare',
 'historic',
 'leisure',
 'other',
 'roads_bike',
 'roads_drive',
 'roads_walk',
 'shops',
 'sport',
 'sustenance',
 'tourism',
 'transportation',
 'water']

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [13]:
vectorizer = TfidfVectorizer()
vectorizer.fit(categories)
vectorizer.get_feature_names()

['aerialway',
 'airports',
 'buildings',
 'culture_art_entertainment',
 'education',
 'emergency',
 'finances',
 'healthcare',
 'historic',
 'leisure',
 'other',
 'roads_bike',
 'roads_drive',
 'roads_walk',
 'shops',
 'sport',
 'sustenance',
 'tourism',
 'transportation',
 'water']

In [54]:
def embedHexBaseline(hex):
    # hex = coll_hexes_filtered.find_one({'hex_id': hex_id})
    relations = coll_relations_filtered.find({
        "geometry": {
            "$geoIntersects": {
                "$geometry": hex['geometry']
            }
        }
    })
    functions = ' '.join([r['category'] for r in relations])
    vector = vectorizer.transform([functions])
    # print(functions)
    # print(vector)
    return vector.toarray().reshape((-1,))

In [17]:
embedHexBaseline('897ab094db3ffff')

roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_bike roads_bike shops roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk roads_walk
  (0, 14)	0.025598984252456894
  (0, 13)	0.9983603858458189
  (0, 11)	0.05119796850491379


In [18]:
all_stations = [hex for hex in coll_hexes_filtered.find({ 'city_id': city_id, 'has_station': True, 'resolution': CURRENT_RESOLUTION })]
stations_length = len(all_stations)

In [21]:
stations_length

213

In [84]:
station_vectors = [embedHexBaseline(h) for h in all_stations]

In [94]:
INBALANCE_RATIO = 5.0

In [95]:
non_stations_cursor = coll_hexes_filtered.aggregate([
    { '$match': { 'city_id': city_id, 'has_station': False, 'resolution': CURRENT_RESOLUTION } },
    { '$sample': { 'size': stations_length * INBALANCE_RATIO } }
])
non_stations = [hex for hex in non_stations_cursor]

In [96]:
len(non_stations)

1065

In [97]:
non_station_vectors = [embedHexBaseline(h) for h in non_stations]

In [98]:
X = np.array(station_vectors + non_station_vectors)
Y = np.array([1] * stations_length + [0] * len(non_stations))

In [99]:
X.shape

(1278, 20)

In [100]:
Y.shape

(1278,)

In [101]:
Y

array([1, 1, 1, ..., 0, 0, 0])

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y)

In [103]:
y_test

array([0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0])

In [104]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [105]:
for _cls in [KNeighborsClassifier, RandomForestClassifier, DecisionTreeClassifier]:
    clf = _cls()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    # scores = cross_val_score(clf, X, Y, cv=5, scoring='f1_macro')
    # print(scores)

              precision    recall  f1-score   support

           0       0.90      0.92      0.91       213
           1       0.56      0.47      0.51        43

    accuracy                           0.85       256
   macro avg       0.73      0.69      0.71       256
weighted avg       0.84      0.85      0.84       256

              precision    recall  f1-score   support

           0       0.91      0.93      0.92       213
           1       0.62      0.53      0.57        43

    accuracy                           0.87       256
   macro avg       0.77      0.73      0.75       256
weighted avg       0.86      0.87      0.86       256

              precision    recall  f1-score   support

           0       0.90      0.91      0.90       213
           1       0.51      0.49      0.50        43

    accuracy                           0.84       256
   macro avg       0.70      0.70      0.70       256
weighted avg       0.83      0.84      0.83       256

