## Get Closest POIs

In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import logging

import sys

sys.path.append("/app")

# import scraping as sc

import pandas as pd

from jinja2 import Template

from db_utils import get_engine, get_table_creation_query

from gis_utils import get_gdf_coords, get_closest_idxs, get_closest_pois_slow


import dataloader as loader


import numpy as np
import os

import geopandas as gpd

<IPython.core.display.Javascript object>

## Connect to Database

In [3]:
# Database credentials are read from the environment; each value is None
# when its variable is not set.
user, password, host = (
    os.getenv(variable)
    for variable in ("POSTGRES_USER", "POSTGRES_PASSWORD", "POSTGRES_HOST")
)

<IPython.core.display.Javascript object>

In [4]:
# Build the database engine from the environment-supplied credentials; the
# cells below open connections from it with `engine.connect()`.
engine = get_engine(user, password, host)

<IPython.core.display.Javascript object>

## Define search parameters

In [5]:
# Name of the search being processed. The queries and the final write below
# use it as the database schema that holds this search's tables.
searchname = "tamzin"

<IPython.core.display.Javascript object>

## Load geocoded addresses

In [6]:
# Select every geocoded_addresses column, restricted by the inner join to the
# address ids listed in address_ids_to_process (both tables live in the
# `searchname` schema).
q_load = f"""SELECT g.* FROM 
{searchname}.geocoded_addresses g
INNER JOIN
{searchname}.address_ids_to_process a
ON g.address_id=a.address_id"""

<IPython.core.display.Javascript object>

In [7]:
# Read the geocoded addresses into a DataFrame over a short-lived connection.
# NOTE(review): `df` is not referenced by any later cell visible in this
# notebook - confirm whether this load is still needed.
with engine.connect() as connection:
    df = pd.read_sql(sql=q_load, con=connection)

<IPython.core.display.Javascript object>

## Load data

### POIs

In [8]:
# Load the POI tables through the project loader. Later cells iterate `pois`
# by category and read each entry's `id` and `geometry` columns.
pois = loader.load_poi_gdfs(user=user, password=password, host=host)

<IPython.core.display.Javascript object>

### Addresses

In [9]:
# Select every bng_coords column for the address ids listed in
# address_ids_to_process. Later cells read `address_id`, `eastings` and
# `northings` from the result.
q_addresses = f"""SELECT c.* FROM 
{searchname}.bng_coords c
INNER JOIN
{searchname}.address_ids_to_process a
ON c.address_id=a.address_id"""

<IPython.core.display.Javascript object>

In [10]:
# Run the address query through the project loader; the result supplies the
# address coordinates and ids used in every matching step below.
df_addresses = loader.load_sql(q_addresses, user=user, password=password, host=host)

<IPython.core.display.Javascript object>

## Get coordinates of POIs

In [11]:
def _all_points(poi_frame):
    """Report whether every geometry in ``poi_frame`` has geom_type "Point"."""
    return np.all(poi_frame.geometry.geom_type == "Point")


# Flag each POI category as point-only or not; the flag decides which of the
# two matching strategies below is applied to the category.
is_point = {category: _all_points(pois[category]) for category in pois}

<IPython.core.display.Javascript object>

In [12]:
# Coordinates are only extracted for the categories flagged as point-only.
point_categories = [category for category in pois if is_point[category]]
coords = {category: get_gdf_coords(pois[category]) for category in point_categories}

<IPython.core.display.Javascript object>

## Get closest POIs for point-based POIs

In [13]:
# The address coordinate array is identical for every category, so build it
# once instead of re-slicing the DataFrame inside the comprehension.
address_xy = df_addresses[["eastings", "northings"]].values

# For each point-based category, look up the closest POI for every address.
# The result is used in the next cell as positional indices into that
# category's POI rows.
closest_idxs = {
    category: get_closest_idxs(address_xy, category_coords)
    for category, category_coords in coords.items()
}

<IPython.core.display.Javascript object>

In [14]:
# Translate the positional matches into POI ids, category by category.
closest_ids = {
    category: pois[category].id.values[row_positions]
    for category, row_positions in closest_idxs.items()
}

<IPython.core.display.Javascript object>

## Get closest POIs for polygon-based POIs

In [15]:
from shapely.ops import nearest_points


<IPython.core.display.Javascript object>

In [16]:
# Categories that are not point-only keep their full geometries, indexed by
# POI id so they can be looked up by id later.
non_point_categories = [category for category in pois if not is_point[category]]
polygons = {
    category: pois[category].set_index("id").geometry
    for category in non_point_categories
}

<IPython.core.display.Javascript object>

In [17]:
def closest_loc_point_polygon(point, polygon):
    """Return the location on ``polygon`` that lies nearest to ``point``.

    ``nearest_points`` yields one point per input geometry, in argument
    order; the first entry is therefore the one belonging to ``polygon``.
    """
    nearest_pair = nearest_points(polygon, point)
    return nearest_pair[0]

<IPython.core.display.Javascript object>

In [18]:
# Build one point geometry per address from its eastings/northings, then
# attach the geometries to the address table.
address_points = gpd.points_from_xy(df_addresses.eastings, df_addresses.northings)
gdf_addresses = gpd.GeoDataFrame(df_addresses, geometry=address_points)

<IPython.core.display.Javascript object>

In [19]:
# For every non-point category, keep the first element returned by
# get_closest_pois_slow; it is used below as the POI id per address.
closest_ids_poly = {
    category: get_closest_pois_slow(geometries.reset_index(), gdf_addresses)[0]
    for category, geometries in polygons.items()
}

100%|██████████| 152/152 [00:01<00:00, 92.02it/s]


<IPython.core.display.Javascript object>

In [20]:
def _nearest_locations(category):
    """Nearest location on each address's matched geometry for ``category``."""
    matched_geometries = polygons[category].loc[closest_ids_poly[category]].values
    locations = [
        closest_loc_point_polygon(address_point, matched_geometry)
        for address_point, matched_geometry in zip(
            gdf_addresses.geometry, matched_geometries
        )
    ]
    return gpd.GeoSeries(locations)


# One GeoSeries per non-point category, aligned row-for-row with the addresses.
closest_points_poly = {
    category: _nearest_locations(category) for category in polygons
}

<IPython.core.display.Javascript object>

## Create Outputs

In [21]:
def _point_output_frame(category, poi_ids):
    """Output rows for one point-based category (coordinate columns left empty)."""
    return pd.DataFrame(
        {
            "address_id": df_addresses.address_id.values,
            "poi_id": poi_ids,
            "eastings": None,
            "northings": None,
            "poi_category": category,
        }
    )


# One output frame per point-based category.
output_dfs_points = {
    category: _point_output_frame(category, poi_ids)
    for category, poi_ids in closest_ids.items()
}

<IPython.core.display.Javascript object>

In [22]:
def _polygon_output_frame(category):
    """Output rows for one non-point category, with the nearest location's coordinates."""
    nearest_locations = closest_points_poly[category]
    return pd.DataFrame(
        {
            "address_id": df_addresses.address_id.values,
            "poi_id": closest_ids_poly[category],
            "eastings": nearest_locations.x.values,
            "northings": nearest_locations.y.values,
            "poi_category": category,
        }
    )


# One output frame per non-point category.
output_dfs_polys = {
    category: _polygon_output_frame(category) for category in closest_ids_poly
}

<IPython.core.display.Javascript object>

## Create table

In [23]:
# Column name -> SQL type for the closest_pois table.
cols = dict(
    eastings="DECIMAL(14,6)",
    northings="DECIMAL(14,6)",
    poi_id="INTEGER",
    address_id="INTEGER",
    poi_category="VARCHAR(64)",
)

# No uniqueness is requested through the helper; the composite unique
# constraint is added by a separate statement.
unique_cols = []
index_cols = ["poi_id", "address_id", "poi_category"]

<IPython.core.display.Javascript object>

In [24]:
# Ask the project helper for the statement that sets up the "closest_pois"
# table in the `searchname` schema, using the column types and index columns
# defined above.
# NOTE(review): whether the generated statement tolerates an already existing
# table depends on get_table_creation_query - confirm before re-running.
create_q = get_table_creation_query(
    "closest_pois", cols, searchname, index_cols, unique_cols
)

<IPython.core.display.Javascript object>

In [25]:
# Build a unique index on (poi_category, address_id) and attach it to the
# table as a named UNIQUE constraint, i.e. at most one closest-POI row per
# address and category.
# NOTE(review): there is no IF NOT EXISTS guard, so re-running against a
# table that already has this index/constraint presumably errors - confirm.
q_unique = f"""CREATE UNIQUE INDEX address_poi_category_closest_pois_categoryx
        ON {searchname}.closest_pois (poi_category,address_id);
        ALTER TABLE {searchname}.closest_pois 
        ADD CONSTRAINT unique_address_poi_category_closest_pois
        UNIQUE USING INDEX address_poi_category_closest_pois_categoryx;"""

<IPython.core.display.Javascript object>

In [26]:
# Run the two DDL statements in order on one connection: create_q first,
# then q_unique, which references the closest_pois table.
with engine.connect() as ddl_connection:
    for ddl_statement in (create_q, q_unique):
        ddl_connection.execute(ddl_statement)

<IPython.core.display.Javascript object>

## Combine outputs and write to database

In [27]:
# Stack every per-category frame (point-based categories first, then the
# polygon-based ones) into a single table with a fresh 0..n-1 index.
# A single pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0. It also avoids the "No objects to
# concatenate" error the two separate concats raised when only one of the two
# groups was empty.
output = pd.concat(
    [*output_dfs_points.values(), *output_dfs_polys.values()],
    ignore_index=True,
)

<IPython.core.display.Javascript object>

In [28]:
# Append the combined rows to <searchname>.closest_pois; the DataFrame index
# is not written.
with engine.connect() as write_connection:
    output.to_sql(
        name="closest_pois",
        con=write_connection,
        schema=searchname,
        if_exists="append",
        index=False,
    )

<IPython.core.display.Javascript object>