# Geocode Addresses

In [1]:
# Load the nb_black IPython extension, which auto-formats executed cells with Black.
# The "<IPython.core.display.Javascript object>" line after each cell appears to be
# emitted by this extension.
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import logging

import sys

sys.path.append("/app")

# import scraping as sc

import pandas as pd

from jinja2 import Template

from db_utils import get_engine, get_table_creation_query

import geocoding as gc
import os

<IPython.core.display.Javascript object>

In [3]:
# Configure the root logger so that messages from this notebook and from imported
# modules (e.g. the "[geocoding]" lines in the outputs below) share one format.
logger = logging.getLogger()
formatter = logging.Formatter("%(asctime)s [%(name)s] %(levelname)-8s %(message)s")

# Re-running this cell must not attach a second StreamHandler to the root logger,
# otherwise every message is printed once per execution of the cell. The handler
# is tagged with a name so that an instance added by an earlier run is reused.
LOG_HANDLER_NAME = "notebook-stream"
handler = next(
    (h for h in logger.handlers if h.get_name() == LOG_HANDLER_NAME), None
)
if handler is None:
    handler = logging.StreamHandler()
    handler.set_name(LOG_HANDLER_NAME)
    logger.addHandler(handler)

# (Re)apply the formatter and level on every run so edits to them take effect.
handler.setFormatter(formatter)
logger.setLevel(logging.INFO)

<IPython.core.display.Javascript object>

## Connect to Database

In [4]:
# Database credentials are read from the environment so that no secret is stored
# in the notebook. os.getenv yields None for any variable that is not set.
user, password, host = map(
    os.getenv, ("POSTGRES_USER", "POSTGRES_PASSWORD", "POSTGRES_HOST")
)

<IPython.core.display.Javascript object>

In [5]:
# Build the database engine from the user, password and host read from the
# environment. get_engine is a project helper imported from db_utils.
# NOTE(review): presumably returns a SQLAlchemy Engine, since later cells call
# engine.connect() and hand the connection to pandas — confirm in db_utils.
engine = get_engine(user, password, host)

<IPython.core.display.Javascript object>

## Define search parameters

In [6]:
# Name of the search being processed. It is interpolated as the schema name into
# every SQL statement in this notebook and passed as `schema=` to DataFrame.to_sql.
searchname = "tamzin"

<IPython.core.display.Javascript object>

## Create View for Addresses to Process

In [7]:
# DDL for a view exposing (id, address) of every row in unique_addresses whose id
# appears as address_id in address_ids_to_process.
# `searchname` is interpolated directly as the schema identifier, so it must
# remain a trusted, hard-coded value (never user input).
q_view = f"""CREATE OR REPLACE VIEW {searchname}.addresses_to_process AS
SELECT u.id,u.address
FROM {searchname}.unique_addresses u
INNER JOIN
{searchname}.address_ids_to_process a
on a.address_id=u.id
"""

<IPython.core.display.Javascript object>

In [8]:
# Create (or replace) the addresses_to_process view.
# engine.begin() opens an explicit transaction that is committed when the block
# exits normally and rolled back if execute() raises, so the DDL no longer relies
# on the library implicitly auto-committing it.
# NOTE(review): assumes `engine` is a SQLAlchemy Engine — confirm in db_utils.
with engine.begin() as conn:
    conn.execute(q_view)

<IPython.core.display.Javascript object>

## Load data

In [9]:
# Pull every row of the addresses_to_process view into a DataFrame.
with engine.connect() as connection:
    df = pd.read_sql(
        sql=f"SELECT * FROM {searchname}.addresses_to_process",
        con=connection,
    )

<IPython.core.display.Javascript object>

In [10]:
# Report how many addresses were loaded. The count is passed as a logger argument
# instead of being pre-formatted in an f-string, so interpolation only happens if
# the record is actually emitted; the rendered message is unchanged.
logger.info("Loaded %d addresses to geocode", len(df))

2021-05-28 13:56:14,549 [root] INFO     Loaded 155 addresses to geocode


<IPython.core.display.Javascript object>

In [11]:
# Geocode the loaded addresses with the project's geocoding module (imported as
# `gc` — this is not the stdlib garbage-collector module).
# In the recorded run this took about 77 s for 155 addresses.
# The result is expected to carry at least the columns id, address, latitude and
# longitude, because those are selected from it when writing to the database.
df_results = gc.query_addresses(df)

2021-05-28 13:56:14,560 [geocoding] INFO     Querying geolocations...
100%|██████████| 155/155 [01:17<00:00,  2.01it/s]
2021-05-28 13:57:31,637 [geocoding] INFO     Compiling results...
2021-05-28 13:57:31,641 [geocoding] INFO     155 of 155 successfully fetched (100.00%)


<IPython.core.display.Javascript object>

In [12]:
# Report how many addresses came back from geocoding. The count is passed as a
# logger argument rather than pre-formatted, so interpolation is deferred until
# the record is emitted; the rendered message is unchanged.
logger.info("Geocoded %d addresses", len(df_results))

2021-05-28 13:57:31,653 [root] INFO     Geocoded 155 addresses


<IPython.core.display.Javascript object>

## Create geocoded table

In [13]:
# Column name -> SQL type for the geocoded_addresses table (insertion order kept).
cols = dict(
    latitude="DECIMAL(10,6)",
    longitude="DECIMAL(10,6)",
    address_id="INTEGER",
    geocoded_address="VARCHAR(256)",
)

# address_id is listed both as a unique column and as an index column; the two
# lists are deliberately separate objects.
unique_cols = ["address_id"]
index_cols = ["address_id"]

<IPython.core.display.Javascript object>

In [14]:
# Build the creation statement for the geocoded_addresses table with the project
# helper from db_utils. Arguments passed, in order: table name, column/type
# mapping, search (schema) name, index columns, unique columns.
# NOTE(review): the meaning of each positional argument is inferred from the
# variables passed in; confirm against get_table_creation_query's signature.
create_q = get_table_creation_query(
    "geocoded_addresses", cols, searchname, index_cols, unique_cols
)

<IPython.core.display.Javascript object>

In [15]:
# Run the table-creation statement.
# engine.begin() wraps it in an explicit transaction: committed when the block
# exits normally, rolled back if execute() raises, instead of depending on
# implicit auto-commit of DDL.
# NOTE(review): assumes `engine` is a SQLAlchemy Engine — confirm in db_utils.
with engine.begin() as conn:
    conn.execute(create_q)

<IPython.core.display.Javascript object>

## Add to table

In [16]:
# Keep only the columns stored in geocoded_addresses and rename them to the
# table's column names before appending.
geocoded_rows = df_results[["id", "address", "latitude", "longitude"]].rename(
    columns={"id": "address_id", "address": "geocoded_address"}
)

# Append the rows to <searchname>.geocoded_addresses; the DataFrame index is not
# written as a column.
with engine.connect() as connection:
    geocoded_rows.to_sql(
        name="geocoded_addresses",
        con=connection,
        schema=searchname,
        if_exists="append",
        index=False,
    )

<IPython.core.display.Javascript object>

## Note invalid addresses

In [17]:
# Anti-join: every address_id in address_ids_to_process that has no matching row
# in geocoded_addresses is inserted into invalid_addresses.
# NOTE(review): nothing visible here prevents a re-run of the notebook from
# inserting the same ids again; that depends on constraints defined on
# invalid_addresses — confirm.
q_add_invalid = f"""INSERT INTO {searchname}.invalid_addresses (address_id)
SELECT a.address_id
FROM   {searchname}.address_ids_to_process a
WHERE  NOT EXISTS (
   SELECT  -- SELECT list mostly irrelevant; can just be empty in Postgres
   FROM   {searchname}.geocoded_addresses g
   WHERE  g.address_id = a.address_id
   );
"""

<IPython.core.display.Javascript object>

In [18]:
# Record the ids that could not be geocoded.
# engine.begin() runs the INSERT in an explicit transaction: committed when the
# block exits normally, rolled back if execute() raises, rather than relying on
# implicit auto-commit.
# NOTE(review): assumes `engine` is a SQLAlchemy Engine — confirm in db_utils.
with engine.begin() as conn:
    conn.execute(q_add_invalid)

<IPython.core.display.Javascript object>