In [1]:
import re
import pandas as pd
import numpy as np

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [2]:
# Semicolon-separated providers file, located relative to this notebook.
path = "../data/Prestadores.csv"

# Only these columns are read from the file.
provider_columns = [
    "depa_nombre",
    "muni_nombre",
    "nits_nit",
    "razon_social",
    "direccion",
]

# index_col=False: do not use the first column as the index.
df = pd.read_csv(
    path,
    sep=";",
    encoding="utf8",
    index_col=False,
    usecols=provider_columns,
)

In [3]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Nominatim geocoding client.
nominatim = Nominatim(user_agent="test")
# Wrap the geocode call in a rate limiter so the service is not overloaded
# (max 1 call per second, hence the 1.1 s minimum delay between requests).
# `geolocator` is the throttled callable that the rest of the notebook uses.
geolocator = RateLimiter(nominatim.geocode, min_delay_seconds=1.1)

In [4]:
def geocode(query, city="bogotá colombia"):
    """Geocode an address within a particular city using the rate-limited geolocator.

    Parameters
    ----------
    query : str
        Street address to look up.
    city : str, optional
        Suffix appended to the address to restrict the search; defaults to
        "bogotá colombia", the city every address in this notebook belongs to.

    Returns
    -------
    The location returned by the geocoder, or None when no match is found
    (or when the rate limiter gave up after its retries).
    """
    return geolocator(f"{query}, {city}")


# geocode by address and compare against the coordinates reported by Google
test_address = geocode(df.direccion.iloc[0])
print('Geopy reuslt:\t', test_address)
if test_address is None:
    # No match: report it instead of failing on `.latitude` of None.
    print('Geopy coord:\t', None)
else:
    print('Geopy coord:\t', test_address.latitude, test_address.longitude)
print('Google coord: \t4.703611878558428, -74.04538972022057')

Geopy reuslt:	 Calle 125, Campania, UPZ Niza, Localidad Suba, Bogotá, Bogotá Distrito Capital - Municipio, RAP (Especial) Central, 111121, Colombia
Geopy coord:	 4.7088418 -74.0718151
Google coord: 	4.703611878558428, -74.04538972022057


## Standardize addresses

In [5]:
# Look at ten randomly chosen raw addresses (fixed seed) to see which formats occur.
df["direccion"].sample(n=10, random_state=42)

29                        DIAGONAL 115A No 70C-75 LOCAL 1
99                        CL 63 No. 11 - 45 OF. 209 y 213
678                                         AK 60 # 63 63
1399    TRANSVERSAL 73D AV. 1RA DE MAYO No 38C-41 SUR ...
185                                 CL 134 # 7B 83 CS 814
914                                 CALLE 17 # 10 16 PI 9
1137                       Calle 90 No 19A-49 OFICINA 902
906                        Carrera 56 B Bis A # 67 A - 72
844            Carrera 7 BIS No. 124 - 26 Oficina 701-702
1360    KR 22 # 100 24 piso1 piso3 consultorios 1 2 3 y 7
Name: direccion, dtype: object

Several addresses have a trailing part such as `local`, `cs`, `pi`, `piso`, `oficina`, ...; that part must be removed.

In [6]:
import sys

# Add the parent directory to the module search path so the `utils` package
# imported right after this can be found. The membership check keeps sys.path
# from accumulating duplicate entries every time this cell is re-run.
if '..' not in sys.path:
    sys.path.append('..')
from utils.misc_funcs import get_distance, standardize_address

In [7]:
# Spot-check the address cleaner on four rows, selected by position.
for row_position in (29, 99, 1399, 1360):
    raw_address = df.direccion.iloc[row_position]
    print(standardize_address(raw_address))

diagonal 115a #70c-75
cl 63 #11 - 45
transversal 73d av 1ra de mayo #38c-41 sur
kr 22 # 100 24


In [8]:
# Store the standardized form of every address in a new column,
# leaving the raw `direccion` column as it is.
df["direccion_std"] = df["direccion"].apply(standardize_address)

## Create a simulation on a new sample of addresses

In [9]:
from tqdm import tqdm

# Sample of claim addresses, keyed "claim1" .. "claim7" in this order.
claim_address = {
    f"claim{number}": {"address": address}
    for number, address in enumerate(
        [
            "Cra. 11 #82-71",
            "Cra. 15 #124-30",
            "Cl. 43 #7-65",
            "Cra. 7 #N. 28-66",
            "Cra. 6 #15-88",
            "Cra. 7 #40 - 62",
            "Dg. 61c #26-36",
        ],
        start=1,
    )
}

# Get coordinates for all claim addresses; each entry gains a 'coordinates' field.
for claim in tqdm(claim_address.values()):
    location = geocode(standardize_address(claim["address"]))
    # The geocoder yields None when an address cannot be resolved; store NaN in
    # that case (same convention as the provider sample) instead of crashing.
    if location is None:
        claim['coordinates'] = np.nan
    else:
        claim['coordinates'] = (location.latitude, location.longitude)

  0%|          | 0/7 [00:00<?, ?it/s]

100%|██████████| 7/7 [00:08<00:00,  1.16s/it]


In [10]:
# inspect the dictionary now that each claim also carries its geocoded coordinates
claim_address

{'claim1': {'address': 'Cra. 11 #82-71',
  'coordinates': (4.5667319, -74.1053956)},
 'claim2': {'address': 'Cra. 15 #124-30',
  'coordinates': (4.7088075, -74.0424709)},
 'claim3': {'address': 'Cl. 43 #7-65',
  'coordinates': (4.63014005, -74.06524866353007)},
 'claim4': {'address': 'Cra. 7 #N. 28-66',
  'coordinates': (4.5564374, -74.121314)},
 'claim5': {'address': 'Cra. 6 #15-88',
  'coordinates': (4.6691469, -74.0439787)},
 'claim6': {'address': 'Cra. 7 #40 - 62',
  'coordinates': (4.5564374, -74.121314)},
 'claim7': {'address': 'Dg. 61c #26-36',
  'coordinates': (4.6474259, -74.0741248)}}

In [11]:
# take a reproducible sample of 100 providers
df_sample = df.sample(100, random_state=42)

# Geocode each sampled provider's standardized address. The result list has one
# entry per sampled row, in the same order: a (latitude, longitude) tuple, or
# NaN when the geocoder returned nothing for that address.
sample_coord = []
for address in tqdm(df_sample.direccion_std):
    location = geocode(address)
    # Check for the "no result" case explicitly rather than catching
    # AttributeError, which could also hide unrelated attribute errors.
    if location is None:
        sample_coord.append(np.nan)
    else:
        sample_coord.append((location.latitude, location.longitude))

  7%|▋         | 7/100 [00:06<01:35,  1.02s/it]RateLimiter caught an error, retrying (0/2 tries). Called with (*('carrera 56 b bis a # 67 a - 72, bogotá colombia',), **{}).
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/comet/lib/python3.9/site-packages/urllib3/connectionpool.py", line 466, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/opt/homebrew/Caskroom/miniforge/base/envs/comet/lib/python3.9/site-packages/urllib3/connectionpool.py", line 461, in _make_request
    httplib_response = conn.getresponse()
  File "/opt/homebrew/Caskroom/miniforge/base/envs/comet/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/opt/homebrew/Caskroom/miniforge/base/envs/comet/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/opt/homebrew/Caskroom/miniforge/base/envs/comet/lib/python3.9/http/client.py", line 281, in _read_st