# Data Aquisition

## Loading Data and Libraries

In [1]:
import pycep_correios as pycep
import geocoder
import pandas as pd
import csv
import json
import os

DATAPATH = '../data/'

In [None]:
df_ceps = pd.read_csv(os.path.join(DATAPATH, 'raw', 'preprocessed', 'CEPs.csv'), header=None)
df_ceps = df_ceps[0]

## Informações básicas

### Amostra

In [6]:
df_ceps.sample(5)

14056    66816249
18038     9060000
10649    31578230
20524    26170060
368      69908550
Name: 0, dtype: int64

### Shape dos dados

In [7]:
df_ceps.shape

(1184,)

## Pré-processamento

Os CEPs estão no formato Int, com isso o 0 inicial é descartado, o que não é verdade para a lógica real do CEP.

In [8]:
df_ceps.apply(lambda x: len(str(x))).value_counts()

8    1061
7     123
Name: 0, dtype: int64

Transformação para String e adição do zero à esquerda, quando aplicável.

In [9]:
df_ceps = df_ceps.astype(str).apply(lambda x: x.zfill(8))

In [10]:
df_ceps.apply(lambda x: len(str(x))).value_counts()

8    1184
Name: 0, dtype: int64

## Obter localização (lat, long) a partir do CEP

### Consultar Endereço a partir do CEP através da API dos Correios

In [11]:
def get_address(cep):
    address = None
    try:
        address = pycep.consultar_cep(cep)
    except:
        pass
    return address

In [12]:
cep_address_map = {}

In [13]:
ceps = df_ceps.to_list()

In [14]:
i = 0
for cep in ceps:
    if i % 100 == 0: print(i)
    cep_address_map[cep] = get_address(cep)
    i += 1

0
100
200
300
400
500
600
700
800
900
1000
1100


In [15]:
cep_address_map_2 = {}
for i in cep_address_map:
    cep = cep_address_map[i]
    if cep != None:
        cep_address_map_2[i] = cep

In [16]:
df_address = pd.DataFrame(cep_address_map_2.values(), index=cep_address_map_2.keys())

In [17]:
df_address.head()

Unnamed: 0,bairro,cep,cidade,complemento2,end,uf,unidadesPostagem
60730265,Parque São José,60730265.0,Fortaleza,- de 1350/1351 ao fim,Rua Antônio Costa Mendes,CE,[]
53605000,Campina de Feira,53605000.0,Igarassu,"- do km 39,010 ao km 40,004 - lado par",Avenida Alfredo Bandeira de Melo,PE,[]
18700970,,,,,,,[]
27267112,Duzentos e Quarenta e Nove,27267112.0,Volta Redonda,- de 760 a 830 - lado par,Avenida Europa,RJ,[]
23076380,Campo Grande,23076380.0,Rio de Janeiro,,Rua Taufik Dib,RJ,[]


In [18]:
df_address.to_csv('../data/raw/address_novo.csv')

### Consultar a localização a partir do endereço através do Open Street Map

In [22]:
cep_location_map = {}

j = 0
for i, row in df_address.iterrows():
    if j % 100 == 0: print(j)
    
    cep_location_map[i] = {'lng': None, 'lat': None}
    
    if row['end'] and row['uf']:
    
        geo = geocoder.osm(row['end'] + ', ' + row['uf'] + ', Brasil')

        if geo.ok:
            cep_location_map[i] = {'lng': geo.osm['x'], 'lat': geo.osm['y']}
    j += 1

0


INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Ant%C3%B4nio+Costa+Mendes%2C+CE%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Avenida+Alfredo+Bandeira+de+Melo%2C+PE%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Avenida+Europa%2C+RJ%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Taufik+Dib%2C+RJ%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Pau+Brasil+s%2Fn%2C+PR%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Avenida+Ipiranga%2C+RS%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Jos%C3%A9+Matos+L

100


INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Para%C3%ADso%2C+CE%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Manoel+Martins+Seabra%2C+SP%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Jos%C3%A9+Vieira+de+Lima%2C+PE%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Conde+de+Agrolongo%2C+RJ%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Ricardo+Dalton%2C+SP%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Manoel+Marques+Lopes%2C+SP%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Delta%2C+

200


INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Avenida+Eliseu+de+Almeida%2C+SP%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Estrada+dos+Leites%2C+SP%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Rio+Grande+do+Sul%2C+RJ%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Hor%C3%A1cio+Soares+de+Oliveira%2C+SP%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Thom%C3%A1s+Alberto+Whately%2C+SP%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+G%2C+MG%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Urinde%

300


INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Jornalista+Jos%C3%A9+Olavo+Bispo%2C+AL%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Jornalista+Helv%C3%ADdio+Prisco%2C+MG%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Avenida+Professor+Makguti%2C+MG%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+T+30%2C+GO%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Darcy+Vargas%2C+ES%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Travessa+Tom%C3%A1s+Ildefonso%2C+CE%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search

400


INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Aventura%2C+GO%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Genivaldo+Correia+Lima%2C+PB%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Quatro%2C+MA%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Doutor+Apr%C3%ADgio+de+Menezes%2C+AM%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Alberto+de+Faria%2C+SP%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Tonelero%2C+RJ%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Poeta+Cam%C3%B5es%2C+RN%2C+B

500


INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Carlos+Maciel+Britto%2C+ES%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Maria+Borboleta%2C+MG%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+J7%2C+GO%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Luiz+Oscar+de+Carvalho%2C+SC%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Constante+Moro+Sobrinho%2C+PR%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Belarmina%2C+RJ%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Tr%C3%AAs%2C+RJ%2C+Brasil&f

600


INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Moacir+Emigio+Fabro%2C+RS%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+L%C3%A1zaro+Toledo+de+Queiroz%2C+SP%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Leopoldina+Monteiro+Firme%2C+ES%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Avenida+Bernardo+Manuel%2C+CE%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Dinarte+Domingues%2C+SC%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Elisi%C3%A1rio+de+Camargo+Branco%2C+SC%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstre

700


ERROR:geocoder.base:Status code Unknown from https://nominatim.openstreetmap.org/search: ERROR - HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=Rua+Doutor+Abel+Capela%2C+SC%2C+Brasil&format=jsonv2&addressdetails=1&limit=1 (Caused by ConnectTimeoutError(<urllib3.connection.VerifiedHTTPSConnection object at 0x0000021873BE2048>, 'Connection to nominatim.openstreetmap.org timed out. (connect timeout=5.0)'))
ERROR:geocoder.base:Status code Unknown from https://nominatim.openstreetmap.org/search: ERROR - HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=Rua+Jos%C3%A9+Antonio+do+Amaral%2C+SP%2C+Brasil&format=jsonv2&addressdetails=1&limit=1 (Caused by ConnectTimeoutError(<urllib3.connection.VerifiedHTTPSConnection object at 0x0000021873EE8630>, 'Connection to nominatim.openstreetmap.org timed out. (connect timeout=5.0)'))
ERROR:geocoder.base:Status code Unknown from https://no

800


INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Ibiuna%2C+RJ%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Leopoldino+Bastos%2C+RJ%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Avenida+Beira-Rio%2C+ES%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Rafael+Gabriel%2C+SP%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Maruim%2C+SP%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Avenida+Doutor+Jos%C3%A9+Singer%2C+SP%2C+Brasil&format=jsonv2&addressdetails=1&limit=1
INFO:geocoder.base:Requested https://nominatim.openstreetmap.org/search?q=Rua+Francisco+Poliza%2C+SP%2C+Brasil&fo

In [23]:
df_locations = pd.DataFrame(cep_location_map.values(), index=cep_location_map.keys())

In [24]:
df_locations.shape

(872, 2)

In [25]:
df_locations['lat'].isnull().sum() / df_locations.shape[0]

0.22362385321100917

In [26]:
df_locations.to_csv('../data/raw/locations.csv')

## Juntar informações

In [27]:
data = pd.merge(df_locations, df_address, left_index=True, right_index=True, how='inner')

In [28]:
data.head()

Unnamed: 0,lat,lng,bairro,cep,cidade,complemento2,end,uf,unidadesPostagem
60730265,-3.790178,-38.583916,Parque São José,60730265.0,Fortaleza,- de 1350/1351 ao fim,Rua Antônio Costa Mendes,CE,[]
53605000,,,Campina de Feira,53605000.0,Igarassu,"- do km 39,010 ao km 40,004 - lado par",Avenida Alfredo Bandeira de Melo,PE,[]
18700970,,,,,,,,,[]
27267112,-22.531716,-44.126833,Duzentos e Quarenta e Nove,27267112.0,Volta Redonda,- de 760 a 830 - lado par,Avenida Europa,RJ,[]
23076380,-22.877733,-43.576327,Campo Grande,23076380.0,Rio de Janeiro,,Rua Taufik Dib,RJ,[]


## Exportar dados

In [47]:
data.to_csv('../data/external/cep_location.csv')