# Recuperar lista de cidades no IBGE da mesorregião da Zona da Mata Mineira

## Objetivos

Recuperar dados das cidades da mesorregião da Zona da Mata Mineira


In [42]:
# curl -i -H "Accept: application/json" -X GET https://servicodados.ibge.gov.br/api/v1/localidades/mesorregioes/3112/municipios
# Link para api do IBGE
# https://servicodados.ibge.gov.br/api/docs/localidades?versao=1#api-Mesorregioes-mesorregioesMesorregiaoGet
# SIOPE
# https://www.fnde.gov.br/siope/indicadoresFinanceirosEEducacionais.do?acao=PESQUISAR&anoPaginacao=2011&paginacao=-&pag=result&cod_uf=31&municipios=317130
import os, ssl
import re
import numpy as np
import pandas as pd
import requests
import urllib.request
from bs4 import BeautifulSoup
from lxml import html

In [2]:
# List the municipalities of the Zona da Mata Mineira mesoregion (id 3112 in
# the IBGE localities API) and load them into a pandas DataFrame.

# The timeout keeps the cell from hanging indefinitely if the API stalls.
r = requests.get(
    'https://servicodados.ibge.gov.br/api/v1/localidades/mesorregioes/3112/municipios',
    timeout=30,
)
# Fail loudly on an HTTP error instead of trying to parse an error body as JSON.
r.raise_for_status()
json_cities = r.json()

# Ids are cast to str explicitly because the SIOPE-code cell slices this column
# with the `.str` accessor. Building the frame from a plain list of records
# (rather than a NumPy array) also yields an empty, correctly-labelled frame
# when the API returns no municipalities.
cities = [[str(city['id']), city['nome']] for city in json_cities]

data = pd.DataFrame(cities, columns=['city_id', 'city_name'])

In [3]:
# Derive the SIOPE/FNDE-compatible code: the first six characters of the city id.
data['city_id_siope'] = data['city_id'].str.slice(stop=6)

In [4]:
# Preview the first rows of the frame, including the derived city_id_siope column.
data.head()

Unnamed: 0,city_id,city_name,city_id_siope
0,3100302,Abre Campo,310030
1,3100401,Acaiaca,310040
2,3101508,Além Paraíba,310150
3,3102050,Alto Caparaó,310205
4,3102100,Alto Rio Doce,310210


## Baixar e raspar página web com BeautifulSoup

In [5]:
# Allow scraping a site whose SSL certificate does not validate.
# WARNING: this swaps the ssl module's default HTTPS context factory for the
# unverified one, so certificate checks are disabled for every later default
# HTTPS request made through urllib in this kernel, not only for the scrape.
# The override is skipped when PYTHONHTTPSVERIFY is set to a non-empty value
# or when this Python build has no `_create_unverified_context`.
unverified_context_factory = getattr(ssl, '_create_unverified_context', None)
https_verify_requested = os.environ.get('PYTHONHTTPSVERIFY', '')

if unverified_context_factory and not https_verify_requested:
    ssl._create_default_https_context = unverified_context_factory

In [13]:
# SIOPE indicators page for a single municipality; the query string selects
# cod_uf=31 and municipios=317130.
URL = 'https://www.fnde.gov.br/siope/indicadoresFinanceirosEEducacionais.do?acao=PESQUISAR&anoPaginacao=2011&paginacao=-&pag=result&cod_uf=31&municipios=317130'

# The context manager closes the HTTP response as soon as the body is read;
# the timeout keeps the cell from hanging if the server stops responding.
with urllib.request.urlopen(URL, timeout=60) as response:
    PAGE = response.read()

# Name the parser explicitly so the parse tree does not depend on which
# parsers happen to be installed (and to avoid bs4's "no parser specified"
# warning). lxml is already a dependency of this notebook.
soup = BeautifulSoup(PAGE, 'lxml')

In [15]:
# Show only the beginning of the prettified document: printing the whole page
# floods the notebook output and bloats the saved file.
print(soup.prettify()[:1000])

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
 <!-- InstanceBegin template="/Templates/report_template.dwt" codeOutsideHTMLIsLocked="false" -->
 <head>
  <!-- InstanceBeginEditable name="doctitle" -->
  <title>
   SIOPE
  </title>
  <!-- InstanceEndEditable -->
  <meta content="English" http-equiv="Content-Language"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <link href="style.css" rel="stylesheet" type="text/css"/>
  <link href="favicon.ico" rel="shortcut icon" type="image/x-icon"/>
  <!-- InstanceBeginEditable name="head" -->
  <!-- InstanceEndEditable -->
  <script language="javascript" src="functions.js">
  </script>
  <!-- InstanceBeginEditable name="ContentRegion" -->
  <script type="text/javascript">
   var pkPageLoadStart = new Date();
  </script>
 </head>
 <body>
  <script language="JavaScript1.2" type="text/javascript">
   function submitF

In [97]:
# Minimum number of non-empty cells for a row to be kept as an indicator row
# (same threshold as the former `len(item) > 7`). Shorter rows are skipped.
MIN_INDICATOR_CELLS = 8
# Cells 0 and 1 hold the indicator code and its description; the values that
# need cleaning start at index 2.
# NOTE(review): empty cells are dropped before this point, so a row with a
# blank value would shift positions -- confirm against the live page.
FIRST_VALUE_CELL = 2


def _clean_value(text):
    """Remove the non-breaking-space + '%' suffix and turn the decimal comma into a dot."""
    return text.replace('\xa0%', '').replace(',', '.')


tables = soup.find_all('table', {'class': 'table'})
if not tables:
    # Give a readable error instead of a bare IndexError on tables[0].
    raise ValueError("no <table class=\"table\"> element found in the downloaded page")

storeTable = tables[0].find_all('tr')

# Stripped text of every non-empty <td> cell, one list per table row.
tabledata = []
for table_row in storeTable:
    cell_texts = [cell.text.strip() for cell in table_row.find_all('td')]
    tabledata.append([text for text in cell_texts if text])

# Keep only indicator rows and clean their value cells. The code and the
# description are left untouched, so commas inside the description text are no
# longer rewritten as periods.
finaldata = []
for item in tabledata:
    if len(item) < MIN_INDICATOR_CELLS:
        continue
    labels = item[:FIRST_VALUE_CELL]
    values = [_clean_value(value) for value in item[FIRST_VALUE_CELL:]]
    finaldata.append(labels + values)

# Build the frame straight from the list of rows so that rows of different
# lengths are padded with missing values instead of producing a ragged array.
pd.DataFrame(finaldata)

# TODO extract table headers
# TODO extract all tables in a given page

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.1,Percentual de aplicação das receitas de impost...,29.04,31.2,26.48,26.81,28.11,27.17
1,1.2,Percentual de aplicação do FUNDEF ou FUNDEB na...,97.41,64.89,95.3,93.76,87.02,63.42
2,1.3,Percentual de aplicação do FUNDEF ou FUNDEB em...,1.69,35.11,2.56,6.23,12.97,36.47
3,1.4,Percentual das receitas do FUNDEF ou FUNDEB nã...,0.9,0.0,2.14,0.0,0.01,0.11
