#### Useful links

[Government portal](https://dados.gov.br/)

[How to get an API key](https://dados.gov.br/dados/conteudo/como-acessar-a-api-do-portal-de-dados-abertos-com-o-perfil-de-consumidor)

[API documentation](https://dados.gov.br/swagger-ui/index.html#/)

### Common requirements

#### Imports

In [3]:
import os
import json
import requests

from dotenv import load_dotenv
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.request import urlretrieve

If you need to debug the HTTP traffic, run the following snippet before making any request:

```python
import logging
import http.client

http.client.HTTPConnection.debuglevel = 10

# You must initialize logging, otherwise you'll not see debug output.
logging.basicConfig()
logging.getLogger().setLevel(logging.DEBUG)
requests_log = logging.getLogger("requests.packages.urllib3")
requests_log.setLevel(logging.DEBUG)
requests_log.propagate = True
```

#### Data directory

In [4]:
# Root folder for everything this notebook caches or downloads.
data_directory_path = "./data"

# Other cells write into and list this folder without ever creating it, so
# make sure it exists up front; `exist_ok` keeps the cell safe to re-run.
os.makedirs(data_directory_path, exist_ok=True)

#### Loading environment variables

In [5]:
# Location of the dotenv file, relative to the notebook's working directory.
# It is expected to define CHAVE_API_DADOS_GOV_BR, which the request headers read.
env_filepath = '.env'

In [6]:
# Read the dotenv file into the process environment. The call's return value
# is left as the cell's last expression so it is displayed below.
load_dotenv(dotenv_path=env_filepath)

True

#### Request headers and base URL

In [7]:
# Headers sent with every API call: ask for JSON and authenticate with the
# key taken from the environment (raises KeyError when the variable is unset).
request_headers = {}
request_headers['accept'] = 'application/json'
request_headers['chave-api-dados-abertos'] = os.environ["CHAVE_API_DADOS_GOV_BR"]

# Stored as a parsed URL; endpoints are resolved against its geturl() form
# with urljoin, which is why the trailing slash matters.
request_base_url = urlparse(
    'https://dados.gov.br/dados/api/publico/'
)

### Getting organizational data

#### Requesting the list of organizations

In [7]:
# Walk the paginated `organizacao` endpoint, starting at page 1, and collect
# every entry until the API answers with an empty page.
organizations = []
organizations_url = urljoin(request_base_url.geturl(), 'organizacao')
pagina = 0
while True:
    pagina += 1
    print(pagina, end=', ')

    request_params = {
        'pagina': pagina,
    }

    requested = requests.get(
        organizations_url,
        headers=request_headers,
        params=request_params,
        timeout=60,  # seconds; without it a stalled connection hangs the cell forever
    )
    # Fail loudly on 4xx/5xx (e.g. a rejected API key) instead of merging an
    # error payload into the list of organizations.
    requested.raise_for_status()

    # Decode the body once and reuse it for both the stop test and the merge.
    page = requested.json()
    if not page:
        break
    organizations.extend(page)


1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 

#### Requesting details of organizations

In [6]:
# Enrich each organization in place with the payload of its detail endpoint.
# Entries that already contain an 'ativo' key are skipped (presumably that
# key only arrives with the details), so re-running fetches only what is missing.
for o, org in enumerate(organizations, start=1):
    if 'ativo' in org:
        continue

    # No query string here: the detail endpoint is addressed by id only, and
    # reusing the pagination parameters of another cell would tie this cell
    # to whatever state that cell happened to leave behind.
    requested = requests.get(
        urljoin(request_base_url.geturl(), '/'.join(['organizacao', org['id']])),
        headers=request_headers,
        timeout=60,  # seconds; avoid hanging on a stalled connection
    )
    # Stop on HTTP errors rather than merging an error payload into the record.
    requested.raise_for_status()
    org.update(requested.json())

    # Progress counter, 20 positions per output line.
    print(o, end=', ' if o % 20 else '\n')

1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40
41, 42, 43, 44, 45, 

#### Dumping organizations data to a json file

In [48]:
import time

In [87]:
# One snapshot per day: the file name carries today's local date, and an
# existing snapshot for that date is never overwritten.
json_filename = time.strftime("organizations_%Y%m%d.json")
json_filepath = os.path.join(data_directory_path, json_filename)

snapshot_already_saved = os.path.exists(json_filepath)
if not snapshot_already_saved:
    with open(json_filepath, 'w+') as snapshot_file:
        json.dump(organizations, snapshot_file, indent=4)

### Loading from disk

#### Loading organizations from json file

In [41]:
# Snapshots are named organizations_YYYYMMDD.json, so the lexicographically
# greatest name is the most recent dump. Load that one; taking the first
# sorted name would pin the notebook to the oldest snapshot forever.
organization_snapshots = [
    f for f in os.listdir(data_directory_path)
    if f.startswith('organizations_') and f.endswith('.json')
]
if not organization_snapshots:
    raise FileNotFoundError(
        "no organizations_*.json snapshot found in " + data_directory_path
    )

json_filename = max(organization_snapshots)
json_filepath = os.path.join(data_directory_path, json_filename)

with open(json_filepath, 'r') as f:
    organizations = json.load(f)

#### Filtering the organizations to our needs

In [42]:
# Keep only the organizations whose 'nome' contains "universidade",
# compared case-insensitively.
organizations = list(
    filter(lambda org: 'universidade' in org['nome'].lower(), organizations)
)


In [44]:
# Replace each organization with its cached per-organization JSON when such
# a file exists in the data directory. The file name is derived from 'nome'
# (spaces and both slash kinds become underscores, everything lower-cased).
for o in range(len(organizations)):
    org = organizations[o]
    org_directory_name = (
        org['nome']
        .replace(' ', '_')
        .lower()
        .replace('/', '_')
        .replace('\\', '_')
        .strip()
    )
    org_directory_path = os.path.join(data_directory_path, org_directory_name)
    org_json_filepath = os.path.join(data_directory_path, org_directory_name + '.json')

    # Nothing cached for this organization: keep the in-memory entry.
    if not os.path.exists(org_json_filepath):
        continue

    with open(org_json_filepath, 'r') as f:
        organizations[o] = json.load(f)

#### Requesting the list of datasets from each organization

In [35]:
# Fetch, page by page, the list of datasets of every organization and store
# it under org['conjuntoDados'].
# The endpoint is resolved against the shared base URL, like the
# organization requests, instead of being spelled out a second time.
request_url = urljoin(request_base_url.geturl(), 'conjuntos-dados')

for o, org in enumerate(organizations, start=1):
    # Already populated (e.g. restored from the on-disk cache): nothing to do.
    if org.get('conjuntoDados'):
        continue
    # The organization reports zero datasets: skip the request entirely.
    if not int(org['qtdConjuntoDeDados']):
        continue
    print(o, org['titulo'], end=': ')

    conjunto_dados = []
    pagina = 0
    while True:
        pagina += 1
        print(pagina, end=', ')
        request_params = {
            'idOrganizacao': org['id'],
            'pagina': pagina,
        }

        requested = requests.get(
            request_url,
            headers=request_headers,
            params=request_params,
            timeout=60,  # seconds; avoid hanging on a stalled connection
        )
        # Fail loudly on HTTP errors instead of storing an error payload.
        requested.raise_for_status()

        # Decode once; an empty page marks the end of the listing.
        page = requested.json()
        if not page:
            break
        conjunto_dados.extend(page)

    org['conjuntoDados'] = conjunto_dados
    print('!')

#### Requesting dataset details

In [36]:
# For every organization without a cache file yet, merge the detail payload
# of each of its datasets into the dataset entry, then write the whole
# organization to <data directory>/<name derived from 'nome'>.json.
for o, org in enumerate(organizations):
    org_directory_name = org['nome'].replace(' ','_').lower().replace('/','_').replace('\\','_').strip()
    org_json_filepath = os.path.join(data_directory_path, org_directory_name + '.json')

    # A cache file means this organization was fully processed before.
    if os.path.exists(org_json_filepath):
        continue
    if not int(org['qtdConjuntoDeDados']):
        continue

    print(o, org['titulo'], end=':\n')
    for d, dado in enumerate(org['conjuntoDados']):
        # Resolve the endpoint against the shared base URL rather than
        # repeating the absolute address.
        request_url = urljoin(
            request_base_url.geturl(),
            '/'.join(['conjuntos-dados', dado['id']]),
        )
        response = requests.get(
            request_url,
            headers=request_headers,
            timeout=60,  # seconds; avoid hanging on a stalled connection
        )
        # An HTTP error aborts before the cache file is written, so a failed
        # run is retried from scratch instead of caching incomplete data.
        response.raise_for_status()
        response_json = response.json()
        if response_json:
            print('\t' + str(d), '(' + dado['title'] + ')')
            dado.update(response_json)
        else:
            # Always finish the progress line, even when nothing came back.
            print('\t' + str(d), '(no details returned)')

    with open(org_json_filepath, 'w') as f:
        json.dump(org, f, indent=4)

#### Retrieving dataset resources

In [37]:
# Per-organization cache files: every .json entry in the data directory
# except those whose name starts with 'organizations'.
organizations_jsons = [
    entry
    for entry in os.listdir(data_directory_path)
    if entry.endswith('.json')
    if not entry.startswith('organizations')
]

In [38]:
# Number of datasets held by each organization that has a non-empty list.
list(map(len, filter(None, (org.get('conjuntoDados') for org in organizations))))

[88]

In [39]:
# Restrict the download to the cache files whose name contains 'ufrn'.
# The loop variable is deliberately not called `json`: that name belongs to
# the imported module that other cells rely on.
organizations_jsons = [filename for filename in organizations_jsons if 'ufrn' in filename]
organizations_jsons

['universidade_federal_do_rio_grande_do_norte_-_ufrn.json']

In [40]:
# Download every resource of every dataset listed in the selected
# organization cache files into <data directory>/<cache file name without .json>/.
# Files already on disk are skipped, so the cell can be re-run to resume.
for organization_json_filepath in organizations_jsons:
    with open(os.path.join(data_directory_path, organization_json_filepath), 'r') as f:
        organization_json = json.load(f)
    organization_directory_name = os.path.splitext(organization_json_filepath)[0]
    organization_directory_path = os.path.join(data_directory_path, organization_directory_name)

    os.makedirs(organization_directory_path, exist_ok=True)

    print(organization_directory_name)

    for conjunto in organization_json['conjuntoDados']:
        print('\t', conjunto['title'])
        # A dataset entry may lack 'recursos' (e.g. when its detail payload
        # was never merged in); treat that as "nothing to download".
        for r, recurso in enumerate(conjunto.get('recursos') or []):
            if not recurso['formato']:
                continue
            url = urlparse(recurso['link'])
            # The local name is the last path segment of the link.
            filename = url.path.split('/')[-1]
            if not filename:
                # Link path ends with '/': there is no file name to save under.
                continue
            filepath = os.path.join(organization_directory_path, filename)
            if os.path.exists(filepath):
                continue
            print('\t\t', r, recurso['titulo'], filename)

            # Download under a temporary name and rename on success, so an
            # interrupted transfer never leaves a truncated file that the
            # existence check above would skip forever.
            partial_filepath = filepath + '.part'
            try:
                urlretrieve(url.geturl(), partial_filepath)
            except OSError as error:
                # urllib's URLError/HTTPError derive from OSError. Report the
                # broken link and carry on with the remaining resources.
                print('\t\t', 'download failed:', error)
                if os.path.exists(partial_filepath):
                    os.remove(partial_filepath)
                continue
            os.replace(partial_filepath, filepath)


universidade_federal_do_rio_grande_do_norte_-_ufrn
	 I Hackathon UFRN
	 Relação Empenhos COVID inscritos em RPNP
	 Acervo aquisição
	 Servidores Aposentados
	 Trabalhos de Conclusão de Curso
	 Requisição de Serviço para Licitação
	 Cursos de Graduação
	 Requisição de Parecer Técnico/Avaliação
	 Bolsas de Apoio
	 Requisição de Material para Importação
	 Requisição de Serviço para Pessoa Jurídica
	 Requisição de Reserva de Veículo
	 Requisição de Meio Ambiente
	 Requisição de Materiais Informacionais do Contrato
	 Requisição de Atividade de Campo
	 Requisição de Devolução de Material
	 Grupos de Pesquisa
	 Solicitação de Material de Registro de Preço
	 Requisições de Materiais
	 Componentes Curriculares
	 Requisição de Materiais Informacionais
	 Orientações de Docentes
	 Estruturas Curriculares
	 Bens Móveis
	 Processos Seletivos
	 Acervo da Biblioteca
	 Planos Individuais Docentes
	 Funções Gratificadas
	 AVALIAÇÕES DE DESEMPENHO DE DOCENTES
	 Convênios
	 Requisição de Material para P