<a href="https://colab.research.google.com/github/josegoisgit/dadosgov/blob/main/query_dadosgov.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# import

In [3]:
!pip install unidecode

Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/9e/25/723487ca2a52ebcee88a34d7d1f5a4b80b793f179ee0f62d5371938dfa01/Unidecode-1.2.0-py2.py3-none-any.whl (241kB)
[K     |█▍                              | 10kB 17.1MB/s eta 0:00:01[K     |██▊                             | 20kB 23.4MB/s eta 0:00:01[K     |████                            | 30kB 23.8MB/s eta 0:00:01[K     |█████▍                          | 40kB 26.0MB/s eta 0:00:01[K     |██████▉                         | 51kB 23.2MB/s eta 0:00:01[K     |████████▏                       | 61kB 19.9MB/s eta 0:00:01[K     |█████████▌                      | 71kB 19.6MB/s eta 0:00:01[K     |██████████▉                     | 81kB 18.5MB/s eta 0:00:01[K     |████████████▏                   | 92kB 16.6MB/s eta 0:00:01[K     |█████████████▋                  | 102kB 16.8MB/s eta 0:00:01[K     |███████████████                 | 112kB 16.8MB/s eta 0:00:01[K     |████████████████▎               | 12

In [4]:
import os
import re
from itertools import chain

import IPython
import pandas
import requests
import unidecode
from lxml.etree import HTML

# funções

In [13]:

def strip(X):
    """Join the distinct, non-blank strings of X into one newline-separated string.

    Each item is whitespace-trimmed; blank items and repeated items are dropped.
    First-seen order is kept so the result is the same on every run.

    X may be None or empty (e.g. an XPath lookup that matched nothing), in which
    case an empty string is returned.
    """
    if not X:
        return ''
    trimmed = (x.strip() for x in X)
    # dict.fromkeys de-duplicates while preserving insertion order.
    unique = dict.fromkeys(t for t in trimmed if t)
    return '\n'.join(unique)


def plain(i):
    """Flatten one level of nesting: an iterable of iterables becomes a flat list.

    None entries (e.g. a page lookup that matched nothing) are skipped instead of
    raising a TypeError.
    """
    return list(chain.from_iterable(sub for sub in i if sub is not None))


### Requisição GET e HTML DOM (Document Object Model)

DOM: modelo de objeto de documento — a árvore de elementos HTML que as funções abaixo consultam via XPath.

In [14]:

def get_url_html(url, timeout=30):
    """Download `url` with an HTTP GET and parse the response body as HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout the
        request could block the notebook indefinitely.

    Returns
    -------
    The root element produced by `lxml.etree.HTML` for the response text.
    """
    response = requests.get(url, timeout=timeout)
    html = HTML(response.text)
    # NOTE(review): lxml documents `nsmap` as a read-only view (changing the
    # returned dict has no effect on the element), so this assignment probably
    # does not persist the URL on the tree — confirm before relying on it.
    html.nsmap['url'] = url

    return html


In [15]:

def contains_clause(dic, operator='OR'):
    """Build an XPath predicate body from attribute/substring pairs.

    Each `attribute: text` pair of `dic` becomes `contains(@attribute,'text')`.
    The predicates are combined with `operator` ('OR' or 'AND', any case).

    XPath boolean operators are lowercase and must be separated from their
    operands by whitespace, so the operator is normalised before joining.
    Values are inserted verbatim: a value containing a single quote will
    produce an invalid expression.
    """
    predicados = ["contains(@{k},'{v}')".format(k=k, v=v) for k, v in dic.items()]
    separator = ' {} '.format(operator.strip().lower())

    return separator.join(predicados)



In [16]:

def compose_xpath(element,attribute,content):
    """Return `element` followed by a bracketed `contains(...)` predicate.

    The predicate is produced by `contains_clause` for the single pair
    `attribute: content`, giving `element[<predicate>]`.
    """
    predicate = contains_clause({attribute: content})
    return "{element}[{dictionary}]".format(element=element, dictionary=predicate)


In [17]:

def get_xnodes(html, element, attribute='class', content='', complement=''):
    """Evaluate an XPath query on `html` and return its result, or None if empty.

    The query is `compose_xpath(element, attribute, content)`, with `complement`
    appended to it when `complement` is truthy. A falsy result (such as an empty
    list) is reported as None.
    """
    query = compose_xpath(element, attribute, content)
    if complement:
        query = query + complement

    found = html.xpath(query)
    return found or None


In [18]:

def get_xnode(html, element, attribute='class', content='', complement=''):
    """Return the first result of `get_xnodes` for the same arguments, or None."""
    matches = get_xnodes(html, element, attribute, content, complement)
    if not matches:
        return None
    return matches[0]


### informações da busca

In [19]:

def get_html_page_count(html):
    """Return the highest page number shown in the page's pagination block.

    Looks up the first `div` whose class contains 'pagination' and reads the
    text of its `li/a` links. Link texts that are not integers are ignored.
    Returns 1 when there is no pagination block or no numeric link.
    """
    xnode = get_xnode( html, '//div', 'class', 'pagination' )

    # No pagination block: the result list is treated as a single page.
    if xnode is None:
        return 1

    page_numbers = [1]
    # './/' keeps the search inside the pagination block; a leading '//' would
    # scan the whole document again.
    for label in xnode.xpath( './/li/a/text()' ):
        try:
            page_numbers.append(int(label))
        except ValueError:
            # Link text that is not an integer carries no page number.
            continue

    return max(page_numbers)


In [78]:

def get_query_page_count(hyperlink, query):
    """Fetch the search page for `query` on `hyperlink` and return its page count."""
    search_url = hyperlink + ('/dataset?q=%s' % query)
    search_html = get_url_html(search_url)
    return get_html_page_count(search_html)


In [21]:

def get_html_items_href(html):
    """Return the `href` values of links under `h3` elements whose class contains
    'dataset-heading', or None when `get_xnodes` finds nothing."""
    return get_xnodes(html, './/h3', 'class', 'dataset-heading', '//a/@href')


### informações do dataset

In [22]:

def get_html_dataset(html):
    """Extract title, key, description text and breadcrumb links of a dataset page.

    Returns a dict with:
      - 'title': text found at `div/h1` under the page's first `article`;
      - 'key':   the title transliterated with unidecode, lowercased, with every
                 non-word character replaced by '_';
      - 'text':  paragraph text of the `div` whose class contains 'notes';
      - 'url':   `href` values of the breadcrumb links (None when there are none).

    'title' and 'text' are empty strings when the corresponding nodes are missing.
    """
    article_html = get_xnode(html , '//article')

    # get_xnodes returns None when nothing matches; normalise to an empty list
    # so a page without title/notes yields '' instead of raising.
    title_nodes = get_xnodes( article_html , 'div/h1/text()') or []
    notes_nodes = get_xnodes(article_html,'//div','class','notes','//p/text()') or []

    dataset = dict()

    dataset['title'] = strip( title_nodes )
    # Raw string: '\w' is not a valid escape in a normal string literal.
    dataset['key']   = re.sub(r'[^\w]','_',unidecode.unidecode(dataset['title']).lower())
    dataset['text']  = strip( notes_nodes )
    dataset['url']   = get_xnodes( article_html, '//ol', 'class', 'breadcrumb', '//li/a/@href')

    return dataset


In [23]:

def get_html_dataset_license(html,hyperlink='https://dados.gov.br'):
    """Collect license information from a dataset page.

    Reads the first `section` whose class contains 'license' and returns a dict
    of lists:
      - 'title': every `title` attribute inside the section;
      - 'href':  every `href` attribute inside the section;
      - 'text':  every non-blank text node, whitespace-trimmed;
      - 'logo':  every image `src` inside the section, prefixed with `hyperlink`.

    When the page has no such section, all four lists are empty.
    """
    xnode = get_xnode( html, '//section','class','license')

    # A page without a license section should not abort the whole scrape.
    if xnode is None:
        return {'title': [], 'href': [], 'text': [], 'logo': []}

    return {
        'title': xnode.xpath( './/@title' ),
        'href':  xnode.xpath( './/@href' ),
        'text':  [ text.strip() for text in xnode.xpath( './/text()' ) if text.strip() ],
        'logo':  [ hyperlink + p for p in xnode.xpath( './/img//@src' ) ],
    }


In [24]:

def get_html_dataset_resources_href(html):
    """Return the `href` of every `a` element whose class contains 'heading',
    or None when `get_xnodes` finds nothing."""
    resource_links = get_xnodes(html, '//a', 'class', 'heading', '/@href')
    return resource_links


### informações do recurso

In [25]:

def get_html_resource_info(html):
    """Read the header/value table of a resource page into a dict.

    Header cells (`th`) and value cells (`td`) are taken from the `tbody` of
    tables whose class contains 'table-condensed' and paired in document order.
    Header text is lowercased, transliterated with unidecode, and every non-word
    character is replaced by '_' to form the key. The 'url' entry holds the
    `href` of the first `a` whose class contains 'resource-url-analytic'
    (None if absent).

    Missing tables or empty cells yield no entries / empty strings instead of
    raising.
    """
    # get_xnodes returns None when nothing matches; fall back to empty lists.
    ths = get_xnodes( html, '//table', 'class', 'table-condensed','/tbody//th/text()') or []
    tds = get_xnodes( html, '//table', 'class', 'table-condensed','/tbody//td') or []

    # Raw string: '\w' is not a valid escape in a normal string literal.
    headers = [re.sub(r'[^\w]','_',unidecode.unidecode(th.lower())) for th in ths]
    data    = [strip(get_xnodes(td, './/text()') or []) for td in tds]

    info_dict        = dict(zip(headers, data))
    info_dict['url'] = get_xnode( html, '//a','class','resource-url-analytic','/@href')

    return info_dict

### auxiliares

In [None]:

def __dir_repr__(variable=dict,pattern=''):
    """Print the attribute names of `variable`, grouped by the type of their value.

    Only names in which the regular expression `pattern` is found are listed.
    For each value type, the type name is printed, then a blank line, then one
    tab-prefixed attribute name per line, then another blank line.
    """
    grouped = {}

    for name in dir(variable):
        if re.search(pattern, name):
            member = getattr(variable, name)
            grouped.setdefault(type(member).__name__, []).append((name, member))

    for type_name, members in grouped.items():
        print(type_name)
        print()
        for name, _member in members:
            print('\t', name)
        print()


In [None]:

def show_url( url='https://dados.gov.br'):
    """Display `url` inside a full-width, 500px-high IFrame in the notebook output."""
    display(IPython.display.IFrame(src=url, width='100%', height='500px'))


In [None]:

def query_site( query='', hyperlink='https://dados.gov.br'):
    """Show the dataset search results for `query` in an IFrame and return the frame.

    The query is lowercased and transliterated with unidecode; every character
    other than a-z, whitespace and quote marks is removed, and each run of
    whitespace becomes a single '+'.

    Parameters
    ----------
    query : str
        Free-text search terms.
    hyperlink : str, optional
        Base address of the portal to search (no trailing slash).
    """
    query = query.lower()
    query = unidecode.unidecode(query)
    # Raw strings: '\s' is not a valid escape in a normal string literal.
    query = re.sub(r'[^a-z\s\'"]','',query)
    query = re.sub(r'\s+','+',query)

    url = '%s/dataset?q=%s' % (hyperlink, query)
    dadosgov_frame = IPython.display.IFrame(src=url,width='100%',height='500px')
    display(dadosgov_frame)
    return dadosgov_frame



In [109]:

def get_query_page_url(h, q, p):
    """Return the search URL on site `h` for query `q`, results page `p`."""
    return h + '/dataset?q=' + q + '&page=' + str(p)


def get_query_page_html(h, q, p):
    """Fetch and parse results page `p` of query `q` on site `h`."""
    return get_url_html(get_query_page_url(h, q, p))


def get_query_page_items(h, q, p):
    """Return the dataset links found on results page `p` of query `q`."""
    return get_html_items_href(get_query_page_html(h, q, p))


def get_query_items_href(h, q):
    """Return the dataset links of every results page of query `q`, flattened."""
    page_numbers = range(1, get_query_page_count(h, q) + 1)
    return plain([get_query_page_items(h, q, p) for p in page_numbers])


# Argumentos de busca

In [110]:
import ipywidgets

In [111]:
# Initial search arguments: the text to search for and the portal's base address.
query     = 'UFRN'
hyperlink = 'https://dados.gov.br'

In [112]:
# Label + editable text box for the query, pre-filled with the current `query`.
qri_label = ipywidgets.widgets.Label('Query')
qri_text  = ipywidgets.widgets.Text(query)

# Label + editable text box for the base address, pre-filled with `hyperlink`.
hyperlink_text  = ipywidgets.widgets.Text(hyperlink)
hyperlink_label = ipywidgets.widgets.Label('Hyperlink')

In [113]:
# Put each label next to its text box in a horizontal row.
qri_box = ipywidgets.widgets.HBox([qri_label, qri_text])
hyperlink_box = ipywidgets.widgets.HBox([hyperlink_label, hyperlink_text])

In [114]:
# Show both rows stacked vertically as a small input form.
display(ipywidgets.widgets.VBox([qri_box,hyperlink_box]))

VBox(children=(HBox(children=(Label(value='Query'), Text(value='UFRN'))), HBox(children=(Label(value='Hyperlin…

In [127]:
# Read the (possibly edited) widget contents back into the search arguments.
# NOTE(review): results below depend on what was typed into the widgets, so a
# fresh Run All reproduces them only with the defaults.
query     = qri_text.value
hyperlink = hyperlink_text.value

In [128]:
# URL of the first results page for the current query.
query_url = get_query_page_url(hyperlink,query,1)

In [129]:
# Preview the first results page inside the notebook.
show_url(query_url)

In [130]:
# Number of result pages reported for the query (displayed as the cell output).
page_count = get_query_page_count( hyperlink, query )
page_count

1

In [131]:
# Start with an empty list of resource records and collect the link of every
# dataset returned by the search (all result pages).
resources = []
items_href = get_query_items_href(hyperlink,query)

In [138]:
# Preview the first resource page of the first dataset found by the search.
# The resource links are looked up here so this cell only depends on
# `hyperlink` and `items_href`, not on variables created by later cells.
if items_href:
    first_dataset_html   = get_url_html( hyperlink + items_href[0] )
    first_resources_href = get_html_dataset_resources_href( first_dataset_html )
    if first_resources_href:
        show_url(hyperlink + first_resources_href[0])

In [140]:
# Visit every dataset found by the search, then every resource of each dataset,
# and collect one record (dict) per resource in `resources`.

# Prefix stripped from each resource URL to obtain the direct link stored in 'url_out'.
landpage_prefix = 'http://landpage-h.cgu.gov.br/dadosabertos/index.php?url='

# Start from an empty list so re-running this cell does not duplicate records.
resources = []

for item_href in items_href:
    item_url = hyperlink + item_href

    print('\n\n\t', item_url, end='\n\n')
    dataset_html   = get_url_html( item_url )

    dataset        = get_html_dataset( dataset_html )
    license        = get_html_dataset_license ( dataset_html )
    # None is returned when the dataset page lists no resources.
    resources_href = get_html_dataset_resources_href( dataset_html ) or []

    for resource_href in resources_href:
        resource_url  = hyperlink + resource_href

        resource_html = get_url_html( resource_url )
        resource_dict = get_html_resource_info(resource_html)

        if resource_dict['url']:
            resource_dict['url_out'] = resource_dict['url'].replace(landpage_prefix,'')
        else:
            resource_dict['url_out'] = None
        # Each record carries its own copy of the dataset info; the license dict is shared.
        resource_dict['dataset'] = dataset.copy()
        resource_dict['license'] = license

        print('\t\t',resource_dict['url_out'], end='\n')

        resources.append( resource_dict )
    


SyntaxError: ignored

In [141]:
# One row per collected resource record.
df_datagov = pandas.DataFrame(resources)

In [142]:
# Display the resource table.
df_datagov

Unnamed: 0,ultima_atualizacao,criado,formato,licenca,created,datastore_active,format,id,last_modified,package_id,resource_type,revision_id,size,state,webstore_last_updated,webstore_url,url,url_out,dataset,license,position
0,2/Abril/2021,15/Agosto/2017,CSV,Other (Open),há mais de 3 anos,True,CSV,ff0a457e-76fa-4aca-ad99-48aebd7db070,22 dias atrás,8bf1a468-48ff-4f4d-95ee-b17b7a3a5592,csv,b51067ab-ca4a-46a5-a0a6-a05eeb4f5f72,"741,8 KiB",active,,,http://landpage-h.cgu.gov.br/dadosabertos/inde...,http://dados.ufrn.br/dataset/8bf1a468-48ff-4f4...,"{'title': 'Docentes', 'key': 'docentes', 'text...",{'title': ['Este conjunto de dados satisfaz a ...,
1,1/Julho/2019,14/Outubro/2016,PDF,Other (Open),há mais de 4 anos,,PDF,83988d39-6dd6-4003-91ca-b5ebdbe740f5,há mais de 1 ano,8bf1a468-48ff-4f4d-95ee-b17b7a3a5592,,5b422372-faf7-40d3-a325-7833942ba092,,active,,,http://landpage-h.cgu.gov.br/dadosabertos/inde...,http://dados.ufrn.br/dataset/8bf1a468-48ff-4f4...,"{'title': 'Docentes', 'key': 'docentes', 'text...",{'title': ['Este conjunto de dados satisfaz a ...,1
2,23/Março/2021,19/Setembro/2019,CSV,Other (Open),há mais de 1 ano,,CSV,9a42ee8c-a59f-414e-af74-1d4e1a8316ef,há 1 mês,1a21faeb-23a8-49a9-94c8-5abafa4825b5,,812382fe-b005-42a4-ad0b-12627866eebf,,active,,,http://landpage-h.cgu.gov.br/dadosabertos/inde...,http://dados.ufrn.br/dataset/1a21faeb-23a8-49a...,"{'title': 'Orientações de Docentes', 'key': 'o...",{'title': ['Este conjunto de dados satisfaz a ...,
3,23/Março/2021,19/Setembro/2019,CSV,Other (Open),há mais de 1 ano,True,CSV,152b07dc-5b97-40dc-90a6-ab9de667f8e1,há 1 mês,1a21faeb-23a8-49a9-94c8-5abafa4825b5,,812382fe-b005-42a4-ad0b-12627866eebf,,active,,,http://landpage-h.cgu.gov.br/dadosabertos/inde...,http://dados.ufrn.br/dataset/1a21faeb-23a8-49a...,"{'title': 'Orientações de Docentes', 'key': 'o...",{'title': ['Este conjunto de dados satisfaz a ...,1
4,23/Março/2021,15/Janeiro/2018,CSV,Other (Open),há mais de 3 anos,True,CSV,fb0213f6-d4ae-41b1-8264-efbff24b3051,há 1 mês,1a21faeb-23a8-49a9-94c8-5abafa4825b5,,812382fe-b005-42a4-ad0b-12627866eebf,,active,,,http://landpage-h.cgu.gov.br/dadosabertos/inde...,http://dados.ufrn.br/dataset/1a21faeb-23a8-49a...,"{'title': 'Orientações de Docentes', 'key': 'o...",{'title': ['Este conjunto de dados satisfaz a ...,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,17/Março/2021,9/Março/2020,CSV,Creative Commons Attribution,há mais de 1 ano,,CSV,49bbd54d-9f0a-439a-b492-59390f90b2d2,há 1 mês,2e8d3459-1a3a-4e9a-8b7c-332d9e8fe00e,,0e6cb1c1-a666-4ca0-bff3-8ab9749459ec,,active,,,http://landpage-h.cgu.gov.br/dadosabertos/inde...,http://dados.ufrn.br/dataset/2e8d3459-1a3a-4e9...,"{'title': 'Indicadores de Pesquisa', 'key': 'i...",{'title': ['Este conjunto de dados satisfaz a ...,7
115,17/Março/2021,9/Março/2020,CSV,Creative Commons Attribution,há mais de 1 ano,True,CSV,7e4da4ea-8f31-492e-9bed-6f53e72bff17,há 1 mês,2e8d3459-1a3a-4e9a-8b7c-332d9e8fe00e,,0e6cb1c1-a666-4ca0-bff3-8ab9749459ec,,active,,,http://landpage-h.cgu.gov.br/dadosabertos/inde...,http://dados.ufrn.br/dataset/2e8d3459-1a3a-4e9...,"{'title': 'Indicadores de Pesquisa', 'key': 'i...",{'title': ['Este conjunto de dados satisfaz a ...,8
116,16/Abril/2021,9/Março/2020,CSV,Creative Commons Attribution,há mais de 1 ano,,CSV,c063d5ec-e8d8-47fc-99bd-d3effe6aec32,8 dias atrás,2e8d3459-1a3a-4e9a-8b7c-332d9e8fe00e,,fd98e6c0-3b30-49a8-b7ca-cdec375d55e3,,active,,,http://landpage-h.cgu.gov.br/dadosabertos/inde...,http://dados.ufrn.br/dataset/2e8d3459-1a3a-4e9...,"{'title': 'Indicadores de Pesquisa', 'key': 'i...",{'title': ['Este conjunto de dados satisfaz a ...,9
117,16/Abril/2021,9/Março/2020,CSV,Creative Commons Attribution,há mais de 1 ano,True,CSV,daf57850-420f-41dc-933b-b4fa4bbb5a23,8 dias atrás,2e8d3459-1a3a-4e9a-8b7c-332d9e8fe00e,,fd98e6c0-3b30-49a8-b7ca-cdec375d55e3,,active,,,http://landpage-h.cgu.gov.br/dadosabertos/inde...,http://dados.ufrn.br/dataset/2e8d3459-1a3a-4e9...,"{'title': 'Indicadores de Pesquisa', 'key': 'i...",{'title': ['Este conjunto de dados satisfaz a ...,10


In [77]:

# Inspect the raw list of collected resource records.
resources

[]

In [143]:
# Group the direct links ('url_out') of the collected resources by dataset key.
arcevo_dict = {}
for resource in resources:
    key = resource['dataset']['key']
    arcevo_dict.setdefault(key, []).append(resource['url_out'])


In [148]:

def dict_key_str(key):
    """Return the source text of an `arcevo_dict` lookup for `key`."""
    return "arcevo_dict['%s']" % key


# Width of the longest lookup text, used to align the generated assignments.
# `default=0` keeps this working when no resource was collected.
n = max((len(dict_key_str(key)) for key in arcevo_dict.keys()), default=0)



http://dados.ufrn.br/dataset/8bf1a468-48ff-4f4d-95ee-b17b7a3a5592/resource/ff0a457e-76fa-4aca-ad99-48aebd7db070/download/docentes.csv
http://dados.ufrn.br/dataset/8bf1a468-48ff-4f4d-95ee-b17b7a3a5592/resource/83988d39-6dd6-4003-91ca-b5ebdbe740f5/download/docentesdicionario.pdf

http://dados.ufrn.br/dataset/1a21faeb-23a8-49a9-94c8-5abafa4825b5/resource/9a42ee8c-a59f-414e-af74-1d4e1a8316ef/download/orientacoes2019.csv
http://dados.ufrn.br/dataset/1a21faeb-23a8-49a9-94c8-5abafa4825b5/resource/152b07dc-5b97-40dc-90a6-ab9de667f8e1/download/orientacoes2018.csv
http://dados.ufrn.br/dataset/1a21faeb-23a8-49a9-94c8-5abafa4825b5/resource/fb0213f6-d4ae-41b1-8264-efbff24b3051/download/orientacoes2017.csv
http://dados.ufrn.br/dataset/1a21faeb-23a8-49a9-94c8-5abafa4825b5/resource/3fa37647-3b6e-4ad1-abf8-b66ef589dae3/download/orientacoes2016.csv
http://dados.ufrn.br/dataset/1a21faeb-23a8-49a9-94c8-5abafa4825b5/resource/0391561f-dc9e-422c-8d04-f4933fcb4bb3/download/orientacoes2015.csv
http://dados.ufr

In [None]:

# NOTE(review): `left` is not used by any code visible in this notebook — confirm
# whether it can be dropped.
left = list(('{: <%is} = ' % n).format( "arcevo_dict[%s]" % key ) for key in arcevo_dict.keys())

# Build one aligned "arcevo_dict['<key>'] = ['<link>', ...]" line per dataset key,
# keys in alphabetical order.
arcevo_str = ''

for key in sorted(arcevo_dict):
    # Resources without a direct link are stored as None; skip them so the
    # extension sort below does not fail.
    links = [link for link in arcevo_dict[key] if link]
    # Order links by file extension, descending.
    ordered_links = sorted(links, key=lambda e: os.path.splitext(e)[1], reverse=True)

    entry = ('{: <' + str(n) + 's} = [' ).format(dict_key_str(key))
    entry += ''.join("'{link}', ".format(link=link) for link in ordered_links)
    entry += "]\n"
    arcevo_str += entry

In [150]:
# Print the generated assignments as plain text.
print(arcevo_str)

arcevo_dict['avaliacao_de_docencia']                            = ['http://dados.ufrn.br/dataset/d5723d75-7e6e-4264-82aa-b96909b69f63/resource/781c88a6-80c9-4591-b626-3015501e33a5/download/dicionario-de-dados---avaliacoes-de-docentes.pdf', 'http://dados.ufrn.br/dataset/d5723d75-7e6e-4264-82aa-b96909b69f63/resource/7accd1d2-2793-460e-b98d-87a0679b9155/download/avaliacaodocencia.csv', ]
arcevo_dict['avaliacoes_de_desempenho_de_docentes']             = ['http://dados.ufrn.br/dataset/e3536933-52ec-4ab2-aae6-1dad48f31d2c/resource/28f2570b-170b-43af-a43e-8f0c56b1872c/download/dicionario---avaliacoes-de-desempenho-de-docentes.pdf', 'http://dados.ufrn.br/dataset/e3536933-52ec-4ab2-aae6-1dad48f31d2c/resource/21808e6c-4a01-409d-b97c-404c2dd5a24b/download/avaliacao-desempenho-docente.csv', ]
arcevo_dict['docentes']                                         = ['http://dados.ufrn.br/dataset/8bf1a468-48ff-4f4d-95ee-b17b7a3a5592/resource/83988d39-6dd6-4003-91ca-b5ebdbe740f5/download/docentesdicionario.