<a href="https://colab.research.google.com/github/josegoisgit/dadosgov/blob/main/query_dadosgov.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Módulos e pacotes

## Instalação

In [50]:
!pip install unidecode



## Importação

In [51]:
import re
import requests
import pandas

import unidecode


from itertools import chain
from lxml.etree import HTML

import IPython
import ipywidgets
import google.colab

# Funções

## auxiliares

In [52]:

strip = lambda X: ('\n'.join(set(x.strip() for x in X if x.strip()))).strip()
plain = lambda i: list(chain.from_iterable(i))


## get_url_html

In [53]:
def get_url_html(url):
    requested         = requests.request('GET', url)
    html              = HTML(requested.text)    
    html.nsmap['url'] = url

    return html


In [54]:
#Exemplo:
html = get_url_html('https://docs.python.org')
html.xpath('.//a/@href')

['genindex.html',
 'py-modindex.html',
 'https://www.python.org/',
 '#',
 'whatsnew/3.9.html',
 'whatsnew/index.html',
 'tutorial/index.html',
 'library/index.html',
 'reference/index.html',
 'using/index.html',
 'howto/index.html',
 'installing/index.html',
 'distributing/index.html',
 'extending/index.html',
 'c-api/index.html',
 'faq/index.html',
 'py-modindex.html',
 'genindex.html',
 'glossary.html',
 'search.html',
 'contents.html',
 'bugs.html',
 'https://devguide.python.org/docquality/#helping-with-documentation',
 'about.html',
 'license.html',
 'copyright.html',
 'download.html',
 'https://docs.python.org/3.10/',
 'https://docs.python.org/3.9/',
 'https://docs.python.org/3.8/',
 'https://docs.python.org/3.7/',
 'https://docs.python.org/3.6/',
 'https://docs.python.org/3.5/',
 'https://docs.python.org/2.7/',
 'https://www.python.org/doc/versions/',
 'https://www.python.org/dev/peps/',
 'https://wiki.python.org/moin/BeginnersGuide',
 'https://wiki.python.org/moin/PythonBooks',


## xpath

### contains_clause

In [55]:
def contains_clause(dic, operator='OR'):
    predicados = ["contains(@{k},'{v}')".format(k=k,v=v) for k,v in dic.items()]

    return operator.join(predicados)


### compose_xpath

In [56]:

def compose_xpath(element,attribute,content):
    dictionary = contains_clause({attribute:content})

    str_dict = {
        'element' : element,
        'dictionary' : dictionary 
    }

    return "{element}[{dictionary}]".format(**str_dict)


### get_xnodes

In [57]:

def get_xnodes(html, element, attribute='class', content='', complement=''):
    if complement:
        xnode = html.xpath(compose_xpath(element,attribute,content) + complement)
    else:
        xnode = html.xpath(compose_xpath(element,attribute,content) )
    return xnode if xnode else None


### get_xnode

In [58]:

def get_xnode(html, element, attribute='class', content='', complement=''):
    xnodes = get_xnodes(html, element, attribute, content, complement)
    return xnodes[0] if xnodes else None


## datasets

### get_html_page_count

In [59]:

def get_html_page_count(html):
    xnode = get_xnode( html, '//div', 'class', 'pagination' ) 
    
    if len(xnode):
        return 1
    
    pdiv = xnode.xpath( '//li/a/text()' )
    
    def pmax(pdiv):
        V = [1]
        for p in pdiv:
            try:
                v = int(p)
                V.append(v)       
            except:
                continue
        return max(V)    
    return pmax(pdiv)


### get_query_page_count

In [60]:

def get_query_page_count(hyperlink, query):
    html = get_url_html( hyperlink + '/dataset?q=%s' % query)
    return get_html_page_count( html )


### get_html_items_href






In [61]:

def get_html_items_href(html):
    xnodes = get_xnodes( html, './/h3', 'class', 'dataset-heading' , '//a/@href' ) 
    return xnodes


### get_html_dataset

In [62]:

def get_html_dataset(html):    
    article_html = get_xnode(html , '//article')
    
    dataset = dict()
    
    dataset['organization'] = ' '.join([x.strip() for x in get_xnodes(html, './/section','class','module-content','//h1//text()')])
    dataset['title']        = strip( get_xnodes( article_html , 'div/h1/text()') ) 
    dataset['key']          = re.sub('[^\w]','_',unidecode.unidecode(dataset['title']).lower())
    z = get_xnodes(article_html,'//div','class','notes','//p/text()')
    dataset['text'] = strip(z) if z else ''
    dataset['url']          = get_xnodes( article_html, '//ol', 'class', 'breadcrumb', '//li/a/@href')

    return dataset


### get_html_dataset_license

In [63]:

def get_html_dataset_license(html,hyperlink='https://dados.gov.br'):
    xnode = get_xnode( html, '//section','class','license') 

    if not xnode: return dict(title='NA',href='',text='',logo='')

    license_title = xnode.xpath( './/@title' )
    license_href  = xnode.xpath( './/@href')
    license_text  = [ text.strip() for text in xnode.xpath( './/text()' ) if text.strip() ]
    license_logo  = [ hyperlink + p for p in xnode.xpath( './/img//@src' ) ]
    
    license_dict          = dict()
    license_dict['title'] = license_title
    license_dict['href']  = license_href
    license_dict['text']  = license_text
    license_dict['logo']  = license_logo

    return license_dict


### get_html_dataset_resources_href

In [64]:

def get_html_dataset_resources_href(html):
    return get_xnodes(html,'//a','class','heading','/@href')


### get_html_resource_info





In [65]:

def get_html_resource_info(html):
    
    ths     = get_xnodes( html, '//table', 'class', 'table-condensed','/tbody//th/text()')
    headers = [re.sub('[^\w]','_',unidecode.unidecode(th.lower())) for th in ths]
    tds     = get_xnodes( html, '//table', 'class', 'table-condensed','/tbody//td')
    data    = [strip(get_xnodes(td, './/text()')).strip() for td in tds]

    info_dict        = {h:d for h,d in zip(headers,data)}
    info_dict['url'] = get_xnode( html, '//a','class','resource-url-analytic','/@href')    

    return info_dict

### auxiliares

In [66]:

def __dir_repr__(variable=dict,pattern=''):
    dir_type_dict = dict()
    
    for attribute in dir(variable):
        if not re.findall(pattern, attribute): continue
        value = getattr(variable,attribute)
        typename = type(value).__name__
        
        if typename not in dir_type_dict:
            dir_type_dict[typename] = [(attribute,value)]
        else:
            dir_type_dict[typename].append((attribute,value))
        
    for key, items in dir_type_dict.items():
        print(key)
        print()
        for attribute,value in items:
            print('\t',attribute)
        print()
        #print('{: <30s}{:<20s}'.format(typename, attribute))


In [67]:

def show_url( url='https://dados.gov.br'):
    dadosgov_frame = IPython.display.IFrame(src=url,width='100%',height='500px')
    display(dadosgov_frame)
    return


In [68]:

def query_site( query=''):
    query = query.lower()
    query = unidecode.unidecode(query)
    query = re.sub('[^a-z\s\'\"]','',query)
    query = re.sub('\s+','+',query)
    
    url = 'https://dados.gov.br/dataset?q=%s' % query
    dadosgov_frame = IPython.display.IFrame(src=url,width='100%',height='500px')
    display(dadosgov_frame)
    return dadosgov_frame



In [69]:

get_query_page_url   = lambda h, q, p : h + '/dataset' + '?q=' + q + '&page=' + str(p)
get_query_page_html  = lambda h, q, p : get_url_html( get_query_page_url( h, q, p ) )
get_query_page_items = lambda h, q, p : get_html_items_href( get_query_page_html( h, q, p ) )
get_query_items_href = lambda h, q    : plain([ get_query_page_items( h, q, p+1 ) for p in range( get_query_page_count( h, q ) ) ])


# Leitura

## argumentos

In [70]:
query     = ''
hyperlink = 'https://dados.gov.br'

In [71]:
qri_label = ipywidgets.widgets.Label('Query')
qri_text  = ipywidgets.widgets.Text(query)

hyperlink_text  = ipywidgets.widgets.Text(hyperlink)
hyperlink_label = ipywidgets.widgets.Label('Hyperlink')

In [72]:
qri_box = ipywidgets.widgets.HBox([qri_label, qri_text])
hyperlink_box = ipywidgets.widgets.HBox([hyperlink_label, hyperlink_text])

In [73]:
display(ipywidgets.widgets.VBox([qri_box,hyperlink_box]))

VBox(children=(HBox(children=(Label(value='Query'), Text(value='UFRN relação de docentes'))), HBox(children=(L…

In [74]:
query     = qri_text.value
hyperlink = hyperlink_text.value

In [75]:
query_url = get_query_page_url(hyperlink,query,1)

In [76]:
show_url(query_url)

## visualização

In [77]:
page_count = get_query_page_count( hyperlink, query )
page_count

1

In [78]:
resources = []
items_href = get_query_items_href(hyperlink,query)

In [79]:
ih = 0

print(hyperlink + items_href[ih])
show_url(hyperlink + items_href[ih])

https://dados.gov.br/dataset/docentes


In [117]:
import pandas

for item_href in items_href:
    item_url = hyperlink + item_href

    print('\n\n\t', item_url, end='\n\n')
    dataset_html   = get_url_html( item_url )

    dataset        = get_html_dataset( dataset_html )
    license        = get_html_dataset_license ( dataset_html )
    resources_href = get_html_dataset_resources_href( dataset_html )

    for resource_href in resources_href:
        resource_url  = hyperlink + resource_href 

        resource_html = get_url_html( resource_url )  
        resource_dict = get_html_resource_info(resource_html)

        a = 'http://landpage-h.cgu.gov.br/dadosabertos/index.php?url='

        if resource_dict['url']:
            resource_dict['url_out'] = resource_dict['url'].replace(a,'')
        else:
            resource_dict['url_out'] = None
        resource_dict['dataset'] = dataset.copy()
        resource_dict['license'] = license

        resource_dict['havested'] = time.ctime()
        
        print('\t\t',resource_dict['url_out'], end='\n')

        resources.append( resource_dict )
    




	 https://dados.gov.br/dataset/docentes



  """


		 http://dados.ufrn.br/dataset/8bf1a468-48ff-4f4d-95ee-b17b7a3a5592/resource/ff0a457e-76fa-4aca-ad99-48aebd7db070/download/docentes.csv
		 http://dados.ufrn.br/dataset/8bf1a468-48ff-4f4d-95ee-b17b7a3a5592/resource/83988d39-6dd6-4003-91ca-b5ebdbe740f5/download/docentesdicionario.pdf


	 https://dados.gov.br/dataset/relacao-de-servidores-docentes

		 https://dados.ufrr.br/dataset/52722fb1-ea4b-4c07-978f-4ecba4c32b51/resource/776ab277-906f-472e-a325-6e43137f8feb/download/relacao-de-docentes.csv
		 https://dados.ufrr.br/dataset/52722fb1-ea4b-4c07-978f-4ecba4c32b51/resource/c2e9f6fb-09dd-432b-ae78-38548772218a/download/dicionario-de-dados_relacao-de-docentes-da-ufrr.pdf


	 https://dados.gov.br/dataset/relacao-docentes-ativos

		 https://dados.ufrr.br/dataset/80850532-2121-4191-86e6-79f0a57cb30b/resource/19973135-258f-4263-9fc0-c942c19fd5b6/download/dicionario-de-dados_relacao-de-docentes-ativos-na-ufrr.pdf
		 https://dados.ufrr.br/dataset/80850532-2121-4191-86e6-79f0a57cb30b/resource/d0f0

In [118]:
for r,resource in enumerate(resources):
    d2 = resources[r].copy()
    for field,value in sorted(resource.items(),key=lambda e: e[0]):
        value = d2[field]
        if isinstance(value, dict):
            for key, door in value.items():
                d2[field+'_'+key] = door
            d2.pop(field)
    resources[r] = d2

# Dados

In [119]:
df_datagov = pandas.DataFrame(resources)

In [120]:
df_datagov = df_datagov[sorted(df_datagov.keys())]

In [121]:
df_datagov.iloc[5]

created                                                  há mais de 2 anos
criado                                                         3/Maio/2018
dataset_key                                     relacao_de_docentes_ativos
dataset_organization                Universidade Federal de Roraima - UFRR
dataset_text                                                              
dataset_title                                   Relação de Docentes Ativos
dataset_url              [/, /organization, /organization/universidade-...
datastore_active                                                       NaN
format                                                                 CSV
formato                                                           text/csv
havested                                                               NaN
id                                    d0f090bd-600b-4ecf-af4e-077c3ee0f5e9
last_modified                                            há mais de 2 anos
licenca                  

In [122]:
df_datagov.to_csv('./resultado.csv')
google.colab.files.download('./resultado.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## texto

In [123]:
for a,b in df_datagov.groupby('package_id'):
    print(b.iloc[0].dataset_key,a,end='\n\n')

    for url in b.url_out.values:
        print('\t',url)
    print()
    

concursos_publicos_docentes 03ef14c1-5b4d-41b3-8208-c0d6f6032d1c

	 http://dadosabertos.ufma.br/dataset/03ef14c1-5b4d-41b3-8208-c0d6f6032d1c/resource/8ba54959-895a-4840-88aa-74eed5d3f9fd/download/rh_concursos_docentes.csv
	 http://dadosabertos.ufma.br/dataset/03ef14c1-5b4d-41b3-8208-c0d6f6032d1c/resource/6d543811-d140-4218-8a7e-e7f38212bed6/download/metadados_rh_concursos_docentes.pdf
	 http://dadosabertos.ufma.br/dataset/03ef14c1-5b4d-41b3-8208-c0d6f6032d1c/resource/bb3fba58-5711-4e73-a412-3c41577dcdbb/download/2018-2019_concursos_docentes.csv
	 http://dadosabertos.ufma.br/dataset/03ef14c1-5b4d-41b3-8208-c0d6f6032d1c/resource/8ba54959-895a-4840-88aa-74eed5d3f9fd/download/rh_concursos_docentes.csv
	 http://dadosabertos.ufma.br/dataset/03ef14c1-5b4d-41b3-8208-c0d6f6032d1c/resource/6d543811-d140-4218-8a7e-e7f38212bed6/download/metadados_rh_concursos_docentes.pdf
	 http://dadosabertos.ufma.br/dataset/03ef14c1-5b4d-41b3-8208-c0d6f6032d1c/resource/bb3fba58-5711-4e73-a412-3c41577dcdbb/downlo