
Commit

Fixes #2055 - Busca Textual (#2179)
edwardoliveira committed Dec 20, 2018
1 parent 0c464b6 commit 90dec5f
Showing 20 changed files with 2,797 additions and 55 deletions.
5 changes: 3 additions & 2 deletions Dockerfile
@@ -1,8 +1,9 @@
FROM alpine:3.8

ENV BUILD_PACKAGES postgresql-dev graphviz-dev graphviz build-base git pkgconfig \
python3-dev libxml2-dev jpeg-dev libressl-dev libffi-dev libxslt-dev \
nodejs npm py3-lxml py3-magic postgresql-client poppler-utils antiword vim openssh-client
python3-dev libxml2-dev jpeg-dev libressl-dev libffi-dev libxslt-dev \
nodejs npm py3-lxml py3-magic postgresql-client poppler-utils antiword \
curl jq openssh-client vim openssh-client

RUN apk update --update-cache && apk upgrade

72 changes: 37 additions & 35 deletions sapl/base/search_indexes.py
@@ -1,13 +1,12 @@
import os.path
import re
import string
import textract
import logging

from django.db.models import F, Q, Value
from django.db.models.fields import TextField
from django.db.models.functions import Concat
from django.template import loader
from haystack import connections
from haystack.constants import Indexable
from haystack.fields import CharField
from haystack.indexes import SearchIndex
@@ -24,6 +23,7 @@

class TextExtractField(CharField):

backend = None
logger = logging.getLogger(__name__)

def __init__(self, **kwargs):
@@ -34,24 +34,20 @@ def __init__(self, **kwargs):
self.model_attr = (self.model_attr, )

def solr_extraction(self, arquivo):
extracted_data = self._get_backend(None).extract_file_contents(
arquivo)['contents']
# Remove as tags xml
self.logger.debug("Removendo as tags xml.")
extracted_data = re.sub('<[^>]*>', '', extracted_data)
# Remove tags \t e \n
self.logger.debug("Removendo as \t e \n.")
extracted_data = extracted_data.replace(
'\n', ' ').replace('\t', ' ')
# Remove sinais de pontuação
self.logger.debug("Removendo sinais de pontuação.")
extracted_data = re.sub('[' + string.punctuation + ']',
' ', extracted_data)
# Remove espaços múltiplos
self.logger.debug("Removendo espaços múltiplos.")
extracted_data = " ".join(extracted_data.split())

return extracted_data
if not self.backend:
self.backend = connections['default'].get_backend()
try:
with open(arquivo.path, 'rb') as f:
content = self.backend.extract_file_contents(f)
if not content or not content['contents']:
return ''
data = content['contents']
except Exception as e:
print('erro processando arquivo: %s' % arquivo.path)
self.logger.error(arquivo.path)
self.logger.error('erro processando arquivo: %s' % arquivo.path)
data = ''
return data

def whoosh_extraction(self, arquivo):

@@ -66,11 +62,11 @@ def whoosh_extraction(self, arquivo):
language='pt-br').decode('utf-8').replace('\n', ' ').replace(
'\t', ' ')

def print_error(self, arquivo):
self.logger.error("Erro inesperado processando arquivo: {}".format(arquivo.path))
msg = 'Erro inesperado processando arquivo: %s' % (
arquivo.path)
print(msg)
def print_error(self, arquivo, error):
msg = 'Erro inesperado processando arquivo %s erro: %s' % (
arquivo.path, error)
print(msg)
self.logger.error(msg)

def file_extractor(self, arquivo):
if not os.path.exists(arquivo.path) or \
@@ -81,23 +77,23 @@ def file_extractor(self, arquivo):
if SOLR_URL:
try:
return self.solr_extraction(arquivo)
except Exception as e:
self.logger.error("Erro no arquivo {}. ".format(arquivo.path) + str(e))
self.print_error(arquivo)
except Exception as err:
print(str(err))
self.print_error(arquivo, err)

# Em ambiente de DEV utiliza-se o Whoosh
# Como ele não possui extração, faz-se uso do textract
else:
try:
self.logger.debug("Tentando whoosh_extraction no arquivo {}".format(arquivo.path))
return self.whoosh_extraction(arquivo)
except ExtensionNotSupported as e:
self.logger.error("Erro no arquivo {}".format(arquivo.path) + str(e))
print(str(e))
except Exception as e2:
self.logger.error(str(e))
print(str(e2))
self.print_error(arquivo)
except ExtensionNotSupported as err:
print(str(err))
self.logger.error(str(err))
except Exception as err:
print(str(err))
self.print_error(arquivo, str(err))
return ''

def ta_extractor(self, value):
@@ -133,7 +129,9 @@ def extract_data(self, obj):
value = getattr(obj, attr)
if not value:
continue
data += getattr(self, func)(value)
data += getattr(self, func)(value) + ' '

data = data.replace('\n', ' ')

return data

@@ -159,6 +157,10 @@ class DocumentoAcessorioIndex(SearchIndex, Indexable):
)
)

def __init__(self, **kwargs):
super().__init__(**kwargs)
self.text.search_index = self

def get_model(self):
return self.model

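In short, the reworked TextExtractField now caches the Haystack backend and streams the file to Solr for extraction, falling back to an empty string on any error. A minimal sketch of that path, assuming a 'default' Haystack connection configured for Solr (extract_text is a hypothetical helper, not code from this commit):

from haystack import connections

def extract_text(path):
    # Lazily fetch the Solr backend, as TextExtractField.solr_extraction now does.
    backend = connections['default'].get_backend()
    with open(path, 'rb') as f:
        # Delegates extraction to Solr's ExtractingRequestHandler (Tika);
        # the backend returns a dict with 'contents' and 'metadata', or None on failure.
        result = backend.extract_file_contents(f)
    return result['contents'] if result and result['contents'] else ''

The new __init__ on DocumentoAcessorioIndex ties the text field back to its index, and extract_data now joins each extracted attribute with a space and strips newlines before indexing.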
2 changes: 2 additions & 0 deletions sapl/materia/views.py
@@ -1810,6 +1810,8 @@ def get_context_data(self, **kwargs):

context['show_results'] = show_results_filter_set(qr)

context['USE_SOLR'] = settings.USE_SOLR if hasattr(settings, 'USE_SOLR') else False

return context


2 changes: 2 additions & 0 deletions sapl/norma/views.py
@@ -15,6 +15,7 @@
from django.views.generic.base import RedirectView
from django.views.generic.edit import FormView
from django_filters.views import FilterView
from sapl import settings
from sapl.base.models import AppConfig
from sapl.compilacao.views import IntegracaoTaView
from sapl.crud.base import (RP_DETAIL, RP_LIST, Crud, CrudAux,
@@ -107,6 +108,7 @@ def get_context_data(self, **kwargs):
context['filter_url'] = ('&' + qr.urlencode()) if len(qr) > 0 else ''

context['show_results'] = show_results_filter_set(qr)
context['USE_SOLR'] = settings.USE_SOLR if hasattr(settings, 'USE_SOLR') else False

return context

21 changes: 13 additions & 8 deletions sapl/settings.py
@@ -100,23 +100,28 @@
# FTS = Full Text Search
# Desabilita a indexação textual até encontramos uma solução para a issue
# https://github.com/interlegis/sapl/issues/2055
#HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor'
HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.BaseSignalProcessor'
#HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.BaseSignalProcessor' # Disable auto index
HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor'
SEARCH_BACKEND = 'haystack.backends.whoosh_backend.WhooshEngine'
SEARCH_URL = ('PATH', PROJECT_DIR.child('whoosh'))

SOLR_URL = config('SOLR_URL', cast=str, default='')
if SOLR_URL:
# SOLR
USE_SOLR = config('USE_SOLR', cast=bool, default=False)
SOLR_URL = config('SOLR_URL', cast=str, default='http://localhost:8983')
SOLR_COLLECTION = config('SOLR_COLLECTION', cast=str, default='sapl')

if USE_SOLR:
SEARCH_BACKEND = 'haystack.backends.solr_backend.SolrEngine'
SEARCH_URL = ('URL', config('SOLR_URL', cast=str))
# ...or for multicore...
# 'URL': 'http://127.0.0.1:8983/solr/mysite',
SEARCH_URL = ('URL', '{}/solr/{}'.format(SOLR_URL, SOLR_COLLECTION))


# BATCH_SIZE: default is 1000 if omitted, avoid Too Large Entity Body errors
HAYSTACK_CONNECTIONS = {
'default': {
'ENGINE': SEARCH_BACKEND,
SEARCH_URL[0]: SEARCH_URL[1]
SEARCH_URL[0]: SEARCH_URL[1],
'BATCH_SIZE': 500,
'TIMEOUT': 60,
},
}

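With USE_SOLR enabled the backend switches from Whoosh to Solr and the connection URL is assembled from SOLR_URL and SOLR_COLLECTION. Roughly, assuming USE_SOLR=True, SOLR_URL=http://saplsolr:8983 and the default collection 'sapl' (illustrative values, not literal code from the commit), the resulting Haystack connection is:

# Effective Haystack connection under the assumed values above.
HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': 'haystack.backends.solr_backend.SolrEngine',
        'URL': 'http://saplsolr:8983/solr/sapl',
        'BATCH_SIZE': 500,  # smaller batches avoid "Too Large Entity Body" errors on update_index
        'TIMEOUT': 60,
    },
}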
10 changes: 6 additions & 4 deletions sapl/templates/materia/materialegislativa_filter.html
@@ -3,11 +3,13 @@
{% load crispy_forms_tags %}

{% block actions %}

<div class="actions btn-group pull-right" role="group">
<!--
<a href="{% url 'sapl.base:haystack_search' %}" class="btn btn-default">
Pesquisa Textual
</a> -->
{% if USE_SOLR %}
<a href="{% url 'sapl.base:haystack_search' %}" class="btn btn-default">
Pesquisa Textual
</a>
{% endif %}

{% if perms.materia.add_materialegislativa %}
<a href="{% url 'sapl.materia:materialegislativa_create' %}" class="btn btn-default">
10 changes: 5 additions & 5 deletions sapl/templates/norma/normajuridica_filter.html
@@ -4,11 +4,11 @@

{% block actions %}
<div class="actions btn-group pull-right" role="group">
<!--
<a href="{% url 'sapl.base:haystack_search' %}" class="btn btn-default">
Pesquisa Textual
</a>
-->
{% if USE_SOLR %}
<a href="{% url 'sapl.base:haystack_search' %}" class="btn btn-default">
Pesquisa Textual
</a>
{% endif %}


{% if perms.norma.add_normajuridica %}
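Both filter views inject the flag into their template context, and the templates show the "Pesquisa Textual" button only when USE_SOLR is on. A context processor would be an alternative way to expose the flag to every template without touching each get_context_data; a hypothetical sketch, not part of this commit:

# Hypothetical context processor (assumption): expose USE_SOLR to all templates.
from django.conf import settings

def use_solr(request):
    return {'USE_SOLR': getattr(settings, 'USE_SOLR', False)}

# It would then be listed under TEMPLATES[0]['OPTIONS']['context_processors'].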
61 changes: 61 additions & 0 deletions solr/docker-compose.yml
@@ -0,0 +1,61 @@
version: '2'
services:
sapldb:
image: postgres:10.5-alpine
restart: always
environment:
POSTGRES_PASSWORD: sapl
POSTGRES_USER: sapl
POSTGRES_DB: sapl
PGDATA : /var/lib/postgresql/data/
volumes:
- sapldb_data:/var/lib/postgresql/data/
ports:
- "5432:5432"

saplsolr:
image: solr:7.4-alpine
restart: always
command: bin/solr start -c -f
volumes:
- solr_data:/opt/solr/server/solr
- solr_configsets:/opt/solr/server/solr/configsets
ports:
- "8983:8983"

sapl:
image: interlegis/sapl:3.1.138
# build: .
restart: always
environment:
ADMIN_PASSWORD: interlegis
ADMIN_EMAIL: email@dominio.net
DEBUG: 'False'
USE_TLS: 'False'
EMAIL_PORT: 587
EMAIL_HOST: smtp.dominio.net
EMAIL_HOST_USER: usuariosmtp
EMAIL_HOST_PASSWORD: senhasmtp
USE_SOLR: 'True'
#SOLR_COLLECTION: sapl
#SOLR_HOST: saplsolr
SOLR_URL: http://saplsolr:8983/solr/sapl
TZ: America/Sao_Paulo
volumes:
- sapl_data:/var/interlegis/sapl/data
- sapl_media:/var/interlegis/sapl/media
- sapl_root:/var/interlegis/sapl
volumes_from:
- saplsolr
depends_on:
- sapldb
- saplsolr
ports:
- "80:80"
volumes:
sapldb_data:
sapl_data:
sapl_media:
sapl_root:
solr_data:
solr_configsets:
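The compose file starts Solr in cloud mode (bin/solr start -c -f), so the 'sapl' collection still has to exist before indexing; the curl and jq packages added to the Dockerfile suggest this is scripted against Solr's HTTP API. A rough Python equivalent (assumed names and URLs, not code from the commit):

import json
from urllib.request import urlopen

SOLR = 'http://localhost:8983'  # inside the compose network this would be http://saplsolr:8983

def ensure_collection(name='sapl', configset='sapl_configset'):
    # Solr Collections API: list existing collections, create ours if missing.
    existing = json.load(urlopen(SOLR + '/solr/admin/collections?action=LIST&wt=json'))
    if name not in existing.get('collections', []):
        urlopen(SOLR + '/solr/admin/collections?action=CREATE&name={}&numShards=1'
                '&collection.configName={}'.format(name, configset))
    # Assumes the 'sapl_configset' was previously uploaded to ZooKeeper
    # (e.g. with bin/solr zk upconfig).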
54 changes: 54 additions & 0 deletions solr/sapl_configset/conf/lang/stopwords_en.txt
@@ -0,0 +1,54 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# a couple of test stopwords to test that the words are really being
# configured from this file:
stopworda
stopwordb

# Standard english stop words taken from Lucene's StopAnalyzer
a
an
and
are
as
at
be
but
by
for
if
in
into
is
it
no
not
of
on
or
such
that
the
their
then
there
these
they
this
to
was
will
with
