Skip to content

Commit

Permalink
Merge branch develop to master
Browse files Browse the repository at this point in the history
  • Loading branch information
fatchanghao committed Aug 13, 2019
2 parents 7729ade + ae8c8ee commit 4cebdaa
Show file tree
Hide file tree
Showing 63 changed files with 1,534 additions and 245 deletions.
15 changes: 15 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,21 @@

This is a list of notable changes to Hyperscan, in reverse chronological order.

## [5.2.0] 2019-07-12
- Literal API: add new APIs `hs_compile_lit()` and `hs_compile_lit_multi()` to
process pure literal rule sets. The 2 literal APIs treat each expression text
in a literal sense without recognizing any regular grammar.
- Logical combination: add support for purely negative combinations, which
report match at EOD in case of no sub-expressions matched.
- Windows porting: support shared library (DLL) on Windows with available tools
hscheck, hsbench and hsdump.
- Bugfix for issue #148: fix uninitialized use of `scatter_unit_uX` due to
padding.
- Bugfix for issue #155: fix numerical result out of range error.
- Bugfix for issue #165: avoid corruption of pending combination report in
streaming mode.
- Bugfix for issue #174: fix scratch free issue when memory allocation fails.

## [5.1.1] 2019-04-03
- Add extra detection and handling when invalid rose programs are triggered.
- Bugfix for issue #136: fix CMake parsing of CPU architecture for GCC-9.
Expand Down
22 changes: 10 additions & 12 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ cmake_minimum_required (VERSION 2.8.11)
project (hyperscan C CXX)

set (HS_MAJOR_VERSION 5)
set (HS_MINOR_VERSION 1)
set (HS_PATCH_VERSION 1)
set (HS_MINOR_VERSION 2)
set (HS_PATCH_VERSION 0)
set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})

set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
Expand Down Expand Up @@ -31,6 +31,7 @@ else()
endif()

if(CMAKE_BUILD_TYPE MATCHES RELEASE|RELWITHDEBINFO|MINSIZEREL)
message(STATUS "using release build")
set(RELEASE_BUILD TRUE)
else()
set(RELEASE_BUILD FALSE)
Expand Down Expand Up @@ -109,11 +110,9 @@ option(BUILD_SHARED_LIBS "Build shared libs instead of static" OFF)
option(BUILD_STATIC_AND_SHARED "Build shared libs as well as static" OFF)

if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
if (WIN32)
message(FATAL_ERROR "Windows DLLs currently not supported")
else()
message(STATUS "Building shared libraries")
endif()
else()
message(STATUS "Building static libraries")
endif()

if (NOT BUILD_SHARED_LIBS)
Expand Down Expand Up @@ -151,9 +150,6 @@ if(MSVC OR MSVC_IDE)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O3 /Qstd=c99 /Qrestrict /wd4267 /Qdiag-disable:remark")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /Qstd=c++11 /Qrestrict /QxHost /wd4267 /wd4800 /Qdiag-disable:remark -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS")
else()
# todo: change these as required
set(ARCH_C_FLAGS "/arch:AVX2")
set(ARCH_CXX_FLAGS "/arch:AVX2")
set(MSVC_WARNS "/wd4101 /wd4146 /wd4172 /wd4200 /wd4244 /wd4267 /wd4307 /wd4334 /wd4805 /wd4996 -D_CRT_SECURE_NO_WARNINGS")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 ${MSVC_WARNS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 ${MSVC_WARNS} /wd4800 -DBOOST_DETAIL_NO_CONTAINER_FWD")
Expand Down Expand Up @@ -1298,12 +1294,14 @@ endif()
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
if (NOT FAT_RUNTIME)
add_library(hs_runtime_shared SHARED src/hs_version.c
src/hs_valid_platform.c $<TARGET_OBJECTS:hs_exec_shared>)
src/hs_valid_platform.c $<TARGET_OBJECTS:hs_exec_shared>
hs_runtime.def)
else()
add_library(hs_runtime_shared SHARED src/hs_version.c
src/hs_valid_platform.c
$<TARGET_OBJECTS:hs_exec_common_shared>
${RUNTIME_SHLIBS})
${RUNTIME_SHLIBS}
hs_runtime.def)
endif()
set_target_properties(hs_runtime_shared PROPERTIES
VERSION ${LIB_VERSION}
Expand Down Expand Up @@ -1349,7 +1347,7 @@ if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
${RUNTIME_SHLIBS})
endif ()

add_library(hs_shared SHARED ${hs_shared_SRCS})
add_library(hs_shared SHARED ${hs_shared_SRCS} hs.def)

add_dependencies(hs_shared ragel_Parser)
set_target_properties(hs_shared PROPERTIES
Expand Down
2 changes: 1 addition & 1 deletion chimera/ch_compile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ PatternData::PatternData(const char *pattern, u32 flags, u32 idx, u32 id_in,
ch_misc_free(info);

u32 guardflags;
guardflags = (flags | HS_FLAG_PREFILTER) & ~HS_FLAG_SINGLEMATCH;
guardflags = flags | HS_FLAG_PREFILTER;
guard = isHyperscanSupported(pattern, guardflags, platform);
} else {
// We can't even prefilter this pattern, so we're dependent on Big Dumb
Expand Down
69 changes: 69 additions & 0 deletions doc/dev-reference/compilation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,75 @@ version of Hyperscan used to scan with it.
Hyperscan provides support for targeting a database at a particular CPU
platform; see :ref:`instr_specialization` for details.

=====================
Compile Pure Literals
=====================

A pure literal is a special case of a regular expression. A character sequence
is regarded as a pure literal if and only if each character is read and
interpreted independently. No syntactic association exists between any adjacent
characters.

For example, consider an expression written as :regexp:`/bc?/`. We could say it
is a regular expression, meaning the character ``b`` followed by nothing or by
one character ``c``. Alternatively, we could say it is a pure literal
expression, meaning a character sequence of 3-byte length containing the
characters ``b``, ``c`` and ``?``. In the regular case, the question mark
character ``?`` has a particular syntactic role called the 0-1 quantifier, which
has a syntactic association with the character preceding it. Similar characters
exist in regular grammar, such as ``[``, ``]``, ``(``, ``)``, ``{``, ``}``,
``-``, ``*``, ``+``, ``\``, ``|``, ``/``, ``:``, ``^``, ``.``, ``$``.
In the pure literal case, all these meta characters lose their extra meaning
and are treated as ordinary ASCII codes.

Hyperscan was initially designed to process common regular expressions. It
therefore embeds a complex parser to do comprehensive regular grammar
interpretation. In particular, the identification of the above meta characters
is the basic step for the interpretation of far more complex regular grammar.

However, in real cases, patterns may not always be regular expressions. They
could just be pure literals. Problems arise if the pure literals contain
regular meta characters. If they are fed directly into the traditional Hyperscan
compile API, all these meta characters will be interpreted in predefined ways,
which is unnecessary and produces totally unexpected results. To avoid
such misinterpretation by the traditional API, users have to preprocess these
literal patterns by converting the meta characters into some other format:
either by adding a backslash ``\`` before certain meta characters, or by
converting all the characters into a hexadecimal representation.

In ``v5.2.0``, Hyperscan introduces 2 new compile APIs for pure literal patterns:

#. :c:func:`hs_compile_lit`: compiles a single pure literal into a pattern
database.

#. :c:func:`hs_compile_lit_multi`: compiles an array of pure literals into a
pattern database. All of the supplied patterns will be scanned for
concurrently at scan time, with user-supplied identifiers returned when they
match.

These 2 APIs are designed for use cases where all patterns contained in the
target rule set are pure literals. Users can pass the initial pure literal
content directly into these APIs without worrying about writing regular meta
characters in their patterns. No preprocessing work is needed any more.

For the new APIs, the ``length`` of each literal pattern is a newly added
parameter. Hyperscan locates the end of the input expression from this
explicitly supplied length, rather than by searching for the terminating
character ``\0`` of a string.

Supported flags: :c:member:`HS_FLAG_CASELESS`, :c:member:`HS_FLAG_MULTILINE`,
:c:member:`HS_FLAG_SINGLEMATCH`, :c:member:`HS_FLAG_SOM_LEFTMOST`.

.. note:: The literal compilation APIs do not support :ref:`extparam`. At
runtime, the traditional runtime APIs can still be used to match pure
literal patterns.

.. note:: If the target rule set contains at least one regular expression,
please use traditional compile APIs :c:func:`hs_compile`,
:c:func:`hs_compile_multi` and :c:func:`hs_compile_ext_multi`.
The new literal APIs introduced here are designed for rule sets
containing only pure literal expressions.

***************
Pattern Support
***************
Expand Down
43 changes: 43 additions & 0 deletions hs.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
; Hyperscan DLL export definitions
;
; Module-definition file for the full Hyperscan shared library (hs), covering
; both the compile-time and the runtime entry points.
; Keep this list in alphabetical order and add every new public API function
; here, otherwise it will not be visible to users of the Windows DLL.

LIBRARY hs

EXPORTS
hs_alloc_scratch
hs_clone_scratch
hs_close_stream
hs_compile
hs_compile_ext_multi
; Pure literal compile APIs introduced in v5.2.0.
hs_compile_lit
hs_compile_lit_multi
hs_compile_multi
hs_compress_stream
hs_copy_stream
hs_database_info
hs_database_size
hs_deserialize_database
hs_deserialize_database_at
hs_expand_stream
hs_expression_ext_info
hs_expression_info
hs_free_compile_error
hs_free_database
hs_free_scratch
hs_open_stream
hs_populate_platform
hs_reset_and_copy_stream
hs_reset_and_expand_stream
hs_reset_stream
hs_scan
hs_scan_stream
hs_scan_vector
hs_scratch_size
hs_serialize_database
hs_serialized_database_info
hs_serialized_database_size
hs_set_allocator
hs_set_database_allocator
hs_set_misc_allocator
hs_set_scratch_allocator
hs_set_stream_allocator
hs_stream_size
hs_valid_platform
hs_version
36 changes: 36 additions & 0 deletions hs_runtime.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
; Hyperscan DLL export definitions
;
; Module-definition file for the runtime-only shared library (hs_runtime).
; Unlike hs.def, this list contains no pattern-compilation entry points
; (hs_compile*, hs_expression_info, hs_expression_ext_info,
; hs_free_compile_error, hs_populate_platform).
; NOTE(review): symbols not listed under EXPORTS are presumably not visible to
; users of the DLL - confirm that the public headers do not also use
; __declspec(dllexport).

LIBRARY hs_runtime

EXPORTS
hs_alloc_scratch
hs_clone_scratch
hs_close_stream
hs_compress_stream
hs_copy_stream
hs_database_info
hs_database_size
hs_deserialize_database
hs_deserialize_database_at
hs_expand_stream
hs_free_database
hs_free_scratch
hs_open_stream
hs_reset_and_copy_stream
hs_reset_and_expand_stream
hs_reset_stream
hs_scan
hs_scan_stream
hs_scan_vector
hs_scratch_size
hs_serialize_database
hs_serialized_database_info
hs_serialized_database_size
hs_set_allocator
hs_set_database_allocator
hs_set_misc_allocator
hs_set_scratch_allocator
hs_set_stream_allocator
hs_stream_size
hs_valid_platform
hs_version
92 changes: 90 additions & 2 deletions src/compiler/compiler.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2018, Intel Corporation
* Copyright (c) 2015-2019, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
Expand Down Expand Up @@ -56,11 +56,13 @@
#include "parser/unsupported.h"
#include "parser/utf8_validate.h"
#include "rose/rose_build.h"
#include "rose/rose_internal.h"
#include "som/slot_manager_dump.h"
#include "util/bytecode_ptr.h"
#include "util/compile_error.h"
#include "util/target_info.h"
#include "util/verify_types.h"
#include "util/ue2string.h"

#include <algorithm>
#include <cassert>
Expand Down Expand Up @@ -107,6 +109,46 @@ void validateExt(const hs_expr_ext &ext) {

}

/**
 * \brief Append the first \p len bytes of \p expression to the literal.
 *
 * Each byte is pushed onto \c lit together with the \p nocase setting. The
 * byte count comes solely from \p len; no NUL terminator is looked for.
 */
void ParsedLitExpression::parseLiteral(const char *expression, size_t len,
                                       bool nocase) {
    const char *const last = expression + len;
    for (const char *cur = expression; cur != last; ++cur) {
        lit.push_back(*cur, nocase);
    }
}

/**
 * \brief Build the parsed form of a pure literal pattern.
 *
 * Validates \p flags, records the requested SOM mode on \c expr and converts
 * the \p expLength bytes of \p expression into \c lit.
 *
 * Throws CompileError if \p flags contains bits outside HS_FLAG_ALL, or if
 * HS_FLAG_SINGLEMATCH and HS_FLAG_SOM_LEFTMOST are both set.
 */
ParsedLitExpression::ParsedLitExpression(unsigned index_in,
                                         const char *expression,
                                         size_t expLength, unsigned flags,
                                         ReportID report)
    : expr(index_in, false, flags & HS_FLAG_SINGLEMATCH, false, false,
           SOM_NONE, report, 0, MAX_OFFSET, 0, 0, 0, false) {
    // The following HS_FLAG_ values have no use for a pure literal expression:
    // DOTALL/ALLOWEMPTY/UTF8/UCP/PREFILTER/COMBINATION/QUIET

    const bool singleMatch = (flags & HS_FLAG_SINGLEMATCH) != 0;
    const bool somLeftmost = (flags & HS_FLAG_SOM_LEFTMOST) != 0;
    const bool caseless = (flags & HS_FLAG_CASELESS) != 0;

    // Reject any bit that is not a known flag.
    if ((flags & ~HS_FLAG_ALL) != 0) {
        DEBUG_PRINTF("Unrecognised flag, flags=%u.\n", flags);
        throw CompileError("Unrecognised flag.");
    }

    // FIXME: we disallow highlander + SOM, see UE-1850.
    if (singleMatch && somLeftmost) {
        throw CompileError("HS_FLAG_SINGLEMATCH is not supported in "
                           "combination with HS_FLAG_SOM_LEFTMOST.");
    }

    // Record the start-of-match mode; it stays SOM_NONE unless requested.
    if (somLeftmost) {
        expr.som = SOM_LEFT;
    }

    // Copy the expression text into the ue2_literal.
    parseLiteral(expression, expLength, caseless);
}

ParsedExpression::ParsedExpression(unsigned index_in, const char *expression,
unsigned flags, ReportID report,
const hs_expr_ext *ext)
Expand Down Expand Up @@ -345,6 +387,49 @@ void addExpression(NG &ng, unsigned index, const char *expression,
}
}

/**
 * \brief Add a single pure literal pattern to \p ng.
 *
 * \param ng          Graph builder that receives the literal.
 * \param index       Index of this pattern within the rule set.
 * \param expression  Literal bytes; exactly \p expLength bytes are used, so
 *                    the buffer does not have to be NUL-terminated.
 * \param flags       HS_FLAG_* bits. Only HS_FLAG_CASELESS, HS_FLAG_MULTILINE,
 *                    HS_FLAG_SINGLEMATCH and HS_FLAG_SOM_LEFTMOST are accepted.
 * \param ext         Extended parameters; must be null or have no flags set.
 * \param id          Report ID for matches of this literal.
 * \param expLength   Length of \p expression in bytes.
 *
 * Throws CompileError if extended parameters are supplied, if the literal is
 * longer than the configured pattern length limit, or if an unsupported flag
 * is set.
 */
void addLitExpression(NG &ng, unsigned index, const char *expression,
                      unsigned flags, const hs_expr_ext *ext, ReportID id,
                      size_t expLength) {
    assert(expression);
    const CompileContext &cc = ng.cc;

    // Extended parameters are not supported for pure literal patterns.
    if (ext && ext->flags != 0LLU) {
        throw CompileError("Extended parameters are not supported for pure "
                           "literal matching API.");
    }

    // Ensure that our pattern isn't too long (in characters). The caller
    // passes the length explicitly, so it must be used here: strlen() would
    // stop at an embedded NUL byte and would read past the end of a buffer
    // that is not NUL-terminated.
    if (expLength > cc.grey.limitPatternLength) {
        throw CompileError("Pattern length exceeds limit.");
    }

    // Trace only after the length has been validated, and bound the printed
    // text by expLength for the same reason as above.
    DEBUG_PRINTF("index=%u, id=%u, flags=%u, expr='%.*s', len='%zu'\n", index,
                 id, flags, (int)expLength, expression, expLength);

    // filter out flags not supported by pure literal API.
    u64a not_supported = HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8 |
                         HS_FLAG_UCP | HS_FLAG_PREFILTER | HS_FLAG_COMBINATION |
                         HS_FLAG_QUIET;

    if (flags & not_supported) {
        throw CompileError("Only HS_FLAG_CASELESS, HS_FLAG_MULTILINE, "
                           "HS_FLAG_SINGLEMATCH and HS_FLAG_SOM_LEFTMOST are "
                           "supported in literal API.");
    }

    // This expression must be a pure literal, we can build ue2_literal
    // directly based on expression text.
    ParsedLitExpression ple(index, expression, expLength, flags, id);

    // Feed the ue2_literal into Rose.
    // NOTE(review): a false return from addLiteral() is silently ignored
    // here - confirm whether that case should raise a CompileError.
    const auto &expr = ple.expr;
    if (ng.addLiteral(ple.lit, expr.index, expr.report, expr.highlander,
                      expr.som, expr.quiet)) {
        DEBUG_PRINTF("took pure literal\n");
        return;
    }
}

static
bytecode_ptr<RoseEngine> generateRoseEngine(NG &ng) {
const u32 minWidth =
Expand Down Expand Up @@ -416,10 +501,13 @@ hs_database_t *dbCreate(const char *in_bytecode, size_t len, u64a platform) {
}


struct hs_database *build(NG &ng, unsigned int *length) {
struct hs_database *build(NG &ng, unsigned int *length, u8 pureFlag) {
assert(length);

auto rose = generateRoseEngine(ng);
struct RoseEngine *roseHead = rose.get();
roseHead->pureLiteral = pureFlag;

if (!rose) {
throw CompileError("Unable to generate bytecode.");
}
Expand Down
Loading

0 comments on commit 4cebdaa

Please sign in to comment.