Skip to content

Commit

Permalink
Add utf8proc ffi, cleanup cre.cpp (#1382)
Browse files Browse the repository at this point in the history
Remove getLowercasedWord() via crengine, as we can use
utf8proc_NFKC_Casefold() via ffi.
Switch some user hyphenation document methods to be
functions as they don't need a document.
  • Loading branch information
zwim committed Jul 14, 2021
1 parent dfa6f74 commit be4537e
Show file tree
Hide file tree
Showing 4 changed files with 297 additions and 27 deletions.
43 changes: 16 additions & 27 deletions cre.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,20 @@ static int getDomVersionWithNormalizedXPointers(lua_State *L) {
return 1;
}

// Lua binding: cre.setUserHyphenationDict(filename, reload)
//   arg 1: dictionary file name (string, required)
//   arg 2: reload flag, read with Lua truthiness (absent/nil/false -> false)
// Pushes the integer result of UserHyphDict::init() and returns it to Lua.
static int setUserHyphenationDict(lua_State *L) {
    const char *dict_path = luaL_checkstring(L, 1);
    bool force_reload = lua_toboolean(L, 2);
    lua_pushinteger(L, UserHyphDict::init(dict_path, force_reload));
    return 1; // one result on the Lua stack
}

// Lua binding: cre.getHyphenationForWord(word)
//   arg 1: the word to look up (string, required)
// Pushes the result of UserHyphDict::getHyphenation(), converted with
// UnicodeToLocal(), as a Lua string.
static int getHyphenationForWord(lua_State *L) {
    lString32 hyph = UserHyphDict::getHyphenation(luaL_checkstring(L, 1));
    // The converted temporary stays alive until the end of this full
    // expression, so c_str() is valid while lua_pushstring() copies it.
    lua_pushstring(L, UnicodeToLocal(hyph).c_str());
    return 1; // one result on the Lua stack
}

static int getIntProperty(lua_State *L) {
CreDocument *doc = (CreDocument*) luaL_checkudata(L, 1, "credocument");
const char *propName = luaL_checkstring(L, 2);
Expand Down Expand Up @@ -3510,30 +3524,6 @@ static int getImageDataFromPosition(lua_State *L) {
return 0;
}

// Lua method form: doc:setUserHyphenationDict(filename, reload)
//   arg 1: must be a "credocument" userdata (checked, otherwise unused)
//   arg 2: dictionary file name (string, required)
//   arg 3: reload flag, read with Lua truthiness
// Pushes the integer result of UserHyphDict::init().
static int setUserHyphenationDict(lua_State *L) {
    // Only the type check matters here; the document itself is never touched.
    luaL_checkudata(L, 1, "credocument");
    const char *dict_path = luaL_checkstring(L, 2);
    bool force_reload = lua_toboolean(L, 3);
    lua_pushinteger(L, UserHyphDict::init(dict_path, force_reload));
    return 1; // one result on the Lua stack
}

// Lua method form: doc:getHyphenationForWord(word)
//   arg 1: must be a "credocument" userdata (checked, otherwise unused)
//   arg 2: the word to look up (string, required)
// Pushes the result of UserHyphDict::getHyphenation(), converted with
// UnicodeToLocal(), as a Lua string.
static int getHyphenationForWord(lua_State *L) {
    // Only the type check matters here; the document itself is never touched.
    luaL_checkudata(L, 1, "credocument");
    lString32 hyph = UserHyphDict::getHyphenation(luaL_checkstring(L, 2));
    // The converted temporary stays alive until the end of this full
    // expression, so c_str() is valid while lua_pushstring() copies it.
    lua_pushstring(L, UnicodeToLocal(hyph).c_str());
    return 1; // one result on the Lua stack
}

// Lua method form: doc:getLowercasedWord(word)
//   arg 1: must be a "credocument" userdata (checked, otherwise unused)
//   arg 2: the word to lowercase (string, required)
// Pushes the result of lString32::lowercase(), converted with
// UnicodeToLocal(), as a Lua string.
static int getLowercasedWord(lua_State *L) {
    // Only the type check matters here; the document itself is never touched.
    luaL_checkudata(L, 1, "credocument");
    const char *input = luaL_checkstring(L, 2);
    lString32 text(input);
    lua_pushstring(L, UnicodeToLocal(text.lowercase()).c_str());
    return 1; // one result on the Lua stack
}

static const struct luaL_Reg cre_func[] = {
{"initCache", initCache},
{"initHyphDict", initHyphDict},
Expand All @@ -3552,6 +3542,8 @@ static const struct luaL_Reg cre_func[] = {
{"getTextLangStatus", getTextLangStatus},
{"getLatestDomVersion", getLatestDomVersion},
{"getDomVersionWithNormalizedXPointers", getDomVersionWithNormalizedXPointers},
{"setUserHyphenationDict", setUserHyphenationDict},
{"getHyphenationForWord", getHyphenationForWord},
{NULL, NULL}
};

Expand Down Expand Up @@ -3661,9 +3653,6 @@ static const struct luaL_Reg credocument_meth[] = {
{"getPageMapXPointerPageLabel", getPageMapXPointerPageLabel},
{"getPageMapVisiblePageLabels", getPageMapVisiblePageLabels},
{"hasNonLinearFlows", hasNonLinearFlows},
{"setUserHyphenationDict", setUserHyphenationDict},
{"getHyphenationForWord", getHyphenationForWord},
{"getLowercasedWord", getLowercasedWord},
{"checkRegex", checkRegex},
{"getAndClearRegexSearchError", getAndClearRegexSearchError},
{"readDefaults", readDefaults},
Expand Down
64 changes: 64 additions & 0 deletions ffi-cdecl/utf8proc_decl.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
 * ffi-cdecl input listing the utf8proc symbols to export as LuaJIT FFI
 * declarations. Each cdecl_*() line names one type, constant, struct or
 * function from <utf8proc.h>.
 * NOTE(review): the cdecl_* macros presumably come from ffi-cdecl.h, and the
 * generated output appears to be ffi/utf8proc_h.lua -- confirm against the
 * build scripts.
 */
#include <utf8proc.h>

#include "ffi-cdecl.h"

/* Fixed-width integer, size and boolean typedefs. */
cdecl_type(utf8proc_int8_t)
cdecl_type(utf8proc_uint8_t)
cdecl_type(utf8proc_int16_t)
cdecl_type(utf8proc_uint16_t)
cdecl_type(utf8proc_int32_t)
cdecl_type(utf8proc_uint32_t)
cdecl_type(utf8proc_ssize_t)
cdecl_type(utf8proc_size_t)
cdecl_type(utf8proc_bool)

/* Option flags enum. */
cdecl_type(utf8proc_option_t)

/* Error codes. */
cdecl_const(UTF8PROC_ERROR_NOMEM)
cdecl_const(UTF8PROC_ERROR_OVERFLOW)
cdecl_const(UTF8PROC_ERROR_INVALIDUTF8)
cdecl_const(UTF8PROC_ERROR_NOTASSIGNED)
cdecl_const(UTF8PROC_ERROR_INVALIDOPTS)

cdecl_type(utf8proc_propval_t)

/* Per-codepoint property record and its typedef. */
cdecl_struct(utf8proc_property_struct)
cdecl_type(utf8proc_property_t)

/* Enumerations: category, bidi class, decomposition type, boundary class. */
cdecl_type(utf8proc_category_t)
cdecl_type(utf8proc_bidi_class_t)
cdecl_type(utf8proc_decomp_type_t)
cdecl_type(utf8proc_boundclass_t)

/* Callback type taken by the *_custom functions. */
cdecl_type(utf8proc_custom_func)

/* Exported data and functions. */
cdecl_func(utf8proc_utf8class)
cdecl_func(utf8proc_version)
cdecl_func(utf8proc_unicode_version)
cdecl_func(utf8proc_errmsg)
cdecl_func(utf8proc_iterate)
cdecl_func(utf8proc_codepoint_valid)
cdecl_func(utf8proc_encode_char)
cdecl_func(utf8proc_get_property)
cdecl_func(utf8proc_decompose_char)
cdecl_func(utf8proc_decompose)
cdecl_func(utf8proc_decompose_custom)
cdecl_func(utf8proc_normalize_utf32)
cdecl_func(utf8proc_reencode)
cdecl_func(utf8proc_grapheme_break_stateful)
cdecl_func(utf8proc_grapheme_break)
cdecl_func(utf8proc_tolower)
cdecl_func(utf8proc_toupper)
cdecl_func(utf8proc_totitle)
cdecl_func(utf8proc_islower)
cdecl_func(utf8proc_isupper)
cdecl_func(utf8proc_charwidth)
cdecl_func(utf8proc_category)
cdecl_func(utf8proc_category_string)
cdecl_func(utf8proc_map)
cdecl_func(utf8proc_map_custom)
cdecl_func(utf8proc_NFD)
cdecl_func(utf8proc_NFC)
cdecl_func(utf8proc_NFKD)
cdecl_func(utf8proc_NFKC)
cdecl_func(utf8proc_NFKC_Casefold)
31 changes: 31 additions & 0 deletions ffi/utf8proc.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
--[[--
Module for utf8 string operations.
This is a LuaJIT FFI wrapper for utf8proc.
]]

local ffi = require("ffi")
local C = ffi.C

require("ffi/posix_h")
require("ffi/utf8proc_h")

-- Path of the utf8proc shared library for the platforms that need a
-- non-default file name; every other value of ffi.os uses the ".so.2" name.
local libutf8proc_paths = {
    Windows = "libs/libutf8proc.dll",
    OSX = "libs/libutf8proc.dylib",
}
local libutf8proc = ffi.load(libutf8proc_paths[ffi.os] or "libs/libutf8proc.so.2")

local Utf8Proc = {}

--- Case-fold a UTF-8 string using utf8proc's NFKC_Casefold mapping.
-- The input is handed to C as a NUL-terminated string, so anything after an
-- embedded NUL byte is not processed.
-- @string str UTF-8 encoded text
-- @treturn string the folded text, or `str` unchanged when utf8proc fails
function Utf8Proc.lowercase(str)
    local folded_strz = libutf8proc.utf8proc_NFKC_Casefold(str)
    -- utf8proc returns NULL on error (e.g. invalid UTF-8 or out of memory).
    -- ffi.string() would dereference that NULL and crash the process, so
    -- hand the input back untouched instead. A NULL cdata pointer compares
    -- equal to nil in LuaJIT.
    if folded_strz == nil then
        return str
    end
    -- Copy into a Lua string before releasing the malloc'ed C buffer.
    local folded_str = ffi.string(folded_strz)
    C.free(folded_strz)
    return folded_str
end

return Utf8Proc
186 changes: 186 additions & 0 deletions ffi/utf8proc_h.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
local ffi = require("ffi")

-- C declarations for utf8proc, for use through the LuaJIT FFI.
-- NOTE(review): this looks like ffi-cdecl output for ffi-cdecl/utf8proc_decl.c;
-- if so, regenerating it will overwrite the size typedefs below -- re-apply them.
-- utf8proc_ssize_t / utf8proc_size_t are declared as ptrdiff_t / size_t (both
-- predefined by the LuaJIT C parser) instead of `long` / `unsigned long`:
-- `long` is only 32 bits wide on 64-bit Windows, which would mis-declare every
-- length argument and return value there. On Linux and macOS the two
-- spellings have the same size, so nothing changes for those targets.
ffi.cdef[[
typedef signed char utf8proc_int8_t;
typedef unsigned char utf8proc_uint8_t;
typedef short int utf8proc_int16_t;
typedef short unsigned int utf8proc_uint16_t;
typedef int utf8proc_int32_t;
typedef unsigned int utf8proc_uint32_t;
typedef ptrdiff_t utf8proc_ssize_t;
typedef size_t utf8proc_size_t;
typedef bool utf8proc_bool;
typedef enum {
UTF8PROC_NULLTERM = 1,
UTF8PROC_STABLE = 2,
UTF8PROC_COMPAT = 4,
UTF8PROC_COMPOSE = 8,
UTF8PROC_DECOMPOSE = 16,
UTF8PROC_IGNORE = 32,
UTF8PROC_REJECTNA = 64,
UTF8PROC_NLF2LS = 128,
UTF8PROC_NLF2PS = 256,
UTF8PROC_NLF2LF = 384,
UTF8PROC_STRIPCC = 512,
UTF8PROC_CASEFOLD = 1024,
UTF8PROC_CHARBOUND = 2048,
UTF8PROC_LUMP = 4096,
UTF8PROC_STRIPMARK = 8192,
UTF8PROC_STRIPNA = 16384,
} utf8proc_option_t;
static const int UTF8PROC_ERROR_NOMEM = -1;
static const int UTF8PROC_ERROR_OVERFLOW = -2;
static const int UTF8PROC_ERROR_INVALIDUTF8 = -3;
static const int UTF8PROC_ERROR_NOTASSIGNED = -4;
static const int UTF8PROC_ERROR_INVALIDOPTS = -5;
typedef short int utf8proc_propval_t;
struct utf8proc_property_struct {
utf8proc_propval_t category;
utf8proc_propval_t combining_class;
utf8proc_propval_t bidi_class;
utf8proc_propval_t decomp_type;
utf8proc_uint16_t decomp_seqindex;
utf8proc_uint16_t casefold_seqindex;
utf8proc_uint16_t uppercase_seqindex;
utf8proc_uint16_t lowercase_seqindex;
utf8proc_uint16_t titlecase_seqindex;
utf8proc_uint16_t comb_index;
unsigned int bidi_mirrored : 1;
unsigned int comp_exclusion : 1;
unsigned int ignorable : 1;
unsigned int control_boundary : 1;
unsigned int charwidth : 2;
unsigned int pad : 2;
unsigned char boundclass;
};
typedef struct utf8proc_property_struct utf8proc_property_t;
typedef enum {
UTF8PROC_CATEGORY_CN = 0,
UTF8PROC_CATEGORY_LU = 1,
UTF8PROC_CATEGORY_LL = 2,
UTF8PROC_CATEGORY_LT = 3,
UTF8PROC_CATEGORY_LM = 4,
UTF8PROC_CATEGORY_LO = 5,
UTF8PROC_CATEGORY_MN = 6,
UTF8PROC_CATEGORY_MC = 7,
UTF8PROC_CATEGORY_ME = 8,
UTF8PROC_CATEGORY_ND = 9,
UTF8PROC_CATEGORY_NL = 10,
UTF8PROC_CATEGORY_NO = 11,
UTF8PROC_CATEGORY_PC = 12,
UTF8PROC_CATEGORY_PD = 13,
UTF8PROC_CATEGORY_PS = 14,
UTF8PROC_CATEGORY_PE = 15,
UTF8PROC_CATEGORY_PI = 16,
UTF8PROC_CATEGORY_PF = 17,
UTF8PROC_CATEGORY_PO = 18,
UTF8PROC_CATEGORY_SM = 19,
UTF8PROC_CATEGORY_SC = 20,
UTF8PROC_CATEGORY_SK = 21,
UTF8PROC_CATEGORY_SO = 22,
UTF8PROC_CATEGORY_ZS = 23,
UTF8PROC_CATEGORY_ZL = 24,
UTF8PROC_CATEGORY_ZP = 25,
UTF8PROC_CATEGORY_CC = 26,
UTF8PROC_CATEGORY_CF = 27,
UTF8PROC_CATEGORY_CS = 28,
UTF8PROC_CATEGORY_CO = 29,
} utf8proc_category_t;
typedef enum {
UTF8PROC_BIDI_CLASS_L = 1,
UTF8PROC_BIDI_CLASS_LRE = 2,
UTF8PROC_BIDI_CLASS_LRO = 3,
UTF8PROC_BIDI_CLASS_R = 4,
UTF8PROC_BIDI_CLASS_AL = 5,
UTF8PROC_BIDI_CLASS_RLE = 6,
UTF8PROC_BIDI_CLASS_RLO = 7,
UTF8PROC_BIDI_CLASS_PDF = 8,
UTF8PROC_BIDI_CLASS_EN = 9,
UTF8PROC_BIDI_CLASS_ES = 10,
UTF8PROC_BIDI_CLASS_ET = 11,
UTF8PROC_BIDI_CLASS_AN = 12,
UTF8PROC_BIDI_CLASS_CS = 13,
UTF8PROC_BIDI_CLASS_NSM = 14,
UTF8PROC_BIDI_CLASS_BN = 15,
UTF8PROC_BIDI_CLASS_B = 16,
UTF8PROC_BIDI_CLASS_S = 17,
UTF8PROC_BIDI_CLASS_WS = 18,
UTF8PROC_BIDI_CLASS_ON = 19,
UTF8PROC_BIDI_CLASS_LRI = 20,
UTF8PROC_BIDI_CLASS_RLI = 21,
UTF8PROC_BIDI_CLASS_FSI = 22,
UTF8PROC_BIDI_CLASS_PDI = 23,
} utf8proc_bidi_class_t;
typedef enum {
UTF8PROC_DECOMP_TYPE_FONT = 1,
UTF8PROC_DECOMP_TYPE_NOBREAK = 2,
UTF8PROC_DECOMP_TYPE_INITIAL = 3,
UTF8PROC_DECOMP_TYPE_MEDIAL = 4,
UTF8PROC_DECOMP_TYPE_FINAL = 5,
UTF8PROC_DECOMP_TYPE_ISOLATED = 6,
UTF8PROC_DECOMP_TYPE_CIRCLE = 7,
UTF8PROC_DECOMP_TYPE_SUPER = 8,
UTF8PROC_DECOMP_TYPE_SUB = 9,
UTF8PROC_DECOMP_TYPE_VERTICAL = 10,
UTF8PROC_DECOMP_TYPE_WIDE = 11,
UTF8PROC_DECOMP_TYPE_NARROW = 12,
UTF8PROC_DECOMP_TYPE_SMALL = 13,
UTF8PROC_DECOMP_TYPE_SQUARE = 14,
UTF8PROC_DECOMP_TYPE_FRACTION = 15,
UTF8PROC_DECOMP_TYPE_COMPAT = 16,
} utf8proc_decomp_type_t;
typedef enum {
UTF8PROC_BOUNDCLASS_START = 0,
UTF8PROC_BOUNDCLASS_OTHER = 1,
UTF8PROC_BOUNDCLASS_CR = 2,
UTF8PROC_BOUNDCLASS_LF = 3,
UTF8PROC_BOUNDCLASS_CONTROL = 4,
UTF8PROC_BOUNDCLASS_EXTEND = 5,
UTF8PROC_BOUNDCLASS_L = 6,
UTF8PROC_BOUNDCLASS_V = 7,
UTF8PROC_BOUNDCLASS_T = 8,
UTF8PROC_BOUNDCLASS_LV = 9,
UTF8PROC_BOUNDCLASS_LVT = 10,
UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11,
UTF8PROC_BOUNDCLASS_SPACINGMARK = 12,
UTF8PROC_BOUNDCLASS_PREPEND = 13,
UTF8PROC_BOUNDCLASS_ZWJ = 14,
UTF8PROC_BOUNDCLASS_E_BASE = 15,
UTF8PROC_BOUNDCLASS_E_MODIFIER = 16,
UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17,
UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18,
UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC = 19,
UTF8PROC_BOUNDCLASS_E_ZWG = 20,
} utf8proc_boundclass_t;
typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t, void *);
extern const utf8proc_int8_t utf8proc_utf8class[256] __attribute__((visibility("default")));
const char *utf8proc_version(void) __attribute__((visibility("default")));
const char *utf8proc_unicode_version(void) __attribute__((visibility("default")));
const char *utf8proc_errmsg(utf8proc_ssize_t) __attribute__((visibility("default")));
utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *, utf8proc_ssize_t, utf8proc_int32_t *) __attribute__((visibility("default")));
utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t) __attribute__((visibility("default")));
utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t, utf8proc_uint8_t *) __attribute__((visibility("default")));
const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t) __attribute__((visibility("default")));
utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t, utf8proc_int32_t *, utf8proc_ssize_t, utf8proc_option_t, int *) __attribute__((visibility("default")));
utf8proc_ssize_t utf8proc_decompose(const utf8proc_uint8_t *, utf8proc_ssize_t, utf8proc_int32_t *, utf8proc_ssize_t, utf8proc_option_t) __attribute__((visibility("default")));
utf8proc_ssize_t utf8proc_decompose_custom(const utf8proc_uint8_t *, utf8proc_ssize_t, utf8proc_int32_t *, utf8proc_ssize_t, utf8proc_option_t, utf8proc_custom_func, void *) __attribute__((visibility("default")));
utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *, utf8proc_ssize_t, utf8proc_option_t) __attribute__((visibility("default")));
utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *, utf8proc_ssize_t, utf8proc_option_t) __attribute__((visibility("default")));
utf8proc_bool utf8proc_grapheme_break_stateful(utf8proc_int32_t, utf8proc_int32_t, utf8proc_int32_t *) __attribute__((visibility("default")));
utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t, utf8proc_int32_t) __attribute__((visibility("default")));
utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t) __attribute__((visibility("default")));
utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t) __attribute__((visibility("default")));
utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t) __attribute__((visibility("default")));
int utf8proc_islower(utf8proc_int32_t) __attribute__((visibility("default")));
int utf8proc_isupper(utf8proc_int32_t) __attribute__((visibility("default")));
int utf8proc_charwidth(utf8proc_int32_t) __attribute__((visibility("default")));
utf8proc_category_t utf8proc_category(utf8proc_int32_t) __attribute__((visibility("default")));
const char *utf8proc_category_string(utf8proc_int32_t) __attribute__((visibility("default")));
utf8proc_ssize_t utf8proc_map(const utf8proc_uint8_t *, utf8proc_ssize_t, utf8proc_uint8_t **, utf8proc_option_t) __attribute__((visibility("default")));
utf8proc_ssize_t utf8proc_map_custom(const utf8proc_uint8_t *, utf8proc_ssize_t, utf8proc_uint8_t **, utf8proc_option_t, utf8proc_custom_func, void *) __attribute__((visibility("default")));
utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *) __attribute__((visibility("default")));
utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *) __attribute__((visibility("default")));
utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *) __attribute__((visibility("default")));
utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *) __attribute__((visibility("default")));
utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *) __attribute__((visibility("default")));
]]

0 comments on commit be4537e

Please sign in to comment.