Skip to content

Commit

Permalink
page segmentation(text block detection) revisit
Browse files Browse the repository at this point in the history
using segmentation algorithm from leptonica described in
https://research.google.com/pubs/archive/36668.pdf
  • Loading branch information
chrox committed Jun 14, 2016
1 parent 6415623 commit cd57b68
Show file tree
Hide file tree
Showing 3 changed files with 178 additions and 46 deletions.
162 changes: 131 additions & 31 deletions ffi/koptcontext.lua
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,13 @@ function KOPTContext_mt.__index:getWordBoxes(bmp, x, y, w, h, box_type)
local l_x0, l_y0, l_x1, l_y1

if box_type == 0 then
k2pdfopt.k2pdfopt_get_reflowed_word_boxes(self, bmp == "src" and self.src or self.dst,
ffi.new("int", x), ffi.new("int", y), ffi.new("int", w), ffi.new("int", h))
k2pdfopt.k2pdfopt_get_reflowed_word_boxes(self,
bmp == "src" and self.src or self.dst, x, y, w, h)
boxa = self.rboxa
nai = self.rnai
elseif box_type == 1 then
k2pdfopt.k2pdfopt_get_native_word_boxes(self, bmp == "src" and self.src or self.dst,
ffi.new("int", x), ffi.new("int", y), ffi.new("int", w), ffi.new("int", h))
k2pdfopt.k2pdfopt_get_native_word_boxes(self,
bmp == "src" and self.src or self.dst, x, y, w, h)
boxa = self.nboxa
nai = self.nnai
end
Expand Down Expand Up @@ -153,7 +153,7 @@ end
function KOPTContext_mt.__index:reflowToNativePosTransform(xc, yc, wr, hr)
local function wrectmap_reflow_distance(wrmap, x, y)
local function wrectmap_reflow_inside(wrmap, x, y)
return k2pdfopt.wrectmap_inside(wrmap, ffi.new("int", x), ffi.new("int", y)) ~= 0
return k2pdfopt.wrectmap_inside(wrmap, x, y) ~= 0
end
if wrectmap_reflow_inside(wrmap, x, y) then
return 0
Expand Down Expand Up @@ -216,8 +216,7 @@ end
function KOPTContext_mt.__index:getTOCRWord(bmp, x, y, w, h, datadir, lang, ocr_type, allow_spaces, std_proc)
local word = ffi.new("char[256]")
k2pdfopt.k2pdfopt_tocr_single_word(bmp == "src" and self.src or self.dst,
ffi.new("int", x), ffi.new("int", y), ffi.new("int", w), ffi.new("int", h),
word, 255, ffi.cast("char*", datadir), ffi.cast("char*", lang),
x, y, w, h, word, 255, ffi.cast("char*", datadir), ffi.cast("char*", lang),
ocr_type, allow_spaces, std_proc)
return ffi.string(word)
end
Expand All @@ -233,32 +232,120 @@ function KOPTContext_mt.__index:getAutoBBox()
return x0, y0, x1, y1
end

function KOPTContext_mt.__index:getPageRegions()
k2pdfopt.k2pdfopt_part_bmp(self)
local w, h = self.page_width, self.page_height
local regions = {}
for i = 0, self.pageregions.n - 1 do
local bmpregion = (self.pageregions.pageregion + i).bmpregion
local c1, c2 = bmpregion.c1, bmpregion.c2
local r1, r2 = bmpregion.r1, bmpregion.r2
if c2 > 0 and r2 > 0 then
table.insert(regions, {
x0 = c1/w, x1 = c2/w,
y0 = r1/h, y1 = r2/h
})
function KOPTContext_mt.__index:findPageBlocks()
if self.src.data then
local pixs = k2pdfopt.bitmap2pix(self.src,
0, 0, self.src.width, self.src.height)
local pixb = leptonica.pixThresholdToBinary(pixs, 128)
local pixr = leptonica.pixReduceRankBinaryCascade(pixb, 1, 0, 0, 0)
leptonica.pixDestroy(ffi.new('PIX *[1]', pixs))
leptonica.pixDestroy(ffi.new('PIX *[1]', pixb))

local pixtb = ffi.new("PIX *[1]")
local status = leptonica.pixGetRegionsBinary(pixr, nil, nil, pixtb, 0)
if status == 0 then
self.nboxa = leptonica.pixSplitIntoBoxa(pixtb[0], 5, 10, 20, 80, 10, 0)
for i = 0, leptonica.boxaGetCount(self.nboxa) - 1 do
local box = leptonica.boxaGetBox(self.nboxa, i, ffi.C.L_CLONE)
leptonica.boxAdjustSides(box, box, -1, 0, -1, 0)
end
self.rboxa = leptonica.boxaCombineOverlaps(self.nboxa)
self.page_width = leptonica.pixGetWidth(pixr)
self.page_height = leptonica.pixGetHeight(pixr)

-- uncomment this to show text blocks in situ
--leptonica.pixWritePng("textblock-mask.png", pixtb[0], 0.0)

leptonica.pixDestroy(ffi.new('PIX *[1]', pixtb))
end
leptonica.pixDestroy(ffi.new('PIX *[1]', pixr))
end
return regions
end

--[[
-- Get the page block at location x_rel, y_rel, both of which are in range
-- [0, 1] relative to page width and height respectively.
--
-- Needs self.src.data, self.nboxa and self.rboxa to be set; nboxa/rboxa are
-- assigned by findPageBlocks().
--
-- Returns a table { x0, y0, x1, y1 } in page-relative coordinates, or nil when
-- no block is found at that location.
--]]
function KOPTContext_mt.__index:getPageBlock(x_rel, y_rel)
    local block = nil
    if self.src.data and self.nboxa ~= nil and self.rboxa ~= nil then
        local w, h = self:getPageDim()

        -- Clip the native boxes against a 2-pixel-high strip spanning the
        -- page width at the requested height.
        local tbox = leptonica.boxCreate(0, y_rel * h, w, 2)
        local boxa = leptonica.boxaClipToBox(self.nboxa, tbox)
        leptonica.boxDestroy(ffi.new('BOX *[1]', tbox))
        for i = 0, leptonica.boxaGetCount(boxa) - 1 do
            local box = leptonica.boxaGetBox(boxa, i, ffi.C.L_CLONE)
            -- In-place adjustment of the left and top sides by -1
            -- (presumably so that adjacent boxes overlap and get merged by
            -- boxaCombineOverlaps below).
            leptonica.boxAdjustSides(box, box, -1, 0, -1, 0)
            -- Release the clone handle. The box itself stays owned by boxa;
            -- without this its reference count never drops back and
            -- boxaDestroy(boxa) cannot free it.
            leptonica.boxDestroy(ffi.new('BOX *[1]', box))
        end
        local boxatb = leptonica.boxaCombineOverlaps(boxa)
        leptonica.boxaDestroy(ffi.new('BOXA *[1]', boxa))

        local clipped_box, unclipped_box

        -- First merged strip box whose horizontal extent contains x_rel,
        -- stretched to the full page height.
        for i = 0, leptonica.boxaGetCount(boxatb) - 1 do
            local box = leptonica.boxaGetBox(boxatb, i, ffi.C.L_CLONE)
            if box.x / w <= x_rel and (box.x + box.w) / w >= x_rel then
                clipped_box = leptonica.boxCreate(box.x, 0, box.w, h)
            end
            leptonica.boxDestroy(ffi.new('BOX *[1]', box))
            if clipped_box ~= nil then break end
        end

        -- First box of rboxa that contains the point (x_rel, y_rel).
        for i = 0, leptonica.boxaGetCount(self.rboxa) - 1 do
            local box = leptonica.boxaGetBox(self.rboxa, i, ffi.C.L_CLONE)
            if box.x / w <= x_rel and (box.x + box.w) / w >= x_rel
                and box.y / h <= y_rel and (box.y + box.h) / h >= y_rel then
                unclipped_box = leptonica.boxCreate(box.x, box.y, box.w, box.h)
            end
            leptonica.boxDestroy(ffi.new('BOX *[1]', box))
            if unclipped_box ~= nil then break end
        end

        -- The resulting block is the intersection of the two boxes.
        if clipped_box ~= nil and unclipped_box ~= nil then
            local box = leptonica.boxOverlapRegion(clipped_box, unclipped_box)
            if box ~= nil then
                block = {
                    x0 = box.x / w, y0 = box.y / h,
                    x1 = (box.x + box.w) / w,
                    y1 = (box.y + box.h) / h,
                }
            end
            leptonica.boxDestroy(ffi.new('BOX *[1]', box))
        end
        if clipped_box ~= nil then
            leptonica.boxDestroy(ffi.new('BOX *[1]', clipped_box))
        end
        if unclipped_box ~= nil then
            leptonica.boxDestroy(ffi.new('BOX *[1]', unclipped_box))
        end

        -- uncomment this to show text blocks in situ
        --[[
        if block then
            local w, h = self.src.width, self.src.height
            local box = leptonica.boxCreate(block.x0*w, block.y0*h,
                (block.x1-block.x0)*w, (block.y1-block.y0)*h)
            local boxa = leptonica.boxaCreate(1)
            leptonica.boxaAddBox(boxa, box, ffi.C.L_COPY)
            local pixs = k2pdfopt.bitmap2pix(self.src,
                0, 0, self.src.width, self.src.height)
            local pixc = leptonica.pixDrawBoxaRandom(pixs, boxa, 8)
            leptonica.pixWritePng("textblock.png", pixc, 0.0)
            leptonica.pixDestroy(ffi.new('PIX *[1]', pixc))
            leptonica.boxaDestroy(ffi.new('BOXA *[1]', boxa))
            leptonica.boxDestroy(ffi.new('BOX *[1]', box))
        end
        --]]

        leptonica.boxaDestroy(ffi.new('BOXA *[1]', boxatb))
    end

    return block
end

--[[
-- draw highlights into pix and return leptonica pixmap
--]]
function KOPTContext_mt.__index:getSrcPix(pboxes, drawer)
if self.src ~= nil then
if self.src.data ~= nil then
local pix1 = k2pdfopt.bitmap2pix(self.src,
ffi.new("int", 0), ffi.new("int", 0),
ffi.new("int", self.src.width), ffi.new("int", self.src.height))
0, 0, self.src.width, self.src.height)
if pboxes and drawer == "lighten" then
local color = 0xFFFF0000
local bbox = self.bbox
Expand Down Expand Up @@ -292,7 +379,7 @@ function KOPTContext_mt.__index:exportSrcPNGString(pboxes, drawer)
if pix ~= nil then
local pdata = ffi.new("char *[1]")
local psize = ffi.new("size_t[1]")
leptonica.pixWriteMemPng(pdata, psize, pix, ffi.new("float", 0.0))
leptonica.pixWriteMemPng(pdata, psize, pix, 0.0)
leptonica.pixDestroy(ffi.new('PIX *[1]', pix))
if pdata[0] ~= nil then
local pngstr = ffi.string(pdata[0], psize[0])
Expand All @@ -314,7 +401,6 @@ function KOPTContext_mt.__index:free()
k2pdfopt.bmp_free(self.src)
k2pdfopt.bmp_free(self.dst)
k2pdfopt.wrectmaps_free(self.rectmaps)
k2pdfopt.pageregions_free(self.pageregions)
end

function KOPTContext_mt.__index:__gc() self:free() end
Expand Down Expand Up @@ -364,11 +450,28 @@ function KOPTContext.new()
kc.nboxa = nil
kc.nnai = nil
kc.language = nil

-- 1. in page reflowing context,
-- `src` is the source page image fed into k2pdfopt, and `dst` is the reflowed
-- page image. They usually have different page sizes.
-- 2. in page optimization context,
-- `src` is the source page image fed into k2pdfopt, and `dst` is the
-- de-watermarked page image. They have the same page size.
-- 3. in page segmentation context,
-- `src` is the source page image fed into leptonica, and `dst` is the
-- text block mask. They usually have different page sizes (the mask will be
-- scaled down to half width and height).
-- 4. in OCR context,
-- `src` is an image of a word to be OCRed fed into k2pdfopt, and `dst` is unused.
-- 5. in page cropping context,
-- `src` is the source page image fed into k2pdfopt, and `dst` is unused.
-- 6. in words boxing context,
-- `src` is the source page image fed into k2pdfopt, and `dst` is unused.
-- 7. in page drawing context,
-- `src` is the source page image fed into leptonica, and `dst` is unused.
k2pdfopt.bmp_init(kc.src)
k2pdfopt.bmp_init(kc.dst)
-- only used in words boxing context
k2pdfopt.wrectmaps_init(kc.rectmaps)
k2pdfopt.pageregions_init(kc.pageregions)

return kc
end
Expand Down Expand Up @@ -572,9 +675,6 @@ function KOPTContext.fromtable(context)
kc.rectmaps.wrectmap = nil
end

-- for now we don't serialize pageregions
k2pdfopt.pageregions_init(kc.pageregions)

return kc
end

Expand Down
36 changes: 27 additions & 9 deletions ffi/leptonica_h.lua
Original file line number Diff line number Diff line change
Expand Up @@ -66,25 +66,43 @@ struct Pix {
typedef struct Pix PIX;

/* Leptonica */
BOX * boxCreate(l_int32 x, l_int32 y, l_int32 w, l_int32 h);
BOXA * boxaCreate(l_int32 n);
BOX * boxaGetBox (BOXA *boxa, l_int32 index, l_int32 accessflag);
BOX * boxaGetBox(BOXA *boxa, l_int32 index, l_int32 accessflag);
NUMA * numaCreate(l_int32 n);
NUMA * numaCreateFromFArray(l_float32 *farray, l_int32 size, l_int32 copyflag);
BOXA * boxaCombineOverlaps(BOXA *boxas);
BOXA * boxaClipToBox(BOXA *boxas, BOX *box);
BOX * boxCopy(BOX *box);
BOX * boxClone(BOX *box);
BOX * boxOverlapRegion(BOX *box1, BOX *box2);
BOX * boxAdjustSides(BOX *boxd, BOX *boxs, l_int32 delleft, l_int32 delright, l_int32 deltop, l_int32 delbot);
l_int32 boxaAddBox(BOXA *boxa, BOX *box, l_int32 copyflag);
l_int32 numaGetMax (NUMA *na, l_float32 *pmaxval, l_int32 *pimaxloc);
l_int32 numaGetIValue (NUMA *na, l_int32 index, l_int32 *pival);
l_int32 boxaGetCount (BOXA *boxa);
l_int32 numaGetCount (NUMA *na);
void boxaDestroy (BOXA **pboxa);
void numaDestroy (NUMA **pna);
void pixDestroy (PIX **ppix);
l_int32 numaGetMax(NUMA *na, l_float32 *pmaxval, l_int32 *pimaxloc);
l_int32 numaGetIValue(NUMA *na, l_int32 index, l_int32 *pival);
l_int32 boxaGetCount(BOXA *boxa);
l_int32 numaGetCount(NUMA *na);
l_int32 boxaWrite(const char *filename, BOXA *boxa);
void boxDestroy(BOX **pbox);
void boxaDestroy(BOXA **pboxa);
void numaDestroy(NUMA **pna);
void pixDestroy(PIX **ppix);
l_int32 pixWritePng(const char *filename, PIX *pix, l_float32 gamma);
l_int32 pixWriteMemPng(l_uint8 **pdata, size_t *psize, PIX *pix, l_float32 gamma);
l_uint32 * pixGetData(PIX *pix);
l_int32 pixGetWidth(PIX *pix);
l_int32 pixGetHeight(PIX *pix);
l_int32 pixGetDepth(PIX *pix);
l_int32 pixGetWpl(PIX *pix);
l_int32 pixSetPixel(PIX *pix, l_int32 x, l_int32 y, l_uint32 val);
l_uint32 * pixGetData(PIX *pix);
PIX * pixCreate(l_int32 width, l_int32 height, l_int32 depth);
PIX * pixConvertTo1(PIX *pixs, l_int32 threshold);
PIX * pixThresholdToBinary(PIX *pixs, l_int32 thresh);
PIX * pixConvertTo32(PIX *pixs);
PIX * pixDrawBoxaRandom(PIX *pixs, BOXA *boxa, l_int32 width);
PIX * pixMultiplyByColor(PIX *pixd, PIX *pixs, BOX *box, l_uint32 color);
PIX * pixBlendBackgroundToColor(PIX *pixd, PIX *pixs, BOX *box, l_uint32 color, l_float32 gamma, l_int32 minval, l_int32 maxval);
l_int32 pixGetRegionsBinary(PIX *pixs, PIX **ppixhm, PIX **ppixtm, PIX **ppixtb, l_int32 debug);
BOXA * pixSplitIntoBoxa(PIX *pixs, l_int32 minsum, l_int32 skipdist, l_int32 delta, l_int32 maxbg, l_int32 maxcomps, l_int32 remainder);
PIX * pixReduceRankBinaryCascade(PIX *pixs, l_int32 level1, l_int32 level2, l_int32 level3, l_int32 level4);
]]
26 changes: 20 additions & 6 deletions spec/unit/koptcontext_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@ local paper_pdf = "spec/base/unit/data/Paper.pdf"

describe("KOPTContext module", function()
local sample_pdf_doc
local paper_pdf_doc

setup(function()
sample_pdf_doc = mupdf.openDocument(sample_pdf)
paper_pdf_doc = mupdf.openDocument(paper_pdf)
end)

teardown(function()
Expand Down Expand Up @@ -143,14 +145,26 @@ describe("KOPTContext module", function()
kc:free()
assert(kc.dst.size_allocated == 0)
end)
it("should get list of page regions #notest", function()
it("should get page textblock at any relative location", function()
local kc = KOPTContext.new()
sample_pdf_doc:openPage(1):toBmp(kc.src, 300)
paper_pdf_doc:openPage(1):toBmp(kc.src, 150)
kc.page_width, kc.page_height = kc.src.width, kc.src.height
local regions = kc:getPageRegions()
for i = 1, #regions do
assert(regions[i].x1 - regions[i].x0 <= 1)
assert(regions[i].y1 - regions[i].y0 <= 1)
kc:findPageBlocks()
local block = kc:getPageBlock(0.6, 0.5)
assert.truthy(block.x1 > 0 and block.x0 > 0)
assert.truthy(block.x1 - block.x0 < 0.5) -- we know this is a two-column page
assert.truthy(block.x0 <= 0.6 and block.x1 >= 0.6)
assert.truthy(block.y0 <= 0.5 and block.y1 >= 0.5)
for y = 0, 1, 0.2 do
for x = 0, 1, 0.2 do
local block = kc:getPageBlock(x, y)
if block then
assert.truthy(block.x1 > 0 and block.x0 > 0)
assert.truthy(block.x1 - block.x0 < 0.5)
assert.truthy(block.x0 <= x and block.x1 >= x)
assert.truthy(block.y0 <= y and block.y1 >= y)
end
end
end
end)
it("should convert koptcontext to/from table", function()
Expand Down

0 comments on commit cd57b68

Please sign in to comment.