Skip to content

Commit

Permalink
Misc
Browse files Browse the repository at this point in the history
  • Loading branch information
hupe1980 committed Jan 3, 2024
1 parent ed3318c commit b799340
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 37 deletions.
74 changes: 49 additions & 25 deletions layout.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,55 @@ func (l *Layout) Text(optFns ...func(*TextLinearizationOptions)) string {
return ""
}

var text string

switch l.BlockType() { // nolint exhaustive
case types.BlockTypeLayoutList:
items := make([]string, 0, len(l.children))

for _, c := range l.children {
itemText := c.Text(func(tlo *TextLinearizationOptions) {
*tlo = opts
})

if opts.RemoveNewLinesInListElements {
itemText = strings.ReplaceAll(itemText, "\n", " ")
}

items = append(items, fmt.Sprintf("%s%s%s", opts.ListElementPrefix, itemText, opts.ListElementSuffix))
}

text = strings.Join(items, opts.ListElementSeparator)
case types.BlockTypeLayoutPageNumber:
text = l.linearizeChildren(l.children, opts)
text = fmt.Sprintf("%s%s%s", opts.PageNumberPrefix, text, opts.PageNumberSuffix)
case types.BlockTypeLayoutTitle:
text = l.linearizeChildren(l.children, opts)
text = fmt.Sprintf("%s%s%s", opts.TitlePrefix, text, opts.TitleSuffix)
case types.BlockTypeLayoutSectionHeader:
text = l.linearizeChildren(l.children, opts)
text = fmt.Sprintf("%s%s%s", opts.SectionHeaderPrefix, text, opts.SectionHeaderSuffix)
default:
text = l.linearizeChildren(l.children, opts)
}

invalidSeparator := strings.Repeat("\n", opts.MaxNumberOfConsecutiveNewLines+1)
validSeperator := strings.Repeat("\n", opts.MaxNumberOfConsecutiveNewLines)

for strings.Contains(text, invalidSeparator) {
text = strings.ReplaceAll(text, invalidSeparator, validSeperator)
}

return text
}

func (l *Layout) linearizeChildren(children []LayoutChild, opts TextLinearizationOptions) string {
var (
text string
prev LayoutChild
)

for _, group := range groupElementsHorizontally(l.children, opts.HeuristicOverlapRatio) {
for _, group := range groupElementsHorizontally(children, opts.HeuristicOverlapRatio) {
sort.Slice(group, func(i, j int) bool {
return group[i].BoundingBox().Left() < group[j].BoundingBox().Left()
})
Expand All @@ -57,8 +100,11 @@ func (l *Layout) Text(optFns ...func(*TextLinearizationOptions)) string {
})

switch child.(type) {
case *Table, *KeyValue:
text += childText
case *Table:
text += fmt.Sprintf("%s%s%s", opts.TableLayoutPrefix, childText, opts.TableLayoutSuffix)
addRowSeparatorIfTableLayout = false
case *KeyValue:
text += fmt.Sprintf("%s%s%s", opts.KeyValueLayoutPrefix, childText, opts.KeyValueLayoutSuffix)
addRowSeparatorIfTableLayout = false
default:
if l.BlockType() == types.BlockTypeLayoutTable {
Expand Down Expand Up @@ -104,28 +150,6 @@ func (l *Layout) Text(optFns ...func(*TextLinearizationOptions)) string {
}
}

switch l.BlockType() { // nolint exhaustive
case types.BlockTypeLayoutPageNumber:
if opts.AddPrefixesAndSuffixes {
text = fmt.Sprintf("%s%s%s", opts.PageNumberPrefix, text, opts.PageNumberSuffix)
}
case types.BlockTypeLayoutTitle:
if opts.AddPrefixesAndSuffixes {
text = fmt.Sprintf("%s%s%s", opts.TitlePrefix, text, opts.TitleSuffix)
}
case types.BlockTypeLayoutSectionHeader:
if opts.AddPrefixesAndSuffixes {
text = fmt.Sprintf("%s%s%s", opts.SectionHeaderPrefix, text, opts.SectionHeaderSuffix)
}
}

invalidSeparator := strings.Repeat("\n", opts.MaxNumberOfConsecutiveNewLines+1)
validSeperator := strings.Repeat("\n", opts.MaxNumberOfConsecutiveNewLines)

for strings.Contains(text, invalidSeparator) {
text = strings.ReplaceAll(text, invalidSeparator, validSeperator)
}

return text
}

Expand Down
26 changes: 19 additions & 7 deletions options.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ type TextLinearizationOptions struct {
// LinearizeKeyValues includes form key and values in the linearized output.
LinearizeKeyValues bool

// RemoveNewLinesInLeafElements removes new lines in leaf layout elements, removing extra whitespace.
RemoveNewLinesInLeafElements bool
// RemoveNewLinesInListElements removes new lines in list elements.
RemoveNewLinesInListElements bool

// MaxNumberOfConsecutiveNewLines sets the maximum number of consecutive new lines to keep, removing extra whitespace.
MaxNumberOfConsecutiveNewLines int
Expand Down Expand Up @@ -59,6 +59,12 @@ type TextLinearizationOptions struct {
// TitleSuffix is the suffix for title layout elements.
TitleSuffix string

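// TableLayoutPrefix is the prefix placed before a table when it is linearized as a child of a layout (not the table's own TablePrefix, which Table.Text applies).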
TableLayoutPrefix string

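// TableLayoutSuffix is the suffix placed after a table when it is linearized as a child of a layout (not the table's own TableSuffix, which Table.Text applies).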
TableLayoutSuffix string

// TableLinearizationFormat sets how to represent tables in the linearized output. Choices are plaintext or markdown.
TableLinearizationFormat string

Expand Down Expand Up @@ -101,6 +107,12 @@ type TextLinearizationOptions struct {
// TextSuffix is the suffix for text layout elements.
TextSuffix string

// KeyValueLayoutPrefix is the prefix for key_value layout elements (not for individual key-value elements).
KeyValueLayoutPrefix string

// KeyValueLayoutSuffix is the suffix for key_value layout elements (not for individual key-value elements).
KeyValueLayoutSuffix string

// KeyValuePrefix is the prefix for key-value elements.
KeyValuePrefix string

Expand Down Expand Up @@ -136,15 +148,12 @@ type TextLinearizationOptions struct {

// SignatureToken is the signature representation in the linearized text.
SignatureToken string

// AddPrefixesAndSuffixes controls if the prefixes/suffixes will be added to the linearized text.
AddPrefixesAndSuffixes bool
}

var DefaultLinerizationOptions = TextLinearizationOptions{
LinearizeTables: true,
LinearizeKeyValues: true,
RemoveNewLinesInLeafElements: true,
RemoveNewLinesInListElements: true,
MaxNumberOfConsecutiveNewLines: 2,
HideHeaderLayout: false,
HideFooterLayout: false,
Expand All @@ -161,6 +170,8 @@ var DefaultLinerizationOptions = TextLinearizationOptions{
ListElementSuffix: "",
TitlePrefix: "",
TitleSuffix: "",
TableLayoutPrefix: "\n\n",
TableLayoutSuffix: "\n",
TableLinearizationFormat: "plaintext",
TableMinTableWords: 0,
TableColumnSeparator: "\t",
Expand All @@ -175,6 +186,8 @@ var DefaultLinerizationOptions = TextLinearizationOptions{
SectionHeaderSuffix: "",
TextPrefix: "",
TextSuffix: "",
KeyValueLayoutPrefix: "\n\n",
KeyValueLayoutSuffix: "",
KeyValuePrefix: "",
KeyValueSuffix: "",
KeyPrefix: "",
Expand All @@ -187,5 +200,4 @@ var DefaultLinerizationOptions = TextLinearizationOptions{
HeuristicLineBreakThreshold: 0.9,
HeuristicOverlapRatio: 0.5,
SignatureToken: "[SIGNATURE]",
AddPrefixesAndSuffixes: true,
}
6 changes: 1 addition & 5 deletions table.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,7 @@ func (t *Table) Text(optFns ...func(*TextLinearizationOptions)) string {
panic(fmt.Sprintf("unknown table format: %s", opts.TableLinearizationFormat))
}

if opts.AddPrefixesAndSuffixes {
tableText = fmt.Sprintf("%s%s%s", opts.TablePrefix, tableText, opts.TableSuffix)
}

return tableText
return fmt.Sprintf("%s%s%s", opts.TablePrefix, tableText, opts.TableSuffix)
}

func (t *Table) RowCount() int {
Expand Down
4 changes: 4 additions & 0 deletions textractor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,19 @@ MATrIX (ours) 166 78.60 96.05
})

//fmt.Println(text)

assert.Equal(t, `# New Document
## Paragraph 1
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
| A | B | C |
|----|-----|----|
| A1 | b1 | C1 |
| A2 | B2 | C2 |
| A3 | BC3 | |
| A4 | B4 | C4 |
`, text)
})

Expand Down

0 comments on commit b799340

Please sign in to comment.