From b799340d8be4c400d841e95435f9507e9015d742 Mon Sep 17 00:00:00 2001 From: hupe1980 Date: Wed, 3 Jan 2024 21:35:09 +0100 Subject: [PATCH] Misc --- layout.go | 74 ++++++++++++++++++++++++++++++---------------- options.go | 26 +++++++++++----- table.go | 6 +--- textractor_test.go | 4 +++ 4 files changed, 73 insertions(+), 37 deletions(-) diff --git a/layout.go b/layout.go index 2a0508d..1772af5 100644 --- a/layout.go +++ b/layout.go @@ -39,12 +39,55 @@ func (l *Layout) Text(optFns ...func(*TextLinearizationOptions)) string { return "" } + var text string + + switch l.BlockType() { // nolint exhaustive + case types.BlockTypeLayoutList: + items := make([]string, 0, len(l.children)) + + for _, c := range l.children { + itemText := c.Text(func(tlo *TextLinearizationOptions) { + *tlo = opts + }) + + if opts.RemoveNewLinesInListElements { + itemText = strings.ReplaceAll(itemText, "\n", " ") + } + + items = append(items, fmt.Sprintf("%s%s%s", opts.ListElementPrefix, itemText, opts.ListElementSuffix)) + } + + text = strings.Join(items, opts.ListElementSeparator) + case types.BlockTypeLayoutPageNumber: + text = l.linearizeChildren(l.children, opts) + text = fmt.Sprintf("%s%s%s", opts.PageNumberPrefix, text, opts.PageNumberSuffix) + case types.BlockTypeLayoutTitle: + text = l.linearizeChildren(l.children, opts) + text = fmt.Sprintf("%s%s%s", opts.TitlePrefix, text, opts.TitleSuffix) + case types.BlockTypeLayoutSectionHeader: + text = l.linearizeChildren(l.children, opts) + text = fmt.Sprintf("%s%s%s", opts.SectionHeaderPrefix, text, opts.SectionHeaderSuffix) + default: + text = l.linearizeChildren(l.children, opts) + } + + invalidSeparator := strings.Repeat("\n", opts.MaxNumberOfConsecutiveNewLines+1) + validSeperator := strings.Repeat("\n", opts.MaxNumberOfConsecutiveNewLines) + + for strings.Contains(text, invalidSeparator) { + text = strings.ReplaceAll(text, invalidSeparator, validSeperator) + } + + return text +} + +func (l *Layout) linearizeChildren(children []LayoutChild, opts TextLinearizationOptions) string { var ( text string prev LayoutChild ) - for _, group := range groupElementsHorizontally(l.children, opts.HeuristicOverlapRatio) { + for _, group := range groupElementsHorizontally(children, opts.HeuristicOverlapRatio) { sort.Slice(group, func(i, j int) bool { return group[i].BoundingBox().Left() < group[j].BoundingBox().Left() }) @@ -57,8 +100,11 @@ func (l *Layout) Text(optFns ...func(*TextLinearizationOptions)) string { }) switch child.(type) { - case *Table, *KeyValue: - text += childText + case *Table: + text += fmt.Sprintf("%s%s%s", opts.TableLayoutPrefix, childText, opts.TableLayoutSuffix) + addRowSeparatorIfTableLayout = false + case *KeyValue: + text += fmt.Sprintf("%s%s%s", opts.KeyValueLayoutPrefix, childText, opts.KeyValueLayoutSuffix) addRowSeparatorIfTableLayout = false default: if l.BlockType() == types.BlockTypeLayoutTable { @@ -104,28 +150,6 @@ func (l *Layout) Text(optFns ...func(*TextLinearizationOptions)) string { } } - switch l.BlockType() { // nolint exhaustive - case types.BlockTypeLayoutPageNumber: - if opts.AddPrefixesAndSuffixes { - text = fmt.Sprintf("%s%s%s", opts.PageNumberPrefix, text, opts.PageNumberSuffix) - } - case types.BlockTypeLayoutTitle: - if opts.AddPrefixesAndSuffixes { - text = fmt.Sprintf("%s%s%s", opts.TitlePrefix, text, opts.TitleSuffix) - } - case types.BlockTypeLayoutSectionHeader: - if opts.AddPrefixesAndSuffixes { - text = fmt.Sprintf("%s%s%s", opts.SectionHeaderPrefix, text, opts.SectionHeaderSuffix) - } - } - - invalidSeparator := strings.Repeat("\n", opts.MaxNumberOfConsecutiveNewLines+1) - validSeperator := strings.Repeat("\n", opts.MaxNumberOfConsecutiveNewLines) - - for strings.Contains(text, invalidSeparator) { - text = strings.ReplaceAll(text, invalidSeparator, validSeperator) - } - return text } diff --git a/options.go b/options.go index e032d44..90c6f4f 100644 --- a/options.go +++ b/options.go @@ -8,8 +8,8 @@ type TextLinearizationOptions struct { // LinearizeKeyValues includes form key and values in the linearized output. LinearizeKeyValues bool - // RemoveNewLinesInLeafElements removes new lines in leaf layout elements, removing extra whitespace. - RemoveNewLinesInLeafElements bool + // RemoveNewLinesInListElements removes new lines in list elements. + RemoveNewLinesInListElements bool // MaxNumberOfConsecutiveNewLines sets the maximum number of consecutive new lines to keep, removing extra whitespace. MaxNumberOfConsecutiveNewLines int @@ -59,6 +59,12 @@ type TextLinearizationOptions struct { // TitleSuffix is the suffix for title layout elements. TitleSuffix string + // TableLayoutPrefix is the prefix for table elements. + TableLayoutPrefix string + + // TableLayoutSuffix is the suffix for table elements. + TableLayoutSuffix string + // TableLinearizationFormat sets how to represent tables in the linearized output. Choices are plaintext or markdown. TableLinearizationFormat string @@ -101,6 +107,12 @@ type TextLinearizationOptions struct { // TextSuffix is the suffix for text layout elements. TextSuffix string + // KeyValueLayoutPrefix is the prefix for key_value layout elements (not for individual key-value elements). + KeyValueLayoutPrefix string + + // KeyValueLayoutSuffix is the suffix for key_value layout elements (not for individual key-value elements). + KeyValueLayoutSuffix string + // KeyValuePrefix is the prefix for key-value elements. KeyValuePrefix string @@ -136,15 +148,12 @@ type TextLinearizationOptions struct { // SignatureToken is the signature representation in the linearized text. SignatureToken string - - // AddPrefixesAndSuffixes controls if the prefixes/suffixes will be added to the linearized text. - AddPrefixesAndSuffixes bool } var DefaultLinerizationOptions = TextLinearizationOptions{ LinearizeTables: true, LinearizeKeyValues: true, - RemoveNewLinesInLeafElements: true, + RemoveNewLinesInListElements: true, MaxNumberOfConsecutiveNewLines: 2, HideHeaderLayout: false, HideFooterLayout: false, @@ -161,6 +170,8 @@ var DefaultLinerizationOptions = TextLinearizationOptions{ ListElementSuffix: "", TitlePrefix: "", TitleSuffix: "", + TableLayoutPrefix: "\n\n", + TableLayoutSuffix: "\n", TableLinearizationFormat: "plaintext", TableMinTableWords: 0, TableColumnSeparator: "\t", @@ -175,6 +186,8 @@ var DefaultLinerizationOptions = TextLinearizationOptions{ SectionHeaderSuffix: "", TextPrefix: "", TextSuffix: "", + KeyValueLayoutPrefix: "\n\n", + KeyValueLayoutSuffix: "", KeyValuePrefix: "", KeyValueSuffix: "", KeyPrefix: "", @@ -187,5 +200,4 @@ var DefaultLinerizationOptions = TextLinearizationOptions{ HeuristicLineBreakThreshold: 0.9, HeuristicOverlapRatio: 0.5, SignatureToken: "[SIGNATURE]", - AddPrefixesAndSuffixes: true, } diff --git a/table.go b/table.go index 0fda126..ac5e813 100644 --- a/table.go +++ b/table.go @@ -93,11 +93,7 @@ func (t *Table) Text(optFns ...func(*TextLinearizationOptions)) string { panic(fmt.Sprintf("unknown table format: %s", opts.TableLinearizationFormat)) } - if opts.AddPrefixesAndSuffixes { - tableText = fmt.Sprintf("%s%s%s", opts.TablePrefix, tableText, opts.TableSuffix) - } - - return tableText + return fmt.Sprintf("%s%s%s", opts.TablePrefix, tableText, opts.TableSuffix) } func (t *Table) RowCount() int { diff --git a/textractor_test.go b/textractor_test.go index aad8911..19db2ee 100644 --- a/textractor_test.go +++ b/textractor_test.go @@ -65,15 +65,19 @@ MATrIX (ours) 166 78.60 96.05 }) //fmt.Println(text) + assert.Equal(t, `# New Document ## Paragraph 1 Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. + + | A | B | C | |----|-----|----| | A1 | b1 | C1 | | A2 | B2 | C2 | | A3 | BC3 | | | A4 | B4 | C4 | + `, text) })