-
Notifications
You must be signed in to change notification settings - Fork 0
/
options.go
183 lines (138 loc) · 6.22 KB
/
options.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
package textractor
// TextLinearizationOptions collects the settings that control how a document
// is linearized into a single text string.
type TextLinearizationOptions struct {
	// MaxNumberOfConsecutiveNewLines is the largest run of consecutive new
	// lines that is kept; extra whitespace beyond that is removed.
	MaxNumberOfConsecutiveNewLines int

	// Layout visibility.

	// HideHeaderLayout, when set, hides headers in the linearized output.
	HideHeaderLayout bool
	// HideFooterLayout, when set, hides footers in the linearized output.
	HideFooterLayout bool
	// HideFigureLayout, when set, hides figures in the linearized output.
	HideFigureLayout bool
	// HidePageNumberLayout, when set, hides page numbers in the linearized output.
	HidePageNumberLayout bool

	// Page numbers.

	// PageNumberPrefix is the prefix applied to page number layout elements.
	PageNumberPrefix string
	// PageNumberSuffix is the suffix applied to page number layout elements.
	PageNumberSuffix string

	// General separators.

	// SameParagraphSeparator is the separator used when elements inside one
	// text block are combined.
	SameParagraphSeparator string
	// LayoutElementSeparator is the separator used when linearized layout
	// elements are combined.
	LayoutElementSeparator string

	// Lists.

	// ListElementSeparator is the separator placed between the elements of a
	// list layout.
	ListElementSeparator string
	// ListLayoutPrefix is the prefix applied to list layout elements (the parent).
	ListLayoutPrefix string
	// ListLayoutSuffix is the suffix applied to list layout elements (the parent).
	ListLayoutSuffix string
	// ListElementPrefix is the prefix applied to the elements of a list layout
	// (the children).
	ListElementPrefix string
	// ListElementSuffix is the suffix applied to the elements of a list layout
	// (the children).
	ListElementSuffix string
	// RemoveNewLinesInListElements, when set, removes new lines inside list elements.
	RemoveNewLinesInListElements bool

	// Titles.

	// TitlePrefix is the prefix applied to title layout elements.
	TitlePrefix string
	// TitleSuffix is the suffix applied to title layout elements.
	TitleSuffix string

	// Tables.

	// TableLayoutPrefix is the prefix applied to table elements.
	TableLayoutPrefix string
	// TableLayoutSuffix is the suffix applied to table elements.
	TableLayoutSuffix string
	// TableLinearizationFormat selects how tables are represented in the
	// linearized output. Choices are plaintext or markdown.
	TableLinearizationFormat string
	// TableMinTableWords is the threshold below which a table is rendered as
	// words instead of using the table layout.
	TableMinTableWords int
	// TableColumnSeparator is the table column separator. It is used when
	// linearizing layout tables and is not used if AnalyzeDocument was called
	// with the TABLES feature.
	TableColumnSeparator string
	// TablePrefix is the prefix applied to the table layout.
	TablePrefix string
	// TableSuffix is the suffix applied to the table layout.
	TableSuffix string
	// TableRowSeparator is the separator placed between table rows.
	TableRowSeparator string
	// TableRowPrefix is the prefix applied to a table row.
	TableRowPrefix string
	// TableRowSuffix is the suffix applied to a table row.
	TableRowSuffix string
	// TableCellPrefix is the prefix applied to a table cell.
	TableCellPrefix string
	// TableCellSuffix is the suffix applied to a table cell.
	TableCellSuffix string

	// Section headers.

	// SectionHeaderPrefix is the prefix applied to section header layout elements.
	SectionHeaderPrefix string
	// SectionHeaderSuffix is the suffix applied to section header layout elements.
	SectionHeaderSuffix string

	// Key-value pairs.

	// KeyValueLayoutPrefix is the prefix applied to key_value layout elements
	// (not to individual key-value elements).
	KeyValueLayoutPrefix string
	// KeyValueLayoutSuffix is the suffix applied to key_value layout elements
	// (not to individual key-value elements).
	KeyValueLayoutSuffix string
	// KeyValuePrefix is the prefix applied to key-value elements.
	KeyValuePrefix string
	// KeyValueSuffix is the suffix applied to key-value elements.
	KeyValueSuffix string
	// KeyPrefix is the prefix applied to key elements.
	KeyPrefix string
	// KeySuffix is the suffix applied to key elements.
	KeySuffix string
	// ValuePrefix is the prefix applied to value elements.
	ValuePrefix string
	// ValueSuffix is the suffix applied to value elements.
	ValueSuffix string

	// Selection elements.

	// SelectionElementSelected is the text that represents a selection element
	// that is selected.
	SelectionElementSelected string
	// SelectionElementNotSelected is the text that represents a selection
	// element that is not selected.
	SelectionElementNotSelected string

	// Line heuristics.

	// HeuristicHTolerance sets how much the lines below and above the current
	// line should differ in width to be separated.
	HeuristicHTolerance float64
	// HeuristicOverlapRatio sets how much vertical overlap between two
	// subsequent lines is tolerated before they are merged into a single line.
	HeuristicOverlapRatio float64

	// Signatures.

	// SignatureToken is the text that represents a signature in the linearized text.
	SignatureToken string
}
// DefaultLinerizationOptions holds the package's default
// TextLinearizationOptions.
//
// Only the fields whose default differs from the Go zero value are spelled out
// below. Every field that is not listed defaults to its zero value: false for
// the Hide* flags, 0 for TableMinTableWords, and "" for the remaining prefix
// and suffix strings.
//
// The identifier keeps its existing spelling ("Linerization") so that current
// references to it continue to compile.
var DefaultLinerizationOptions = TextLinearizationOptions{
	MaxNumberOfConsecutiveNewLines: 2,

	// Separators used when combining text, layout elements and list elements.
	SameParagraphSeparator: " ",
	LayoutElementSeparator: "\n\n",
	ListElementSeparator:   "\n",

	RemoveNewLinesInListElements: true,

	// Tables: plaintext format, tab between columns, new line between rows.
	TableLayoutPrefix:        "\n\n",
	TableLayoutSuffix:        "\n",
	TableLinearizationFormat: "plaintext",
	TableColumnSeparator:     "\t",
	TableRowSeparator:        "\n",

	KeyValueLayoutPrefix: "\n\n",

	// Text used for selection elements.
	SelectionElementSelected:    "[X]",
	SelectionElementNotSelected: "[ ]",

	// Line heuristics.
	HeuristicHTolerance:   0.3,
	HeuristicOverlapRatio: 0.5,

	SignatureToken: "[SIGNATURE]",
}