Skip to content

Commit

Permalink
Refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
hupe1980 committed Jan 1, 2024
1 parent b7d98f9 commit a11835b
Show file tree
Hide file tree
Showing 8 changed files with 312 additions and 69 deletions.
2 changes: 0 additions & 2 deletions block_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,6 @@ func (bp *blockParser) createDocument() *Document {
page.queries = pageParser.createQueries()
page.signatures = pageParser.createSignatures()

pageParser.removeDuplicates()

pages[i] = page
}

Expand Down
10 changes: 10 additions & 0 deletions document.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,16 @@ func (d *Document) KeyValues() []*KeyValue {
return internal.Concatenate(keyValues...)
}

// Signatures gathers the signatures of every page returned by d.Pages()
// and merges the per-page slices into a single slice using
// internal.Concatenate.
func (d *Document) Signatures() []*Signature {
	pages := d.Pages()

	// One slot per page; each slot holds that page's signatures.
	perPage := make([][]*Signature, len(pages))

	for i, page := range pages {
		perPage[i] = page.Signatures()
	}

	return internal.Concatenate(perPage...)
}

func (d *Document) Text(optFns ...func(*TextLinearizationOptions)) string {
pageTexts := make([]string, len(d.Pages()))

Expand Down
214 changes: 214 additions & 0 deletions examples/analyze_document_for_llm/sample_output.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
Request for Verification of Employment
Privacy Act Notice: This information is to be used by the agency collecting it or its assignees in determining whether you qualify as a prospective mortgagor under its program. It will not be disclosed outside the agency except as required and permitted by law. You do not have to provide this information, but if you do not your application for approval as a prospec- tive mortgagor or borrower may be delayed or rejected. The information requested in this form is authorized by Title 38, USC, Chapter 37 (if VA); by 12 USC, Section 1701 et. seq. (if HUD/FHA); by 42 USC, Section 1452b (if HUD/CPD); and Title 42 USC, 1471 et. seq., or 7 USC, 1921 et. seq. (if USDA/FmHA).
Instructions: Lender - Complete items 1 through 7. Have applicant complete item 8. Forward directly to employer named in item 1. Employer - Please complete either Part II or Part III as applicable. Complete Part IV and return directly to lender named in item 2. The form is to be transmitted directly to the lender and is not to be transmitted through the applicant or any other party.


Part I - Request

1. To (Name and address of employer) Alejandro Rosalez 123 Any Street, Any Town, USA

2. From (Name and address of lender) Carlos Salazar 100 Main Street, Anytown, USA

I certify that this verification has been sent directly to the employer and has not passed through the hands of the applicant or any other interested party.

3. Signature of Lender Carlos Salazar

[SIGNATURE]

4. Title Project Manager

5. Date 12/12/2006

6. Lender's Number (Optional) 5555-5555-5555

I have applied for a mortgage loan and stated that I am now or was formerly employed by you. My signature below authorizes verification of this information.

7. Name and Address of Applicant (include employee or badge number) Paulo Santos 123 Any Street, Any Town, USA

8. Signature of Applicant Paulo Santos

[SIGNATURE]

Part II - Verification of Present Employment

9. Applicant's Date of Employment 06/06/2006

10. Present Position General Manager

11. Probability of Continued Employment 3 years

12A. Current Gross Base Pay (Enter Amount and Check Period) $ 5600

13. For Military Personnel Only

14. If Overtime or Bonus is Applicable,

Annual [X]

Hourly [ ]

Pay Grade

10

Is Its Continuance Likely?

Monthly [ ]

Other (Specify) [ ]

Type

Monthly Amount

Overtime

Yes [X]

No [ ]

Weekly [ ]

Bonus

Yes [ ]

No [X]

Base Pay $ 520

15. If paid hourly - average hours per week 40 hours

12B. Gross Earnings

Type

Year To Date

Past Year

Past Year

Rations $ 162

Base Pay

$

Thru 2006

15.00

$ 20.00

$

30.00

Flight or Hazard $ 756

16. Date of applicant's next pay increase 08/08/2007

Clothing $ 452

Overtime

$

15.00

$ 20.00

$

30.00

17. Projected amount of next pay increase $ 5600

Quarters $ 986

Commissions

$

20.00

$ 20.00

$

15.00

Pro Pay $ 123

18. Date of applicant's last pay increase 09/08/2006

Overseas or Combat $ 645

Bonus

$

20.00

$

20.00

$

15.00

19. Amount of last pay increase $ 4800

Total

$ 70.00

$ 80.00

$

90.00

Variable Housing Allowance $ 587
20. Remarks (If employee was off work for any length of time, please indicate time period and reason) Not Applicable
Part III - Verification of Previous Employment


21. Date Hired 04/04/2004

23. Salary/Wage at Termination Per (Year) (Month) (Week)

22. Date Terminated 01/03/2005

Base $ 9500

Overtime 1250

Commissions 4500

Bonus 4000

24. Reason for Leaving Medical Issue

25. Position Held Device Operator

Part IV - Authorized Signature - Federal statutes provide severe penalties for any fraud, intentional misrepresentation, or criminal connivance

or conspiracy purposed to influence the issuance of any guaranty or insurance by the VA Secretary, the U.S.D.A., FmHA/FHA Commissioner, or

the HUD/CPD Assistant Secretary.

26. Signature of Employer Richard Roe

[SIGNATURE]

27. Title (Please print or type) VA Secretary

28. Date 01/05/2007

29. Print or type name signed in Item 26 Richard Roe

30. Phone No. 555-0100
Form 1005 July 96
43 changes: 29 additions & 14 deletions layout.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,11 @@ func (l *Layout) TextAndWords(optFns ...func(*TextLinearizationOptions)) (string
return "", nil
}

text := ""
words := make([]*Word, 0)
first := true

sort.Slice(l.children, func(i, j int) bool {
return l.children[i].BoundingBox().Top() < l.children[j].BoundingBox().Top()
})
var (
text string
words []*Word
prev LayoutChild
)

for _, group := range groupElementsHorizontally(l.children, opts.HeuristicOverlapRatio) {
sort.Slice(group, func(i, j int) bool {
Expand All @@ -77,25 +75,33 @@ func (l *Layout) TextAndWords(optFns ...func(*TextLinearizationOptions)) (string
}

text += columnSep + childText
// } else if l.BlockType() == types.BlockTypeLayoutKeyValue && len(childWords) > 0 {
// if opts.AddPrefixesAndSuffixesInText {
// text += fmt.Sprintf("%s%s%s", opts.KeyValueLayoutPrefix, text, opts.KeyValueLayoutSuffix)
// }
} else { // nolint wsl
} else if l.BlockType() == types.BlockTypeLayoutKeyValue && len(childWords) > 0 {
if opts.AddPrefixesAndSuffixesInText {
text += fmt.Sprintf("%s%s%s", opts.KeyValueLayoutPrefix, childText, opts.KeyValueLayoutSuffix)
}
} else if partOfSameParagraph(prev, child, opts) {
text += opts.SameParagraphSeparator + childText
} else {
sep := ""
if !first {
if prev != nil {
sep = opts.LayoutElementSeparator
}

text += sep + childText
}

first = false
prev = child
}

if l.BlockType() == types.BlockTypeLayoutTable {
text += opts.TableRowSeparator
}

prev = &Line{
base: base{
boundingBox: NewEnclosingBoundingBox(group...),
},
}
}

switch l.BlockType() { // nolint exhaustive
Expand Down Expand Up @@ -195,3 +201,12 @@ func groupElementsHorizontally(elements []LayoutChild, overlapRatio float32) [][

return groupedElements
}

// partOfSameParagraph reports whether child1 and child2 are positioned
// closely enough to be joined as one paragraph.
//
// Both of the following must hold:
//   - the absolute difference between their left edges is at most
//     options.HeuristicHTolerance times the width of child1, and
//   - the absolute difference between their top edges is at most
//     options.HeuristicOverlapRatio times the smaller of the two heights.
//
// It returns false when either child is nil.
func partOfSameParagraph(child1, child2 LayoutChild, options TextLinearizationOptions) bool {
	// Without two elements to compare there is nothing to join.
	if child1 == nil || child2 == nil {
		return false
	}

	first, second := child1.BoundingBox(), child2.BoundingBox()

	// Horizontal check: left edges must line up within the tolerance,
	// which is scaled by the width of the first element.
	leftGap := float32(math.Abs(float64(first.Left() - second.Left())))

	leftAligned := leftGap <= options.HeuristicHTolerance*first.Width()
	if !leftAligned {
		return false
	}

	// Vertical check: the distance between the top edges is limited
	// relative to the shorter of the two elements.
	topGap := float32(math.Abs(float64(first.Top() - second.Top())))
	shorter := float32(math.Min(float64(first.Height()), float64(second.Height())))

	return topGap <= options.HeuristicOverlapRatio*shorter
}
16 changes: 13 additions & 3 deletions page.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package textractor

import (
"slices"
"sort"
"strings"

"github.com/hupe1980/go-textractor/internal"
Expand Down Expand Up @@ -76,10 +77,19 @@ func (p *Page) Text(optFns ...func(*TextLinearizationOptions)) string {
}

func (p *Page) textAndWords(optFns ...func(*TextLinearizationOptions)) (string, []*Word) {
pageTexts := make([]string, len(p.layouts))
wordLists := make([][]*Word, len(p.layouts))
// Create a copy of the layouts to avoid modifying the original slice
sortedLayouts := make([]*Layout, len(p.layouts))
copy(sortedLayouts, p.layouts)

for i, l := range p.layouts {
// Sort layouts based on the reading order
sort.Slice(sortedLayouts, func(i, j int) bool {
return sortedLayouts[i].BoundingBox().Top() < sortedLayouts[j].BoundingBox().Top()
})

pageTexts := make([]string, len(sortedLayouts))
wordLists := make([][]*Word, len(sortedLayouts))

for i, l := range sortedLayouts {
text, words := l.TextAndWords(optFns...)

pageTexts[i] = text
Expand Down
Loading

0 comments on commit a11835b

Please sign in to comment.