-
Notifications
You must be signed in to change notification settings - Fork 0
/
block_parser.go
93 lines (75 loc) · 2.54 KB
/
block_parser.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
package textractor
import (
"strings"
"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/service/textract/types"
)
// blockParser holds lookup indexes over a set of Textract blocks so that
// blocks can be found by ID or by block type while a document is assembled.
type blockParser struct {
	// idTypeMap maps a block ID to that block's type.
	idTypeMap map[string]types.BlockType

	// idBlockMap maps a block ID to the block itself.
	idBlockMap map[string]types.Block

	// typeIDMap maps a block type to the IDs of the blocks filed under it.
	typeIDMap map[types.BlockType][]string
}
// newBlockParser indexes the provided Textract blocks and returns a
// blockParser that can look them up by ID and by block type.
//
// A single pass over blocks fills three indexes:
//   - block ID -> block type
//   - block ID -> block
//   - block type -> block IDs, in input order
//
// Every block whose type starts with "LAYOUT" is filed under the single
// "LAYOUT" key of the type index instead of under its specific type. Block IDs
// are read with aws.ToString, so blocks sharing an ID overwrite one another in
// the ID-keyed maps while still being appended to the type index.
func newBlockParser(blocks []types.Block) *blockParser {
	// Shared bucket for all layout block types in typeIDMap.
	const layoutKey = types.BlockType("LAYOUT")

	// Each block adds at most one entry to the ID-keyed maps, so size them up
	// front to avoid repeated growth while indexing.
	bp := &blockParser{
		idTypeMap:  make(map[string]types.BlockType, len(blocks)),
		idBlockMap: make(map[string]types.Block, len(blocks)),
		typeIDMap:  make(map[types.BlockType][]string),
	}

	// Iterate by index: ranging by value would copy every block once for the
	// loop variable and a second time when storing it in idBlockMap.
	for i := range blocks {
		b := &blocks[i]
		id := aws.ToString(b.Id)

		bp.idTypeMap[id] = b.BlockType
		bp.idBlockMap[id] = *b

		key := b.BlockType
		if strings.HasPrefix(string(key), "LAYOUT") {
			key = layoutKey
		}
		bp.typeIDMap[key] = append(bp.typeIDMap[key], id)
	}

	return bp
}
// createDocument builds a Document containing one Page per PAGE block known
// to the parser, in the order those blocks were indexed.
//
// For each PAGE block it records the block ID, page number, bounding-box
// width and height, and the IDs of its CHILD relationships, then hands the
// page to a page parser (newPageParser / addPageElements) to attach its
// elements. A PAGE block without geometry or without a bounding box gets a
// width and height of 0 instead of causing a nil-pointer panic.
func (bp *blockParser) createDocument() *Document {
	ids := bp.blockTypeIDs(types.BlockTypePage)
	pages := make([]*Page, len(ids))

	for i, id := range ids {
		b := bp.blockByID(id)

		// Geometry and BoundingBox are pointers; leave the dimensions at zero
		// when either is missing rather than dereferencing nil.
		var width, height float64
		if g := b.Geometry; g != nil && g.BoundingBox != nil {
			width = float64(g.BoundingBox.Width)
			height = float64(g.BoundingBox.Height)
		}

		page := &Page{
			id:       aws.ToString(b.Id),
			number:   int(aws.ToInt32(b.Page)),
			width:    width,
			height:   height,
			childIDs: filterRelationshipIDsByType(b, types.RelationshipTypeChild),
		}

		pageParser := newPageParser(bp, page)
		pageParser.addPageElements()

		pages[i] = page
	}

	return &Document{
		pages: pages,
	}
}
// blockTypeIDs looks up the block IDs filed under blockType in the parser's
// type index. A type with no entry yields a nil slice.
func (bp *blockParser) blockTypeIDs(blockType types.BlockType) []string {
	ids := bp.typeIDMap[blockType]
	return ids
}
// blockByID looks up the Textract block stored under id. An ID that was never
// indexed yields the zero-value types.Block.
func (bp *blockParser) blockByID(id string) types.Block {
	block := bp.idBlockMap[id]
	return block
}
// filterRelationshipIDsByType collects the IDs of every relationship on b
// whose type equals relationshipType, preserving relationship order. The
// result is nil when no IDs are collected.
func filterRelationshipIDsByType(b types.Block, relationshipType types.RelationshipType) []string {
	var matched []string

	for i := range b.Relationships {
		rel := b.Relationships[i]

		// Skip relationships of any other type.
		if rel.Type != relationshipType {
			continue
		}

		matched = append(matched, rel.Ids...)
	}

	return matched
}