-
Notifications
You must be signed in to change notification settings - Fork 42
/
libreoffice.go
101 lines (81 loc) · 2.25 KB
/
libreoffice.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
package loader
import (
"context"
"fmt"
"os/exec"
"github.com/henomis/lingoose/document"
"github.com/henomis/lingoose/types"
)
var (
	// ErrLibreOfficeNotFound is returned by Load when the isFile check on the
	// configured LibreOffice executable path fails.
	ErrLibreOfficeNotFound = fmt.Errorf("libreoffice not found")
	// defaultLibreOfficePath is the executable path the constructors use
	// unless it is overridden with WithLibreOfficePath.
	defaultLibreOfficePath = "/usr/bin/soffice"
)
// LibreOfficeLoader loads a file as documents by running an external
// LibreOffice executable on it and capturing the command's standard output.
type LibreOfficeLoader struct {
// loader holds the shared loader state; its textSplitter, when set, is
// applied to the loaded documents by Load.
loader Loader
// libreOfficePath is the path of the executable that loadFile runs.
libreOfficePath string
// libreOfficeArgs are the command-line arguments placed before the filename.
libreOfficeArgs []string
// filename is the file to convert; it is set by NewLibreOfficeLoader or
// overwritten by LoadFromSource.
filename string
}
// NewLibreOfficeLoader returns a LibreOfficeLoader bound to filename, using
// the default executable path and the default conversion arguments.
func NewLibreOfficeLoader(filename string) *LibreOfficeLoader {
	l := new(LibreOfficeLoader)
	l.libreOfficePath = defaultLibreOfficePath
	l.libreOfficeArgs = []string{"--headless", "--convert-to", "txt:Text", "--cat"}
	l.filename = filename

	return l
}
// NewLibreOffice returns a LibreOfficeLoader with the default executable path
// and conversion arguments but no filename; the file is supplied later through
// LoadFromSource.
func NewLibreOffice() *LibreOfficeLoader {
	defaultArgs := []string{"--headless", "--convert-to", "txt:Text", "--cat"}

	return &LibreOfficeLoader{
		libreOfficeArgs: defaultArgs,
		libreOfficePath: defaultLibreOfficePath,
	}
}
// WithLibreOfficePath replaces the path of the executable to run and returns
// the receiver so calls can be chained.
func (l *LibreOfficeLoader) WithLibreOfficePath(libreOfficePath string) *LibreOfficeLoader {
	l.libreOfficePath = libreOfficePath

	return l
}
// WithTextSplitter stores the splitter that Load applies to the loaded
// documents and returns the receiver so calls can be chained.
func (l *LibreOfficeLoader) WithTextSplitter(textSplitter TextSplitter) *LibreOfficeLoader {
	l.loader.textSplitter = textSplitter

	return l
}
// WithArgs replaces the command-line arguments passed to the executable ahead
// of the filename and returns the receiver so calls can be chained. The slice
// is stored as given, not copied.
func (l *LibreOfficeLoader) WithArgs(libreOfficeArgs []string) *LibreOfficeLoader {
	l.libreOfficeArgs = libreOfficeArgs

	return l
}
// Load converts the configured file and returns the resulting documents.
//
// It returns ErrLibreOfficeNotFound when the isFile check on the executable
// path fails, the isFile error when the check on the input file fails, and any
// error from running the conversion. When a text splitter is configured, the
// loaded documents are passed through it before being returned.
func (l *LibreOfficeLoader) Load(ctx context.Context) ([]document.Document, error) {
	// The underlying error is deliberately replaced by the sentinel here.
	if pathErr := isFile(l.libreOfficePath); pathErr != nil {
		return nil, ErrLibreOfficeNotFound
	}

	if fileErr := isFile(l.filename); fileErr != nil {
		return nil, fileErr
	}

	docs, err := l.loadFile(ctx)
	if err != nil {
		return nil, err
	}

	if splitter := l.loader.textSplitter; splitter != nil {
		docs = splitter.SplitDocuments(docs)
	}

	return docs, nil
}
// LoadFromSource sets source as the loader's filename and then runs Load.
// The new filename stays on the receiver after the call.
func (l *LibreOfficeLoader) LoadFromSource(ctx context.Context, source string) ([]document.Document, error) {
	l.filename = source

	docs, err := l.Load(ctx)

	return docs, err
}
// loadFile runs the configured executable with the configured arguments
// followed by the filename, and wraps the command's standard output in a single
// document whose metadata records the source filename.
//
// It returns the error from the command unchanged if the command fails.
func (l *LibreOfficeLoader) loadFile(ctx context.Context) ([]document.Document, error) {
	// Build the argument list in a fresh slice. Appending directly to
	// l.libreOfficeArgs could write the filename into spare capacity of a
	// slice shared with the caller of WithArgs or with another Load call.
	libreOfficeArgs := make([]string, 0, len(l.libreOfficeArgs)+1)
	libreOfficeArgs = append(libreOfficeArgs, l.libreOfficeArgs...)
	libreOfficeArgs = append(libreOfficeArgs, l.filename)

	//nolint:gosec
	out, err := exec.CommandContext(ctx, l.libreOfficePath, libreOfficeArgs...).Output()
	if err != nil {
		return nil, err
	}

	metadata := make(types.Meta)
	metadata[SourceMetadataKey] = l.filename

	return []document.Document{
		{
			Content:  string(out),
			Metadata: metadata,
		},
	}, nil
}