-
Notifications
You must be signed in to change notification settings - Fork 0
/
filemanager.processing.pdftextextract.go
106 lines (85 loc) · 2.74 KB
/
filemanager.processing.pdftextextract.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
package filemanager
import (
"bytes"
"fmt"
"strings"
"time"
md "github.com/JohannesKaufmann/html-to-markdown"
"github.com/unidoc/unipdf/v3/extractor"
"github.com/unidoc/unipdf/v3/model"
)
// PDFTextExtractorPlugin is a file-processing plugin whose Process method
// replaces the content of PDF files with their extracted text. It carries no
// state of its own.
type PDFTextExtractorPlugin struct{}
// Process extracts the text of every PDF in files and replaces each PDF's
// content with that text, rendered in the format named by the file's
// "output_format" metadata entry ("text" or "markdown").
//
// Files whose MIME type is not application/pdf are passed through untouched.
// For each PDF a ProcessingStatus update is recorded on fileProcess before
// extraction starts. Converted files are modified in place: Content holds the
// rendered text, MimeType becomes "text/plain", and a trailing ".pdf" in
// FileName is replaced by "." followed by the output format.
//
// Process stops at the first failure and returns a nil slice together with an
// error that wraps the underlying cause. Files converted before the failure
// have already been modified in place.
func (p *PDFTextExtractorPlugin) Process(files []*ManagedFile, fileProcess *FileProcess) ([]*ManagedFile, error) {
	var processedFiles []*ManagedFile

	for _, file := range files {
		if !isPDFFile(file) {
			processedFiles = append(processedFiles, file)
			continue
		}

		fileProcess.AddProcessingUpdate(ProcessingStatus{
			ProcessID:         fileProcess.ID,
			TimeStamp:         int(time.Now().UnixNano() / int64(time.Millisecond)),
			ProcessorName:     "PDFTextExtractor",
			StatusDescription: fmt.Sprintf("Extracting text from PDF: %s", file.FileName),
		})

		// A bare type assertion would panic when the key is absent or holds a
		// non-string value; report that as an error before doing the expensive
		// extraction work.
		outputFormat, ok := file.MetaData["output_format"].(string)
		if !ok {
			return nil, fmt.Errorf("missing or non-string output_format metadata for file: %s", file.FileName)
		}

		pages, err := extractPDFPageTexts(file.Content)
		if err != nil {
			return nil, err
		}

		outputContent, err := renderExtractedPDFText(pages, outputFormat)
		if err != nil {
			return nil, err
		}

		file.Content = outputContent
		// NOTE(review): markdown output is also labelled text/plain; confirm
		// whether downstream consumers expect text/markdown instead.
		file.MimeType = "text/plain"
		file.FileName = fmt.Sprintf("%s.%s", strings.TrimSuffix(file.FileName, ".pdf"), outputFormat)
		processedFiles = append(processedFiles, file)
	}

	return processedFiles, nil
}

// extractPDFPageTexts parses content as a PDF document and returns the
// extracted text of each page, in page order.
func extractPDFPageTexts(content []byte) ([]string, error) {
	pdfReader, err := model.NewPdfReader(bytes.NewReader(content))
	if err != nil {
		return nil, fmt.Errorf("failed to read PDF: %w", err)
	}

	numPages, err := pdfReader.GetNumPages()
	if err != nil {
		return nil, fmt.Errorf("failed to get number of pages: %w", err)
	}

	pages := make([]string, 0, numPages)
	// GetPage is called with 1-based page numbers.
	for pageNum := 1; pageNum <= numPages; pageNum++ {
		page, err := pdfReader.GetPage(pageNum)
		if err != nil {
			return nil, fmt.Errorf("failed to get page %d: %w", pageNum, err)
		}

		ex, err := extractor.New(page)
		if err != nil {
			return nil, fmt.Errorf("failed to create extractor: %w", err)
		}

		text, err := ex.ExtractText()
		if err != nil {
			return nil, fmt.Errorf("failed to extract text: %w", err)
		}
		pages = append(pages, text)
	}

	return pages, nil
}

// renderExtractedPDFText renders the per-page texts in the requested format:
// "text" joins the pages with newlines, "markdown" wraps each page in an HTML
// paragraph and converts the result to Markdown. Any other format is an error.
func renderExtractedPDFText(pages []string, format string) ([]byte, error) {
	switch format {
	case "text":
		return []byte(strings.Join(pages, "\n")), nil
	case "markdown":
		converter := md.NewConverter("", true, nil)
		markdown, err := converter.ConvertString(convertToHTML(pages))
		if err != nil {
			return nil, fmt.Errorf("failed to convert HTML to Markdown: %w", err)
		}
		return []byte(markdown), nil
	default:
		return nil, fmt.Errorf("unsupported output format: %s", format)
	}
}
// isPDFFile reports whether file's MIME type is exactly "application/pdf".
func isPDFFile(file *ManagedFile) bool {
	const pdfMimeType = "application/pdf"
	if file.MimeType != pdfMimeType {
		return false
	}
	return true
}
// htmlTextEscaper replaces the characters that are significant in HTML with
// their entity forms, so arbitrary text can be embedded as element content.
var htmlTextEscaper = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
	`"`, "&#34;",
	"'", "&#39;",
)

// convertToHTML wraps each entry of lines in its own <p> element inside a
// minimal <html><body> document and returns the document as a string, with the
// pieces separated by newlines.
//
// Each entry is HTML-escaped first: the entries are raw text, so characters
// such as '<' or '&' must not be interpreted as markup by whatever parses the
// result.
func convertToHTML(lines []string) string {
	var b strings.Builder
	b.WriteString("<html><body>\n")
	for _, line := range lines {
		b.WriteString("<p>")
		b.WriteString(htmlTextEscaper.Replace(line))
		b.WriteString("</p>\n")
	}
	b.WriteString("</body></html>")
	return b.String()
}