-
Notifications
You must be signed in to change notification settings - Fork 0
/
script_detector.go
106 lines (92 loc) · 2.02 KB
/
script_detector.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
package nlp
import (
"sort"
"github.com/koykov/byteseq"
)
// ScriptDetectAlgo selects how UnicodeScriptDetector walks the input runes
// while scoring scripts.
type ScriptDetectAlgo uint
// Rune sampling strategies understood by UnicodeScriptDetector.
const (
// ScriptDetectAlgoHalf halves the rune count the detector works with.
ScriptDetectAlgoHalf ScriptDetectAlgo = iota
// ScriptDetectAlgoDistributed visits runes with a stride chosen by distStep
// from the input length instead of visiting every rune.
ScriptDetectAlgoDistributed
// ScriptDetectAlgoFull visits every rune of the input. It is the algorithm
// used by NewUnicodeScriptDetector.
ScriptDetectAlgoFull
)
// UnicodeScriptDetector is a builtin detector of writing scripts.
//
// It scores every script returned by the context's GetScriptsLimit against
// the runes returned by GetRunes. Detect reports the best scoring script,
// DetectProba reports the sorted list of scores.
type UnicodeScriptDetector[T byteseq.Byteseq] struct {
// algo is the rune sampling strategy applied while scoring.
algo ScriptDetectAlgo
}
// NewUnicodeScriptDetector returns a detector that uses ScriptDetectAlgoFull,
// i.e. one that evaluates every rune of the input.
func NewUnicodeScriptDetector[T byteseq.Byteseq]() UnicodeScriptDetector[T] {
	var det UnicodeScriptDetector[T]
	det.algo = ScriptDetectAlgoFull
	return det
}
// NewUnicodeScriptDetectorWithAlgo returns a detector that samples input runes
// according to algo. The value is stored as given, without validation.
func NewUnicodeScriptDetectorWithAlgo[T byteseq.Byteseq](algo ScriptDetectAlgo) UnicodeScriptDetector[T] {
	var det UnicodeScriptDetector[T]
	det.algo = algo
	return det
}
// Detect scores the input held by ctx and returns the script with the highest
// score.
//
// An error from the scoring step is returned unchanged together with the zero
// Script. If scoring left ctx.BufSP empty (dsProba returns nil without adding
// candidates when there are no scripts to evaluate), Detect returns the zero
// Script and a nil error instead of indexing into the empty buffer.
// If no candidate has a score greater than zero, the first entry of ctx.BufSP
// is returned.
func (d UnicodeScriptDetector[T]) Detect(ctx *Ctx[T]) (Script, error) {
	if err := d.dsProba(ctx); err != nil {
		return 0, err
	}
	if len(ctx.BufSP) == 0 {
		// Nothing was scored, so there is no winner to pick.
		// NOTE(review): the zero Script is used as "nothing detected" here,
		// mirroring DetectProba which yields an empty list with a nil error
		// in the same situation — confirm this matches the Script enum.
		return 0, nil
	}

	var (
		best    float32
		bestIdx int
	)
	for i := range ctx.BufSP {
		// Strict comparison keeps the earliest candidate on ties.
		if score := ctx.BufSP[i].Score; score > best {
			best, bestIdx = score, i
		}
	}
	return ctx.BufSP[bestIdx].Script, nil
}
// DetectProba scores the input held by ctx and returns the per-script scores
// after sorting them with sort.Sort. The returned value is ctx.BufSP itself,
// not a copy. On a scoring error it returns nil and that error.
func (d UnicodeScriptDetector[T]) DetectProba(ctx *Ctx[T]) (ScriptProba, error) {
	err := d.dsProba(ctx)
	if err != nil {
		return nil, err
	}

	// Sort in place through a pointer to the context-owned buffer.
	scores := &ctx.BufSP
	sort.Sort(scores)
	return *scores, nil
}
// dsProba fills ctx.BufSP with one ScriptScore per script returned by
// ctx.GetScriptsLimit. Each score is the fraction of inspected runes for which
// that script's Evaluate reported true.
//
// Which runes are inspected depends on d.algo:
//   - ScriptDetectAlgoFull: every rune;
//   - ScriptDetectAlgoHalf: the first half of the runes (at least one);
//   - ScriptDetectAlgoDistributed: every distStep(len)-th rune.
//
// Scores are normalized by the number of runes actually inspected, so they
// stay within [0, 1] for every algorithm.
//
// It returns ErrEmptyInput when ctx.GetRunes yields no runes. When there are no
// scripts to evaluate it returns nil and leaves ctx.BufSP empty.
func (d UnicodeScriptDetector[T]) dsProba(ctx *Ctx[T]) error {
	runes := ctx.GetRunes()
	if len(runes) == 0 {
		return ErrEmptyInput
	}

	scripts := ctx.GetScriptsLimit()
	// Reset before the early return below so callers never read scores left
	// over from a previous run.
	ctx.BufSP = ctx.BufSP[:0]
	if len(scripts) == 0 {
		return nil
	}
	for i := range scripts {
		ctx.BufSP = append(ctx.BufSP, ScriptScore{Script: scripts[i]})
	}

	limit, step := len(runes), 1
	switch d.algo {
	case ScriptDetectAlgoHalf:
		limit /= 2
		if limit == 0 {
			// Single-rune input: still inspect that rune rather than
			// ending up with an empty window and a zero divisor.
			limit = 1
		}
	case ScriptDetectAlgoDistributed:
		step = distStep(limit)
	}

	// limit >= 1 here, so this bounds-check hint cannot go out of range.
	_ = runes[limit-1]
	inspected := 0
	for i := 0; i < limit; i += step {
		r := runes[i]
		for j := range scripts {
			if scripts[j].Evaluate(r) {
				ctx.BufSP[j].Score++
			}
		}
		inspected++
	}

	// Normalize by the runes actually visited, not by the input length.
	total := float32(inspected)
	for i := range ctx.BufSP {
		ctx.BufSP[i].Score /= total
	}
	return nil
}
// distStep maps an input length to the stride used by the distributed
// algorithm: 1 below 8, 2 below 32, 4 below 128 and 8 for anything longer.
func distStep(l int) int {
	switch {
	case l < 8:
		return 1
	case l < 32:
		return 2
	case l < 128:
		return 4
	default:
		return 8
	}
}