This repository has been archived by the owner on May 14, 2023. It is now read-only.
/
treebank.go
112 lines (98 loc) · 3.37 KB
/
treebank.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
package tokenize
import (
	"regexp"
	"sort"
	"strings"
)
// TreebankWordTokenizer splits a sentence into words following the Penn
// Treebank conventions (e.g., "don't" -> [do n't], double quotes -> ``/'').
//
// This implementation is a port of the Sed script written by Robert McIntyre,
// which is available at https://gist.github.com/jdkato/fc8b8c4266dba22d45ac85042ae53b1e.
type TreebankWordTokenizer struct {
}

// NewTreebankWordTokenizer is a TreebankWordTokenizer constructor.
func NewTreebankWordTokenizer() *TreebankWordTokenizer {
	return new(TreebankWordTokenizer)
}

// In each substitution map below the key is the $n replacement template and
// the value is the pattern it rewrites. NOTE(review): a map cannot preserve
// the rule order of the original Sed script; Tokenize therefore applies each
// map's rules in sorted-key order so results are at least deterministic.

// startingQuotes rewrites opening double quotes as Treebank-style ``.
var startingQuotes = map[string]*regexp.Regexp{
	"$1 `` ": regexp.MustCompile(`'([ (\[{<])"`),
	"``":     regexp.MustCompile(`^(")`),
	" ``":    regexp.MustCompile(`( ")`),
}

// startingQuotes2 pads `` with spaces so it becomes its own token.
var startingQuotes2 = map[string]*regexp.Regexp{
	" $1 ": regexp.MustCompile("(``)"),
}

// punctuation splits off commas and colons (when not inside a number),
// ellipses, sentence-final periods, and single quotes followed by a space.
var punctuation = map[string]*regexp.Regexp{
	" $1 $2":   regexp.MustCompile(`([:,])([^\d])`),
	" ... ":    regexp.MustCompile(`\.\.\.`),
	"$1 $2$3 ": regexp.MustCompile(`([^\.])(\.)([\]\)}>"\']*)\s*$`),
	"$1 ' ":    regexp.MustCompile(`([^'])' `),
}

// punctuation2 isolates trailing colons/commas and common symbol characters;
// each match is replaced with " $1 " in Tokenize.
var punctuation2 = []*regexp.Regexp{
	regexp.MustCompile(`([:,])$`),
	regexp.MustCompile(`([;@#$%&?!])`),
}

// brackets pads bracket characters and double hyphens with spaces.
var brackets = map[string]*regexp.Regexp{
	" $1 ": regexp.MustCompile(`([\]\[\(\)\{\}\<\>])`),
	" -- ": regexp.MustCompile(`--`),
}

// endingQuotes rewrites closing double quotes as Treebank-style ''.
var endingQuotes = map[string]*regexp.Regexp{
	" '' ": regexp.MustCompile(`"`),
}

// endingQuotes2 splits '' and contraction/possessive suffixes ('s, 'm, 'd,
// 'll, 're, 've, n't, and a bare ') off the preceding word; each match is
// replaced with "$1 $2 " in Tokenize.
//
// The first pattern was previously `'(\S)(\'\')'`: the stray literal single
// quotes meant it never matched plain word-attached '' and deleted the
// surrounding quotes when it did match. It is corrected here to the original
// Sed/NLTK rule, (\S)('').
var endingQuotes2 = []*regexp.Regexp{
	regexp.MustCompile(`(\S)(\'\')`),
	regexp.MustCompile(`([^' ])('[sS]|'[mM]|'[dD]|') `),
	regexp.MustCompile(`([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) `),
}

// contractions splits fused words such as "cannot", "gonna", and "'twas";
// each match is replaced with " $1 $2 " in Tokenize.
var contractions = []*regexp.Regexp{
	regexp.MustCompile(`(?i)\b(can)(not)\b`),
	regexp.MustCompile(`(?i)\b(d)('ye)\b`),
	regexp.MustCompile(`(?i)\b(gim)(me)\b`),
	regexp.MustCompile(`(?i)\b(gon)(na)\b`),
	regexp.MustCompile(`(?i)\b(got)(ta)\b`),
	regexp.MustCompile(`(?i)\b(lem)(me)\b`),
	regexp.MustCompile(`(?i)\b(mor)('n)\b`),
	regexp.MustCompile(`(?i)\b(wan)(na) `),
	regexp.MustCompile(`(?i) ('t)(is)\b`),
	regexp.MustCompile(`(?i) ('t)(was)\b`),
}

// newlines matches any line-break sequence; Tokenize replaces it with a space.
var newlines = regexp.MustCompile(`(?:\n|\n\r|\r)`)

// spaces matches runs of two or more spaces so they can be collapsed to one.
var spaces = regexp.MustCompile(`(?: {2,})`)

// applySubstitutions runs every pattern in subs over text, substituting the
// map key (a $n replacement template) for each match. The keys are visited
// in sorted order: ranging over the map directly would apply the rules in a
// random order, and rules whose matches interact could then tokenize the
// same sentence differently between runs.
func applySubstitutions(text string, subs map[string]*regexp.Regexp) string {
	templates := make([]string, 0, len(subs))
	for tmpl := range subs {
		templates = append(templates, tmpl)
	}
	sort.Strings(templates)
	for _, tmpl := range templates {
		text = subs[tmpl].ReplaceAllString(text, tmpl)
	}
	return text
}

// Tokenize splits a sentence into a slice of words.
//
// This tokenizer performs the following steps: (1) split on contractions (e.g.,
// "don't" -> [do n't]), (2) split on non-terminating punctuation, (3) split on
// single quotes when followed by whitespace, and (4) split on periods that
// appear at the end of lines.
//
// NOTE: As mentioned above, this function expects a sentence (not raw text) as
// input.
func (t TreebankWordTokenizer) Tokenize(text string) []string {
	text = applySubstitutions(text, startingQuotes)
	text = applySubstitutions(text, startingQuotes2)
	text = applySubstitutions(text, punctuation)
	for _, r := range punctuation2 {
		text = r.ReplaceAllString(text, " $1 ")
	}
	text = applySubstitutions(text, brackets)
	// Pad the sentence so rules that anchor on surrounding whitespace can
	// match at the very beginning and end of the input.
	text = " " + text + " "
	text = applySubstitutions(text, endingQuotes)
	for _, r := range endingQuotes2 {
		text = r.ReplaceAllString(text, "$1 $2 ")
	}
	for _, r := range contractions {
		text = r.ReplaceAllString(text, " $1 $2 ")
	}
	// Normalize whitespace, then split on the single spaces that remain.
	text = newlines.ReplaceAllString(text, " ")
	text = strings.TrimSpace(spaces.ReplaceAllString(text, " "))
	return strings.Split(text, " ")
}