This repository has been archived by the owner on May 14, 2023. It is now read-only.
/
treebank.go
112 lines (98 loc) · 3.37 KB
/
treebank.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
package tokenize
import (
	"regexp"
	"sort"
	"strings"
)
// TreebankWordTokenizer splits a sentence into words following the Penn
// Treebank conventions (e.g., "don't" -> [do n't], double quotes -> ``/'').
//
// This implementation is a port of the Sed script written by Robert McIntyre,
// which is available at https://gist.github.com/jdkato/fc8b8c4266dba22d45ac85042ae53b1e.
type TreebankWordTokenizer struct {
}

// NewTreebankWordTokenizer is a TreebankWordTokenizer constructor.
func NewTreebankWordTokenizer() *TreebankWordTokenizer {
	return new(TreebankWordTokenizer)
}

// In each substitution map below the key is the $n replacement template and
// the value is the pattern it rewrites. NOTE(review): a map cannot preserve
// the rule order of the original Sed script; Tokenize therefore applies each
// map's rules in sorted-key order so results are at least deterministic.

// startingQuotes rewrites opening double quotes as Treebank-style ``.
var startingQuotes = map[string]*regexp.Regexp{
	"$1 `` ": regexp.MustCompile(`'([ (\[{<])"`),
	"``":     regexp.MustCompile(`^(")`),
	" ``":    regexp.MustCompile(`( ")`),
}

// startingQuotes2 pads `` with spaces so it becomes its own token.
var startingQuotes2 = map[string]*regexp.Regexp{
	" $1 ": regexp.MustCompile("(``)"),
}

// punctuation splits off commas and colons (when not inside a number),
// ellipses, sentence-final periods, and single quotes followed by a space.
var punctuation = map[string]*regexp.Regexp{
	" $1 $2":   regexp.MustCompile(`([:,])([^\d])`),
	" ... ":    regexp.MustCompile(`\.\.\.`),
	"$1 $2$3 ": regexp.MustCompile(`([^\.])(\.)([\]\)}>"\']*)\s*$`),
	"$1 ' ":    regexp.MustCompile(`([^'])' `),
}

// punctuation2 isolates trailing colons/commas and common symbol characters;
// each match is replaced with " $1 " in Tokenize.
var punctuation2 = []*regexp.Regexp{
	regexp.MustCompile(`([:,])$`),
	regexp.MustCompile(`([;@#$%&?!])`),
}

// brackets pads bracket characters and double hyphens with spaces.
var brackets = map[string]*regexp.Regexp{
	" $1 ": regexp.MustCompile(`([\]\[\(\)\{\}\<\>])`),
	" -- ": regexp.MustCompile(`--`),
}

// endingQuotes rewrites closing double quotes as Treebank-style ''.
var endingQuotes = map[string]*regexp.Regexp{
	" '' ": regexp.MustCompile(`"`),
}

// endingQuotes2 splits '' and contraction/possessive suffixes ('s, 'm, 'd,
// 'll, 're, 've, n't, and a bare ') off the preceding word; each match is
// replaced with "$1 $2 " in Tokenize.
//
// The first pattern was previously `'(\S)(\'\')'`: the stray literal single
// quotes meant it never matched plain word-attached '' and deleted the
// surrounding quotes when it did match. It is corrected here to the original
// Sed/NLTK rule, (\S)('').
var endingQuotes2 = []*regexp.Regexp{
	regexp.MustCompile(`(\S)(\'\')`),
	regexp.MustCompile(`([^' ])('[sS]|'[mM]|'[dD]|') `),
	regexp.MustCompile(`([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) `),
}

// contractions splits fused words such as "cannot", "gonna", and "'twas";
// each match is replaced with " $1 $2 " in Tokenize.
var contractions = []*regexp.Regexp{
	regexp.MustCompile(`(?i)\b(can)(not)\b`),
	regexp.MustCompile(`(?i)\b(d)('ye)\b`),
	regexp.MustCompile(`(?i)\b(gim)(me)\b`),
	regexp.MustCompile(`(?i)\b(gon)(na)\b`),
	regexp.MustCompile(`(?i)\b(got)(ta)\b`),
	regexp.MustCompile(`(?i)\b(lem)(me)\b`),
	regexp.MustCompile(`(?i)\b(mor)('n)\b`),
	regexp.MustCompile(`(?i)\b(wan)(na) `),
	regexp.MustCompile(`(?i) ('t)(is)\b`),
	regexp.MustCompile(`(?i) ('t)(was)\b`),
}

// newlines matches any line-break sequence; Tokenize replaces it with a space.
var newlines = regexp.MustCompile(`(?:\n|\n\r|\r)`)

// spaces matches runs of two or more spaces so they can be collapsed to one.
var spaces = regexp.MustCompile(`(?: {2,})`)

// applySubstitutions runs every pattern in subs over text, substituting the
// map key (a $n replacement template) for each match. The keys are visited
// in sorted order: ranging over the map directly would apply the rules in a
// random order, and rules whose matches interact could then tokenize the
// same sentence differently between runs.
func applySubstitutions(text string, subs map[string]*regexp.Regexp) string {
	templates := make([]string, 0, len(subs))
	for tmpl := range subs {
		templates = append(templates, tmpl)
	}
	sort.Strings(templates)
	for _, tmpl := range templates {
		text = subs[tmpl].ReplaceAllString(text, tmpl)
	}
	return text
}

// Tokenize splits a sentence into a slice of words.
//
// This tokenizer performs the following steps: (1) split on contractions (e.g.,
// "don't" -> [do n't]), (2) split on non-terminating punctuation, (3) split on
// single quotes when followed by whitespace, and (4) split on periods that
// appear at the end of lines.
//
// NOTE: As mentioned above, this function expects a sentence (not raw text) as
// input.
func (t TreebankWordTokenizer) Tokenize(text string) []string {
	text = applySubstitutions(text, startingQuotes)
	text = applySubstitutions(text, startingQuotes2)
	text = applySubstitutions(text, punctuation)
	for _, r := range punctuation2 {
		text = r.ReplaceAllString(text, " $1 ")
	}
	text = applySubstitutions(text, brackets)
	// Pad the sentence so rules that anchor on surrounding whitespace can
	// match at the very beginning and end of the input.
	text = " " + text + " "
	text = applySubstitutions(text, endingQuotes)
	for _, r := range endingQuotes2 {
		text = r.ReplaceAllString(text, "$1 $2 ")
	}
	for _, r := range contractions {
		text = r.ReplaceAllString(text, " $1 $2 ")
	}
	// Normalize whitespace, then split on the single spaces that remain.
	text = newlines.ReplaceAllString(text, " ")
	text = strings.TrimSpace(spaces.ReplaceAllString(text, " "))
	return strings.Split(text, " ")
}