forked from brendano/ark-tweet-nlp
/
twokenize.scala
309 lines (243 loc) · 11.1 KB
/
twokenize.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
package edu.cmu.cs.lti.ark.tweetnlp // comment this out to run as script, scala is weird
import scala.collection.JavaConversions._
/*
TweetMotif is licensed under the Apache License 2.0:
http://www.apache.org/licenses/LICENSE-2.0.html
Copyright Brendan O'Connor, Michel Krieger, and David Ahn, 2009-2010.
*/
/*
twokenize.scala -- a little Twitter tokenizer,
tested for English and some other European languages.
Twokenize.tokenize("@hellocalyclops =))=))=)) Oh well.")
=> ["@hellocalyclops", "=))", "=))", "=))", "Oh", "well", "."]
Invoking main() takes tweet texts on stdin and outputs space-separated tokenizations.
Code History
* Original version in TweetMotif in Python (2009-2010, github.com/brendano/tweetmotif)
having two forks:
- (2011) Scala port and improvements by David Snyder (dsnyder@cs.utexas.edu)
and Jason Baldridge (jasonbaldridge@gmail.com)
https://bitbucket.org/jasonbaldridge/twokenize/
- (2011) Modifications for POS tagging by Kevin Gimpel (kgimpel@cs.cmu.edu)
and Daniel Mills (dpmills@cs.cmu.edu)
* Merge to Scala by Brendan O'Connor, for ARK TweetNLP package (2011-06)
Original paper:
TweetMotif: Exploratory Search and Topic Summarization for Twitter.
Brendan O'Connor, Michel Krieger, and David Ahn.
ICWSM-2010 (demo track)
http://brenocon.com/oconnor_krieger_ahn.icwsm2010.tweetmotif.pdf
---
Scala port of Brendan O'Connor's twokenize.py
This is not a direct port, as some changes were made in the aim of
simplicity.
- David Snyder (dsnyder@cs.utexas.edu)
April 2011
Modifications to more functional style, fix a few bugs, and making
output more like twokenize.py. Added abbreviations. Tweaked some
regexes to produce better tokens.
- Jason Baldridge (jasonbaldridge@gmail.com)
June 2011
*/
/**
* TODO
* - byte offsets should be added here. can easily re-align
* since the only munged characters are whitespace (hopefully)
*/
import scala.util.matching.Regex
import collection.JavaConversions._
/**
 * Twokenize: a regex-driven tokenizer for tweet text.
 *
 * The tokenizer works in three passes over whitespace-squeezed input:
 *  1. edge punctuation (quotes, brackets, ...) hugging a word is split off
 *     with a space ([[splitEdgePunct]]);
 *  2. every span matched by [[Protected]] (URLs, emoticons, numbers, ...)
 *     becomes exactly one token;
 *  3. whatever lies between protected spans is split on single spaces.
 *
 * Entry points: [[tokenize]] / [[apply]] for raw tokens, [[tokenizeForTagger]]
 * for tokens with the basic HTML entities unescaped, [[tokenizeForTagger_J]]
 * for Java callers and [[tokenizeToString]] for a space-joined rendering.
 */
object Twokenize {

  // Only referenced by the commented-out contraction rule in splitToken.
  val Contractions: Regex = """(?i)(\w+)(n't|'ve|'ll|'d|'re|'s|'m)$""".r
  val Whitespace: Regex = """\s+""".r

  val punctChars = """['“\".?!,:;]"""
  val punctSeq = punctChars+"""+"""
  val entity = """&(amp|lt|gt|quot);"""

  // URLs

  // David: I give the Larry David eye to this whole URL regex
  // (http://www.youtube.com/watch?v=2SmoBvg-etU) There are
  // TODO potentially better options, see:
  //   http://daringfireball.net/2010/07/improved_regex_for_matching_urls
  //   http://mathiasbynens.be/demo/url-regex

  val urlStart1 = """(https?://|www\.)"""
  val commonTLDs = """(com|co\.uk|org|net|info|ca|ly|mp|edu|gov)"""
  val urlStart2 = """[A-Za-z0-9\.-]+?\.""" + commonTLDs + """(?=[/ \W])"""
  val urlBody = """[^ \t\r\n<>]*?"""
  val urlExtraCrapBeforeEnd = "("+punctChars+"|"+entity+")+?"
  val urlEnd = """(\.\.+|[<>]|\s|$)"""
  val url = """\b("""+urlStart1+"|"+urlStart2+")"+urlBody+"(?=("+urlExtraCrapBeforeEnd+")?"+urlEnd+")"

  // Numeric
  val timeLike = """\d+:\d+"""
  val numNum = """\d+\.\d+"""
  val numberWithCommas = """(\d+,)+?\d{3}""" + """(?=([^,]|$))"""

  // Abbreviations
  val boundaryNotDot = """($|\s|[“\"?!,:;]|""" + entity + ")"
  val aa1 = """([A-Za-z]\.){2,}(?=""" + boundaryNotDot + ")"
  val aa2 = """[^A-Za-z]([A-Za-z]\.){1,}[A-Za-z](?=""" + boundaryNotDot + ")"
  val standardAbbreviations = """\b([Mm]r|[Mm]rs|[Mm]s|[Dd]r|[Ss]r|[Jj]r|[Rr]ep|[Ss]en|[Ss]t)\."""
  val arbitraryAbbrev = "(" + aa1 +"|"+ aa2 + "|" + standardAbbreviations + ")"

  val separators = "(--+|―)"
  val decorations = """[♫]+"""
  val thingsThatSplitWords = """[^\s\.,]"""
  val embeddedApostrophe = thingsThatSplitWords+"""+'""" + thingsThatSplitWords + """+"""

  // Emoticons
  val normalEyes = "(?iu)[:=]"
  val wink = "[;]"
  val noseArea = "(|o|O|-|[^a-zA-Z0-9 ])"
  val happyMouths = """[D\)\]]+"""
  val sadMouths = """[\(\[]+"""
  val tongue = "[pP]"
  val otherMouths = """[doO/\\]+""" // remove forward slash if http://'s aren't cleaned

  // mouth repetition examples:
  // @aliciakeys Put it in a love song :-))
  // @hellocalyclops =))=))=)) Oh well

  /** Joins the given regex fragments into one parenthesised alternation, `(a|b|c)`. */
  def OR(parts: String*): String = parts.mkString("(", "|", ")")

  val emoticon = OR(
    // Standard version  :) :( :] :D :P
    OR(normalEyes, wink) + noseArea + OR(tongue, otherMouths, sadMouths, happyMouths),

    // reversed version (: D:  use positive lookbehind to remove "(word):"
    // because eyes on the right side is more ambiguous with the standard usage of : ;
    """(?<=( |^))""" + OR(sadMouths,happyMouths,otherMouths) + noseArea + OR(normalEyes, wink)

    // TODO japanese-style emoticons
    // TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
  )

  /**
   * Widens a pattern written with literal `<` and `>` so that it also matches
   * their HTML-escaped forms `&lt;` and `&gt;`.
   *
   * @param pat regex source containing literal angle brackets
   * @return the pattern with each `<` replaced by `(<|&lt;)` and each `>` by `(>|&gt;)`
   */
  def allowEntities(pat: String): String = {
    // The "<" pass must run first: its replacement contains no ">", so the
    // second pass only rewrites angle brackets that were in the input pattern.
    pat.replace("<", "(<|&lt;)").replace(">", "(>|&gt;)")
  }

  val Hearts = allowEntities("""(<+/?3+)""")
  val Arrows = allowEntities("""(<*[-=]*>+|<+[-=]*>*)""")

  // BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes
  // "hello (#hashtag)" ==> "hello (#hashtag )"  WRONG
  // "hello (#hashtag)" ==> "hello ( #hashtag )"  RIGHT
  // "hello (@person)" ==> "hello (@person )"  WRONG
  // "hello (@person)" ==> "hello ( @person )"  RIGHT
  // ... Some sort of weird interaction with edgepunct I guess, because edgepunct
  // has poor content-symbol detection.

  val Hashtag = """#[a-zA-Z0-9_]+""" // also gets #1 #40 which probably aren't hashtags .. but good as tokens
  val AtMention = """@[a-zA-Z0-9_]+"""

  // I was worried this would conflict with at-mentions
  // but seems ok in sample of 5800: 7 changes all email fixes
  // http://www.regular-expressions.info/email.html
  val Bound = """(\W|^|$)"""
  val Email = "(?<=" +Bound+ """)[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}(?=""" +Bound+")"

  // We will be tokenizing using these regexps as delimiters.
  // Additionally, these things are "protected", meaning they shouldn't be further split themselves.
  // Alternatives are tried in the order listed, so earlier entries win at the same start position.
  val Protected: Regex = new Regex(
    OR(
      Hearts,
      Arrows,
      emoticon,
      url,
      Email,
      entity,
      timeLike,
      numNum,
      numberWithCommas,
      punctSeq,
      arbitraryAbbrev,
      separators,
      decorations,
      embeddedApostrophe,
      Hashtag,
      AtMention
    ))

  // Edge punctuation
  // Want: 'foo' => ' foo '
  // While also: don't => don't
  // the first is considered "edge punctuation".
  // the second is word-internal punctuation -- don't want to mess with it.
  // BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days.
  // I remember it causing lots of trouble in the past as well. Would be good to revisit or eliminate.

  // Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
  val edgePunctChars = """'"“”‘’«»{}\(\)\[\]\*"""
  val edgePunct = "[" + edgePunctChars + "]"
  val notEdgePunct = "[a-zA-Z0-9]" // content characters
  val offEdge = """(^|$|:|;|\s)""" // colon here gets "(hello):" ==> "( hello ):"
  val EdgePunctLeft: Regex = new Regex(offEdge + "("+edgePunct+"+)("+notEdgePunct+")")
  val EdgePunctRight: Regex = new Regex("("+notEdgePunct+")("+edgePunct+"+)" + offEdge)

  /**
   * Inserts a space between a word and the edge punctuation attached to it.
   *
   * @param input text to process
   * @return the text with leading edge punctuation separated first, then trailing
   */
  def splitEdgePunct (input: String): String = {
    val leftSeparated = EdgePunctLeft.replaceAllIn(input, "$1$2 $3")
    EdgePunctRight.replaceAllIn(leftSeparated, "$1 $2$3")
  }

  /**
   * The main work of tokenizing a tweet.
   *
   * @param text tweet text; tokens outside protected spans are split on single
   *             spaces only, so callers normally squeeze whitespace first
   *             (as [[apply]] does)
   * @return the non-empty tokens in their original order
   */
  def simpleTokenize (text: String): List[String] = {
    // Do the no-brainers first
    val splitPunctText = splitEdgePunct(text)

    // Spans of subsequences that must not be split ("bads"),
    // e.g. URLs, 1.0, U.N.K.L.E., 12:53
    val badSpans = Protected.findAllIn(splitPunctText).matchData.toList.map(m => (m.start, m.end))

    // Turn bad spans such as List((2,5), (8,10)) into the boundary list
    // List(0, 2, 5, 8, 10, 12), where 12 stands for the text length.
    // Consecutive pairs of boundaries delimit the "goods" that may still be split.
    val boundaries =
      (0 :: badSpans.flatMap { case (start, end) => List(start, end) }) ::: List(splitPunctText.length)

    // The "good" stretches are safe to tokenize further by whitespace.
    val splitGoods = boundaries.grouped(2).map { pair =>
      splitPunctText.slice(pair(0), pair(1)).trim.split(" ").toList
    }.toList

    val bads = badSpans.map { case (start, end) => splitPunctText.slice(start, end) }

    // Re-interleave: the tokens of each good stretch, then the protected token
    // that follows it. There is one more good stretch than there are protected
    // spans, so the trailing stretch(es) left over by zip are appended.
    val interleaved =
      splitGoods.zip(bads).flatMap { case (free, shielded) => free :+ shielded } :::
        splitGoods.drop(bads.length).flatten

    // Split based on special patterns (like contractions) and drop empty tokens
    interleaved.flatMap(splitToken).filter(_.nonEmpty)
  }

  /** Collapses every whitespace run to one space and trims: "foo   bar" => "foo bar". */
  def squeezeWhitespace (input: String): String = Whitespace.replaceAllIn(input, " ").trim

  /**
   * Final pass tokenization based on special patterns.
   *
   * @param token a single candidate token
   * @return the trimmed token as a one-element list
   */
  def splitToken (token: String): List[String] = {
    // BTO: our POS tagger wants "ur" and "you're" to both be one token.
    // To get "you 're" instead, match the token against Contractions first:
    //   case Contractions(stem, contr) => List(stem.trim, contr.trim)
    List(token.trim)
  }

  /** Apply method allows it to be used as Twokenize(line) in Scala. */
  def apply (text: String): List[String] = simpleTokenize(squeezeWhitespace(text))

  /** More normal name for `apply`. */
  def tokenize (text: String): List[String] = apply(text)

  /**
   * Very slight normalization for AFTER tokenization: unescapes the HTML
   * entities `&lt;`, `&gt;` and `&amp;`.
   *
   * The tokenization regexes are written to work on non-normalized text
   * (to make byte offsets easier to compute).
   * Hm: 2+ repeated character normalization here?
   * No, that's more linguistic, should be further down the pipeline.
   *
   * @param text a token (or any text) possibly containing escaped entities
   * @return the text with the three entities replaced by `<`, `>` and `&`
   */
  def normalizeText (text: String): String = {
    // "&amp;" goes last so that "&amp;lt;" becomes "&lt;" rather than "<".
    text.replaceAll("&lt;", "<").replaceAll("&gt;", ">").replaceAll("&amp;", "&")
  }

  /** Tokenizes `text` and applies [[normalizeText]] to every token. */
  def tokenizeForTagger (text: String): List[String] = {
    tokenize(text).map(normalizeText)
  }

  /**
   * Java-friendly variant of [[tokenizeForTagger]].
   *
   * The list is built explicitly so this method does not rely on the
   * deprecated implicit `JavaConversions`.
   *
   * @return a `java.util.List` holding the normalized tokens in order
   */
  def tokenizeForTagger_J (text: String): java.util.List[String] = {
    val javaTokens = new java.util.ArrayList[String]()
    tokenizeForTagger(text).foreach(tok => javaTokens.add(tok))
    javaTokens
  }

  /**
   * Convenience method to produce a string representation of the
   * tokenized tweet in a standard-ish format: normalized tokens joined by
   * single spaces (the empty string when there are no tokens).
   */
  def tokenizeToString (text: String): String = {
    tokenizeForTagger(text).mkString(" ")
  }

  /**
   * Main method: reads tweet texts from stdin, one per line, and prints the
   * space-separated tokenization of each line to stdout.
   */
  def main (args: Array[String]): Unit = {
    // force stdin/stdout interpretation as UTF-8
    // and ignore the JVM default settings (MacRoman? wtf??)
    val utf8Out = new java.io.PrintStream(System.out, true, "UTF8")
    // Console.withOut scopes the redirection to this block; it replaces the
    // deprecated Console.setOut.
    Console.withOut(utf8Out) {
      scala.io.Source.fromInputStream(System.in, "UTF-8").getLines().foreach { line =>
        // mkString-based joining prints an empty line for token-less input;
        // reduceLeft would throw on the empty token list.
        println(tokenizeToString(line))
      }
    }
  }
}