Permalink
Browse files

Consolidate AUTO_LINK_USERNAMES* and EXTRACT_MENTIONS into VALID_MENTION_OR_LIST.

Rename AUTO_LINK_HASHTAG* into VALID_HASHTAG*. Remove com.twitter.regex.Spaces.
  • Loading branch information...
1 parent 26a21a4 commit 241013e46597fb285a334c966dfa7af9991b5225 @keitaf keitaf committed Dec 9, 2011
@@ -102,23 +102,23 @@ public String autoLinkUsernamesAndLists(String text) {
sb.append(chunk);
} else {
// Outside of a tag, do real work with this chunk
- matcher = Regex.AUTO_LINK_USERNAMES_OR_LISTS.matcher(chunk);
+ matcher = Regex.VALID_MENTION_OR_LIST.matcher(chunk);
while (matcher.find()) {
- if (matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_LIST) == null ||
- matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_LIST).isEmpty()) {
+ if (matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_LIST) == null ||
+ matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_LIST).isEmpty()) {
// Username only
- if (!Regex.SCREEN_NAME_MATCH_END.matcher(chunk.substring(matcher.end())).find()) {
+ if (!Regex.INVALID_MENTION_MATCH_END.matcher(chunk.substring(matcher.end())).find()) {
StringBuilder rb = new StringBuilder(capacity);
- rb.append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_BEFORE))
- .append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_AT))
+ rb.append(matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_BEFORE))
+ .append(matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_AT))
.append("<a class=\"").append(urlClass).append(" ").append(usernameClass)
.append("\" href=\"").append(usernameUrlBase)
- .append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_USERNAME))
+ .append(matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME))
.append("\"");
if (noFollow) rb.append(NO_FOLLOW_HTML_ATTRIBUTE);
rb.append(">")
- .append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_USERNAME))
+ .append(matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME))
.append("</a>");
matcher.appendReplacement(sb, rb.toString());
} else {
@@ -128,16 +128,16 @@ public String autoLinkUsernamesAndLists(String text) {
} else {
// Username and list
StringBuilder rb = new StringBuilder(capacity);
- rb.append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_BEFORE))
- .append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_AT))
+ rb.append(matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_BEFORE))
+ .append(matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_AT))
.append("<a class=\"").append(urlClass).append(" ").append(listClass)
.append("\" href=\"").append(listUrlBase)
- .append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_USERNAME))
- .append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_LIST))
+ .append(matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME))
+ .append(matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_LIST))
.append("\"");
if (noFollow) rb.append(NO_FOLLOW_HTML_ATTRIBUTE);
- rb.append(">").append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_USERNAME))
- .append(matcher.group(Regex.AUTO_LINK_USERNAME_OR_LISTS_GROUP_LIST))
+ rb.append(">").append(matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME))
+ .append(matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_LIST))
.append("</a>");
matcher.appendReplacement(sb, rb.toString());
}
@@ -160,22 +160,22 @@ public String autoLinkUsernamesAndLists(String text) {
*/
public String autoLinkHashtags(String text) {
StringBuffer sb = new StringBuffer();
- Matcher matcher = Regex.AUTO_LINK_HASHTAGS.matcher(text);
+ Matcher matcher = Regex.VALID_HASHTAG.matcher(text);
while (matcher.find()) {
String after = text.substring(matcher.end());
- if (!Regex.HASHTAG_MATCH_END.matcher(after).find()) {
+ if (!Regex.INVALID_HASHTAG_MATCH_END.matcher(after).find()) {
StringBuilder replacement = new StringBuilder(text.length() * 2);
- replacement.append(matcher.group(Regex.AUTO_LINK_HASHTAGS_GROUP_BEFORE))
+ replacement.append(matcher.group(Regex.VALID_HASHTAG_GROUP_BEFORE))
.append("<a href=\"").append(hashtagUrlBase)
- .append(matcher.group(Regex.AUTO_LINK_HASHTAGS_GROUP_TAG)).append("\"")
- .append(" title=\"#").append(matcher.group(Regex.AUTO_LINK_HASHTAGS_GROUP_TAG))
+ .append(matcher.group(Regex.VALID_HASHTAG_GROUP_TAG)).append("\"")
+ .append(" title=\"#").append(matcher.group(Regex.VALID_HASHTAG_GROUP_TAG))
.append("\" class=\"").append(urlClass).append(" ")
.append(hashtagClass).append("\"");
if (noFollow) {
replacement.append(NO_FOLLOW_HTML_ATTRIBUTE);
}
- replacement.append(">").append(matcher.group(Regex.AUTO_LINK_HASHTAGS_GROUP_HASH))
- .append(matcher.group(Regex.AUTO_LINK_HASHTAGS_GROUP_TAG)).append("</a>");
+ replacement.append(">").append(matcher.group(Regex.VALID_HASHTAG_GROUP_HASH))
+ .append(matcher.group(Regex.VALID_HASHTAG_GROUP_TAG)).append("</a>");
matcher.appendReplacement(sb, replacement.toString());
} else {
// not a valid hashtag
@@ -90,17 +90,13 @@ public Extractor() {
* @return List of usernames referenced (without the leading @ sign)
*/
public List<String> extractMentionedScreennames(String text) {
- if (text == null) {
- return null;
+ if (text == null || text.isEmpty()) {
+ return Collections.emptyList();
}
List<String> extracted = new ArrayList<String>();
- Matcher matcher = Regex.EXTRACT_MENTIONS.matcher(text);
- while (matcher.find()) {
- String after = text.substring(matcher.end());
- if (! Regex.SCREEN_NAME_MATCH_END.matcher(after).find()) {
- extracted.add(matcher.group(Regex.EXTRACT_MENTIONS_GROUP_USERNAME));
- }
+ for (Entity entity : extractMentionedScreennamesWithIndices(text)) {
+ extracted.add(entity.value);
}
return extracted;
}
@@ -112,22 +108,21 @@ public Extractor() {
* @return List of Entity objects for the usernames referenced (without the leading @ sign)
*/
public List<Entity> extractMentionedScreennamesWithIndices(String text) {
- if (text == null) {
- return null;
+ if (text == null || text.isEmpty()) {
+ return Collections.emptyList();
}
List<Entity> extracted = new ArrayList<Entity>();
- Matcher matcher = Regex.EXTRACT_MENTIONS.matcher(text);
+ Matcher matcher = Regex.VALID_MENTION_OR_LIST.matcher(text);
while (matcher.find()) {
String after = text.substring(matcher.end());
- if (! Regex.SCREEN_NAME_MATCH_END.matcher(after).find()) {
- extracted.add(new Entity(matcher, "mention", Regex.EXTRACT_MENTIONS_GROUP_USERNAME));
+ if (! Regex.INVALID_MENTION_MATCH_END.matcher(after).find()) {
+ extracted.add(new Entity(matcher, "mention", Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME));
}
}
return extracted;
}
-
/**
* Extract a @username reference from the beginning of Tweet text. A reply is an occurrence of @username at the
* beginning of a Tweet, preceded by 0 or more spaces.
@@ -140,13 +135,13 @@ public String extractReplyScreenname(String text) {
return null;
}
- Matcher matcher = Regex.EXTRACT_REPLY.matcher(text);
+ Matcher matcher = Regex.VALID_REPLY.matcher(text);
if (matcher.find()) {
String after = text.substring(matcher.end());
- if (Regex.SCREEN_NAME_MATCH_END.matcher(after).find()) {
+ if (Regex.INVALID_MENTION_MATCH_END.matcher(after).find()) {
return null;
} else {
- return matcher.group(Regex.EXTRACT_REPLY_GROUP_USERNAME);
+ return matcher.group(Regex.VALID_REPLY_GROUP_USERNAME);
}
} else {
return null;
@@ -160,23 +155,14 @@ public String extractReplyScreenname(String text) {
* @return List of URLs referenced.
*/
public List<String> extractURLs(String text) {
- if (text == null) {
- return null;
+ if (text == null || text.isEmpty()) {
+ return Collections.emptyList();
}
List<String> urls = new ArrayList<String>();
-
- Matcher matcher = Regex.VALID_URL.matcher(text);
- while (matcher.find()) {
- String url = matcher.group(Regex.VALID_URL_GROUP_URL);
- Matcher tco_matcher = Regex.VALID_TCO_URL.matcher(url);
- if (tco_matcher.find()) {
- // In the case of t.co URLs, don't allow additional path characters.
- url = tco_matcher.group();
- }
- urls.add(url);
+ for (Entity entity : extractURLsWithIndices(text)) {
+ urls.add(entity.value);
}
-
return urls;
}
@@ -187,8 +173,8 @@ public String extractReplyScreenname(String text) {
* @return List of Entity objects for the URLs referenced.
*/
public List<Entity> extractURLsWithIndices(String text) {
- if (text == null) {
- return null;
+ if (text == null || text.isEmpty()) {
+ return Collections.emptyList();
}
List<Entity> urls = new ArrayList<Entity>();
@@ -218,11 +204,16 @@ public String extractReplyScreenname(String text) {
* @return List of hashtags referenced (without the leading # sign)
*/
public List<String> extractHashtags(String text) {
- if (text == null) {
- return null;
+ if (text == null || text.isEmpty()) {
+ return Collections.emptyList();
+ }
+
+ List<String> extracted = new ArrayList<String>();
+ for (Entity entity : extractHashtagsWithIndices(text)) {
+ extracted.add(entity.value);
}
- return extractList(Regex.AUTO_LINK_HASHTAGS, text, Regex.AUTO_LINK_HASHTAGS_GROUP_TAG);
+ return extracted;
}
/**
@@ -232,41 +223,20 @@ public String extractReplyScreenname(String text) {
* @return List of Entity objects for the hashtags referenced (without the leading # sign)
*/
public List<Entity> extractHashtagsWithIndices(String text) {
- if (text == null) {
- return null;
+ if (text == null || text.isEmpty()) {
+ return Collections.emptyList();
}
- return extractListWithIndices(Regex.AUTO_LINK_HASHTAGS, text, Regex.AUTO_LINK_HASHTAGS_GROUP_TAG, "hashtag");
- }
+ List<Entity> extracted = new ArrayList<Entity>();
+ Matcher matcher = Regex.VALID_HASHTAG.matcher(text);
- /**
- * Helper method for extracting multiple matches from Tweet text.
- *
- * @param pattern to match and use for extraction
- * @param text of the Tweet to extract from
- * @param groupNumber the capturing group of the pattern that should be added to the list.
- * @return list of extracted values, or an empty list if there were none.
- */
- private List<String> extractList(Pattern pattern, String text, int groupNumber) {
- List<String> extracted = new ArrayList<String>();
- Matcher matcher = pattern.matcher(text);
while (matcher.find()) {
String after = text.substring(matcher.end());
- if (!Regex.HASHTAG_MATCH_END.matcher(after).find()) {
- extracted.add(matcher.group(groupNumber));
+ if (!Regex.INVALID_HASHTAG_MATCH_END.matcher(after).find()) {
+ extracted.add(new Entity(matcher, "hashtag", Regex.VALID_HASHTAG_GROUP_TAG));
}
}
- return extracted;
- }
-
- // TODO: Make this a real object, not a Map
- private List<Entity> extractListWithIndices(Pattern pattern, String text, int groupNumber, String valueType) {
- List<Entity> extracted = new ArrayList<Entity>();
- Matcher matcher = pattern.matcher(text);
- while (matcher.find()) {
- extracted.add(new Entity(matcher, valueType, groupNumber));
- }
return extracted;
}
}
@@ -4,7 +4,22 @@
import java.util.regex.*;
public class Regex {
- private static String LATIN_ACCENTS_CHARS = "\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u00ff\\u015f";
+ private static final String UNICODE_SPACES = "[" +
+ "\\u0009-\\u000d" + // # White_Space # Cc [5] <control-0009>..<control-000D>
+ "\\u0020" + // White_Space # Zs SPACE
+ "\\u0085" + // White_Space # Cc <control-0085>
+ "\\u00a0" + // White_Space # Zs NO-BREAK SPACE
+ "\\u1680" + // White_Space # Zs OGHAM SPACE MARK
+ "\\u180E" + // White_Space # Zs MONGOLIAN VOWEL SEPARATOR
+ "\\u2000-\\u200a" + // # White_Space # Zs [11] EN QUAD..HAIR SPACE
+ "\\u2028" + // White_Space # Zl LINE SEPARATOR
+ "\\u2029" + // White_Space # Zp PARAGRAPH SEPARATOR
+ "\\u202F" + // White_Space # Zs NARROW NO-BREAK SPACE
+ "\\u205F" + // White_Space # Zs MEDIUM MATHEMATICAL SPACE
+ "\\u3000" + // White_Space # Zs IDEOGRAPHIC SPACE
+ "]";
+
+ private static final String LATIN_ACCENTS_CHARS = "\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u00ff\\u015f";
private static final String HASHTAG_ALPHA_CHARS = "a-z" + LATIN_ACCENTS_CHARS +
"\\u0400-\\u04ff\\u0500-\\u0527" + // Cyrillic
"\\u2de0–\\u2dff\\ua640–\\ua69f" + // Cyrillic Extended A/B
@@ -105,19 +120,24 @@
/* Begin public constants */
+
+ public static final Pattern VALID_HASHTAG = Pattern.compile("(^|[^&/" + HASHTAG_ALPHA_NUMERIC_CHARS + "])(#|\uFF03)(" + HASHTAG_ALPHA_NUMERIC + "*" + HASHTAG_ALPHA + HASHTAG_ALPHA_NUMERIC + "*)", Pattern.CASE_INSENSITIVE);
+ public static final int VALID_HASHTAG_GROUP_BEFORE = 1;
+ public static final int VALID_HASHTAG_GROUP_HASH = 2;
+ public static final int VALID_HASHTAG_GROUP_TAG = 3;
+ public static final Pattern INVALID_HASHTAG_MATCH_END = Pattern.compile("^(?:[##]|://)");
+
public static final Pattern AT_SIGNS = Pattern.compile("[" + AT_SIGNS_CHARS + "]");
+ public static final Pattern VALID_MENTION_OR_LIST = Pattern.compile("([^a-zA-Z0-9_]|^|RT:?)(" + AT_SIGNS + "+)([a-zA-Z0-9_]{1,20})(/[a-zA-Z][a-zA-Z0-9_\\-]{0,24})?");
+ public static final int VALID_MENTION_OR_LIST_GROUP_BEFORE = 1;
+ public static final int VALID_MENTION_OR_LIST_GROUP_AT = 2;
+ public static final int VALID_MENTION_OR_LIST_GROUP_USERNAME = 3;
+ public static final int VALID_MENTION_OR_LIST_GROUP_LIST = 4;
- public static final Pattern AUTO_LINK_HASHTAGS = Pattern.compile("(^|[^&/" + HASHTAG_ALPHA_NUMERIC_CHARS + "])(#|\uFF03)(" + HASHTAG_ALPHA_NUMERIC + "*" + HASHTAG_ALPHA + HASHTAG_ALPHA_NUMERIC + "*)", Pattern.CASE_INSENSITIVE);
- public static final int AUTO_LINK_HASHTAGS_GROUP_BEFORE = 1;
- public static final int AUTO_LINK_HASHTAGS_GROUP_HASH = 2;
- public static final int AUTO_LINK_HASHTAGS_GROUP_TAG = 3;
- public static final Pattern HASHTAG_MATCH_END = Pattern.compile("^(?:[##]|://)");
+ public static final Pattern VALID_REPLY = Pattern.compile("^(?:" + UNICODE_SPACES + ")*" + AT_SIGNS + "([a-z0-9_]{1,20})", Pattern.CASE_INSENSITIVE);
+ public static final int VALID_REPLY_GROUP_USERNAME = 1;
- public static final Pattern AUTO_LINK_USERNAMES_OR_LISTS = Pattern.compile("([^a-z0-9_]|^|RT:?)(" + AT_SIGNS + "+)([a-z0-9_]{1,20})(/[a-z][a-z0-9_\\-]{0,24})?", Pattern.CASE_INSENSITIVE);
- public static final int AUTO_LINK_USERNAME_OR_LISTS_GROUP_BEFORE = 1;
- public static final int AUTO_LINK_USERNAME_OR_LISTS_GROUP_AT = 2;
- public static final int AUTO_LINK_USERNAME_OR_LISTS_GROUP_USERNAME = 3;
- public static final int AUTO_LINK_USERNAME_OR_LISTS_GROUP_LIST = 4;
+ public static final Pattern INVALID_MENTION_MATCH_END = Pattern.compile("^(?:[" + AT_SIGNS_CHARS + LATIN_ACCENTS_CHARS + "]|://)");
public static final Pattern VALID_URL = Pattern.compile(VALID_URL_PATTERN_STRING, Pattern.CASE_INSENSITIVE);
public static final int VALID_URL_GROUP_ALL = 1;
@@ -130,13 +150,4 @@
public static final int VALID_URL_GROUP_QUERY_STRING = 8;
public static final Pattern VALID_TCO_URL = Pattern.compile("^https?:\\/\\/t\\.co\\/[a-z0-9]+", Pattern.CASE_INSENSITIVE);
-
- public static final Pattern EXTRACT_MENTIONS = Pattern.compile("(^|[^a-z0-9_])" + AT_SIGNS + "([a-z0-9_]{1,20})", Pattern.CASE_INSENSITIVE);
- public static final int EXTRACT_MENTIONS_GROUP_BEFORE = 1;
- public static final int EXTRACT_MENTIONS_GROUP_USERNAME = 2;
-
- public static final Pattern EXTRACT_REPLY = Pattern.compile("^(?:[" + com.twitter.regex.Spaces.getCharacterClass() + "])*" + AT_SIGNS + "([a-z0-9_]{1,20})", Pattern.CASE_INSENSITIVE);
- public static final int EXTRACT_REPLY_GROUP_USERNAME = 1;
-
- public static final Pattern SCREEN_NAME_MATCH_END = Pattern.compile("^(?:[" + AT_SIGNS_CHARS + LATIN_ACCENTS_CHARS + "]|://)");
}
@@ -1,32 +0,0 @@
-
-package com.twitter.regex;
-
-public class Spaces {
- private static final String[] UNICODE_SPACE_RANGES = {
- "\\u0009-\\u000d", // # White_Space # Cc [5] <control-0009>..<control-000D>
- "\\u0020", // White_Space # Zs SPACE
- "\\u0085", // White_Space # Cc <control-0085>
- "\\u00a0", // White_Space # Zs NO-BREAK SPACE
- "\\u1680", // White_Space # Zs OGHAM SPACE MARK
- "\\u180E", // White_Space # Zs MONGOLIAN VOWEL SEPARATOR
- "\\u2000-\\u200a", // # White_Space # Zs [11] EN QUAD..HAIR SPACE
- "\\u2028", // White_Space # Zl LINE SEPARATOR
- "\\u2029", // White_Space # Zp PARAGRAPH SEPARATOR
- "\\u202F", // White_Space # Zs NARROW NO-BREAK SPACE
- "\\u205F", // White_Space # Zs MEDIUM MATHEMATICAL SPACE
- "\\u3000", // White_Space # Zs IDEOGRAPHIC SPACE
- };
- private static String characterClass = null;
-
- static {
- StringBuilder sb = new StringBuilder(UNICODE_SPACE_RANGES.length+1);
- for (int i=0; i < UNICODE_SPACE_RANGES.length; i++) {
- sb.append(UNICODE_SPACE_RANGES[i]);
- }
- characterClass = sb.toString();
- }
-
- public static String getCharacterClass() {
- return characterClass;
- }
-}
Oops, something went wrong.

0 comments on commit 241013e

Please sign in to comment.