Permalink
Browse files

Merge branch 'update_bundle_2011_08_05'

  • Loading branch information...
2 parents 6eb1284 + 89057a8 commit fd3b9d756f6cb7d79e43802f116a4fea5d3fd5e4 @mzsanford mzsanford committed Aug 5, 2011
Showing with 12 additions and 10 deletions.
  1. +11 −9 src/com/twitter/Regex.java
  2. +1 −1 test-data/twitter-text-conformance
View
@@ -10,15 +10,17 @@
"with_friend","with_friends","statuses","status","activity","favourites",
"favourite","favorite","favorites"};
- private static String LATIN_ACCENTS_CHARS = "\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u00ff";
+ private static String LATIN_ACCENTS_CHARS = "\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u00ff\\u015f";
+ // Character-class fragment (wrapped in [...] by HASHTAG_ALPHA). Every range must use ASCII '-';
+ // any other dash is taken as a literal character and the range collapses to its two endpoints.
private static final String HASHTAG_ALPHA_CHARS = "a-z" + LATIN_ACCENTS_CHARS +
- "\\u0400-\\u04ff\\u0500-\\u0527" + // Cyrillic
+ "\\u0400-\\u04ff\\u0500-\\u0527" + // Cyrillic
+ "\\u2de0-\\u2dff\\ua640-\\ua69f" + // Cyrillic Extended A/B
"\\u1100-\\u11ff\\u3130-\\u3185\\uA960-\\uA97F\\uAC00-\\uD7AF\\uD7B0-\\uD7FF" + // Hangul (Korean)
- "\\p{InHiragana}\\p{InKatakana}" + // Japanese Hiragana and Katakana
- "\\p{InCJKUnifiedIdeographs}\\u3005" + // Japanese Kanji / Chinese Han
- "\\uff21-\\uff3a\\uff41-\\uff5a" + // full width Alphabet
- "\\uff66-\\uff9f" + // half width Katakana
- "\\uffa1-\\uffdc"; // half width Hangul (Korean)
+ "\\p{InHiragana}\\p{InKatakana}" + // Japanese Hiragana and Katakana
+ "\\p{InCJKUnifiedIdeographs}" + // Japanese Kanji / Chinese Han
+ "\\u3005\\u303b" + // Kanji/Han iteration marks
+ "\\uff21-\\uff3a\\uff41-\\uff5a" + // full width Alphabet
+ "\\uff66-\\uff9f" + // half width Katakana
+ "\\uffa1-\\uffdc"; // half width Hangul (Korean)
private static final String HASHTAG_ALPHA_NUMERIC_CHARS = "0-9\\uff10-\\uff19_" + HASHTAG_ALPHA_CHARS;
private static final String HASHTAG_ALPHA = "[" + HASHTAG_ALPHA_CHARS +"]";
private static final String HASHTAG_ALPHA_NUMERIC = "[" + HASHTAG_ALPHA_NUMERIC_CHARS +"]";
@@ -32,7 +34,7 @@
private static final String URL_VALID_DOMAIN_NAME = "(?:" + URL_VALID_CHARS + "(?:[-]|" + URL_VALID_CHARS + ")*)?" + URL_VALID_CHARS;
private static final String URL_VALID_DOMAIN = URL_VALID_SUBDOMAIN + "*" + URL_VALID_DOMAIN_NAME + "\\.(?:" + URL_PUNYCODE + "|[a-z]{2,})(?::[0-9]+)?";
- private static final String URL_VALID_GENERAL_PATH_CHARS = "[a-z0-9!\\*';:=\\+\\$/%#\\[\\]\\-_,~\\.\\|]";
+ private static final String URL_VALID_GENERAL_PATH_CHARS = "[a-z0-9!\\*';:=\\+\\$/%#\\[\\]\\-_,~\\.\\|" + LATIN_ACCENTS_CHARS + "]";
// "&&" is java.util.regex character-class intersection: each constant is the general path class minus one character ('/' or ',').
private static final String URL_VALID_PATH_CHARS_WITHOUT_SLASH = "[" + URL_VALID_GENERAL_PATH_CHARS + "&&[^/]]";
private static final String URL_VALID_PATH_CHARS_WITHOUT_COMMA = "[" + URL_VALID_GENERAL_PATH_CHARS + "&&[^,]]";
@@ -50,7 +52,7 @@
/** Valid end-of-path characters (so /foo. does not gobble the period).
* 2. Allow =&# for empty URL parameters and other URL-join artifacts
**/
- private static final String URL_VALID_URL_PATH_ENDING_CHARS = "(?:[a-z0-9=_#/\\-\\+]+|"+URL_BALANCE_PARENS+")";
+ // Accented letters must sit inside the character class; concatenated after the balanced-parens group they are matched as a literal sequence.
+ private static final String URL_VALID_URL_PATH_ENDING_CHARS = "(?:[a-z0-9=_#/\\-\\+" + LATIN_ACCENTS_CHARS + "]+|" + URL_BALANCE_PARENS + ")";
private static final String URL_VALID_URL_QUERY_CHARS = "[a-z0-9!\\*'\\(\\);:&=\\+\\$/%#\\[\\]\\-_\\.,~\\|]";
private static final String URL_VALID_URL_QUERY_ENDING_CHARS = "[a-z0-9_&=#/]";
private static final String VALID_URL_PATTERN_STRING =

0 comments on commit fd3b9d7

Please sign in to comment.