Skip to content

Commit

Permalink
Merge branch 'update_bundle_2011_08_05'
Browse files Browse the repository at this point in the history
  • Loading branch information
mzsanford committed Aug 5, 2011
2 parents 6eb1284 + 89057a8 commit fd3b9d7
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 10 deletions.
20 changes: 11 additions & 9 deletions src/com/twitter/Regex.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,17 @@ public class Regex {
"with_friend","with_friends","statuses","status","activity","favourites",
"favourite","favorite","favorites"};

private static String LATIN_ACCENTS_CHARS = "\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u00ff";
private static String LATIN_ACCENTS_CHARS = "\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u00ff\\u015f";
private static final String HASHTAG_ALPHA_CHARS = "a-z" + LATIN_ACCENTS_CHARS +
"\\u0400-\\u04ff\\u0500-\\u0527" + // Cyrillic
"\\u0400-\\u04ff\\u0500-\\u0527" + // Cyrillic
"\\u2de0–\\u2dff\\ua640–\\ua69f" + // Cyrillic Extended A/B
"\\u1100-\\u11ff\\u3130-\\u3185\\uA960-\\uA97F\\uAC00-\\uD7AF\\uD7B0-\\uD7FF" + // Hangul (Korean)
"\\p{InHiragana}\\p{InKatakana}" + // Japanese Hiragana and Katakana
"\\p{InCJKUnifiedIdeographs}\\u3005" + // Japanese Kanji / Chinese Han
"\\uff21-\\uff3a\\uff41-\\uff5a" + // full width Alphabet
"\\uff66-\\uff9f" + // half width Katakana
"\\uffa1-\\uffdc"; // half width Hangul (Korean)
"\\p{InHiragana}\\p{InKatakana}" + // Japanese Hiragana and Katakana
"\\p{InCJKUnifiedIdeographs}" + // Japanese Kanji / Chinese Han
"\\u3005\\u303b" + // Kanji/Han iteration marks
"\\uff21-\\uff3a\\uff41-\\uff5a" + // full width Alphabet
"\\uff66-\\uff9f" + // half width Katakana
"\\uffa1-\\uffdc"; // half width Hangul (Korean)
private static final String HASHTAG_ALPHA_NUMERIC_CHARS = "0-9\\uff10-\\uff19_" + HASHTAG_ALPHA_CHARS;
private static final String HASHTAG_ALPHA = "[" + HASHTAG_ALPHA_CHARS +"]";
private static final String HASHTAG_ALPHA_NUMERIC = "[" + HASHTAG_ALPHA_NUMERIC_CHARS +"]";
Expand All @@ -32,7 +34,7 @@ public class Regex {
private static final String URL_VALID_DOMAIN_NAME = "(?:" + URL_VALID_CHARS + "(?:[-]|" + URL_VALID_CHARS + ")*)?" + URL_VALID_CHARS;
private static final String URL_VALID_DOMAIN = URL_VALID_SUBDOMAIN + "*" + URL_VALID_DOMAIN_NAME + "\\.(?:" + URL_PUNYCODE + "|[a-z]{2,})(?::[0-9]+)?";

private static final String URL_VALID_GENERAL_PATH_CHARS = "[a-z0-9!\\*';:=\\+\\$/%#\\[\\]\\-_,~\\.\\|]";
private static final String URL_VALID_GENERAL_PATH_CHARS = "[a-z0-9!\\*';:=\\+\\$/%#\\[\\]\\-_,~\\.\\|" + LATIN_ACCENTS_CHARS + "]";
private static final String URL_VALID_PATH_CHARS_WITHOUT_SLASH = "[" + URL_VALID_GENERAL_PATH_CHARS + "&&[^/]]";
private static final String URL_VALID_PATH_CHARS_WITHOUT_COMMA = "[" + URL_VALID_GENERAL_PATH_CHARS + "&&[^,]]";

Expand All @@ -50,7 +52,7 @@ public class Regex {
/** Valid end-of-path characters (so /foo. does not gobble the period).
* 2. Allow =&# for empty URL parameters and other URL-join artifacts
**/
private static final String URL_VALID_URL_PATH_ENDING_CHARS = "(?:[a-z0-9=_#/\\-\\+]+|"+URL_BALANCE_PARENS+")";
private static final String URL_VALID_URL_PATH_ENDING_CHARS = "(?:[a-z0-9=_#/\\-\\+]+|"+ URL_BALANCE_PARENS + LATIN_ACCENTS_CHARS +")";
private static final String URL_VALID_URL_QUERY_CHARS = "[a-z0-9!\\*'\\(\\);:&=\\+\\$/%#\\[\\]\\-_\\.,~\\|]";
private static final String URL_VALID_URL_QUERY_ENDING_CHARS = "[a-z0-9_&=#/]";
private static final String VALID_URL_PATTERN_STRING =
Expand Down
2 changes: 1 addition & 1 deletion test-data/twitter-text-conformance
Submodule twitter-text-conformance updated 1 files
+39 −0 extract.yml

0 comments on commit fd3b9d7

Please sign in to comment.