Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Fix corner-case bugs in entity offset conversion

The entity offset conversion routines sometimes could double-convert
an entity, when its offset adjustment caused it to be examined twice.
This commit uses different routines for conversion in either
direction, that each take special care not to double-convert entities.
  • Loading branch information...
commit b1e717b5399c4aa9230643d53f7a2ee49ddb6e3a 1 parent 21b1012
@j3h authored
Showing with 200 additions and 25 deletions.
  1. +97 −19 src/com/twitter/Extractor.java
  2. +103 −6 tests/com/twitter/ExtractorTest.java
View
116 src/com/twitter/Extractor.java
@@ -279,7 +279,57 @@ public String extractReplyScreenname(String text) {
* @param entities entities with Unicode based indices
*/
public void modifyIndicesFromUnicodeToUTF16(String text, List<Entity> entities) {
- shiftIndices(text, entities, +1);
+ // In order to avoid having to track which entities we have already
+ // shifted, we process the string from the back to the front,
+ // since converting from code points to code units never makes offsets
+ // smaller.
+ int codePointLocation = text.codePointCount(0, text.length()) - 1;
+
+ // The current code unit (string location). This is always moved in lock
+ // step with the codePointLocation.
+ int charLocation = text.length() - 1;
+
+ while (true) {
+ // Find the next entity (counting code points, backwards) that needs to
+ // be shifted.
+ int nextEntityStart = -1;
+
+ for (Entity entity : entities) {
+ final int start = entity.getStart();
+
+ // If this entity's start is the current code point location,
+ // then it has not yet been converted (its units are still code units).
+ if (start == codePointLocation) {
+ if (charLocation != codePointLocation) {
+ final int entityLength = entity.end - start;
+ entity.start = charLocation;
+ entity.end = charLocation + entityLength;
+ }
+ } else {
+ // Choose the entity with the highest code point offset out of
+ // those that have not yet been converted.
+ if (start < codePointLocation && start > nextEntityStart) {
+ nextEntityStart = start;
+ }
+ }
+ }
+
+ // Stop if no entity was found between the beginning of the string and
+ // the current location.
+ if (nextEntityStart < 0) break;
+
+ while (codePointLocation > nextEntityStart) {
+ if (charLocation > 0) {
+ final char c1 = text.charAt(charLocation);
+ final char c0 = text.charAt(charLocation - 1);
+ if (Character.isSurrogatePair(c0, c1)) {
+ charLocation--;
+ }
+ }
+ codePointLocation--;
+ charLocation--;
+ }
+ }
}
/*
@@ -291,27 +341,55 @@ public void modifyIndicesFromUnicodeToUTF16(String text, List<Entity> entities)
* @param entities entities with UTF-16 based indices
*/
public void modifyIndicesFromUTF16ToToUnicode(String text, List<Entity> entities) {
- shiftIndices(text, entities, -1);
- }
-
- /*
- * Shift Entity's indices by {@code diff} for every Unicode supplementary character
- * which appears before the entity.
- *
- * @param text original text
- * @param entities extracted entities
- * @param the amount to shift the entity's indices.
- */
- protected void shiftIndices(String text, List<Entity> entities, int diff) {
- for (int i = 0; i < text.length() - 1; i++) {
- if (Character.isSupplementaryCodePoint(text.codePointAt(i))) {
- for (Entity entity: entities) {
- if (entity.start > i) {
- entity.start += diff;
- entity.end += diff;
+ int codePointLocation = 0;
+ int charLocation = 0;
+
+ boolean wasHighSurrogate = false;
+
+ while (true) {
+ // Find the next entity (counting code units, counting forward) that
+ // needs conversion, while converting any entities that occur at the
+ // current location.
+ int nextEntityStart = text.length();
+
+ for (Entity entity : entities) {
+ int start = entity.start;
+
+ // Any entities that occur at this location should have their offsets
+ // converted. Since the conversion results in a location that is less
+ // than or equal to the current location, we are guaranteed not to
+ // convert an entity more than once.
+ if (start == charLocation) {
+ if (codePointLocation != charLocation) {
+ final int entityLen = entity.end - start;
+ entity.start = codePointLocation;
+ entity.end = codePointLocation + entityLen;
+ }
+ } else {
+ // Choose the entity with the lowest code unit offset out of
+ // those that have not yet been converted.
+ if (start > charLocation && start < nextEntityStart) {
+ nextEntityStart = start;
}
}
}
+
+ // If the next entity is past the end of the text,
+ // or if no more entities were found, then we can stop counting.
+ if (nextEntityStart >= text.length()) break;
+
+ // Count the unicode code points between the current location and the
+ // next entity start.
+ while (charLocation < nextEntityStart) {
+ final char c = text.charAt(charLocation);
+ if (wasHighSurrogate && Character.isLowSurrogate(c)) {
+ wasHighSurrogate = false;
+ } else {
+ codePointLocation += 1;
+ wasHighSurrogate = Character.isHighSurrogate(c);
+ }
+ charLocation++;
+ }
}
}
}
View
109 tests/com/twitter/ExtractorTest.java
@@ -2,6 +2,9 @@
package com.twitter;
import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.framework.Test;
@@ -10,7 +13,8 @@
protected Extractor extractor;
public static Test suite() {
- Class<?>[] testClasses = { ReplyTest.class, MentionTest.class, HashtagTest.class, URLTest.class };
+ Class<?>[] testClasses = { OffsetConversionTest.class, ReplyTest.class,
+ MentionTest.class, HashtagTest.class, URLTest.class };
return new TestSuite(testClasses);
}
@@ -18,6 +22,99 @@ public void setUp() throws Exception {
extractor = new Extractor();
}
+ public static class OffsetConversionTest extends ExtractorTest {
+
+ public void testConvertIndices() {
+ assertOffsetConversionOk("abc", "abc");
+ assertOffsetConversionOk("\ud83d\ude02abc", "abc");
+ assertOffsetConversionOk("\ud83d\ude02abc\ud83d\ude02", "abc");
+ assertOffsetConversionOk("\ud83d\ude02abc\ud838\ude02abc", "abc");
+ assertOffsetConversionOk("\ud83d\ude02abc\ud838\ude02abc\ud83d\ude02",
+ "abc");
+ assertOffsetConversionOk("\ud83d\ude02\ud83d\ude02abc", "abc");
+ assertOffsetConversionOk("\ud83d\ude02\ud83d\ude02\ud83d\ude02abc",
+ "abc");
+
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d\ude02", "abc");
+
+ // Several surrogate pairs following the entity
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d\ude02\ud83d" +
+ "\ude02\ud83d\ude02", "abc");
+
+ // Several surrogate pairs surrounding multiple entities
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02", "abc");
+
+ // unpaired low surrogate (at start)
+ assertOffsetConversionOk
+ ("\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02", "abc");
+
+ // unpaired low surrogate (at end)
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ude02", "abc");
+
+ // unpaired low and high surrogates (at end)
+ assertOffsetConversionOk
+ ("\ud83d\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ude02\ud83d\ude02abc\ud83d" +
+ "\ude02\ud83d\ude02\ud83d\ud83d\ude02\ude02", "abc");
+
+ assertOffsetConversionOk("\ud83dabc\ud83d", "abc");
+
+ assertOffsetConversionOk("\ude02abc\ude02", "abc");
+
+ assertOffsetConversionOk("\ude02\ude02abc\ude02\ude02", "abc");
+
+ assertOffsetConversionOk("abcabc", "abc");
+
+ assertOffsetConversionOk("abc\ud83d\ude02abc", "abc");
+
+ assertOffsetConversionOk("aa", "a");
+
+ assertOffsetConversionOk("\ud83d\ude02a\ud83d\ude02a\ud83d\ude02", "a");
+ }
+
+ private void assertOffsetConversionOk(String testData, String patStr) {
+ // Build an entity at the location of patStr
+ final Pattern pat = Pattern.compile(patStr);
+ final Matcher matcher = pat.matcher(testData);
+
+ List<Extractor.Entity> entities = new ArrayList<Extractor.Entity>();
+ List<Integer> codePointOffsets = new ArrayList<Integer>();
+ List<Integer> charOffsets = new ArrayList<Integer>();
+ while (matcher.find()) {
+ final int charOffset = matcher.start();
+ charOffsets.add(charOffset);
+ codePointOffsets.add(testData.codePointCount(0, charOffset));
+ entities.add(new Extractor.Entity(matcher, "unused", 0, 0));
+ }
+
+ extractor.modifyIndicesFromUTF16ToToUnicode(testData, entities);
+
+ for (int i = 0; i < entities.size(); i++) {
+ assertEquals(codePointOffsets.get(i), entities.get(i).getStart());
+ }
+
+ extractor.modifyIndicesFromUnicodeToUTF16(testData, entities);
+
+ for (int i = 0; i < entities.size(); i++) {
+ // This assertion could fail if the entity location is in the middle
+ // of a surrogate pair, since there is no equivalent code point
+ // offset to that location. It would be pathological for an entity to
+ // start at that point, so we can just let the test fail in that case.
+ assertEquals(charOffsets.get(i), entities.get(i).getStart());
+ }
+ }
+ }
+
/**
* Tests for the extractReplyScreenname method
*/
@@ -92,11 +189,11 @@ public void testMentionWithSupplementaryCharacters() {
// count U+10400 as 2 characters (as in UTF-16)
extractor.modifyIndicesFromUnicodeToUTF16(text, extracted);
- assertEquals(extracted.size(), 2);
- assertEquals(extracted.get(0).start, 3);
- assertEquals(extracted.get(0).end, 11);
- assertEquals(extracted.get(1).start, 15);
- assertEquals(extracted.get(1).end, 23);
+ assertEquals(2, extracted.size());
+ assertEquals(3, extracted.get(0).start);
+ assertEquals(11, extracted.get(0).end);
+ assertEquals(15, extracted.get(1).start);
+ assertEquals(23, extracted.get(1).end);
}
}
Please sign in to comment.
Something went wrong with that request. Please try again.