Skip to content
Browse files

Revert Extractor change.

  • Loading branch information...
1 parent b1e717b commit 343a0f846a5afeab99caee42c0e34f1457b1a6e4 @keitaf keitaf committed
Showing with 19 additions and 97 deletions.
  1. +19 −97 src/com/twitter/Extractor.java
View
116 src/com/twitter/Extractor.java
@@ -279,57 +279,7 @@ public String extractReplyScreenname(String text) {
* @param entities entities with Unicode based indices
*/
public void modifyIndicesFromUnicodeToUTF16(String text, List<Entity> entities) {
- // In order to avoid having to track which entities we have already
- // shifted, we process the string from the back to the front,
- // since converting from code points to code units never makes offsets
- // smaller.
- int codePointLocation = text.codePointCount(0, text.length()) - 1;
-
- // The current code unit (string location). This is always moved in lock
- // step with the codePointLocation.
- int charLocation = text.length() - 1;
-
- while (true) {
- // Find the next entity (counting code points, backwards) that needs to
- // be shifted.
- int nextEntityStart = -1;
-
- for (Entity entity : entities) {
- final int start = entity.getStart();
-
- // If this entity's start is the current code point location,
- // then it has not yet been converted (its offsets are still in code points).
- if (start == codePointLocation) {
- if (charLocation != codePointLocation) {
- final int entityLength = entity.end - start;
- entity.start = charLocation;
- entity.end = charLocation + entityLength;
- }
- } else {
- // Choose the entity with the highest code point offset out of
- // those that have not yet been converted.
- if (start < codePointLocation && start > nextEntityStart) {
- nextEntityStart = start;
- }
- }
- }
-
- // Stop if no entity was found between the beginning of the string and
- // the current location.
- if (nextEntityStart < 0) break;
-
- while (codePointLocation > nextEntityStart) {
- if (charLocation > 0) {
- final char c1 = text.charAt(charLocation);
- final char c0 = text.charAt(charLocation - 1);
- if (Character.isSurrogatePair(c0, c1)) {
- charLocation--;
- }
- }
- codePointLocation--;
- charLocation--;
- }
- }
+ shiftIndices(text, entities, +1);
}
/*
@@ -341,54 +291,26 @@ public void modifyIndicesFromUnicodeToUTF16(String text, List<Entity> entities)
* @param entities entities with UTF-16 based indices
*/
public void modifyIndicesFromUTF16ToToUnicode(String text, List<Entity> entities) {
- int codePointLocation = 0;
- int charLocation = 0;
-
- boolean wasHighSurrogate = false;
-
- while (true) {
- // Find the next entity (counting code units, counting forward) that
- // needs conversion, while converting any entities that occur at the
- // current location.
- int nextEntityStart = text.length();
-
- for (Entity entity : entities) {
- int start = entity.start;
-
- // Any entities that occur at this location should have their offsets
- // converted. Since the conversion results in a location that is less
- // than or equal to the current location, we are guaranteed not to
- // convert an entity more than once.
- if (start == charLocation) {
- if (codePointLocation != charLocation) {
- final int entityLen = entity.end - start;
- entity.start = codePointLocation;
- entity.end = codePointLocation + entityLen;
- }
- } else {
- // Choose the entity with the lowest code unit offset out of
- // those that have not yet been converted.
- if (start > charLocation && start < nextEntityStart) {
- nextEntityStart = start;
- }
- }
- }
+ shiftIndices(text, entities, -1);
+ }
- // If the next entity is past the end of the text,
- // or if no more entities were found, then we can stop counting.
- if (nextEntityStart >= text.length()) break;
-
- // Count the unicode code points between the current location and the
- // next entity start.
- while (charLocation < nextEntityStart) {
- final char c = text.charAt(charLocation);
- if (wasHighSurrogate && Character.isLowSurrogate(c)) {
- wasHighSurrogate = false;
- } else {
- codePointLocation += 1;
- wasHighSurrogate = Character.isHighSurrogate(c);
+ /*
+ * Shift Entity's indices by {@code diff} for every Unicode supplementary character
+ * which appears before the entity.
+ *
+ * @param text original text
+ * @param entities extracted entities
+ * @param diff the amount to shift the entity's indices.
+ */
+ protected void shiftIndices(String text, List<Entity> entities, int diff) {
+ for (int i = 0; i < text.length() - 1; i++) {
+ if (Character.isSupplementaryCodePoint(text.codePointAt(i))) {
+ for (Entity entity: entities) {
+ if (entity.start > i) {
+ entity.start += diff;
+ entity.end += diff;
+ }
}
- charLocation++;
}
}
}

0 comments on commit 343a0f8

Please sign in to comment.
Something went wrong with that request. Please try again.