Fix a bug with TokenIndex taggings. Update metadata to generate more accurate data.
maxthomas committed Jun 16, 2016
1 parent 4f1778f commit c1870c7
Showing 2 changed files with 24 additions and 20 deletions.
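
The gist of the fix: in ConcreteTokenization, each TaggedToken's tokenIndex is supposed to be the token's position in the TokenList, but the old code stored the token's character offset instead. Below is a minimal sketch of the difference, using only the Concrete calls visible in the diff that follows; the example tokens, offsets, and tags are made up for illustration.

// Sketch only (not from the commit). Assumes the imports shown in the diff below,
// plus java.util.Arrays / java.util.List and edu.jhu.hlt.concrete.TaggedToken.
List<String> tokens = Arrays.asList("hello", "world");   // hypothetical tokens
int[] offsets = { 0, 6 };                                // character offsets into "hello world"
List<String> tokenTags = Arrays.asList("UH", "NN");      // hypothetical POS tags

TokenTagging tt = new TokenTagging();
for (int i = 0; i < tokens.size(); i++) {
  TaggedToken tok = new TaggedToken();
  // Before this commit: tok.setTokenIndex(offsets[i]) stored 0 and 6, so the tag for
  // "world" pointed at token index 6, which does not exist in a two-token TokenList.
  // After this commit: the position in the token list is stored instead.
  tok.setTokenIndex(i).setTag(tokenTags.get(i));
  tt.addToTaggedTokenList(tok);
}

The one-line change in generateConcreteTokenization below (setTokenIndex(offsets[i]) becoming setTokenIndex(i)) and the tightened assertion in TokenizerTest are the corresponding hunks.
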
38 changes: 20 additions & 18 deletions ConcreteTokenization.java
@@ -17,6 +17,8 @@
import edu.jhu.hlt.concrete.TokenTagging;
import edu.jhu.hlt.concrete.Tokenization;
import edu.jhu.hlt.concrete.TokenizationKind;
import edu.jhu.hlt.concrete.util.ProjectConstants;
import edu.jhu.hlt.concrete.util.Timing;
import edu.jhu.hlt.concrete.uuid.UUIDFactory;
import edu.jhu.hlt.tift.TaggedTokenizationOutput;

@@ -29,27 +31,27 @@ public class ConcreteTokenization {

static {
AnnotationMetadata am = new AnnotationMetadata();
am.setTimestamp(System.currentTimeMillis());
am.setTool("Tift Tokenizer v4.4.0");
am.setTimestamp(Timing.currentLocalTime());
am.setTool("Tift " + ProjectConstants.VERSION);
tiftMetadata = new AnnotationMetadata(am);
}

public static final AnnotationMetadata getMetadata() {
return new AnnotationMetadata(tiftMetadata);
}

/**
*
*
*/
private ConcreteTokenization() {
// TODO Auto-generated constructor stub
}

/**
* Wrapper around {@link #generateConcreteTokenization(List, int[], int)} that takes an array of Strings (tokens).
*
*
* @see #generateConcreteTokenization(List, int[], int)
*
*
* @param tokens
* - an array of tokens (Strings)
* @param offsets
@@ -64,7 +66,7 @@ public static Tokenization generateConcreteTokenization(String[] tokens, int[] o

/**
* Generate a {@link Tokenization} object from a list of tokens, list of offsets, and start position of the text (e.g., first text character in the text).
*
*
* @param tokens
* - a {@link List} of tokens (Strings)
* @param offsets
@@ -78,7 +80,7 @@ public static Tokenization generateConcreteTokenization(List<String> tokens, int
tkz.setKind(TokenizationKind.TOKEN_LIST);
tkz.setMetadata(new AnnotationMetadata(tiftMetadata));
tkz.setUuid(UUIDFactory.newUUID());

TokenList tl = new TokenList();
// Note: we use token index as token id.
for (int tokenId = 0; tokenId < tokens.size(); ++tokenId) {
@@ -97,9 +99,9 @@ public static Tokenization generateConcreteTokenization(List<String> tokens, int

/**
* Wrapper for {@link #generateConcreteTokenization(List, int[], int)} that takes a {@link List} of {@link Integer} objects.
*
*
* @see #generateConcreteTokenization(List, int[], int)
*
*
* @param tokens
* - a {@link List} of tokens (Strings)
* @param offsets
@@ -115,11 +117,11 @@ public static Tokenization generateConcreteTokenization(List<String> tokens, Lis
/**
* Generate a {@link Tokenization} object from a list of tokens, list of tags, list of offsets, and start position of the text (e.g., first text character in
* the text). Assumes tags are part of speech tags.
*
*
* Invokes {@link #generateConcreteTokenization(List, int[], int)} then adds tagging.
*
*
* @see #generateConcreteTokenization(List, int[], int)
*
*
* @param tokens
* - a {@link List} of tokens (Strings)
* @param offsets
@@ -138,15 +140,15 @@ public static Tokenization generateConcreteTokenization(List<String> tokens, Lis
String tag = tokenTags.get(i);
if (tag != null) {
TaggedToken tok = new TaggedToken();
tok.setTokenIndex(offsets[i]).setTag(tokenTags.get(i));
tok.setTokenIndex(i).setTag(tokenTags.get(i));
tt.addToTaggedTokenList(tok);
}
}

// Do not set the pos tags if everything was "null".
if (tt.isSetTaggedTokenList())
tokenization.addToTokenTaggingList(tt);

return tokenization;
}

@@ -156,9 +158,9 @@ public static Tokenization generateConcreteTokenization(TaggedTokenizationOutput

/**
* Convert a {@link List} of {@link Integer} objects to an integer array primitive.
*
*
* Will throw a {@link NullPointerException} if any element in the list is null.
*
*
* @param integers
* a {@link List} of {@link Integer} objects, none of which are <code>null</code>
* @return a primitive array of ints
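
The metadata half of the commit replaces a hardcoded tool string and a raw System.currentTimeMillis() call with the project-level helpers imported above. A caller-side sketch of the intent, assuming the Thrift-generated AnnotationMetadata exposes the usual getTool()/getTimestamp() accessors and that ProjectConstants.VERSION holds the library's version string (neither is shown in this diff):

AnnotationMetadata md = ConcreteTokenization.getMetadata();
// md.getTool()      -> "Tift " + ProjectConstants.VERSION (e.g. "Tift 4.5.0", hypothetical value),
//                      rather than the stale hardcoded "Tift Tokenizer v4.4.0".
// md.getTimestamp() -> the value of Timing.currentLocalTime() captured in the static block.
// Each call hands back a fresh copy of tiftMetadata, so callers may modify it freely.
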
6 changes: 4 additions & 2 deletions tift/src/test/java/edu/jhu/hlt/tift/TokenizerTest.java
@@ -70,7 +70,9 @@ public void testTokenizeToConcreteTwitter() {
.findFirst();
assertTrue(tt.isPresent());
for (TaggedToken t : tt.get().getTaggedTokenList()) {
logger.info("Got tagging: {} on token: {}", t.getTag(), t.getTokenIndex());
int idx = t.getTokenIndex();
logger.info("Got tagging: {} on token: {}", t.getTag(), idx);
assertEquals(4, idx);
}
}

@@ -80,7 +82,7 @@ public void testTokenize() {
List<String> tokens = Tokenizer.BASIC.tokenize(text);
assertEquals(4, tokens.size());
}

@Test
public void thriftReadWrite() throws ConcreteException {
String text = "hello world test tokens";
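
The updated test hints at why the index matters downstream: consumers of a TokenTagging look tokens up by tokenIndex within the same Tokenization, so a character offset stored there points at the wrong token or past the end of the list. A hedged sketch of that failure mode, reusing the test sentence from TokenizerTest and only the setters/getters that appear in this commit; the tagging itself is hypothetical:

// Using the test sentence from TokenizerTest above: "hello world test tokens".
List<String> tokens = Arrays.asList("hello", "world", "test", "tokens");

// A hypothetical tagging whose single TaggedToken tags the last token.
TaggedToken tagged = new TaggedToken();
tagged.setTokenIndex(3).setTag("NNS");

// With a list index, the lookup recovers the right surface form:
String surface = tokens.get(tagged.getTokenIndex()); // "tokens"

// With the pre-fix character-offset behavior, the stored index would have been 17
// ("tokens" starts at character 17 of the sentence), and tokens.get(17) would throw
// IndexOutOfBoundsException.
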
