Skip to content

Commit

Permalink
tift: Fix issue with end of tweet, truncated hashtags. Update to a new
Browse files Browse the repository at this point in the history
non-capturing Hashtag regex.

Additionally, remove a lot of object creation and switch these to
arrays.

Closes #1.
  • Loading branch information
maxthomas committed Jun 16, 2016
1 parent c1870c7 commit 3a3ad9e
Show file tree
Hide file tree
Showing 11 changed files with 269 additions and 233 deletions.
14 changes: 14 additions & 0 deletions tift/src/main/java/edu/jhu/hlt/tift/HashTagTagger.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/*
*
*/
package edu.jhu.hlt.tift;

import java.util.regex.Pattern;

/**
 * Holder for the regular expression used to recognize Twitter-style hashtags.
 */
class HashTagTagger {
  /**
   * Non-capturing hashtag pattern.
   *
   * <ul>
   * <li>{@code \B#} - a literal {@code #} that is not directly preceded by a
   * word character, so the {@code #def} in {@code abc#def} is not a hashtag.</li>
   * <li>{@code \w*\p{L}+\w*} - one or more word characters, at least one of
   * which must be a letter, so purely numeric tags such as {@code #123} are
   * rejected.</li>
   * </ul>
   *
   * The required letter is {@code \p{L}} (any Unicode letter) rather than
   * {@code [a-zA-Z]}: the pattern is compiled with
   * {@link Pattern#UNICODE_CHARACTER_CLASS}, and an ASCII-only letter class
   * would silently reject hashtags written entirely in a non-Latin script.
   */
  public static final Pattern HASHTAG_PATTERN = Pattern.compile("\\B#\\w*\\p{L}+\\w*",
      Pattern.UNICODE_CHARACTER_CLASS);
}
31 changes: 3 additions & 28 deletions tift/src/main/java/edu/jhu/hlt/tift/PatternStringTuple.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,18 @@
*/
package edu.jhu.hlt.tift;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
* Wrapper for mass {@code SimpleImmutableEntry<Pattern, String>} in TwitterTokenizer.
* Wrapper for mass <code>SimpleImmutableEntry[Pattern, String]</code> in TwitterTokenizer.
*/
public class PatternStringTuple {
private final Pattern pattern;
private final String entry;

/**
* Default constructor.
*
*
* @param pattern the pattern
* @param entry the entry
*/
Expand All @@ -29,7 +27,7 @@ public PatternStringTuple(Pattern pattern, String entry) {

/**
* Compile the first parameter into a {@link java.util.regex.Pattern} object.
*
*
* @param pattern
* @param entry
*/
Expand All @@ -38,29 +36,6 @@ public PatternStringTuple(String pattern, String entry) {
this.entry = entry;
}

/**
* Given two string arrays of equal length, create a {@link java.util.List} of {@link PatternStringTuple} objects that correspond to the indices of each array
* (e.g., tupleList.get(0) contains patterns[0] and entries[0]).
*
* @param patterns
* @param entries
* @return a {@link List} of {@link PatternStringTuple} objects
*/
public static List<PatternStringTuple> mapPatterns(String[] patterns, String[] entries) {
final List<PatternStringTuple> tupleList = new ArrayList<>();

if (patterns.length != entries.length) {
throw new IllegalArgumentException("Length of patterns array [" + patterns.length + "] was not equal to " + "length of entries array [" + entries.length
+ "]");
}

for (int i = 0; i < patterns.length; i++) {
tupleList.add(new PatternStringTuple(Pattern.compile(patterns[0]), entries[0]));
}

return tupleList;
}

/**
* @return the pattern
*/
Expand Down
18 changes: 8 additions & 10 deletions tift/src/main/java/edu/jhu/hlt/tift/Rewriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,13 @@

package edu.jhu.hlt.tift;

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

import edu.jhu.hlt.tift.PatternStringTuple;
import com.google.common.collect.ImmutableSet;

/**
* Enumeration of available "text rewriting" tools.
* Enumeration of available "text rewriting" tools.
*/
public enum Rewriter {
PTB {
Expand Down Expand Up @@ -83,13 +81,13 @@ private static Set<PatternStringTuple> getCommonUnicodePatterns () {
// 2744 \u2018 ' single left quotation
// 2722 \u2661 <3 white heart
// 2690 \u2605 star-symbol black star
// 2534 \u2600 sun-symbol BLACK SUN WITH RAYS
// 2534 \u2600 sun-symbol BLACK SUN WITH RAYS
// 2346 \u2550 = BOX DRAWINGS DOUBLE HORIZONTAL
// 2094 \u0305 - COMBINING OVERLINE

String[] p = {
"\u201c", "\"",
"\u201d", "\"",
"\u201d", "\"",
"\u2665", "<3",
"\u2014", "-",
"\u263a", ":)",
Expand Down Expand Up @@ -176,9 +174,9 @@ public static Set<PatternStringTuple> getBasicPatterns() {

/**
* Based on inspection of:
*
*
* http://www.cis.upenn.edu/~treebank/tokenizer.sed
*
*
* The header of which identifies the author as:
* "Robert MacIntyre, University of Pennsylvania, late 1995".
*/
Expand Down Expand Up @@ -257,11 +255,11 @@ public static Set<PatternStringTuple> getPTBPatterns() {
}

private static Set<PatternStringTuple> convertStringArrayPatternsToTupleSet(String[] patternArray) {
Set<PatternStringTuple> patterns = new HashSet<PatternStringTuple>(patternArray.length);
ImmutableSet.Builder<PatternStringTuple> patterns = new ImmutableSet.Builder<>();
for (int i = 0; i < patternArray.length - 1; i += 2)
patterns.add(new PatternStringTuple(Pattern.compile(patternArray[i], Pattern.MULTILINE), patternArray[i + 1]));

return Collections.unmodifiableSet(patterns);
return patterns.build();
}

private static String rewrite(String text, Set<PatternStringTuple> patterns) {
Expand Down
44 changes: 14 additions & 30 deletions tift/src/main/java/edu/jhu/hlt/tift/TaggedTokenizationOutput.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,67 +5,51 @@
*/
package edu.jhu.hlt.tift;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
* A data structure that has the following elements:
*
*
* <pre>
* List of tokens
* List of token tags
* List of offsets
* </pre>
*
*
* Initially created as a wrapper around the data structure returned by the twitter tokenizer.
*
* @author max
*
*
*/
public class TaggedTokenizationOutput {

private final List<String> tokens;
private final List<String> tokenTags;
private final List<Integer> offsets;

/**
*
*/
public TaggedTokenizationOutput(List<String> tokens, List<String> tokenTags, List<Integer> offsets) {
this.tokens = tokens;
this.tokenTags = tokenTags;
this.offsets = offsets;
}
private final String[] tokens;
private final String[] tokenTags;
private final int[] offsets;

public TaggedTokenizationOutput(String[][] twitterTokenizerOutput) {
this.tokens = Arrays.asList(twitterTokenizerOutput[0]);
this.tokenTags = Arrays.asList(twitterTokenizerOutput[1]);
this.tokens = twitterTokenizerOutput[0];
this.tokenTags = twitterTokenizerOutput[1];
String[] offsetStrings = twitterTokenizerOutput[2];
this.offsets = new ArrayList<>(offsetStrings.length);
for (String offset : offsetStrings)
this.offsets.add(Integer.parseInt(offset));
this.offsets = new int[offsetStrings.length];
for (int i = 0; i < offsetStrings.length; i++)
this.offsets[i] = Integer.parseInt(offsetStrings[i]);
}

/**
* @return the tokens
*/
public List<String> getTokens() {
public String[] getTokens() {
return tokens;
}

/**
* @return the tokenTags
*/
public List<String> getTokenTags() {
public String[] getTokenTags() {
return tokenTags;
}

/**
* @return the offsets
*/
public List<Integer> getOffsets() {
public int[] getOffsets() {
return offsets;
}

}
64 changes: 36 additions & 28 deletions tift/src/main/java/edu/jhu/hlt/tift/TokenTagTuple.java
Original file line number Diff line number Diff line change
@@ -1,39 +1,47 @@
/*
* Copyright 2012-2014 Johns Hopkins University HLTCOE. All rights reserved.
* This software is released under the 2-clause BSD license.
* Copyright 2012-2016 Johns Hopkins University HLTCOE. All rights reserved.
* See LICENSE in the project root directory.
*/

package edu.jhu.hlt.tift;

import java.util.Optional;

/**
* 2-tuple that contains a token and a tag.
* 2-tuple that contains a token and an {@link Optional} tag.
*/
public class TokenTagTuple {

private final String token;
private final String tag;

/**
*
*/
public TokenTagTuple(String token, String tag) {
this.token = token;
this.tag = tag;
}

/**
* @return the token
*/
public String getToken() {
return token;
}

/**
* @return the tag
*/
public String getTag() {
return tag;
}
private final String token;
private final Optional<String> tag;

/**
*
*/
public TokenTagTuple(String token) {
this(token, Optional.empty());
}

public TokenTagTuple(String token, Optional<String> tag) {
this.token = token;
this.tag = tag;
}

public TokenTagTuple(String token, String tag) {
this.token = token;
this.tag = Optional.ofNullable(tag);
}

/**
* @return the token
*/
public String getToken() {
return token;
}

/**
* @return the tag
*/
public Optional<String> getTag() {
return tag;
}
}
13 changes: 7 additions & 6 deletions tift/src/main/java/edu/jhu/hlt/tift/Tokenizer.java
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
/*
* Copyright 2012-2014 Johns Hopkins University HLTCOE. All rights reserved.
* Copyright 2012-2016 Johns Hopkins University HLTCOE. All rights reserved.
* This software is released under the 2-clause BSD license.
* See LICENSE in the project root directory.
*/

package edu.jhu.hlt.tift;

import java.io.BufferedReader;
Expand All @@ -13,9 +12,11 @@
import java.util.Arrays;
import java.util.List;

import com.google.common.collect.ImmutableList;

import edu.jhu.hlt.concrete.Tokenization;
import edu.jhu.hlt.concrete.UUID;
import edu.jhu.hlt.concrete.tift.ConcreteTokenization;
import edu.jhu.hlt.tift.concrete.ConcreteTokenization;

/**
* Enumeration of supported tokenizations.
Expand All @@ -30,7 +31,7 @@ public Tokenization tokenizeToConcrete(String text, int textStartPosition) {

@Override
public List<String> tokenize(String text) {
return Arrays.asList(Rewriter.PTB.rewrite(text).split("\\s+"));
return ImmutableList.copyOf(Rewriter.PTB.rewrite(text).split("\\s+"));
}

@Override
Expand All @@ -51,7 +52,7 @@ public Tokenization tokenizeSentence(String text, int textStartPosition, UUID se

@Override
public List<String> tokenize(String text) {
return Arrays.asList(text.split("\\s+"));
return ImmutableList.copyOf(text.split("\\s+"));
}
},
TWITTER_PETROVIC {
Expand Down Expand Up @@ -79,7 +80,7 @@ public Tokenization tokenizeToConcrete(String text, int textStartPosition) {

@Override
public List<String> tokenize(String text) {
return TwitterTokenizer.tokenize(text).getTokens();
return ImmutableList.copyOf(TwitterTokenizer.tokenize(text).getTokens());
}

@Override
Expand Down
Loading

0 comments on commit 3a3ad9e

Please sign in to comment.