Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from gsingers/master
Rest of the stuff
- Loading branch information
Showing
2 changed files
with
145 additions
and
3 deletions.
There are no files selected for viewing
76 changes: 73 additions & 3 deletions
76
udf/src/main/java/org/pygmalion/udf/RangeBasedStringConcat.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,91 @@ | ||
package org.pygmalion.udf; | ||
|
||
import org.apache.pig.EvalFunc; | ||
import org.apache.pig.data.DataBag; | ||
import org.apache.pig.data.Tuple; | ||
|
||
import java.io.IOException; | ||
|
||
/** | ||
* This class is capable of concatenating specific items in a tuple together, as well as | ||
* the whole tuple. It will also recurse on DataBags and Tuples and concat those together | ||
* | ||
* Usage: RangeBasedStringConcat("1,2,3", " "), RangeBasedStringConcat("ALL", " "); | ||
*/ | ||
//TODO: switch to byte based approach | ||
public class RangeBasedStringConcat extends EvalFunc<String> { | ||
public static final String ALL = "all"; | ||
public static final String DEFAULT_SEPARATOR = " "; | ||
private String range; | ||
private int[] ranges; | ||
private String separator = DEFAULT_SEPARATOR; | ||
|
||
public RangeBasedStringConcat() { | ||
this(ALL, DEFAULT_SEPARATOR); | ||
} | ||
|
||
/** | ||
* If the range is empty or "ALL", then concat all values. Else, a comma separated list | ||
* of the fields to concat. | ||
* @param range comma separated list of field numbers for the tuple, else ALL | ||
*/ | ||
public RangeBasedStringConcat(String range, String separator) { | ||
this.range = range; | ||
this.separator = separator; | ||
initRange(); | ||
|
||
} | ||
|
||
private void initRange() { | ||
//TODO: add support for ranges like 1-10 | ||
if (range != null && range.equalsIgnoreCase(ALL) == false){ | ||
String [] splits = range.split(","); | ||
ranges = new int[splits.length]; | ||
for (int i = 0; i < splits.length; i++) { | ||
ranges[i] = Integer.parseInt(splits[i]); | ||
} | ||
} | ||
} | ||
|
||
@Override | ||
public String exec(Tuple input) throws IOException { | ||
if (input == null || input.size() == 0) { | ||
int tupleSize = input.size(); | ||
if (input == null || tupleSize == 0) { | ||
return null; | ||
} | ||
StringBuilder builder = new StringBuilder(); | ||
if (range != null && range.equalsIgnoreCase(ALL)){ | ||
processTuple(input, builder); | ||
} else { | ||
for (int theRange : ranges) { | ||
if (theRange < tupleSize) { | ||
appendObject(input.get(theRange), builder); | ||
} | ||
} | ||
} | ||
//remove the trailing separate | ||
return builder.length() > 0 ? builder.substring(0, builder.length() -1) : ""; | ||
} | ||
|
||
private void processTuple(Tuple input, Appendable builder) throws IOException { | ||
for (Object o : input.getAll()) { | ||
builder.append(o); | ||
appendObject(o, builder); | ||
} | ||
} | ||
|
||
return builder.toString(); | ||
private void appendObject(Object o, Appendable builder) throws IOException { | ||
if (o instanceof Tuple){ | ||
Tuple tmp = (Tuple) o; | ||
if (tmp.size() > 0){ | ||
processTuple(tmp, builder); | ||
} | ||
} else if (o instanceof DataBag){ | ||
DataBag db = (DataBag) o; | ||
for (Tuple tuple : db) { | ||
processTuple(tuple, builder); | ||
} | ||
} else { | ||
builder.append(o.toString()).append(separator); | ||
} | ||
} | ||
} |
72 changes: 72 additions & 0 deletions
72
udf/src/test/java/org/pygmailion/udf/RangeBasedStringConcatTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,82 @@ | ||
package org.pygmailion.udf; | ||
|
||
|
||
import org.apache.pig.data.DataBag; | ||
import org.apache.pig.data.DefaultDataBag; | ||
import org.apache.pig.data.DefaultTuple; | ||
import org.apache.pig.data.Tuple; | ||
import org.junit.Test; | ||
import org.pygmalion.udf.RangeBasedStringConcat; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
import static org.junit.Assert.assertTrue; | ||
|
||
/** | ||
* | ||
* | ||
**/ | ||
public class RangeBasedStringConcatTest { | ||
private String[] fields = {"a", "b", "c", "d", "e", "f", "g", "h", "i"}; | ||
|
||
@Test | ||
public void testAllConcat() throws Exception { | ||
RangeBasedStringConcat rbsc = new RangeBasedStringConcat("ALL", " "); | ||
Tuple input = new DefaultTuple(); | ||
for (int i = 0; i < fields.length; i++) { | ||
input.append(fields[i]); | ||
} | ||
String result = rbsc.exec(input); | ||
assertEquals("a b c d e f g h i", result); | ||
Tuple innerTuple = new DefaultTuple(); | ||
innerTuple.append("j"); | ||
innerTuple.append("k"); | ||
|
||
input.append(innerTuple); | ||
result = rbsc.exec(input); | ||
assertEquals("a b c d e f g h i j k", result); | ||
DataBag db = new DefaultDataBag(); | ||
Tuple dbTuple = new DefaultTuple(); | ||
dbTuple.append("l"); | ||
dbTuple.append("m"); | ||
db.add(dbTuple); | ||
innerTuple.append(db); | ||
result = rbsc.exec(input); | ||
assertEquals("a b c d e f g h i j k l m", result); | ||
} | ||
|
||
@Test | ||
public void testRange() throws Exception { | ||
RangeBasedStringConcat rbsc = new RangeBasedStringConcat("0,1", " "); | ||
Tuple input = new DefaultTuple(); | ||
for (String field : fields) { | ||
input.append(field); | ||
} | ||
String result = rbsc.exec(input); | ||
assertEquals("a b", result); | ||
rbsc = new RangeBasedStringConcat("2,6", " "); | ||
result = rbsc.exec(input); | ||
assertEquals("c g", result); | ||
//test out of range | ||
rbsc = new RangeBasedStringConcat("0,9,1000", " "); | ||
result = rbsc.exec(input); | ||
assertEquals("a", result); | ||
|
||
Tuple innerTuple = new DefaultTuple(); | ||
innerTuple.append("j"); | ||
innerTuple.append("k"); | ||
|
||
input.append(innerTuple); | ||
rbsc = new RangeBasedStringConcat("0,9", " "); | ||
result = rbsc.exec(input); | ||
assertEquals("a j k", result); | ||
DataBag db = new DefaultDataBag(); | ||
Tuple dbTuple = new DefaultTuple(); | ||
dbTuple.append("l"); | ||
dbTuple.append("m"); | ||
db.add(dbTuple); | ||
innerTuple.append(db); | ||
rbsc = new RangeBasedStringConcat("0,9,10", " "); | ||
result = rbsc.exec(input); | ||
assertEquals("a j k l m", result); | ||
} | ||
} |