From c5944da4a531a1adcd6a53d6512449bdbc5b1bfc Mon Sep 17 00:00:00 2001
From: Grant Ingersoll
Date: Tue, 19 Jul 2011 15:50:00 -0400
Subject: [PATCH] concats

---
Review notes (text between "---" and the diffstat is not part of the commit):
 * exec() now tests input for null before calling input.size().
 * The trailing separator is stripped by separator.length(), not a fixed 1 char.
 * Null fields are skipped instead of throwing NullPointerException.
 * A null or blank range behaves like ALL, as the constructor Javadoc promises.
 * Negative indexes are skipped like any other out-of-range index.
 * NOTE(review): this copy was re-flowed from a whitespace-collapsed patch.
   Indentation and blank context lines are best-effort, and the index hashes
   below were not regenerated -- confirm against the base files before applying.
 * NOTE(review): "extends EvalFunc" carries no type argument in this copy; it may
   have been stripped together with the author's e-mail address -- confirm.

 .../pygmalion/udf/RangeBasedStringConcat.java |  80 +++++++++++++++-
 .../udf/RangeBasedStringConcatTest.java       |  94 +++++++++++++++++++
 2 files changed, 172 insertions(+), 2 deletions(-)

diff --git a/udf/src/main/java/org/pygmalion/udf/RangeBasedStringConcat.java b/udf/src/main/java/org/pygmalion/udf/RangeBasedStringConcat.java
index 2bfb6e7..88d5cbd 100644
--- a/udf/src/main/java/org/pygmalion/udf/RangeBasedStringConcat.java
+++ b/udf/src/main/java/org/pygmalion/udf/RangeBasedStringConcat.java
@@ -1,21 +1,97 @@
 package org.pygmalion.udf;
 
 import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataBag;
 import org.apache.pig.data.Tuple;
 
 import java.io.IOException;
 
+/**
+ * This class is capable of concatenating specific items in a tuple together, as well as
+ * the whole tuple. It will also recurse on DataBags and Tuples and concat those together.
+ * Null fields are skipped.
+ *
+ * Usage: RangeBasedStringConcat("1,2,3", " "), RangeBasedStringConcat("ALL", " ");
+ */
+//TODO: switch to byte based approach
 public class RangeBasedStringConcat extends EvalFunc {
+  public static final String ALL = "all";
+  public static final String DEFAULT_SEPARATOR = " ";
+  private String range;
+  private int[] ranges;
+  private String separator = DEFAULT_SEPARATOR;
+
+  public RangeBasedStringConcat() {
+    this(ALL, DEFAULT_SEPARATOR);
+  }
+
+  /**
+   * If the range is null, empty or "ALL" (case-insensitive), then concat all values. Else, a
+   * comma separated list of the fields to concat.
+   *
+   * @param range comma separated list of zero-based field numbers for the tuple, else ALL
+   * @param separator string placed between concatenated values; null means DEFAULT_SEPARATOR
+   */
+  public RangeBasedStringConcat(String range, String separator) {
+    this.range = range;
+    this.separator = separator == null ? DEFAULT_SEPARATOR : separator;
+    initRange();
+  }
+
+  private void initRange() {
+    //TODO: add support for ranges like 1-10
+    //ranges stays null when every field is wanted (null, blank or ALL)
+    String trimmed = range == null ? "" : range.trim();
+    if (trimmed.length() > 0 && !trimmed.equalsIgnoreCase(ALL)) {
+      String[] splits = trimmed.split(",");
+      ranges = new int[splits.length];
+      for (int i = 0; i < splits.length; i++) {
+        ranges[i] = Integer.parseInt(splits[i].trim());
+      }
+    }
+  }
+
   @Override
   public String exec(Tuple input) throws IOException {
     if (input == null || input.size() == 0) {
       return null;
     }
+    int tupleSize = input.size();
     StringBuilder builder = new StringBuilder();
+    if (ranges == null) {
+      processTuple(input, builder);
+    } else {
+      for (int theRange : ranges) {
+        //silently skip indexes that do not exist in this tuple
+        if (theRange >= 0 && theRange < tupleSize) {
+          appendObject(input.get(theRange), builder);
+        }
+      }
+    }
+    //every appended value is followed by a separator, so drop the trailing one
+    return builder.length() > 0
+        ? builder.substring(0, builder.length() - separator.length()) : "";
+  }
+
+  private void processTuple(Tuple input, Appendable builder) throws IOException {
     for (Object o : input.getAll()) {
-      builder.append(o);
+      appendObject(o, builder);
     }
+  }
 
-    return builder.toString();
+  private void appendObject(Object o, Appendable builder) throws IOException {
+    if (o instanceof Tuple){
+      Tuple tmp = (Tuple) o;
+      if (tmp.size() > 0){
+        processTuple(tmp, builder);
+      }
+    } else if (o instanceof DataBag){
+      DataBag db = (DataBag) o;
+      for (Tuple tuple : db) {
+        processTuple(tuple, builder);
+      }
+    } else if (o != null) {
+      builder.append(o.toString()).append(separator);
+    }
   }
 }
diff --git a/udf/src/test/java/org/pygmailion/udf/RangeBasedStringConcatTest.java b/udf/src/test/java/org/pygmailion/udf/RangeBasedStringConcatTest.java
index fb42af2..6ce22cb 100644
--- a/udf/src/test/java/org/pygmailion/udf/RangeBasedStringConcatTest.java
+++ b/udf/src/test/java/org/pygmailion/udf/RangeBasedStringConcatTest.java
@@ -1,10 +1,104 @@
 package org.pygmailion.udf;
 
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.DefaultDataBag;
+import org.apache.pig.data.DefaultTuple;
+import org.apache.pig.data.Tuple;
+import org.junit.Test;
+import org.pygmalion.udf.RangeBasedStringConcat;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
 /**
  *
  *
  **/
 public class RangeBasedStringConcatTest {
 
+  private String[] fields = {"a", "b", "c", "d", "e", "f", "g", "h", "i"};
+
+  @Test
+  public void testAllConcat() throws Exception {
+    RangeBasedStringConcat rbsc = new RangeBasedStringConcat("ALL", " ");
+    Tuple input = new DefaultTuple();
+    for (int i = 0; i < fields.length; i++) {
+      input.append(fields[i]);
+    }
+    String result = rbsc.exec(input);
+    assertEquals("a b c d e f g h i", result);
+    Tuple innerTuple = new DefaultTuple();
+    innerTuple.append("j");
+    innerTuple.append("k");
+
+    input.append(innerTuple);
+    result = rbsc.exec(input);
+    assertEquals("a b c d e f g h i j k", result);
+
+    DataBag db = new DefaultDataBag();
+    Tuple dbTuple = new DefaultTuple();
+    dbTuple.append("l");
+    dbTuple.append("m");
+    db.add(dbTuple);
+    innerTuple.append(db);
+    result = rbsc.exec(input);
+    assertEquals("a b c d e f g h i j k l m", result);
+  }
+
+  @Test
+  public void testRange() throws Exception {
+    RangeBasedStringConcat rbsc = new RangeBasedStringConcat("0,1", " ");
+    Tuple input = new DefaultTuple();
+    for (String field : fields) {
+      input.append(field);
+    }
+    String result = rbsc.exec(input);
+    assertEquals("a b", result);
+    rbsc = new RangeBasedStringConcat("2,6", " ");
+    result = rbsc.exec(input);
+    assertEquals("c g", result);
+    //test out of range
+    rbsc = new RangeBasedStringConcat("0,9,1000", " ");
+    result = rbsc.exec(input);
+    assertEquals("a", result);
+
+    Tuple innerTuple = new DefaultTuple();
+    innerTuple.append("j");
+    innerTuple.append("k");
+    input.append(innerTuple);
+    rbsc = new RangeBasedStringConcat("0,9", " ");
+    result = rbsc.exec(input);
+    assertEquals("a j k", result);
+    DataBag db = new DefaultDataBag();
+    Tuple dbTuple = new DefaultTuple();
+    dbTuple.append("l");
+    dbTuple.append("m");
+    db.add(dbTuple);
+    innerTuple.append(db);
+    rbsc = new RangeBasedStringConcat("0,9,10", " ");
+    result = rbsc.exec(input);
+    assertEquals("a j k l m", result);
+  }
+
+  @Test
+  public void testMultiCharSeparator() throws Exception {
+    RangeBasedStringConcat rbsc = new RangeBasedStringConcat("0,1", ", ");
+    Tuple input = new DefaultTuple();
+    input.append("a");
+    input.append("b");
+    assertEquals("a, b", rbsc.exec(input));
+  }
+
+  @Test
+  public void testNullHandling() throws Exception {
+    RangeBasedStringConcat rbsc = new RangeBasedStringConcat();
+    assertNull(rbsc.exec(null));
+    Tuple input = new DefaultTuple();
+    input.append("a");
+    input.append(null);
+    input.append("b");
+    assertEquals("a b", rbsc.exec(input));
+  }
 
 }