Skip to content

Commit

Permalink
Merge pull request #3 from gsingers/master
Browse files Browse the repository at this point in the history
Rest of the stuff
  • Loading branch information
gsingers committed Jul 19, 2011
2 parents d5ecfd1 + c5944da commit 84463bd
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 3 deletions.
76 changes: 73 additions & 3 deletions udf/src/main/java/org/pygmalion/udf/RangeBasedStringConcat.java
@@ -1,21 +1,91 @@
package org.pygmalion.udf;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;

import java.io.IOException;

/**
* This class is capable of concatenating specific items in a tuple together, as well as
* the whole tuple. It will also recurse on DataBags and Tuples and concat those together
*
* Usage: RangeBasedStringConcat("1,2,3", " "), RangeBasedStringConcat("ALL", " ");
*/
//TODO: switch to byte based approach
public class RangeBasedStringConcat extends EvalFunc<String> {
public static final String ALL = "all";
public static final String DEFAULT_SEPARATOR = " ";
private String range;
private int[] ranges;
private String separator = DEFAULT_SEPARATOR;

public RangeBasedStringConcat() {
this(ALL, DEFAULT_SEPARATOR);
}

/**
* If the range is empty or "ALL", then concat all values. Else, a comma separated list
* of the fields to concat.
* @param range comma separated list of field numbers for the tuple, else ALL
*/
public RangeBasedStringConcat(String range, String separator) {
this.range = range;
this.separator = separator;
initRange();

}

private void initRange() {
//TODO: add support for ranges like 1-10
if (range != null && range.equalsIgnoreCase(ALL) == false){
String [] splits = range.split(",");
ranges = new int[splits.length];
for (int i = 0; i < splits.length; i++) {
ranges[i] = Integer.parseInt(splits[i]);
}
}
}

@Override
public String exec(Tuple input) throws IOException {
if (input == null || input.size() == 0) {
int tupleSize = input.size();
if (input == null || tupleSize == 0) {
return null;
}
StringBuilder builder = new StringBuilder();
if (range != null && range.equalsIgnoreCase(ALL)){
processTuple(input, builder);
} else {
for (int theRange : ranges) {
if (theRange < tupleSize) {
appendObject(input.get(theRange), builder);
}
}
}
//remove the trailing separate
return builder.length() > 0 ? builder.substring(0, builder.length() -1) : "";
}

private void processTuple(Tuple input, Appendable builder) throws IOException {
for (Object o : input.getAll()) {
builder.append(o);
appendObject(o, builder);
}
}

return builder.toString();
private void appendObject(Object o, Appendable builder) throws IOException {
if (o instanceof Tuple){
Tuple tmp = (Tuple) o;
if (tmp.size() > 0){
processTuple(tmp, builder);
}
} else if (o instanceof DataBag){
DataBag db = (DataBag) o;
for (Tuple tuple : db) {
processTuple(tuple, builder);
}
} else {
builder.append(o.toString()).append(separator);
}
}
}
@@ -1,10 +1,82 @@
package org.pygmailion.udf;


import org.apache.pig.data.DataBag;
import org.apache.pig.data.DefaultDataBag;
import org.apache.pig.data.DefaultTuple;
import org.apache.pig.data.Tuple;
import org.junit.Test;
import org.pygmalion.udf.RangeBasedStringConcat;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

/**
 * Tests for {@link RangeBasedStringConcat}: concatenating every field of a tuple,
 * concatenating selected positions only, and flattening nested tuples and bags.
 **/
public class RangeBasedStringConcatTest {
    private String[] fields = {"a", "b", "c", "d", "e", "f", "g", "h", "i"};

    /** Builds a tuple holding the given values in order. */
    private static Tuple tupleOf(String... values) {
        Tuple tuple = new DefaultTuple();
        for (String value : values) {
            tuple.append(value);
        }
        return tuple;
    }

    /** Builds a bag containing the single given tuple. */
    private static DataBag bagOf(Tuple tuple) {
        DataBag bag = new DefaultDataBag();
        bag.add(tuple);
        return bag;
    }

    @Test
    public void testAllConcat() throws Exception {
        RangeBasedStringConcat concat = new RangeBasedStringConcat("ALL", " ");
        Tuple row = tupleOf(fields);
        assertEquals("a b c d e f g h i", concat.exec(row));

        // A nested tuple is flattened into the output.
        Tuple nested = tupleOf("j", "k");
        row.append(nested);
        assertEquals("a b c d e f g h i j k", concat.exec(row));

        // A bag placed inside the nested tuple is flattened as well.
        nested.append(bagOf(tupleOf("l", "m")));
        assertEquals("a b c d e f g h i j k l m", concat.exec(row));
    }

    @Test
    public void testRange() throws Exception {
        Tuple row = tupleOf(fields);

        RangeBasedStringConcat concat = new RangeBasedStringConcat("0,1", " ");
        assertEquals("a b", concat.exec(row));

        concat = new RangeBasedStringConcat("2,6", " ");
        assertEquals("c g", concat.exec(row));

        //test out of range
        concat = new RangeBasedStringConcat("0,9,1000", " ");
        assertEquals("a", concat.exec(row));

        // Position 9 now exists and holds a nested tuple.
        Tuple nested = tupleOf("j", "k");
        row.append(nested);
        concat = new RangeBasedStringConcat("0,9", " ");
        assertEquals("a j k", concat.exec(row));

        // The bag lives inside the nested tuple; position 10 is still out of range.
        nested.append(bagOf(tupleOf("l", "m")));
        concat = new RangeBasedStringConcat("0,9,10", " ");
        assertEquals("a j k l m", concat.exec(row));
    }
}

0 comments on commit 84463bd

Please sign in to comment.