Skip to content

Commit

Permalink
Fix bug: table.splitOn fails when the categorical column is a TextCol…
Browse files Browse the repository at this point in the history
…umn #554

Convert any textColumns to stringColumns before splitting tables to allow them to be used for subgroups, etc.
  • Loading branch information
lwhite1 committed Aug 3, 2019
1 parent 8b6e9c1 commit 52b654b
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 18 deletions.
80 changes: 62 additions & 18 deletions core/src/main/java/tech/tablesaw/table/TableSliceGroup.java
Expand Up @@ -36,10 +36,12 @@
*/
public class TableSliceGroup implements Iterable<TableSlice> {

// A string that is used internally as a delimiter in creating a column name from all the grouping columns
// A string that is used internally as a delimiter in creating a column name from all the grouping
// columns
protected static final String SPLIT_STRING = "~~~";

// A function that splits the group column name back into the original column names for the grouping columns
// A function that splits the group column name back into the original column names for the
// grouping columns
private static final Splitter SPLITTER = Splitter.on(SPLIT_STRING);

// The list of slices or views over the source table that I contain
Expand All @@ -54,16 +56,55 @@ public class TableSliceGroup implements Iterable<TableSlice> {
* Returns an instance for calculating a single summary for the given table, with no sub-groupings
*/
protected TableSliceGroup(Table original) {
  // TextColumns do not split correctly, so when any are present we work on a copy of the table
  // in which they have been replaced by StringColumns. Copying first keeps the caller's table
  // unmodified; tables without text columns are used directly and are not copied.
  if (containsAnyTextColumns(original)) {
    sourceTable = original.copy();
    replaceTextColumnsWithStringColumns();
  } else {
    sourceTable = original;
  }
  // No sub-groupings: there are no grouping column names to record.
  splitColumnNames = new String[0];
}

/**
 * Reports whether the given table holds at least one column whose type is {@code
 * ColumnType.TEXT}.
 *
 * @param original the table whose columns are inspected
 * @return {@code true} as soon as a TEXT column is found, {@code false} if there is none
 */
private boolean containsAnyTextColumns(Table original) {
  boolean foundText = false;
  for (int index = 0; index < original.columnCount() && !foundText; index++) {
    foundText = original.column(index).type().equals(ColumnType.TEXT);
  }
  return foundText;
}

/**
 * Swaps every TextColumn in {@code sourceTable} for the equivalent StringColumn, keeping each
 * column's position and name.
 *
 * <p>We do this because TextColumns don't split correctly: the split algorithm uses a byte[]
 * version of the elements to do its magic, and text columns have variable-sized strings, and
 * therefore variable-sized byte arrays. Determining the correct array size (maybe the largest
 * array size for the column?) would be somewhat fraught because the size depends on the encoding
 * and the strings do not know their own encoding. That would need to be detected using a
 * third-party library.
 *
 * <p>So each TextColumn is replaced with the equivalent StringColumn instead.
 */
private void replaceTextColumnsWithStringColumns() {
  for (int index = 0; index < sourceTable.columnCount(); index++) {
    boolean isText = sourceTable.column(index).type().equals(ColumnType.TEXT);
    if (!isText) {
      continue;
    }
    // Remember the name up front so it can be re-applied to the replacement column.
    String preservedName = sourceTable.column(index).name();
    StringColumn replacement = sourceTable.textColumn(index).asStringColumn();
    sourceTable.replaceColumn(index, replacement);
    sourceTable.column(index).setName(preservedName);
  }
}

/**
* Returns an instance for calculating subgroups,
* one for each combination of the given groupColumnNames that appear in the source table
* Returns an instance for calculating subgroups, one for each combination of the given
* groupColumnNames that appear in the source table
*/
protected TableSliceGroup(Table sourceTable, String[] groupColumnNames) {
  // TextColumns do not split correctly, so when any are present we work on a copy of the table
  // in which they have been replaced by StringColumns. Copying first keeps the caller's table
  // unmodified; tables without text columns are used directly and are not copied.
  if (containsAnyTextColumns(sourceTable)) {
    this.sourceTable = sourceTable.copy();
    replaceTextColumnsWithStringColumns();
  } else {
    this.sourceTable = sourceTable;
  }
  // The names of the columns whose value combinations define the subgroups.
  this.splitColumnNames = groupColumnNames;
}

Expand Down Expand Up @@ -103,8 +144,8 @@ public int size() {
}

/**
* For a subtable that is grouped by the values in more than one column, split the grouping column into separate
* cols and return the revised view
* For a subtable that is grouped by the values in more than one column, split the grouping column
* into separate cols and return the revised view
*/
private Table splitGroupingColumn(Table groupTable) {

Expand All @@ -115,7 +156,8 @@ private Table splitGroupingColumn(Table groupTable) {
Column<?> newColumn = column.emptyCopy();
newColumns.add(newColumn);
}
// iterate through the rows in the table and split each of the grouping columns into multiple columns
// iterate through the rows in the table and split each of the grouping columns into multiple
// columns
for (int row = 0; row < groupTable.rowCount(); row++) {
List<String> strings = SPLITTER.splitToList(groupTable.stringColumn("Group").get(row));
for (int col = 0; col < newColumns.size(); col++) {
Expand All @@ -132,29 +174,31 @@ private Table splitGroupingColumn(Table groupTable) {
}

/**
* Applies the given aggregation to the given column.
* The apply and combine steps of a split-apply-combine.
* Applies the given aggregation to the given column. The apply and combine steps of a
* split-apply-combine.
*/
public Table aggregate(String colName1, AggregateFunction<?,?>... functions) {
ArrayListMultimap<String, AggregateFunction<?,?>> columnFunctionMap = ArrayListMultimap.create();
public Table aggregate(String colName1, AggregateFunction<?, ?>... functions) {
ArrayListMultimap<String, AggregateFunction<?, ?>> columnFunctionMap =
ArrayListMultimap.create();
columnFunctionMap.putAll(colName1, Lists.newArrayList(functions));
return aggregate(columnFunctionMap);
}

/**
* Applies the given aggregations to the given columns.
* The apply and combine steps of a split-apply-combine.
* Applies the given aggregations to the given columns. The apply and combine steps of a
* split-apply-combine.
*
* @param functions map from column name to aggregation to apply on that function
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
public Table aggregate(ListMultimap<String, AggregateFunction<?,?>> functions) {
@SuppressWarnings({"unchecked", "rawtypes"})
public Table aggregate(ListMultimap<String, AggregateFunction<?, ?>> functions) {
Preconditions.checkArgument(!getSlices().isEmpty());
Table groupTable = summaryTableName(sourceTable);
StringColumn groupColumn = StringColumn.create("Group");
groupTable.addColumns(groupColumn);
boolean firstFunction = true;
for (Map.Entry<String, Collection<AggregateFunction<?,?>>> entry : functions.asMap().entrySet()) {
for (Map.Entry<String, Collection<AggregateFunction<?, ?>>> entry :
functions.asMap().entrySet()) {
String columnName = entry.getKey();
for (AggregateFunction function : entry.getValue()) {
String colName = aggregateColumnName(columnName, function.functionName());
Expand Down
25 changes: 25 additions & 0 deletions core/src/test/java/tech/tablesaw/api/TextColumnTest.java
Expand Up @@ -381,6 +381,17 @@ public void testJoin() {
assertEquals(result.get(1), "run--walk--swim");
}

/**
 * Round-trips a StringColumn through a TextColumn and back, and checks that every value survives
 * the conversion unchanged.
 */
@Test
public void testAsStringColumn() throws Exception {
  Table names = Table.read().csv("../data/first_names.csv");
  StringColumn source = names.stringColumn("emma");
  // StringColumn -> TextColumn -> StringColumn
  TextColumn asText = source.asTextColumn();
  StringColumn roundTripped = asText.asStringColumn();
  int row = 0;
  while (row < names.rowCount()) {
    assertEquals(source.get(row), roundTripped.get(row));
    row++;
  }
}

@Test
public void testTrim() {
String[] words = {" running ", " run run run "};
Expand Down Expand Up @@ -427,6 +438,20 @@ public void tokenizeAndSort() {
assertEquals(result.get(1), "Backwards Writing");
}

/**
 * Checks that sorting a table on a TextColumn yields the same ordering of values as sorting on
 * the StringColumn it was created from.
 */
@Test
void testSort() throws Exception {
  Table t = Table.read().csv("../data/bush.csv");
  TextColumn whoText = t.stringColumn("who").asTextColumn();
  whoText.setName("who text");
  t.addColumns(whoText);
  Table t2 = t.copy();
  // sortAscendingOn returns the sorted table rather than reordering the receiver, so the
  // returned tables are the ones that must be compared; comparing t and t2 directly would only
  // compare two tables in their original (unsorted) order.
  Table sortedOnText = t.sortAscendingOn("who text");
  Table sortedOnString = t2.sortAscendingOn("who");
  assertEquals(sortedOnString.rowCount(), sortedOnText.rowCount());
  for (int i = 0; i < sortedOnText.rowCount(); i++) {
    assertEquals(
        sortedOnString.row(i).getString("who"), sortedOnText.row(i).getString("who text"));
  }
}

@Test
public void tokenizeAndSort1() {
String[] words = {"Stop,Breaking,Down", "Writing Backwards"};
Expand Down
13 changes: 13 additions & 0 deletions core/src/test/java/tech/tablesaw/table/TableSliceGroupTest.java
Expand Up @@ -28,6 +28,7 @@
import tech.tablesaw.api.NumericColumn;
import tech.tablesaw.api.StringColumn;
import tech.tablesaw.api.Table;
import tech.tablesaw.api.TextColumn;
import tech.tablesaw.io.csv.CsvReadOptions;

public class TableSliceGroupTest {
Expand Down Expand Up @@ -96,6 +97,18 @@ public void aggregate() {
assertEquals(aggregated.rowCount(), group.size());
}

/**
 * Groups the same table once by a TextColumn and once by the StringColumn it was derived from,
 * and checks that aggregating either grouping produces the same number of rows.
 */
@Test
public void testCreateWithTextColumn() {
  // Add a TextColumn twin of "who" so the table can be grouped on either column.
  TextColumn textTwin = table.stringColumn("who").asTextColumn();
  textTwin.setName("who text");
  table.addColumns(textTwin);

  TableSliceGroup groupedByText =
      StandardTableSliceGroup.create(table, table.categoricalColumn("who text"));
  TableSliceGroup groupedByString =
      StandardTableSliceGroup.create(table, table.categoricalColumn("who"));

  int rowsFromTextGrouping = groupedByText.aggregate("approval", exaggerate).rowCount();
  int rowsFromStringGrouping = groupedByString.aggregate("approval", exaggerate).rowCount();
  assertEquals(rowsFromTextGrouping, rowsFromStringGrouping);
}

@Test
public void aggregateWithMultipleColumns() {
table.addColumns(table.categoricalColumn("approval").copy().setName("approval2"));
Expand Down

0 comments on commit 52b654b

Please sign in to comment.