From 52b654bbddee1377ea2d4bfc1ee6717913b6d97d Mon Sep 17 00:00:00 2001 From: Larry White Date: Sat, 3 Aug 2019 14:39:04 -0400 Subject: [PATCH] Fix bug: table.splitOn fails when the categorical column is a TextColumn #554 Convert any textColumns to stringColumns before splitting tables to allow them to be used for subgroups, etc. --- .../tech/tablesaw/table/TableSliceGroup.java | 80 ++++++++++++++----- .../tech/tablesaw/api/TextColumnTest.java | 25 ++++++ .../tablesaw/table/TableSliceGroupTest.java | 13 +++ 3 files changed, 100 insertions(+), 18 deletions(-) diff --git a/core/src/main/java/tech/tablesaw/table/TableSliceGroup.java b/core/src/main/java/tech/tablesaw/table/TableSliceGroup.java index 537e0e239..b6f456200 100644 --- a/core/src/main/java/tech/tablesaw/table/TableSliceGroup.java +++ b/core/src/main/java/tech/tablesaw/table/TableSliceGroup.java @@ -36,10 +36,12 @@ */ public class TableSliceGroup implements Iterable { - // A string that is used internally as a delimiter in creating a column name from all the grouping columns + // A string that is used internally as a delimiter in creating a column name from all the grouping + // columns protected static final String SPLIT_STRING = "~~~"; - // A function that splits the group column name back into the original column names for the grouping columns + // A function that splits the group column name back into the original column names for the + // grouping columns private static final Splitter SPLITTER = Splitter.on(SPLIT_STRING); // The list of slices or views over the source table that I contain @@ -54,16 +56,55 @@ public class TableSliceGroup implements Iterable { * Returns an instance for calculating a single summary for the given table, with no sub-groupings */ protected TableSliceGroup(Table original) { - sourceTable = original; + if (containsAnyTextColumns(original)) { + sourceTable = original.copy(); + replaceTextColumnsWithStringColumns(); + } else { + sourceTable = original; + } splitColumnNames = new 
String[0]; } + private boolean containsAnyTextColumns(Table original) { + for (Column column : original.columns()) { + if (column.type().equals(ColumnType.TEXT)) { + return true; + } + } + return false; + } + + /** + * Replace any textColumns in the table with stringColumns. We do this because TextColumns don't + * split correctly: The split algorithm uses a byte[] version of the elements to do its magic, + * and text columns have variable sized strings, so variable sized byte arrays. Determining the + * correct array size (maybe the largest array size for the array?) would be somewhat fraught + * because the size depends on the encoding and the strings do not know their own encoding. This + * would need to be detected using a 3rd party library. + * + *

So replace with the equivalent stringColumn instead. + */ + private void replaceTextColumnsWithStringColumns() { + for (int i = 0; i < sourceTable.columnCount(); i++) { + if (sourceTable.column(i).type().equals(ColumnType.TEXT)) { + String originalName = sourceTable.column(i).name(); + sourceTable.replaceColumn(i, sourceTable.textColumn(i).asStringColumn()); + sourceTable.column(i).setName(originalName); + } + } + } + /** - * Returns an instance for calculating subgroups, - * one for each combination of the given groupColumnNames that appear in the source table + * Returns an instance for calculating subgroups, one for each combination of the given + * groupColumnNames that appear in the source table */ protected TableSliceGroup(Table sourceTable, String[] groupColumnNames) { - this.sourceTable = sourceTable; + if (containsAnyTextColumns(sourceTable)) { + this.sourceTable = sourceTable.copy(); + replaceTextColumnsWithStringColumns(); + } else { + this.sourceTable = sourceTable; + } this.splitColumnNames = groupColumnNames; } @@ -103,8 +144,8 @@ public int size() { } /** - * For a subtable that is grouped by the values in more than one column, split the grouping column into separate - * cols and return the revised view + * For a subtable that is grouped by the values in more than one column, split the grouping column + * into separate cols and return the revised view */ private Table splitGroupingColumn(Table groupTable) { @@ -115,7 +156,8 @@ private Table splitGroupingColumn(Table groupTable) { Column newColumn = column.emptyCopy(); newColumns.add(newColumn); } - // iterate through the rows in the table and split each of the grouping columns into multiple columns + // iterate through the rows in the table and split each of the grouping columns into multiple + // columns for (int row = 0; row < groupTable.rowCount(); row++) { List strings = SPLITTER.splitToList(groupTable.stringColumn("Group").get(row)); for (int col = 0; col < newColumns.size(); col++) { @@ -132,29 
+174,31 @@ private Table splitGroupingColumn(Table groupTable) { } /** - * Applies the given aggregation to the given column. - * The apply and combine steps of a split-apply-combine. + * Applies the given aggregation to the given column. The apply and combine steps of a + * split-apply-combine. */ - public Table aggregate(String colName1, AggregateFunction... functions) { - ArrayListMultimap> columnFunctionMap = ArrayListMultimap.create(); + public Table aggregate(String colName1, AggregateFunction... functions) { + ArrayListMultimap> columnFunctionMap = + ArrayListMultimap.create(); columnFunctionMap.putAll(colName1, Lists.newArrayList(functions)); return aggregate(columnFunctionMap); } /** - * Applies the given aggregations to the given columns. - * The apply and combine steps of a split-apply-combine. + * Applies the given aggregations to the given columns. The apply and combine steps of a + * split-apply-combine. * * @param functions map from column name to aggregation to apply on that function */ - @SuppressWarnings({ "unchecked", "rawtypes" }) - public Table aggregate(ListMultimap> functions) { + @SuppressWarnings({"unchecked", "rawtypes"}) + public Table aggregate(ListMultimap> functions) { Preconditions.checkArgument(!getSlices().isEmpty()); Table groupTable = summaryTableName(sourceTable); StringColumn groupColumn = StringColumn.create("Group"); groupTable.addColumns(groupColumn); boolean firstFunction = true; - for (Map.Entry>> entry : functions.asMap().entrySet()) { + for (Map.Entry>> entry : + functions.asMap().entrySet()) { String columnName = entry.getKey(); for (AggregateFunction function : entry.getValue()) { String colName = aggregateColumnName(columnName, function.functionName()); diff --git a/core/src/test/java/tech/tablesaw/api/TextColumnTest.java b/core/src/test/java/tech/tablesaw/api/TextColumnTest.java index 377454610..6f3dd4952 100644 --- a/core/src/test/java/tech/tablesaw/api/TextColumnTest.java +++ 
b/core/src/test/java/tech/tablesaw/api/TextColumnTest.java @@ -381,6 +381,17 @@ public void testJoin() { assertEquals(result.get(1), "run--walk--swim"); } + @Test + public void testAsStringColumn() throws Exception { + Table table = Table.read().csv("../data/first_names.csv"); + StringColumn name = table.stringColumn("emma"); + TextColumn name2 = name.asTextColumn(); + StringColumn name3 = name2.asStringColumn(); + for (int i = 0; i < table.rowCount(); i++) { + assertEquals(name.get(i), name3.get(i)); + } + } + @Test public void testTrim() { String[] words = {" running ", " run run run "}; @@ -427,6 +438,20 @@ public void tokenizeAndSort() { assertEquals(result.get(1), "Backwards Writing"); } + @Test + void testSort() throws Exception { + Table t = Table.read().csv("../data/bush.csv"); + TextColumn whoText = t.stringColumn("who").asTextColumn(); + whoText.setName("who text"); + t.addColumns(whoText); + Table t2 = t.copy(); + t.sortAscendingOn("who text"); + t2.sortAscendingOn("who"); + for (int i = 0; i < t.rowCount(); i++) { + assertEquals(t.row(i).getString("who text"), t2.row(i).getString("who")); + } + } + @Test public void tokenizeAndSort1() { String[] words = {"Stop,Breaking,Down", "Writing Backwards"}; diff --git a/core/src/test/java/tech/tablesaw/table/TableSliceGroupTest.java b/core/src/test/java/tech/tablesaw/table/TableSliceGroupTest.java index ac1adb0c4..bf0181962 100644 --- a/core/src/test/java/tech/tablesaw/table/TableSliceGroupTest.java +++ b/core/src/test/java/tech/tablesaw/table/TableSliceGroupTest.java @@ -28,6 +28,7 @@ import tech.tablesaw.api.NumericColumn; import tech.tablesaw.api.StringColumn; import tech.tablesaw.api.Table; +import tech.tablesaw.api.TextColumn; import tech.tablesaw.io.csv.CsvReadOptions; public class TableSliceGroupTest { @@ -96,6 +97,18 @@ public void aggregate() { assertEquals(aggregated.rowCount(), group.size()); } + @Test + public void testCreateWithTextColumn() { + TextColumn whoText = 
table.stringColumn("who").asTextColumn(); + whoText.setName("who text"); + table.addColumns(whoText); + TableSliceGroup group1 = StandardTableSliceGroup.create(table, table.categoricalColumn("who text")); + TableSliceGroup group2 = StandardTableSliceGroup.create(table, table.categoricalColumn("who")); + Table aggregated1 = group1.aggregate("approval", exaggerate); + Table aggregated2 = group2.aggregate("approval", exaggerate); + assertEquals(aggregated1.rowCount(), aggregated2.rowCount()); + } + @Test public void aggregateWithMultipleColumns() { table.addColumns(table.categoricalColumn("approval").copy().setName("approval2"));