Skip to content

Commit

Permalink
[Enhancement] Choose the default field to be integer if no reals are …
Browse files Browse the repository at this point in the history
…present (#1409)

* Choose the default field to be integer for point layer if no reals are present

Signed-off-by: Shan He <heshan0131@gmail.com>
Co-authored-by: Isaac Brodsky <isaac@unfolded.ai>
  • Loading branch information
heshan0131 and Isaac Brodsky committed Mar 21, 2021
1 parent 072876d commit b97b58a
Show file tree
Hide file tree
Showing 2 changed files with 201 additions and 35 deletions.
166 changes: 151 additions & 15 deletions src/utils/dataset-utils.js
Expand Up @@ -91,29 +91,165 @@ export function createNewDataEntry({info, data, metadata}, datasets = {}) {
};
}

/**
 * Words that mark a field as a poor default metric when they appear at
 * the start or the end of its (lower-cased) name.
 *
 * A field matching one of these words is still eligible when its name
 * also starts or ends with a metric word.
 */
const EXCLUDED_DEFAULT_FIELDS = [
  // Identifiers and serial numbers
  ...['_id', 'id', 'index', 'uuid', 'guid', 'uid', 'gid', 'serial'],
  // Geographic identifiers, which are unlikely to be interesting to color by
  ...['zip', 'code', 'post', 'region', 'fips', 'cbgs', 'h3', 's2'],
  // Geographic coordinates (z/elevation/altitude are deliberately absent,
  // since those might be metrics)
  ...['lat', 'lon', 'lng', 'latitude', 'longitude', '_x', '_y']
];

/**
 * Words that mark a field as a metric when they appear at the start or
 * the end of its (lower-cased) name.
 *
 * Order matters: entries are listed from most preferred to least
 * preferred.
 */
const METRIC_DEFAULT_FIELDS = [
  // Generic metric names and aggregates
  ...['metric', 'value', 'sum', 'count', 'unique'],
  // Summary statistics
  ...['mean', 'mode', 'median', 'max', 'min', 'deviation', 'variance'],
  // Percentiles
  ...['p99', 'p95', 'p75', 'p50', 'p25', 'p05'],
  // Abbreviations are less preferred
  ...['cnt', 'val']
];

/**
 * Choose a field to use as the default color field of a layer.
 *
 * The heuristic is:
 *
 * First, keep only numeric (real or integer) fields. Drop fields that
 * are the lat or lng member of a field pair, fields whose name is
 * blank, and fields whose name starts or ends with an excluded word
 * unless the name also starts or ends with a metric word.
 *
 * Next, pick the best remaining field, comparing in this order:
 *  1. Fields matching a metric word come before fields that match
 *     none; among matching fields, the one whose metric word appears
 *     earlier in the preferred list wins.
 *  2. Real fields come before integer fields.
 *  3. Fields that come first in column order win. The field's `index`
 *     is used when it yields a usable difference; otherwise the
 *     field's position in `fields` decides.
 *
 * In the future we could consider other things:
 * look for highest dynamic range (using a sample of the data)
 *
 * It's possible no field will be chosen (i.e. because all fields
 * are strings.)
 *
 * @param {object} dataset
 * @param {Array<object>} dataset.fields - fields to choose from; `name`
 *   and `type` are read from each, plus `index` when present
 * @param {Array<object>} [dataset.fieldPairs] - entries whose
 *   `pair.lat.value` / `pair.lng.value` name coordinate fields
 * @returns {object|null} the chosen field, or null when nothing qualifies
 */
export function findDefaultColorField({fields, fieldPairs = []}) {
  // Position of the first word the name starts or ends with, or -1.
  const findAffixIndex = (words, name) =>
    words.findIndex(word => name.startsWith(word) || name.endsWith(word));

  const candidates = [];
  fields.forEach((field, position) => {
    if (field.type !== ALL_FIELD_TYPES.real && field.type !== ALL_FIELD_TYPES.integer) {
      // Only select numeric fields.
      return;
    }

    const isCoordinate = fieldPairs.some(
      ({pair}) => pair.lat.value === field.name || pair.lng.value === field.name
    );
    if (isCoordinate) {
      // Do not permit lat, lon fields
      return;
    }

    const normalizedFieldName = field.name.toLowerCase();
    if (normalizedFieldName === '') {
      // Special case excluded name when the name is blank.
      return;
    }

    const metricIndex = findAffixIndex(METRIC_DEFAULT_FIELDS, normalizedFieldName);
    const isExcluded = findAffixIndex(EXCLUDED_DEFAULT_FIELDS, normalizedFieldName) !== -1;
    if (isExcluded && metricIndex === -1) {
      // Excluded name with no metric word to rescue it.
      return;
    }

    candidates.push({
      field,
      position,
      // Fields without a metric word rank after every field that has one.
      metricRank: metricIndex === -1 ? Infinity : metricIndex
    });
  });

  // Negative when `left` is the better default, positive when `right` is.
  const compareCandidates = (left, right) => {
    if (left.metricRank !== right.metricRank) {
      // Compare based on order in the inclusion list
      return left.metricRank < right.metricRank ? -1 : 1;
    }

    if (left.field.type !== right.field.type) {
      // Reals come before integers
      return left.field.type === ALL_FIELD_TYPES.real ? -1 : 1;
    }

    // Finally, order based on the order in the dataset's columns. The
    // subtraction is NaN when a field carries no numeric `index`; NaN and 0
    // are both falsy, so the position in `fields` decides in those cases.
    return (left.field.index - right.field.index) || (left.position - right.position);
  };

  // Only the best match is needed, so take a single-pass minimum. The strict
  // comparison keeps the earliest candidate on ties.
  let best = null;
  for (const candidate of candidates) {
    if (best === null || compareCandidates(candidate, best) < 0) {
      best = candidate;
    }
  }

  // No matches yields null
  return best === null ? null : best.field;
}
70 changes: 50 additions & 20 deletions test/node/utils/dataset-utils-test.js
Expand Up @@ -25,27 +25,57 @@ import {processCsvData} from 'processors/data-processor';

import csvData from 'test/fixtures/test-layer-data';

/**
 * Table of inputs for the findDefaultColorField test.
 *
 * Each case gives a label (`name`), the CSV text to process (`csv`) and
 * the name of the field expected to be chosen (`expected`), or null
 * when no field should be chosen.
 */
const DEFAULT_FIELD_TEST_CASES = [
  {
    name: 'excluded lat',
    csv: csvData,
    expected: 'trip_distance'
  },
  {
    name: 'empty',
    csv: 'a\na',
    expected: null
  },
  {
    name: 'integer only',
    csv: 'a,b\na,0\na,1',
    expected: 'b'
  },
  {
    name: 'integer and real',
    csv: 'a,b,c\na,0,0.5\na,1,0.5',
    expected: 'c'
  },
  {
    name: 'excluded real',
    csv: 'zipcode,b,c\n0.5,0,0.5\n0.5,1,0.5',
    expected: 'c'
  },
  {
    name: 'included real',
    csv: 'zipcode mean,b,c\n0.5,0,0.5\n0.5,1,0.5',
    expected: 'zipcode mean'
  },
  {
    name: 'included real, with inclusion ordering',
    csv: 'zipcode mean,a metric,b,c\n0.5,0.1,0,0.5\n0.5,0.1,1,0.5',
    expected: 'a metric'
  }
];

// Runs every table case through createNewDataEntry + findDefaultColorField.
test('datasetUtils.findDefaultColorField', t => {
  for (const tc of DEFAULT_FIELD_TEST_CASES) {
    const dataset = createNewDataEntry({
      info: {id: 'taro'},
      data: processCsvData(tc.csv)
    }).taro;

    const defaultField = findDefaultColorField(dataset);
    if (!tc.expected) {
      t.notOk(defaultField, `${tc.name}: default field is null`);
    } else {
      // Guard the property access so a missing field reports a failed
      // assertion for this case instead of throwing a TypeError.
      t.equals(
        defaultField ? defaultField.name : null,
        tc.expected,
        `${tc.name}: default field name is OK`
      );
    }
  }
  t.end();
});

0 comments on commit b97b58a

Please sign in to comment.