Skip to content

Commit

Permalink
issue-42 All-numeric string fields containing leading zeros should be…
Browse files Browse the repository at this point in the history
… left as string types

The theory is that the data may represent ID values of some sort.

Signed-off-by: Dan S. Camper <dan.camper@lexisnexisrisk.com>
  • Loading branch information
dcamper committed May 23, 2019
1 parent 7ece9d9 commit db2a279
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 12 deletions.
45 changes: 33 additions & 12 deletions Profile.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -229,12 +229,12 @@ EXPORT Profile(inFile,
// Useful functions for pattern mapping
LOCAL MapUpperCharStr(STRING s) := REGEXREPLACE('[[:upper:]]', s, 'A');
LOCAL MapLowerCharStr(STRING s) := REGEXREPLACE('[[:lower:]]', s, 'a');
LOCAL MapDigitStr(STRING s) := REGEXREPLACE('[[:digit:]]', s, '9');
LOCAL MapDigitStr(STRING s) := REGEXREPLACE('[1-9]', s, '9'); // Leave '0' as-is and replace with '9' later
LOCAL MapAllStr(STRING s) := MapDigitStr(MapLowerCharStr(MapUpperCharStr(s)));

LOCAL MapUpperCharUni(UNICODE s) := REGEXREPLACE(u'[[:upper:]]', s, u'A');
LOCAL MapLowerCharUni(UNICODE s) := REGEXREPLACE(u'[[:lower:]]', s, u'a');
LOCAL MapDigitUni(UNICODE s) := REGEXREPLACE(u'[[:digit:]]', s, u'9');
LOCAL MapDigitUni(UNICODE s) := REGEXREPLACE(u'[1-9]', s, u'9'); // Leave '0' as-is and replace with '9' later
LOCAL MapAllUni(UNICODE s) := (STRING)MapDigitUni(MapLowerCharUni(MapUpperCharUni(s)));

LOCAL TrimmedStr(STRING s) := TRIM(s, LEFT, RIGHT);
Expand Down Expand Up @@ -507,14 +507,15 @@ EXPORT Profile(inFile,
ExpNotation = 8
);

LOCAL DataTypeEnum BestTypeFlag(STRING dataPattern) := FUNCTION
isSignedInteger := REGEXFIND('^\\-9{1,19}$', dataPattern);
isShortUnsignedInteger := REGEXFIND('^9{1,19}$', dataPattern);
isUnsignedInteger := REGEXFIND('^\\+?9{1,20}$', dataPattern);
isFloatingPoint := REGEXFIND('^(\\-|\\+)?9{0,15}\\.9{1,15}$', dataPattern);
isExpNotation := REGEXFIND('^(\\-|\\+)?9\\.9{1,6}a\\-9{1,3}$', dataPattern, NOCASE);
LOCAL DataTypeEnum BestTypeFlag(STRING dataPattern, AttributeType_t attributeType) := FUNCTION
// TRUE only when a leading '0' is immediately followed by another digit
// (e.g. pattern '0900'). A lone '0' or a fraction such as '0.9' starts with
// zero but is an ordinary numeric value, not a zero-padded identifier, so it
// must not be forced to remain a string.
hasLeadingZeros := REGEXFIND('^0[09]', dataPattern);
isSignedInteger := REGEXFIND('^\\-[09]{1,19}$', dataPattern);
isShortUnsignedInteger := REGEXFIND('^[09]{1,19}$', dataPattern);
isUnsignedInteger := REGEXFIND('^\\+?[09]{1,20}$', dataPattern);
isFloatingPoint := REGEXFIND('^(\\-|\\+)?[09]{0,15}\\.[09]{1,15}$', dataPattern);
isExpNotation := REGEXFIND('^(\\-|\\+)?[09]\\.[09]{1,6}[aA]\\-[09]{1,3}$', dataPattern);

RETURN MAP
stringWithNumbersType := MAP
(
isSignedInteger => DataTypeEnum.SignedInteger | DataTypeEnum.FloatingPoint | DataTypeEnum.ExpNotation,
isShortUnsignedInteger => DataTypeEnum.SignedInteger | DataTypeEnum.UnsignedInteger | DataTypeEnum.FloatingPoint | DataTypeEnum.ExpNotation,
Expand All @@ -523,6 +524,15 @@ EXPORT Profile(inFile,
isExpNotation => DataTypeEnum.ExpNotation,
DataTypeEnum.AsIs
);

bestType := MAP
(
REGEXFIND('(integer)|(unsigned)|(decimal)|(real)|(boolean)', attributeType) => DataTypeEnum.AsIs,
hasLeadingZeros => DataTypeEnum.AsIs,
stringWithNumbersType
);

RETURN bestType;
END;

// Estimate integer size from readable data length
Expand All @@ -536,12 +546,12 @@ EXPORT Profile(inFile,
given_attribute_type,
data_pattern,
data_length,
DataTypeEnum type_flag := BestTypeFlag(TRIM(data_pattern))
DataTypeEnum type_flag := BestTypeFlag(TRIM(data_pattern), given_attribute_type)

},
attribute, given_attribute_type, data_pattern, data_length,
MERGE
);
) : ONWARNING(2168, IGNORE);

LOCAL attributesWithTypeFlagsSummary := AGGREGATE
(
Expand Down Expand Up @@ -824,9 +834,20 @@ EXPORT Profile(inFile,

// Count data patterns used per attribute; extract the most common and
// most rare, taking care to not allow the two to overlap
// The digit-mapping helpers leave '0' characters untouched in data_pattern
// (only 1-9 become '9'). Normalize the remaining zeros to '9' here so
// that the pattern statistics below treat every digit alike; all other
// fields are copied through unchanged.
LOCAL dataPatternStats0 := PROJECT
(
filledDataInfo,
TRANSFORM
(
RECORDOF(LEFT),
SELF.data_pattern := Std.Str.FindReplace(LEFT.data_pattern, '0', '9'),
SELF := LEFT
)
);

LOCAL dataPatternStats := TABLE
(
DISTRIBUTE(filledDataInfo, HASH32(attribute)),
DISTRIBUTE(dataPatternStats0, HASH32(attribute)),
{
attribute,
data_pattern,
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ level, such as within your "My Files" folder.
|1.3.5|Fix ordering of output in BestRecordStructure when TRANSFORM is emitted|
|1.4.0|Automatically include improved visual results of Profile, including data distribution graphs (within workunit's Resources tab)|
|1.4.1|Regression: Fix self-tests that were failing due to changes in v1.3.4|
|1.4.2|String fields containing all numerics with leading zeros are now marked as string in best\_attribute\_type|

<a name="profile"></a>
### Profile
Expand Down
25 changes: 25 additions & 0 deletions Tests.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -474,4 +474,29 @@ EXPORT Tests := MODULE
ASSERT(COUNT(Embedded_Child1_Profile(attribute = 'foo.z')[1].numeric_correlations) = 2),
ASSERT(TRUE)
];

//--------------------------------------------------------------------------
// Test string fields containing numerics with leading zeros (issue 42)
//--------------------------------------------------------------------------

// Two-row inline dataset of all-numeric strings used by the issue 42 tests.
//   s1, s3 : every value begins with '0'
//   s2, s4 : one value begins with '0', the other does not
//   s5     : no value begins with '0' (one is negative, with the zero
//            following the minus sign)
SHARED Leading_Zeros := DATASET
(
[
{'0100', '1234', '0001', '7809', '-0600'},
{'0020', '0001', '0023', '0001', '600'}
],
{STRING s1, STRING s2, STRING s3, STRING s4, STRING s5}
);

// Profile the leading-zeros dataset, requesting only the 'best_ecl_types'
// feature (the assertions below read best_attribute_type).
// NOTE(review): NOFOLD is presumably here to stop the compiler from
// constant-folding the inline dataset -- confirm.
SHARED Leading_Zeros_Profile := DataPatterns.Profile(NOFOLD(Leading_Zeros), features := 'best_ecl_types');

// Assertions for issue 42: any field in which at least one value starts
// with '0' (s1-s4) is expected to keep a string type, while s5, which has
// no value starting with '0', is expected to be typed as an integer.
EXPORT Test_Leading_Zeros_Profile :=
[
ASSERT(ValueForAttr(Leading_Zeros_Profile, 's1', best_attribute_type) = 'string4'),
ASSERT(ValueForAttr(Leading_Zeros_Profile, 's2', best_attribute_type) = 'string4'),
ASSERT(ValueForAttr(Leading_Zeros_Profile, 's3', best_attribute_type) = 'string4'),
ASSERT(ValueForAttr(Leading_Zeros_Profile, 's4', best_attribute_type) = 'string4'),
ASSERT(ValueForAttr(Leading_Zeros_Profile, 's5', best_attribute_type) = 'integer3'),
ASSERT(TRUE)
];
END;

0 comments on commit db2a279

Please sign in to comment.