Skip to content

Commit

Permalink
Fix UTF-8 BOM strip for double-quoted header lines
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Jan 1, 2018
1 parent b815aae commit 7d19697
Show file tree
Hide file tree
Showing 8 changed files with 99 additions and 35 deletions.
15 changes: 9 additions & 6 deletions c/input/lrec_reader_mmap_csv.c
Expand Up @@ -175,6 +175,14 @@ static void lrec_reader_mmap_csv_sof(void* pvstate, void* pvhandle) {
lrec_reader_mmap_csv_state_t* pstate = pvstate;
pstate->ilno = 0LL;
pstate->expect_header_line_next = pstate->use_implicit_header ? FALSE : TRUE;

// Strip UTF-8 BOM if any
file_reader_mmap_state_t* phandle = pvhandle;
if ((phandle->eof - phandle->sol) >= 3) {
if (memcmp(phandle->sol, "\xef\xbb\xbf", 3) == 0) {
phandle->sol += 3;
}
}
}

// ----------------------------------------------------------------
Expand Down Expand Up @@ -208,12 +216,7 @@ static lrec_t* lrec_reader_mmap_csv_process(void* pvstate, void* pvhandle, conte
}
// Transfer pointer-free responsibility from the rslls to the
// header fields in the header keeper
if (string_starts_with(pe->value, "\xef\xbb\xbf")) {
// Strip UTF-8 BOM if any
slls_append(pheader_fields, mlr_strdup_or_die(&pe->value[3]), FREE_ENTRY_VALUE);
} else {
slls_append(pheader_fields, pe->value, pe->free_flag);
}
slls_append(pheader_fields, pe->value, pe->free_flag);
pe->free_flag = 0;
}
rslls_reset(pstate->pfields);
Expand Down
61 changes: 45 additions & 16 deletions c/input/lrec_reader_stdio_csv.c
Expand Up @@ -45,6 +45,12 @@
#define DQUOTE_IFS_STRIDX 0x2007
#define DQUOTE_EOF_STRIDX 0x2008
#define DQUOTE_DQUOTE_STRIDX 0x2009
#define UTF8_BOM_STRIDX 0x200b

#define UTF8_BOM "\xef\xbb\xbf"
#define UTF8_BOM_LENGTH 3

//#define DEBUG_PARSER

// ----------------------------------------------------------------
typedef struct _lrec_reader_stdio_csv_state_t {
Expand Down Expand Up @@ -73,6 +79,7 @@ typedef struct _lrec_reader_stdio_csv_state_t {
byte_reader_t* pbr;
peek_file_reader_t* pfr;

parse_trie_t* putf8_bom_parse_trie;
parse_trie_t* pno_dquote_parse_trie;
parse_trie_t* pdquote_parse_trie;

Expand All @@ -87,7 +94,7 @@ static void lrec_reader_stdio_csv_free(lrec_reader_t* preader);
static void lrec_reader_stdio_csv_sof(void* pvstate, void* pvhandle);
static lrec_t* lrec_reader_stdio_csv_process(void* pvstate, void* pvhandle, context_t* pctx);
static int lrec_reader_stdio_csv_get_fields(lrec_reader_stdio_csv_state_t* pstate, rslls_t* pfields,
context_t* pctx);
context_t* pctx, int is_header);
static lrec_t* paste_indices_and_data(lrec_reader_stdio_csv_state_t* pstate, rslls_t* pdata_fields,
context_t* pctx);
static lrec_t* paste_header_and_data(lrec_reader_stdio_csv_state_t* pstate, rslls_t* pdata_fields,
Expand Down Expand Up @@ -123,13 +130,20 @@ lrec_reader_t* lrec_reader_stdio_csv_alloc(char* irs, char* ifs, int use_implici

pstate->dquotelen = strlen(pstate->dquote);


// Parse trie for UTF-8 BOM
pstate->putf8_bom_parse_trie = parse_trie_alloc();
parse_trie_add_string(pstate->putf8_bom_parse_trie, UTF8_BOM, UTF8_BOM_STRIDX);

// Parse trie for non-double-quoted fields
pstate->pno_dquote_parse_trie = parse_trie_alloc();
parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->eof, EOF_STRIDX);
parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->irs, IRS_STRIDX);
parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->ifs_eof, IFS_EOF_STRIDX);
parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->ifs, IFS_STRIDX);
parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->dquote, DQUOTE_STRIDX);

// Parse trie for double-quoted fields
pstate->pdquote_parse_trie = parse_trie_alloc();
if (pstate->do_auto_line_term) {
pstate->dquote_irs = mlr_paste_2_strings("\"", "\n");
Expand All @@ -147,10 +161,13 @@ lrec_reader_t* lrec_reader_stdio_csv_alloc(char* irs, char* ifs, int use_implici
parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_eof, DQUOTE_EOF_STRIDX);
parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_dquote, DQUOTE_DQUOTE_STRIDX);


pstate->pfields = rslls_alloc();
pstate->psb = sb_alloc(STRING_BUILDER_INIT_SIZE);
pstate->pbr = stdio_byte_reader_alloc();
pstate->pfr = pfr_alloc(pstate->pbr, mlr_imax2(pstate->pno_dquote_parse_trie->maxlen,
pstate->pfr = pfr_alloc(pstate->pbr, mlr_imax3(
pstate->putf8_bom_parse_trie->maxlen,
pstate->pno_dquote_parse_trie->maxlen,
pstate->pdquote_parse_trie->maxlen));

pstate->expect_header_line_next = use_implicit_header ? FALSE : TRUE;
Expand All @@ -177,6 +194,7 @@ static void lrec_reader_stdio_csv_free(lrec_reader_t* preader) {
}
lhmslv_free(pstate->pheader_keepers);
pfr_free(pstate->pfr);
parse_trie_free(pstate->putf8_bom_parse_trie);
parse_trie_free(pstate->pno_dquote_parse_trie);
parse_trie_free(pstate->pdquote_parse_trie);
rslls_free(pstate->pfields);
Expand Down Expand Up @@ -204,7 +222,7 @@ static lrec_t* lrec_reader_stdio_csv_process(void* pvstate, void* pvhandle, cont
// Ingest the next header line, if expected
if (pstate->expect_header_line_next) {
while (TRUE) {
if (!lrec_reader_stdio_csv_get_fields(pstate, pstate->pfields, pctx))
if (!lrec_reader_stdio_csv_get_fields(pstate, pstate->pfields, pctx, TRUE))
return NULL;
pstate->ilno++;

Expand All @@ -227,14 +245,7 @@ static lrec_t* lrec_reader_stdio_csv_process(void* pvstate, void* pvhandle, cont
}
// Transfer pointer-free responsibility from the rslls to the
// header fields in the header keeper
if (string_starts_with(pe->value, "\xef\xbb\xbf")) {
// Strip UTF-8 BOM if any
slls_append(pheader_fields, mlr_strdup_or_die(&pe->value[3]), FREE_ENTRY_VALUE);
if (pe->free_flag & FREE_ENTRY_VALUE)
free(pe->value);
} else {
slls_append(pheader_fields, pe->value, pe->free_flag);
}
slls_append(pheader_fields, pe->value, pe->free_flag);
pe->free_flag = 0;
}
rslls_reset(pstate->pfields);
Expand All @@ -255,7 +266,7 @@ static lrec_t* lrec_reader_stdio_csv_process(void* pvstate, void* pvhandle, cont

// Ingest the next data line, if expected
while (TRUE) {
int rc = lrec_reader_stdio_csv_get_fields(pstate, pstate->pfields, pctx);
int rc = lrec_reader_stdio_csv_get_fields(pstate, pstate->pfields, pctx, FALSE);
pstate->ilno++;
if (rc == FALSE) // EOF
return NULL;
Expand All @@ -278,7 +289,7 @@ static lrec_t* lrec_reader_stdio_csv_process(void* pvstate, void* pvhandle, cont
}

static int lrec_reader_stdio_csv_get_fields(lrec_reader_stdio_csv_state_t* pstate, rslls_t* pfields,
context_t* pctx)
context_t* pctx, int is_header)
{
int rc, stridx, matchlen, record_done, field_done;
peek_file_reader_t* pfr = pstate->pfr;
Expand All @@ -289,11 +300,29 @@ static int lrec_reader_stdio_csv_get_fields(lrec_reader_stdio_csv_state_t* pstat
if (pfr_peek_char(pfr) == (char)EOF) // char defaults to unsigned on some platforms
return FALSE;

// loop over fields in record
// Strip the UTF-8 BOM, if any. This is MUCH simpler for mmap, and for stdio on files. For mmap
// we can test the first 3 bytes, then skip past them or not. For stdio on files we can fread
// the first 3 bytes, then rewind the fp if they're not the UTF-8 BOM. But for stdio on stdin
// (which is the primary reason we support stdio in Miller), we cannot rewind: stdin is not
// rewindable. So instead we buffer the leading bytes in the peek-file reader, match them
// against the BOM parse trie, and advance past them only if they match.
if (is_header) {
pfr_buffer_by(pfr, UTF8_BOM_LENGTH);
int rc = parse_trie_ring_match(pstate->putf8_bom_parse_trie,
pfr->peekbuf, pfr->sob, pfr->npeeked, pfr->peekbuflenmask,
&stridx, &matchlen);
#ifdef DEBUG_PARSER
printf("RC=%d stridx=0x%04x matchlen=%d\n", rc, stridx, matchlen);
#endif
if (rc == TRUE && stridx == UTF8_BOM_STRIDX) {
pfr_advance_by(pfr, matchlen);
}
}

// Loop over fields in record
record_done = FALSE;
while (!record_done) {
// Assumption is dquote is "\""
if (pfr_peek_char(pfr) != pstate->dquote[0]) {
if (pfr_peek_char(pfr) != pstate->dquote[0]) { // NOT DOUBLE-QUOTED

// Loop over characters in field
field_done = FALSE;
Expand Down Expand Up @@ -365,7 +394,7 @@ static int lrec_reader_stdio_csv_get_fields(lrec_reader_stdio_csv_state_t* pstat
}
}

} else {
} else { // DOUBLE-QUOTED
pfr_advance_by(pfr, pstate->dquotelen);

// loop over characters in field
Expand Down
5 changes: 5 additions & 0 deletions c/lib/mlrutil.c
Expand Up @@ -362,6 +362,11 @@ int mlr_imax2(int a, int b) {
return b;
}

// ----------------------------------------------------------------
// Returns the largest of the three ints a, b, and c, computed as two
// pairwise maxima using mlr_imax2.
int mlr_imax3(int a, int b, int c) {
	int max_of_b_and_c = mlr_imax2(b, c);
	return mlr_imax2(a, max_of_b_and_c);
}

// ----------------------------------------------------------------
int power_of_two_above(int n) {
n |= (n >> 1);
Expand Down
1 change: 1 addition & 0 deletions c/lib/mlrutil.h
Expand Up @@ -132,6 +132,7 @@ int string_starts_with(char* string, char* prefix);
int string_ends_with(char* string, char* suffix, int* pstringlen);

int mlr_imax2(int a, int b);
int mlr_imax3(int a, int b, int c);
int power_of_two_above(int n);

// The caller should free the return value. Maps two-character sequences such as
Expand Down
10 changes: 10 additions & 0 deletions c/reg_test/expected/out
Expand Up @@ -44858,6 +44858,16 @@ a b c
1 2 3
4 5 6

mlr --icsv --opprint cat ./reg_test/input/bom-dquote-header.csv
a b c
1 2 3
4 5 6

mlr --icsv --opprint cat
a b c
1 2 3
4 5 6


================================================================
MMAP AT PAGE BOUNDARIES
Expand Down
1 change: 1 addition & 0 deletions c/reg_test/input/Makefile.am
Expand Up @@ -19,6 +19,7 @@ EXTRA_DIST= \
b.csv \
b.pprint \
bom.csv \
bom-dquote-header.csv \
braced.csv \
c.csv \
c.pprint \
Expand Down
2 changes: 2 additions & 0 deletions c/reg_test/run
Expand Up @@ -5119,6 +5119,8 @@ announce UTF-8 BOM

run_mlr --icsv --opprint cat $indir/bom.csv
run_mlr --icsv --opprint cat < $indir/bom.csv
run_mlr --icsv --opprint cat $indir/bom-dquote-header.csv
run_mlr --icsv --opprint cat < $indir/bom-dquote-header.csv

# ----------------------------------------------------------------
announce MMAP AT PAGE BOUNDARIES
Expand Down
39 changes: 26 additions & 13 deletions c/todo.txt
Expand Up @@ -3,6 +3,15 @@ BUGFIXES

* CSV w/ BOM w/ double-quoted header line

$ mlr --pprint put '@quantity_sum += $quanity; @rate_sum += $rate; end {emit @quantity_sum, @rate_sum}' < a
color shape flag index quantity rate
yellow triangle 1 11 43.6498 9.8870
red square 1 15 79.2778 0.0130
red circle 1 16 13.8103 2.9010

$ mlr --pprint put '@quantity_sum += $quanity; @rate_sum += $rate; end {emit (@quantity_sum, @rate_sum)}' < a
Segmentation fault: 11

* near-overflow case:
$ mlr sort -n x reg_test/input/near-ovf.dkvp
x=9223372036854775807,y=-9223372036854775803
Expand All @@ -14,13 +23,26 @@ BUGFIXES
x=9223372036854775805,y=-9223372036854775802

================================================================
FUNDAM:
5.3.0 TO DO:

* synctool alias/flag handling ...
* CSV BOM-strip w/ dquote header
* comment pass-through option

? fromhex ?
! --skip-comments
! count-similar
-> ut
! stats1 count-unique ... uniq -c -ish
k ruby/python/etc. dkvp-reader/writers, and example code
* FAQ for https://github.com/johnkerl/miller/issues/150
* int-overflowing arithmetic operators
* fix https://github.com/johnkerl/miller/issues/158
* fix https://github.com/johnkerl/miller/issues/159

================================================================
COMMENTS:

! pass-through variant
k mlrcli
k work into lrec_readers API calls, stubbed
* impl off one at a time
Expand All @@ -45,18 +67,9 @@ k work into lrec_readers API calls, stubbed
input/lrec_reader_stdio_xtab.c:8

================================================================
5.3.0 TO-DO:
FUNDAM:

? fromhex ?
! --skip-comments
! count-similar
-> ut
! stats1 count-unique ... uniq -c -ish
k ruby/python/etc. dkvp-reader/writers, and example code
* FAQ for https://github.com/johnkerl/miller/issues/150
* int-overflowing arithmetic operators
* fix https://github.com/johnkerl/miller/issues/158
* fix https://github.com/johnkerl/miller/issues/159
* synctool alias/flag handling ...

----------------------------------------------------------------
! dirname/basename functions !
Expand Down

0 comments on commit 7d19697

Please sign in to comment.