/
parse.ts
577 lines (507 loc) · 18.2 KB
/
parse.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
// Copyright (c) Jupyter Development Team.
// Distributed under the terms of the Modified BSD License.
/*
Possible options to add to the parser:
- Optional offsets array to modify, so we don't need to create a new offsets list (we would need to be careful not to overwrite things if a row needs to be truncated.)
- Comment character at the start of the line
- Skip empty whitespace lines
- Skip rows with empty columns
- Logging an error for too many or too few fields on a line
- Ignore whitespace around delimiters
- Add an exported function in this file for getting a field from the returned offsets array (including stripping field or row delimiters and parsing quoted data). Right now this logic is in the DSVModel. Likely we want to keep the logic there for speed, but having it here as well will make the parser more self-contained and usable by others.
- Sanity check on field size, with an error if the field exceeds the size
- Tests against https://github.com/maxogden/csv-spectrum
- Benchmark against https://www.npmjs.com/package/csv-parser and https://www.npmjs.com/package/csv-string and fast-csv.
*/
/**
* Interface for a delimiter-separated data parser.
*
* @param options The parser options
* @returns An object giving the offsets for the rows or columns parsed.
*
* #### Notes
* The parsers are based on [RFC 4180](https://tools.ietf.org/html/rfc4180).
*/
export type IParser = (options: IParser.IOptions) => IParser.IResults;
export namespace IParser {
/**
* The options for a parser.
*/
export interface IOptions {
/**
* The data to parse.
*/
data: string;
/**
* Whether to return column offsets in the offsets array.
*
* #### Notes
* If false, the returned offsets array contains just the row offsets. If
* true, the returned offsets array contains all column offsets for each
* column in the rows (i.e., it has nrows*ncols entries). Individual rows
* will have empty columns added or extra columns merged into the last
* column if they do not have exactly ncols columns.
*/
columnOffsets: boolean;
/**
* The delimiter to use. Defaults to ','.
*/
delimiter?: string;
/**
* The row delimiter to use. Defaults to '\r\n'.
*/
rowDelimiter?: string;
/**
* The quote character for quoting fields. Defaults to the double quote (").
*
* #### Notes
* As specified in [RFC 4180](https://tools.ietf.org/html/rfc4180), quotes
* are escaped in a quoted field by doubling them (for example, "a""b" is the field
* a"b).
*/
quote?: string;
/**
* The starting index in the string for processing. Defaults to 0. This
* index should be the first character of a new row. This must be less than
* data.length.
*/
startIndex?: number;
/**
* Maximum number of rows to parse.
*
* If this is not given, parsing proceeds to the end of the data.
*/
maxRows?: number;
/**
* Number of columns in each row to parse.
*
* #### Notes
* If this is not given, the ncols defaults to the number of columns in the
* first row.
*/
ncols?: number;
}
/**
* The results from a parser.
*/
export interface IResults {
/**
* The number of rows parsed.
*/
nrows: number;
/**
* The number of columns parsed, or 0 if only row offsets are returned.
*/
ncols: number;
/**
* The index offsets into the data string for the rows or data items.
*
* #### Notes
* If the columnOffsets argument to the parser is false, the offsets array
* will be an array of length nrows, where `offsets[r]` is the index of the
* first character of row r.
*
* If the columnOffsets argument to the parser is true, the offsets array
* will be an array of length `nrows*ncols`, where `offsets[r*ncols + c]` is
* the index of the first character of the item in row r, column c.
*/
offsets: number[];
}
}
/**
* Possible parser states.
*/
enum STATE {
QUOTED_FIELD,
QUOTED_FIELD_QUOTE,
UNQUOTED_FIELD,
NEW_FIELD,
NEW_ROW
}
/**
* Possible row delimiters for the parser.
*/
enum ROW_DELIMITER {
CR,
CRLF,
LF
}
/**
* Parse delimiter-separated data.
*
* @param options The parser options
* @returns An object giving the offsets for the rows or columns parsed.
*
* #### Notes
* This implementation is based on [RFC 4180](https://tools.ietf.org/html/rfc4180).
*/
export function parseDSV(options: IParser.IOptions): IParser.IResults {
const {
data,
columnOffsets,
delimiter = ',',
startIndex = 0,
maxRows = 0xffffffff,
rowDelimiter = '\r\n',
quote = '"'
} = options;
// ncols will be set automatically if it is undefined.
let ncols = options.ncols;
// The number of rows we've already parsed.
let nrows = 0;
// The row or column offsets we return.
const offsets = [];
// Set up some useful local variables.
const CH_DELIMITER = delimiter.charCodeAt(0);
const CH_QUOTE = quote.charCodeAt(0);
const CH_LF = 10; // \n
const CH_CR = 13; // \r
const endIndex = data.length;
const {
QUOTED_FIELD,
QUOTED_FIELD_QUOTE,
UNQUOTED_FIELD,
NEW_FIELD,
NEW_ROW
} = STATE;
const { CR, LF, CRLF } = ROW_DELIMITER;
const [rowDelimiterCode, rowDelimiterLength] =
rowDelimiter === '\r\n'
? [CRLF, 2]
: rowDelimiter === '\r'
? [CR, 1]
: [LF, 1];
// Always start off at the beginning of a row.
let state = NEW_ROW;
// Set up the starting index.
let i = startIndex;
// We initialize to 0 just in case we are asked to parse past the end of the
// string. In that case, we want the number of columns to be 0.
let col = 0;
// Declare some useful temporaries
let char;
// Loop through the data string
while (i < endIndex) {
// i is the index of a character in the state.
// If we just hit a new row, and there are still characters left, push a new
// offset on and reset the column counter. We want this logic at the top of
// the while loop rather than the bottom because we don't want a trailing
// row delimiter at the end of the data to trigger a new row offset.
if (state === NEW_ROW) {
// Start a new row and reset the column counter.
offsets.push(i);
col = 1;
}
// Below, we handle this character, modify the parser state and increment the index to be consistent.
// Get the integer code for the current character, so the comparisons below
// are faster.
char = data.charCodeAt(i);
// Update the parser state. This switch statement is responsible for
// updating the state to be consistent with the index i+1 (we increment i
// after the switch statement). In some situations, we may increment i
// inside this loop to skip over indices as a shortcut.
switch (state) {
// At the beginning of a row or field, we can have a quote, row delimiter, or field delimiter.
case NEW_ROW:
case NEW_FIELD:
switch (char) {
// If we have a quote, we are starting an escaped field.
case CH_QUOTE:
state = QUOTED_FIELD;
break;
// A field delimiter means we are starting a new field.
case CH_DELIMITER:
state = NEW_FIELD;
break;
// A row delimiter means we are starting a new row.
case CH_CR:
if (rowDelimiterCode === CR) {
state = NEW_ROW;
} else if (
rowDelimiterCode === CRLF &&
data.charCodeAt(i + 1) === CH_LF
) {
// If we see an expected \r\n, then increment to the end of the delimiter.
i++;
state = NEW_ROW;
} else {
throw `string index ${i} (in row ${nrows}, column ${col}): carriage return found, but not as part of a row delimiter C ${data.charCodeAt(
i + 1
)}`;
}
break;
case CH_LF:
if (rowDelimiterCode === LF) {
state = NEW_ROW;
} else {
throw `string index ${i} (in row ${nrows}, column ${col}): line feed found, but row delimiter starts with a carriage return`;
}
break;
// Otherwise, we are starting an unquoted field.
default:
state = UNQUOTED_FIELD;
break;
}
break;
// We are in a quoted field.
case QUOTED_FIELD:
// Skip ahead until we see another quote, which either ends the quoted
// field or starts an escaped quote.
i = data.indexOf(quote, i);
if (i < 0) {
throw `string index ${i} (in row ${nrows}, column ${col}): mismatched quote`;
}
state = QUOTED_FIELD_QUOTE;
break;
// We just saw a quote in a quoted field. This could be the end of the
// field, or it could be a repeated quote (i.e., an escaped quote according
// to RFC 4180).
case QUOTED_FIELD_QUOTE:
switch (char) {
// Another quote means we just saw an escaped quote, so we are still in
// the quoted field.
case CH_QUOTE:
state = QUOTED_FIELD;
break;
// A field or row delimiter means the quoted field just ended and we are
// going into a new field or new row.
case CH_DELIMITER:
state = NEW_FIELD;
break;
// A row delimiter means we are starting a new row in the next index.
case CH_CR:
if (rowDelimiterCode === CR) {
state = NEW_ROW;
} else if (
rowDelimiterCode === CRLF &&
data.charCodeAt(i + 1) === CH_LF
) {
// If we see an expected \r\n, then increment to the end of the delimiter.
i++;
state = NEW_ROW;
} else {
throw `string index ${i} (in row ${nrows}, column ${col}): carriage return found, but not as part of a row delimiter C ${data.charCodeAt(
i + 1
)}`;
}
break;
case CH_LF:
if (rowDelimiterCode === LF) {
state = NEW_ROW;
} else {
throw `string index ${i} (in row ${nrows}, column ${col}): line feed found, but row delimiter starts with a carriage return`;
}
break;
default:
throw `string index ${i} (in row ${nrows}, column ${col}): quote in escaped field not followed by quote, delimiter, or row delimiter`;
}
break;
// We are in an unquoted field, so the only thing we look for is the next
// row or field delimiter.
case UNQUOTED_FIELD:
// Skip ahead to either the next field delimiter or possible start of a
// row delimiter (CR or LF).
while (i < endIndex) {
char = data.charCodeAt(i);
if (char === CH_DELIMITER || char === CH_LF || char === CH_CR) {
break;
}
i++;
}
// Process the character we're seeing in an unquoted field.
switch (char) {
// A field delimiter means we are starting a new field.
case CH_DELIMITER:
state = NEW_FIELD;
break;
// A row delimiter means we are starting a new row in the next index.
case CH_CR:
if (rowDelimiterCode === CR) {
state = NEW_ROW;
} else if (
rowDelimiterCode === CRLF &&
data.charCodeAt(i + 1) === CH_LF
) {
// If we see an expected \r\n, then increment to the end of the delimiter.
i++;
state = NEW_ROW;
} else {
throw `string index ${i} (in row ${nrows}, column ${col}): carriage return found, but not as part of a row delimiter C ${data.charCodeAt(
i + 1
)}`;
}
break;
case CH_LF:
if (rowDelimiterCode === LF) {
state = NEW_ROW;
} else {
throw `string index ${i} (in row ${nrows}, column ${col}): line feed found, but row delimiter starts with a carriage return`;
}
break;
// Otherwise, we continue on in the unquoted field.
default:
continue;
}
break;
// We should never reach this point since the parser state is handled above,
// so throw an error if we do.
default:
throw `string index ${i} (in row ${nrows}, column ${col}): state not recognized`;
}
// Increment i to the next character index
i++;
// Update return values based on state.
switch (state) {
case NEW_ROW:
nrows++;
// If ncols is undefined, set it to the number of columns in this row (first row implied).
if (ncols === undefined) {
if (nrows !== 1) {
throw new Error('Error parsing default number of columns');
}
ncols = col;
}
// Pad or truncate the column offsets in the previous row if we are
// returning them.
if (columnOffsets === true) {
if (col < ncols) {
// We didn't have enough columns, so add some more column offsets that
// point to just before the row delimiter we just saw.
for (; col < ncols; col++) {
offsets.push(i - rowDelimiterLength);
}
} else if (col > ncols) {
// We had too many columns, so truncate them.
offsets.length = offsets.length - (col - ncols);
}
}
// Shortcut return if nrows reaches the maximum rows we are to parse.
if (nrows === maxRows) {
return { nrows, ncols: columnOffsets ? ncols : 0, offsets };
}
break;
case NEW_FIELD:
// If we are returning column offsets, log the current index.
if (columnOffsets === true) {
offsets.push(i);
}
// Update the column counter.
col++;
break;
default:
break;
}
}
// If we finished parsing and we are *not* in the NEW_ROW state, then do the
// column padding/truncation for the last row. Also make sure ncols is
// defined.
if (state !== NEW_ROW) {
nrows++;
if (columnOffsets === true) {
// If ncols is *still* undefined, then we only parsed one row and didn't
// have a newline, so set it to the number of columns we found.
if (ncols === undefined) {
ncols = col;
}
if (col < ncols) {
// We didn't have enough columns, so add some more column offsets that
// point to just before the row delimiter we just saw.
for (; col < ncols; col++) {
offsets.push(i - (rowDelimiterLength - 1));
}
} else if (col > ncols) {
// We had too many columns, so truncate them.
offsets.length = offsets.length - (col - ncols);
}
}
}
return { nrows, ncols: columnOffsets ? ncols ?? 0 : 0, offsets };
}
/**
* Parse delimiter-separated data where no delimiter is quoted.
*
* @param options The parser options
* @returns An object giving the offsets for the rows or columns parsed.
*
* #### Notes
* This function is an optimized parser for cases where there are no field or
* row delimiters in quotes. Note that the data can have quotes, but they are
* not interpreted in any special way. This implementation is based on [RFC
* 4180](https://tools.ietf.org/html/rfc4180), but disregards quotes.
*/
export function parseDSVNoQuotes(options: IParser.IOptions): IParser.IResults {
// Set option defaults.
const {
data,
columnOffsets,
delimiter = ',',
rowDelimiter = '\r\n',
startIndex = 0,
maxRows = 0xffffffff
} = options;
// ncols will be set automatically if it is undefined.
let ncols = options.ncols;
// Set up our return variables.
const offsets: number[] = [];
let nrows = 0;
// Set up various state variables.
const rowDelimiterLength = rowDelimiter.length;
let currRow = startIndex;
const len = data.length;
let nextRow: number;
let col: number;
let rowString: string;
let colIndex: number;
// The end of the current row.
let rowEnd: number;
// Start parsing at the start index.
nextRow = startIndex;
// Loop through rows until we run out of data or we've reached maxRows.
while (nextRow !== -1 && nrows < maxRows && currRow < len) {
// Store the offset for the beginning of the row and increment the rows.
offsets.push(currRow);
nrows++;
// Find the next row delimiter.
nextRow = data.indexOf(rowDelimiter, currRow);
// If the next row delimiter is not found, set the end of the row to the
// end of the data string.
rowEnd = nextRow === -1 ? len : nextRow;
// If we are returning column offsets, push them onto the array.
if (columnOffsets === true) {
// Find the next field delimiter. We slice the current row out so that
// the indexOf will stop at the end of the row. It may possibly be faster
// to just use a loop to check each character.
col = 1;
rowString = data.slice(currRow, rowEnd);
colIndex = rowString.indexOf(delimiter);
if (ncols === undefined) {
// If we don't know how many columns we need, loop through and find all
// of the field delimiters in this row.
while (colIndex !== -1) {
offsets.push(currRow + colIndex + 1);
col++;
colIndex = rowString.indexOf(delimiter, colIndex + 1);
}
// Set ncols to the number of fields we found.
ncols = col;
} else {
// If we know the number of columns we expect, find the field delimiters
// up to that many columns.
while (colIndex !== -1 && col < ncols) {
offsets.push(currRow + colIndex + 1);
col++;
colIndex = rowString.indexOf(delimiter, colIndex + 1);
}
// If we didn't reach the number of columns we expected, pad the offsets
// with the offset just before the row delimiter.
while (col < ncols) {
offsets.push(rowEnd);
col++;
}
}
}
// Skip past the row delimiter at the end of the row.
currRow = rowEnd + rowDelimiterLength;
}
return { nrows, ncols: columnOffsets ? ncols ?? 0 : 0, offsets };
}