-
Notifications
You must be signed in to change notification settings - Fork 0
/
icsv.go
308 lines (265 loc) · 6.25 KB
/
icsv.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
// Package icsv is an alternative to encoding/csv.
package icsv
import (
"bufio"
"errors"
"io"
)
// Reader reads records from a character-separated text stream.
//
// The exported fields configure parsing. They should be set before the first
// call to Read, because the trim and mapping lookup tables are built from
// them when the first rune is read.
type Reader struct {
	r *bufio.Reader // buffered source the runes are read from
	// attributes in standard package
	Comma rune // field separator; NewReader sets ','
	FieldsPerRecord int // not consulted by any code visible in this file
	Comment rune // outside quotes, starts a comment that runs to the Terminator; NewReader leaves it 0
	// extra attributes
	Quote rune // opens and closes a quoted string; NewReader leaves it 0
	Escape rune // the rune following it is stored literally; NewReader leaves it 0
	Terminator rune // record separator; NewReader sets '\n'
	MaxQuoted int // limit on the rune count of a quoted cell, exceeding it yields ErrorParsing; NewReader sets 128
	// more extra attributes
	// Trim white space or any other rune from the ends of a cell,
	// for example the runes in `"'\n\r\t ,|;`
	AroundTrim string // runes trimmed from both ends of a cell; empty unless the caller sets it
	LeadingTrim string // additional runes trimmed only from the start of a cell
	TrailingTrim string // additional runes trimmed only from the end of a cell
	leadingTrim map[rune]bool // lookup set built by trimMap from AroundTrim and LeadingTrim
	trailingTrim map[rune]bool // lookup set built by trimMap from AroundTrim and TrailingTrim
	CharMapping string // intended as consecutive (from, to) rune pairs, e.g. "\n ,-"
	charMapping map[rune]rune // lookup table built by trimMap, e.g. {'\n': ' ', ',':'-'}
	// internal status
	beginFlag bool // set once the first rune has been read; used to skip a leading BOM
	lastChar rune // last rune successfully read from r
	cellNo int // number of cells collected for the record currently being read
	recordNo int // number of non-empty records returned so far
	line int // line number, incremented at every Terminator
	col int // column number, reset to 0 at every Terminator
	// initialized bool
	// columnCount int
	// lineNum int
	// lineTxt string
	// Future attributes (not implemented)
	// Replace [][]string // [[`""`, `"`]]
	// Report string
	// What to do if there is no newline at the end of the last line:
	// 0: do nothing, 1: warning, 2: error
	// NewLineEOF uint8
	// BufferSize int // use for detecting csv config
	// Dialect string // rfc, excel, mysql, postgres, informix, probe
	// Delimiter string // , \t | ||
}
// const_bom is the Unicode byte order mark (U+FEFF, decimal 65279).
// readCell drops it when it is the first rune of the input.
const const_bom rune = '\uFEFF'
// ErrorParsing is the error reported when a cell cannot be parsed, for
// example when a quoted cell grows beyond Reader.MaxQuoted.
var ErrorParsing = errors.New("csv parsing error")
// NewReader returns a new Reader that reads from r through a buffered reader.
//
// Only Comma (','), Terminator ('\n') and MaxQuoted (128) receive defaults.
// Quote, Escape, Comment, the trim sets and CharMapping stay at their zero
// values; set them on the returned Reader before the first Read if needed.
func NewReader(r io.Reader) *Reader {
	rd := new(Reader)
	rd.r = bufio.NewReader(r)
	rd.Comma = ','
	rd.Terminator = '\n'
	rd.MaxQuoted = 128
	return rd
}
// ReadAll reads records with Read until Read reports an error.
//
// It returns every non-nil record collected so far together with that error.
// io.EOF marks the normal end of input and is reported as a nil error; any
// other error (for example ErrorParsing) is returned to the caller.
func (r *Reader) ReadAll() (records [][]string, err error) {
	for {
		// Assign to the named result. Declaring a new err with := here would
		// shadow it and make ReadAll always return a nil error.
		var rec []string
		rec, err = r.Read()
		if rec != nil {
			records = append(records, rec)
		}
		if err != nil {
			break
		}
	}
	if errors.Is(err, io.EOF) {
		err = nil
	}
	return records, err
}
// Read collects cells with readCell until the end of the current record or
// the end of the input.
//
// It returns the cells of one record and the error reported by the last
// readCell call. A cell delivered with a recordEnd code above 2 is discarded,
// so rec is nil when the input ends without further data. recordNo is
// advanced only when at least one cell was collected.
func (r *Reader) Read() (rec []string, err error) {
	r.cellNo = 0
	for done := false; !done; {
		cell, end, cellErr := r.readCell()
		err = cellErr
		if end >= 3 {
			// End of input with nothing worth keeping for this record.
			break
		}
		rec = append(rec, cell)
		r.cellNo++
		// Stop at the end of the record or on any error.
		done = end != 0 || err != nil
	}
	if r.cellNo != 0 {
		r.recordNo++
	}
	return rec, err
}
// readCell reads runes up to the end of the current cell.
//
// It returns the cell text (after leading/trailing trimming and character
// remapping), a recordEnd code, and an error: either the error reported by
// the underlying ReadRune call or ErrorParsing when a quoted cell grows
// beyond MaxQuoted.
//
// recordEnd codes:
//
//	0: the record continues (the cell ended at a Comma, or an error occurred)
//	1: the record ended at a Terminator; more input may follow
//	2: the input ended right after cell data ("...a<EOF>"), possibly a broken file
//	3: the input ended right after a '\n' ("...a\n<EOF>")
//	4: the input ended after only skipped runes ("...a\n <EOF>"), possibly a broken file
//	5: the input ended before any rune was read (empty input)
func (r *Reader) readCell() (cellStr string, recordEnd uint8, err error) {
	readNo := 0     // number of ReadRune calls made by this invocation
	charNo := -1    // index of the last rune stored in cell, -1 while the cell is empty
	var ch rune     // current rune
	var sz int      // size reported by ReadRune, 0 when the read failed
	var cell []rune // runes collected for this cell

	// flags and cursors
	var comment bool   // inside a comment: skip everything up to the Terminator
	var startQuot bool // an opening Quote has been seen
	var endQuot bool   // a candidate closing Quote has been seen
	var endQuotIdx int // value of readNo when that candidate Quote was read
	var escape bool    // the previous rune was the Escape rune

	// charRemap applies the CharMapping substitution, if one exists for c.
	charRemap := func(c rune) rune {
		if val, ok := r.charMapping[c]; ok {
			return val
		}
		return c
	}
	// addToCell stores the current rune and flags an over-long quoted cell.
	// The error is picked up at the top of the next loop iteration.
	addToCell := func() {
		charNo++
		cell = append(cell, charRemap(ch))
		if startQuot && charNo > r.MaxQuoted {
			err = ErrorParsing
		}
	}

Loop:
	for {
		if err != nil {
			break
		}
		ch, sz, err = r.r.ReadRune()
		readNo++
		r.col++

		if sz == 0 {
			// Nothing was read: classify how the input ended. Later checks
			// deliberately override earlier ones.
			recordEnd = 2
			if r.lastChar == '\n' {
				recordEnd = 3
			}
			if readNo > 1 && charNo == -1 {
				recordEnd = 4
			}
			if r.lastChar == 0 {
				recordEnd = 5
			}
			break
		}
		r.lastChar = ch

		if ch == r.Terminator {
			r.line++
			r.col = 0
		}

		if !r.beginFlag {
			// First rune of the input: build the lookup tables once and
			// skip a leading BOM (0xEF,0xBB,0xBF).
			r.beginFlag = true
			r.trimMap()
			if ch == const_bom {
				continue
			}
		}

		if comment && ch != r.Terminator {
			continue
		}

		// The order of the cases matters: the first matching one wins.
		switch {
		case escape:
			// The rune after an Escape is stored as a normal character:
			// \" => "   \, => ,
			escape = false
			addToCell()

		case ch == r.Terminator && startQuot && endQuot && readNo == endQuotIdx+1:
			// A Terminator right after the closing Quote ends both the quoted
			// cell and the record. Without this case the Terminator would be
			// appended to the cell and the following records swallowed.
			recordEnd = 1
			break Loop

		case r.cellNo == 0 && charNo == -1 && ch == r.Terminator:
			// Skip an empty line.

		// Not inside a quoted string.
		case !startQuot && ch == r.Comma:
			// End of cell.
			break Loop
		case !startQuot && ch == r.Terminator:
			// End of cell and of record.
			recordEnd = 1
			break Loop
		case !startQuot && ch == r.Quote:
			// Start of a quoted string.
			startQuot = true
		case !startQuot && ch == r.Comment:
			// Start of a comment.
			comment = true

		case ch == r.Escape && !escape:
			// Escape token: the next rune is taken literally.
			escape = true

		case charNo == -1 && r.leadingTrim[ch]:
			// Trim leading runes while the cell is still empty.

		// Inside a quoted string.
		case ch == r.Quote && startQuot && !endQuot:
			// Possibly the closing Quote, or the first half of a doubled one.
			endQuotIdx = readNo
			endQuot = true
		case ch == r.Comma && startQuot && endQuot && readNo == endQuotIdx+1:
			// A Comma right after the closing Quote ends the cell.
			startQuot = false
			endQuot = false
			break Loop
		case ch == r.Quote && startQuot && endQuot && readNo == endQuotIdx+1:
			// A doubled Quote ("") stands for one literal Quote.
			endQuot = false
			addToCell()

		default:
			addToCell()
		}
	}

	// Trim trailing runes.
	for charNo >= 0 && r.trailingTrim[cell[charNo]] {
		charNo--
	}
	cellStr = string(cell[:charNo+1])
	return cellStr, recordEnd, err
}
// trimMap rebuilds the lookup tables derived from the string settings:
// charMapping from CharMapping, leadingTrim from AroundTrim plus LeadingTrim,
// and trailingTrim from AroundTrim plus TrailingTrim. Previous tables are
// discarded.
func (r *Reader) trimMap() {
	mapping := map[rune]rune{}
	stringToCharMap(mapping, r.CharMapping)

	leading := map[rune]bool{}
	for _, set := range [...]string{r.AroundTrim, r.LeadingTrim} {
		stringToCharBool(leading, set)
	}

	trailing := map[rune]bool{}
	for _, set := range [...]string{r.AroundTrim, r.TrailingTrim} {
		stringToCharBool(trailing, set)
	}

	r.charMapping = mapping
	r.leadingTrim = leading
	r.trailingTrim = trailing
}
// stringToCharBool marks every rune of s as true in rtn.
// rtn must be non-nil when s is not empty.
func stringToCharBool(rtn map[rune]bool, s string) {
	runes := []rune(s)
	for i := 0; i < len(runes); i++ {
		rtn[runes[i]] = true
	}
}
// stringToCharMap reads str as consecutive (from, to) rune pairs and stores
// each pair in rtn, so "\n ,-" produces {'\n': ' ', ',': '-'}.
// A trailing rune without a partner is ignored. rtn must be non-nil when str
// holds at least one full pair.
func stringToCharMap(rtn map[rune]rune, str string) {
	s := []rune(str)
	// Step by two so that pair k is (s[2k], s[2k+1]); stepping by one would
	// map each rune to its neighbour instead.
	for i := 0; i+1 < len(s); i += 2 {
		rtn[s[i]] = s[i+1]
	}
}