-
Notifications
You must be signed in to change notification settings - Fork 0
/
DelimParser.scala
139 lines (124 loc) · 3.92 KB
/
DelimParser.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import java.io.Reader
/**
 * Parses delimited data one row at a time. Handles quoted columns, doubled
 * quotes inside quoted columns, and delimiters or line breaks embedded in
 * quoted columns, in the style of Excel comma- and tab-separated files.
 *
 * Unix (`\n`), old Mac (`\r`) and DOS (`\r\n`) line endings are all
 * recognised as row terminators.
 *
 * The caller must close the input Reader.
 *
 * @param in        input reader; never closed by this class
 * @param delimiter character that separates columns
 *
 * @author Jim Menard, <a href="mailto:jimm@io.com">jimm@io.com</a>
 */
class DelimParser(in: Reader, delimiter: Char) {

  /** Value `Reader.read()` returns at end of input; also marks an empty pushback slot. */
  val EOF = -1

  /** Single-character lookahead slot; holds `EOF` when nothing has been pushed back. */
  var pushbackChar = EOF

  /**
   * Constructor, using ',' as the delimiter. The caller must close `in`.
   *
   * @param in input reader
   */
  def this(in: Reader) = this(in, ',')

  /**
   * Reads the next row and returns its columns, or `null` if there is no more
   * data. Handles delimiters and quotes within the data just as they are
   * generated by Excel comma- and tab-separated files.
   *
   * @return a `List` of column strings in input order; `null` if there is no
   *         more data (including an empty line at end of file).
   */
  def parse(): List[String] = {
    var columns = List.empty[String] // accumulated in reverse, flipped on return
    var insideQuotes = false
    var numQuotesSeen = 0 // 1 => the previous char was a quote that may close the column
    val buf = new StringBuilder
    var prevChar = '\u0000' // octal escape '\0' is rejected by Scala 2.13+ / Scala 3

    // Moves the buffered text onto the column list and starts a fresh column.
    def endColumn(): Unit = {
      columns = buf.toString :: columns
      buf.clear()
    }

    // After a '\r' row terminator, swallows the '\n' of a DOS "\r\n" pair.
    def skipLineFeedAfter(terminator: Char): Unit =
      if (terminator == '\r') {
        val next = nextChar()
        if (next != '\n'.toInt) pushback(next) // not a DOS pair: put it back
      }

    var charAsInt = nextChar()
    while (charAsInt != EOF) {
      val c = charAsInt.toChar
      c match {
        case '"' =>
          if (!insideQuotes) { // Start of quoted column
            insideQuotes = true
            numQuotesSeen = 0
          }
          else if (numQuotesSeen == 1) { // Second of doubled quotes: literal quote
            buf.append(c)
            numQuotesSeen = 0
          }
          else
            numQuotesSeen = 1 // Either a closing quote or first of a doubled pair

        case '\r' | '\n' =>
          if (insideQuotes && numQuotesSeen != 1)
            buf.append(c) // Line break embedded in a quoted column
          else {
            // End of row. Consume the whole "\r\n" pair in every case so the
            // leftover '\n' is not reported as an extra empty row next call.
            skipLineFeedAfter(c)
            if (!insideQuotes) {
              // Peek one char to tell "blank line at end of file" from a
              // genuinely empty row in the middle of the data.
              val lookahead = nextChar()
              if (columns.isEmpty && buf.isEmpty && lookahead == EOF)
                return null
              pushback(lookahead)
            }
            return (buf.toString :: columns).reverse
          }

        case _ if c == delimiter =>
          if (!insideQuotes)
            endColumn()
          else if (numQuotesSeen == 1) { // Delimiter right after the closing quote
            insideQuotes = false
            endColumn()
          }
          else
            buf.append(delimiter) // Delimiter embedded in a quoted column

        case _ => // Everything else
          numQuotesSeen = 0
          buf.append(c)
      }
      prevChar = c
      charAsInt = nextChar()
    }

    // We've reached EOF without seeing a row terminator.
    // A quoted column whose closing quote was the last character read is a
    // real (possibly empty) column and must not be dropped.
    val closedQuotedColumn = insideQuotes && numQuotesSeen == 1
    if (columns.isEmpty && buf.isEmpty && !closedQuotedColumn)
      null // Nothing left to read
    else {
      // A trailing delimiter implies one more, empty, column.
      if (buf.nonEmpty || prevChar == delimiter || closedQuotedColumn)
        endColumn()
      columns.reverse
    }
  }

  /**
   * Returns the next character as an int: the pushed-back character if there
   * is one (clearing the slot), otherwise the result of `in.read()`.
   *
   * @return the character code, or `EOF` at end of input
   */
  def nextChar(): Int = {
    if (pushbackChar == EOF)
      in.read()
    else {
      val c = pushbackChar
      pushbackChar = EOF
      c
    }
  }

  /**
   * Stores one character so the next call to `nextChar()` returns it again.
   * Only a single slot exists; a second pushback overwrites the first.
   *
   * @param charAsInt character code to return next; `EOF` clears the slot
   */
  def pushback(charAsInt: Int): Unit = { pushbackChar = charAsInt }
}