diff --git a/ios/bytesReplacingReader.go b/ios/bytesReplacingReader.go
new file mode 100644
index 0000000..8566297
--- /dev/null
+++ b/ios/bytesReplacingReader.go
@@ -0,0 +1,158 @@
+package ios
+
+import (
+	"bytes"
+	"io"
+)
+
+// BytesReplacer allows customization of how BytesReplacingReader estimates its buffer sizing
+// during initialization/reset and how it searches and replaces tokens during reads.
+type BytesReplacer interface {
+	// GetSizingHints returns hints for BytesReplacingReader to do sizing estimate and allocation.
+	// Return values:
+	// - 1st: max search token len
+	// - 2nd: max replace token len
+	// - 3rd: max (search_len / replace_len) ratio that is less than 1,
+	//        if none of the search/replace ratio is less than 1, then return a negative number.
+	// It is called once per BytesReplacingReader initialization/reset.
+	//
+	// NOTE(review): ResetEx multiplies the 3rd value by len(buf) to cap how many bytes are read
+	// before tokens are replaced in place. When several pairs grow the data, the smallest ratio
+	// looks like the safe worst-case bound - confirm whether "max" above should read "min".
+	GetSizingHints() (int, int, float64)
+	// Index does token search for BytesReplacingReader.
+	// Return values:
+	// - 1st: index of the first found search token; -1, if not found;
+	// - 2nd: the found search token; ignored if not found;
+	// - 3rd: the matching replace token; ignored if not found;
+	Index(buf []byte) (int, []byte, []byte)
+}
+
+// BytesReplacingReader allows transparent replacement of a given token during read operation.
+type BytesReplacingReader struct {
+	// replacer supplies the sizing hints at reset and the token search during Read.
+	replacer BytesReplacer
+	// maxSearchTokenLen is the 1st sizing hint reported by replacer.
+	maxSearchTokenLen int
+	// r is the wrapped reader the raw bytes are pulled from.
+	r io.Reader
+	// err is the last error returned by r.Read; it is cleared on reset.
+	err error
+	// buf is the working buffer; it is kept across resets when it is already large enough.
+	buf []byte
+	// buf[0:buf0]: bytes already processed; buf[buf0:buf1] bytes read in but not yet processed.
+	buf0, buf1 int
+	// because we need to replace 'search' with 'replace', this marks the max bytes we can read into buf
+	max int
+}
+
+// defaultBufSize is the smallest buffer size a reset will allocate.
+const defaultBufSize = int(4096)
+
+// max returns the larger of a and b.
+func max(a, b int) int {
+	if a > b {
+		return a
+	}
+	return b
+}
+
+// ResetEx allows reuse of a previously allocated `*BytesReplacingReader` for buf allocation optimization.
+//
+// The replacer's sizing hints decide how large the internal buffer is and how many bytes may be
+// read into it before tokens are replaced in place.
+// ResetEx panics if r1 or replacer is nil, or if the replacer reports no search token length.
+func (r *BytesReplacingReader) ResetEx(r1 io.Reader, replacer BytesReplacer) *BytesReplacingReader {
+	if r1 == nil {
+		panic("io.Reader cannot be nil")
+	}
+	if replacer == nil {
+		// Fail with a clear message instead of a nil dereference on GetSizingHints.
+		panic("BytesReplacer cannot be nil")
+	}
+	r.replacer = replacer
+	maxSearchTokenLen, maxReplaceTokenLen, maxSearchOverReplaceLenRatio := r.replacer.GetSizingHints()
+	if maxSearchTokenLen <= 0 {
+		panic("search token cannot be nil/empty")
+	}
+	r.maxSearchTokenLen = maxSearchTokenLen
+	r.r = r1
+	r.err = nil
+	// Only a ratio strictly between 0 and 1 describes a replacement that grows the data. A ratio
+	// >= 1 would otherwise push r.max to or past len(r.buf) and make Read slice out of range.
+	grows := maxSearchOverReplaceLenRatio > 0 && maxSearchOverReplaceLenRatio < 1
+	bufSize := max(defaultBufSize, max(maxSearchTokenLen, maxReplaceTokenLen))
+	if grows {
+		// Read only releases bytes once it holds at least maxSearchTokenLen of them (or finds a
+		// match), so the read limit below must never drop under maxSearchTokenLen. Make the
+		// buffer big enough that this many bytes still fit after worst-case growth.
+		bufSize = max(bufSize, int(float64(maxSearchTokenLen)/maxSearchOverReplaceLenRatio)+1)
+	}
+	if len(r.buf) < bufSize {
+		r.buf = make([]byte, bufSize)
+	}
+	r.buf0 = 0
+	r.buf1 = 0
+	r.max = len(r.buf)
+	if grows {
+		// If len(search) < len(replace), then we have to assume the worst case:
+		// what's the max bound value such that if we have consecutive 'search' filling up
+		// the buf up to buf[:max], and all of them are replaced with 'replace', the final
+		// result won't end up exceeding len(buf)?
+		// The float product truncates, so clamp it to maxSearchTokenLen; otherwise Read could
+		// be left issuing zero-length reads forever.
+		r.max = max(maxSearchTokenLen, int(maxSearchOverReplaceLenRatio*float64(len(r.buf))))
+	}
+	return r
+}
+
+// Reset allows reuse of a previously allocated `*BytesReplacingReader` for buf allocation optimization.
+// `search` cannot be nil/empty. `replace` can.
+func (r *BytesReplacingReader) Reset(r1 io.Reader, search1, replace1 []byte) *BytesReplacingReader {
+	return r.ResetEx(r1, &singleSearchReplaceReplacer{search: search1, replace: replace1})
+}
+
+// Read implements the `io.Reader` interface.
+//
+// Bytes that were already scanned (and replaced where needed) are handed out first; more data
+// is pulled from the wrapped reader and scanned in place only when none are pending.
+func (r *BytesReplacingReader) Read(p []byte) (int, error) {
+	for {
+		if r.buf0 > 0 {
+			// Serve processed bytes, then slide whatever is left to the front of buf.
+			served := copy(p, r.buf[:r.buf0])
+			r.buf0 -= served
+			r.buf1 -= served
+			if r.err != nil && r.buf1 == 0 {
+				return served, r.err
+			}
+			copy(r.buf, r.buf[served:served+r.buf1])
+			return served, nil
+		}
+		if r.err != nil {
+			return 0, r.err
+		}
+
+		var got int
+		got, r.err = r.r.Read(r.buf[r.buf1:r.max])
+		if got > 0 {
+			r.buf1 += got
+			for {
+				at, oldToken, newToken := r.replacer.Index(r.buf[r.buf0:r.buf1])
+				if at < 0 {
+					// Hold back the last maxSearchTokenLen-1 bytes so a token split across
+					// two reads can still be matched.
+					r.buf0 = max(r.buf0, r.buf1-r.maxSearchTokenLen+1)
+					break
+				}
+				if len(oldToken) == 0 {
+					panic("search token cannot be nil/empty")
+				}
+				start := r.buf0 + at
+				replacedEnd := start + len(newToken)
+				shiftedEnd := r.buf1 + len(newToken) - len(oldToken)
+				// Move the tail to its new position first, then drop the replacement in.
+				copy(r.buf[replacedEnd:shiftedEnd], r.buf[start+len(oldToken):r.buf1])
+				copy(r.buf[start:replacedEnd], newToken)
+				r.buf0, r.buf1 = replacedEnd, shiftedEnd
+			}
+		}
+		if r.err != nil {
+			// Nothing more will be read, so release the held-back tail as well.
+			r.buf0 = r.buf1
+		}
+	}
+}
+
+// singleSearchReplaceReplacer is the BytesReplacer for one search/replace token pair.
+type singleSearchReplaceReplacer struct {
+	search  []byte
+	replace []byte
+}
+
+// GetSizingHints reports both token lengths, plus len(search)/len(replace) when the replace
+// token is the longer one and -1 otherwise.
+func (r *singleSearchReplaceReplacer) GetSizingHints() (int, int, float64) {
+	searchLen, replaceLen := len(r.search), len(r.replace)
+	if searchLen >= replaceLen {
+		return searchLen, replaceLen, -1
+	}
+	return searchLen, replaceLen, float64(searchLen) / float64(replaceLen)
+}
+
+// Index locates the search token in buf and always returns the configured token pair with it.
+func (r *singleSearchReplaceReplacer) Index(buf []byte) (int, []byte, []byte) {
+	at := bytes.Index(buf, r.search)
+	return at, r.search, r.replace
+}
+
+// NewBytesReplacingReader creates a new `*BytesReplacingReader` for a single pair of search:replace token replacement.
+// `search` cannot be nil/empty. `replace` can.
+func NewBytesReplacingReader(r io.Reader, search, replace []byte) *BytesReplacingReader {
+	replacer := &singleSearchReplaceReplacer{search: search, replace: replace}
+	reader := &BytesReplacingReader{}
+	return reader.ResetEx(r, replacer)
+}
+
+// NewBytesReplacingReaderEx creates a new `*BytesReplacingReader` for a given BytesReplacer customization.
+func NewBytesReplacingReaderEx(r io.Reader, replacer BytesReplacer) *BytesReplacingReader {
+	return (&BytesReplacingReader{}).ResetEx(r, replacer)
+}
diff --git a/ios/bytesReplacingReader_test.go b/ios/bytesReplacingReader_test.go
new file mode 100644
index 0000000..44e71b7
--- /dev/null
+++ b/ios/bytesReplacingReader_test.go
@@ -0,0 +1,233 @@
+package ios
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"math/rand"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestBytesReplacingReader covers single-token replacement where the replace token is longer
+// than, shorter than, equal in length to, or absent compared to the search token, plus the
+// panics raised for invalid constructor arguments.
+func TestBytesReplacingReader(t *testing.T) {
+	for _, test := range []struct {
+		name     string
+		input    []byte
+		search   []byte
+		replace  []byte
+		expected []byte
+	}{
+		{
+			name:     "len(replace) > len(search)",
+			input:    []byte{1, 2, 3, 2, 2, 3, 4, 5},
+			search:   []byte{2, 3},
+			replace:  []byte{4, 5, 6},
+			expected: []byte{1, 4, 5, 6, 2, 4, 5, 6, 4, 5},
+		},
+		{
+			name:     "len(replace) < len(search)",
+			input:    []byte{1, 2, 3, 2, 2, 3, 4, 5, 6, 7, 8},
+			search:   []byte{2, 3, 2},
+			replace:  []byte{9},
+			expected: []byte{1, 9, 2, 3, 4, 5, 6, 7, 8},
+		},
+		{
+			name:     "strip out search, no replace",
+			input:    []byte{1, 2, 3, 2, 2, 3, 4, 2, 3, 2, 8},
+			search:   []byte{2, 3, 2},
+			replace:  []byte{},
+			expected: []byte{1, 2, 3, 4, 8},
+		},
+		{
+			name:     "len(replace) == len(search)",
+			input:    []byte{1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5},
+			search:   []byte{5, 5},
+			replace:  []byte{6, 6},
+			expected: []byte{1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6, 6, 5},
+		},
+		{
+			name:     "double quote -> single quote",
+			input:    []byte(`r = NewLineNumReportingCsvReader(strings.NewReader("a,b,c"))`),
+			search:   []byte(`"`),
+			replace:  []byte(`'`),
+			expected: []byte(`r = NewLineNumReportingCsvReader(strings.NewReader('a,b,c'))`),
+		},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			r := NewBytesReplacingReader(bytes.NewReader(test.input), test.search, test.replace)
+			result, err := ioutil.ReadAll(r)
+			assert.NoError(t, err)
+			assert.Equal(t, test.expected, result)
+
+		})
+	}
+
+	assert.PanicsWithValue(t, "io.Reader cannot be nil", func() {
+		NewBytesReplacingReader(nil, []byte{1}, []byte{2})
+	})
+	assert.PanicsWithValue(t, "search token cannot be nil/empty", func() {
+		(&BytesReplacingReader{}).Reset(strings.NewReader("test"), nil, []byte("est"))
+	})
+}
+
+// createTestInput builds a deterministic pseudo-random input of the given length in which
+// numTarget distinct positions hold the byte 7 and every other byte is in [10, 110).
+func createTestInput(length int, numTarget int) []byte {
+	rand.Seed(1234) // fixed rand seed to ensure bench stability
+	b := make([]byte, length)
+	for i := 0; i < length; i++ {
+		b[i] = byte(rand.Intn(100) + 10) // all regular numbers >= 10
+	}
+	for i := 0; i < numTarget; i++ {
+		for {
+			index := rand.Intn(length)
+			if b[index] == 7 {
+				continue
+			}
+			b[index] = 7 // special number 7 we will search for and replace with 8.
+			break
+		}
+	}
+	return b
+}
+
+var testInput70MBLength500Targets = createTestInput(70*1024*1024, 500)
+var testInput1KBLength20Targets = createTestInput(1024, 20)
+var testInput50KBLength1000Targets = createTestInput(50*1024, 1000)
+var testSearchFor = []byte{7}
+var testReplaceWith = []byte{8}
+var testReplacer = &singleSearchReplaceReplacer{search: testSearchFor, replace: testReplaceWith}
+
+func BenchmarkBytesReplacingReader_70MBLength_500Targets(b *testing.B) {
+	r := &BytesReplacingReader{}
+	for i := 0; i < b.N; i++ {
+		r.ResetEx(bytes.NewReader(testInput70MBLength500Targets), testReplacer)
+		_, _ = ioutil.ReadAll(r)
+	}
+}
+
+func BenchmarkRegularReader_70MBLength_500Targets(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		_, _ = ioutil.ReadAll(bytes.NewReader(testInput70MBLength500Targets))
+	}
+}
+
+func BenchmarkBytesReplacingReader_1KBLength_20Targets(b *testing.B) {
+	r := &BytesReplacingReader{}
+	for i := 0; i < b.N; i++ {
+		r.ResetEx(bytes.NewReader(testInput1KBLength20Targets), testReplacer)
+		_, _ = ioutil.ReadAll(r)
+	}
+}
+
+func BenchmarkRegularReader_1KBLength_20Targets(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		_, _ = ioutil.ReadAll(bytes.NewReader(testInput1KBLength20Targets))
+	}
+}
+
+func BenchmarkBytesReplacingReader_50KBLength_1000Targets(b *testing.B) {
+	r := &BytesReplacingReader{}
+	for i := 0; i < b.N; i++ {
+		r.ResetEx(bytes.NewReader(testInput50KBLength1000Targets), testReplacer)
+		_, _ = ioutil.ReadAll(r)
+	}
+}
+
+func BenchmarkRegularReader_50KBLength_1000Targets(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		_, _ = ioutil.ReadAll(bytes.NewReader(testInput50KBLength1000Targets))
+	}
+}
+
+// The following struct/test is to demonstrate how to do a different customization of BytesReplacer.
+// searches[i] is replaced with replaces[i].
+type multiTokenReplacer struct {
+	searches [][]byte
+	replaces [][]byte
+}
+
+// GetSizingHints returns the longest search token length, the longest replace token length and
+// the growth ratio, or a negative ratio when no pair grows the data.
+// It panics if searches and replaces differ in length or are empty.
+func (r *multiTokenReplacer) GetSizingHints() (int, int, float64) {
+	if len(r.searches) != len(r.replaces) {
+		panic(fmt.Sprintf("len(searches) (%d) != len(replaces) (%d)", len(r.searches), len(r.replaces)))
+	}
+	if len(r.searches) == 0 {
+		panic("searches must have at least one token")
+	}
+	maxSearchLen := 0
+	maxReplaceLen := 0
+	// The interface comment asks for the max ratio below 1. The smallest one is returned here
+	// instead: ResetEx uses the ratio to cap how many bytes are read before in-place growth, so
+	// the pair that grows the most sets the bound. The smallest ratio is never larger than the
+	// max, so it is always the conservative value.
+	worstRatio := float64(-1)
+	for i := range r.searches {
+		searchLen := len(r.searches[i])
+		replaceLen := len(r.replaces[i])
+		if searchLen > maxSearchLen {
+			maxSearchLen = searchLen
+		}
+		if replaceLen > maxReplaceLen {
+			maxReplaceLen = replaceLen
+		}
+		if searchLen < replaceLen {
+			ratio := float64(searchLen) / float64(replaceLen)
+			if worstRatio < 0 || ratio < worstRatio {
+				worstRatio = ratio
+			}
+		}
+	}
+	return maxSearchLen, maxReplaceLen, worstRatio
+}
+
+// Index returns the search token that occurs earliest in buf together with its replace token.
+// The reader resumes scanning after each match, so returning a later match first would leave
+// earlier occurrences of other tokens unreplaced. When two tokens match at the same position
+// the one listed first wins.
+func (r *multiTokenReplacer) Index(buf []byte) (int, []byte, []byte) {
+	first, which := -1, -1
+	for i := range r.searches {
+		at := bytes.Index(buf, r.searches[i])
+		if at >= 0 && (first < 0 || at < first) {
+			first, which = at, i
+		}
+	}
+	if which < 0 {
+		return -1, nil, nil
+	}
+	return first, r.searches[which], r.replaces[which]
+}
+
+// TestMultiTokenBytesReplacingReader exercises a custom multi-token BytesReplacer, including
+// tokens that occur in a different order than they are listed and an empty search token that
+// must make Read panic.
+func TestMultiTokenBytesReplacingReader(t *testing.T) {
+	for _, test := range []struct {
+		name     string
+		input    []byte
+		searches [][]byte
+		replaces [][]byte
+		expected []byte
+	}{
+		{
+			name:  "multi tokens; len(search) < len(replace); len(search) > len(replace); replace = nil",
+			input: []byte("abcdefgop01234qrstuvwxyz"),
+			searches: [][]byte{
+				[]byte("abc"),
+				[]byte("12"),
+				[]byte("st"),
+				[]byte("xyz"),
+			},
+			replaces: [][]byte{
+				[]byte("one two three"),
+				[]byte("twelve is an int"),
+				nil,
+				[]byte("uv"),
+			},
+			expected: []byte("one two threedefgop0twelve is an int34qruvwuv"),
+		},
+		{
+			name:  "tokens occur in a different order than they are listed",
+			input: []byte("12abc"),
+			searches: [][]byte{
+				[]byte("abc"),
+				[]byte("12"),
+			},
+			replaces: [][]byte{
+				[]byte("one two three"),
+				[]byte("twelve is an int"),
+			},
+			expected: []byte("twelve is an intone two three"),
+		},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			replacer := &multiTokenReplacer{
+				searches: test.searches,
+				replaces: test.replaces,
+			}
+			r := NewBytesReplacingReaderEx(bytes.NewReader(test.input), replacer)
+			result, err := ioutil.ReadAll(r)
+			assert.NoError(t, err)
+			assert.Equal(t, string(test.expected), string(result))
+		})
+	}
+
+	r := (&BytesReplacingReader{}).ResetEx(
+		strings.NewReader("test"),
+		&multiTokenReplacer{
+			searches: [][]byte{[]byte("abc"), []byte("")},
+			replaces: [][]byte{[]byte("xyz"), []byte("wrong")},
+		})
+	assert.PanicsWithValue(t, "search token cannot be nil/empty", func() {
+		_, _ = ioutil.ReadAll(r)
+	})
+}
diff --git a/ios/readers.go b/ios/readers.go
index f24d3e3..102a4c3 100644
--- a/ios/readers.go
+++ b/ios/readers.go
@@ -2,7 +2,6 @@ package ios
 
 import (
 	"bufio"
-	"bytes"
 	"encoding/csv"
 	"fmt"
 	"io"
@@ -33,107 +32,6 @@ func NewLineNumReportingCsvReader(r io.Reader) *LineNumReportingCsvReader {
 	return &LineNumReportingCsvReader{csv.NewReader(r), "numLine"}
 }
 
-// BytesReplacingReader allows transparent replacement of a given token during read operation.
-type BytesReplacingReader struct {
-	r          io.Reader
-	search     []byte
-	searchLen  int
-	replace    []byte
-	replaceLen int
-	lenDelta   int // = replaceLen - searchLen. can be negative
-	err        error
-	buf        []byte
-	buf0, buf1 int // buf[0:buf0]: bytes already processed; buf[buf0:buf1] bytes read in but not yet processed.
-	max        int // because we need to replace 'search' with 'replace', this marks the max bytes we can read into buf
-}
-
-const defaultBufSize = int(4096)
-
-// NewBytesReplacingReader creates a new `*BytesReplacingReader`.
-// `search` cannot be nil/empty. `replace` can.
-func NewBytesReplacingReader(r io.Reader, search, replace []byte) *BytesReplacingReader { - return (&BytesReplacingReader{}).Reset(r, search, replace) -} - -func max(a, b int) int { - if a > b { - return a - } - return b -} - -// Reset allows reuse of a previous allocated `*BytesReplacingReader` for buf allocation optimization. -// `search` cannot be nil/empty. `replace` can. -func (r *BytesReplacingReader) Reset(r1 io.Reader, search1, replace1 []byte) *BytesReplacingReader { - if r1 == nil { - panic("io.Reader cannot be nil") - } - if len(search1) == 0 { - panic("search token cannot be nil/empty") - } - r.r = r1 - r.search = search1 - r.searchLen = len(search1) - r.replace = replace1 - r.replaceLen = len(replace1) - r.lenDelta = r.replaceLen - r.searchLen // could be negative - r.err = nil - bufSize := max(defaultBufSize, max(r.searchLen, r.replaceLen)) - if r.buf == nil || len(r.buf) < bufSize { - r.buf = make([]byte, bufSize) - } - r.buf0 = 0 - r.buf1 = 0 - r.max = len(r.buf) - if r.searchLen < r.replaceLen { - // If len(search) < len(replace), then we have to assume the worst case: - // what's the max bound value such that if we have consecutive 'search' filling up - // the buf up to buf[:max], and all of them are placed with 'replace', and the final - // result won't end up exceed the len(buf)? - r.max = (len(r.buf) / r.replaceLen) * r.searchLen - } - return r -} - -// Read implements the `io.Reader` interface. 
-func (r *BytesReplacingReader) Read(p []byte) (int, error) { - n := 0 - for { - if r.buf0 > 0 { - n = copy(p, r.buf[0:r.buf0]) - r.buf0 -= n - r.buf1 -= n - if r.buf1 == 0 && r.err != nil { - return n, r.err - } - copy(r.buf, r.buf[n:r.buf1+n]) - return n, nil - } else if r.err != nil { - return 0, r.err - } - - n, r.err = r.r.Read(r.buf[r.buf1:r.max]) - if n > 0 { - r.buf1 += n - for { - index := bytes.Index(r.buf[r.buf0:r.buf1], r.search) - if index < 0 { - r.buf0 = max(r.buf0, r.buf1-r.searchLen+1) - break - } - index += r.buf0 - copy(r.buf[index+r.replaceLen:r.buf1+r.lenDelta], r.buf[index+r.searchLen:r.buf1]) - copy(r.buf[index:index+r.replaceLen], r.replace) - r.buf0 = index + r.replaceLen - r.buf1 += r.lenDelta - } - } - if r.err != nil { - r.buf0 = r.buf1 - } - } -} - // ByteReadLine reads in a single line from a bufio.Reader and returns it in []byte. // Note the returned []byte may be pointing directly into the bufio.Reader, so assume // the returned []byte will be invalidated and shouldn't be used upon next ByteReadLine diff --git a/ios/readers_test.go b/ios/readers_test.go index 3e7b4df..e911b61 100644 --- a/ios/readers_test.go +++ b/ios/readers_test.go @@ -6,7 +6,6 @@ import ( "errors" "io" "io/ioutil" - "math/rand" "strings" "testing" @@ -35,134 +34,6 @@ func TestLineNumReportingCsvReader(t *testing.T) { }) } -func TestBytesReplacingReader(t *testing.T) { - for _, test := range []struct { - name string - input []byte - search []byte - replace []byte - expected []byte - }{ - { - name: "len(replace) > len(search)", - input: []byte{1, 2, 3, 2, 2, 3, 4, 5}, - search: []byte{2, 3}, - replace: []byte{4, 5, 6}, - expected: []byte{1, 4, 5, 6, 2, 4, 5, 6, 4, 5}, - }, - { - name: "len(replace) < len(search)", - input: []byte{1, 2, 3, 2, 2, 3, 4, 5, 6, 7, 8}, - search: []byte{2, 3, 2}, - replace: []byte{9}, - expected: []byte{1, 9, 2, 3, 4, 5, 6, 7, 8}, - }, - { - name: "strip out search, no replace", - input: []byte{1, 2, 3, 2, 2, 3, 4, 2, 3, 2, 8}, - search: 
[]byte{2, 3, 2}, - replace: []byte{}, - expected: []byte{1, 2, 3, 4, 8}, - }, - { - name: "len(replace) == len(search)", - input: []byte{1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5}, - search: []byte{5, 5}, - replace: []byte{6, 6}, - expected: []byte{1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6, 6, 5}, - }, - { - name: "double quote -> single quote", - input: []byte(`r = NewLineNumReportingCsvReader(strings.NewReader("a,b,c"))`), - search: []byte(`"`), - replace: []byte(`'`), - expected: []byte(`r = NewLineNumReportingCsvReader(strings.NewReader('a,b,c'))`), - }, - } { - t.Run(test.name, func(t *testing.T) { - r := NewBytesReplacingReader(bytes.NewReader(test.input), test.search, test.replace) - result, err := ioutil.ReadAll(r) - assert.NoError(t, err) - assert.Equal(t, test.expected, result) - - }) - } - - assert.PanicsWithValue(t, "io.Reader cannot be nil", func() { - NewBytesReplacingReader(nil, []byte{1}, []byte{2}) - }) - assert.PanicsWithValue(t, "search token cannot be nil/empty", func() { - (&BytesReplacingReader{}).Reset(strings.NewReader("test"), nil, []byte("est")) - }) -} - -func createTestInput(length int, numTarget int) []byte { - rand.Seed(1234) // fixed rand seed to ensure bench stability - b := make([]byte, length) - for i := 0; i < length; i++ { - b[i] = byte(rand.Intn(100) + 10) // all regular numbers >= 10 - } - for i := 0; i < numTarget; i++ { - for { - index := rand.Intn(length) - if b[index] == 7 { - continue - } - b[index] = 7 // special number 7 we will search for and replace with 8. 
- break - } - } - return b -} - -var testInput70MBLength500Targets = createTestInput(70*1024*1024, 500) -var testInput1KBLength20Targets = createTestInput(1024, 20) -var testInput50KBLength1000Targets = createTestInput(50*1024, 1000) -var testSearchFor = []byte{7} -var testReplaceWith = []byte{8} - -func BenchmarkBytesReplacingReader_70MBLength_500Targets(b *testing.B) { - r := &BytesReplacingReader{} - for i := 0; i < b.N; i++ { - r.Reset(bytes.NewReader(testInput70MBLength500Targets), testSearchFor, testReplaceWith) - _, _ = ioutil.ReadAll(r) - } -} - -func BenchmarkRegularReader_70MBLength_500Targets(b *testing.B) { - for i := 0; i < b.N; i++ { - _, _ = ioutil.ReadAll(bytes.NewReader(testInput70MBLength500Targets)) - } -} - -func BenchmarkBytesReplacingReader_1KBLength_20Targets(b *testing.B) { - r := &BytesReplacingReader{} - for i := 0; i < b.N; i++ { - r.Reset(bytes.NewReader(testInput1KBLength20Targets), testSearchFor, testReplaceWith) - _, _ = ioutil.ReadAll(r) - } -} - -func BenchmarkRegularReader_1KBLength_20Targets(b *testing.B) { - for i := 0; i < b.N; i++ { - _, _ = ioutil.ReadAll(bytes.NewReader(testInput1KBLength20Targets)) - } -} - -func BenchmarkBytesReplacingReader_50KBLength_1000Targets(b *testing.B) { - r := &BytesReplacingReader{} - for i := 0; i < b.N; i++ { - r.Reset(bytes.NewReader(testInput50KBLength1000Targets), testSearchFor, testReplaceWith) - _, _ = ioutil.ReadAll(r) - } -} - -func BenchmarkRegularReader_50KBLength_1000Targets(b *testing.B) { - for i := 0; i < b.N; i++ { - _, _ = ioutil.ReadAll(bytes.NewReader(testInput50KBLength1000Targets)) - } -} - func TestByteReadLineAndReadLine(t *testing.T) { for _, test := range []struct { name string