Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 158 additions & 0 deletions ios/bytesReplacingReader.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
package ios

import (
"bytes"
"io"
)

// BytesReplacer customizes how BytesReplacingReader sizes its buffer during
// initialization/reset and how it locates and replaces tokens while reading.
type BytesReplacer interface {
// GetSizingHints returns hints BytesReplacingReader uses for buffer sizing and allocation.
// It is called once per BytesReplacingReader initialization/reset.
// Return values:
// - 1st: max search token len; must not be 0 (ResetEx panics on 0).
// - 2nd: max replace token len.
// - 3rd: a (search_len / replace_len) ratio that is less than 1, or a negative number if
// no search/replace pair has a ratio less than 1. ResetEx multiplies the buffer length by
// this ratio to cap how many bytes are read in before replacing, so when several pairs
// grow the data the cap is only safe for all of them if this is the smallest such ratio
// (i.e. the ratio of the pair that grows the most).
GetSizingHints() (int, int, float64)
// Index does token search for BytesReplacingReader over buf.
// Return values:
// - 1st: index in buf of the first found search token; -1, if not found;
// - 2nd: the found search token; ignored if not found; must not be empty when found
// (Read panics on an empty search token);
// - 3rd: the matching replace token; ignored if not found;
Index(buf []byte) (int, []byte, []byte)
}

// BytesReplacingReader wraps an io.Reader and transparently replaces tokens, as located
// by its BytesReplacer, in the bytes it hands out from Read.
type BytesReplacingReader struct {
// replacer supplies the sizing hints and performs the token search.
replacer BytesReplacer
// maxSearchTokenLen is the longest search token reported by replacer; Read uses it to
// decide how many trailing bytes to hold back when no token is found.
maxSearchTokenLen int
// r is the underlying source being read.
r io.Reader
// err is the last error returned by r; once it is non-nil Read no longer pulls from r.
err error
// buf is the working buffer holding both processed and not yet processed bytes.
buf []byte
// buf[0:buf0]: bytes already processed; buf[buf0:buf1]: bytes read in but not yet processed.
buf0, buf1 int
// max is the upper bound of the region of buf that Read fills from r. Because 'search' may
// be replaced with a longer 'replace', it can be smaller than len(buf) to leave room for growth.
max int
}

// defaultBufSize is the minimum size, in bytes, of the internal buffer a
// BytesReplacingReader allocates.
const defaultBufSize int = 4096

// max returns the larger of the two ints.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}

// ResetEx re-targets r at a new source r1 and a new replacer, reusing the buffer of a
// previously allocated `*BytesReplacingReader` when it is large enough. It returns r.
//
// The replacer's GetSizingHints is called exactly once. ResetEx panics if r1 is nil, if
// replacer is nil, or if the reported max search token length is not positive.
func (r *BytesReplacingReader) ResetEx(r1 io.Reader, replacer BytesReplacer) *BytesReplacingReader {
	if r1 == nil {
		panic("io.Reader cannot be nil")
	}
	if replacer == nil {
		panic("BytesReplacer cannot be nil")
	}
	maxSearchTokenLen, maxReplaceTokenLen, maxSearchOverReplaceLenRatio := replacer.GetSizingHints()
	if maxSearchTokenLen <= 0 {
		panic("search token cannot be nil/empty")
	}

	// Only a ratio strictly between 0 and 1 describes growth. Anything else (negative by
	// contract, or an out-of-contract value >= 1) is treated as "replacement never grows",
	// which keeps r.max from exceeding len(r.buf).
	grows := maxSearchOverReplaceLenRatio > 0 && maxSearchOverReplaceLenRatio < 1

	bufSize := max(defaultBufSize, max(maxSearchTokenLen, maxReplaceTokenLen))
	if grows {
		// Read holds back up to maxSearchTokenLen-1 unprocessed bytes between fills, so the
		// fill window (ratio * len(buf)) must be at least maxSearchTokenLen or Read would
		// keep issuing zero-length reads and never make progress. Grow the buffer if the
		// default size makes the window too small.
		if int(maxSearchOverReplaceLenRatio*float64(bufSize)) < maxSearchTokenLen {
			if need := int(float64(maxSearchTokenLen)/maxSearchOverReplaceLenRatio) + 1; need > bufSize {
				bufSize = need
			}
		}
		// Absorb any floating point rounding left over from the estimate above.
		for int(maxSearchOverReplaceLenRatio*float64(bufSize)) < maxSearchTokenLen {
			bufSize++
		}
	}
	if len(r.buf) < bufSize {
		r.buf = make([]byte, bufSize)
	}

	r.replacer = replacer
	r.maxSearchTokenLen = maxSearchTokenLen
	r.r = r1
	r.err = nil
	r.buf0 = 0
	r.buf1 = 0
	r.max = len(r.buf)
	if grows {
		// If len(search) < len(replace), then we have to assume the worst case:
		// what's the max bound value such that if consecutive 'search' tokens fill up
		// buf[:max] and all of them are replaced with 'replace', the final result
		// still does not exceed len(buf)?
		r.max = int(maxSearchOverReplaceLenRatio * float64(len(r.buf)))
	}
	return r
}

// Reset re-initializes r to read from r1 with a single search/replace token pair, so a
// previously allocated `*BytesReplacingReader` (and its buffer) can be reused.
// `search1` cannot be nil/empty. `replace1` can.
func (r *BytesReplacingReader) Reset(r1 io.Reader, search1, replace1 []byte) *BytesReplacingReader {
	pair := &singleSearchReplaceReplacer{
		search:  search1,
		replace: replace1,
	}
	return r.ResetEx(r1, pair)
}

// Read implements the `io.Reader` interface.
//
// Each call hands out bytes that have already been processed (buf[0:buf0]). When there
// are none, it pulls more data from the underlying reader, runs the replacer over the
// unprocessed region, and loops back to hand out the result. An error from the underlying
// reader is remembered and returned once all buffered bytes have been delivered.
func (r *BytesReplacingReader) Read(p []byte) (int, error) {
n := 0
for {
if r.buf0 > 0 {
// Processed bytes are available: deliver as many as fit into p.
n = copy(p, r.buf[0:r.buf0])
r.buf0 -= n
r.buf1 -= n
if r.buf1 == 0 && r.err != nil {
// The buffer is fully drained and the source has already reported an error,
// so return the final bytes together with that error.
return n, r.err
}
// Slide whatever is left (processed and unprocessed) to the front of buf.
copy(r.buf, r.buf[n:r.buf1+n])
return n, nil
} else if r.err != nil {
// Nothing processed to deliver and the source has reported an error.
return 0, r.err
}

// No processed bytes: fill the free part of the read window from the source.
n, r.err = r.r.Read(r.buf[r.buf1:r.max])
if n > 0 {
r.buf1 += n
for {
// Index returns a position relative to the unprocessed region.
index, search, replace := r.replacer.Index(r.buf[r.buf0:r.buf1])
if index < 0 {
// No token found: everything except the last maxSearchTokenLen-1 bytes is
// done; that tail is held back because a token could straddle the next read.
r.buf0 = max(r.buf0, r.buf1-r.maxSearchTokenLen+1)
break
}
searchTokenLen := len(search)
if searchTokenLen == 0 {
panic("search token cannot be nil/empty")
}
replaceTokenLen := len(replace)
lenDelta := replaceTokenLen - searchTokenLen
// Convert to a position in buf.
index += r.buf0
// Move the bytes following the found token so they sit right after where the
// replacement will end, then write the replacement in place.
copy(r.buf[index+replaceTokenLen:r.buf1+lenDelta], r.buf[index+searchTokenLen:r.buf1])
copy(r.buf[index:index+replaceTokenLen], replace)
// Continue searching after the replacement so it is never rescanned.
r.buf0 = index + replaceTokenLen
r.buf1 += lenDelta
}
}
if r.err != nil {
// No more data will be read, so the held-back tail is marked processed too.
r.buf0 = r.buf1
}
}
}

// singleSearchReplaceReplacer is the BytesReplacer used for a single search/replace
// token pair.
type singleSearchReplaceReplacer struct {
// search is the token to look for.
search []byte
// replace is the token written in place of each found search token.
replace []byte
}

// GetSizingHints reports len(search), len(replace), and the search/replace length
// ratio when search is shorter than replace (otherwise -1).
func (r *singleSearchReplaceReplacer) GetSizingHints() (int, int, float64) {
	sLen, rLen := len(r.search), len(r.replace)
	if sLen >= rLen {
		return sLen, rLen, float64(-1)
	}
	return sLen, rLen, float64(sLen) / float64(rLen)
}

// Index returns the position of the first occurrence of the search token in buf
// (-1 if absent) along with the search and replace tokens.
func (r *singleSearchReplaceReplacer) Index(buf []byte) (int, []byte, []byte) {
	pos := bytes.Index(buf, r.search)
	return pos, r.search, r.replace
}

// NewBytesReplacingReader returns a freshly allocated `*BytesReplacingReader` that reads
// from r and replaces every `search` token with `replace`.
// `search` cannot be nil/empty. `replace` can.
func NewBytesReplacingReader(r io.Reader, search, replace []byte) *BytesReplacingReader {
	replacer := &singleSearchReplaceReplacer{search: search, replace: replace}
	reader := new(BytesReplacingReader)
	return reader.ResetEx(r, replacer)
}

// NewBytesReplacingReaderEx returns a freshly allocated `*BytesReplacingReader` that reads
// from r and delegates token search/replacement to the given BytesReplacer.
func NewBytesReplacingReaderEx(r io.Reader, replacer BytesReplacer) *BytesReplacingReader {
	reader := new(BytesReplacingReader)
	return reader.ResetEx(r, replacer)
}
233 changes: 233 additions & 0 deletions ios/bytesReplacingReader_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
package ios

import (
"bytes"
"fmt"
"io/ioutil"
"math/rand"
"strings"
"testing"

"github.com/stretchr/testify/assert"
)

// TestBytesReplacingReader runs the single search/replace reader over a table of inputs
// (growing, shrinking, stripping and same-length replacements) and checks the two
// constructor/reset panics.
func TestBytesReplacingReader(t *testing.T) {
	type testCase struct {
		name     string
		input    []byte
		search   []byte
		replace  []byte
		expected []byte
	}
	cases := []testCase{
		{
			name:     "len(replace) > len(search)",
			input:    []byte{1, 2, 3, 2, 2, 3, 4, 5},
			search:   []byte{2, 3},
			replace:  []byte{4, 5, 6},
			expected: []byte{1, 4, 5, 6, 2, 4, 5, 6, 4, 5},
		},
		{
			name:     "len(replace) < len(search)",
			input:    []byte{1, 2, 3, 2, 2, 3, 4, 5, 6, 7, 8},
			search:   []byte{2, 3, 2},
			replace:  []byte{9},
			expected: []byte{1, 9, 2, 3, 4, 5, 6, 7, 8},
		},
		{
			name:     "strip out search, no replace",
			input:    []byte{1, 2, 3, 2, 2, 3, 4, 2, 3, 2, 8},
			search:   []byte{2, 3, 2},
			replace:  []byte{},
			expected: []byte{1, 2, 3, 4, 8},
		},
		{
			name:     "len(replace) == len(search)",
			input:    []byte{1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5},
			search:   []byte{5, 5},
			replace:  []byte{6, 6},
			expected: []byte{1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6, 6, 5},
		},
		{
			name:     "double quote -> single quote",
			input:    []byte(`r = NewLineNumReportingCsvReader(strings.NewReader("a,b,c"))`),
			search:   []byte(`"`),
			replace:  []byte(`'`),
			expected: []byte(`r = NewLineNumReportingCsvReader(strings.NewReader('a,b,c'))`),
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			src := bytes.NewReader(tc.input)
			reader := NewBytesReplacingReader(src, tc.search, tc.replace)
			got, err := ioutil.ReadAll(reader)
			assert.NoError(t, err)
			assert.Equal(t, tc.expected, got)
		})
	}

	nilReader := func() {
		NewBytesReplacingReader(nil, []byte{1}, []byte{2})
	}
	assert.PanicsWithValue(t, "io.Reader cannot be nil", nilReader)

	nilSearch := func() {
		(&BytesReplacingReader{}).Reset(strings.NewReader("test"), nil, []byte("est"))
	}
	assert.PanicsWithValue(t, "search token cannot be nil/empty", nilSearch)
}

// createTestInput builds a deterministic pseudo-random byte slice of the given length for
// tests and benchmarks. Every byte is a "regular" value in [10, 110), except for numTarget
// randomly chosen positions which are set to the special value 7 (the byte the benchmarks
// search for and replace with 8).
//
// A private, fixed-seed generator is used so the output is stable from run to run without
// reseeding the process-wide math/rand source. numTarget is capped at length, since there
// are only length positions that can hold a target.
func createTestInput(length int, numTarget int) []byte {
	rng := rand.New(rand.NewSource(1234)) // fixed rand seed to ensure bench stability
	b := make([]byte, length)
	for i := range b {
		b[i] = byte(rng.Intn(100) + 10) // all regular numbers >= 10
	}
	if numTarget > length {
		// Without the cap the placement loop below could never find a free slot
		// (and rng.Intn would panic for length == 0).
		numTarget = length
	}
	for i := 0; i < numTarget; i++ {
		for {
			index := rng.Intn(length)
			if b[index] == 7 {
				// Already a target; pick another position.
				continue
			}
			b[index] = 7 // special number 7 we will search for and replace with 8.
			break
		}
	}
	return b
}

// Shared fixtures for the benchmarks below: three inputs of different sizes and target
// densities, plus the single-byte search/replace pair (7 -> 8) applied to them.
var (
	testInput70MBLength500Targets  = createTestInput(70*1024*1024, 500)
	testInput1KBLength20Targets    = createTestInput(1024, 20)
	testInput50KBLength1000Targets = createTestInput(50*1024, 1000)
	testSearchFor                  = []byte{7}
	testReplaceWith                = []byte{8}
	testReplacer                   = &singleSearchReplaceReplacer{search: testSearchFor, replace: testReplaceWith}
)

// BenchmarkBytesReplacingReader_70MBLength_500Targets measures reading the 70MB input
// through a reused BytesReplacingReader.
func BenchmarkBytesReplacingReader_70MBLength_500Targets(b *testing.B) {
	reader := new(BytesReplacingReader)
	for iter := 0; iter < b.N; iter++ {
		src := bytes.NewReader(testInput70MBLength500Targets)
		reader.ResetEx(src, testReplacer)
		_, _ = ioutil.ReadAll(reader)
	}
}

// BenchmarkRegularReader_70MBLength_500Targets is the baseline: reading the 70MB input
// through a plain bytes.Reader with no replacement.
func BenchmarkRegularReader_70MBLength_500Targets(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		src := bytes.NewReader(testInput70MBLength500Targets)
		_, _ = ioutil.ReadAll(src)
	}
}

// BenchmarkBytesReplacingReader_1KBLength_20Targets measures reading the 1KB input
// through a reused BytesReplacingReader.
func BenchmarkBytesReplacingReader_1KBLength_20Targets(b *testing.B) {
	reader := new(BytesReplacingReader)
	for iter := 0; iter < b.N; iter++ {
		src := bytes.NewReader(testInput1KBLength20Targets)
		reader.ResetEx(src, testReplacer)
		_, _ = ioutil.ReadAll(reader)
	}
}

// BenchmarkRegularReader_1KBLength_20Targets is the baseline: reading the 1KB input
// through a plain bytes.Reader with no replacement.
func BenchmarkRegularReader_1KBLength_20Targets(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		src := bytes.NewReader(testInput1KBLength20Targets)
		_, _ = ioutil.ReadAll(src)
	}
}

// BenchmarkBytesReplacingReader_50KBLength_1000Targets measures reading the 50KB input
// through a reused BytesReplacingReader.
func BenchmarkBytesReplacingReader_50KBLength_1000Targets(b *testing.B) {
	reader := new(BytesReplacingReader)
	for iter := 0; iter < b.N; iter++ {
		src := bytes.NewReader(testInput50KBLength1000Targets)
		reader.ResetEx(src, testReplacer)
		_, _ = ioutil.ReadAll(reader)
	}
}

// BenchmarkRegularReader_50KBLength_1000Targets is the baseline: reading the 50KB input
// through a plain bytes.Reader with no replacement.
func BenchmarkRegularReader_50KBLength_1000Targets(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		src := bytes.NewReader(testInput50KBLength1000Targets)
		_, _ = ioutil.ReadAll(src)
	}
}

// The following struct/test demonstrates how to do a different customization of
// BytesReplacer: replacing several search tokens, each with its own replacement.
type multiTokenReplacer struct {
// searches holds the search tokens; searches[i] is replaced by replaces[i].
searches [][]byte
// replaces holds the replacement tokens, parallel to searches.
replaces [][]byte
}

// GetSizingHints reports the longest search token length, the longest replace token
// length, and the smallest search/replace length ratio among the pairs whose replacement
// is longer than the search token (-1 if no pair grows).
//
// The smallest ratio is the one that matters: the reader multiplies its buffer length by
// it to bound how many bytes it reads in at once, and only the ratio of the pair that
// grows the most guarantees the replaced data still fits in the buffer.
//
// It panics if searches and replaces differ in length or are empty.
func (r *multiTokenReplacer) GetSizingHints() (int, int, float64) {
	if len(r.searches) != len(r.replaces) {
		panic(fmt.Sprintf("len(searches) (%d) != len(replaces) (%d)", len(r.searches), len(r.replaces)))
	}
	if len(r.searches) == 0 {
		panic("searches must have at least one token")
	}
	maxSearchLen := 0
	maxReplaceLen := 0
	minRatio := float64(-1) // negative means "no growing pair seen yet"
	for i := range r.searches {
		searchLen := len(r.searches[i])
		replaceLen := len(r.replaces[i])
		if searchLen > maxSearchLen {
			maxSearchLen = searchLen
		}
		if replaceLen > maxReplaceLen {
			maxReplaceLen = replaceLen
		}
		if searchLen < replaceLen {
			ratio := float64(searchLen) / float64(replaceLen)
			if minRatio < 0 || ratio < minRatio {
				minRatio = ratio
			}
		}
	}
	return maxSearchLen, maxReplaceLen, minRatio
}

// Index returns the position in buf of the earliest occurrence of any search token,
// together with that token and its matching replacement; it returns (-1, nil, nil) when
// no token occurs in buf. When two tokens start at the same position the one listed
// first in searches wins.
//
// Picking the earliest occurrence (rather than the first token in list order that occurs
// anywhere) matters because the reader resumes searching after the replacement it just
// made, so any token located before the reported position would never be replaced.
func (r *multiTokenReplacer) Index(buf []byte) (int, []byte, []byte) {
	best := -1
	var search, replace []byte
	for i := range r.searches {
		index := bytes.Index(buf, r.searches[i])
		if index < 0 {
			continue
		}
		if best < 0 || index < best {
			best, search, replace = index, r.searches[i], r.replaces[i]
			if best == 0 {
				// Nothing can occur earlier than the start of buf.
				break
			}
		}
	}
	return best, search, replace
}

// TestMultiTokenBytesReplacingReader exercises BytesReplacingReader with the custom
// multiTokenReplacer: a table of multi-token replacements, followed by a check that an
// empty search token reported by the replacer makes Read panic.
func TestMultiTokenBytesReplacingReader(t *testing.T) {
	for _, test := range []struct {
		name     string
		input    []byte
		searches [][]byte
		replaces [][]byte
		expected []byte
	}{
		{
			name:  "multi tokens; len(search) < len(replace); len(search) > len(replace); replace = nil",
			input: []byte("abcdefgop01234qrstuvwxyz"),
			searches: [][]byte{
				[]byte("abc"),
				[]byte("12"),
				[]byte("st"),
				[]byte("xyz"),
			},
			replaces: [][]byte{
				[]byte("one two three"),
				[]byte("twelve is an int"),
				nil,
				[]byte("uv"),
			},
			expected: []byte("one two threedefgop0twelve is an int34qruvwuv"),
		},
	} {
		// Run each case as a named subtest, like TestBytesReplacingReader does, so a
		// failure is attributed to the case's name.
		t.Run(test.name, func(t *testing.T) {
			replacer := &multiTokenReplacer{
				searches: test.searches,
				replaces: test.replaces,
			}
			r := NewBytesReplacingReaderEx(bytes.NewReader(test.input), replacer)
			result, err := ioutil.ReadAll(r)
			assert.NoError(t, err)
			assert.Equal(t, string(test.expected), string(result))
		})
	}

	// The max search token length is non-zero here, so ResetEx accepts the replacer; the
	// empty token is only noticed when Index reports it during Read.
	r := (&BytesReplacingReader{}).ResetEx(
		strings.NewReader("test"),
		&multiTokenReplacer{
			searches: [][]byte{[]byte("abc"), []byte("")},
			replaces: [][]byte{[]byte("xyz"), []byte("wrong")},
		})
	assert.PanicsWithValue(t, "search token cannot be nil/empty", func() {
		_, _ = ioutil.ReadAll(r)
	})
}
Loading