Skip to content

Commit

Permalink
Support utf-16 in file and tail inputs (influxdata#7792)
Browse files Browse the repository at this point in the history
  • Loading branch information
danielnelson authored and idohalevi committed Sep 23, 2020
1 parent bd6edd8 commit 86145ef
Show file tree
Hide file tree
Showing 18 changed files with 883 additions and 341 deletions.
6 changes: 4 additions & 2 deletions go.mod
@@ -1,6 +1,6 @@
module github.com/influxdata/telegraf

go 1.12
go 1.13

require (
cloud.google.com/go v0.53.0
Expand Down Expand Up @@ -38,6 +38,7 @@ require (
github.com/couchbase/goutils v0.0.0-20180530154633-e865a1461c8a // indirect
github.com/denisenkom/go-mssqldb v0.0.0-20190707035753-2be1aa521ff4
github.com/dgrijalva/jwt-go v3.2.0+incompatible
github.com/dimchansky/utfbom v1.1.0
github.com/docker/distribution v2.6.0-rc.1.0.20170726174610-edc3ab29cdff+incompatible // indirect
github.com/docker/docker v1.4.2-0.20180327123150-ed7b6428c133
github.com/docker/go-connections v0.3.0 // indirect
Expand Down Expand Up @@ -71,7 +72,7 @@ require (
github.com/hashicorp/memberlist v0.1.5 // indirect
github.com/hashicorp/serf v0.8.1 // indirect
github.com/influxdata/go-syslog/v2 v2.0.1
github.com/influxdata/tail v1.0.1-0.20180327235535-c43482518d41
github.com/influxdata/tail v1.0.1-0.20200707181643-03a791b270e4
github.com/influxdata/toml v0.0.0-20190415235208-270119a8ce65
github.com/influxdata/wlog v0.0.0-20160411224016-7c63b0a71ef8
github.com/jackc/fake v0.0.0-20150926172116-812a484cc733 // indirect
Expand Down Expand Up @@ -133,6 +134,7 @@ require (
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d
golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a
golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4
golang.org/x/text v0.3.3
golang.org/x/tools v0.0.0-20200317043434-63da46f3035e // indirect
golang.zx2c4.com/wireguard/wgctrl v0.0.0-20200205215550-e35592f146e4
gonum.org/v1/gonum v0.6.2 // indirect
Expand Down
8 changes: 6 additions & 2 deletions go.sum
Expand Up @@ -335,8 +335,8 @@ github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpO
github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
github.com/influxdata/go-syslog/v2 v2.0.1 h1:l44S4l4Q8MhGQcoOxJpbo+QQYxJqp0vdgIVHh4+DO0s=
github.com/influxdata/go-syslog/v2 v2.0.1/go.mod h1:hjvie1UTaD5E1fTnDmxaCw8RRDrT4Ve+XHr5O2dKSCo=
github.com/influxdata/tail v1.0.1-0.20180327235535-c43482518d41 h1:HxQo1NpNXQDpvEBzthbQLmePvTLFTa5GzSFUjL03aEs=
github.com/influxdata/tail v1.0.1-0.20180327235535-c43482518d41/go.mod h1:xTFF2SILpIYc5N+Srb0d5qpx7d+f733nBrbasb13DtQ=
github.com/influxdata/tail v1.0.1-0.20200707181643-03a791b270e4 h1:K3A5vHPs/p8OjI4SL3l1+hs/98mhxTVDcV1Ap0c265E=
github.com/influxdata/tail v1.0.1-0.20200707181643-03a791b270e4/go.mod h1:VeiWgI3qaGdJWust2fP27a6J+koITo/1c/UhxeOxgaM=
github.com/influxdata/toml v0.0.0-20190415235208-270119a8ce65 h1:vvyMtD5LTJc1W9sQKjDkAWdcg0478CszSdzlHtiAXCY=
github.com/influxdata/toml v0.0.0-20190415235208-270119a8ce65/go.mod h1:zApaNFpP/bTpQItGZNNUMISDMDAnTXu9UqJ4yT3ocz8=
github.com/influxdata/wlog v0.0.0-20160411224016-7c63b0a71ef8 h1:W2IgzRCb0L9VzMujq/QuTaZUKcH8096jWwP519mHN6Q=
Expand Down Expand Up @@ -720,6 +720,8 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c h1:fqgJT0MGcGpPgpWU7VRdRjuArfcOvC4AoJmILihzhDg=
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
Expand Down Expand Up @@ -838,6 +840,7 @@ gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8
gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
gopkg.in/fatih/pool.v2 v2.0.0 h1:xIFeWtxifuQJGk/IEPKsTduEKcKvPmhoiVDGpC40nKg=
gopkg.in/fatih/pool.v2 v2.0.0/go.mod h1:8xVGeu1/2jr2wm5V9SPuMht2H5AEmf5aFMGSQixtjTY=
gopkg.in/fsnotify.v1 v1.2.1/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4=
gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
gopkg.in/gorethink/gorethink.v3 v3.0.5 h1:e2Uc/Xe+hpcVQFsj6MuHlYog3r0JYpnTzwDj/y2O4MU=
Expand All @@ -861,6 +864,7 @@ gopkg.in/mgo.v2 v2.0.0-20180705113604-9856a29383ce h1:xcEWjVhvbDy+nHP67nPDDpbYrY
gopkg.in/mgo.v2 v2.0.0-20180705113604-9856a29383ce/go.mod h1:yeKp02qBN3iKW1OzL3MGk2IdtZzaj7SFntXj72NppTA=
gopkg.in/olivere/elastic.v5 v5.0.70 h1:DqFG2Odzs74JCz6SssgJjd6qpGnsOAzNc7+l5EnvsnE=
gopkg.in/olivere/elastic.v5 v5.0.70/go.mod h1:FylZT6jQWtfHsicejzOm3jIMVPOAksa80i3o+6qtQRk=
gopkg.in/tomb.v1 v1.0.0-20140529071818-c131134a1947/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
Expand Down
36 changes: 36 additions & 0 deletions plugins/common/encoding/decoder.go
@@ -0,0 +1,36 @@
package encoding

import (
"errors"

"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"
)

type Decoder = encoding.Decoder

// NewDecoder returns a x/text Decoder for the specified text encoding. The
// Decoder converts a character encoding into utf-8 bytes. If a BOM is found
// it will be converted into a utf-8 BOM, you can use
// github.com/dimchansky/utfbom to strip the BOM.
//
// The "none" or "" encoding will pass through bytes unchecked. Use the utf-8
// encoding if you want invalid bytes replaced using the the unicode
// replacement character.
//
// Detection of utf-16 endianness using the BOM is not currently provided due
// to the tail input plugins requirement to be able to start at the middle or
// end of the file.
func NewDecoder(enc string) (*Decoder, error) {
switch enc {
case "utf-8":
return unicode.UTF8.NewDecoder(), nil
case "utf-16le":
return unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM).NewDecoder(), nil
case "utf-16be":
return unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder(), nil
case "none", "":
return encoding.Nop.NewDecoder(), nil
}
return nil, errors.New("unknown character encoding")
}
78 changes: 78 additions & 0 deletions plugins/common/encoding/decoder_test.go
@@ -0,0 +1,78 @@
package encoding

import (
"bytes"
"io/ioutil"
"testing"

"github.com/stretchr/testify/require"
)

func TestDecoder(t *testing.T) {
tests := []struct {
name string
encoding string
input []byte
expected []byte
expectedErr bool
}{
{
name: "no decoder utf-8",
encoding: "",
input: []byte("howdy"),
expected: []byte("howdy"),
},
{
name: "utf-8 decoder",
encoding: "utf-8",
input: []byte("howdy"),
expected: []byte("howdy"),
},
{
name: "utf-8 decoder invalid bytes replaced with replacement char",
encoding: "utf-8",
input: []byte("\xff\xfe"),
expected: []byte("\uFFFD\uFFFD"),
},
{
name: "utf-16le decoder no BOM",
encoding: "utf-16le",
input: []byte("h\x00o\x00w\x00d\x00y\x00"),
expected: []byte("howdy"),
},
{
name: "utf-16le decoder with BOM",
encoding: "utf-16le",
input: []byte("\xff\xfeh\x00o\x00w\x00d\x00y\x00"),
expected: []byte("\xef\xbb\xbfhowdy"),
},
{
name: "utf-16be decoder no BOM",
encoding: "utf-16be",
input: []byte("\x00h\x00o\x00w\x00d\x00y"),
expected: []byte("howdy"),
},
{
name: "utf-16be decoder with BOM",
encoding: "utf-16be",
input: []byte("\xfe\xff\x00h\x00o\x00w\x00d\x00y"),
expected: []byte("\xef\xbb\xbfhowdy"),
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
decoder, err := NewDecoder(tt.encoding)
require.NoError(t, err)
buf := bytes.NewBuffer(tt.input)
r := decoder.Reader(buf)
actual, err := ioutil.ReadAll(r)
if tt.expectedErr {
require.Error(t, err)
return
}
require.NoError(t, err)
require.Equal(t, tt.expected, actual)
})
}
}
46 changes: 36 additions & 10 deletions plugins/inputs/file/file.go
Expand Up @@ -3,36 +3,50 @@ package file
import (
"fmt"
"io/ioutil"
"os"
"path/filepath"

"github.com/dimchansky/utfbom"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal/globpath"
"github.com/influxdata/telegraf/plugins/common/encoding"
"github.com/influxdata/telegraf/plugins/inputs"
"github.com/influxdata/telegraf/plugins/parsers"
)

type File struct {
Files []string `toml:"files"`
FileTag string `toml:"file_tag"`
parser parsers.Parser
Files []string `toml:"files"`
FileTag string `toml:"file_tag"`
CharacterEncoding string `toml:"character_encoding"`
parser parsers.Parser

filenames []string
decoder *encoding.Decoder
}

const sampleConfig = `
## Files to parse each interval. Accept standard unix glob matching rules,
## as well as ** to match recursive files and directories.
files = ["/tmp/metrics.out"]
## Name a tag containing the name of the file the data was parsed from. Leave empty
## to disable.
# file_tag = ""
## Character encoding to use when interpreting the file contents. Invalid
## characters are replaced using the unicode replacement character. When set
## to the empty string the data is not decoded to text.
## ex: character_encoding = "utf-8"
## character_encoding = "utf-16le"
## character_encoding = "utf-16be"
## character_encoding = ""
# character_encoding = ""
## The dataformat to be read from files
## Each data format has its own unique set of configuration options, read
## more about them here:
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
data_format = "influx"
## Name a tag containing the name of the file the data was parsed from. Leave empty
## to disable.
# file_tag = ""
`

// SampleConfig returns the default configuration of the Input
Expand All @@ -44,6 +58,12 @@ func (f *File) Description() string {
return "Parse a complete file each interval"
}

func (f *File) Init() error {
var err error
f.decoder, err = encoding.NewDecoder(f.CharacterEncoding)
return err
}

func (f *File) Gather(acc telegraf.Accumulator) error {
err := f.refreshFilePaths()
if err != nil {
Expand All @@ -59,7 +79,7 @@ func (f *File) Gather(acc telegraf.Accumulator) error {
if f.FileTag != "" {
m.AddTag(f.FileTag, filepath.Base(k))
}
acc.AddFields(m.Name(), m.Fields(), m.Tags(), m.Time())
acc.AddMetric(m)
}
}
return nil
Expand Down Expand Up @@ -88,12 +108,18 @@ func (f *File) refreshFilePaths() error {
}

func (f *File) readMetric(filename string) ([]telegraf.Metric, error) {
fileContents, err := ioutil.ReadFile(filename)
file, err := os.Open(filename)
if err != nil {
return nil, err
}
defer file.Close()

r, _ := utfbom.Skip(f.decoder.Reader(file))
fileContents, err := ioutil.ReadAll(r)
if err != nil {
return nil, fmt.Errorf("E! Error file: %v could not be read, %s", filename, err)
}
return f.parser.Parse(fileContents)

}

func init() {
Expand Down

0 comments on commit 86145ef

Please sign in to comment.