New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add an input plugin to monitor basic info of Windows Services #3023
Changes from 35 commits
0db2520
7400e87
1d52da8
3b4df68
b67c645
017c131
5f93ba1
f895ef3
570f613
45fb743
96af3b7
37c2952
03447bc
b05aa4a
2bd9a78
3d4e3db
b88e0e5
3d00f89
1ab4cdc
b5a5a1f
957015c
b54b8a0
3b3e973
c568b19
e0a8773
d203d20
4a0d3e8
75407cc
d570894
3b3106f
721264a
e0dbcab
86b989a
a96932c
3568a10
5eda565
1966f42
ddbdcfc
8c75562
5b71ba8
e444c75
9085744
547cb9b
9f6ca38
86cef94
215828e
d307493
2b1ee6d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
# Telegraf Plugin: win_services | ||
Input plugin to report Windows services info. | ||
|
||
Telegraf must be run with administrator privileges for this plugin to work. | ||
### Configuration: | ||
|
||
```toml | ||
[[inputs.win_services]] | ||
## Names of the services to monitor. Leave empty to monitor all the available services on the host | ||
service_names = [ | ||
"LanmanServer", | ||
"TermService", | ||
] | ||
``` | ||
|
||
### Measurements & Fields: | ||
|
||
- win_services | ||
- state : integer | ||
- startup_mode : integer | ||
|
||
The `state` field can have the following values: | ||
- 1 - stopped | ||
- 2 - start pending | ||
- 3 - stop pending | ||
- 4 - running | ||
- 5 - continue pending | ||
- 6 - pause pending | ||
- 7 - paused | ||
|
||
The `startup_mode` field can have the following values: | ||
- 0 - boot start | ||
- 1 - system start | ||
- 2 - auto start | ||
- 3 - demand start | ||
- 4 - disabled | ||
|
||
### Tags: | ||
|
||
- All measurements have the following tags: | ||
- service_name | ||
- display_name | ||
|
||
### Example Output: | ||
``` | ||
* Plugin: inputs.win_services, Collection 1 | ||
> win_services,host=WIN2008R2H401,display_name=Server,service_name=LanmanServer state=4i,startup_mode=2i 1500040669000000000 | ||
> win_services,display_name=Remote\ Desktop\ Services,service_name=TermService,host=WIN2008R2H401 state=1i,startup_mode=3i 1500040669000000000 | ||
``` | ||
### TICK Scripts | ||
|
||
A sample TICK script that sends a notification when a service is not running. | ||
It sends a notification whenever any service leaves the _running_ state, and again when the service returns to _running_. | ||
The notification is sent via an HTTP POST call. | ||
|
||
``` | ||
stream | ||
|from() | ||
.database('telegraf') | ||
.retentionPolicy('autogen') | ||
.measurement('win_services') | ||
.groupBy('host','service_name') | ||
|alert() | ||
.crit(lambda: "state" != 4) | ||
.stateChangesOnly() | ||
.message('Service {{ index .Tags "service_name" }} on Host {{ index .Tags "host" }} is in state {{ index .Fields "state" }} ') | ||
.post('http://localhost:666/alert/service') | ||
``` |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
// +build windows | ||
|
||
package win_services | ||
|
||
import ( | ||
"fmt" | ||
"github.com/influxdata/telegraf" | ||
"github.com/influxdata/telegraf/plugins/inputs" | ||
"golang.org/x/sys/windows/svc/mgr" | ||
) | ||
|
||
// Strings handed out by the plugin's SampleConfig and Description methods.
var (
	// sampleConfig is the example TOML configuration for this plugin.
	sampleConfig = `
## Names of the services to monitor. Leave empty to monitor all the available services on the host
service_names = [
"LanmanServer",
"TermService",
]
`

	// description is the one-line summary of what the plugin does.
	description = "Input plugin to report Windows services info."
)
|
||
// Win_Services is the win_services input plugin. It is registered with
// Telegraf under the name "win_services".
type Win_Services struct {
	// ServiceNames lists the services to monitor. When it is empty, the
	// names are taken from the service manager's own service list.
	ServiceNames []string `toml:"service_names"`
}
|
||
// ServiceInfo is the result of collecting information about one service.
type ServiceInfo struct {
	// ServiceName is the name the service was looked up by; DisplayName is
	// read from the service's configuration.
	ServiceName, DisplayName string
	// State and StartUpMode are the numeric codes read from the service's
	// status and configuration; ServiceStatesMap and ServiceStartupModeMap
	// give their readable names.
	State, StartUpMode int
	// Error is non-nil when collecting the information failed; the other
	// fields may then be only partially filled in.
	Error error
}
|
||
// ServiceStatesMap gives the readable name for each numeric service state
// code that the plugin accepts as valid.
var ServiceStatesMap = map[int]string{
	1: "stopped",
	2: "start_pending",
	3: "stop_pending",
	4: "running",
	5: "continue_pending",
	6: "pause_pending",
	7: "paused",
}
|
||
// ServiceStartupModeMap gives the readable name for each numeric startup
// mode code that the plugin accepts as valid.
var ServiceStartupModeMap = map[int]string{
	0: "boot_start",
	1: "system_start",
	2: "auto_start",
	3: "demand_start",
	4: "disabled",
}
|
||
func (m *Win_Services) Description() string { | ||
return description | ||
} | ||
|
||
func (m *Win_Services) SampleConfig() string { | ||
return sampleConfig | ||
} | ||
|
||
func (m *Win_Services) Gather(acc telegraf.Accumulator) error { | ||
|
||
serviceInfos, err := listServices(m.ServiceNames) | ||
|
||
if err != nil { | ||
return err | ||
} | ||
|
||
for _, service := range serviceInfos { | ||
if service.Error == nil { | ||
fields := make(map[string]interface{}) | ||
tags := make(map[string]string) | ||
|
||
tags["display_name"] = service.DisplayName | ||
tags["service_name"] = service.ServiceName | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tags should not contain the empty string, since InfluxDB will not accept them. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the best practice for storing a measurement in case of a tag has empty value?
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Either skip the tag or omit the entire point, depending on if the field needs to be required. For display name I would skip, since it is optional. For service name I would omit the point. |
||
|
||
fields["state"] = service.State | ||
fields["startup_mode"] = service.StartUpMode | ||
|
||
acc.AddFields("win_services", fields, tags) | ||
} else { | ||
acc.AddError(service.Error) | ||
} | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func listServices(userServices []string) ([]ServiceInfo, error) { | ||
scmgr, err := mgr.Connect() | ||
if err != nil { | ||
return nil, fmt.Errorf("Could not open service manager: %s", err) | ||
} | ||
defer scmgr.Disconnect() | ||
|
||
var serviceNames []string | ||
if len(userServices) == 0 { | ||
//Listing service names from system | ||
serviceNames, err = scmgr.ListServices() | ||
if err != nil { | ||
return nil, fmt.Errorf("Could not list services: %s", err) | ||
} | ||
} else { | ||
serviceNames = userServices | ||
} | ||
serviceInfos := make([]ServiceInfo, len(serviceNames)) | ||
|
||
for i, srvName := range serviceNames { | ||
serviceInfos[i] = collectServiceInfo(scmgr, srvName) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Easiest way to deal with zero value tags/fields IMO is to return an (ServiceInfo, error) and append if err != nil There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would then need to propagate a service error to Gather, or pass Accumulator here. The ServiceInfo structure is a domain model of a service, any error occurred during the collecting of service info is a property of such model. This allows future enhancement in case a user will want to record also service errors into db. Because I still think that if user wants to monitor a particular service and on some hosts it is not possible due to an error, user has to look into the telegraf log instead into the db. But maybe not, we will see. |
||
} | ||
|
||
return serviceInfos, nil | ||
} | ||
|
||
func collectServiceInfo(scmgr *mgr.Mgr, serviceName string) (serviceInfo ServiceInfo) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Recommend returning (ServiceInfo, error) then you can remove the Error field from ServiceInfo. |
||
|
||
serviceInfo.ServiceName = serviceName | ||
srv, err := scmgr.OpenService(serviceName) | ||
if err != nil { | ||
serviceInfo.Error = fmt.Errorf("Could not open service '%s': %s", serviceName, err) | ||
return | ||
} | ||
defer srv.Close() | ||
|
||
//While getting service info there could a theoretically a lot of errors on different places. | ||
//However in reality if there is a problem with a service then usually openService fails and if it passes, other calls will most probably be ok | ||
//So, following error checking is just for sake | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove this comment, basically applies to almost any error :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Network, or more generally communication, errors are for example more probable than subsequent errors from Windows Service API. I still think that checking valid ranges of state and startup_mode is not necessary cause if invalid value would be return than the function will return error anyway. This comment was intended to explain this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could get rid of the checkState logic since we are now operating on ints, but we need to check all errors from the external api. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, checking errors of functions is must, no doubts. |
||
srvStatus, err := srv.Query() | ||
if err == nil { | ||
state := int(srvStatus.State) | ||
if !checkState(state) { | ||
serviceInfo.Error = fmt.Errorf("Uknown state of Service %s: %d", serviceName, state) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. typo in Uknown -> Unknown |
||
//finish collecting info on first found error | ||
return | ||
} | ||
serviceInfo.State = state | ||
} else { | ||
serviceInfo.Error = fmt.Errorf("Could not query service '%s': %s", serviceName, err) | ||
//finish collecting info on first found error | ||
return | ||
} | ||
|
||
srvCfg, err := srv.Config() | ||
if err == nil { | ||
startupMode := int(srvCfg.StartType) | ||
if !checkStartupMode(startupMode) { | ||
serviceInfo.Error = fmt.Errorf("Uknown startup mode of Service %s: %d", serviceName, startupMode) | ||
//finish collecting info on first found error | ||
return | ||
} | ||
serviceInfo.DisplayName = srvCfg.DisplayName | ||
serviceInfo.StartUpMode = startupMode | ||
} else { | ||
serviceInfo.Error = fmt.Errorf("Could not get config of service '%s': %s", serviceName, err) | ||
} | ||
return | ||
} | ||
|
||
//returns true of state is in valid range | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This isn't super important to me, but go has the convention of starting the comment with the name of the function. https://blog.golang.org/godoc-documenting-go-code, ex:
|
||
func checkState(state int) bool { | ||
_, ok := ServiceStatesMap[state] | ||
return ok | ||
} | ||
|
||
//returns true of startup mode is in valid range | ||
func checkStartupMode(startupMode int) bool { | ||
_, ok := ServiceStartupModeMap[startupMode] | ||
return ok | ||
} | ||
|
||
func init() { | ||
inputs.Add("win_services", func() telegraf.Input { return &Win_Services{} }) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
// +build !windows | ||
|
||
package win_services |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
// +build windows | ||
|
||
// This test must be run under an administrator account. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this still required? |
||
package win_services | ||
|
||
import ( | ||
"github.com/influxdata/telegraf/testutil" | ||
"github.com/stretchr/testify/assert" | ||
"github.com/stretchr/testify/require" | ||
"golang.org/x/sys/windows/svc/mgr" | ||
"testing" | ||
) | ||
|
||
// Service-name fixtures shared by the tests in this file.
var (
	// InvalidServices holds names for which the tests expect service
	// collection to report an error.
	InvalidServices = []string{"XYZ1@", "ZYZ@", "SDF_@#"}
	// KnownServices holds names for which the tests expect service
	// collection to succeed.
	KnownServices = []string{"LanmanServer", "TermService"}
)
|
||
func TestList(t *testing.T) { | ||
services, err := listServices(KnownServices) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't want to add any tests that require special permissions or certain services to running (unless it can also set them up). It would be better to use an interface with a test version. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure what is the best goal here. I can easily mock listServices to test the Gather method, but to properly test listServices function I will need to mock Mgr and Service from mgr package. As there is type dependency, as Mgr.OpenService returns mgr.Service, I can not just create an interface which will be satisfied by existing structs in mgr package and their methods. Cause current tests use services available on every Windows edition. I would rather create a special for test purposes, but that would be more E2E test and that's probably not suitable for such test suite. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Chronograf uses a nice pattern for mocks that can handle this. I wrote an example and added it to this gist https://gist.github.com/danielnelson/79908f0bc7145d3feb7a91e0ef56d88c There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's exactly what I meant. You have to create on struct (proxy) that satisfies interface and redirect call to real implementation, But if it has to be that way... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, we need to have unit tests. 
The integration tests can remain but should be guarded with a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Have |
||
require.NoError(t, err) | ||
assert.Len(t, services, 2, "Different number of services") | ||
assert.Equal(t, services[0].ServiceName, KnownServices[0]) | ||
assert.Nil(t, services[0].Error) | ||
assert.Equal(t, services[1].ServiceName, KnownServices[1]) | ||
assert.Nil(t, services[1].Error) | ||
} | ||
|
||
func TestEmptyList(t *testing.T) { | ||
services, err := listServices([]string{}) | ||
require.NoError(t, err) | ||
assert.Condition(t, func() bool { return len(services) > 20 }, "Too few service") | ||
} | ||
|
||
func TestListEr(t *testing.T) { | ||
services, err := listServices(InvalidServices) | ||
require.NoError(t, err) | ||
assert.Len(t, services, 3, "Different number of services") | ||
for i := 0; i < 3; i++ { | ||
assert.Equal(t, services[i].ServiceName, InvalidServices[i]) | ||
assert.NotNil(t, services[i].Error) | ||
} | ||
} | ||
|
||
func TestGather(t *testing.T) { | ||
ws := &Win_Services{KnownServices} | ||
assert.Len(t, ws.ServiceNames, 2, "Different number of services") | ||
var acc testutil.Accumulator | ||
require.NoError(t, ws.Gather(&acc)) | ||
assert.Len(t, acc.Errors, 0, "There should be no errors after gather") | ||
|
||
for i := 0; i < 2; i++ { | ||
fields := make(map[string]interface{}) | ||
tags := make(map[string]string) | ||
si := getServiceInfo(KnownServices[i]) | ||
fields["state"] = int(si.State) | ||
fields["startup_mode"] = int(si.StartUpMode) | ||
tags["service_name"] = si.ServiceName | ||
tags["display_name"] = si.DisplayName | ||
acc.AssertContainsTaggedFields(t, "win_services", fields, tags) | ||
} | ||
|
||
} | ||
|
||
func TestGatherErrors(t *testing.T) { | ||
ws := &Win_Services{InvalidServices} | ||
assert.Len(t, ws.ServiceNames, 3, "Different number of services") | ||
var acc testutil.Accumulator | ||
require.NoError(t, ws.Gather(&acc)) | ||
assert.Len(t, acc.Errors, 3, "There should be 3 errors after gather") | ||
} | ||
|
||
func getServiceInfo(srvName string) *ServiceInfo { | ||
|
||
scmgr, err := mgr.Connect() | ||
if err != nil { | ||
return nil | ||
} | ||
defer scmgr.Disconnect() | ||
|
||
srv, err := scmgr.OpenService(srvName) | ||
if err != nil { | ||
return nil | ||
} | ||
var si ServiceInfo | ||
si.ServiceName = srvName | ||
srvStatus, err := srv.Query() | ||
if err == nil { | ||
si.State = int(srvStatus.State) | ||
} else { | ||
si.Error = err | ||
} | ||
|
||
srvCfg, err := srv.Config() | ||
if err == nil { | ||
si.DisplayName = srvCfg.DisplayName | ||
si.StartUpMode = int(srvCfg.StartType) | ||
} else { | ||
si.Error = err | ||
} | ||
srv.Close() | ||
return &si | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can remove everything in this section up through this line, so the only items are the example line protocol
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, np.
I made it this way cause it's done so in the example plugins you advised me (ngnix, kapacitor).