Skip to content
Permalink
master
Switch branches/tags
Go to file
 
 
Cannot retrieve contributors at this time
219 lines (191 sloc) 7.49 KB
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
*/
/* Dictionary named for the diagnoses in this rule file. */
#pragma dictionary "DISK"
/* P abbreviates the "disk" path used by every event and propagation below. */
#define P disk
/* Declare the disk path as both an FRU and an ASRU. */
fru P;
asru P;
/*
 * Overall comments for this file:
 * <disk-as-detector> The disk-as-detector DE provides the mapping between
 * ereports generated by the kernel disk driver, sd(7D), and the resulting
 * faults.
 */
/*
 * SERD engine for media error (merr) fault propagation.
 *
 * This strategy is designed to give a file system, such as ZFS, the
 * ability to attempt data recovery/relocation without faulting a disk.
 * It depends on the file system retrying the same LBA to trigger a
 * fault when recovery/relocation is not possible.
 *
 * The engine is allowed to propagate an error at most once per minute.
 * If 2 or more errors are still seen for the same LBA within 24 hours,
 * there is a fault.
 *
 * NOTE(review): N=1 is read here as "more than one event within T", which
 * is what makes it agree with the "2 or more" wording above -- confirm
 * against the Eversholt SERD semantics before changing N.
 */
engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;
/*
 * disk-as-detector: fault events.
 *
 * The media error (merr) fault is attached to the merr SERD engine;
 * the derr fault is not attached to any engine.
 */
event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
    engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;
/*
* The uderr fault will be defined at some future time.
* event fault.io.scsi.cmd.disk.dev.uderr@P;
*/
/*
 * disk-as-detector: upset events.
 *
 * NOTE: for now, an upset is simply the way a discard is implemented.
 * The declarations are listed in the same order as the corresponding
 * ereports.
 */
event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
event upset.io.scsi.cmd.disk.dev.serr@P;
event upset.io.scsi.cmd.disk.dev.uderr@P;
event upset.io.scsi.cmd.disk.recovered@P;
event upset.io.scsi.cmd.disk.tran@P;
/*
 * disk-as-detector: ereports from the kernel.
 *
 * The topology is not known for every scsi disk, yet the kernel always
 * generates ereport telemetry as if it were.  Each ereport is therefore
 * declared with 'discard_if_config_unknown=1', which lets ereports against
 * things with unknown topology be silently discarded.  The ereport data is
 * logged either way and can be viewed with 'fmdump -eV'.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.derr@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.rqs.merr@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.serr@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.uderr@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.recovered@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.tran@P,
    discard_if_config_unknown=1;
/*
 * For some ereports the 'driver-assessment' member of the ereport payload
 * decides between fault and upset, by way of propagation constraints:
 *
 *	DRIVER_ASSESSMENT_FATAL		payload "driver-assessment" contains
 *					"fatal"
 *	DRIVER_ASSESSMENT_NONFATAL	the negation of the above
 */
#define	DRIVER_ASSESSMENT_FATAL	\
	(payloadprop_contains("driver-assessment", "fatal"))
#define	DRIVER_ASSESSMENT_NONFATAL	(!DRIVER_ASSESSMENT_FATAL)
/*
 * disk-as-detector: propagations from faults (taken only when
 * DRIVER_ASSESSMENT_FATAL holds).
 *
 * Each propagation copies members of the ereport payload into the fault
 * payload so that the fault carries the details of the error.  The
 * members that may be needed are:
 *	fault.io.scsi.cmd.disk.dev.rqs.derr
 *		op_code, key, asc, ascq
 *	fault.io.scsi.cmd.disk.dev.rqs.merr
 *		op_code, key, asc, ascq, lba
 * Of these, op_code is not currently copied by either propagation.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.derr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P {
	DRIVER_ASSESSMENT_FATAL &&
	setpayloadprop("key", payloadprop("key")) &&
	setpayloadprop("asc", payloadprop("asc")) &&
	setpayloadprop("ascq", payloadprop("ascq")) };
/*
 * setserdsuffix() is handed the LBA from the ereport, so the SERD engine
 * only triggers if the fault recurs on the same LBA.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P {
	DRIVER_ASSESSMENT_FATAL &&
	setserdsuffix(payloadprop("lba")) &&
	setpayloadprop("key", payloadprop("key")) &&
	setpayloadprop("asc", payloadprop("asc")) &&
	setpayloadprop("ascq", payloadprop("ascq")) &&
	setpayloadprop("lba", payloadprop("lba")) };
/*
 * NOTE: the propagation below is an eversholt "may" propagation.
 * The dummy ereport need never exist; it is only a way of making the
 * diagnosis wait until the 'within' time on that ereport has run out.
 * Once it has, the diagnosis continues even though the dummy ereport
 * never occurred.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P { within(60s) };
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;
/*
* The uderr fault will be propagated at some future time.
* prop fault.io.scsi.cmd.disk.dev.uderr@P->
* ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
*/
/*
 * disk-as-detector: propagations from upsets (taken only when
 * DRIVER_ASSESSMENT_NONFATAL holds).  These are the non-fatal
 * counterparts of the derr and merr fault propagations.
 */
prop upset.io.scsi.cmd.disk.dev.rqs.derr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P {
	DRIVER_ASSESSMENT_NONFATAL };
prop upset.io.scsi.cmd.disk.dev.rqs.merr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P {
	DRIVER_ASSESSMENT_NONFATAL };
/*
 * disk-as-detector: propagations from upsets that do not depend on the
 * driver-assessment.  Each of these ereports maps to its upset with no
 * constraint.
 */
prop upset.io.scsi.cmd.disk.dev.serr@P ->
    ereport.io.scsi.cmd.disk.dev.serr@P;
prop upset.io.scsi.cmd.disk.dev.uderr@P ->
    ereport.io.scsi.cmd.disk.dev.uderr@P;
prop upset.io.scsi.cmd.disk.recovered@P ->
    ereport.io.scsi.cmd.disk.recovered@P;
prop upset.io.scsi.cmd.disk.tran@P ->
    ereport.io.scsi.cmd.disk.tran@P;
/*
* --------------------------------------
* The remainder of this file contains rules associated with the operation of
* cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
*
* The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
* generated by the disk-transport fmd module, and the resulting faults.
*/
/*
 * Fault events.
 *
 * Each property is given exactly once per declaration; the
 * predictive-failure and self-test-failure declarations previously
 * listed FITrate=10 twice.
 *
 * NOTE(review): unlike the other three faults, ssm-wearout declares no
 * FITrate, FRU or ASRU properties -- confirm that this is intentional.
 */
event fault.io.disk.over-temperature@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.predictive-failure@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.self-test-failure@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.ssm-wearout@P;
/*
 * ereports.
 *
 * One ereport per fault.io.disk.* fault declared in this part of the
 * file.  Unlike the ereport.io.scsi.cmd.disk.* declarations, these do not
 * set discard_if_config_unknown.
 */
event ereport.io.scsi.disk.over-temperature@P;
event ereport.io.scsi.disk.predictive-failure@P;
event ereport.io.scsi.disk.self-test-failure@P;
event ereport.io.scsi.disk.ssm-wearout@P;
/*
 * Propagations: each fault.io.disk.* fault maps to the ereport with the
 * matching name.  They are listed in the same order as the fault
 * declarations.
 */
prop fault.io.disk.over-temperature@P ->
    ereport.io.scsi.disk.over-temperature@P;
/*
 * Copy the sense code and its qualifier from the ereport into the fault
 * payload under the shorter names "asc" and "ascq".
 */
prop fault.io.disk.predictive-failure@P ->
    ereport.io.scsi.disk.predictive-failure@P {
	setpayloadprop("asc",
	    payloadprop("additional-sense-code")) &&
	setpayloadprop("ascq",
	    payloadprop("additional-sense-code-qualifier")) };
prop fault.io.disk.self-test-failure@P ->
    ereport.io.scsi.disk.self-test-failure@P;
/*
 * Copy the current and threshold wearout values from the ereport into the
 * fault payload under the "*-wearout-percentage" names.
 */
prop fault.io.disk.ssm-wearout@P ->
    ereport.io.scsi.disk.ssm-wearout@P {
	setpayloadprop("current-wearout-percentage",
	    payloadprop("current-ssm-wearout")) &&
	setpayloadprop("threshold-wearout-percentage",
	    payloadprop("threshold-ssm-wearout")) };