/*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*/
/*
* Copyright (c) 2018, Joyent, Inc.
*/
/*
* vnd - virtual (machine) networking datapath
*
* vnd's purpose is to provide a highly performant data path for Layer 2 network
 * traffic and exist side by side with an active IP netstack, each servicing
 * different datalinks. vnd provides many of the same capabilities as the
 * current TCP/IP stack does, as well as some specific to Layer 2. Specifically:
*
* o Use of the DLD fastpath
* o Packet capture hooks
* o Ability to use hardware capabilities
* o Useful interfaces for handling multiple frames
*
* The following image shows where vnd fits into today's networking stack:
*
* +---------+----------+----------+
* | libdlpi | libvnd | libsocket|
* +---------+----------+----------+
* | · · VFS |
* | VFS · VFS +----------+
* | · | sockfs |
* +---------+----------+----------+
* | | VND | IP |
* | +----------+----------+
* | DLD/DLS |
* +-------------------------------+
* | MAC |
* +-------------------------------+
* | GLDv3 |
* +-------------------------------+
*
* -----------------------------------------
* A Tale of Two Devices - DDI Device Basics
* -----------------------------------------
*
* vnd presents itself to userland as a character device; however, it also is a
* STREAMS device so that it can interface with dld and the rest of the
 * networking stack. Users never interface with the STREAMS devices directly;
 * they are purely an implementation detail of vnd. Opening the STREAMS device
 * requires kcred and as such userland cannot interact with it or push it onto
* the stream head.
*
* The main vnd character device, /dev/vnd/ctl, is a self-cloning device. Every
* clone gets its own minor number; however, minor nodes are not created in the
* devices tree for these instances. In this state a user may do two different
* things. They may issue ioctls that affect global state or they may issue
* ioctls that try to attach it to a given datalink. Once a minor device has
* been attached to a datalink, all operations on it are scoped to that context,
* therefore subsequent global operations are not permitted.
*
* A given device can be linked into the /devices and /dev name space via a link
* ioctl. That ioctl causes a minor node to be created in /devices and then it
* will also appear under /dev/vnd/ due to vnd's sdev plugin. This is similar
* to, but simpler than, IP's persistence mechanism.
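 *
 * As an illustration, the userland flow is roughly the sketch below. The ioctl
 * names and the attach_arg/link_arg structures are placeholders for the
 * definitions in sys/vnd.h rather than the literal API; only /dev/vnd/ctl is
 * meant literally:
 *
 *     int fd = open("/dev/vnd/ctl", O_RDWR);
 *
 *     bind this minor to a datalink; global ioctls stop working afterwards:
 *     (void) ioctl(fd, VND_IOC_ATTACH, &attach_arg);
 *
 *     optionally name the device so that it shows up under /dev/vnd:
 *     (void) ioctl(fd, VND_IOC_LINK, &link_arg);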
*
* ---------------------
* Binding to a datalink
* ---------------------
*
* Datalinks are backed by the dld (datalink device) and dls (datalink services)
* drivers. These drivers provide a STREAMS device for datalinks on the system
* which are exposed through /dev/net. Userland generally manipulates datalinks
* through libdlpi. When an IP interface is being plumbed up what actually
* happens is that someone does a dlpi_open(3DLPI) of the underlying datalink
 * and then pushes on the ip STREAMS module with an I_PUSH ioctl. Modules may
 * then negotiate with dld and dls to obtain access to various capabilities
* and fast paths via a series of STREAMS messages.
*
* In vnd, we do the same thing, but we leave our STREAMS module as an
* implementation detail of the system. We don't want users to be able to
* arbitrarily push vnd STREAMS module onto any stream, so we explicitly require
* kcred to manipulate it. Thus, when a user issues a request to attach a
* datalink to a minor instance of the character device, that vnd minor instance
* itself does a layered open (ldi_open_by_name(9F)) of the specified datalink.
 * vnd does that open using the passed-in credentials from the ioctl, not kcred.
 * This ensures that users who don't have permission to open the device cannot.
 * Once that's been opened, we push on the vnd STREAMS module.
*
 * Once the vnd STREAMS instance has been created for this device, that is, once
 * the I_PUSH ioctl returns, we explicitly send a STREAMS ioctl
* (VND_STRIOC_ASSOCIATE) to associate the vnd STREAMS and character devices.
* This association begins the STREAM device's initialization. We start up an
* asynchronous state machine that takes care of all the different aspects of
* plumbing up the device with dld and dls and enabling the MAC fast path. We
* need to guarantee to consumers of the character device that by the time their
* ioctl returns, the data path has been fully initialized.
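 *
 * Ignoring error handling, the kernel side of that sequence looks something
 * like the sketch below. The datalink path and the surrounding locals (cr,
 * li, vdp, nsp) are illustrative; the real logic lives in the attach ioctl
 * path later in this file:
 *
 *     ldi_handle_t lh;
 *     vnd_strioc_associate_t vsa;
 *     int rval;
 *
 *     open the datalink with the caller's credentials, not kcred:
 *     (void) ldi_open_by_name("/dev/net/net0", FREAD | FWRITE, cr, &lh, li);
 *
 *     push our kcred-only STREAMS module onto that stream:
 *     (void) ldi_ioctl(lh, I_PUSH, (intptr_t)"vnd", FKIOCTL, kcred, &rval);
 *
 *     tie the new STREAMS instance to this minor; the association kicks off
 *     the state machine described below:
 *     vsa.vsa_minor = vdp->vdd_minor;
 *     vsa.vsa_nsid = nsp->vpnd_nsid;
 *     (void) ldi_ioctl(lh, VND_STRIOC_ASSOCIATE, (intptr_t)&vsa, FKIOCTL,
 *         kcred, &rval);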
*
* The state progression is fairly linear. There are two general steady states.
 * The first is VNS_S_ONLINE, which means that everything is jacked up and good
 * to go. The alternative is VNS_S_ZOMBIE, which means that the STREAMS device
* encountered an error or we have finished tearing it down and the character
* device can clean it up. The following is our state progression and the
* meaning of each state:
*
* |
* |
* V
* +---------------+
* | VNS_S_INITIAL | This is our initial state. Every
* +---------------+ vnd STREAMS device starts here.
* | While in this state, only dlpi
* | M_PROTO and M_IOCTL messages can be
* | sent or received. All STREAMS based
* | data messages are dropped.
* | We transition out of this state by
* | sending a DL_INFO_REQ to obtain
* | information about the underlying
* | link.
* v
* +-----------------+
* +--<-| VNS_S_INFO_SENT | In this state, we verify and
* | +-----------------+ record information about the
* | | underlying device. If the device is
* | | not suitable, eg. not of type
* v | DL_ETHER, then we immediately
* | | become a ZOMBIE. To leave this
* | | state we request exclusive active
* | | access to the device via
* v | DL_EXCLUSIVE_REQ.
* | v
* | +----------------------+
* +--<-| VNS_S_EXCLUSIVE_SENT | In this state, we verify whether
* | +----------------------+ or not we were able to obtain
* | | | exclusive access to the device. If
* | | | we were not able to, then we leave,
* v | | as that means that something like
* | | | IP is already plumbed up on top of
* | | | the datalink. We leave this state
* | | | by progressing through to the
* | | | appropriate DLPI primitive, either
* v | | DLPI_ATTACH_REQ or DLPI_BIND_REQ
* | | | depending on the style of the
* | | | datalink.
* | | v
* | | +-------------------+
* +------ |--<-| VNS_S_ATTACH_SENT | In this state, we verify we were
* | | +-------------------+ able to perform a standard DLPI
* | | | attach and if so, go ahead and
* v | | send a DLPI_BIND_REQ.
* | v v
* | +-------------------+
* +--<-| VNS_S_BIND_SENT | In this state we see the result of
* | +-------------------+ our attempt to bind to PPA 0 of the
* v | underlying device. Because we're
* | | trying to be a layer two datapath,
* | | the specific attachment point isn't
* | | too important as we're going to
* v | have to enable promiscuous mode. We
* | | transition out of this by sending
* | | our first of three promiscuous mode
* | | requests.
* v v
* | +------------------------+
* +--<-| VNS_S_SAP_PROMISC_SENT | In this state we verify that we
* | +------------------------+ were able to enable promiscuous
* | | mode at the physical level. We
* | | transition out of this by enabling
* | | multicast and broadcast promiscuous
* v | mode.
* | v
* | +--------------------------+
* +--<-| VNS_S_MULTI_PROMISC_SENT | In this state we verify that we
* | +--------------------------+ have enabled DL_PROMISC_MULTI and
* v | move onto the second promiscuous
* | | mode request.
* | v
* | +----------------------------+
* +--<-| VNS_S_RX_ONLY_PROMISC_SENT | In this state we verify that we
* | +----------------------------+ enabled RX_ONLY promiscuous mode.
* | | We specifically do this as we don't
* v | want to receive our own traffic
* | | that we'll send out. We leave this
* | | state by enabling the final flag
* | | DL_PROMISC_FIXUPS.
* | v
* | +--------------------------+
* +--<-| VNS_S_FIXUP_PROMISC_SENT | In this state we verify that we
* | +--------------------------+ enabled FIXUP promiscuous mode.
* | | We specifically do this as we need
* v | to ensure that traffic which is
* | | received by being looped back to us
* | | correctly has checksums fixed. We
* | | leave this state by requesting the
* | | dld/dls capabilities that we can
* v | process.
* | v
* | +--------------------+
* +--<-| VNS_S_CAPAB_Q_SENT | We loop over the set of
* | +--------------------+ capabilities that dld advertised
* | | and enable the ones that currently
* v | support for use. See the section
* | | later on regarding capabilities
* | | for more information. We leave this
* | | state by sending an enable request.
* v v
* | +--------------------+
* +--<-| VNS_S_CAPAB_E_SENT | Here we finish all capability
* | +--------------------+ initialization. Once finished, we
* | | transition to the next state. If
* v | the dld fast path is not available,
* | | we become a zombie.
* | v
* | +--------------+
* | | VNS_S_ONLINE | This is a vnd STREAMS device's
* | +--------------+ steady state. It will normally
* | | reside in this state while it is in
* | | active use. It will only transition
* v | to the next state when the STREAMS
* | | device is closed by the character
* | | device. In this state, all data
* | | flows over the dld fast path.
* | v
* | +---------------------+
* +--->| VNS_S_SHUTTING_DOWN | This vnd state takes care of
* | +---------------------+ disabling capabilities and
* | | flushing all data. At this point
* | | any additional data that we receive
* | | will be dropped. We leave this
* v | state by trying to remove multicast
* | | promiscuity.
* | |
* | v
* | +---------------------------------+
* +-->| VNS_S_MULTICAST_PROMISCOFF_SENT | In this state, we check if we have
* | +---------------------------------+ successfully removed multicast
* | | promiscuous mode. If we have
* | | failed, we still carry on but only
* | | warn. We leave this state by trying
* | | to disable SAP level promiscuous
* | | mode.
* | v
* | +---------------------------+
* +-->| VNS_S_SAP_PROMISCOFF_SENT | In this state, we check if we have
* | +---------------------------+ successfully removed SAP level
* | | promiscuous mode. If we have
* | | failed, we still carry on but only
* | | warn. Note that we don't worry
* | | about either of
* | | DL_PROMISC_FIXUPS or
* | | DL_PROMISC_RX_ONLY. If these are
* | | the only two entries left, then we
* | | should have anything that MAC is
* | | doing for us at this point,
* | | therefore it's safe for us to
* | | proceed to unbind, which is how we
* | | leave this state via a
* | v DL_UNBIND_REQ.
* | +-------------------+
* +--->| VNS_S_UNBIND_SENT | Here, we check how the unbind
* | +-------------------+ request went. Regardless of its
* | | success, we always transition to
* | | a zombie state.
* | v
* | +--------------+
* +--->| VNS_S_ZOMBIE | In this state, the vnd STREAMS
* +--------------+ device is waiting to finish being
* reaped. Because we have no more
* ways to receive data it should be
* safe to destroy all remaining data
* structures.
*
* If the stream association fails for any reason the state machine reaches
* VNS_S_ZOMBIE. A more detailed vnd_errno_t will propagate back through the
* STREAMS ioctl to the character device. That will fail the user ioctl and
* propagate the vnd_errno_t back to userland. If, on the other hand, the
* association succeeds, then the vnd STREAMS device will be fully plumbed up
* and ready to transmit and receive message blocks. Consumers will be able to
* start using the other cbops(9E) entry points once the attach has fully
* finished, which will occur after the original user attach ioctl to the
* character device returns.
*
* It's quite important that we end up sending the full series of STREAMS
* messages when tearing down. While it's tempting to say that we should just
* rely on the STREAMS device being closed to properly ensure that we have no
* more additional data, that's not sufficient due to our use of direct
* callbacks. DLS does not ensure that by the time we change the direct
* callback (vnd_mac_input) that all callers to it will have been quiesced.
* However, it does guarantee that if we disable promiscuous mode ourselves and
* we turn off the main data path via DL_UNBIND_REQ that it will work.
* Therefore, we make sure to do this ourselves rather than letting DLS/DLD do
* it as part of tearing down the STREAMS device. This ensures that we'll
* quiesce all data before we destroy our data structures and thus we should
* eliminate the race in changing the data function.
*
* --------------------
* General Architecture
* --------------------
*
* There are several different devices and structures in the vnd driver. There
* is a per-netstack component, pieces related to the character device that
* consumers see, the internal STREAMS device state, and the data queues
* themselves. The following ASCII art picture describes their relationships and
* some of the major pieces of data that contain them. These are not exhaustive,
* e.g. synchronization primitives are left out.
*
* +----------------+ +-----------------+
* | global | | global |
* | device list | | netstack list |
* | vnd_dev_list | | vnd_nsd_list |
* +----------------+ +-----------------+
* | |
* | v
* | +-------------------+ +-------------------+
* | | per-netstack data | ---> | per-netstack data | --> ...
* | | vnd_pnsd_t | | vnd_pnsd_t |
* | | | +-------------------+
* | | |
* | | nestackid_t ---+----> Netstack ID
* | | vnd_pnsd_flags_t -+----> Status flags
* | | zoneid_t ---+----> Zone ID for this netstack
* | | hook_family_t ---+----> VND IPv4 Hooks
* | | hook_family_t ---+----> VND IPv6 Hooks
* | | list_t ----+ |
* | +------------+------+
* | |
* | v
* | +------------------+ +------------------+
* | | character device | ---> | character device | -> ...
* +---------->| vnd_dev_t | | vnd_dev_t |
* | | +------------------+
* | |
* | minor_t ---+--> device minor number
* | ldi_handle_t ---+--> handle to /dev/net/%datalink
* | vnd_dev_flags_t -+--> device flags, non blocking, etc.
* | char[] ---+--> name if linked
* | vnd_str_t * -+ |
* +--------------+---+
* |
* v
* +-------------------------+
* | STREAMS device |
* | vnd_str_t |
* | |
* | vnd_str_state_t ---+---> State machine state
* | gsqueue_t * ---+---> mblk_t Serialization queue
* | vnd_str_stat_t ---+---> per-device kstats
* | vnd_str_capab_t ---+----------------------------+
* | vnd_data_queue_t ---+ | |
* | vnd_data_queue_t -+ | | v
* +-------------------+-+---+ +---------------------+
* | | | Stream capabilities |
* | | | vnd_str_capab_t |
* | | | |
* | | supported caps <--+-- vnd_capab_flags_t |
* | | dld cap handle <--+-- void * |
* | | direct tx func <--+-- vnd_dld_tx_t |
* | | +---------------------+
* | |
* +----------------+ +-------------+
* | |
* v v
* +-------------------+ +-------------------+
* | Read data queue | | Write data queue |
* | vnd_data_queue_t | | vnd_data_queue_t |
* | | | |
* | size_t ----+--> Current size | size_t ----+--> Current size
* | size_t ----+--> Max size | size_t ----+--> Max size
* | mblk_t * ----+--> Queue head | mblk_t * ----+--> Queue head
* | mblk_t * ----+--> Queue tail | mblk_t * ----+--> Queue tail
* +-------------------+ +-------------------+
*
*
* Globally, we maintain two lists. One list contains all of the character
* device soft states. The other maintains a list of all our netstack soft
* states. Each netstack maintains a list of active devices that have been
* associated with a datalink in its netstack.
*
* Recall that a given minor instance of the character device exists in one of
* two modes. It can either be a cloned open of /dev/vnd/ctl, the control node,
* or it can be associated with a given datalink. When minor instances are in
* the former state, they do not exist in a given vnd_pnsd_t's list of devices.
* As part of attaching to a datalink, the given vnd_dev_t will be inserted into
* the appropriate vnd_pnsd_t. In addition, this will cause a STREAMS device, a
* vnd_str_t, to be created and associated to a vnd_dev_t.
*
* The character device, and its vnd_dev_t, is the interface to the rest of the
* system. The vnd_dev_t keeps track of various aspects like whether various
* operations, such as read, write and the frameio ioctls, are considered
* blocking or non-blocking in the O_NONBLOCK sense. It also is responsible for
* keeping track of things like the name of the device, if any, in /dev. The
 * vnd_str_t, on the other hand, manages aspects like buffer sizes and the actual
* data queues. However, ioctls that manipulate these properties all go through
* the vnd_dev_t to its associated vnd_str_t.
*
* Each of the STREAMS devices, the vnd_str_t, maintains two data queues. One
* for frames to transmit (write queue) and one for frames received (read
* queue). These data queues have a maximum size and attempting to add data
* beyond that maximum size will result in data being dropped. The sizes are
* configurable via ioctls VND_IOC_SETTXBUF, VND_IOC_SETRXBUF. Data either sits
* in those buffers or has a reservation in those buffers while they are in vnd
* and waiting to be consumed by the user or by mac.
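 *
 * For example, a consumer that wants a deeper transmit buffer than the default
 * would issue something like the following from userland, where fd is an
 * attached vnd descriptor and the argument structure is a stand-in for the
 * real definition in sys/vnd.h:
 *
 *     struct vnd_buf_arg vb;
 *     vb.vb_size = 1024 * 1024;
 *     (void) ioctl(fd, VND_IOC_SETTXBUF, &vb);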
*
* Finally, the vnd_str_t also has a vnd_str_capab_t which we use to manage the
* available, negotiated, and currently active features.
*
* ----------------------
* Data Path and gsqueues
* ----------------------
*
* There's a lot of plumbing in vnd to get to the point where we can send data,
* but vnd's bread and butter is the data path, so it's worth diving into it in
* more detail. Data enters and exits the system from two ends.
*
* The first end is the vnd consumer. This comes in the form of read and write
* system calls as well as the frame I/O ioctls. The read and write system calls
* operate on a single frame at a time. Think of a frame as a single message
* that has come in off the wire, which may itself comprise multiple mblk_t's
* linked together in the kernel. readv(2) and writev(2) have the same
* limitations as read(2) and write(2). We enforce this as the system is
* required to fill up every uio(9S) buffer before moving onto the next one.
 * This means that if you have an MTU-sized buffer and two frames come in which
 * are less than half of the MTU, they would have to share the given iovec. Even
 * if we didn't want to do this, we have no way of informing the supplier of the
 * iovecs that they were only partially filled or where one frame ends and
 * another begins. That's life; as such we have frame I/O, which solves this
* problem. It allows for multiple frames to be consumed as well as for frames
* to be broken down into multiple vector components.
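 *
 * As a result, a minimal consumer that is content with one frame per system
 * call can simply loop over read(2) along these lines, where process_frame()
 * is a placeholder for whatever the consumer does with each frame:
 *
 *     char buf[1518];
 *     ssize_t n;
 *
 *     while ((n = read(fd, buf, sizeof (buf))) > 0)
 *         process_frame(buf, (size_t)n);
 *
 * Consumers that care about the per-frame system call overhead use the frame
 * I/O ioctls instead.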
*
* The second end is the mac direct calls. As part of negotiating capabilities
* via dld, we give mac a function of ours to call when packets are received
* [vnd_mac_input()] and a callback to indicate that flow has been restored
* [vnd_mac_flow_control()]. In turn, we also get a function pointer that we can
* transmit data with. As part of the contract with mac, mac is allowed to flow
* control us by returning a cookie to the transmit function. When that happens,
* all outbound traffic is halted until our callback function is called and we
* can schedule drains.
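 *
 * In terms of the capability types defined later in this file, a transmit
 * through the negotiated direct path conceptually looks like the sketch below;
 * the zero hint and flags arguments are illustrative rather than a statement
 * of what mac requires:
 *
 *     vnd_mac_cookie_t cookie;
 *
 *     cookie = vsp->vns_caps.vsc_tx_f(vsp->vns_caps.vsc_tx_hdl, mp, 0, 0);
 *     if (cookie != 0) {
 *         record the cookie, set VNS_F_FLOW_CONTROLLED, and stop transmitting
 *         until vnd_mac_flow_control() fires for that cookie
 *     }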
*
* It's worth looking at these in further detail. We'll start with the rx path.
*
*
* |
* * . . . packets from gld
* |
* v
* +-------------+
* | mac |
* +-------------+
* |
* v
* +-------------+
* | dld |
* +-------------+
* |
* * . . . dld direct callback
* |
* v
* +---------------+
* | vnd_mac_input |
* +---------------+
* |
* v
* +---------+ +-------------+
* | dropped |<--*---------| vnd_hooks |
* | by | . +-------------+
* | hooks | . drop probe |
* +---------+ kstat bump * . . . Do we have free
* | buffer space?
* |
* no . | . yes
* . + .
* +---*--+------*-------+
* | |
* * . . drop probe * . . recv probe
* | kstat bump | kstat bump
* v |
* +---------+ * . . fire pollin
* | freemsg | v
* +---------+ +-----------------------+
* | vnd_str_t`vns_dq_read |
* +-----------------------+
* ^ ^
* +----------+ | | +---------+
* | read(9E) |-->-+ +--<--| frameio |
* +----------+ +---------+
*
* The rx path is rather linear. Packets come into us from mac. We always run
* them through the various hooks, and if they come out of that, we inspect the
* read data queue. If there is not enough space for a packet, we drop it.
* Otherwise, we append it to the data queue, and fire read notifications
 * targeting anyone polling or doing blocking I/O on this device. Those
* consumers then drain the head of the data queue.
*
* The tx path is more complicated due to mac flow control. After any call into
* mac, we may have to potentially suspend writes and buffer data for an
* arbitrary amount of time. As such, we need to carefully track the total
* amount of outstanding data so that we don't waste kernel memory. This is
* further complicated by the fact that mac will asynchronously tell us when our
* flow has been resumed.
*
* For data to be able to enter the system, it needs to be able to take a
* reservation from the write data queue. Once the reservation has been
* obtained, we enter the gsqueue so that we can actually append it. We use
* gsqueues (serialization queues) to ensure that packets are manipulated in
* order as we deal with the draining and appending packets. We also leverage
 * its worker thread to help us do draining after mac has restored our flow.
*
* The following image describes the flow:
*
* +-----------+ +--------------+ +-------------------------+ +------+
* | write(9E) |-->| Space in the |--*--->| gsqueue_enter_one() |-->| Done |
* | frameio | | write queue? | . | +->vnd_squeue_tx_append | +------+
* +-----------+ +--------------+ . +-------------------------+
* | ^ .
* | | . reserve space from gsqueue
* | | |
* queue . . . * | space v
* full | * . . . avail +------------------------+
* v | | vnd_squeue_tx_append() |
* +--------+ +------------+ +------------------------+
* | EAGAIN |<--*------| Non-block? |<-+ |
* +--------+ . +------------+ | v
* . yes v | wait +--------------+
* no . .* * . . for | append chain |
* +----+ space | to outgoing |
* | mblk chain |
* from gsqueue +--------------+
* | |
* | +-------------------------------------------------+
* | |
* | | yes . . .
* v v .
* +-----------------------+ +--------------+ . +------+
* | vnd_squeue_tx_drain() |--->| mac blocked? |----*---->| Done |
* +-----------------------+ +--------------+ +------+
* | |
* +---------------------------------|---------------------+
* | | tx |
* | no . . * queue . . *
* | flow controlled . | empty * . fire pollout
* | . v | if mblk_t's
* +-------------+ . +---------------------+ | sent
* | set blocked |<----*------| vnd_squeue_tx_one() |--------^-------+
* | flags | +---------------------+ |
* +-------------+ More data | | | More data |
* and limit ^ v * . . and limit ^
* not reached . . * | | reached |
* +----+ | |
* v |
* +----------+ +-------------+ +---------------------------+
* | mac flow |--------->| remove mac |--->| gsqueue_enter_one() with |
* | control | | block flags | | vnd_squeue_tx_drain() and |
* | callback | +-------------+ | GSQUEUE_FILL flag, iff |
* +----------+ | not already scheduled |
* +---------------------------+
*
* The final path taken for a given write(9E)/frameio ioctl depends on whether
* or not the vnd_dev_t is non-blocking. That controls the initial path of
* trying to take a reservation in write data queue. If the device is in
* non-blocking mode, we'll return EAGAIN when there is not enough space
* available, otherwise, the calling thread blocks on the data queue.
*
* Today when we call into vnd_squeue_tx_drain() we will not try to drain the
* entire queue, as that could be quite large and we don't want to necessarily
 * tie up the thread that's doing the drain until it has finished. Not only
* could more data be coming in, but the draining thread could be a userland
* thread that has more work to do. We have two limits today. There is an upper
* bound on the total amount of data and the total number of mblk_t chains. If
* we hit either limit, then we will schedule another drain in the gsqueue and
* go from there.
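 *
 * When one of those limits trips, rescheduling the drain amounts to
 * re-entering the gsqueue with the pre-allocated drain block, roughly as in
 * the sketch below, assuming the gsqueue_enter_one() calling convention from
 * sys/gsqueue.h:
 *
 *     mutex_enter(&vsp->vns_lock);
 *     if (!(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED)) {
 *         vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED;
 *         gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_drainblk,
 *             vnd_squeue_tx_drain, vsp, GSQUEUE_FILL,
 *             VND_SQUEUE_TAG_TX_DRAIN);
 *     }
 *     mutex_exit(&vsp->vns_lock);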
*
* It's worth taking some time to describe how we interact with gsqueues. vnd
* has a gsqueue_set_t for itself. It's important that it has its own set, as
* the profile of work that vnd does is different from other sub-systems in the
* kernel. When we open a STREAMS device in vnd_s_open, we get a random gsqueue.
 * Unlike TCP/IP, which uses a gsqueue per TCP connection, we end up
* maintaining one for a given device. Because of that, we want to use a
* pseudo-random one to try and spread out the load, and picking one at random
* is likely to be just as good as any fancy algorithm we might come up with,
* especially as any two devices could have radically different transmit
* profiles.
*
* While some of the write path may seem complicated, it does allow us to
* maintain an important property. Once we have acknowledged a write(9E) or
* frameio ioctl, we will not drop the packet, excepting something like ipf via
* the firewall hooks.
*
* There is one other source of flow control that can exist in the system which
* is in the form of a barrier. The barrier is an internal mechanism used for
 * ensuring that a gsqueue is drained for a given device. We use this as part
* of tearing down. Specifically we disable the write path so nothing new can be
* inserted into the gsqueue and then insert a barrier block. Once the barrier
 * block comes out of the gsqueue, then we know that nothing else remains in the
 * gsqueue that could refer to the vnd_str_t that is being destroyed.
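 *
 * In sketch form, issuing a barrier looks roughly like the following; the
 * callback name and queueing flag are placeholders for the routine that marks
 * the barrier done and signals vns_barriercv from gsqueue context:
 *
 *     mutex_enter(&vsp->vns_lock);
 *     while (vsp->vns_flags & VNS_F_BARRIER)
 *         cv_wait(&vsp->vns_barriercv, &vsp->vns_lock);
 *     vsp->vns_flags |= VNS_F_BARRIER;
 *     mutex_exit(&vsp->vns_lock);
 *
 *     gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_barrierblk,
 *         vnd_barrier_done_cb, vsp, GSQUEUE_PROCESS,
 *         VND_SQUEUE_TAG_STRBARRIER);
 *
 *     mutex_enter(&vsp->vns_lock);
 *     while (!(vsp->vns_flags & VNS_F_BARRIER_DONE))
 *         cv_wait(&vsp->vns_barriercv, &vsp->vns_lock);
 *     vsp->vns_flags &= ~(VNS_F_BARRIER | VNS_F_BARRIER_DONE);
 *     mutex_exit(&vsp->vns_lock);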
*
* ---------------------
* vnd, zones, netstacks
* ---------------------
*
* vnd devices are scoped to datalinks and datalinks are scoped to a netstack.
* Because of that, vnd is also a netstack module. It registers with the
* netstack sub-system and receives callbacks every time a netstack is created,
 * shut down, or destroyed. The netstack callbacks drive the creation and
* destruction of the vnd_pnsd_t structures.
*
* Recall from the earlier architecture diagrams that every vnd device is scoped
* to a netstack and known about by a given vnd_pnsd_t. When that netstack is
* torn down, we also tear down any vnd devices that are hanging around. When
* the netstack is torn down, we know that any zones that are scoped to that
* netstack are being shut down and have no processes remaining. This is going
* to be the case whether they are shared or exclusive stack zones. We have to
* perform a careful dance.
*
* There are two different callbacks that happen on tear down, the first is a
* shutdown callback, the second is a destroy callback. When the shutdown
* callback is fired we need to prepare for the netstack to go away and ensure
* that nothing can continue to persist itself.
*
* More specifically, when we get notice of a stack being shutdown we first
* remove the netstack from the global netstack list to ensure that no one new
* can come in and find the netstack and get a reference to it. After that, we
* notify the neti hooks that they're going away. Once that's all done, we get
* to the heart of the matter.
*
* When shutting down there could be any number of outstanding contexts that
* have a reference on the vnd_pnsd_t and on the individual links. However, we
* know that no one new will be able to find the vnd_pnsd_t. To account for
* things that have existing references we mark the vnd_pnsd_t`vpnd_flags with
* VND_NS_CONDEMNED. This is checked by code paths that wish to append a device
* to the netstack's list. If this is set, then they must not append to it.
* Once this is set, we know that the netstack's list of devices can never grow,
* only shrink.
*
* Next, for each device we tag it with VND_D_ZONE_DYING. This indicates that
* the container for the device is being destroyed and that we should not allow
* additional references to the device to be created, whether via open, or
* linking. The presence of this bit also allows things like the list ioctl and
* sdev to know not to consider its existence. At the conclusion of this being
* set, we know that no one else should be able to obtain a new reference to the
* device.
*
* Once that has been set for all devices, we go through and remove any existing
* links that have been established in sdev. Because doing that may cause the
* final reference for the device to be dropped, which still has a reference to
* the netstack, we have to restart our walk due to dropped locks. We know that
* this walk will eventually complete because the device cannot be relinked and
* no new devices will be attached in this netstack due to VND_NS_CONDEMNED.
* Once that's finished, the shutdown callback returns.
*
* When we reach the destroy callback, we simply wait for references on the
* netstack to disappear. Because the zone has been shut down, all processes in
* it that have open references have been terminated and reaped. Any threads
* that are newly trying to reference it will fail. However, there is one thing
* that can halt this that we have no control over, which is the global zone
* holding open a reference to the device. In this case the zone halt will hang
 * in vnd_stack_destroy. Once the last reference is dropped we finish destroying
 * the netinfo hooks and free the vnd_pnsd_t.
*
* ----
* sdev
* ----
*
* vnd registers a sdev plugin which allows it to dynamically fill out /dev/vnd
* for both the global and non-global zones. In any given zone we always supply
* a control node via /dev/vnd/ctl. This is the self-cloning node. Each zone
* will also have an entry per-link in that zone under /dev/vnd/%datalink, eg.
* if a link was named net0, there would be a /dev/vnd/net0. The global zone can
* also see every link for every zone, ala /dev/net, under
* /dev/vnd/%zonename/%datalink, eg. if a zone named 'turin' had a vnd device
* named net0, the global zone would have /dev/vnd/turin/net0.
*
* The sdev plugin has three interfaces that it supplies back to sdev. One is to
* validate that a given node is still valid. The next is a callback from sdev
* to say that it is no longer using the node. The third and final one is from
* sdev where it asks us to fill a directory. All of the heavy lifting is done
 * in directory filling and in validation. We opt not to maintain a reference on
* the device while there is an sdev node present. This makes the removal of
* nodes much simpler and most of the possible failure modes shouldn't cause any
* real problems. For example, the open path has to handle both dev_t's which no
* longer exist and which are no longer linked.
*
* -----
* hooks
* -----
*
* Like IP, vnd sends all L3 packets through its firewall hooks. Currently vnd
 * provides these for IPv4 and IPv6 traffic. Each netstack provides these hooks
* in a minimal fashion. While we will allow traffic to be filtered through the
* hooks, we do not provide means for packet injection or additional inspection
* at this time. There are a total of four different events created:
*
* o IPv4 physical in
* o IPv4 physical out
* o IPv6 physical in
* o IPv6 physical out
*
* ---------------
* Synchronization
* ---------------
*
* To make our synchronization simpler, we've put more effort into making the
* metadata/setup paths do more work. That work allows the data paths to make
* assumptions around synchronization that simplify the general case. Each major
* structure, the vnd_pnsd_t, vnd_dev_t, vnd_str_t, and vnd_data_queue_t is
 * annotated with the protection that its members receive. The following
* annotations are used:
*
* A Atomics; these values are only modified using atomics values.
* Currently this only applies to kstat values.
* E Existence; no lock is needed to access this member, it does not
* change while the structure is valid.
* GL Global Lock; these members are protected by the global
* vnd_dev_lock.
* L Locked; access to the member is controlled by a lock that is in
* the structure.
* NSL netstack lock; this member is protected by the containing
* netstack. This only applies to the vnd_dev_t`vdd_nslink.
* X This member is special, and is discussed in this section.
*
* In addition to locking, we also have reference counts on the vnd_dev_t and
* the vnd_pnsd_t. The reference counts describe the lifetimes of the structure.
* With rare exception, once a reference count is decremented, the consumer
* should not assume that the data is valid any more. The only exception to this
* is the case where we're removing an extant reference count from a link into
* /devices or /dev. Reference counts are obtained on these structures as a part
* of looking them up.
*
* # Global Lock Ordering
* ######################
*
* The following is the order that you must take locks in vnd:
*
* 1) vnd`vnd_dev_lock
* 2) vnd_pnsd_t`vpnd_lock
 * 3) vnd_dev_t`vdd_lock
* 4) vnd_str_t`vns_lock
* 5) vnd_data_queue_t`vdq_lock
*
* One must adhere to the following rules:
*
 * o You must acquire a lower numbered lock before a higher numbered lock.
* o It is NOT legal to hold two locks of the same level concurrently, eg. you
 *   can not hold two different vnd_dev_t's vdd_lock at the same time.
* o You may release locks in any order.
* o If you release a lock, you must honor the locking rules before acquiring
* it again.
* o You should not hold any locks when calling any of the rele functions.
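 *
 * As an example of these rules, code that needs to walk from a netstack to one
 * of its devices and then into that device's stream takes the locks in
 * strictly increasing level (a sketch, not a real code path):
 *
 *     mutex_enter(&nsp->vpnd_lock);            level 2
 *     mutex_enter(&vdp->vdd_lock);             level 3
 *     mutex_enter(&vdp->vdd_str->vns_lock);    level 4
 *     ...
 *     mutex_exit(&vdp->vdd_str->vns_lock);
 *     mutex_exit(&vdp->vdd_lock);
 *     mutex_exit(&nsp->vpnd_lock);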
*
* # Special Considerations
* ########################
*
* While most of the locking is what's expected, it's worth going into the
* special nature that a few members hold. Today, only two structures have
* special considerations: the vnd_dev_t and the vnd_str_t. All members with
* special considerations have an additional annotation that describes how you
* should interact with it.
*
* vnd_dev_t: The vdd_nsd and vdd_cr are only valid when the minor node is
 * attached or in the process of attaching. If a given code path requires an
 * attached vnd_dev_t, eg. the data path and tear down path, then it
* is always legal to dereference that member without a lock held. When they are
* added to the system, they should be done under the vdd_lock and done as part
* of setting the VND_D_ATTACH_INFLIGHT flag. These should not change during the
* lifetime of the vnd_dev_t.
*
* vnd_dev_t: The vdd_ldih is similar to the vdd_nsd and vdd_cr, except that it
* always exists as it is a part of the structure. The only time that it's valid
* to be using it is during the attach path with the VND_D_ATTACH_INFLIGHT flag
* set or during tear down. Outside of those paths which are naturally
* serialized, there is no explicit locking around the member.
*
* vnd_str_t: The vns_dev and vns_nsd work in similar ways. They are not
* initially set as part of creating the structure, but are set as part of
* responding to the association ioctl. Anything in the data path or metadata
* path that requires association may assume that they exist, as we do not kick
* off the state machine until they're set.
*
* vnd_str_t: The vns_drainblk and vns_barrierblk are similarly special. The
* members are designed to be used as part of various operations with the
* gsqueues. A lock isn't needed to use them, but to work with them, the
* appropriate flag in the vnd_str_t`vns_flags must have been set by the current
* thread. Otherwise, it is always fair game to refer to their addresses. Their
* contents are ignored by vnd, but some members are manipulated by the gsqueue
* subsystem.
*/
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/modctl.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/open.h>
#include <sys/ddi.h>
#include <sys/ethernet.h>
#include <sys/stropts.h>
#include <sys/sunddi.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/ksynch.h>
#include <sys/taskq_impl.h>
#include <sys/sdt.h>
#include <sys/debug.h>
#include <sys/sysmacros.h>
#include <sys/dlpi.h>
#include <sys/cred.h>
#include <sys/id_space.h>
#include <sys/list.h>
#include <sys/ctype.h>
#include <sys/policy.h>
#include <sys/sunldi.h>
#include <sys/cred.h>
#include <sys/strsubr.h>
#include <sys/poll.h>
#include <sys/neti.h>
#include <sys/hook.h>
#include <sys/hook_event.h>
#include <sys/vlan.h>
#include <sys/dld.h>
#include <sys/mac_client.h>
#include <sys/netstack.h>
#include <sys/fs/sdev_plugin.h>
#include <sys/kstat.h>
#include <sys/atomic.h>
#include <sys/disp.h>
#include <sys/random.h>
#include <sys/gsqueue.h>
#include <sys/ht.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <sys/vnd.h>
/*
* Globals
*/
static dev_info_t *vnd_dip;
static taskq_t *vnd_taskq;
static kmem_cache_t *vnd_str_cache;
static kmem_cache_t *vnd_dev_cache;
static kmem_cache_t *vnd_pnsd_cache;
static id_space_t *vnd_minors;
static int vnd_list_init = 0;
static sdev_plugin_hdl_t vnd_sdev_hdl;
static gsqueue_set_t *vnd_sqset;
static kmutex_t vnd_dev_lock;
static list_t vnd_dev_list; /* Protected by the vnd_dev_lock */
static list_t vnd_nsd_list; /* Protected by the vnd_dev_lock */
/*
* STREAMs ioctls
*
* The STREAMs ioctls are internal to vnd. No one should be seeing them, as such
* they aren't a part of the header file.
*/
#define VND_STRIOC (('v' << 24) | ('n' << 16) | ('d' << 8) | 0x80)
/*
* Private ioctl to associate a given streams instance with a minor instance of
* the character device.
*/
#define VND_STRIOC_ASSOCIATE (VND_STRIOC | 0x1)
typedef struct vnd_strioc_associate {
minor_t vsa_minor; /* minor device node */
netstackid_t vsa_nsid; /* netstack id */
vnd_errno_t vsa_errno; /* errno */
} vnd_strioc_associate_t;
typedef enum vnd_strioc_state {
VSS_UNKNOWN = 0,
VSS_COPYIN = 1,
VSS_COPYOUT = 2,
} vnd_strioc_state_t;
typedef struct vnd_strioc {
vnd_strioc_state_t vs_state;
caddr_t vs_addr;
} vnd_strioc_t;
/*
* VND SQUEUE TAGS, start at 0x42 so we don't overlap with extent tags. Though
* really, overlap is at the end of the day, inevitable.
*/
#define VND_SQUEUE_TAG_TX_DRAIN 0x42
#define VND_SQUEUE_TAG_MAC_FLOW_CONTROL 0x43
#define VND_SQUEUE_TAG_VND_WRITE 0x44
#define VND_SQUEUE_TAG_ND_FRAMEIO_WRITE 0x45
#define VND_SQUEUE_TAG_STRBARRIER 0x46
/*
* vnd reserved names. These are names which are reserved by vnd and thus
* shouldn't be used by some external program.
*/
static char *vnd_reserved_names[] = {
"ctl",
"zone",
NULL
};
/*
* vnd's DTrace probe macros
*
* DTRACE_VND* are all for a stable provider. We also have an unstable internal
* set of probes for reference count manipulation.
*/
#define DTRACE_VND3(name, type1, arg1, type2, arg2, type3, arg3) \
DTRACE_PROBE3(__vnd_##name, type1, arg1, type2, arg2, type3, arg3);
#define DTRACE_VND4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \
DTRACE_PROBE4(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \
type4, arg4);
#define DTRACE_VND5(name, type1, arg1, type2, arg2, type3, arg3, \
type4, arg4, type5, arg5) \
DTRACE_PROBE5(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \
type4, arg4, type5, arg5);
#define DTRACE_VND_REFINC(vdp) \
DTRACE_PROBE2(vnd__ref__inc, vnd_dev_t *, vdp, int, vdp->vdd_ref);
#define DTRACE_VND_REFDEC(vdp) \
DTRACE_PROBE2(vnd__ref__dec, vnd_dev_t *, vdp, int, vdp->vdd_ref);
/*
* Tunables
*/
size_t vnd_vdq_default_size = 1024 * 64; /* 64 KB */
size_t vnd_vdq_hard_max = 1024 * 1024 * 4; /* 4 MB */
/*
* These numbers are designed as per-device tunables that are applied when a new
* vnd device is attached. They're a rough stab at what may be a reasonable
* amount of work to do in one burst in an squeue.
*/
size_t vnd_flush_burst_size = 1520 * 10; /* 10 1500 MTU packets */
size_t vnd_flush_nburst = 10; /* 10 frames */
/*
* Constants related to our sdev plugins
*/
#define VND_SDEV_NAME "vnd"
#define VND_SDEV_ROOT "/dev/vnd"
#define VND_SDEV_ZROOT "/dev/vnd/zone"
/*
* vnd relies on privileges, not mode bits to limit access. As such, device
* files are read-write to everyone.
*/
#define VND_SDEV_MODE (S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | \
S_IROTH | S_IWOTH)
/*
* Statistic macros
*/
#define VND_STAT_INC(vsp, field, val) \
atomic_add_64(&(vsp)->vns_ksdata.field.value.ui64, val)
#define VND_LATENCY_1MS 1000000
#define VND_LATENCY_10MS 10000000
#define VND_LATENCY_100MS 100000000
#define VND_LATENCY_1S 1000000000
#define VND_LATENCY_10S 10000000000
/*
* Constants for vnd hooks
*/
static uint8_t vnd_bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
#define IPV4_MCAST_LEN 3
static uint8_t vnd_ipv4_mcast[3] = { 0x01, 0x00, 0x5E };
#define IPV6_MCAST_LEN 2
static uint8_t vnd_ipv6_mcast[2] = { 0x33, 0x33 };
/*
* vnd internal data structures and types
*/
struct vnd_str;
struct vnd_dev;
struct vnd_pnsd;
/*
* As part of opening the device stream we need to properly communicate with our
* underlying stream. This is a bit of an asynchronous dance and we need to
* properly work with dld to get everything set up. We have to initiate the
* conversation with dld and as such we keep track of our state here.
*/
typedef enum vnd_str_state {
VNS_S_INITIAL = 0,
VNS_S_INFO_SENT,
VNS_S_EXCLUSIVE_SENT,
VNS_S_ATTACH_SENT,
VNS_S_BIND_SENT,
VNS_S_SAP_PROMISC_SENT,
VNS_S_MULTI_PROMISC_SENT,
VNS_S_RX_ONLY_PROMISC_SENT,
VNS_S_FIXUP_PROMISC_SENT,
VNS_S_CAPAB_Q_SENT,
VNS_S_CAPAB_E_SENT,
VNS_S_ONLINE,
VNS_S_SHUTTING_DOWN,
VNS_S_MULTICAST_PROMISCOFF_SENT,
VNS_S_SAP_PROMISCOFF_SENT,
VNS_S_UNBIND_SENT,
VNS_S_ZOMBIE
} vnd_str_state_t;
typedef enum vnd_str_flags {
VNS_F_NEED_ZONE = 0x1,
VNS_F_TASKQ_DISPATCHED = 0x2,
VNS_F_CONDEMNED = 0x4,
VNS_F_FLOW_CONTROLLED = 0x8,
VNS_F_DRAIN_SCHEDULED = 0x10,
VNS_F_BARRIER = 0x20,
VNS_F_BARRIER_DONE = 0x40
} vnd_str_flags_t;
typedef enum vnd_capab_flags {
VNS_C_HCKSUM = 0x1,
VNS_C_DLD = 0x2,
VNS_C_DIRECT = 0x4,
VNS_C_HCKSUM_BADVERS = 0x8
} vnd_capab_flags_t;
/*
* Definitions to interact with direct callbacks
*/
typedef void (*vnd_rx_t)(struct vnd_str *, mac_resource_t *, mblk_t *,
mac_header_info_t *);
typedef uintptr_t vnd_mac_cookie_t;
/* DLD Direct capability function */
typedef int (*vnd_dld_cap_t)(void *, uint_t, void *, uint_t);
/* DLD Direct tx function */
typedef vnd_mac_cookie_t (*vnd_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t);
/* DLD Direct function to set flow control callback */
typedef void *(*vnd_dld_set_fcb_t)(void *, void (*)(void *, vnd_mac_cookie_t),
void *);
/* DLD Direct function to see if flow controlled still */
typedef int (*vnd_dld_is_fc_t)(void *, vnd_mac_cookie_t);
/*
* The vnd_str_capab_t is always protected by the vnd_str_t it's a member of.
*/
typedef struct vnd_str_capab {
vnd_capab_flags_t vsc_flags;
t_uscalar_t vsc_hcksum_opts;
vnd_dld_cap_t vsc_capab_f;
void *vsc_capab_hdl;
vnd_dld_tx_t vsc_tx_f;
void *vsc_tx_hdl;
vnd_dld_set_fcb_t vsc_set_fcb_f;
void *vsc_set_fcb_hdl;
vnd_dld_is_fc_t vsc_is_fc_f;
void *vsc_is_fc_hdl;
vnd_mac_cookie_t vsc_fc_cookie;
void *vsc_tx_fc_hdl;
} vnd_str_capab_t;
/*
* The vnd_data_queue is a simple construct for storing a series of messages in
* a queue.
*
* See synchronization section of the big theory statement for member
* annotations.
*/
typedef struct vnd_data_queue {
struct vnd_str *vdq_vns; /* E */
kmutex_t vdq_lock;
kcondvar_t vdq_ready; /* Uses vdq_lock */
ssize_t vdq_max; /* L */
ssize_t vdq_cur; /* L */
mblk_t *vdq_head; /* L */
mblk_t *vdq_tail; /* L */
} vnd_data_queue_t;
typedef struct vnd_str_stat {
kstat_named_t vks_rbytes;
kstat_named_t vks_rpackets;
kstat_named_t vks_obytes;
kstat_named_t vks_opackets;
kstat_named_t vks_nhookindrops;
kstat_named_t vks_nhookoutdrops;
kstat_named_t vks_ndlpidrops;
kstat_named_t vks_ndataindrops;
kstat_named_t vks_ndataoutdrops;
kstat_named_t vks_tdrops;
kstat_named_t vks_linkname;
kstat_named_t vks_zonename;
kstat_named_t vks_nmacflow;
kstat_named_t vks_tmacflow;
kstat_named_t vks_mac_flow_1ms;
kstat_named_t vks_mac_flow_10ms;
kstat_named_t vks_mac_flow_100ms;
kstat_named_t vks_mac_flow_1s;
kstat_named_t vks_mac_flow_10s;
} vnd_str_stat_t;
/*
* vnd stream structure
*
* See synchronization section of the big theory statement for member
* annotations.
*/
typedef struct vnd_str {
kmutex_t vns_lock;
kcondvar_t vns_cancelcv; /* Uses vns_lock */
kcondvar_t vns_barriercv; /* Uses vns_lock */
kcondvar_t vns_stcv; /* Uses vns_lock */
vnd_str_state_t vns_state; /* L */
vnd_str_state_t vns_laststate; /* L */
vnd_errno_t vns_errno; /* L */
vnd_str_flags_t vns_flags; /* L */
vnd_str_capab_t vns_caps; /* L */
taskq_ent_t vns_tqe; /* L */
vnd_data_queue_t vns_dq_read; /* E */
vnd_data_queue_t vns_dq_write; /* E */
mblk_t *vns_dlpi_inc; /* L */
queue_t *vns_rq; /* E */
queue_t *vns_wq; /* E */
queue_t *vns_lrq; /* E */
t_uscalar_t vns_dlpi_style; /* L */
t_uscalar_t vns_minwrite; /* L */
t_uscalar_t vns_maxwrite; /* L */
hrtime_t vns_fclatch; /* L */
hrtime_t vns_fcupdate; /* L */
kstat_t *vns_kstat; /* E */
gsqueue_t *vns_squeue; /* E */
mblk_t vns_drainblk; /* E + X */
mblk_t vns_barrierblk; /* E + X */
vnd_str_stat_t vns_ksdata; /* A */
size_t vns_nflush; /* L */
size_t vns_bsize; /* L */
struct vnd_dev *vns_dev; /* E + X */
struct vnd_pnsd *vns_nsd; /* E + X */
} vnd_str_t;
typedef enum vnd_dev_flags {
VND_D_ATTACH_INFLIGHT = 0x001,
VND_D_ATTACHED = 0x002,
VND_D_LINK_INFLIGHT = 0x004,
VND_D_LINKED = 0x008,
VND_D_CONDEMNED = 0x010,
VND_D_ZONE_DYING = 0x020,
VND_D_OPENED = 0x040
} vnd_dev_flags_t;
/*
* This represents the data associated with a minor device instance.
*
* See synchronization section of the big theory statement for member
* annotations.
*/
typedef struct vnd_dev {
kmutex_t vdd_lock;
list_node_t vdd_link; /* GL */
list_node_t vdd_nslink; /* NSL */
int vdd_ref; /* L */
vnd_dev_flags_t vdd_flags; /* L */
minor_t vdd_minor; /* E */
dev_t vdd_devid; /* E */
ldi_ident_t vdd_ldiid; /* E */
ldi_handle_t vdd_ldih; /* X */
cred_t *vdd_cr; /* X */
vnd_str_t *vdd_str; /* L */
struct pollhead vdd_ph; /* E */
struct vnd_pnsd *vdd_nsd; /* E + X */
char vdd_datalink[VND_NAMELEN]; /* L */
char vdd_lname[VND_NAMELEN]; /* L */
} vnd_dev_t;
typedef enum vnd_pnsd_flags {
VND_NS_CONDEMNED = 0x1
} vnd_pnsd_flags_t;
/*
* Per netstack data structure.
*
* See synchronization section of the big theory statement for member
* annotations.
*/
typedef struct vnd_pnsd {
list_node_t vpnd_link; /* protected by global dev lock */
zoneid_t vpnd_zid; /* E */
netstackid_t vpnd_nsid; /* E */
boolean_t vpnd_hooked; /* E */
net_handle_t vpnd_neti_v4; /* E */
hook_family_t vpnd_family_v4; /* E */
hook_event_t vpnd_event_in_v4; /* E */
hook_event_t vpnd_event_out_v4; /* E */
hook_event_token_t vpnd_token_in_v4; /* E */
hook_event_token_t vpnd_token_out_v4; /* E */
net_handle_t vpnd_neti_v6; /* E */
hook_family_t vpnd_family_v6; /* E */
hook_event_t vpnd_event_in_v6; /* E */
hook_event_t vpnd_event_out_v6; /* E */
hook_event_token_t vpnd_token_in_v6; /* E */
hook_event_token_t vpnd_token_out_v6; /* E */
kmutex_t vpnd_lock; /* Protects remaining members */
kcondvar_t vpnd_ref_change; /* Uses vpnd_lock */
int vpnd_ref; /* L */
vnd_pnsd_flags_t vpnd_flags; /* L */
list_t vpnd_dev_list; /* L */
} vnd_pnsd_t;
static void vnd_squeue_tx_drain(void *, mblk_t *, gsqueue_t *, void *);
/*
* Drop function signature.
*/
typedef void (*vnd_dropper_f)(vnd_str_t *, mblk_t *, const char *);
static void
vnd_drop_ctl(vnd_str_t *vsp, mblk_t *mp, const char *reason)
{
DTRACE_VND4(drop__ctl, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
mp, const char *, reason);
if (mp != NULL) {
freemsg(mp);
}
VND_STAT_INC(vsp, vks_ndlpidrops, 1);
VND_STAT_INC(vsp, vks_tdrops, 1);
}
static void
vnd_drop_in(vnd_str_t *vsp, mblk_t *mp, const char *reason)
{
DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
mp, const char *, reason);
if (mp != NULL) {
freemsg(mp);
}
VND_STAT_INC(vsp, vks_ndataindrops, 1);
VND_STAT_INC(vsp, vks_tdrops, 1);
}
static void
vnd_drop_out(vnd_str_t *vsp, mblk_t *mp, const char *reason)
{
DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
mp, const char *, reason);
if (mp != NULL) {
freemsg(mp);
}
VND_STAT_INC(vsp, vks_ndataoutdrops, 1);
VND_STAT_INC(vsp, vks_tdrops, 1);
}
static void
vnd_drop_hook_in(vnd_str_t *vsp, mblk_t *mp, const char *reason)
{
DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
mp, const char *, reason);
if (mp != NULL) {
freemsg(mp);
}
VND_STAT_INC(vsp, vks_nhookindrops, 1);
VND_STAT_INC(vsp, vks_tdrops, 1);
}
static void
vnd_drop_hook_out(vnd_str_t *vsp, mblk_t *mp, const char *reason)
{
DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
mp, const char *, reason);
if (mp != NULL) {
freemsg(mp);
}
VND_STAT_INC(vsp, vks_nhookoutdrops, 1);
VND_STAT_INC(vsp, vks_tdrops, 1);
}
/* ARGSUSED */
static void
vnd_drop_panic(vnd_str_t *vsp, mblk_t *mp, const char *reason)
{
panic("illegal vnd drop");
}
/* ARGSUSED */
static void
vnd_mac_drop_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain,
mac_header_info_t *mhip)
{
mblk_t *mp;
while (mp_chain != NULL) {
mp = mp_chain;
mp_chain = mp->b_next;
vnd_drop_hook_in(vsp, mp, "stream not associated");
}
}
static vnd_pnsd_t *
vnd_nsd_lookup(netstackid_t nsid)
{
vnd_pnsd_t *nsp;
mutex_enter(&vnd_dev_lock);
for (nsp = list_head(&vnd_nsd_list); nsp != NULL;
nsp = list_next(&vnd_nsd_list, nsp)) {
if (nsp->vpnd_nsid == nsid) {
mutex_enter(&nsp->vpnd_lock);
VERIFY(nsp->vpnd_ref >= 0);
nsp->vpnd_ref++;
mutex_exit(&nsp->vpnd_lock);
break;
}
}
mutex_exit(&vnd_dev_lock);
return (nsp);
}
static vnd_pnsd_t *
vnd_nsd_lookup_by_zid(zoneid_t zid)
{
netstack_t *ns;
vnd_pnsd_t *nsp;
ns = netstack_find_by_zoneid(zid);
if (ns == NULL)
return (NULL);
nsp = vnd_nsd_lookup(ns->netstack_stackid);
netstack_rele(ns);
return (nsp);
}
static vnd_pnsd_t *
vnd_nsd_lookup_by_zonename(char *zname)
{
zone_t *zonep;
vnd_pnsd_t *nsp;
zonep = zone_find_by_name(zname);
if (zonep == NULL)
return (NULL);
nsp = vnd_nsd_lookup_by_zid(zonep->zone_id);
zone_rele(zonep);
return (nsp);
}
static void
vnd_nsd_ref(vnd_pnsd_t *nsp)
{
mutex_enter(&nsp->vpnd_lock);
/*
* This can only be used on something that has been obtained through
* some other means. As such, the caller should already have a reference
* before adding another one. This function should not be used as a
* means of creating the initial reference.
*/
VERIFY(nsp->vpnd_ref > 0);
nsp->vpnd_ref++;
mutex_exit(&nsp->vpnd_lock);
cv_broadcast(&nsp->vpnd_ref_change);
}
static void
vnd_nsd_rele(vnd_pnsd_t *nsp)
{
mutex_enter(&nsp->vpnd_lock);
VERIFY(nsp->vpnd_ref > 0);
nsp->vpnd_ref--;
mutex_exit(&nsp->vpnd_lock);
cv_broadcast(&nsp->vpnd_ref_change);
}
static vnd_dev_t *
vnd_dev_lookup(minor_t m)
{
vnd_dev_t *vdp;
mutex_enter(&vnd_dev_lock);
for (vdp = list_head(&vnd_dev_list); vdp != NULL;
vdp = list_next(&vnd_dev_list, vdp)) {
if (vdp->vdd_minor == m) {
mutex_enter(&vdp->vdd_lock);
VERIFY(vdp->vdd_ref > 0);
vdp->vdd_ref++;
DTRACE_VND_REFINC(vdp);
mutex_exit(&vdp->vdd_lock);
break;
}
}
mutex_exit(&vnd_dev_lock);
return (vdp);
}
static void
vnd_dev_free(vnd_dev_t *vdp)
{
/*
* When the STREAM exists we need to go through and make sure
* communication gets torn down. As part of closing the stream, we
* guarantee that nothing else should be able to enter the stream layer
* at this point. That means no one should be able to call
* read(),write() or one of the frameio ioctls.
*/
if (vdp->vdd_flags & VND_D_ATTACHED) {
(void) ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
crfree(vdp->vdd_cr);
vdp->vdd_cr = NULL;
/*
* We have to remove ourselves from our parents list now. It is
		 * really quite important that we have already set the condemned
* flag here so that our containing netstack basically knows
* that we're on the way down and knows not to wait for us. It's
* also important that we do that before we put a rele on the
		 * device, as that is the point at which it will check again.
*/
mutex_enter(&vdp->vdd_nsd->vpnd_lock);
list_remove(&vdp->vdd_nsd->vpnd_dev_list, vdp);
mutex_exit(&vdp->vdd_nsd->vpnd_lock);
vnd_nsd_rele(vdp->vdd_nsd);
vdp->vdd_nsd = NULL;
}
ASSERT(vdp->vdd_flags & VND_D_CONDEMNED);
id_free(vnd_minors, vdp->vdd_minor);
mutex_destroy(&vdp->vdd_lock);
kmem_cache_free(vnd_dev_cache, vdp);
}
static void
vnd_dev_ref(vnd_dev_t *vdp)
{
mutex_enter(&vdp->vdd_lock);
VERIFY(vdp->vdd_ref > 0);
vdp->vdd_ref++;
DTRACE_VND_REFINC(vdp);
mutex_exit(&vdp->vdd_lock);
}
/*
 * As part of releasing the hold on this we may tear down a given vnd_dev_t. As
* such we need to make sure that we grab the list lock first before grabbing
* the vnd_dev_t's lock to ensure proper lock ordering.
*/
static void
vnd_dev_rele(vnd_dev_t *vdp)
{
mutex_enter(&vnd_dev_lock);
mutex_enter(&vdp->vdd_lock);
VERIFY(vdp->vdd_ref > 0);
vdp->vdd_ref--;
DTRACE_VND_REFDEC(vdp);
if (vdp->vdd_ref > 0) {
mutex_exit(&vdp->vdd_lock);
mutex_exit(&vnd_dev_lock);
return;
}
/*
* Now that we've removed this from the list, we can go ahead and
* drop the list lock. No one else can find this device and reference
* it. As its reference count is zero, it by definition does not have
* any remaining entries in /devices that could lead someone back to
* this.
*/
vdp->vdd_flags |= VND_D_CONDEMNED;
list_remove(&vnd_dev_list, vdp);
mutex_exit(&vdp->vdd_lock);
mutex_exit(&vnd_dev_lock);
vnd_dev_free(vdp);
}
/*
 * Insert a message block chain if there's space, otherwise drop it. Return one
 * if data was enqueued so that anyone who was waiting for data can now find
 * it, eg. the caller should consider a broadcast.
*/
static int
vnd_dq_push(vnd_data_queue_t *vqp, mblk_t *mp, boolean_t reserved,
vnd_dropper_f dropf)
{
size_t msize;
ASSERT(MUTEX_HELD(&vqp->vdq_lock));
if (reserved == B_FALSE) {
msize = msgsize(mp);
if (vqp->vdq_cur + msize > vqp->vdq_max) {
dropf(vqp->vdq_vns, mp, "buffer full");
return (0);
}
vqp->vdq_cur += msize;
}
if (vqp->vdq_head == NULL) {
ASSERT(vqp->vdq_tail == NULL);
vqp->vdq_head = mp;
vqp->vdq_tail = mp;
} else {
vqp->vdq_tail->b_next = mp;
vqp->vdq_tail = mp;
}
return (1);
}
/*
 * Remove a message block chain. If the amount of space in the buffer
* has changed we return 1. We have no way of knowing whether or not there is
* enough space overall for a given writer who is blocked, so we always end up
* having to return true and thus tell consumers that they should consider
* signalling.
*/
static int
vnd_dq_pop(vnd_data_queue_t *vqp, mblk_t **mpp)
{
size_t msize;
mblk_t *mp;
ASSERT(MUTEX_HELD(&vqp->vdq_lock));
ASSERT(mpp != NULL);
if (vqp->vdq_head == NULL) {
ASSERT(vqp->vdq_tail == NULL);
*mpp = NULL;
return (0);
}
mp = vqp->vdq_head;
msize = msgsize(mp);
vqp->vdq_cur -= msize;
if (mp->b_next == NULL) {
vqp->vdq_head = NULL;
vqp->vdq_tail = NULL;
/*
* We can't be certain that this is always going to be zero.
* Someone may have basically taken a reservation of space on
		 * the data queue, eg. claimed space but not yet pushed it onto
		 * the queue.
*/
ASSERT(vqp->vdq_cur >= 0);
} else {
vqp->vdq_head = mp->b_next;
ASSERT(vqp->vdq_cur > 0);
}
mp->b_next = NULL;
*mpp = mp;
return (1);
}
/*
* Reserve space in the queue. This will bump up the size of the queue and
* entitle the user to push something on later without bumping the space.
*/
static int
vnd_dq_reserve(vnd_data_queue_t *vqp, ssize_t size)
{
ASSERT(MUTEX_HELD(&vqp->vdq_lock));
ASSERT(size >= 0);
if (size == 0)
return (0);
if (size + vqp->vdq_cur > vqp->vdq_max)
return (0);
vqp->vdq_cur += size;
return (1);
}
static void
vnd_dq_unreserve(vnd_data_queue_t *vqp, ssize_t size)
{
ASSERT(MUTEX_HELD(&vqp->vdq_lock));
ASSERT(size > 0);
ASSERT(size <= vqp->vdq_cur);
vqp->vdq_cur -= size;
}
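/*
 * The reserve interfaces above pair with the reserved argument to
 * vnd_dq_push(). A caller that needs to guarantee space before it has the
 * final mblk_t in hand, which is what the write path does before it enters the
 * gsqueue, follows a pattern along these lines (illustrative only):
 *
 *     mutex_enter(&vqp->vdq_lock);
 *     if (vnd_dq_reserve(vqp, msize) == 0) {
 *         mutex_exit(&vqp->vdq_lock);
 *         return (EAGAIN);
 *     }
 *     mutex_exit(&vqp->vdq_lock);
 *
 *     later, typically from gsqueue context:
 *
 *     mutex_enter(&vqp->vdq_lock);
 *     (void) vnd_dq_push(vqp, mp, B_TRUE, vnd_drop_out);
 *     mutex_exit(&vqp->vdq_lock);
 *
 * If the data never materializes, the reservation must be handed back with
 * vnd_dq_unreserve() so that the queue's space accounting stays balanced.
 */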
static void
vnd_dq_flush(vnd_data_queue_t *vqp, vnd_dropper_f dropf)
{
mblk_t *mp, *next;
mutex_enter(&vqp->vdq_lock);
for (mp = vqp->vdq_head; mp != NULL; mp = next) {
next = mp->b_next;
mp->b_next = NULL;
dropf(vqp->vdq_vns, mp, "vnd_dq_flush");
}
vqp->vdq_cur = 0;
vqp->vdq_head = NULL;
vqp->vdq_tail = NULL;
mutex_exit(&vqp->vdq_lock);
}
static boolean_t
vnd_dq_is_empty(vnd_data_queue_t *vqp)
{
boolean_t ret;
mutex_enter(&vqp->vdq_lock);
if (vqp->vdq_head == NULL)
ret = B_TRUE;
else
ret = B_FALSE;
mutex_exit(&vqp->vdq_lock);
return (ret);
}
/*
* Get a network uint16_t from the message and translate it into something the
* host understands.
*/
static int
vnd_mbc_getu16(mblk_t *mp, off_t off, uint16_t *out)
{
size_t mpsize;
uint8_t *bp;
mpsize = msgsize(mp);
/* Check for overflow */
if (off + sizeof (uint16_t) > mpsize)
return (1);
mpsize = MBLKL(mp);
while (off >= mpsize) {
mp = mp->b_cont;
off -= mpsize;
mpsize = MBLKL(mp);
}
/*
* Data is in network order. Note the second byte of data might be in
* the next mp.
*/
bp = mp->b_rptr + off;
*out = *bp << 8;
if (off + 1 == mpsize) {
mp = mp->b_cont;
bp = mp->b_rptr;
} else {
bp++;
}
*out |= *bp;
return (0);
}
/*
* Given an mblk chain find the mblk and address of a particular offset.
*/
static int
vnd_mbc_getoffset(mblk_t *mp, off_t off, mblk_t **mpp, uintptr_t *offp)
{
size_t mpsize;
if (off >= msgsize(mp))
return (1);
mpsize = MBLKL(mp);
while (off >= mpsize) {
mp = mp->b_cont;
off -= mpsize;
mpsize = MBLKL(mp);
}
*mpp = mp;
*offp = (uintptr_t)mp->b_rptr + off;
return (0);
}
/*
 * Fetch the destination mac address. Set *dstpp to point at that mac address.
 * If the data is not contiguous in the first mblk_t, copy it into datap and
 * set *dstpp to datap instead.
*/
static int
vnd_mbc_getdstmac(mblk_t *mp, uint8_t **dstpp, uint8_t *datap)
{
int i;
if (MBLKL(mp) >= ETHERADDRL) {
*dstpp = mp->b_rptr;
return (0);
}
*dstpp = datap;
for (i = 0; i < ETHERADDRL; i += 2, datap += 2) {
if (vnd_mbc_getu16(mp, i, (uint16_t *)datap) != 0)
return (1);
}
return (0);
}
static int
vnd_hook(vnd_str_t *vsp, mblk_t **mpp, net_handle_t netiv4, hook_event_t hev4,
hook_event_token_t hetv4, net_handle_t netiv6, hook_event_t hev6,
hook_event_token_t hetv6, vnd_dropper_f hdrop, vnd_dropper_f ddrop)
{
uint16_t etype;
hook_pkt_event_t info;
size_t offset, mblen;
uint8_t *dstp;
uint8_t dstaddr[6];
hook_event_t he;
hook_event_token_t het;
net_handle_t neti;
/*
* Before we can ask if we're interested we have to do enough work to
* determine the ethertype.
*/
/* Bytes 12 and 13 are either the VLAN TPID or the ethertype */
if (vnd_mbc_getu16(*mpp, 12, &etype) != 0) {
ddrop(vsp, *mpp, "packet has incomplete ethernet header");
*mpp = NULL;
return (1);
}
if (etype == ETHERTYPE_VLAN) {
/* Actual ethertype is another four bytes in */
if (vnd_mbc_getu16(*mpp, 16, &etype) != 0) {
ddrop(vsp, *mpp,
"packet has incomplete ethernet vlan header");
*mpp = NULL;
return (1);
}
offset = sizeof (struct ether_vlan_header);
} else {
offset = sizeof (struct ether_header);
}
/*
 * At the moment we only hook on the kinds of traffic that the IP module
 * normally would, namely IPv4 and IPv6.
*/
if (etype != ETHERTYPE_IP && etype != ETHERTYPE_IPV6)
return (0);
if (etype == ETHERTYPE_IP) {
neti = netiv4;
he = hev4;
het = hetv4;
} else {
neti = netiv6;
he = hev6;
het = hetv6;
}
if (!he.he_interested)
return (0);
if (vnd_mbc_getdstmac(*mpp, &dstp, dstaddr) != 0) {
ddrop(vsp, *mpp, "packet has incomplete ethernet header");
*mpp = NULL;
return (1);
}
/*
* Now that we know we're interested, we have to do some additional
* sanity checking for IPF's sake, ala ip_check_length(). Specifically
* we need to check to make sure that the remaining packet size,
* excluding MAC, is at least the size of an IP header.
*/
mblen = msgsize(*mpp);
if ((etype == ETHERTYPE_IP &&
mblen - offset < IP_SIMPLE_HDR_LENGTH) ||
(etype == ETHERTYPE_IPV6 && mblen - offset < IPV6_HDR_LEN)) {
ddrop(vsp, *mpp, "packet has invalid IP header");
*mpp = NULL;
return (1);
}
info.hpe_protocol = neti;
info.hpe_ifp = (phy_if_t)vsp;
info.hpe_ofp = (phy_if_t)vsp;
info.hpe_mp = mpp;
info.hpe_flags = 0;
if (bcmp(vnd_bcast_addr, dstp, ETHERADDRL) == 0)
info.hpe_flags |= HPE_BROADCAST;
else if (etype == ETHERTYPE_IP &&
bcmp(vnd_ipv4_mcast, dstp, IPV4_MCAST_LEN) == 0)
info.hpe_flags |= HPE_MULTICAST;
else if (etype == ETHERTYPE_IPV6 &&
bcmp(vnd_ipv6_mcast, dstp, IPV6_MCAST_LEN) == 0)
info.hpe_flags |= HPE_MULTICAST;
if (vnd_mbc_getoffset(*mpp, offset, &info.hpe_mb,
(uintptr_t *)&info.hpe_hdr) != 0) {
ddrop(vsp, *mpp, "packet too small -- "
"unable to find payload");
*mpp = NULL;
return (1);
}
if (hook_run(neti->netd_hooks, het, (hook_data_t)&info) != 0) {
hdrop(vsp, *mpp, "drooped by hooks");
return (1);
}
return (0);
}
/*
 * Allocate and initialize a DLPI request message. This should not be used for
 * DL_INFO_REQ, which is built by hand in vnd_st_sinfo().
*/
static mblk_t *
vnd_dlpi_alloc(size_t len, t_uscalar_t prim)
{
mblk_t *mp;
mp = allocb(len, BPRI_MED);
if (mp == NULL)
return (NULL);
mp->b_datap->db_type = M_PROTO;
mp->b_wptr = mp->b_rptr + len;
bzero(mp->b_rptr, len);
((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim;
return (mp);
}
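/*
 * Append a DLPI control message to the tail of the stream's inbound DLPI
 * list, which is singly linked through b_next and protected by vns_lock.
 */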
static void
vnd_dlpi_inc_push(vnd_str_t *vsp, mblk_t *mp)
{
mblk_t **mpp;
VERIFY(MUTEX_HELD(&vsp->vns_lock));
ASSERT(mp->b_next == NULL);
mpp = &vsp->vns_dlpi_inc;
while (*mpp != NULL)
mpp = &((*mpp)->b_next);
*mpp = mp;
}
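/*
 * Remove and return the message at the head of the inbound DLPI list, or NULL
 * if the list is empty.
 */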
static mblk_t *
vnd_dlpi_inc_pop(vnd_str_t *vsp)
{
mblk_t *mp;
VERIFY(MUTEX_HELD(&vsp->vns_lock));
mp = vsp->vns_dlpi_inc;
if (mp != NULL) {
VERIFY(mp->b_next == NULL || mp->b_next != mp);
vsp->vns_dlpi_inc = mp->b_next;
mp->b_next = NULL;
}
return (mp);
}
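/*
 * Send a DL_INFO_REQ down the stream and move to the INFO_SENT state. We
 * build this message by hand rather than with vnd_dlpi_alloc() because it
 * must be an M_PCPROTO and we size the buffer for the larger of the request
 * and the ack.
 */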
static int
vnd_st_sinfo(vnd_str_t *vsp)
{
mblk_t *mp;
dl_info_req_t *dlir;
VERIFY(MUTEX_HELD(&vsp->vns_lock));
mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
BPRI_HI);
if (mp == NULL) {
vsp->vns_errno = VND_E_NOMEM;
return (1);
}
vsp->vns_state = VNS_S_INFO_SENT;
cv_broadcast(&vsp->vns_stcv);
mp->b_datap->db_type = M_PCPROTO;
dlir = (dl_info_req_t *)mp->b_rptr;
mp->b_wptr = (uchar_t *)&dlir[1];
dlir->dl_primitive = DL_INFO_REQ;
putnext(vsp->vns_wq, mp);
return (0);
}
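/*
 * Process the DL_INFO_ACK: record the provider style and the SDU limits, and
 * make sure that we're actually sitting on top of an Ethernet device.
 */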
static int
vnd_st_info(vnd_str_t *vsp)
{
dl_info_ack_t *dlia;
mblk_t *mp;
VERIFY(MUTEX_HELD(&vsp->vns_lock));
mp = vnd_dlpi_inc_pop(vsp);
dlia = (dl_info_ack_t *)mp->b_rptr;
vsp->vns_dlpi_style = dlia->dl_provider_style;
vsp->vns_minwrite = dlia->dl_min_sdu;
vsp->vns_maxwrite = dlia->dl_max_sdu;
/*
* At this time we only support DL_ETHER devices.
*/
if (dlia->dl_mac_type != DL_ETHER) {
freemsg(mp);
vsp->vns_errno = VND_E_NOTETHER;
return (1);
}
/*
* Because vnd operates on entire packets, we need to manually account
* for the ethernet header information. We add the size of the
* ether_vlan_header to account for this, regardless if it is using
* vlans or not.
*/
vsp->vns_maxwrite += sizeof (struct ether_vlan_header);
freemsg(mp);
return (0);
}
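/*
 * Request exclusive access to the underlying datalink by sending a
 * DL_EXCLUSIVE_REQ down the stream.
 */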
static int
vnd_st_sexclusive(vnd_str_t *vsp)
{
mblk_t *mp;
VERIFY(MUTEX_HELD(&vsp->vns_lock));
mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_EXCLUSIVE_REQ);
if (mp == NULL) {
vsp->vns_errno = VND_E_NOMEM;
return (1);
}
vsp->vns_state = VNS_S_EXCLUSIVE_SENT;
cv_broadcast(&vsp->vns_stcv);
putnext(vsp->vns_wq, mp);
return (0);
}
static int
vnd_st_exclusive(vnd_str_t *vsp)
{
mblk_t *mp;
t_uscalar_t prim, cprim;
VERIFY(MUTEX_HELD(&vsp->vns_lock));
mp = vnd_dlpi_inc_pop(vsp);
prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive;
cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
vnd_drop_ctl(vsp, mp,
"wrong dlpi primitive for vnd_st_exclusive");
vsp->vns_errno = VND_E_DLPIINVAL;
return (1);
}
if (cprim != DL_EXCLUSIVE_REQ) {
vnd_drop_ctl(vsp, mp,
"vnd_st_exclusive: got ack/nack for wrong primitive");
vsp->vns_errno = VND_E_DLPIINVAL;
return (1);
}
if (prim == DL_ERROR_ACK)
vsp->vns_errno = VND_E_DLEXCL;
freemsg(mp);
return (prim == DL_ERROR_ACK);
}
/*
 * Send down a DL_ATTACH_REQ.
*/
static int
vnd_st_sattach(vnd_str_t *vsp)
{
mblk_t *mp;
VERIFY(MUTEX_HELD(&vsp->vns_lock));
mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_ATTACH_REQ);
if (mp == NULL) {
vsp->vns_errno = VND_E_NOMEM;
return (1);
}
((dl_attach_req_t *)mp->b_rptr)->dl_ppa = 0;
vsp->vns_state = VNS_S_ATTACH_SENT;
cv_broadcast(&vsp->vns_stcv);
putnext(vsp->vns_wq, mp);
return (0);
}
static int
vnd_st_attach(vnd_str_t *vsp)
{
mblk_t *mp;
t_uscalar_t prim, cprim;
VERIFY(MUTEX_HELD(&vsp->vns_lock));
mp = vnd_dlpi_inc_pop(vsp);
prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive;
cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
vnd_drop_ctl(vsp, mp, "vnd_st_attach: unknown primitive type");
vsp->vns_errno = VND_E_DLPIINVAL;
return (1);
}
if (cprim != DL_ATTACH_REQ) {
vnd_drop_ctl(vsp, mp,
"vnd_st_attach: Got ack/nack for wrong primitive");
vsp->vns_errno = VND_E_DLPIINVAL;
return (1);
}
if (prim == DL_ERROR_ACK)
vsp->vns_errno = VND_E_ATTACHFAIL;
freemsg(mp);
return (prim == DL_ERROR_ACK);
}
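/*
 * Send a DL_BIND_REQ that binds to SAP 0 with the connectionless (DL_CLDLS)
 * service mode.
 */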
static int
vnd_st_sbind(vnd_str_t *vsp)
{
mblk_t *mp;
dl_bind_req_t *dbrp;
VERIFY(MUTEX_HELD(&vsp->vns_lock));
mp = vnd_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long),
DL_BIND_REQ);
if (mp == NULL) {
vsp->vns_errno = VND_E_NOMEM;
return (1);
}
dbrp = (dl_bind_req_t *)(mp->b_rptr);
dbrp->dl_sap = 0;
dbrp->dl_service_mode = DL_CLDLS;
vsp->vns_state = VNS_S_BIND_SENT;
cv_broadcast(&vsp->vns_stcv);
putnext(vsp->vns_wq, mp);
return (0);
}
static int
vnd_st_bind(vnd_str_t *vsp)
{
mblk_t *mp;
t_uscalar_t prim;
VERIFY(MUTEX_HELD(&vsp->vns_lock));
mp = vnd_dlpi_inc_pop(vsp);
prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive;
if (prim != DL_BIND_ACK && prim != DL_ERROR_ACK) {
vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_bind");
vsp->vns_errno = VND_E_DLPIINVAL;
return (1);
}
if (prim == DL_ERROR_ACK)
vsp->vns_errno = VND_E_BINDFAIL;
freemsg(mp);
return (prim == DL_ERROR_ACK);
}
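/*
 * Send a DL_PROMISCON_REQ at the requested level and move to the
 * corresponding *_PROMISC_SENT state.
 */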
static int
vnd_st_spromisc(vnd_str_t *vsp, int type, vnd_str_state_t next)
{
mblk_t *mp;
dl_promiscon_req_t *dprp;
VERIFY(MUTEX_HELD(&vsp->vns_lock));
mp = vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCON_REQ);
if (mp == NULL) {
vsp->vns_errno = VND_E_NOMEM;
return (1);
}
dprp = (dl_promiscon_req_t *)mp->b_rptr;
dprp->dl_level = type;
vsp->vns_state = next;
cv_broadcast(&vsp->vns_stcv);
putnext(vsp->vns_wq, mp);
return (0);
}
static int
vnd_st_promisc(vnd_str_t *vsp)
{
mblk_t *mp;
t_uscalar_t prim, cprim;
VERIFY(MUTEX_HELD(&vsp->vns_lock));
mp = vnd_dlpi_inc_pop(vsp);
prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive;
cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
vnd_drop_ctl(vsp, mp,
"wrong dlpi primitive for vnd_st_promisc");
vsp->vns_errno = VND_E_DLPIINVAL;
return (1);
}
if (cprim != DL_PROMISCON_REQ) {
vnd_drop_ctl(vsp, mp,
"vnd_st_promisc: Got ack/nack for wrong primitive");
vsp->vns_errno = VND_E_DLPIINVAL;
return (1);
}
if (prim == DL_ERROR_ACK)
vsp->vns_errno = VND_E_PROMISCFAIL;
freemsg(mp);
return (prim == DL_ERROR_ACK);
}
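/*
 * Kick off a capability query by sending a DL_CAPABILITY_REQ down the stream.
 */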
static int
vnd_st_scapabq(vnd_str_t *vsp)
{
mblk_t *mp;
VERIFY(MUTEX_HELD(&vsp->vns_lock));
mp = vnd_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
if (mp == NULL) {
vsp->vns_errno = VND_E_NOMEM;
return (1);
}
vsp->vns_state = VNS_S_CAPAB_Q_SENT;
cv_broadcast(&vsp->vns_stcv);
putnext(vsp->vns_wq, mp);
return (0);
}
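/*
 * This is the receive function that dld calls directly when the fastpath is
 * enabled. For each message in the chain we undo dld's adjustment of b_rptr,
 * strip any VLAN tag that mac has left in place, run the packet through the
 * inbound hooks if they're enabled, and then push it onto the read queue,
 * waking up readers and pollers if we enqueued anything.
 */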
/* ARGSUSED */
static void
vnd_mac_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain,
mac_header_info_t *mhip)
{
int signal = 0;
mblk_t *mp;
vnd_pnsd_t *nsp = vsp->vns_nsd;
ASSERT(vsp != NULL);
ASSERT(mp_chain != NULL);
for (mp = mp_chain; mp != NULL; mp = mp_chain) {
uint16_t vid;
mp_chain = mp->b_next;
mp->b_next = NULL;
/*
* If we were operating in a traditional dlpi context then we
* would have enabled DLIOCRAW and rather than the fast path, we
* would come through dld_str_rx_raw. That function does two
* things that we have to consider doing ourselves. The first is
* that it adjusts the b_rptr back to account for dld bumping us
* past the mac header. It also tries to account for cases where
* mac provides an illusion of the mac header. Fortunately, dld
* only allows the fastpath when the media type is the same as
* the native type. Therefore all we have to do here is adjust
* the b_rptr.
*/
ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
mp->b_rptr -= mhip->mhi_hdrsize;
vid = VLAN_ID(mhip->mhi_tci);
if (mhip->mhi_istagged && vid != VLAN_ID_NONE) {
/*
* This is an overlapping copy. Do not use bcopy(9F).
*/
(void) memmove(mp->b_rptr + 4, mp->b_rptr, 12);
mp->b_rptr += 4;
}
if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4,
nsp->vpnd_event_in_v4, nsp->vpnd_token_in_v4,
nsp->vpnd_neti_v6, nsp->vpnd_event_in_v6,
nsp->vpnd_token_in_v6, vnd_drop_hook_in, vnd_drop_in) != 0)
continue;
VND_STAT_INC(vsp, vks_rpackets, 1);
VND_STAT_INC(vsp, vks_rbytes, msgsize(mp));
DTRACE_VND5(recv, mblk_t *, mp, void *, NULL, void *, NULL,
vnd_str_t *, vsp, mblk_t *, mp);
mutex_enter(&vsp->vns_dq_read.vdq_lock);
signal |= vnd_dq_push(&vsp->vns_dq_read, mp, B_FALSE,
vnd_drop_in);
mutex_exit(&vsp->vns_dq_read.vdq_lock);
}
if (signal != 0) {
cv_broadcast(&vsp->vns_dq_read.vdq_ready);
pollwakeup(&vsp->vns_dev->vdd_ph, POLLIN | POLLRDNORM);
}
}
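/*
 * Update the flow control kstats, bucketing the time we spent blocked into
 * the various latency ranges.
 */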
static void
vnd_mac_flow_control_stat(vnd_str_t *vsp, hrtime_t diff)
{
VND_STAT_INC(vsp, vks_nmacflow, 1);
VND_STAT_INC(vsp, vks_tmacflow, diff);
if (diff >= VND_LATENCY_1MS)
VND_STAT_INC(vsp, vks_mac_flow_1ms, 1);
if (diff >= VND_LATENCY_10MS)
VND_STAT_INC(vsp, vks_mac_flow_10ms, 1);
if (diff >= VND_LATENCY_100MS)
VND_STAT_INC(vsp, vks_mac_flow_100ms, 1);
if (diff >= VND_LATENCY_1S)
VND_STAT_INC(vsp, vks_mac_flow_1s, 1);
if (diff >= VND_LATENCY_10S)
VND_STAT_INC(vsp, vks_mac_flow_10s, 1);
}
/*
* This is a callback from MAC that indicates that we are allowed to send
* packets again.
*/
static void
vnd_mac_flow_control(void *arg, vnd_mac_cookie_t cookie)
{
vnd_str_t *vsp = arg;
hrtime_t now;
mutex_enter(&vsp->vns_lock);
now = gethrtime();
/*
* Check for the case that we beat vnd_squeue_tx_one to the punch.
* There's also an additional case here that we got notified because
* we're sharing a device that ran out of tx descriptors, even though it
* wasn't because of us.
*/
if (!(vsp->vns_flags & VNS_F_FLOW_CONTROLLED)) {
vsp->vns_fcupdate = now;
mutex_exit(&vsp->vns_lock);
return;
}
ASSERT(vsp->vns_flags & VNS_F_FLOW_CONTROLLED);
ASSERT(vsp->vns_caps.vsc_fc_cookie == cookie);
vsp->vns_flags &= ~VNS_F_FLOW_CONTROLLED;
vsp->vns_caps.vsc_fc_cookie = NULL;
vsp->vns_fclatch = 0;
DTRACE_VND3(flow__resumed, vnd_str_t *, vsp, uint64_t,
vsp->vns_dq_write.vdq_cur, uintptr_t, cookie);
/*
 * If someone has asked to flush the squeue and thus inserted a barrier,
 * then we shouldn't schedule a drain.
*/
if (!(vsp->vns_flags & (VNS_F_DRAIN_SCHEDULED | VNS_F_BARRIER))) {
vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED;
gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_drainblk,
vnd_squeue_tx_drain, vsp, GSQUEUE_FILL,
VND_SQUEUE_TAG_MAC_FLOW_CONTROL);
}
mutex_exit(&vsp->vns_lock);
}
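/*
 * Enter and exit the mac perimeter for our datalink via the DLD_CAPAB_PERIM
 * capability. These bracket the capability manipulations below.
 */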
static void
vnd_mac_enter(vnd_str_t *vsp, mac_perim_handle_t *mphp)
{
ASSERT(MUTEX_HELD(&vsp->vns_lock));
VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl,
DLD_CAPAB_PERIM, mphp, DLD_ENABLE) == 0);
}
static void
vnd_mac_exit(vnd_str_t *vsp, mac_perim_handle_t mph)
{
ASSERT(MUTEX_HELD(&vsp->vns_lock));
VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl,
DLD_CAPAB_PERIM, mph, DLD_DISABLE) == 0);
}
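/*
 * Enable the DLD direct capability. We register rxfunc as our direct receive
 * function and save the transmit, flow control callback, and flow control
 * query entry points that dld hands back to us.
 */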
static int
vnd_dld_cap_enable(vnd_str_t *vsp, vnd_rx_t rxfunc)
{
int ret;
dld_capab_direct_t d;
mac_perim_handle_t mph;
vnd_str_capab_t *c = &vsp->vns_caps;
bzero(&d, sizeof (d));
d.di_rx_cf = (uintptr_t)rxfunc;
d.di_rx_ch = vsp;
d.di_flags = DI_DIRECT_RAW;
vnd_mac_enter(vsp, &mph);
/*
* If we're coming in here for a second pass, we need to make sure that
* we remove an existing flow control notification callback, otherwise
* we'll create a duplicate that will remain with garbage data.
*/
if (c->vsc_tx_fc_hdl != NULL) {
ASSERT(c->vsc_set_fcb_hdl != NULL);
(void) c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, NULL,
c->vsc_tx_fc_hdl);
c->vsc_tx_fc_hdl = NULL;
}
if (vsp->vns_caps.vsc_capab_f(c->vsc_capab_hdl,
DLD_CAPAB_DIRECT, &d, DLD_ENABLE) == 0) {
c->vsc_tx_f = (vnd_dld_tx_t)d.di_tx_df;
c->vsc_tx_hdl = d.di_tx_dh;
c->vsc_set_fcb_f = (vnd_dld_set_fcb_t)d.di_tx_cb_df;
c->vsc_set_fcb_hdl = d.di_tx_cb_dh;
c->vsc_is_fc_f = (vnd_dld_is_fc_t)d.di_tx_fctl_df;
c->vsc_is_fc_hdl = d.di_tx_fctl_dh;
c->vsc_tx_fc_hdl = c->vsc_set_fcb_f(c->vsc_set_fcb_hdl,
vnd_mac_flow_control, vsp);
c->vsc_flags |= VNS_C_DIRECT;
ret = 0;
} else {
vsp->vns_errno = VND_E_DIRECTFAIL;
ret = 1;
}
vnd_mac_exit(vsp, mph);
return (ret);
}
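/*
 * Walk the subcapabilities in the DL_CAPABILITY_ACK. We note hardware
 * checksum support if it's offered and, more importantly, require the
 * DLD_CAPAB_DLD capability so that we can enable direct function calls;
 * without it we fail the whole attach.
 */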
static int
vnd_st_capabq(vnd_str_t *vsp)
{
mblk_t *mp;
dl_capability_ack_t *cap;
dl_capability_sub_t *subp;
dl_capab_hcksum_t *hck;
dl_capab_dld_t *dld;
unsigned char *rp;
int ret = 0;
VERIFY(MUTEX_HELD(&vsp->vns_lock));
mp = vnd_dlpi_inc_pop(vsp);
rp = mp->b_rptr;
cap = (dl_capability_ack_t *)rp;
if (cap->dl_sub_length == 0)
goto done;
/* Don't try to process something too big */
if (sizeof (dl_capability_ack_t) + cap->dl_sub_length > MBLKL(mp)) {
VND_STAT_INC(vsp, vks_ndlpidrops, 1);
VND_STAT_INC(vsp, vks_tdrops, 1);
vsp->vns_errno = VND_E_CAPACKINVAL;
ret = 1;
goto done;
}
rp += cap->dl_sub_offset;
while (cap->dl_sub_length > 0) {
subp = (dl_capability_sub_t *)rp;
/* Sanity check something crazy from down below */
if (subp->dl_length + sizeof (dl_capability_sub_t) >
cap->dl_sub_length) {
VND_STAT_INC(vsp, vks_ndlpidrops, 1);
VND_STAT_INC(vsp, vks_tdrops, 1);
vsp->vns_errno = VND_E_SUBCAPINVAL;
ret = 1;
goto done;
}
switch (subp->dl_cap) {
case DL_CAPAB_HCKSUM:
hck = (dl_capab_hcksum_t *)(rp +
sizeof (dl_capability_sub_t));
if (hck->hcksum_version != HCKSUM_CURRENT_VERSION) {
vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM_BADVERS;
break;
}
if (dlcapabcheckqid(&hck->hcksum_mid, vsp->vns_lrq) !=
B_TRUE) {
vsp->vns_errno = VND_E_CAPABPASS;
ret = 1;
goto done;
}
vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM;
vsp->vns_caps.vsc_hcksum_opts = hck->hcksum_txflags;
break;
case DL_CAPAB_DLD:
dld = (dl_capab_dld_t *)(rp +
sizeof (dl_capability_sub_t));
if (dld->dld_version != DLD_CURRENT_VERSION) {
vsp->vns_errno = VND_E_DLDBADVERS;
ret = 1;
goto done;
}
if (dlcapabcheckqid(&dld->dld_mid, vsp->vns_lrq) !=
B_TRUE) {
vsp->vns_errno = VND_E_CAPABPASS;
ret = 1;
goto done;
}
vsp->vns_caps.vsc_flags |= VNS_C_DLD;
vsp->vns_caps.vsc_capab_f =
(vnd_dld_cap_t)dld->dld_capab;
vsp->vns_caps.vsc_capab_hdl =
(void *)dld->dld_capab_handle;
/*
 * At this point in time, we have to set up a direct
 * function that drops all input. This validates that
 * we'll be able to set up direct input and lets us
 * easily switch over to the real data function later,
 * once we've plumbed everything up.
*/
if (vnd_dld_cap_enable(vsp, vnd_mac_drop_input) != 0) {
/* vns_errno set by vnd_dld_cap_enable */
ret = 1;
goto done;
}
break;
default:
/* Ignore unsupported cap */
break;
}
rp += sizeof (dl_capability_sub_t) + subp->dl_length;
cap->dl_sub_length -= sizeof (dl_capability_sub_t) +
subp->dl_length;
}
done:
/* Make sure we enabled direct callbacks */
if (ret == 0 && !(vsp->vns_caps.vsc_flags & VNS_C_DIRECT)) {
vsp->vns_errno = VND_E_DIRECTNOTSUP;
ret = 1;
}
freemsg(mp);
return (ret);
}
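/*
 * We have everything that we need from DLPI and dld; mark the stream online
 * and wake up anyone waiting on the state change.
 */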
static void
vnd_st_sonline(vnd_str_t *vsp)
{
VERIFY(MUTEX_HELD(&vsp->vns_lock));
vsp->vns_state = VNS_S_ONLINE;
cv_broadcast(&vsp->vns_stcv);
}
static void
vnd_st_shutdown(vnd_str_t *vsp)
{
mac_perim_handle_t mph;
vnd_str_capab_t *vsc = &vsp->vns_caps;
VERIFY(MUTEX_HELD(&vsp->vns_lock));
/*
* At this point in time we know that there is no one transmitting as
* our final reference has been torn down and that vnd_s_close inserted
* a barrier to validate that everything is flushed.
*/
if (vsc->vsc_flags & VNS_C_DIRECT) {
vnd_mac_enter(vsp, &mph);
vsc->vsc_flags &= ~VNS_C_DIRECT;
(void) vsc->vsc_set_fcb_f(vsc->vsc_set_fcb_hdl, NULL,
vsc->vsc_tx_fc_hdl);
vsc->vsc_tx_fc_hdl = NULL;
(void) vsc->vsc_capab_f(vsc->vsc_capab_hdl, DLD_CAPAB_DIRECT,
NULL, DLD_DISABLE);
vnd_mac_exit(vsp, mph);
}
}
static boolean_t
vnd_st_spromiscoff(vnd_str_t *vsp, int type, vnd_str_state_t next)
{
boolean_t ret = B_TRUE;
mblk_t *mp;
dl_promiscoff_req_t *dprp;
VERIFY(MUTEX_HELD(&vsp->vns_lock));
mp = vnd_dlpi_alloc(sizeof (dl_promiscoff_req_t), DL_PROMISCOFF_REQ);
if (mp == NULL) {
cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for "
"promiscoff request");
ret = B_FALSE;
goto next;
}
dprp = (dl_promiscoff_req_t *)mp->b_rptr;
dprp->dl_level = type;
putnext(vsp->vns_wq, mp);
next:
vsp->vns_state = next;
cv_broadcast(&vsp->vns_stcv);
return (ret);
}
static void
vnd_st_promiscoff(vnd_str_t *vsp)
{
mblk_t *mp;
t_uscalar_t prim, cprim;
VERIFY(MUTEX_HELD(&vsp->vns_lock));
/*
 * Unlike in other states, here we do guard against the incoming message
 * being NULL. During tear down we try to keep driving forward, so we
 * may have gotten here due to an earlier failure, in which case there
 * is nothing to do.
*/
mp = vnd_dlpi_inc_pop(vsp);
if (mp == NULL)
return;
prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive;
cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
vnd_drop_ctl(vsp, mp,
"wrong dlpi primitive for vnd_st_promiscoff");
return;
}
if (cprim != DL_PROMISCOFF_REQ) {
vnd_drop_ctl(vsp, mp,
"vnd_st_promiscoff: Got ack/nack for wrong primitive");
return;
}
if (prim == DL_ERROR_ACK) {
cmn_err(CE_WARN, "!failed to disable promiscuos mode during "
"vnd teardown");
}
}
static boolean_t
vnd_st_sunbind(vnd_str_t *vsp)
{
mblk_t *mp;
boolean_t ret = B_TRUE;
mp = vnd_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ);
if (mp == NULL) {
cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for "
"unbind request");
ret = B_FALSE;
goto next;
}
putnext(vsp->vns_wq, mp);
next:
vsp->vns_state = VNS_S_UNBIND_SENT;
cv_broadcast(&vsp->vns_stcv);
return (ret);
}
static void
vnd_st_unbind(vnd_str_t *vsp)
{
mblk_t *mp;
t_uscalar_t prim, cprim;
/*
 * Unlike in other states, here we do guard against the incoming message
 * being NULL. During tear down we try to keep driving forward, so we
 * may have gotten here due to an earlier failure, in which case there
 * is nothing to do.
*/
mp = vnd_dlpi_inc_pop(vsp);
if (mp == NULL)
goto next;
prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive;
cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
vnd_drop_ctl(vsp, mp,
"wrong dlpi primitive for vnd_st_unbind");
goto next;
}
if (cprim != DL_UNBIND_REQ) {
vnd_drop_ctl(vsp, mp,
"vnd_st_unbind: Got ack/nack for wrong primitive");
goto next;
}
if (prim == DL_ERROR_ACK) {
cmn_err(CE_WARN, "!failed to unbind stream during vnd "
"teardown");
}
next:
vsp->vns_state = VNS_S_ZOMBIE;
cv_broadcast(&vsp->vns_stcv);
}
/*
* Perform state transitions. This is a one way shot down the flow chart
* described in the big theory statement.
*/
static void
vnd_str_state_transition(void *arg)
{
boolean_t died = B_FALSE;
vnd_str_t *vsp = arg;
mblk_t *mp;
mutex_enter(&vsp->vns_lock);
if (vsp->vns_dlpi_inc == NULL && (vsp->vns_state != VNS_S_INITIAL &&
vsp->vns_state != VNS_S_SHUTTING_DOWN)) {
mutex_exit(&vsp->vns_lock);
return;
}
/*
* When trying to shut down, or unwinding from a failed enabling, rather
* than immediately entering the ZOMBIE state, we may instead opt to try
* and enter the next state in the progression. This is especially
* important when trying to tear everything down.
*/
loop:
DTRACE_PROBE2(vnd__state__transition, uintptr_t, vsp,
vnd_str_state_t, vsp->vns_state);
switch (vsp->vns_state) {
case VNS_S_INITIAL:
VERIFY(vsp->vns_dlpi_inc == NULL);
if (vnd_st_sinfo(vsp) != 0)
died = B_TRUE;
break;
case VNS_S_INFO_SENT:
VERIFY(vsp->vns_dlpi_inc != NULL);
if (vnd_st_info(vsp) == 0) {
if (vnd_st_sexclusive(vsp) != 0)
died = B_TRUE;
} else {
died = B_TRUE;
}
break;
case VNS_S_EXCLUSIVE_SENT:
VERIFY(vsp->vns_dlpi_inc != NULL);
if (vnd_st_exclusive(vsp) == 0) {
if (vsp->vns_dlpi_style == DL_STYLE2) {
if (vnd_st_sattach(vsp) != 0)
died = B_TRUE;
} else {
if (vnd_st_sbind(vsp) != 0)
died = B_TRUE;
}
} else {
died = B_TRUE;
}
break;
case VNS_S_ATTACH_SENT:
VERIFY(vsp->vns_dlpi_inc != NULL);
if (vnd_st_attach(vsp) == 0) {
if (vnd_st_sbind(vsp) != 0)
died = B_TRUE;
} else {
died = B_TRUE;
}
break;
case VNS_S_BIND_SENT:
VERIFY(vsp->vns_dlpi_inc != NULL);
if (vnd_st_bind(vsp) == 0) {
if (vnd_st_spromisc(vsp, DL_PROMISC_SAP,
VNS_S_SAP_PROMISC_SENT) != 0)
died = B_TRUE;
} else {
died = B_TRUE;
}
break;
case VNS_S_SAP_PROMISC_SENT:
VERIFY(vsp->vns_dlpi_inc != NULL);
if (vnd_st_promisc(vsp) == 0) {
if (vnd_st_spromisc(vsp, DL_PROMISC_MULTI,
VNS_S_MULTI_PROMISC_SENT) != 0)
died = B_TRUE;
} else {
died = B_TRUE;
}
break;
case VNS_S_MULTI_PROMISC_SENT:
VERIFY(vsp->vns_dlpi_inc != NULL);
if (vnd_st_promisc(vsp) == 0) {
if (vnd_st_spromisc(vsp, DL_PROMISC_RX_ONLY,
VNS_S_RX_ONLY_PROMISC_SENT) != 0)
died = B_TRUE;
} else {
died = B_TRUE;
}
break;
case VNS_S_RX_ONLY_PROMISC_SENT:
VERIFY(vsp->vns_dlpi_inc != NULL);
if (vnd_st_promisc(vsp) == 0) {
if (vnd_st_spromisc(vsp, DL_PROMISC_FIXUPS,
VNS_S_FIXUP_PROMISC_SENT) != 0)
died = B_TRUE;
} else {
died = B_TRUE;
}
break;
case VNS_S_FIXUP_PROMISC_SENT:
VERIFY(vsp->vns_dlpi_inc != NULL);
if (vnd_st_promisc(vsp) == 0) {
if (vnd_st_scapabq(vsp) != 0)
died = B_TRUE;
} else {
died = B_TRUE;
}
break;
case VNS_S_CAPAB_Q_SENT:
if (vnd_st_capabq(vsp) != 0)
died = B_TRUE;
else
vnd_st_sonline(vsp);
break;
case VNS_S_SHUTTING_DOWN:
vnd_st_shutdown(vsp);
if (vnd_st_spromiscoff(vsp, DL_PROMISC_MULTI,
VNS_S_MULTICAST_PROMISCOFF_SENT) == B_FALSE)
goto loop;
break;
case VNS_S_MULTICAST_PROMISCOFF_SENT:
vnd_st_promiscoff(vsp);
if (vnd_st_spromiscoff(vsp, DL_PROMISC_SAP,
VNS_S_SAP_PROMISCOFF_SENT) == B_FALSE)
goto loop;
break;
case VNS_S_SAP_PROMISCOFF_SENT:
vnd_st_promiscoff(vsp);
if (vnd_st_sunbind(vsp) == B_FALSE)
goto loop;
break;
case VNS_S_UNBIND_SENT:
vnd_st_unbind(vsp);
break;
case VNS_S_ZOMBIE:
while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL)
vnd_drop_ctl(vsp, mp, "vsp received data as a zombie");
break;
default:
panic("vnd_str_t entered an unknown state");
}
if (died == B_TRUE) {
ASSERT(vsp->vns_errno != VND_E_SUCCESS);
vsp->vns_laststate = vsp->vns_state;
vsp->vns_state = VNS_S_ZOMBIE;
cv_broadcast(&vsp->vns_stcv);
}
mutex_exit(&vsp->vns_lock);
}
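/*
 * Taskq entry point that drives the DLPI state machine. We keep transitioning
 * until either the stream has been condemned or there are no more inbound
 * DLPI messages to process.
 */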
static void
vnd_dlpi_taskq_dispatch(void *arg)
{
vnd_str_t *vsp = arg;
int run = 1;
while (run != 0) {
vnd_str_state_transition(vsp);
mutex_enter(&vsp->vns_lock);
if (vsp->vns_flags & VNS_F_CONDEMNED ||
vsp->vns_dlpi_inc == NULL) {
run = 0;
vsp->vns_flags &= ~VNS_F_TASKQ_DISPATCHED;
}
if (vsp->vns_flags & VNS_F_CONDEMNED)
cv_signal(&vsp->vns_cancelcv);
mutex_exit(&vsp->vns_lock);
}
}
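/*
 * vnd doesn't implement the netinfo interrogation entry points below; each of
 * them simply fails. We still need to supply them as part of the
 * net_protocol_t vectors that we register with the netinfo framework.
 */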
/* ARGSUSED */
static int
vnd_neti_getifname(net_handle_t neti, phy_if_t phy, char *buf, const size_t len)
{
return (-1);
}
/* ARGSUSED */
static int
vnd_neti_getmtu(net_handle_t neti, phy_if_t phy, lif_if_t ifdata)
{
return (-1);
}
/* ARGSUSED */
static int
vnd_neti_getptmue(net_handle_t neti)
{
return (-1);
}
/* ARGSUSED */
static int
vnd_neti_getlifaddr(net_handle_t neti, phy_if_t phy, lif_if_t ifdata,
size_t nelem, net_ifaddr_t type[], void *storage)
{
return (-1);
}
/* ARGSUSED */
static int
vnd_neti_getlifzone(net_handle_t neti, phy_if_t phy, lif_if_t ifdata,
zoneid_t *zid)
{
return (-1);
}
/* ARGSUSED */
static int
vnd_neti_getlifflags(net_handle_t neti, phy_if_t phy, lif_if_t ifdata,
uint64_t *flags)
{
return (-1);
}
/* ARGSUSED */
static phy_if_t
vnd_neti_phygetnext(net_handle_t neti, phy_if_t phy)
{
return ((phy_if_t)-1);
}
/* ARGSUSED */
static phy_if_t
vnd_neti_phylookup(net_handle_t neti, const char *name)
{
return ((phy_if_t)-1);
}
/* ARGSUSED */
static lif_if_t
vnd_neti_lifgetnext(net_handle_t neti, phy_if_t phy, lif_if_t ifdata)
{
return (-1);
}
/* ARGSUSED */
static int
vnd_neti_inject(net_handle_t neti, inject_t style, net_inject_t *packet)
{
return (-1);
}
/* ARGSUSED */
static phy_if_t
vnd_neti_route(net_handle_t neti, struct sockaddr *address,
struct sockaddr *next)
{
return ((phy_if_t)-1);
}
/* ARGSUSED */
static int
vnd_neti_ispchksum(net_handle_t neti, mblk_t *mp)
{
return (-1);
}
/* ARGSUSED */
static int
vnd_neti_isvchksum(net_handle_t neti, mblk_t *mp)
{
return (-1);
}
static net_protocol_t vnd_neti_info_v4 = {
NETINFO_VERSION,
NHF_VND_INET,
vnd_neti_getifname,
vnd_neti_getmtu,
vnd_neti_getptmue,
vnd_neti_getlifaddr,
vnd_neti_getlifzone,
vnd_neti_getlifflags,
vnd_neti_phygetnext,
vnd_neti_phylookup,
vnd_neti_lifgetnext,
vnd_neti_inject,
vnd_neti_route,
vnd_neti_ispchksum,
vnd_neti_isvchksum
};
static net_protocol_t vnd_neti_info_v6 = {
NETINFO_VERSION,
NHF_VND_INET6,
vnd_neti_getifname,
vnd_neti_getmtu,
vnd_neti_getptmue,
vnd_neti_getlifaddr,
vnd_neti_getlifzone,
vnd_neti_getlifflags,
vnd_neti_phygetnext,
vnd_neti_phylookup,
vnd_neti_lifgetnext,
vnd_neti_inject,
vnd_neti_route,
vnd_neti_ispchksum,
vnd_neti_isvchksum
};
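/*
 * Register our IPv4 and IPv6 protocols, hook families, and physical in/out
 * hook events with the netinfo framework for this netstack. Each failure case
 * unwinds everything that was registered before it.
 */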
static int
vnd_netinfo_init(vnd_pnsd_t *nsp)
{
nsp->vpnd_neti_v4 = net_protocol_register(nsp->vpnd_nsid,
&vnd_neti_info_v4);
ASSERT(nsp->vpnd_neti_v4 != NULL);
nsp->vpnd_neti_v6 = net_protocol_register(nsp->vpnd_nsid,
&vnd_neti_info_v6);
ASSERT(nsp->vpnd_neti_v6 != NULL);
nsp->vpnd_family_v4.hf_version = HOOK_VERSION;
nsp->vpnd_family_v4.hf_name = "vnd_inet";
if (net_family_register(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4) != 0) {
(void) net_protocol_unregister(nsp->vpnd_neti_v4);
(void) net_protocol_unregister(nsp->vpnd_neti_v6);
cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register "
"failed for stack %d", nsp->vpnd_nsid);
return (1);
}
nsp->vpnd_family_v6.hf_version = HOOK_VERSION;
nsp->vpnd_family_v6.hf_name = "vnd_inet6";
if (net_family_register(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6) != 0) {
(void) net_family_unregister(nsp->vpnd_neti_v4,
&nsp->vpnd_family_v4);
(void) net_protocol_unregister(nsp->vpnd_neti_v4);
(void) net_protocol_unregister(nsp->vpnd_neti_v6);
cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register "
"failed for stack %d", nsp->vpnd_nsid);
return (1);
}
nsp->vpnd_event_in_v4.he_version = HOOK_VERSION;
nsp->vpnd_event_in_v4.he_name = NH_PHYSICAL_IN;
nsp->vpnd_event_in_v4.he_flags = 0;
nsp->vpnd_event_in_v4.he_interested = B_FALSE;
nsp->vpnd_token_in_v4 = net_event_register(nsp->vpnd_neti_v4,
&nsp->vpnd_event_in_v4);
if (nsp->vpnd_token_in_v4 == NULL) {
(void) net_family_unregister(nsp->vpnd_neti_v4,
&nsp->vpnd_family_v4);
(void) net_family_unregister(nsp->vpnd_neti_v6,
&nsp->vpnd_family_v6);
(void) net_protocol_unregister(nsp->vpnd_neti_v4);
(void) net_protocol_unregister(nsp->vpnd_neti_v6);
cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
"failed for stack %d", nsp->vpnd_nsid);
return (1);
}
nsp->vpnd_event_in_v6.he_version = HOOK_VERSION;
nsp->vpnd_event_in_v6.he_name = NH_PHYSICAL_IN;
nsp->vpnd_event_in_v6.he_flags = 0;
nsp->vpnd_event_in_v6.he_interested = B_FALSE;
nsp->vpnd_token_in_v6 = net_event_register(nsp->vpnd_neti_v6,
&nsp->vpnd_event_in_v6);
if (nsp->vpnd_token_in_v6 == NULL) {
(void) net_event_shutdown(nsp->vpnd_neti_v4,
&nsp->vpnd_event_in_v4);
(void) net_event_unregister(nsp->vpnd_neti_v4,
&nsp->vpnd_event_in_v4);
(void) net_family_unregister(nsp->vpnd_neti_v4,
&nsp->vpnd_family_v4);
(void) net_family_unregister(nsp->vpnd_neti_v6,
&nsp->vpnd_family_v6);
(void) net_protocol_unregister(nsp->vpnd_neti_v4);
(void) net_protocol_unregister(nsp->vpnd_neti_v6);
cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
"failed for stack %d", nsp->vpnd_nsid);
return (1);
}
nsp->vpnd_event_out_v4.he_version = HOOK_VERSION;
nsp->vpnd_event_out_v4.he_name = NH_PHYSICAL_OUT;
nsp->vpnd_event_out_v4.he_flags = 0;
nsp->vpnd_event_out_v4.he_interested = B_FALSE;
nsp->vpnd_token_out_v4 = net_event_register(nsp->vpnd_neti_v4,
&nsp->vpnd_event_out_v4);
if (nsp->vpnd_token_out_v4 == NULL) {
(void) net_event_shutdown(nsp->vpnd_neti_v6,
&nsp->vpnd_event_in_v6);
(void) net_event_unregister(nsp->vpnd_neti_v6,
&nsp->vpnd_event_in_v6);
(void) net_event_shutdown(nsp->vpnd_neti_v4,
&nsp->vpnd_event_in_v4);
(void) net_event_unregister(nsp->vpnd_neti_v4,
&nsp->vpnd_event_in_v4);
(void) net_family_unregister(nsp->vpnd_neti_v4,
&nsp->vpnd_family_v4);
(void) net_family_unregister(nsp->vpnd_neti_v6,
&nsp->vpnd_family_v6);
(void) net_protocol_unregister(nsp->vpnd_neti_v4);
(void) net_protocol_unregister(nsp->vpnd_neti_v6);
cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
"failed for stack %d", nsp->vpnd_nsid);
return (1);
}
nsp->vpnd_event_out_v6.he_version = HOOK_VERSION;
nsp->vpnd_event_out_v6.he_name = NH_PHYSICAL_OUT;
nsp->vpnd_event_out_v6.he_flags = 0;
nsp->vpnd_event_out_v6.he_interested = B_FALSE;
nsp->vpnd_token_out_v6 = net_event_register(nsp->vpnd_neti_v6,
&nsp->vpnd_event_out_v6);
if (nsp->vpnd_token_out_v6 == NULL) {
(void) net_event_shutdown(nsp->vpnd_neti_v4,
&nsp->vpnd_event_out_v4);
(void) net_event_unregister(nsp->vpnd_neti_v4,
&nsp->vpnd_event_out_v4);
(void) net_event_shutdown(nsp->vpnd_neti_v6,
&nsp->vpnd_event_in_v6);
(void) net_event_unregister(nsp->vpnd_neti_v6,
&nsp->vpnd_event_in_v6);
(void) net_event_shutdown(nsp->vpnd_neti_v4,
&nsp->vpnd_event_in_v4);
(void) net_event_unregister(nsp->vpnd_neti_v4,
&nsp->vpnd_event_in_v4);
(void) net_family_unregister(nsp->vpnd_neti_v4,
&nsp->vpnd_family_v4);
(void) net_family_unregister(nsp->vpnd_neti_v6,
&nsp->vpnd_family_v6);
(void) net_protocol_unregister(nsp->vpnd_neti_v4);
(void) net_protocol_unregister(nsp->vpnd_neti_v6);
cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
"failed for stack %d", nsp->vpnd_nsid);
return (1);
}
return (0);
}
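/*
 * Shut down all of our hook events; the actual unregistration happens later
 * in vnd_netinfo_fini().
 */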
static void
vnd_netinfo_shutdown(vnd_pnsd_t *nsp)
{
int ret;
ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
VERIFY(ret == 0);
ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4);
VERIFY(ret == 0);
ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6);
VERIFY(ret == 0);
ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6);
VERIFY(ret == 0);
}
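/*
 * Tear down the rest of the netinfo state: unregister the events, families,
 * and protocols in the reverse order of their registration.
 */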
static void
vnd_netinfo_fini(vnd_pnsd_t *nsp)
{
int ret;
ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
VERIFY(ret == 0);
ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4);
VERIFY(ret == 0);
ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6);
VERIFY(ret == 0);
ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6);
VERIFY(ret == 0);
ret = net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4);
VERIFY(ret == 0);
ret = net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6);
VERIFY(ret == 0);
ret = net_protocol_unregister(nsp->vpnd_neti_v4);
VERIFY(ret == 0);
ret = net_protocol_unregister(nsp->vpnd_neti_v6);
VERIFY(ret == 0);
}
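/*
 * This callback runs in the squeue when our barrier block makes it to the
 * front. Mark the barrier as done and wake up whoever is blocked in
 * vnd_strbarrier().
 */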
/* ARGSUSED */
static void
vnd_strbarrier_cb(void *arg, mblk_t *bmp, gsqueue_t *gsp, void *dummy)
{
vnd_str_t *vsp = arg;
VERIFY(bmp == &vsp->vns_barrierblk);
mutex_enter(&vsp->vns_lock);
VERIFY(vsp->vns_flags & VNS_F_BARRIER);
VERIFY(!(vsp->vns_flags & VNS_F_BARRIER_DONE));
vsp->vns_flags |= VNS_F_BARRIER_DONE;
mutex_exit(&vsp->vns_lock);
/*
* For better or worse, we have to broadcast here as we could have a
* thread that's blocked for completion as well as one that's blocked
* waiting to do a barrier itself.
*/
cv_broadcast(&vsp->vns_barriercv);
}
/*
* This is a data barrier for the stream while it is in fastpath mode. It blocks
* and ensures that there is nothing else in the squeue.
*/
static void
vnd_strbarrier(vnd_str_t *vsp)
{
mutex_enter(&vsp->vns_lock);
while (vsp->vns_flags & VNS_F_BARRIER)
cv_wait(&vsp->vns_barriercv, &vsp->vns_lock);
vsp->vns_flags |= VNS_F_BARRIER;
mutex_exit(&vsp->vns_lock);
gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_barrierblk,
vnd_strbarrier_cb, vsp, GSQUEUE_PROCESS, VND_SQUEUE_TAG_STRBARRIER);
mutex_enter(&vsp->vns_lock);
while (!(vsp->vns_flags & VNS_F_BARRIER_DONE))
cv_wait(&vsp->vns_barriercv, &vsp->vns_lock);
vsp->vns_flags &= ~VNS_F_BARRIER;
vsp->vns_flags &= ~VNS_F_BARRIER_DONE;
mutex_exit(&vsp->vns_lock);
/*
* We have to broadcast in case anyone is waiting for the barrier
* themselves.
*/
cv_broadcast(&vsp->vns_barriercv);
}
/*
 * Based on the type of message we're dealing with, we do one of several
 * things. If it's a DLPI control message that we know about, we handle it in
 * one of our state transition taskq threads. Otherwise, we simply pass it
 * along with putnext.
*/
static int
vnd_s_rput(queue_t *q, mblk_t *mp)
{
t_uscalar_t prim;
int dispatch = 0;
vnd_str_t *vsp = q->q_ptr;
switch (DB_TYPE(mp)) {
case M_PROTO:
case M_PCPROTO:
if (MBLKL(mp) < sizeof (t_uscalar_t)) {
vnd_drop_ctl(vsp, mp, "PROTO message too short");
break;
}
prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
if (prim == DL_UNITDATA_REQ || prim == DL_UNITDATA_IND) {
vnd_drop_ctl(vsp, mp,
"recieved an unsupported dlpi DATA req");
break;
}
/*
* Enqueue the entry and fire off a taskq dispatch.
*/
mutex_enter(&vsp->vns_lock);
vnd_dlpi_inc_push(vsp, mp);
if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) {
dispatch = 1;
vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED;
}
mutex_exit(&vsp->vns_lock);
if (dispatch != 0)
taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch,
vsp, 0, &vsp->vns_tqe);
break;
case M_DATA:
vnd_drop_in(vsp, mp, "M_DATA via put(9E)");
break;
default:
putnext(vsp->vns_rq, mp);
}
return (0);
}
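/*
 * Handle the one STREAMS ioctl that we support, VND_STRIOC_ASSOCIATE. It must
 * be a transparent ioctl issued with kcred; we kick off a copyin of the
 * association arguments and finish the work in vnd_striocdata().
 */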
/* ARGSUSED */
static void
vnd_strioctl(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct iocblk *iocp)
{
int error;
vnd_strioc_t *visp;
if (iocp->ioc_cmd != VND_STRIOC_ASSOCIATE ||
iocp->ioc_count != TRANSPARENT) {
error = EINVAL;
goto nak;
}
/*
* All streams ioctls that we support must use kcred as a means to
* distinguish that this is a layered open by the kernel as opposed to
* one by a user who has done an I_PUSH of the module.
*/
if (iocp->ioc_cr != kcred) {
error = EPERM;
goto nak;
}
if (mp->b_cont == NULL) {
error = EAGAIN;
goto nak;
}
visp = kmem_alloc(sizeof (vnd_strioc_t), KM_SLEEP);
ASSERT(MBLKL(mp->b_cont) == sizeof (caddr_t));
visp->vs_addr = *(caddr_t *)mp->b_cont->b_rptr;
visp->vs_state = VSS_COPYIN;
mcopyin(mp, (void *)visp, sizeof (vnd_strioc_associate_t), NULL);
qreply(q, mp);
return;
nak:
if (mp->b_cont != NULL) {
freemsg(mp->b_cont);
mp->b_cont = NULL;
}
iocp->ioc_error = error;
mp->b_datap->db_type = M_IOCNAK;
iocp->ioc_count = 0;
qreply(q, mp);
}
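/*
 * Handle the M_IOCDATA that completes VND_STRIOC_ASSOCIATE: validate the
 * copied-in arguments, look up the vnd device and netstack that they name,
 * and associate this stream with them.
 */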
static void
vnd_striocdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp)
{
vnd_str_state_t state;
struct copyreq *crp;
vnd_strioc_associate_t *vss;
vnd_dev_t *vdp = NULL;
vnd_pnsd_t *nsp = NULL;
char iname[2*VND_NAMELEN];
zone_t *zone;
vnd_strioc_t *visp;
visp = (vnd_strioc_t *)csp->cp_private;
/* If it's not ours, it's not our problem */
if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) {
if (q->q_next != NULL) {
putnext(q, mp);
} else {
VND_STAT_INC(vsp, vks_ndlpidrops, 1);
VND_STAT_INC(vsp, vks_tdrops, 1);
vnd_drop_ctl(vsp, mp, "uknown cmd for M_IOCDATA");
}
kmem_free(visp, sizeof (vnd_strioc_t));
return;
}
/* The nak is already sent for us */
if (csp->cp_rval != 0) {
vnd_drop_ctl(vsp, mp, "M_COPYIN failed");
kmem_free(visp, sizeof (vnd_strioc_t));
return;
}
/* Data is sitting for us in b_cont */
if (mp->b_cont == NULL ||
MBLKL(mp->b_cont) != sizeof (vnd_strioc_associate_t)) {
kmem_free(visp, sizeof (vnd_strioc_t));
miocnak(q, mp, 0, EINVAL);
return;
}
vss = (vnd_strioc_associate_t *)mp->b_cont->b_rptr;
vdp = vnd_dev_lookup(vss->vsa_minor);
if (vdp == NULL) {
vss->vsa_errno = VND_E_NODEV;
goto nak;
}
nsp = vnd_nsd_lookup(vss->vsa_nsid);
if (nsp == NULL) {
vss->vsa_errno = VND_E_NONETSTACK;
goto nak;
}
mutex_enter(&vsp->vns_lock);
if (!(vsp->vns_flags & VNS_F_NEED_ZONE)) {
mutex_exit(&vsp->vns_lock);
vss->vsa_errno = VND_E_ASSOCIATED;
goto nak;
}
vsp->vns_nsd = nsp;
vsp->vns_flags &= ~VNS_F_NEED_ZONE;
vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED;
mutex_exit(&vsp->vns_lock);