forked from illumos/illumos-gate
-
Notifications
You must be signed in to change notification settings - Fork 109
/
vnd.c
5796 lines (5176 loc) · 165 KB
/
vnd.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*/
/*
* Copyright 2017 Joyent, Inc.
*/
/*
* vnd - virtual (machine) networking datapath
*
* vnd's purpose is to provide a highly performant data path for Layer 2 network
* traffic and exist side by side with an active IP netstack, each servicing
* different datalinks. vnd provides many of the same capabilities as the
* current TCP/IP stack does and some specific to layer two. Specifically:
*
* o Use of the DLD fastpath
* o Packet capture hooks
* o Ability to use hardware capabilities
* o Useful interfaces for handling multiple frames
*
* The following image shows where vnd fits into today's networking stack:
*
* +---------+----------+----------+
* | libdlpi | libvnd | libsocket|
* +---------+----------+----------+
* | · · VFS |
* | VFS · VFS +----------+
* | · | sockfs |
* +---------+----------+----------+
* | | VND | IP |
* | +----------+----------+
* | DLD/DLS |
* +-------------------------------+
* | MAC |
* +-------------------------------+
* | GLDv3 |
* +-------------------------------+
*
* -----------------------------------------
* A Tale of Two Devices - DDI Device Basics
* -----------------------------------------
*
* vnd presents itself to userland as a character device; however, it also is a
* STREAMS device so that it can interface with dld and the rest of the
* networking stack. Users never interface with the STREAMS devices directly and
* they are purely an implementation detail of vnd. Opening the STREAMS device
* requires kcred and as such userland cannot interact with it or push it onto
* the stream head.
*
* The main vnd character device, /dev/vnd/ctl, is a self-cloning device. Every
* clone gets its own minor number; however, minor nodes are not created in the
* devices tree for these instances. In this state a user may do two different
* things. They may issue ioctls that affect global state or they may issue
* ioctls that try to attach it to a given datalink. Once a minor device has
* been attached to a datalink, all operations on it are scoped to that context,
* therefore subsequent global operations are not permitted.
*
* A given device can be linked into the /devices and /dev name space via a link
* ioctl. That ioctl causes a minor node to be created in /devices and then it
* will also appear under /dev/vnd/ due to vnd's sdev plugin. This is similar
* to, but simpler than, IP's persistence mechanism.
*
* ---------------------
* Binding to a datalink
* ---------------------
*
* Datalinks are backed by the dld (datalink device) and dls (datalink services)
* drivers. These drivers provide a STREAMS device for datalinks on the system
* which are exposed through /dev/net. Userland generally manipulates datalinks
* through libdlpi. When an IP interface is being plumbed up what actually
* happens is that someone does a dlpi_open(3DLPI) of the underlying datalink
* and then pushes on the ip STREAMS module with an I_PUSH ioctl. Modules may
* then negotiate with dld and dls to obtain access to various capabilities
* and fast paths via a series of STREAMS messages.
*
* In vnd, we do the same thing, but we leave our STREAMS module as an
* implementation detail of the system. We don't want users to be able to
* arbitrarily push vnd STREAMS module onto any stream, so we explicitly require
* kcred to manipulate it. Thus, when a user issues a request to attach a
* datalink to a minor instance of the character device, that vnd minor instance
* itself does a layered open (ldi_open_by_name(9F)) of the specified datalink.
* vnd does that open using the passed in credentials from the ioctl, not kcred.
* This ensures that users who don't have permissions to open the device
* cannot. Once that's been opened, we push on the vnd streams module.
*
* Once the vnd STREAMS instance has been created for this device, eg. the
* I_PUSH ioctl returns, we explicitly send a STREAMS ioctl
* (VND_STRIOC_ASSOCIATE) to associate the vnd STREAMS and character devices.
* This association begins the STREAM device's initialization. We start up an
* asynchronous state machine that takes care of all the different aspects of
* plumbing up the device with dld and dls and enabling the MAC fast path. We
* need to guarantee to consumers of the character device that by the time their
* ioctl returns, the data path has been fully initialized.
*
* The state progression is fairly linear. There are two general steady states.
* The first is VNS_S_ONLINE, which means that everything is jacked up and good
* to go. The alternative is VNS_S_ZOMBIE, which means that the streams device
* encountered an error or we have finished tearing it down and the character
* device can clean it up. The following is our state progression and the
* meaning of each state:
*
* |
* |
* V
* +---------------+
* | VNS_S_INITIAL | This is our initial state. Every
* +---------------+ vnd STREAMS device starts here.
* | While in this state, only dlpi
* | M_PROTO and M_IOCTL messages can be
* | sent or received. All STREAMS based
* | data messages are dropped.
* | We transition out of this state by
* | sending a DL_INFO_REQ to obtain
* | information about the underlying
* | link.
* v
* +-----------------+
* +--<-| VNS_S_INFO_SENT | In this state, we verify and
* | +-----------------+ record information about the
* | | underlying device. If the device is
* | | not suitable, eg. not of type
* v | DL_ETHER, then we immediately
* | | become a ZOMBIE. To leave this
* | | state we request exclusive active
* | | access to the device via
* v | DL_EXCLUSIVE_REQ.
* | v
* | +----------------------+
* +--<-| VNS_S_EXCLUSIVE_SENT | In this state, we verify whether
* | +----------------------+ or not we were able to obtain
* | | | exclusive access to the device. If
* | | | we were not able to, then we leave,
* v | | as that means that something like
* | | | IP is already plumbed up on top of
* | | | the datalink. We leave this state
* | | | by progressing through to the
* | | | appropriate DLPI primitive, either
* v | | DL_ATTACH_REQ or DL_BIND_REQ
* | | | depending on the style of the
* | | | datalink.
* | | v
* | | +-------------------+
* +------ |--<-| VNS_S_ATTACH_SENT | In this state, we verify we were
* | | +-------------------+ able to perform a standard DLPI
* | | | attach and if so, go ahead and
* v | | send a DL_BIND_REQ.
* | v v
* | +-------------------+
* +--<-| VNS_S_BIND_SENT | In this state we see the result of
* | +-------------------+ our attempt to bind to PPA 0 of the
* v | underlying device. Because we're
* | | trying to be a layer two datapath,
* | | the specific attachment point isn't
* | | too important as we're going to
* v | have to enable promiscuous mode. We
* | | transition out of this by sending
* | | our first of four promiscuous mode
* | | requests.
* v v
* | +------------------------+
* +--<-| VNS_S_SAP_PROMISC_SENT | In this state we verify that we
* | +------------------------+ were able to enable promiscuous
* | | mode at the physical level. We
* | | transition out of this by enabling
* | | multicast and broadcast promiscuous
* v | mode.
* | v
* | +--------------------------+
* +--<-| VNS_S_MULTI_PROMISC_SENT | In this state we verify that we
* | +--------------------------+ have enabled DL_PROMISC_MULTI and
* v | move onto the third promiscuous
* | | mode request.
* | v
* | +----------------------------+
* +--<-| VNS_S_RX_ONLY_PROMISC_SENT | In this state we verify that we
* | +----------------------------+ enabled RX_ONLY promiscuous mode.
* | | We specifically do this as we don't
* v | want to receive our own traffic
* | | that we'll send out. We leave this
* | | state by enabling the final flag
* | | DL_PROMISC_FIXUPS.
* | v
* | +--------------------------+
* +--<-| VNS_S_FIXUP_PROMISC_SENT | In this state we verify that we
* | +--------------------------+ enabled FIXUP promiscuous mode.
* | | We specifically do this as we need
* v | to ensure that traffic which is
* | | received by being looped back to us
* | | correctly has checksums fixed. We
* | | leave this state by requesting the
* | | dld/dls capabilities that we can
* v | process.
* | v
* | +--------------------+
* +--<-| VNS_S_CAPAB_Q_SENT | We loop over the set of
* | +--------------------+ capabilities that dld advertised
* | | and enable the ones that we currently
* v | support for use. See the section
* | | later on regarding capabilities
* | | for more information. We leave this
* | | state by sending an enable request.
* v v
* | +--------------------+
* +--<-| VNS_S_CAPAB_E_SENT | Here we finish all capability
* | +--------------------+ initialization. Once finished, we
* | | transition to the next state. If
* v | the dld fast path is not available,
* | | we become a zombie.
* | v
* | +--------------+
* | | VNS_S_ONLINE | This is a vnd STREAMS device's
* | +--------------+ steady state. It will normally
* | | reside in this state while it is in
* | | active use. It will only transition
* v | to the next state when the STREAMS
* | | device is closed by the character
* | | device. In this state, all data
* | | flows over the dld fast path.
* | v
* | +---------------------+
* +--->| VNS_S_SHUTTING_DOWN | This vnd state takes care of
* | +---------------------+ disabling capabilities and
* | | flushing all data. At this point
* | | any additional data that we receive
* | | will be dropped. We leave this
* v | state by trying to remove multicast
* | | promiscuity.
* | |
* | v
* | +---------------------------------+
* +-->| VNS_S_MULTICAST_PROMISCOFF_SENT | In this state, we check if we have
* | +---------------------------------+ successfully removed multicast
* | | promiscuous mode. If we have
* | | failed, we still carry on but only
* | | warn. We leave this state by trying
* | | to disable SAP level promiscuous
* | | mode.
* | v
* | +---------------------------+
* +-->| VNS_S_SAP_PROMISCOFF_SENT | In this state, we check if we have
* | +---------------------------+ successfully removed SAP level
* | | promiscuous mode. If we have
* | | failed, we still carry on but only
* | | warn. Note that we don't worry
* | | about either of
* | | DL_PROMISC_FIXUPS or
* | | DL_PROMISC_RX_ONLY. If these are
* | | the only two entries left, then we
* | | should not have anything that MAC is
* | | doing for us at this point,
* | | therefore it's safe for us to
* | | proceed to unbind, which is how we
* | | leave this state via a
* | v DL_UNBIND_REQ.
* | +-------------------+
* +--->| VNS_S_UNBIND_SENT | Here, we check how the unbind
* | +-------------------+ request went. Regardless of its
* | | success, we always transition to
* | | a zombie state.
* | v
* | +--------------+
* +--->| VNS_S_ZOMBIE | In this state, the vnd STREAMS
* +--------------+ device is waiting to finish being
* reaped. Because we have no more
* ways to receive data it should be
* safe to destroy all remaining data
* structures.
*
* If the stream association fails for any reason the state machine reaches
* VNS_S_ZOMBIE. A more detailed vnd_errno_t will propagate back through the
* STREAMS ioctl to the character device. That will fail the user ioctl and
* propagate the vnd_errno_t back to userland. If, on the other hand, the
* association succeeds, then the vnd STREAMS device will be fully plumbed up
* and ready to transmit and receive message blocks. Consumers will be able to
* start using the other cbops(9E) entry points once the attach has fully
* finished, which will occur after the original user attach ioctl to the
* character device returns.
*
* It's quite important that we end up sending the full series of STREAMS
* messages when tearing down. While it's tempting to say that we should just
* rely on the STREAMS device being closed to properly ensure that we have no
* more additional data, that's not sufficient due to our use of direct
* callbacks. DLS does not ensure that by the time we change the direct
* callback (vnd_mac_input) that all callers to it will have been quiesced.
* However, it does guarantee that if we disable promiscuous mode ourselves and
* we turn off the main data path via DL_UNBIND_REQ that it will work.
* Therefore, we make sure to do this ourselves rather than letting DLS/DLD do
* it as part of tearing down the STREAMS device. This ensures that we'll
* quiesce all data before we destroy our data structures and thus we should
* eliminate the race in changing the data function.
*
* --------------------
* General Architecture
* --------------------
*
* There are several different devices and structures in the vnd driver. There
* is a per-netstack component, pieces related to the character device that
* consumers see, the internal STREAMS device state, and the data queues
* themselves. The following ASCII art picture describes their relationships and
* some of the major pieces of data that contain them. These are not exhaustive,
* e.g. synchronization primitives are left out.
*
* +----------------+ +-----------------+
* | global | | global |
* | device list | | netstack list |
* | vnd_dev_list | | vnd_nsd_list |
* +----------------+ +-----------------+
* | |
* | v
* | +-------------------+ +-------------------+
* | | per-netstack data | ---> | per-netstack data | --> ...
* | | vnd_pnsd_t | | vnd_pnsd_t |
* | | | +-------------------+
* | | |
* | | netstackid_t ---+----> Netstack ID
* | | vnd_pnsd_flags_t -+----> Status flags
* | | zoneid_t ---+----> Zone ID for this netstack
* | | hook_family_t ---+----> VND IPv4 Hooks
* | | hook_family_t ---+----> VND IPv6 Hooks
* | | list_t ----+ |
* | +------------+------+
* | |
* | v
* | +------------------+ +------------------+
* | | character device | ---> | character device | -> ...
* +---------->| vnd_dev_t | | vnd_dev_t |
* | | +------------------+
* | |
* | minor_t ---+--> device minor number
* | ldi_handle_t ---+--> handle to /dev/net/%datalink
* | vnd_dev_flags_t -+--> device flags, non blocking, etc.
* | char[] ---+--> name if linked
* | vnd_str_t * -+ |
* +--------------+---+
* |
* v
* +-------------------------+
* | STREAMS device |
* | vnd_str_t |
* | |
* | vnd_str_state_t ---+---> State machine state
* | gsqueue_t * ---+---> mblk_t Serialization queue
* | vnd_str_stat_t ---+---> per-device kstats
* | vnd_str_capab_t ---+----------------------------+
* | vnd_data_queue_t ---+ | |
* | vnd_data_queue_t -+ | | v
* +-------------------+-+---+ +---------------------+
* | | | Stream capabilities |
* | | | vnd_str_capab_t |
* | | | |
* | | supported caps <--+-- vnd_capab_flags_t |
* | | dld cap handle <--+-- void * |
* | | direct tx func <--+-- vnd_dld_tx_t |
* | | +---------------------+
* | |
* +----------------+ +-------------+
* | |
* v v
* +-------------------+ +-------------------+
* | Read data queue | | Write data queue |
* | vnd_data_queue_t | | vnd_data_queue_t |
* | | | |
* | size_t ----+--> Current size | size_t ----+--> Current size
* | size_t ----+--> Max size | size_t ----+--> Max size
* | mblk_t * ----+--> Queue head | mblk_t * ----+--> Queue head
* | mblk_t * ----+--> Queue tail | mblk_t * ----+--> Queue tail
* +-------------------+ +-------------------+
*
*
* Globally, we maintain two lists. One list contains all of the character
* device soft states. The other maintains a list of all our netstack soft
* states. Each netstack maintains a list of active devices that have been
* associated with a datalink in its netstack.
*
* Recall that a given minor instance of the character device exists in one of
* two modes. It can either be a cloned open of /dev/vnd/ctl, the control node,
* or it can be associated with a given datalink. When minor instances are in
* the former state, they do not exist in a given vnd_pnsd_t's list of devices.
* As part of attaching to a datalink, the given vnd_dev_t will be inserted into
* the appropriate vnd_pnsd_t. In addition, this will cause a STREAMS device, a
* vnd_str_t, to be created and associated to a vnd_dev_t.
*
* The character device, and its vnd_dev_t, is the interface to the rest of the
* system. The vnd_dev_t keeps track of various aspects like whether various
* operations, such as read, write and the frameio ioctls, are considered
* blocking or non-blocking in the O_NONBLOCK sense. It also is responsible for
* keeping track of things like the name of the device, if any, in /dev. The
* vnd_str_t, on the other hand manages aspects like buffer sizes and the actual
* data queues. However, ioctls that manipulate these properties all go through
* the vnd_dev_t to its associated vnd_str_t.
*
* Each of the STREAMS devices, the vnd_str_t, maintains two data queues. One
* for frames to transmit (write queue) and one for frames received (read
* queue). These data queues have a maximum size and attempting to add data
* beyond that maximum size will result in data being dropped. The sizes are
* configurable via ioctls VND_IOC_SETTXBUF, VND_IOC_SETRXBUF. Data either sits
* in those buffers or has a reservation in those buffers while they are in vnd
* and waiting to be consumed by the user or by mac.
*
* Finally, the vnd_str_t also has a vnd_str_capab_t which we use to manage the
* available, negotiated, and currently active features.
*
* ----------------------
* Data Path and gsqueues
* ----------------------
*
* There's a lot of plumbing in vnd to get to the point where we can send data,
* but vnd's bread and butter is the data path, so it's worth diving into it in
* more detail. Data enters and exits the system from two ends.
*
* The first end is the vnd consumer. This comes in the form of read and write
* system calls as well as the frame I/O ioctls. The read and write system calls
* operate on a single frame at a time. Think of a frame as a single message
* that has come in off the wire, which may itself comprise multiple mblk_t's
* linked together in the kernel. readv(2) and writev(2) have the same
* limitations as read(2) and write(2). We enforce this as the system is
* required to fill up every uio(9S) buffer before moving onto the next one.
* This means that if you have a MTU sized buffer and two frames come in which
* are less than half of the MTU they must fill up the given iovec. Even if we
* didn't want to do this, we have no way of informing the supplier of the
* iovecs that they were only partially filled or where one frame ends and
* another begins. That's life, as such we have frame I/O which solves this
* problem. It allows for multiple frames to be consumed as well as for frames
* to be broken down into multiple vector components.
*
* The second end is the mac direct calls. As part of negotiating capabilities
* via dld, we give mac a function of ours to call when packets are received
* [vnd_mac_input()] and a callback to indicate that flow has been restored
* [vnd_mac_flow_control()]. In turn, we also get a function pointer that we can
* transmit data with. As part of the contract with mac, mac is allowed to flow
* control us by returning a cookie to the transmit function. When that happens,
* all outbound traffic is halted until our callback function is called and we
* can schedule drains.
*
* It's worth looking at these in further detail. We'll start with the rx path.
*
*
* |
* * . . . packets from gld
* |
* v
* +-------------+
* | mac |
* +-------------+
* |
* v
* +-------------+
* | dld |
* +-------------+
* |
* * . . . dld direct callback
* |
* v
* +---------------+
* | vnd_mac_input |
* +---------------+
* |
* v
* +---------+ +-------------+
* | dropped |<--*---------| vnd_hooks |
* | by | . +-------------+
* | hooks | . drop probe |
* +---------+ kstat bump * . . . Do we have free
* | buffer space?
* |
* no . | . yes
* . + .
* +---*--+------*-------+
* | |
* * . . drop probe * . . recv probe
* | kstat bump | kstat bump
* v |
* +---------+ * . . fire pollin
* | freemsg | v
* +---------+ +-----------------------+
* | vnd_str_t`vns_dq_read |
* +-----------------------+
* ^ ^
* +----------+ | | +---------+
* | read(9E) |-->-+ +--<--| frameio |
* +----------+ +---------+
*
* The rx path is rather linear. Packets come into us from mac. We always run
* them through the various hooks, and if they come out of that, we inspect the
* read data queue. If there is not enough space for a packet, we drop it.
* Otherwise, we append it to the data queue, and fire read notifications
* targeting anyone polling or doing blocking I/O on this device. Those
* consumers then drain the head of the data queue.
*
* The tx path is more complicated due to mac flow control. After any call into
* mac, we may have to potentially suspend writes and buffer data for an
* arbitrary amount of time. As such, we need to carefully track the total
* amount of outstanding data so that we don't waste kernel memory. This is
* further complicated by the fact that mac will asynchronously tell us when our
* flow has been resumed.
*
* For data to be able to enter the system, it needs to be able to take a
* reservation from the write data queue. Once the reservation has been
* obtained, we enter the gsqueue so that we can actually append it. We use
* gsqueues (serialization queues) to ensure that packets are manipulated in
* order as we deal with the draining and appending packets. We also leverage
* its worker thread to help us do draining after mac has restored our flow.
*
* The following image describes the flow:
*
* +-----------+ +--------------+ +-------------------------+ +------+
* | write(9E) |-->| Space in the |--*--->| gsqueue_enter_one() |-->| Done |
* | frameio | | write queue? | . | +->vnd_squeue_tx_append | +------+
* +-----------+ +--------------+ . +-------------------------+
* | ^ .
* | | . reserve space from gsqueue
* | | |
* queue . . . * | space v
* full | * . . . avail +------------------------+
* v | | vnd_squeue_tx_append() |
* +--------+ +------------+ +------------------------+
* | EAGAIN |<--*------| Non-block? |<-+ |
* +--------+ . +------------+ | v
* . yes v | wait +--------------+
* no . .* * . . for | append chain |
* +----+ space | to outgoing |
* | mblk chain |
* from gsqueue +--------------+
* | |
* | +-------------------------------------------------+
* | |
* | | yes . . .
* v v .
* +-----------------------+ +--------------+ . +------+
* | vnd_squeue_tx_drain() |--->| mac blocked? |----*---->| Done |
* +-----------------------+ +--------------+ +------+
* | |
* +---------------------------------|---------------------+
* | | tx |
* | no . . * queue . . *
* | flow controlled . | empty * . fire pollout
* | . v | if mblk_t's
* +-------------+ . +---------------------+ | sent
* | set blocked |<----*------| vnd_squeue_tx_one() |--------^-------+
* | flags | +---------------------+ |
* +-------------+ More data | | | More data |
* and limit ^ v * . . and limit ^
* not reached . . * | | reached |
* +----+ | |
* v |
* +----------+ +-------------+ +---------------------------+
* | mac flow |--------->| remove mac |--->| gsqueue_enter_one() with |
* | control | | block flags | | vnd_squeue_tx_drain() and |
* | callback | +-------------+ | GSQUEUE_FILL flag, iff |
* +----------+ | not already scheduled |
* +---------------------------+
*
* The final path taken for a given write(9E)/frameio ioctl depends on whether
* or not the vnd_dev_t is non-blocking. That controls the initial path of
* trying to take a reservation in the write data queue. If the device is in
* non-blocking mode, we'll return EAGAIN when there is not enough space
* available, otherwise, the calling thread blocks on the data queue.
*
* Today when we call into vnd_squeue_tx_drain() we will not try to drain the
* entire queue, as that could be quite large and we don't want to necessarily
* keep the thread that's doing the drain until it's been finished. Not only
* could more data be coming in, but the draining thread could be a userland
* thread that has more work to do. We have two limits today. There is an upper
* bound on the total amount of data and the total number of mblk_t chains. If
* we hit either limit, then we will schedule another drain in the gsqueue and
* go from there.
*
* It's worth taking some time to describe how we interact with gsqueues. vnd
* has a gsqueue_set_t for itself. It's important that it has its own set, as
* the profile of work that vnd does is different from other sub-systems in the
* kernel. When we open a STREAMS device in vnd_s_open, we get a random gsqueue.
* Unlike TCP/IP, which uses a gsqueue per TCP connection, we end up
* maintaining one for a given device. Because of that, we want to use a
* pseudo-random one to try and spread out the load, and picking one at random
* is likely to be just as good as any fancy algorithm we might come up with,
* especially as any two devices could have radically different transmit
* profiles.
*
* While some of the write path may seem complicated, it does allow us to
* maintain an important property. Once we have acknowledged a write(9E) or
* frameio ioctl, we will not drop the packet, excepting something like ipf via
* the firewall hooks.
*
* There is one other source of flow control that can exist in the system which
* is in the form of a barrier. The barrier is an internal mechanism used for
* ensuring that a gsqueue is drained for a given device. We use this as part
* of tearing down. Specifically we disable the write path so nothing new can be
* inserted into the gsqueue and then insert a barrier block. Once the barrier
* block comes out of the gsqueue, then we know nothing else in the gsqueue that
* could refer to the vnd_str_t, being destroyed, exists.
*
* ---------------------
* vnd, zones, netstacks
* ---------------------
*
* vnd devices are scoped to datalinks and datalinks are scoped to a netstack.
* Because of that, vnd is also a netstack module. It registers with the
* netstack sub-system and receives callbacks every time a netstack is created,
* shut down, and destroyed. The netstack callbacks drive the creation and
* destruction of the vnd_pnsd_t structures.
*
* Recall from the earlier architecture diagrams that every vnd device is scoped
* to a netstack and known about by a given vnd_pnsd_t. When that netstack is
* torn down, we also tear down any vnd devices that are hanging around. When
* the netstack is torn down, we know that any zones that are scoped to that
* netstack are being shut down and have no processes remaining. This is going
* to be the case whether they are shared or exclusive stack zones. We have to
* perform a careful dance.
*
* There are two different callbacks that happen on tear down, the first is a
* shutdown callback, the second is a destroy callback. When the shutdown
* callback is fired we need to prepare for the netstack to go away and ensure
* that nothing can continue to persist itself.
*
* More specifically, when we get notice of a stack being shut down we first
* remove the netstack from the global netstack list to ensure that no one new
* can come in and find the netstack and get a reference to it. After that, we
* notify the neti hooks that they're going away. Once that's all done, we get
* to the heart of the matter.
*
* When shutting down there could be any number of outstanding contexts that
* have a reference on the vnd_pnsd_t and on the individual links. However, we
* know that no one new will be able to find the vnd_pnsd_t. To account for
* things that have existing references we mark the vnd_pnsd_t`vpnd_flags with
* VND_NS_CONDEMNED. This is checked by code paths that wish to append a device
* to the netstack's list. If this is set, then they must not append to it.
* Once this is set, we know that the netstack's list of devices can never grow,
* only shrink.
*
* Next, for each device we tag it with VND_D_ZONE_DYING. This indicates that
* the container for the device is being destroyed and that we should not allow
* additional references to the device to be created, whether via open, or
* linking. The presence of this bit also allows things like the list ioctl and
* sdev to know not to consider its existence. At the conclusion of this being
* set, we know that no one else should be able to obtain a new reference to the
* device.
*
* Once that has been set for all devices, we go through and remove any existing
* links that have been established in sdev. Because doing that may cause the
* final reference for the device to be dropped, which still has a reference to
* the netstack, we have to restart our walk due to dropped locks. We know that
* this walk will eventually complete because the device cannot be relinked and
* no new devices will be attached in this netstack due to VND_NS_CONDEMNED.
* Once that's finished, the shutdown callback returns.
*
* When we reach the destroy callback, we simply wait for references on the
* netstack to disappear. Because the zone has been shut down, all processes in
* it that have open references have been terminated and reaped. Any threads
* that are newly trying to reference it will fail. However, there is one thing
* that can halt this that we have no control over, which is the global zone
* holding open a reference to the device. In this case the zone halt will hang
* in vnd_stack_destroy. Once the last reference is dropped we finish
* destroying the netinfo hooks and free the vnd_pnsd_t.
*
* ----
* sdev
* ----
*
* vnd registers a sdev plugin which allows it to dynamically fill out /dev/vnd
* for both the global and non-global zones. In any given zone we always supply
* a control node via /dev/vnd/ctl. This is the self-cloning node. Each zone
* will also have an entry per-link in that zone under /dev/vnd/%datalink, eg.
* if a link was named net0, there would be a /dev/vnd/net0. The global zone can
* also see every link for every zone, ala /dev/net, under
* /dev/vnd/%zonename/%datalink, eg. if a zone named 'turin' had a vnd device
* named net0, the global zone would have /dev/vnd/turin/net0.
*
* The sdev plugin has three interfaces that it supplies back to sdev. One is to
* validate that a given node is still valid. The next is a callback from sdev
* to say that it is no longer using the node. The third and final one is from
* sdev where it asks us to fill a directory. All of the heavy lifting is done
* in directory filling and in validation. We opt not to maintain a reference
* on the device while there is an sdev node present. This makes the removal of
* nodes much simpler and most of the possible failure modes shouldn't cause any
* real problems. For example, the open path has to handle both dev_t's which no
* longer exist and which are no longer linked.
*
* -----
* hooks
* -----
*
* Like IP, vnd sends all L3 packets through its firewall hooks. Currently vnd
* provides these for L3 IP and IPv6 traffic. Each netstack provides these hooks
* in a minimal fashion. While we will allow traffic to be filtered through the
* hooks, we do not provide means for packet injection or additional inspection
* at this time. There are a total of four different events created:
*
* o IPv4 physical in
* o IPv4 physical out
* o IPv6 physical in
* o IPv6 physical out
*
* ---------------
* Synchronization
* ---------------
*
* To make our synchronization simpler, we've put more effort into making the
* metadata/setup paths do more work. That work allows the data paths to make
* assumptions around synchronization that simplify the general case. Each major
* structure, the vnd_pnsd_t, vnd_dev_t, vnd_str_t, and vnd_data_queue_t is
* annotated with the protection that its members receive. The following
* annotations are used:
*
* A Atomics; these values are only modified using atomic operations.
* Currently this only applies to kstat values.
* E Existence; no lock is needed to access this member, it does not
* change while the structure is valid.
* GL Global Lock; these members are protected by the global
* vnd_dev_lock.
* L Locked; access to the member is controlled by a lock that is in
* the structure.
* NSL netstack lock; this member is protected by the containing
* netstack. This only applies to the vnd_dev_t`vdd_nslink.
* X This member is special, and is discussed in this section.
*
* In addition to locking, we also have reference counts on the vnd_dev_t and
* the vnd_pnsd_t. The reference counts describe the lifetimes of the structure.
* With rare exception, once a reference count is decremented, the consumer
* should not assume that the data is valid any more. The only exception to this
* is the case where we're removing an extant reference count from a link into
* /devices or /dev. Reference counts are obtained on these structures as a part
* of looking them up.
*
* # Global Lock Ordering
* ######################
*
* The following is the order that you must take locks in vnd:
*
* 1) vnd`vnd_dev_lock
* 2) vnd_pnsd_t`vpnd_lock
* 3) vnd_dev_t`vdd_lock
* 4) vnd_str_t`vns_lock
* 5) vnd_data_queue_t`vdq_lock
*
* One must adhere to the following rules:
*
* o You must acquire a lower numbered lock before a higher numbered lock.
* o It is NOT legal to hold two locks of the same level concurrently, eg. you
* can not hold two different vnd_dev_t's vdd_lock at the same time.
* o You may release locks in any order.
* o If you release a lock, you must honor the locking rules before acquiring
* it again.
* o You should not hold any locks when calling any of the rele functions.
*
* # Special Considerations
* ########################
*
* While most of the locking is what's expected, it's worth going into the
* special nature that a few members hold. Today, only two structures have
* special considerations: the vnd_dev_t and the vnd_str_t. All members with
* special considerations have an additional annotation that describes how you
* should interact with it.
*
* vnd_dev_t: The vdd_nsd and vdd_cr are only valid when the minor node is
* attached or in the process of attaching. If the code path that goes through
* requires an attached vnd_dev_t, eg. the data path and tear down path, then it
* is always legal to dereference that member without a lock held. When they are
* added to the system, they should be done under the vdd_lock and done as part
* of setting the VND_D_ATTACH_INFLIGHT flag. These should not change during the
* lifetime of the vnd_dev_t.
*
* vnd_dev_t: The vdd_ldih is similar to the vdd_nsd and vdd_cr, except that it
* always exists as it is a part of the structure. The only time that it's valid
* to be using it is during the attach path with the VND_D_ATTACH_INFLIGHT flag
* set or during tear down. Outside of those paths which are naturally
* serialized, there is no explicit locking around the member.
*
* vnd_str_t: The vns_dev and vns_nsd work in similar ways. They are not
* initially set as part of creating the structure, but are set as part of
* responding to the association ioctl. Anything in the data path or metadata
* path that requires association may assume that they exist, as we do not kick
* off the state machine until they're set.
*
* vnd_str_t: The vns_drainblk and vns_barrierblk are similarly special. The
* members are designed to be used as part of various operations with the
* gsqueues. A lock isn't needed to use them, but to work with them, the
* appropriate flag in the vnd_str_t`vns_flags must have been set by the current
* thread. Otherwise, it is always fair game to refer to their addresses. Their
* contents are ignored by vnd, but some members are manipulated by the gsqueue
* subsystem.
*/
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/modctl.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/open.h>
#include <sys/ddi.h>
#include <sys/ethernet.h>
#include <sys/stropts.h>
#include <sys/sunddi.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/ksynch.h>
#include <sys/taskq_impl.h>
#include <sys/sdt.h>
#include <sys/debug.h>
#include <sys/sysmacros.h>
#include <sys/dlpi.h>
#include <sys/cred.h>
#include <sys/id_space.h>
#include <sys/list.h>
#include <sys/ctype.h>
#include <sys/policy.h>
#include <sys/sunldi.h>
#include <sys/cred.h>
#include <sys/strsubr.h>
#include <sys/poll.h>
#include <sys/neti.h>
#include <sys/hook.h>
#include <sys/hook_event.h>
#include <sys/vlan.h>
#include <sys/dld.h>
#include <sys/mac_client.h>
#include <sys/netstack.h>
#include <sys/fs/sdev_plugin.h>
#include <sys/kstat.h>
#include <sys/atomic.h>
#include <sys/disp.h>
#include <sys/random.h>
#include <sys/gsqueue.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <sys/vnd.h>
/*
 * Globals
 *
 * Every variable here has internal linkage. They are grouped by kind; the
 * two lists at the bottom are the ones explicitly documented as being
 * guarded by vnd_dev_lock.
 */

/* Driver-wide handles and bookkeeping */
static dev_info_t *vnd_dip;
static taskq_t *vnd_taskq;
static id_space_t *vnd_minors;
static sdev_plugin_hdl_t vnd_sdev_hdl;
static gsqueue_set_t *vnd_sqset;
static int vnd_list_init = 0;

/* kmem caches (named after the str, dev and pnsd structures) */
static kmem_cache_t *vnd_str_cache;
static kmem_cache_t *vnd_dev_cache;
static kmem_cache_t *vnd_pnsd_cache;

/* The global lock and the lists that it protects */
static kmutex_t vnd_dev_lock;
static list_t vnd_dev_list;	/* Protected by the vnd_dev_lock */
static list_t vnd_nsd_list;	/* Protected by the vnd_dev_lock */
/*
 * STREAMS ioctls
 *
 * These ioctls are internal to vnd and are deliberately kept out of the header
 * file; no one outside of vnd should be seeing them. The base command value
 * packs the characters 'v', 'n' and 'd' into the upper three bytes and puts
 * 0x80 in the low byte.
 */
#define	VND_STRIOC	\
	(0x80 | ('d' << 8) | ('n' << 16) | ('v' << 24))

/*
 * Private ioctl to associate a given streams instance with a minor instance of
 * the character device.
 */
#define	VND_STRIOC_ASSOCIATE	(0x1 | VND_STRIOC)
/*
 * Argument structure for the association ioctl.
 * NOTE(review): presumably this is the payload of VND_STRIOC_ASSOCIATE, going
 * by the name -- confirm against the ioctl handler. Member order is part of
 * the structure's layout and must not be changed casually.
 */
typedef struct vnd_strioc_associate {
minor_t vsa_minor; /* minor device node */
netstackid_t vsa_nsid; /* netstack id */
vnd_errno_t vsa_errno; /* errno (a vnd_errno_t, not a plain int) */
} vnd_strioc_associate_t;
/*
 * State value recorded in vnd_strioc_t`vs_state.
 * NOTE(review): the names suggest this tracks whether an ioctl is in its
 * copyin or copyout phase -- confirm at the use sites. VSS_UNKNOWN is zero, so
 * a zero-filled vnd_strioc_t starts out in that state.
 */
typedef enum vnd_strioc_state {
VSS_UNKNOWN = 0,
VSS_COPYIN = 1,
VSS_COPYOUT = 2,
} vnd_strioc_state_t;
/*
 * Per-ioctl bookkeeping: a state (see vnd_strioc_state_t) paired with an
 * address.
 * NOTE(review): vs_addr is presumably the user address involved in the
 * copyin/copyout -- confirm against the code that fills it in.
 */
typedef struct vnd_strioc {
vnd_strioc_state_t vs_state;
caddr_t vs_addr;
} vnd_strioc_t;
/*
 * VND SQUEUE TAGS. The numbering starts at 0x42 so that we don't overlap with
 * extant tags, though really, overlap is at the end of the day inevitable.
 * The values are consecutive, one tag per define.
 */
#define VND_SQUEUE_TAG_TX_DRAIN 0x42
#define VND_SQUEUE_TAG_MAC_FLOW_CONTROL 0x43
#define VND_SQUEUE_TAG_VND_WRITE 0x44
#define VND_SQUEUE_TAG_ND_FRAMEIO_WRITE 0x45
#define VND_SQUEUE_TAG_STRBARRIER 0x46
/*
 * Names that vnd keeps for itself. An external program shouldn't use any of
 * these. The table is terminated by a NULL entry, so walkers can stop at the
 * first NULL rather than needing a separate count.
 */
static char *vnd_reserved_names[] = { "ctl", "zone", NULL };
/*
 * vnd's DTrace probe macros
 *
 * DTRACE_VND* are all for a stable provider. We also have an unstable internal
 * set of probes for reference count manipulation.
 *
 * DTRACE_VND<n> prefixes the probe name with "__vnd_" and hands <n>
 * (type, argument) pairs to DTRACE_PROBE<n>. Every expansion below already
 * ends in a semicolon; that is kept so existing call sites expand exactly as
 * they did before.
 *
 * DTRACE_VND_REFINC and DTRACE_VND_REFDEC fire a two argument probe carrying
 * the device pointer and the current value of its vdd_ref member. The vdp
 * argument is parenthesized in the expansion so that any pointer expression
 * (eg. &dev or base + i) may be passed. It is expanded twice, so it must not
 * have side effects.
 */
#define	DTRACE_VND3(name, type1, arg1, type2, arg2, type3, arg3)	\
	DTRACE_PROBE3(__vnd_##name, type1, arg1, type2, arg2, type3, arg3);

#define	DTRACE_VND4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \
	DTRACE_PROBE4(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \
	    type4, arg4);

#define	DTRACE_VND5(name, type1, arg1, type2, arg2, type3, arg3,	\
	    type4, arg4, type5, arg5)					\
	DTRACE_PROBE5(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \
	    type4, arg4, type5, arg5);

#define	DTRACE_VND_REFINC(vdp)	\
	DTRACE_PROBE2(vnd__ref__inc, vnd_dev_t *, (vdp), int, (vdp)->vdd_ref);

#define	DTRACE_VND_REFDEC(vdp)	\
	DTRACE_PROBE2(vnd__ref__dec, vnd_dev_t *, (vdp), int, (vdp)->vdd_ref);
/*
 * Tunables
 *
 * The first pair gives a default size and a hard maximum.
 * NOTE(review): "vdq" presumably refers to the vnd_data_queue_t -- confirm.
 */
size_t vnd_vdq_default_size = 64 * 1024;	/* 64 KB */
size_t vnd_vdq_hard_max = 4 * 1024 * 1024;	/* 4 MB */

/*
 * The second pair is meant as per-device tunables, applied when a new vnd
 * device is attached. They are a rough stab at a reasonable amount of work to
 * do in a single burst in an squeue.
 */
size_t vnd_flush_burst_size = 10 * 1520;	/* 10 1500 MTU packets */
size_t vnd_flush_nburst = 10;			/* 10 frames */
/*
 * Constants related to our sdev plugins
 *
 * VND_SDEV_NAME is the bare name, VND_SDEV_ROOT is that name as a directory
 * under /dev, and VND_SDEV_ZROOT is a "zone" subdirectory of that root. The
 * three strings must be kept in agreement with one another by hand.
 */
#define VND_SDEV_NAME "vnd"
#define VND_SDEV_ROOT "/dev/vnd"
#define VND_SDEV_ZROOT "/dev/vnd/zone"
/*
 * Statistic macros
 *
 * VND_STAT_INC uses atomic_add_64() to add val to the ui64 value of the named
 * field in vsp->vns_ksdata. Both vsp and val are parenthesized in the
 * expansion so that arbitrary expressions may be passed.
 *
 * VND_LATENCY_* are latency thresholds, each ten times the previous one.
 * NOTE(review): the values look like nanosecond counts (1MS == 1000000) --
 * confirm the unit at the use sites. VND_LATENCY_10S does not fit in 32 bits,
 * so it carries an explicit LL suffix rather than relying on the compiler to
 * pick a wide enough type for an unsuffixed constant.
 */
#define	VND_STAT_INC(vsp, field, val) \
	atomic_add_64(&(vsp)->vns_ksdata.field.value.ui64, (val))

#define	VND_LATENCY_1MS		1000000
#define	VND_LATENCY_10MS	10000000
#define	VND_LATENCY_100MS	100000000
#define	VND_LATENCY_1S		1000000000
#define	VND_LATENCY_10S		10000000000LL
/*
 * Constants for vnd hooks
 *
 * Byte patterns used for address matching: a six byte all-ones broadcast
 * address, plus the leading bytes of the IPv4 and IPv6 multicast patterns.
 * Each *_MCAST_LEN macro is the number of bytes in the matching prefix array.
 */
#define	IPV4_MCAST_LEN	3
#define	IPV6_MCAST_LEN	2

static uint8_t vnd_bcast_addr[6] = {
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};
static uint8_t vnd_ipv4_mcast[IPV4_MCAST_LEN] = { 0x01, 0x00, 0x5E };
static uint8_t vnd_ipv6_mcast[IPV6_MCAST_LEN] = { 0x33, 0x33 };
/*
 * vnd internal data structures and types
 *
 * Forward declarations of the structure tags, so that code may refer to these
 * structures by pointer before their full definitions have been seen.
 */
struct vnd_str;
struct vnd_dev;
struct vnd_pnsd;
/*
* As part of opening the device stream we need to properly communicate with our
* underlying stream. This is a bit of an asynchronous dance and we need to
* properly work with dld to get everything set up. We have to initiate the
* conversation with dld and as such we keep track of our state here.
*/
typedef enum vnd_str_state {
VNS_S_INITIAL = 0,
VNS_S_INFO_SENT,
VNS_S_EXCLUSIVE_SENT,
VNS_S_ATTACH_SENT,
VNS_S_BIND_SENT,
VNS_S_SAP_PROMISC_SENT,
VNS_S_MULTI_PROMISC_SENT,
VNS_S_RX_ONLY_PROMISC_SENT,
VNS_S_FIXUP_PROMISC_SENT,
VNS_S_CAPAB_Q_SENT,
VNS_S_CAPAB_E_SENT,
VNS_S_ONLINE,
VNS_S_SHUTTING_DOWN,
VNS_S_MULTICAST_PROMISCOFF_SENT,