/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2011 Joyent, Inc. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/types.h>
#include <sys/callb.h>
#include <sys/sdt.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/vlan.h>
#include <sys/stack.h>
#include <sys/archsystm.h>
#include <inet/ipsec_impl.h>
#include <inet/ip_impl.h>
#include <inet/sadb.h>
#include <inet/ipsecesp.h>
#include <inet/ipsecah.h>
#include <inet/ip6.h>
#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_soft_ring.h>
#include <sys/mac_flow_impl.h>
static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_aggr_mode(mac_soft_ring_set_t *, mblk_t *,
uintptr_t, uint16_t, mblk_t **);
typedef struct mac_tx_mode_s {
mac_tx_srs_mode_t mac_tx_mode;
mac_tx_func_t mac_tx_func;
} mac_tx_mode_t;
/*
* There are seven modes of operation on the Tx side. These modes get set
* in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
* none of the other modes are user configurable. They get selected by
* the system depending upon whether the link (or flow) has multiple Tx
* rings or a bandwidth configured, or if the link is an aggr, etc.
*
* When the Tx SRS is operating in aggr mode (st_mode) or if there are
* multiple Tx rings owned by Tx SRS, then each Tx ring (pseudo or
* otherwise) will have a soft ring associated with it. These soft rings
* are stored in srs_tx_soft_rings[] array.
*
* Additionally in the case of aggr, there is the st_soft_rings[] array
* in the mac_srs_tx_t structure. This array is used to store the same
* set of soft rings that are present in srs_tx_soft_rings[] array but
* in a different manner. The soft ring associated with the pseudo Tx
* ring is saved at mr_index (of the pseudo ring) in st_soft_rings[]
* array. This helps in quickly getting the soft ring associated with the
* Tx ring when aggr_find_tx_ring() returns the pseudo Tx ring that is to
* be used for transmit.
*/
mac_tx_mode_t mac_tx_mode_list[] = {
{SRS_TX_DEFAULT, mac_tx_single_ring_mode},
{SRS_TX_SERIALIZE, mac_tx_serializer_mode},
{SRS_TX_FANOUT, mac_tx_fanout_mode},
{SRS_TX_BW, mac_tx_bw_mode},
{SRS_TX_BW_FANOUT, mac_tx_bw_mode},
{SRS_TX_AGGR, mac_tx_aggr_mode},
{SRS_TX_BW_AGGR, mac_tx_bw_mode}
};
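/*
* A minimal sketch (illustration only, not part of the build) of how the
* aggr Tx path described above resolves a soft ring once
* aggr_find_tx_ring() has returned a pseudo Tx ring. The helper name is
* hypothetical, but the st_soft_rings[]/mr_index relationship is the one
* documented above:
*
*	static mac_soft_ring_t *
*	aggr_tx_soft_ring(mac_soft_ring_set_t *srs, mac_ring_t *pseudo_ring)
*	{
*		return (srs->srs_tx.st_soft_rings[pseudo_ring->mr_index]);
*	}
*/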
/*
* Soft Ring Set (SRS) - The Run time code that deals with
* dynamic polling from the hardware, bandwidth enforcement,
* fanout etc.
*
* We try to use H/W classification on NIC and assign traffic for
* a MAC address to a particular Rx ring or ring group. There is a
* 1-1 mapping between a SRS and a Rx ring. The SRS dynamically
* switches the underlying Rx ring between interrupt and
* polling mode and enforces any specified B/W control.
*
* There is always a SRS created and tied to each H/W and S/W rule.
* Whenever we create a H/W rule, we always add the same rule to the
* S/W classifier and tie a SRS to it.
*
* In case a B/W control is specified, it is broken into bytes
* per tick and as soon as the quota for a tick is exhausted,
* the underlying Rx ring is forced into poll mode for the remainder
* of the tick. The SRS poll thread only polls for bytes that are
* allowed to come into the SRS. We typically let 4x the configured
* B/W worth of packets come into the SRS (to prevent unnecessary
* drops due to bursts) but only process the specified amount.
*
* A MAC client (e.g. a VNIC or aggr) can have 1 or more
* Rx rings (and corresponding SRSs) assigned to it. The SRS
* in turn can have softrings to do protocol level fanout or
* softrings to do S/W based fanout or both. In case the NIC
* has no Rx rings, we do S/W classification to the respective SRS.
* The S/W classification rule is always set up and ready. This
* allows the MAC layer to reassign Rx rings whenever needed,
* but packets still continue to flow via the default path and
* get S/W classified to the correct SRS.
*
* SRSs are used on both the Tx and Rx sides. They use the same
* data structure, but the processing routines have slightly different
* semantics because the Rx side needs to do dynamic
* polling etc.
*
* Dynamic Polling Notes
* =====================
*
* Each Soft ring set is capable of switching its Rx ring between
* interrupt and poll mode and actively 'polls' for packets in
* poll mode. If the SRS is implementing a B/W limit, it makes
* sure that only the max allowed packets are pulled in poll mode
* and forces the ring into poll mode as soon as the B/W limit is
* exceeded. As such, there is no extra overhead to implement B/W limits.
*
* In poll mode, it's better to keep the pipeline going where the
* SRS worker thread keeps processing packets and the poll thread
* keeps bringing in more packets (especially if they get to run
* on different CPUs). This also avoids the overhead associated
* with excessive signalling (on NUMA machines, this can be
* pretty devastating). The exception is the latency optimized case,
* where the worker thread does no work and the interrupt and poll
* threads are allowed to do their own drain.
*
* We use the following policy to control Dynamic Polling:
* 1) We switch to poll mode anytime the processing
* thread causes a backlog to build up in SRS and
* its associated Soft Rings (sr_poll_pkt_cnt > 0).
* 2) As long as the backlog stays under the low water
* mark (sr_lowat), we poll the H/W for more packets.
* 3) If the backlog (sr_poll_pkt_cnt) exceeds low
* water mark, we stay in poll mode but don't poll
* the H/W for more packets.
* 4) Anytime in polling mode, if we poll the H/W for
* packets and find nothing plus we have an existing
* backlog (sr_poll_pkt_cnt > 0), we stay in polling
* mode but don't poll the H/W for packets anymore
* (let the polling thread go to sleep).
* 5) Once the backlog is relieved (packets are processed),
* we re-enable polling (by signalling the poll thread)
* only when the backlog dips below sr_poll_thres.
* 6) sr_hiwat is used exclusively when we are not
* polling capable and is used to decide when to
* drop packets so the SRS queue length doesn't grow
* infinitely.
*
* NOTE: Also see the block level comment on top of mac_soft_ring.c
*/
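/*
* A condensed sketch (illustration only, not part of the build) of the
* dynamic polling policy above, written against the Rx SRS fields it
* references (sr_poll_pkt_cnt, sr_lowat). The helper name and the
* last_poll_empty flag are hypothetical stand-ins for state that the
* poll thread tracks:
*
*	static boolean_t
*	srs_poll_hw_again(mac_srs_rx_t *srs_rx, boolean_t last_poll_empty)
*	{
*		if (last_poll_empty && srs_rx->sr_poll_pkt_cnt > 0)
*			return (B_FALSE);	<- rule 4
*		return (srs_rx->sr_poll_pkt_cnt < srs_rx->sr_lowat);	<- rules 2, 3
*	}
*/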
/*
* mac_latency_optimize
*
* Controls whether the poll thread can process the packets inline
* or let the SRS worker thread do the processing. This applies if
* the SRS was not being processed. For latency sensitive traffic,
* this needs to be true to allow inline processing. For throughput
* under load, this should be false.
*
* This (and other similar) tunable should be rolled into a link
* or flow specific workload hint that can be set using dladm
* linkprop (instead of multiple such tunables).
*/
boolean_t mac_latency_optimize = B_TRUE;
/*
* MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN
*
* Queue an mblk or chain on the soft ring set and increment the
* local count (srs_count) for the SRS and the shared counter
* (srs_poll_pkt_cnt - shared between SRS and its soft rings
* to track the total unprocessed packets for polling to work
* correctly).
*
* The size (total bytes queued) counters are incremented only
* if we are doing B/W control.
*/
#define MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
if ((mac_srs)->srs_last != NULL) \
(mac_srs)->srs_last->b_next = (head); \
else \
(mac_srs)->srs_first = (head); \
(mac_srs)->srs_last = (tail); \
(mac_srs)->srs_count += count; \
}
#define MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \
\
MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \
srs_rx->sr_poll_pkt_cnt += count; \
ASSERT(srs_rx->sr_poll_pkt_cnt > 0); \
if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \
(mac_srs)->srs_size += (sz); \
mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock); \
(mac_srs)->srs_bw->mac_bw_sz += (sz); \
mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock); \
} \
}
#define MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
mac_srs->srs_state |= SRS_ENQUEUED; \
MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \
if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \
(mac_srs)->srs_size += (sz); \
(mac_srs)->srs_bw->mac_bw_sz += (sz); \
} \
}
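/*
* Hedged usage sketch for the Rx variant above (the chain-building caller
* shown here is schematic): the caller links packets with b_next, counts
* bytes and packets, and enqueues the whole chain while holding srs_lock
* before waking the worker:
*
*	mutex_enter(&mac_srs->srs_lock);
*	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, cnt, sz);
*	MAC_SRS_WORKER_WAKEUP(mac_srs);
*	mutex_exit(&mac_srs->srs_lock);
*
* (MAC_SRS_WORKER_WAKEUP is defined further below.)
*/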
/*
* Turn polling on routines
*/
#define MAC_SRS_POLLING_ON(mac_srs) { \
ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
if (((mac_srs)->srs_state & \
(SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) { \
(mac_srs)->srs_state |= SRS_POLLING; \
(void) mac_hwring_disable_intr((mac_ring_handle_t) \
(mac_srs)->srs_ring); \
(mac_srs)->srs_rx.sr_poll_on++; \
} \
}
#define MAC_SRS_WORKER_POLLING_ON(mac_srs) { \
ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
if (((mac_srs)->srs_state & \
(SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == \
(SRS_POLLING_CAPAB|SRS_WORKER)) { \
(mac_srs)->srs_state |= SRS_POLLING; \
(void) mac_hwring_disable_intr((mac_ring_handle_t) \
(mac_srs)->srs_ring); \
(mac_srs)->srs_rx.sr_worker_poll_on++; \
} \
}
/*
* MAC_SRS_POLL_RING
*
* Signal the SRS poll thread to poll the underlying H/W ring,
* provided it isn't already polling (i.e. SRS_GET_PKTS is not set).
*
* Poll thread gets to run only from mac_rx_srs_drain() and only
* if the drain was being done by the worker thread.
*/
#define MAC_SRS_POLL_RING(mac_srs) { \
mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \
\
ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
srs_rx->sr_poll_thr_sig++; \
if (((mac_srs)->srs_state & \
(SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) == \
(SRS_WORKER|SRS_POLLING_CAPAB)) { \
(mac_srs)->srs_state |= SRS_GET_PKTS; \
cv_signal(&(mac_srs)->srs_cv); \
} else { \
srs_rx->sr_poll_thr_busy++; \
} \
}
/*
* MAC_SRS_CHECK_BW_CONTROL
*
* Check to see if the next tick has started so we can reset the
* SRS_BW_ENFORCED flag and allow more packets to come into the
* system.
*/
#define MAC_SRS_CHECK_BW_CONTROL(mac_srs) { \
ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
ASSERT(((mac_srs)->srs_type & SRST_TX) || \
MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock)); \
clock_t now = ddi_get_lbolt(); \
if ((mac_srs)->srs_bw->mac_bw_curr_time != now) { \
(mac_srs)->srs_bw->mac_bw_curr_time = now; \
(mac_srs)->srs_bw->mac_bw_used = 0; \
if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED) \
(mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \
} \
}
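/*
* Worked example of the per-tick accounting that MAC_SRS_CHECK_BW_CONTROL
* resets. Assuming hz = 100 (one tick every 10 ms), a 1 Gbit/s limit
* works out to roughly:
*
*	1,000,000,000 bits/s / 8           = 125,000,000 bytes/s
*	125,000,000 bytes/s / 100 ticks/s  = 1,250,000 bytes/tick
*
* Once mac_bw_used crosses ~1.25 MB within a tick, SRS_BW_ENFORCED is set
* and stays set until ddi_get_lbolt() advances to the next tick, at which
* point the macro above clears the flag and zeroes mac_bw_used.
*/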
/*
* MAC_SRS_WORKER_WAKEUP
*
* Wake up the SRS worker thread to process the queue as long as
* no one else is processing the queue. If we are optimizing for
* latency, we wake up the worker thread immediately or else we
* wait mac_srs_worker_wakeup_ticks before worker thread gets
* woken up.
*/
int mac_srs_worker_wakeup_ticks = 0;
#define MAC_SRS_WORKER_WAKEUP(mac_srs) { \
ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
if (!((mac_srs)->srs_state & SRS_PROC) && \
(mac_srs)->srs_tid == NULL) { \
if (((mac_srs)->srs_state & SRS_LATENCY_OPT) || \
(mac_srs_worker_wakeup_ticks == 0)) \
cv_signal(&(mac_srs)->srs_async); \
else \
(mac_srs)->srs_tid = \
timeout(mac_srs_fire, (mac_srs), \
mac_srs_worker_wakeup_ticks); \
} \
}
#define TX_BANDWIDTH_MODE(mac_srs) \
((mac_srs)->srs_tx.st_mode == SRS_TX_BW || \
(mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT || \
(mac_srs)->srs_tx.st_mode == SRS_TX_BW_AGGR)
#define TX_SRS_TO_SOFT_RING(mac_srs, head, hint) { \
if (tx_mode == SRS_TX_BW_FANOUT) \
(void) mac_tx_fanout_mode(mac_srs, head, hint, 0, NULL);\
else \
(void) mac_tx_aggr_mode(mac_srs, head, hint, 0, NULL); \
}
/*
* MAC_TX_SRS_BLOCK
*
* Always called from mac_tx_srs_drain() function. SRS_TX_BLOCKED
* will be set only if srs_tx_woken_up is FALSE. If
* srs_tx_woken_up is TRUE, it indicates that the wakeup arrived
* before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to
* attempt to transmit again and not setting SRS_TX_BLOCKED does
* that.
*/
#define MAC_TX_SRS_BLOCK(srs, mp) { \
ASSERT(MUTEX_HELD(&(srs)->srs_lock)); \
if ((srs)->srs_tx.st_woken_up) { \
(srs)->srs_tx.st_woken_up = B_FALSE; \
} else { \
ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED)); \
(srs)->srs_state |= SRS_TX_BLOCKED; \
(srs)->srs_tx.st_stat.mts_blockcnt++; \
} \
}
/*
* MAC_TX_SRS_TEST_HIWAT
*
* Called before queueing a packet onto Tx SRS to test and set
* SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat.
*/
#define MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) { \
boolean_t enqueue = 1; \
\
if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) { \
/* \
* flow-controlled. Store srs in cookie so that it \
* can be returned as mac_tx_cookie_t to client \
*/ \
(srs)->srs_state |= SRS_TX_HIWAT; \
cookie = (mac_tx_cookie_t)srs; \
(srs)->srs_tx.st_hiwat_cnt++; \
if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) { \
/* increment freed stats */ \
(srs)->srs_tx.st_stat.mts_sdrops += cnt; \
/* \
* b_prev may be set to the fanout hint \
* hence can't use freemsg directly \
*/ \
mac_pkt_drop(NULL, NULL, (mp), B_FALSE); \
DTRACE_PROBE1(tx_queued_hiwat, \
mac_soft_ring_set_t *, srs); \
enqueue = 0; \
} \
} \
if (enqueue) \
MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz); \
}
/* Some utility macros */
#define MAC_SRS_BW_LOCK(srs) \
if (!(srs->srs_type & SRST_TX)) \
mutex_enter(&srs->srs_bw->mac_bw_lock);
#define MAC_SRS_BW_UNLOCK(srs) \
if (!(srs->srs_type & SRST_TX)) \
mutex_exit(&srs->srs_bw->mac_bw_lock);
#define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \
mac_pkt_drop(NULL, NULL, mp, B_FALSE); \
/* increment freed stats */ \
(srs)->srs_tx.st_stat.mts_sdrops++; \
cookie = (mac_tx_cookie_t)srs; \
}
#define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \
(srs)->srs_state |= SRS_TX_WAKEUP_CLIENT; \
cookie = (mac_tx_cookie_t)srs; \
*ret_mp = mp_chain; \
}
/*
* MAC_RX_SRS_TOODEEP
*
* Macro called as part of receive-side processing to determine if handling
* can occur in situ (in the interrupt thread) or if it should be left to a
* worker thread. Note that the constant used to make this determination is
* not entirely made-up, and is a result of some empirical validation. That
* said, the constant is left as a static variable to allow it to be
* dynamically tuned in the field if and as needed.
*/
static uintptr_t mac_rx_srs_stack_needed = 10240;
static uint_t mac_rx_srs_stack_toodeep;
#ifndef STACK_GROWTH_DOWN
#error Downward stack growth assumed.
#endif
#define MAC_RX_SRS_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \
(uintptr_t)curthread->t_stkbase < mac_rx_srs_stack_needed && \
++mac_rx_srs_stack_toodeep)
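/*
* To make the check above concrete: with downward stack growth, the
* remaining headroom is the distance from the current frame pointer down
* to the base of the thread's stack, i.e.
*
*	remaining = STACK_BIAS + (uintptr_t)getfp() -
*	    (uintptr_t)curthread->t_stkbase;
*
* and receive processing is deferred to the SRS worker thread whenever
* that headroom falls below mac_rx_srs_stack_needed (10240 bytes by
* default), with mac_rx_srs_stack_toodeep counting how often it happens.
*/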
/*
* Drop the rx packet and advance to the next one in the chain.
*/
static void
mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
{
mac_srs_rx_t *srs_rx = &srs->srs_rx;
ASSERT(mp->b_next == NULL);
mutex_enter(&srs->srs_lock);
MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
mutex_exit(&srs->srs_lock);
srs_rx->sr_stat.mrs_sdrops++;
freemsg(mp);
}
/* DATAPATH RUNTIME ROUTINES */
/*
* mac_srs_fire
*
* Timer callback routine for waking up the SRS worker thread.
*/
static void
mac_srs_fire(void *arg)
{
mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;
mutex_enter(&mac_srs->srs_lock);
if (mac_srs->srs_tid == 0) {
mutex_exit(&mac_srs->srs_lock);
return;
}
mac_srs->srs_tid = 0;
if (!(mac_srs->srs_state & SRS_PROC))
cv_signal(&mac_srs->srs_async);
mutex_exit(&mac_srs->srs_lock);
}
/*
* 'hint' is the fanout_hint (a uint64_t) given by the TCP/IP stack,
* and it is used on the Tx path.
*/
#define HASH_HINT(hint) \
((hint) ^ ((hint) >> 24) ^ ((hint) >> 16) ^ ((hint) >> 8))
/*
* hash based on the src address, dst address and the port information.
*/
#define HASH_ADDR(src, dst, ports) \
(ntohl((src) + (dst)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \
((ports) >> 8) ^ (ports))
#define COMPUTE_INDEX(key, sz) (key % sz)
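/*
* A hedged example of how the two macros above pick a fanout index for an
* IPv4 flow, mirroring their use in mac_rx_srs_long_fanout() below (the
* variable names are illustrative):
*
*	uint32_t src = ipha->ipha_src;
*	uint32_t dst = ipha->ipha_dst;
*	uint32_t ports = *(uint32_t *)whereptr;	first 4 bytes past the IP header
*	uint_t hash = HASH_ADDR(src, dst, ports);
*	uint_t indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
*
* The same (src, dst, ports) tuple always hashes to the same index, so all
* packets of a flow land on the same soft ring and stay in order.
*/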
#define FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) { \
if ((tail) != NULL) { \
ASSERT((tail)->b_next == NULL); \
(tail)->b_next = (mp); \
} else { \
ASSERT((head) == NULL); \
(head) = (mp); \
} \
(tail) = (mp); \
(cnt)++; \
if ((bw_ctl)) \
(sz) += (sz0); \
}
#define MAC_FANOUT_DEFAULT 0
#define MAC_FANOUT_RND_ROBIN 1
int mac_fanout_type = MAC_FANOUT_DEFAULT;
#define MAX_SR_TYPES 3
/* fanout types for port based hashing */
enum pkt_type {
V4_TCP = 0,
V4_UDP,
OTH,
UNDEF
};
/*
* Pair of local and remote ports in the transport header
*/
#define PORTS_SIZE 4
/*
* mac_rx_srs_proto_fanout
*
* This routine delivers packets destined to an SRS into one of the
* protocol soft rings.
*
* Given a chain of packets, we need to split it up into multiple sub-chains
* destined for the TCP, UDP or OTH soft rings. Instead of entering
* a soft ring one packet at a time, we want to enter it with a chain;
* otherwise we get a start/stop behaviour where the worker thread
* goes to sleep and then the next packet comes in, forcing it to wake up, etc.
*/
static void
mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
struct ether_header *ehp;
struct ether_vlan_header *evhp;
uint32_t sap;
ipha_t *ipha;
uint8_t *dstaddr;
size_t hdrsize;
mblk_t *mp;
mblk_t *headmp[MAX_SR_TYPES];
mblk_t *tailmp[MAX_SR_TYPES];
int cnt[MAX_SR_TYPES];
size_t sz[MAX_SR_TYPES];
size_t sz1;
boolean_t bw_ctl;
boolean_t hw_classified;
boolean_t dls_bypass;
boolean_t is_ether;
boolean_t is_unicast;
enum pkt_type type;
mac_client_impl_t *mcip = mac_srs->srs_mcip;
is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
/*
* If we don't have a Rx ring, S/W classification would have done
* its job and it's a packet meant for us. If we were polling on
* the default ring (i.e. there was a ring assigned to this SRS),
* then we need to make sure that the mac address really belongs
* to us.
*/
hw_classified = mac_srs->srs_ring != NULL &&
mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
/*
* Special clients (e.g. VLAN, non-ether, etc.) need DLS
* processing in the Rx path. SRST_DLS_BYPASS will be clear for
* such SRSs. Another way of disabling bypass is to set the
* MCIS_RX_BYPASS_DISABLE flag.
*/
dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
bzero(cnt, MAX_SR_TYPES * sizeof (int));
bzero(sz, MAX_SR_TYPES * sizeof (size_t));
/*
* We got a chain from SRS that we need to send to the soft rings.
* Since squeues for TCP & IPv4 sap poll their soft rings (for
* performance reasons), we need to separate out v4_tcp and v4_udp;
* the rest goes into OTH.
*/
while (head != NULL) {
mp = head;
head = head->b_next;
mp->b_next = NULL;
type = OTH;
sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
if (is_ether) {
/*
* At this point we can be sure the packet at least
* has an ether header.
*/
if (sz1 < sizeof (struct ether_header)) {
mac_rx_drop_pkt(mac_srs, mp);
continue;
}
ehp = (struct ether_header *)mp->b_rptr;
/*
* Determine if this is a VLAN or non-VLAN packet.
*/
if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
evhp = (struct ether_vlan_header *)mp->b_rptr;
sap = ntohs(evhp->ether_type);
hdrsize = sizeof (struct ether_vlan_header);
/*
* Check if the VID of the packet, if any,
* belongs to this client.
*/
if (!mac_client_check_flow_vid(mcip,
VLAN_ID(ntohs(evhp->ether_tci)))) {
mac_rx_drop_pkt(mac_srs, mp);
continue;
}
} else {
hdrsize = sizeof (struct ether_header);
}
is_unicast =
((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
dstaddr = (uint8_t *)&ehp->ether_dhost;
} else {
mac_header_info_t mhi;
if (mac_header_info((mac_handle_t)mcip->mci_mip,
mp, &mhi) != 0) {
mac_rx_drop_pkt(mac_srs, mp);
continue;
}
hdrsize = mhi.mhi_hdrsize;
sap = mhi.mhi_bindsap;
is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
dstaddr = (uint8_t *)mhi.mhi_daddr;
}
if (!dls_bypass) {
FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
cnt[type], bw_ctl, sz[type], sz1, mp);
continue;
}
if (sap == ETHERTYPE_IP) {
/*
* If we are H/W classified, but we have promisc
* on, then we need to check for the unicast address.
*/
if (hw_classified && mcip->mci_promisc_list != NULL) {
mac_address_t *map;
rw_enter(&mcip->mci_rw_lock, RW_READER);
map = mcip->mci_unicast;
if (bcmp(dstaddr, map->ma_addr,
map->ma_len) == 0)
type = UNDEF;
rw_exit(&mcip->mci_rw_lock);
} else if (is_unicast) {
type = UNDEF;
}
}
/*
* This needs to become a contract with the driver for
* the fast path.
*
* In the normal case the packet will have at least the L2
* header and the IP + Transport header in the same mblk.
* This is usually the case when the NIC driver sends up
* the packet. This is also true when the stack generates
* a packet that is looped back and when the stack uses the
* fastpath mechanism. The normal case is optimized for
* performance and may bypass DLS. All other cases go through
* the 'OTH' type path without DLS bypass.
*/
ipha = (ipha_t *)(mp->b_rptr + hdrsize);
if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
type = OTH;
if (type == OTH) {
FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
cnt[type], bw_ctl, sz[type], sz1, mp);
continue;
}
ASSERT(type == UNDEF);
/*
* We look for at least 4 bytes past the IP header to get
* the port information. If we get an IP fragment, we don't
* have the port information, and we use just the protocol
* information.
*/
switch (ipha->ipha_protocol) {
case IPPROTO_TCP:
type = V4_TCP;
mp->b_rptr += hdrsize;
break;
case IPPROTO_UDP:
type = V4_UDP;
mp->b_rptr += hdrsize;
break;
default:
type = OTH;
break;
}
FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
bw_ctl, sz[type], sz1, mp);
}
for (type = V4_TCP; type < UNDEF; type++) {
if (headmp[type] != NULL) {
mac_soft_ring_t *softring;
ASSERT(tailmp[type]->b_next == NULL);
switch (type) {
case V4_TCP:
softring = mac_srs->srs_tcp_soft_rings[0];
break;
case V4_UDP:
softring = mac_srs->srs_udp_soft_rings[0];
break;
case OTH:
softring = mac_srs->srs_oth_soft_rings[0];
}
mac_rx_soft_ring_process(mcip, softring,
headmp[type], tailmp[type], cnt[type], sz[type]);
}
}
}
int fanout_unaligned = 0;
/*
* mac_rx_srs_long_fanout
*
* The fanout routine for VLANs, and for anything else that isn't performing
* explicit dls bypass. Returns -1 on an error (drop the packet due to a
* malformed packet), 0 on success, with values written in *indx and *type.
*/
static int
mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
{
ip6_t *ip6h;
ipha_t *ipha;
uint8_t *whereptr;
uint_t hash;
uint16_t remlen;
uint8_t nexthdr;
uint16_t hdr_len;
uint32_t src_val, dst_val;
boolean_t modifiable = B_TRUE;
boolean_t v6;
ASSERT(MBLKL(mp) >= hdrsize);
if (sap == ETHERTYPE_IPV6) {
v6 = B_TRUE;
hdr_len = IPV6_HDR_LEN;
} else if (sap == ETHERTYPE_IP) {
v6 = B_FALSE;
hdr_len = IP_SIMPLE_HDR_LENGTH;
} else {
*indx = 0;
*type = OTH;
return (0);
}
ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
ipha = (ipha_t *)ip6h;
if ((uint8_t *)ip6h == mp->b_wptr) {
/*
* The first mblk_t only includes the mac header.
* Note that it is safe to change the mp pointer here,
* as the subsequent operation does not assume mp
* points to the start of the mac header.
*/
mp = mp->b_cont;
/*
* Make sure the mblk holds a complete IP header.
*/
if (mp == NULL)
return (-1);
if (MBLKL(mp) < hdr_len) {
modifiable = (DB_REF(mp) == 1);
if (modifiable && !pullupmsg(mp, hdr_len))
return (-1);
}
ip6h = (ip6_t *)mp->b_rptr;
ipha = (ipha_t *)ip6h;
}
if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
((uint8_t *)ip6h + hdr_len > mp->b_wptr)) {
/*
* If either the IP header is not aligned, or it does not hold
* the complete simple structure (a pullupmsg() is not an
* option since it would result in an unaligned IP header),
* fanout to the default ring.
*
* Note that this may cause packet reordering.
*/
*indx = 0;
*type = OTH;
fanout_unaligned++;
return (0);
}
/*
* Extract next-header, full header length, and source-hash value
* using v4/v6 specific fields.
*/
if (v6) {
remlen = ntohs(ip6h->ip6_plen);
nexthdr = ip6h->ip6_nxt;
src_val = V4_PART_OF_V6(ip6h->ip6_src);
dst_val = V4_PART_OF_V6(ip6h->ip6_dst);
/*
* Fall back to src/dst based fanout when mac_ip_hdr_length_v6()
* fails because of a malformed packet or because the mblks
* would need to be concatenated using pullupmsg().
*/
if (!mac_ip_hdr_length_v6(ip6h, mp->b_wptr, &hdr_len, &nexthdr,
NULL)) {
goto src_dst_based_fanout;
}
} else {
hdr_len = IPH_HDR_LENGTH(ipha);
remlen = ntohs(ipha->ipha_length) - hdr_len;
nexthdr = ipha->ipha_protocol;
src_val = (uint32_t)ipha->ipha_src;
dst_val = (uint32_t)ipha->ipha_dst;
/*
* Catch IPv4 fragment case here. IPv6 has nexthdr == FRAG
* for its equivalent case.
*/
if ((ntohs(ipha->ipha_fragment_offset_and_flags) &
(IPH_MF | IPH_OFFSET)) != 0) {
goto src_dst_based_fanout;
}
}
if (remlen < MIN_EHDR_LEN)
return (-1);
whereptr = (uint8_t *)ip6h + hdr_len;
/* If the transport is one of the protocols below, we do port/SPI based fanout */
switch (nexthdr) {
case IPPROTO_TCP:
case IPPROTO_UDP:
case IPPROTO_SCTP:
case IPPROTO_ESP:
/*
* If the ports or SPI in the transport header are not part of
* the mblk, do src/dst based fanout instead of calling
* pullupmsg().
*/
if (mp->b_cont == NULL || whereptr + PORTS_SIZE <= mp->b_wptr)
break; /* out of switch... */
/* FALLTHRU */
default:
goto src_dst_based_fanout;
}
switch (nexthdr) {
case IPPROTO_TCP:
hash = HASH_ADDR(src_val, dst_val, *(uint32_t *)whereptr);
*indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
*type = OTH;
break;
case IPPROTO_UDP:
case IPPROTO_SCTP:
case IPPROTO_ESP:
if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
hash = HASH_ADDR(src_val, dst_val,
*(uint32_t *)whereptr);
*indx = COMPUTE_INDEX(hash,
mac_srs->srs_udp_ring_count);
} else {
*indx = mac_srs->srs_ind % mac_srs->srs_udp_ring_count;
mac_srs->srs_ind++;
}
*type = OTH;
break;
}
return (0);
src_dst_based_fanout:
hash = HASH_ADDR(src_val, dst_val, (uint32_t)0);
*indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
*type = OTH;
return (0);
}
/*
* mac_rx_srs_fanout
*
* This routine delivers packets destined to an SRS into a soft ring member
* of the set.
*
* Given a chain of packets, we need to split it up into multiple sub-chains
* destined for one of the TCP, UDP or OTH soft rings. Instead of entering
* a soft ring one packet at a time, we want to enter it with a chain;
* otherwise we get a start/stop behaviour where the worker thread
* goes to sleep and then the next packet comes in, forcing it to wake up, etc.
*
* Note:
* Since we know the maximum possible fanout, we create a 2D array
* of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
* variables so that we can enter the softrings with a chain. We need the
* MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
* for each packet would be expensive). If we ever want to have the
* ability to have unlimited fanout, we should probably declare a head,
* tail, cnt, sz with each soft ring (a data struct which contains a softring
* along with these members) and create an array of this uber struct so we
* don't have to do kmem_alloc.
*/
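/*
* Shape of the stack-allocated state described in the Note above, shown
* for a hypothetical fanout count of 2 (illustration only):
*
*	headmp[V4_TCP][0..1], tailmp[V4_TCP][0..1], cnt[V4_TCP][0..1], ...
*
* Each (type, index) slot accumulates one sub-chain; after the input chain
* has been walked once, every non-empty slot is handed to its soft ring
* with a single call to mac_rx_soft_ring_process().
*/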
int fanout_oth1 = 0;
int fanout_oth2 = 0;
int fanout_oth3 = 0;
int fanout_oth4 = 0;
int fanout_oth5 = 0;
static void
mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
struct ether_header *ehp;
struct ether_vlan_header *evhp;
uint32_t sap;
ipha_t *ipha;
uint8_t *dstaddr;
uint_t indx;
size_t ports_offset;
size_t ipha_len;
size_t hdrsize;
uint_t hash;
mblk_t *mp;
mblk_t *headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
mblk_t *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
int cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
size_t sz[MAX_SR_TYPES][MAX_SR_FANOUT];
size_t sz1;
boolean_t bw_ctl;
boolean_t hw_classified;
boolean_t dls_bypass;
boolean_t is_ether;
boolean_t is_unicast;
int fanout_cnt;
enum pkt_type type;
mac_client_impl_t *mcip = mac_srs->srs_mcip;
is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
/*
* If we don't have a Rx ring, S/W classification would have done
* its job and it's a packet meant for us. If we were polling on
* the default ring (i.e. there was a ring assigned to this SRS),
* then we need to make sure that the mac address really belongs
* to us.
*/
hw_classified = mac_srs->srs_ring != NULL &&
mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
/*
* Special clients (e.g. VLAN, non-ether, etc.) need DLS
* processing in the Rx path. SRST_DLS_BYPASS will be clear for
* such SRSs. Another way of disabling bypass is to set the
* MCIS_RX_BYPASS_DISABLE flag.
*/
dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);