Linux Kernel tracing for sendto() using AF_PACKET, PACKET_MMAP and PACKET_FANOUT

Below is a high-level view of the call flow executed to send packets from a Tx ring when using PACKET_MMAP on Kernel version 4.10.x. A do{} loop runs which memcpy()'s each packet in the Tx ring that has status TP_STATUS_SEND_REQUEST into an sk_buff in Kernel memory. The sk_buff is then queued for transmission and the virtual Tx function is called. The loop then starts over if there are more packets in the Tx ring waiting to be transmitted (i.e. the next frame in the ring buffer has status TP_STATUS_SEND_REQUEST).
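
For orientation, here is a minimal userspace sketch of that workflow (a hedged example, not taken from the traced code: error handling is omitted, and "eth0", the ring geometry and the dummy payload are placeholders):

    #include <arpa/inet.h>          /* htons() */
    #include <linux/if_ether.h>     /* ETH_P_ALL */
    #include <linux/if_packet.h>    /* sockaddr_ll, tpacket_req, tpacket2_hdr */
    #include <net/if.h>             /* if_nametoindex() */
    #include <stdint.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/socket.h>

    int main(void)
    {
        int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

        int ver = TPACKET_V2;
        setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));

        /* 64 blocks of 4 KiB, two 2 KiB frames per block = 128 Tx ring slots */
        struct tpacket_req req = {
            .tp_block_size = 4096, .tp_block_nr = 64,
            .tp_frame_size = 2048, .tp_frame_nr  = 128,
        };
        setsockopt(fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));

        struct sockaddr_ll ll = {
            .sll_family   = AF_PACKET,
            .sll_protocol = htons(ETH_P_ALL),
            .sll_ifindex  = if_nametoindex("eth0"),
        };
        bind(fd, (struct sockaddr *)&ll, sizeof(ll));

        uint8_t *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
                             PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

        /* Fill the first frame; the payload starts at
         * tp_hdrlen - sizeof(struct sockaddr_ll) past the frame header,
         * matching the off calculation seen later in tpacket_parse_header(). */
        struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)ring;
        uint8_t *data = ring + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
        memset(data, 0xab, 60);                  /* dummy 60 byte frame */
        hdr->tp_len    = 60;
        hdr->tp_status = TP_STATUS_SEND_REQUEST;

        /* This sendto() is the entry into packet_sendmsg() traced below. */
        sendto(fd, NULL, 0, 0, NULL, 0);

        return 0;
    }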

Points of note:

  • Each frame is memcpy()'ed from the Tx ring into an sk_buff in Kernel memory, so a copy overhead is incurred per frame.
  • The Tx ring space and sk_buff remain occupied until the frames are transmitted by the NIC.
  • Kernels with version >= 3.3 and supporting drivers may have BQL active, which dynamically limits the number of bytes queued to the driver's Tx ring (see the netdev_tx_sent_queue() call in the detailed trace below) to improve latency at a possible cost to throughput.
  • Limiting factors on the Tx rate with regards to queues/buffers, in order of operations, can be (a tuning sketch follows this list):
    • The number of frames in the PACKET_MMAP Tx ring (increasing the ring size can help with NIC starvation)
    • The size of the socket Tx queue sk->sk_sndbuf (increasing the socket queue can help with NIC starvation)
    • The size of the Tx queue in the Kernel, txqueuelen (the Qdisc queue size, if Qdisc bypass is not enabled; increasing the Qdisc length can help with NIC starvation)
    • The size of the Tx queue on the NIC, controlled by the driver via ethtool (increasing the Tx ring gives the driver more Tx descriptors to fill)
    • Conversely, an increase in any of these buffers can introduce bufferbloat, adding latency without improving throughput
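
As a hedged illustration of turning some of those knobs from userspace (the interface name and sizes are placeholders; the NIC's own ring is normally resized with ethtool -G):

    #include <linux/sockios.h>   /* SIOCSIFTXQLEN */
    #include <net/if.h>          /* struct ifreq */
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>

    void tune_tx_queues(int fd)
    {
        /* Grow the socket send queue (sk->sk_sndbuf). The Kernel caps this at
         * net.core.wmem_max; SO_SNDBUFFORCE with CAP_NET_ADMIN can exceed it. */
        int sndbuf = 4 * 1024 * 1024;
        setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf));

        /* Grow the Qdisc queue (txqueuelen) on the placeholder interface. */
        struct ifreq ifr;
        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
        ifr.ifr_qlen = 10000;
        ioctl(fd, SIOCSIFTXQLEN, &ifr);

        /* NIC Tx ring (driver territory): e.g. ethtool -G eth0 tx 4096 */
    }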
packet_sendmsg();

    tpacket_snd();
    
        // Start of do{} loop
        packet_current_frame();
            packet_lookup_frame();
                __packet_get_status();
                
        tpacket_fill_skb();
                
        packet_pick_tx_queue();
            __packet_pick_tx_queue();
        
        __packet_set_status();
        
        packet_direct_xmit();
            netdev_start_xmit();
                __netdev_start_xmit();
        
        packet_increment_head();
        

        // Start of do{} loop
        packet_current_frame();
            packet_lookup_frame();
                __packet_get_status();
                
        tpacket_fill_skb();
        
        packet_pick_tx_queue();
            __packet_pick_tx_queue();
        
        __packet_set_status();
        
        packet_direct_xmit();
            netdev_start_xmit();
                __netdev_start_xmit();
        
        packet_increment_head();

        // And so on...

Below is a more detailed breakdown of the call flow through af_packet.c for an AF_PACKET socket. This was traced out using SystemTap and by reading through the Kernel source. An example SystemTap file can be found here.

// Sending a single packet using PACKET_MMAP with a Tx ring:

    // This is called when we call socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL))
    packet_create();

        sk->sk_family = PF_PACKET;
        // The xmit virtual function will be changed later by setsockopt(PACKET_QDISC_BYPASS)
        po->xmit = dev_queue_xmit;
        sk->sk_destruct = packet_sock_destruct;
        po->prot_hook.func = packet_rcv;
        register_prot_hook();
            __fanout_link();

    register_prot_hook();
    
    packet_setsockopt();
    
    packet_bind();
        packet_do_bind();
    
    packet_do_bind();
        register_prot_hook();
    
    register_prot_hook();
    
    packet_setsockopt();         // setsockopt(PACKET_ADD_MEMBERSHIP)
    packet_setsockopt();         // setsockopt(PACKET_QDISC_BYPASS)
        po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
    packet_setsockopt();         // setsockopt(PACKET_TIMESTAMP)
    packet_setsockopt();         // setsockopt(PACKET_LOSS)
    packet_setsockopt();         // setsockopt(PACKET_TX_RING)
        packet_set_ring();
            rb->frame_size = req->tp_frame_size;
            po->tp_hdrlen = TPACKET2_HDRLEN;
            register_prot_hook();
    
    register_prot_hook();
    
    packet_setsockopt();         // PACKET_FANOUT
        fanout_add();
        __fanout_link();
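
    /*
    Userspace side (a hedged sketch; fd and fanout_group_id are assumed):
    the calls that drive the setsockopt() handlers traced above.

        int bypass = 1;
        setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &bypass, sizeof(bypass));

        // packet(7): fanout group id in the low 16 bits, mode in the high 16
        int fanout_arg = fanout_group_id | (PACKET_FANOUT_HASH << 16);
        setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &fanout_arg, sizeof(fanout_arg));
    */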
        
    // Called by sendto(thd_opt->sock_fd, NULL, 0, 0, NULL, 0)
    packet_sendmsg(struct socket *sock, struct msghdr *msg);
    
        struct packet_sock *po = pkt_sk(sock->sk);  // pkt_sk() casts sk to its packet_sock
        tpacket_snd(struct packet_sock *po, struct msghdr *msg);
        
            // returns po->cached_dev;
            struct net_device *dev = packet_cached_dev_get(po);
    
            do {
    
                void *ph = packet_current_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST);
                    packet_current_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status);
                        packet_lookup_frame();  // Returns * to frame with TP_STATUS_SEND_REQUEST
                            __packet_get_status();
    
                void *data;
    
                tp_len = tpacket_parse_header(po, ph, size_max, &data);

                    // Returns data == ph + off, a pointer to the frame payload
                    // off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
                    tpacket_parse_header(struct packet_sock *po, void *frame, int size_max, void **data);
    
                struct sk_buff *skb = sock_alloc_send_skb(&po->sk, hlen + tlen + sizeof(struct sockaddr_ll) + (copylen - dev->hard_header_len), !need_wait, &err);
                    sock_alloc_send_pskb()
                        // Here the socket tx queue is checked for space:
                        // if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) {}
                        // returns struct sk_buff *

                tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto, addr, hlen, copylen, &sockc);
                    skb_store_bits();
                        skb_copy_to_linear_data_offset();
                            memcpy();   // Copies bits from mmap ring buffer to skb
    
                packet_pick_tx_queue(dev, skb);
                    if (ops->ndo_select_queue) {
                        queue_index = ops->ndo_select_queue(dev, skb, NULL, __packet_pick_tx_queue);
                        /*
                        Example with IXGBE drivers,
                        ops->ndo_select_queue() == ixgbe_select_queue();
                        When not using an FCoE card, ixgbe_select_queue() actually falls back to __packet_pick_tx_queue()
                        */
                    } else {
                        queue_index = __packet_pick_tx_queue(dev, skb);    // Here a simple hash; raw_smp_processor_id() % num_tx_queues
                    }
                    skb_set_queue_mapping(skb, queue_index);
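
                    /*
                    Userspace corollary (a sketch, not from the original trace): since the
                    fallback hash is raw_smp_processor_id() % num_tx_queues, pinning each
                    sending thread to its own CPU spreads frames across Tx queues:

                        cpu_set_t cpus;               // needs _GNU_SOURCE, <pthread.h>, <sched.h>
                        CPU_ZERO(&cpus);
                        CPU_SET(thread_id, &cpus);    // thread_id is a placeholder
                        pthread_setaffinity_np(pthread_self(), sizeof(cpus), &cpus);
                    */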
    
                skb->destructor = tpacket_destruct_skb;
    
                __packet_set_status();                             // TP_STATUS_SENDING
    
                err = po->xmit(skb);                               // == packet_direct_xmit()
                
                    /*
                    If PACKET_QDISC_BYPASS was not enabled then this is where we'd call
                    dev_queue_xmit() to place packets into the queuing discipline.
                    Devices which don't have a Qdisc (e.g. loopback) go directly to
                    dev_hard_start_xmit().
                    
                    dev_queue_xmit() -> __dev_queue_xmit() -> dev_hard_start_xmit() -> netdev_start_xmit()... From here on it's the same as below.
                    
                    dev_queue_xmit() puts the sk_buff on the device queue using the qdisc->enqueue virtual method. 
                    ...Qdisc stuff happens...
                    Eventually, the sk_buff is sent with dev_hard_start_xmit() and removed from the Qdisc.
                    
                    Instead packet_direct_xmit() is called to bypass the queuing disciplines (PACKET_QDISC_BYPASS requires Kernel version >= 3.14)
                    */
                    
                    packet_direct_xmit();
                    
                        netdev_start_xmit();
                    
                            const struct net_device_ops *ops = dev->netdev_ops;
                    
                            __netdev_start_xmit();
                    
                                // This will ultimately lead to the device/driver specific DMA function:
                                ops->ndo_start_xmit(skb, dev);
                    
                                /*
                                Example with IXGBE drivers;
                                po->cached_dev->netdev_ops->ndo_start_xmit == 
                                     static const struct net_device_ops ixgbe_netdev_ops.ndo_start_xmit ==
                                        ixgbe_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
                                */
                                ixgbe_xmit_frame(skb, dev);
                                
                                    __ixgbe_xmit_frame(skb, netdev, NULL);
                                
                                        // netdev_tx_t = NETDEV_TX_OK on success, which is passed back up to err = po->xmit() above
                                        // netdev_tx_t ixgbe_xmit_frame_ring()
                                        ixgbe_xmit_frame_ring(skb, adapter, tx_ring);
                                
                                            struct ixgbe_tx_buffer *first = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
                                            first->skb = skb;
                                
                                            ixgbe_tx_map(tx_ring, first, hdr_len);
                                
                                                dma_addr_t dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
                                                for (...) {
                                                    while (...) {
                                                        // Fill Tx descriptors with SKB info (size, DMA address, etc.)
                                                    }
                                                }
                                
                                                netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount);
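                                                // ^ BQL accounting: records bytes queued to the driver (see the BQL point of note above)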

                                                // Trigger the NIC to DMA the SKB (by reading the info from the TX descriptors)
                                                writel(i, tx_ring->tail);
                                
    
                packet_increment_head(&po->tx_ring);
    
            } while (likely((ph != NULL) || (need_wait && packet_read_pending(&po->tx_ring))));
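
From userspace, completion is visible in the ring itself: once tpacket_destruct_skb() has run (traced below), a frame's tp_status returns to TP_STATUS_AVAILABLE. Below is a minimal polling sketch, assuming the TPACKET_V2 Tx ring mapping shown earlier (ring, frame_size and idx are placeholders):

    #include <linux/if_packet.h>
    #include <poll.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Wait until frame idx may be reused. Timestamp status bits can be OR'd
     * into tp_status (see TP_STATUS_AVAILABLE | ts below), so test the busy
     * flags rather than comparing for equality with TP_STATUS_AVAILABLE (0). */
    static void wait_tx_done(int fd, uint8_t *ring, size_t frame_size, unsigned int idx)
    {
        volatile struct tpacket2_hdr *hdr =
            (volatile struct tpacket2_hdr *)(ring + (idx * frame_size));

        while (hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING)) {
            struct pollfd pfd = { .fd = fd, .events = POLLOUT };
            poll(&pfd, 1, 100);    /* wait up to 100 ms for Tx progress */
        }
    }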
                            


// Post transmit packet clean up...

// tpacket_destruct_skb() is the registered callback/handler for post-Tx cleanup
tpacket_destruct_skb(struct sk_buff *skb)

    ts = __packet_set_timestamp(po, ph, skb);
        tpacket_get_timestamp(skb, &ts, po->tp_tstamp)
        // return TP_STATUS_TS_RAW_HARDWARE;

    __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);

/*
The device driver's hard_start_xmit function will generate one or more commands to the network device for scheduling transfer of the buffer.
After a while, the network device replies that it's done. This triggers freeing of the sk_buff.
If the sk_buff is freed from interrupt context, dev_kfree_skb_irq() is used.
This delays the actual freeing until the next NET_TX_SOFTIRQ run, by putting the skb on the softnet_data completion_queue.
This avoids doing frees from interrupt context. 
*/