Skip to content

Commit

Permalink
net/tap: add netlink back-end for flow API
Browse files Browse the repository at this point in the history
Each kernel netdevice may have queueing disciplines set for it, which
determine how to handle the packet (mostly on egress). That's part of
the TC (Traffic Control) mechanism.

Through TC, it is possible to set filter rules that match specific
packets, and act according to what is in the rule. This is a perfect
candidate to implement the flow API for the tap PMD, as it has an
associated kernel netdevice automatically.

Each flow API rule will be translated into its TC counterpart.

To leverage TC, it is necessary to communicate with the kernel using
netlink. This patch introduces a library to help that communication.

Inside tap_netlink.c, functions are generic for any netlink messaging.
Inside tap_tcmsgs.c, functions deal specifically with TC rules.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
Acked-by: Keith Wiles <keith.wiles@intel.com>
  • Loading branch information
Pascal Mazon authored and Ferruh Yigit committed Apr 4, 2017
1 parent 268483d commit 7c25284
Show file tree
Hide file tree
Showing 5 changed files with 879 additions and 0 deletions.
2 changes: 2 additions & 0 deletions drivers/net/tap/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ CFLAGS += $(WERROR_FLAGS)
#
SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += rte_eth_tap.c
SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_flow.c
SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_netlink.c
SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_tcmsgs.c

# this lib depends upon:
DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_eal
Expand Down
367 changes: 367 additions & 0 deletions drivers/net/tap/tap_netlink.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,367 @@
/*-
* BSD LICENSE
*
* Copyright 2017 6WIND S.A.
* Copyright 2017 Mellanox.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of 6WIND S.A. nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <errno.h>
#include <inttypes.h>
#include <linux/netlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#include <rte_malloc.h>
#include <tap_netlink.h>
#include <rte_random.h>

/*
 * Must be quite large to support dumping a huge list of QDISC or filters.
 * BUF_SIZE sizes the on-stack buffer that nl_recv() hands to recvmsg().
 */
#define BUF_SIZE (32 * 1024) /* Size of the buffer to receive kernel messages */
#define SNDBUF_SIZE 32768 /* SO_SNDBUF value requested for the netlink socket */
#define RCVBUF_SIZE 32768 /* SO_RCVBUF value requested for the netlink socket */

/*
 * Bookkeeping node for one nested netlink attribute that has been started
 * with nlattr_nested_start() and not yet closed by nlattr_nested_finish().
 * Nodes form a singly-linked stack through the prev pointer.
 */
struct nested_tail {
/* Address of the nested attribute's own rtattr header inside the message. */
struct rtattr *tail;
/* Node that was on top of the stack when this one was pushed, if any. */
struct nested_tail *prev;
};

/**
 * Initialize a netlink socket for communicating with the kernel.
 *
 * A close-on-exec NETLINK_ROUTE raw socket is created, its send and receive
 * buffer sizes are set to SNDBUF_SIZE and RCVBUF_SIZE, and it is bound to an
 * AF_NETLINK address. If any step after the socket creation fails, the socket
 * is closed before returning so that no file descriptor is leaked.
 *
 * @return
 *   netlink socket file descriptor on success, -1 otherwise.
 */
int
nl_init(void)
{
	int fd, sndbuf_size = SNDBUF_SIZE, rcvbuf_size = RCVBUF_SIZE;
	struct sockaddr_nl local = { .nl_family = AF_NETLINK };

	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
	if (fd < 0) {
		RTE_LOG(ERR, PMD, "Unable to create a netlink socket\n");
		return -1;
	}
	if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int))) {
		RTE_LOG(ERR, PMD, "Unable to set socket buffer send size\n");
		goto error;
	}
	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int))) {
		RTE_LOG(ERR, PMD, "Unable to set socket buffer receive size\n");
		goto error;
	}
	if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
		RTE_LOG(ERR, PMD, "Unable to bind to the netlink socket\n");
		goto error;
	}
	return fd;

error:
	/* The caller only gets -1 back, so the socket must be released here. */
	close(fd);
	return -1;
}

/**
 * Release a netlink socket once the exchange with the kernel is over.
 *
 * @param[in] nlsk_fd
 *   The netlink socket file descriptor to close.
 *
 * @return
 *   0 when the descriptor was closed, -1 (after logging the errno text and
 *   value) when close() reported a failure.
 */
int
nl_final(int nlsk_fd)
{
	int rc = close(nlsk_fd);

	if (rc == 0)
		return 0;
	RTE_LOG(ERR, PMD, "Failed to close netlink socket: %s (%d)\n",
		strerror(errno), errno);
	return -1;
}

/**
 * Send one message to the kernel over the netlink socket.
 *
 * The header is stamped before transmission: nlmsg_pid is set to 0 and
 * nlmsg_seq receives a random 32-bit value. The number of bytes handed to
 * sendmsg() is the nlmsg_len found in the header on entry.
 *
 * @param[in] nlsk_fd
 *   The netlink socket file descriptor used for communication.
 * @param[in, out] nh
 *   The netlink message to send to the kernel; its pid and seq fields are
 *   overwritten.
 *
 * @return
 *   the number of sent bytes on success, -1 otherwise.
 */
int
nl_send(int nlsk_fd, struct nlmsghdr *nh)
{
	/* Layout follows the EXAMPLE section of netlink(7). */
	struct sockaddr_nl kernel_addr = { .nl_family = AF_NETLINK };
	struct iovec vec = { .iov_base = nh, .iov_len = nh->nlmsg_len };
	struct msghdr hdr = {
		.msg_name = &kernel_addr,
		.msg_namelen = sizeof(kernel_addr),
		.msg_iov = &vec,
		.msg_iovlen = 1,
	};
	int sent;

	/* communication with the kernel uses pid 0 */
	nh->nlmsg_pid = 0;
	nh->nlmsg_seq = (uint32_t)rte_rand();

	sent = sendmsg(nlsk_fd, &hdr, 0);
	if (sent >= 0)
		return sent;

	RTE_LOG(ERR, PMD, "Failed to send netlink message: %s (%d)\n",
		strerror(errno), errno);
	return -1;
}

/**
 * Wait for the kernel's answer to a previous nl_send() when only the outcome
 * matters: the reply is read through nl_recv() without any callback.
 *
 * @param[in] nlsk_fd
 *   The netlink socket file descriptor used for communication.
 *
 * @return
 *   0 on success, -1 otherwise (the value returned by nl_recv()).
 */
int
nl_recv_ack(int nlsk_fd)
{
	/* No per-message callback and therefore no callback argument. */
	int (*no_cb)(struct nlmsghdr *, void *) = NULL;

	return nl_recv(nlsk_fd, no_cb, NULL);
}

/**
 * Receive a message from the kernel on the netlink socket, following an
 * nl_send().
 *
 * Datagrams are read into a BUF_SIZE on-stack buffer and every netlink
 * message they contain is walked. Once a message carrying NLM_F_MULTI has
 * been seen, reading continues until an NLMSG_DONE message arrives.
 *
 * The callback, when provided, is invoked for every message except
 * NLMSG_DONE; this includes NLMSG_ERROR messages (ACKs and errors alike).
 *
 * A read interrupted by a signal (EINTR) is transparently retried. A read
 * that yields no data at all is treated as a failure, since neither an ACK
 * nor a DONE message can have been received.
 *
 * @param[in] nlsk_fd
 *   The netlink socket file descriptor used for communication.
 * @param[in] cb
 *   The callback function to call for each netlink message received.
 *   May be NULL.
 * @param[in, out] arg
 *   Custom arguments for the callback.
 *
 * @return
 *   0 on success. -1 if recvmsg() fails or returns no data, if an
 *   NLMSG_ERROR message with a non-zero error code was received, or if the
 *   callback returned a negative value for any message.
 */
int
nl_recv(int nlsk_fd, int (*cb)(struct nlmsghdr *, void *arg), void *arg)
{
	/* man 7 netlink EXAMPLE */
	struct sockaddr_nl sa;
	struct nlmsghdr *nh;
	char buf[BUF_SIZE];
	struct iovec iov = {
		.iov_base = buf,
		.iov_len = sizeof(buf),
	};
	struct msghdr msg = {
		.msg_name = &sa,
		.msg_namelen = sizeof(sa),
		.msg_iov = &iov,
		.msg_iovlen = 1,
	};
	int recv_bytes = 0, done = 0, multipart = 0, error = 0;

read:
	/* recvmsg() rewrites msg_namelen; re-arm it before each read. */
	msg.msg_namelen = sizeof(sa);
	do {
		recv_bytes = recvmsg(nlsk_fd, &msg, 0);
	} while (recv_bytes < 0 && errno == EINTR);
	/*
	 * An empty read carries no message: reporting success would fake an
	 * ACK, and looping on it while a multipart reply is pending would
	 * never terminate.
	 */
	if (recv_bytes <= 0)
		return -1;
	for (nh = (struct nlmsghdr *)buf;
	     NLMSG_OK(nh, (unsigned int)recv_bytes);
	     nh = NLMSG_NEXT(nh, recv_bytes)) {
		/*
		 * Multi-part messages and their following DONE message have the
		 * NLM_F_MULTI flag set. Make note, in order to read the DONE
		 * message afterwards.
		 */
		if (nh->nlmsg_flags & NLM_F_MULTI)
			multipart = 1;
		if (nh->nlmsg_type == NLMSG_ERROR) {
			struct nlmsgerr *err_data = NLMSG_DATA(nh);

			if (err_data->error == 0) {
				RTE_LOG(DEBUG, PMD, "%s() ack message recvd\n",
					__func__);
			} else {
				RTE_LOG(DEBUG, PMD,
					"%s() error message recvd\n", __func__);
				error = 1;
			}
		}
		/* The end of multipart message. */
		if (nh->nlmsg_type == NLMSG_DONE) {
			/* No need to call the callback for a DONE message. */
			done = 1;
		} else if (cb) {
			if (cb(nh, arg) < 0)
				error = 1;
		}
	}
	/* Keep draining so the socket stays in sync with the next request. */
	if (multipart && !done)
		goto read;
	if (error)
		return -1;
	return 0;
}

/**
 * Append a netlink attribute to a message.
 *
 * The attribute header is written at the current aligned end of the message
 * (NLMSG_TAIL), followed by data_len bytes of payload, and nlmsg_len is
 * advanced past the aligned attribute. No bounds check is performed: the
 * caller must make sure the buffer holding the message has enough room.
 *
 * @param[in, out] nh
 *   The netlink message being built, to which the attribute is appended.
 * @param[in] type
 *   The type of attribute to append.
 * @param[in] data_len
 *   The length of the data to append. May be 0 for a payload-less attribute.
 * @param[in] data
 *   The data to append. Not accessed when data_len is 0, so it may be NULL
 *   in that case.
 */
void
nlattr_add(struct nlmsghdr *nh, unsigned short type,
	   unsigned int data_len, const void *data)
{
	/* see man 3 rtnetlink */
	struct rtattr *rta;

	rta = (struct rtattr *)NLMSG_TAIL(nh);
	rta->rta_len = RTA_LENGTH(data_len);
	rta->rta_type = type;
	/*
	 * nlattr_nested_start() passes (0, NULL); calling memcpy() with a
	 * NULL source is undefined behaviour even for a zero length.
	 */
	if (data_len)
		memcpy(RTA_DATA(rta), data, data_len);
	nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

/**
 * Append a one-byte (uint8_t) netlink attribute to a message.
 *
 * @param[in, out] nh
 *   The netlink message being built.
 * @param[in] type
 *   The type of attribute to append.
 * @param[in] data
 *   The value stored as the attribute payload.
 */
void
nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data)
{
	const uint8_t value = data;

	nlattr_add(nh, type, sizeof(value), &value);
}

/**
 * Append a two-byte (uint16_t) netlink attribute to a message.
 * The value is copied as-is; no byte-order conversion is applied here.
 *
 * @param[in, out] nh
 *   The netlink message being built.
 * @param[in] type
 *   The type of attribute to append.
 * @param[in] data
 *   The value stored as the attribute payload.
 */
void
nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data)
{
	const uint16_t value = data;

	nlattr_add(nh, type, sizeof(value), &value);
}

/**
 * Append a four-byte (uint32_t) netlink attribute to a message.
 * The value is copied as-is; no byte-order conversion is applied here.
 *
 * @param[in, out] nh
 *   The netlink message being built.
 * @param[in] type
 *   The type of attribute to append.
 * @param[in] data
 *   The value stored as the attribute payload.
 */
void
nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data)
{
	const uint32_t value = data;

	nlattr_add(nh, type, sizeof(value), &value);
}

/**
 * Open a nested netlink attribute.
 * Every successful call must later be matched by nlattr_nested_finish().
 *
 * A bookkeeping node recording where the nested attribute header sits is
 * allocated and pushed on msg->nested_tails, and an attribute of the given
 * type with an empty payload is appended to the message.
 *
 * @param[in, out] msg
 *   The netlink message where to edit the nested_tails metadata.
 * @param[in] type
 *   The nested attribute type to append.
 *
 * @return
 *   -1 if adding a nested netlink attribute failed, 0 otherwise.
 */
int
nlattr_nested_start(struct nlmsg *msg, uint16_t type)
{
	struct nested_tail *node = rte_zmalloc(NULL, sizeof(*node), 0);

	if (node == NULL) {
		RTE_LOG(ERR, PMD,
			"Couldn't allocate memory for nested netlink"
			" attribute\n");
		return -1;
	}

	/* Record the header position before the attribute is appended. */
	node->tail = (struct rtattr *)NLMSG_TAIL(&msg->nh);
	node->prev = msg->nested_tails;

	nlattr_add(&msg->nh, type, 0, NULL);

	/* Push the node: it becomes the innermost open nested attribute. */
	msg->nested_tails = node;
	return 0;
}

/**
 * End a nested netlink attribute.
 * It follows a call to nlattr_nested_start().
 * In effect, it will modify the nested attribute length to include every byte
 * from the nested attribute start, up to here.
 *
 * The bookkeeping node of the innermost open nested attribute is popped from
 * msg->nested_tails and freed. After the outermost nested attribute has been
 * closed, msg->nested_tails is NULL rather than a pointer to freed memory.
 * Calling this function while msg->nested_tails is NULL is a no-op.
 *
 * @param[in, out] msg
 *   The netlink message where to edit the nested_tails metadata.
 */
void
nlattr_nested_finish(struct nlmsg *msg)
{
	struct nested_tail *tail = msg->nested_tails;

	/* Nothing to close when no nested attribute is recorded as open. */
	if (tail == NULL)
		return;

	tail->tail->rta_len = (char *)NLMSG_TAIL(&msg->nh) - (char *)tail->tail;

	/*
	 * Always pop, even when prev is NULL: otherwise nested_tails would
	 * keep pointing at the node freed just below.
	 */
	msg->nested_tails = tail->prev;

	rte_free(tail);
}
Loading

0 comments on commit 7c25284

Please sign in to comment.