-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
net/tap: add netlink back-end for flow API
Each kernel netdevice may have queueing disciplines set for it, which determine how to handle the packet (mostly on egress). That's part of the TC (Traffic Control) mechanism. Through TC, it is possible to set filter rules that match specific packets, and act according to what is in the rule. This is a perfect candidate to implement the flow API for the tap PMD, as it has an associated kernel netdevice automatically. Each flow API rule will be translated into its TC counterpart. To leverage TC, it is necessary to communicate with the kernel using netlink. This patch introduces a library to help that communication. Inside netlink.c, functions are generic for any netlink messaging. Inside tcmsgs.c, functions are specific to deal with TC rules. Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com> Acked-by: Olga Shern <olgas@mellanox.com> Acked-by: Keith Wiles <keith.wiles@intel.com>
- Loading branch information
Pascal Mazon
authored and
Ferruh Yigit
committed
Apr 4, 2017
1 parent
268483d
commit 7c25284
Showing
5 changed files
with
879 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,367 @@ | ||
/*- | ||
* BSD LICENSE | ||
* | ||
* Copyright 2017 6WIND S.A. | ||
* Copyright 2017 Mellanox. | ||
* | ||
* Redistribution and use in source and binary forms, with or without | ||
* modification, are permitted provided that the following conditions | ||
* are met: | ||
* | ||
* * Redistributions of source code must retain the above copyright | ||
* notice, this list of conditions and the following disclaimer. | ||
* * Redistributions in binary form must reproduce the above copyright | ||
* notice, this list of conditions and the following disclaimer in | ||
* the documentation and/or other materials provided with the | ||
* distribution. | ||
* * Neither the name of 6WIND S.A. nor the names of its | ||
* contributors may be used to endorse or promote products derived | ||
* from this software without specific prior written permission. | ||
* | ||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | ||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | ||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
*/ | ||
|
||
#include <errno.h> | ||
#include <inttypes.h> | ||
#include <linux/netlink.h> | ||
#include <string.h> | ||
#include <sys/socket.h> | ||
#include <unistd.h> | ||
|
||
#include <rte_malloc.h> | ||
#include <tap_netlink.h> | ||
#include <rte_random.h> | ||
|
||
/* Must be quite large to support dumping a huge list of QDISC or filters. */ | ||
#define BUF_SIZE (32 * 1024) /* Size of the buffer to receive kernel messages */ | ||
#define SNDBUF_SIZE 32768 /* Send buffer size for the netlink socket */ | ||
#define RCVBUF_SIZE 32768 /* Receive buffer size for the netlink socket */ | ||
|
||
struct nested_tail { | ||
struct rtattr *tail; | ||
struct nested_tail *prev; | ||
}; | ||
|
||
/** | ||
* Initialize a netlink socket for communicating with the kernel. | ||
* | ||
* @return | ||
* netlink socket file descriptor on success, -1 otherwise. | ||
*/ | ||
int | ||
nl_init(void) | ||
{ | ||
int fd, sndbuf_size = SNDBUF_SIZE, rcvbuf_size = RCVBUF_SIZE; | ||
struct sockaddr_nl local = { .nl_family = AF_NETLINK }; | ||
|
||
fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); | ||
if (fd < 0) { | ||
RTE_LOG(ERR, PMD, "Unable to create a netlink socket\n"); | ||
return -1; | ||
} | ||
if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int))) { | ||
RTE_LOG(ERR, PMD, "Unable to set socket buffer send size\n"); | ||
return -1; | ||
} | ||
if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int))) { | ||
RTE_LOG(ERR, PMD, "Unable to set socket buffer receive size\n"); | ||
return -1; | ||
} | ||
if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) { | ||
RTE_LOG(ERR, PMD, "Unable to bind to the netlink socket\n"); | ||
return -1; | ||
} | ||
return fd; | ||
} | ||
|
||
/** | ||
* Clean up a netlink socket once all communicating with the kernel is finished. | ||
* | ||
* @param[in] nlsk_fd | ||
* The netlink socket file descriptor used for communication. | ||
* | ||
* @return | ||
* 0 on success, -1 otherwise. | ||
*/ | ||
int | ||
nl_final(int nlsk_fd) | ||
{ | ||
if (close(nlsk_fd)) { | ||
RTE_LOG(ERR, PMD, "Failed to close netlink socket: %s (%d)\n", | ||
strerror(errno), errno); | ||
return -1; | ||
} | ||
return 0; | ||
} | ||
|
||
/** | ||
* Send a message to the kernel on the netlink socket. | ||
* | ||
* @param[in] nlsk_fd | ||
* The netlink socket file descriptor used for communication. | ||
* @param[in] nh | ||
* The netlink message send to the kernel. | ||
* | ||
* @return | ||
* the number of sent bytes on success, -1 otherwise. | ||
*/ | ||
int | ||
nl_send(int nlsk_fd, struct nlmsghdr *nh) | ||
{ | ||
/* man 7 netlink EXAMPLE */ | ||
struct sockaddr_nl sa = { | ||
.nl_family = AF_NETLINK, | ||
}; | ||
struct iovec iov = { | ||
.iov_base = nh, | ||
.iov_len = nh->nlmsg_len, | ||
}; | ||
struct msghdr msg = { | ||
.msg_name = &sa, | ||
.msg_namelen = sizeof(sa), | ||
.msg_iov = &iov, | ||
.msg_iovlen = 1, | ||
}; | ||
int send_bytes; | ||
|
||
nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */ | ||
nh->nlmsg_seq = (uint32_t)rte_rand(); | ||
send_bytes = sendmsg(nlsk_fd, &msg, 0); | ||
if (send_bytes < 0) { | ||
RTE_LOG(ERR, PMD, "Failed to send netlink message: %s (%d)\n", | ||
strerror(errno), errno); | ||
return -1; | ||
} | ||
return send_bytes; | ||
} | ||
|
||
/** | ||
* Check that the kernel sends an appropriate ACK in response to an nl_send(). | ||
* | ||
* @param[in] nlsk_fd | ||
* The netlink socket file descriptor used for communication. | ||
* | ||
* @return | ||
* 0 on success, -1 otherwise. | ||
*/ | ||
int | ||
nl_recv_ack(int nlsk_fd) | ||
{ | ||
return nl_recv(nlsk_fd, NULL, NULL); | ||
} | ||
|
||
/** | ||
* Receive a message from the kernel on the netlink socket, following an | ||
* nl_send(). | ||
* | ||
* @param[in] nlsk_fd | ||
* The netlink socket file descriptor used for communication. | ||
* @param[in] cb | ||
* The callback function to call for each netlink message received. | ||
* @param[in, out] arg | ||
* Custom arguments for the callback. | ||
* | ||
* @return | ||
* 0 on success, -1 otherwise. | ||
*/ | ||
int | ||
nl_recv(int nlsk_fd, int (*cb)(struct nlmsghdr *, void *arg), void *arg) | ||
{ | ||
/* man 7 netlink EXAMPLE */ | ||
struct sockaddr_nl sa; | ||
struct nlmsghdr *nh; | ||
char buf[BUF_SIZE]; | ||
struct iovec iov = { | ||
.iov_base = buf, | ||
.iov_len = sizeof(buf), | ||
}; | ||
struct msghdr msg = { | ||
.msg_name = &sa, | ||
.msg_namelen = sizeof(sa), | ||
.msg_iov = &iov, | ||
.msg_iovlen = 1, | ||
}; | ||
int recv_bytes = 0, done = 0, multipart = 0, error = 0; | ||
|
||
read: | ||
recv_bytes = recvmsg(nlsk_fd, &msg, 0); | ||
if (recv_bytes < 0) | ||
return -1; | ||
for (nh = (struct nlmsghdr *)buf; | ||
NLMSG_OK(nh, (unsigned int)recv_bytes); | ||
nh = NLMSG_NEXT(nh, recv_bytes)) { | ||
/* | ||
* Multi-part messages and their following DONE message have the | ||
* NLM_F_MULTI flag set. Make note, in order to read the DONE | ||
* message afterwards. | ||
*/ | ||
if (nh->nlmsg_flags & NLM_F_MULTI) | ||
multipart = 1; | ||
if (nh->nlmsg_type == NLMSG_ERROR) { | ||
struct nlmsgerr *err_data = NLMSG_DATA(nh); | ||
|
||
if (err_data->error == 0) | ||
RTE_LOG(DEBUG, PMD, "%s() ack message recvd\n", | ||
__func__); | ||
else { | ||
RTE_LOG(DEBUG, PMD, | ||
"%s() error message recvd\n", __func__); | ||
error = 1; | ||
} | ||
} | ||
/* The end of multipart message. */ | ||
if (nh->nlmsg_type == NLMSG_DONE) | ||
/* No need to call the callback for a DONE message. */ | ||
done = 1; | ||
else if (cb) | ||
if (cb(nh, arg) < 0) | ||
error = 1; | ||
} | ||
if (multipart && !done) | ||
goto read; | ||
if (error) | ||
return -1; | ||
return 0; | ||
} | ||
|
||
/** | ||
* Append a netlink attribute to a message. | ||
* | ||
* @param[in, out] nh | ||
* The netlink message to parse, received from the kernel. | ||
* @param[in] type | ||
* The type of attribute to append. | ||
* @param[in] data_len | ||
* The length of the data to append. | ||
* @param[in] data | ||
* The data to append. | ||
*/ | ||
void | ||
nlattr_add(struct nlmsghdr *nh, unsigned short type, | ||
unsigned int data_len, const void *data) | ||
{ | ||
/* see man 3 rtnetlink */ | ||
struct rtattr *rta; | ||
|
||
rta = (struct rtattr *)NLMSG_TAIL(nh); | ||
rta->rta_len = RTA_LENGTH(data_len); | ||
rta->rta_type = type; | ||
memcpy(RTA_DATA(rta), data, data_len); | ||
nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len); | ||
} | ||
|
||
/** | ||
* Append a uint8_t netlink attribute to a message. | ||
* | ||
* @param[in, out] nh | ||
* The netlink message to parse, received from the kernel. | ||
* @param[in] type | ||
* The type of attribute to append. | ||
* @param[in] data | ||
* The data to append. | ||
*/ | ||
void | ||
nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data) | ||
{ | ||
nlattr_add(nh, type, sizeof(uint8_t), &data); | ||
} | ||
|
||
/** | ||
* Append a uint16_t netlink attribute to a message. | ||
* | ||
* @param[in, out] nh | ||
* The netlink message to parse, received from the kernel. | ||
* @param[in] type | ||
* The type of attribute to append. | ||
* @param[in] data | ||
* The data to append. | ||
*/ | ||
void | ||
nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data) | ||
{ | ||
nlattr_add(nh, type, sizeof(uint16_t), &data); | ||
} | ||
|
||
/** | ||
* Append a uint16_t netlink attribute to a message. | ||
* | ||
* @param[in, out] nh | ||
* The netlink message to parse, received from the kernel. | ||
* @param[in] type | ||
* The type of attribute to append. | ||
* @param[in] data | ||
* The data to append. | ||
*/ | ||
void | ||
nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data) | ||
{ | ||
nlattr_add(nh, type, sizeof(uint32_t), &data); | ||
} | ||
|
||
/** | ||
* Start a nested netlink attribute. | ||
* It must be followed later by a call to nlattr_nested_finish(). | ||
* | ||
* @param[in, out] msg | ||
* The netlink message where to edit the nested_tails metadata. | ||
* @param[in] type | ||
* The nested attribute type to append. | ||
* | ||
* @return | ||
* -1 if adding a nested netlink attribute failed, 0 otherwise. | ||
*/ | ||
int | ||
nlattr_nested_start(struct nlmsg *msg, uint16_t type) | ||
{ | ||
struct nested_tail *tail; | ||
|
||
tail = rte_zmalloc(NULL, sizeof(struct nested_tail), 0); | ||
if (!tail) { | ||
RTE_LOG(ERR, PMD, | ||
"Couldn't allocate memory for nested netlink" | ||
" attribute\n"); | ||
return -1; | ||
} | ||
|
||
tail->tail = (struct rtattr *)NLMSG_TAIL(&msg->nh); | ||
|
||
nlattr_add(&msg->nh, type, 0, NULL); | ||
|
||
tail->prev = msg->nested_tails; | ||
|
||
msg->nested_tails = tail; | ||
|
||
return 0; | ||
} | ||
|
||
/** | ||
* End a nested netlink attribute. | ||
* It follows a call to nlattr_nested_start(). | ||
* In effect, it will modify the nested attribute length to include every bytes | ||
* from the nested attribute start, up to here. | ||
* | ||
* @param[in, out] msg | ||
* The netlink message where to edit the nested_tails metadata. | ||
*/ | ||
void | ||
nlattr_nested_finish(struct nlmsg *msg) | ||
{ | ||
struct nested_tail *tail = msg->nested_tails; | ||
|
||
tail->tail->rta_len = (char *)NLMSG_TAIL(&msg->nh) - (char *)tail->tail; | ||
|
||
if (tail->prev) | ||
msg->nested_tails = tail->prev; | ||
|
||
rte_free(tail); | ||
} |
Oops, something went wrong.