From ee8508c985306e1a7f874d5db0faf75461d88eb7 Mon Sep 17 00:00:00 2001 From: Vladimir Kuramshin Date: Thu, 28 Mar 2024 18:59:57 +0300 Subject: [PATCH] ipvs: sctp implementation --- include/conf/inet.h | 1 + include/conf/match.h | 2 + include/conf/service.h | 5 +- include/ipvs/proto_sctp.h | 66 ++ include/ipvs/stats.h | 1 + include/sctp/sctp.h | 664 ++++++++++++++++++ src/iftraf.c | 6 +- src/ipvs/ip_vs_conn.c | 38 +- src/ipvs/ip_vs_dest.c | 4 +- src/ipvs/ip_vs_laddr.c | 3 +- src/ipvs/ip_vs_nat64.c | 1 + src/ipvs/ip_vs_proto.c | 11 + src/ipvs/ip_vs_proto_sctp.c | 558 +++++++++++++++ src/ipvs/ip_vs_service.c | 3 +- src/tc/cls_match.c | 13 + tools/dpip/cls.c | 2 +- tools/ipvsadm/ipvsadm.c | 41 +- .../keepalived/keepalived/check/check_data.c | 3 + 18 files changed, 1409 insertions(+), 13 deletions(-) create mode 100644 include/ipvs/proto_sctp.h create mode 100755 include/sctp/sctp.h create mode 100644 src/ipvs/ip_vs_proto_sctp.c diff --git a/include/conf/inet.h b/include/conf/inet.h index 1c3449781..6ae6c7e9c 100644 --- a/include/conf/inet.h +++ b/include/conf/inet.h @@ -89,6 +89,7 @@ static inline const char *inet_proto_name(uint8_t proto) const static char *proto_names[256] = { [IPPROTO_TCP] = "TCP", [IPPROTO_UDP] = "UDP", + [IPPROTO_SCTP] = "SCTP", [IPPROTO_ICMP] = "ICMP", [IPPROTO_ICMPV6] = "ICMPV6", }; diff --git a/include/conf/match.h b/include/conf/match.h index 30a9d1e17..e0eac905a 100644 --- a/include/conf/match.h +++ b/include/conf/match.h @@ -93,6 +93,8 @@ static inline int parse_match(const char *pattern, uint8_t *proto, *proto = IPPROTO_TCP; } else if (strcmp(tok, "udp") == 0) { *proto = IPPROTO_UDP; + } else if (strcmp(tok, "sctp") == 0) { + *proto = IPPROTO_SCTP; } else if (strcmp(tok, "icmp") == 0) { *proto = IPPROTO_ICMP; } else if (strcmp(tok, "icmp6") == 0) { diff --git a/include/conf/service.h b/include/conf/service.h index d16164f3c..3a3279cb4 100644 --- a/include/conf/service.h +++ b/include/conf/service.h @@ -52,8 +52,9 @@ #define DEST_HC_PASSIVE 
0x01 #define DEST_HC_TCP 0x02 #define DEST_HC_UDP 0x04 -#define DEST_HC_PING 0x08 -#define DEST_HC_MASK_EXTERNAL 0x0e +#define DEST_HC_SCTP 0x08 +#define DEST_HC_PING 0x10 +#define DEST_HC_MASK_EXTERNAL 0x1e /* defaults for dest passive health check */ #define DEST_DOWN_NOTICE_DEFAULT 1 diff --git a/include/ipvs/proto_sctp.h b/include/ipvs/proto_sctp.h new file mode 100644 index 000000000..232d9594d --- /dev/null +++ b/include/ipvs/proto_sctp.h @@ -0,0 +1,66 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2021 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ +#ifndef __DP_VS_PROTO_SCTP_H__ +#define __DP_VS_PROTO_SCTP_H__ + +#include +#include "sctp/sctp.h" + +enum dpvs_sctp_event_t { + DPVS_SCTP_DATA = 0, /* DATA, SACK, HEARTBEATs */ + DPVS_SCTP_INIT, + DPVS_SCTP_INIT_ACK, + DPVS_SCTP_COOKIE_ECHO, + DPVS_SCTP_COOKIE_ACK, + DPVS_SCTP_SHUTDOWN, + DPVS_SCTP_SHUTDOWN_ACK, + DPVS_SCTP_SHUTDOWN_COMPLETE, + DPVS_SCTP_ERROR, + DPVS_SCTP_ABORT, + DPVS_SCTP_EVENT_LAST +}; + +/* ip_vs_conn handling functions + * (from ip_vs_conn.c) + */ +enum { DPVS_DIR_INPUT = 0, + DPVS_DIR_OUTPUT, + DPVS_DIR_INPUT_ONLY, + DPVS_DIR_LAST, +}; + +/* SCTP State Values */ +enum dpvs_sctp_states { + DPVS_SCTP_S_NONE, + DPVS_SCTP_S_INIT1, + DPVS_SCTP_S_INIT, + DPVS_SCTP_S_COOKIE_SENT, + DPVS_SCTP_S_COOKIE_REPLIED, + DPVS_SCTP_S_COOKIE_WAIT, + DPVS_SCTP_S_COOKIE, + DPVS_SCTP_S_COOKIE_ECHOED, + DPVS_SCTP_S_ESTABLISHED, + DPVS_SCTP_S_SHUTDOWN_SENT, + DPVS_SCTP_S_SHUTDOWN_RECEIVED, + DPVS_SCTP_S_SHUTDOWN_ACK_SENT, + DPVS_SCTP_S_REJECTED, + DPVS_SCTP_S_CLOSED, + DPVS_SCTP_S_LAST +}; + +#endif diff --git a/include/ipvs/stats.h b/include/ipvs/stats.h index 3e4f8db11..0332fd2d9 100644 --- a/include/ipvs/stats.h +++ b/include/ipvs/stats.h @@ -53,6 +53,7 @@ enum dp_vs_estats_type { SYNPROXY_CONN_REUSED_CLOSEWAIT, SYNPROXY_CONN_REUSED_LASTACK, DEFENCE_IP_FRAG_DROP, + DEFENCE_SCTP_DROP, DEFENCE_TCP_DROP, DEFENCE_UDP_DROP, FAST_XMIT_REJECT, diff --git a/include/sctp/sctp.h b/include/sctp/sctp.h new file mode 100755 index 000000000..2d2d76100 --- /dev/null +++ b/include/sctp/sctp.h @@ -0,0 +1,664 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. + * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#if defined(__FreeBSD__) && !defined(__Userspace__) +#include +__FBSDID("$FreeBSD$"); +#endif + +#ifndef _NETINET_SCTP_H_ +#define _NETINET_SCTP_H_ + +#if defined(__APPLE__) || defined(__linux__) +#include +#endif +#include + +#if !defined(_WIN32) +#define SCTP_PACKED __attribute__((packed)) +#else +#pragma pack (push, 1) +#define SCTP_PACKED +#endif + +/* + * SCTP protocol - RFC4960. 
+ */ +struct sctphdr { + uint16_t src_port; /* source port */ + uint16_t dest_port; /* destination port */ + uint32_t v_tag; /* verification tag of packet */ + uint32_t checksum; /* CRC32C checksum */ + /* chunks follow... */ +} SCTP_PACKED; + +/* + * SCTP Chunks + */ +struct sctp_chunkhdr { + uint8_t chunk_type; /* chunk type */ + uint8_t chunk_flags; /* chunk flags */ + uint16_t chunk_length; /* chunk length */ + /* optional params follow */ +} SCTP_PACKED; + +/* + * SCTP chunk parameters + */ +struct sctp_paramhdr { + uint16_t param_type; /* parameter type */ + uint16_t param_length; /* parameter length */ +} SCTP_PACKED; + +/* + * user socket options: socket API defined + */ +/* + * read-write options + */ +#define SCTP_RTOINFO 0x00000001 +#define SCTP_ASSOCINFO 0x00000002 +#define SCTP_INITMSG 0x00000003 +#define SCTP_NODELAY 0x00000004 +#define SCTP_AUTOCLOSE 0x00000005 +#define SCTP_SET_PEER_PRIMARY_ADDR 0x00000006 +#define SCTP_PRIMARY_ADDR 0x00000007 +#define SCTP_ADAPTATION_LAYER 0x00000008 +/* same as above */ +#define SCTP_ADAPTION_LAYER 0x00000008 +#define SCTP_DISABLE_FRAGMENTS 0x00000009 +#define SCTP_PEER_ADDR_PARAMS 0x0000000a +#define SCTP_DEFAULT_SEND_PARAM 0x0000000b +/* ancillary data/notification interest options */ +#define SCTP_EVENTS 0x0000000c /* deprecated */ +/* Without this applied we will give V4 and V6 addresses on a V6 socket */ +#define SCTP_I_WANT_MAPPED_V4_ADDR 0x0000000d +#define SCTP_MAXSEG 0x0000000e +#define SCTP_DELAYED_SACK 0x0000000f +#define SCTP_FRAGMENT_INTERLEAVE 0x00000010 +#define SCTP_PARTIAL_DELIVERY_POINT 0x00000011 +/* authentication support */ +#define SCTP_AUTH_CHUNK 0x00000012 +#define SCTP_AUTH_KEY 0x00000013 +#define SCTP_HMAC_IDENT 0x00000014 +#define SCTP_AUTH_ACTIVE_KEY 0x00000015 +#define SCTP_AUTH_DELETE_KEY 0x00000016 +#define SCTP_USE_EXT_RCVINFO 0x00000017 +#define SCTP_AUTO_ASCONF 0x00000018 /* rw */ +#define SCTP_MAXBURST 0x00000019 /* rw */ +#define SCTP_MAX_BURST 0x00000019 /* rw */ +/* assoc 
level context */ +#define SCTP_CONTEXT 0x0000001a /* rw */ +/* explicit EOR signalling */ +#define SCTP_EXPLICIT_EOR 0x0000001b +#define SCTP_REUSE_PORT 0x0000001c /* rw */ +#define SCTP_AUTH_DEACTIVATE_KEY 0x0000001d +#define SCTP_EVENT 0x0000001e +#define SCTP_RECVRCVINFO 0x0000001f +#define SCTP_RECVNXTINFO 0x00000020 +#define SCTP_DEFAULT_SNDINFO 0x00000021 +#define SCTP_DEFAULT_PRINFO 0x00000022 +#define SCTP_PEER_ADDR_THLDS 0x00000023 +#define SCTP_REMOTE_UDP_ENCAPS_PORT 0x00000024 +#define SCTP_ECN_SUPPORTED 0x00000025 +#define SCTP_PR_SUPPORTED 0x00000026 +#define SCTP_AUTH_SUPPORTED 0x00000027 +#define SCTP_ASCONF_SUPPORTED 0x00000028 +#define SCTP_RECONFIG_SUPPORTED 0x00000029 +#define SCTP_NRSACK_SUPPORTED 0x00000030 +#define SCTP_PKTDROP_SUPPORTED 0x00000031 +#define SCTP_MAX_CWND 0x00000032 + +/* + * read-only options + */ +#define SCTP_STATUS 0x00000100 +#define SCTP_GET_PEER_ADDR_INFO 0x00000101 +/* authentication support */ +#define SCTP_PEER_AUTH_CHUNKS 0x00000102 +#define SCTP_LOCAL_AUTH_CHUNKS 0x00000103 +#define SCTP_GET_ASSOC_NUMBER 0x00000104 /* ro */ +#define SCTP_GET_ASSOC_ID_LIST 0x00000105 /* ro */ +#define SCTP_TIMEOUTS 0x00000106 +#define SCTP_PR_STREAM_STATUS 0x00000107 +#define SCTP_PR_ASSOC_STATUS 0x00000108 + +/* + * user socket options: BSD implementation specific + */ +/* + * Blocking I/O is enabled on any TCP type socket by default. For the UDP + * model if this is turned on then the socket buffer is shared for send + * resources amongst all associations. The default for the UDP model is that + * is SS_NBIO is set. Which means all associations have a separate send + * limit BUT they will NOT ever BLOCK instead you will get an error back + * EAGAIN if you try to send too much. If you want the blocking semantics you + * set this option at the cost of sharing one socket send buffer size amongst + * all associations. Peeled off sockets turn this option off and block. 
But + * since both TCP and peeled off sockets have only one assoc per socket this + * is fine. It probably does NOT make sense to set this on SS_NBIO on a TCP + * model OR peeled off UDP model, but we do allow you to do so. You just use + * the normal syscall to toggle SS_NBIO the way you want. + * + * Blocking I/O is controlled by the SS_NBIO flag on the socket state so_state + * field. + */ + +#define SCTP_ENABLE_STREAM_RESET 0x00000900 /* struct sctp_assoc_value */ +#define SCTP_RESET_STREAMS 0x00000901 /* struct sctp_reset_streams */ +#define SCTP_RESET_ASSOC 0x00000902 /* sctp_assoc_t */ +#define SCTP_ADD_STREAMS 0x00000903 /* struct sctp_add_streams */ + +/* For enable stream reset */ +#define SCTP_ENABLE_RESET_STREAM_REQ 0x00000001 +#define SCTP_ENABLE_RESET_ASSOC_REQ 0x00000002 +#define SCTP_ENABLE_CHANGE_ASSOC_REQ 0x00000004 +#define SCTP_ENABLE_VALUE_MASK 0x00000007 +/* For reset streams */ +#define SCTP_STREAM_RESET_INCOMING 0x00000001 +#define SCTP_STREAM_RESET_OUTGOING 0x00000002 + +/* here on down are more implementation specific */ +#define SCTP_SET_DEBUG_LEVEL 0x00001005 +#define SCTP_CLR_STAT_LOG 0x00001007 +/* CMT ON/OFF socket option */ +#define SCTP_CMT_ON_OFF 0x00001200 +#define SCTP_CMT_USE_DAC 0x00001201 +/* JRS - Pluggable Congestion Control Socket option */ +#define SCTP_PLUGGABLE_CC 0x00001202 +/* RS - Pluggable Stream Scheduling Socket option */ +#define SCTP_PLUGGABLE_SS 0x00001203 +#define SCTP_SS_VALUE 0x00001204 +#define SCTP_CC_OPTION 0x00001205 /* Options for CC modules */ +/* For I-DATA */ +#define SCTP_INTERLEAVING_SUPPORTED 0x00001206 + +/* read only */ +#define SCTP_GET_SNDBUF_USE 0x00001101 +#define SCTP_GET_STAT_LOG 0x00001103 +#define SCTP_PCB_STATUS 0x00001104 +#define SCTP_GET_NONCE_VALUES 0x00001105 + +/* Special hook for dynamically setting primary for all assoc's, + * this is a write only option that requires root privilege. 
+ */ +#define SCTP_SET_DYNAMIC_PRIMARY 0x00002001 + +/* VRF (virtual router feature) and multi-VRF support + * options. VRF's provide splits within a router + * that give the views of multiple routers. A + * standard host, without VRF support, is just + * a single VRF. If VRF's are supported then + * the transport must be VRF aware. This means + * that every socket call coming in must be directed + * within the endpoint to one of the VRF's it belongs + * to. The endpoint, before binding, may select + * the "default" VRF it is in by using a set socket + * option with SCTP_VRF_ID. This will also + * get propagated to the default VRF. Once the + * endpoint binds an address then it CANNOT add + * additional VRF's to become a Multi-VRF endpoint. + * + * Before BINDING additional VRF's can be added with + * the SCTP_ADD_VRF_ID call or deleted with + * SCTP_DEL_VRF_ID. + * + * Associations are ALWAYS contained inside a single + * VRF. They cannot reside in two (or more) VRF's. Incoming + * packets, assuming the router is VRF aware, can always + * tell us what VRF they arrived on. A host not supporting + * any VRF's will find that the packets always arrived on the + * single VRF that the host has. + * + */ + +#define SCTP_VRF_ID 0x00003001 +#define SCTP_ADD_VRF_ID 0x00003002 +#define SCTP_GET_VRF_IDS 0x00003003 +#define SCTP_GET_ASOC_VRF 0x00003004 +#define SCTP_DEL_VRF_ID 0x00003005 + +/* + * If you enable packet logging you can get + * a poor mans ethereal output in binary + * form. 
Note this is a compile option to + * the kernel, SCTP_PACKET_LOGGING, and + * without it in your kernel you + * will get a EOPNOTSUPP + */ +#define SCTP_GET_PACKET_LOG 0x00004001 + +/* + * hidden implementation specific options these are NOT user visible (should + * move out of sctp.h) + */ +/* sctp_bindx() flags as hidden socket options */ +#define SCTP_BINDX_ADD_ADDR 0x00008001 +#define SCTP_BINDX_REM_ADDR 0x00008002 +/* Hidden socket option that gets the addresses */ +#define SCTP_GET_PEER_ADDRESSES 0x00008003 +#define SCTP_GET_LOCAL_ADDRESSES 0x00008004 +/* return the total count in bytes needed to hold all local addresses bound */ +#define SCTP_GET_LOCAL_ADDR_SIZE 0x00008005 +/* Return the total count in bytes needed to hold the remote address */ +#define SCTP_GET_REMOTE_ADDR_SIZE 0x00008006 +/* hidden option for connectx */ +#define SCTP_CONNECT_X 0x00008007 +/* hidden option for connectx_delayed, part of sendx */ +#define SCTP_CONNECT_X_DELAYED 0x00008008 +#define SCTP_CONNECT_X_COMPLETE 0x00008009 +/* hidden socket option based sctp_peeloff */ +#define SCTP_PEELOFF 0x0000800a +/* the real worker for sctp_getaddrlen() */ +#define SCTP_GET_ADDR_LEN 0x0000800b +#if defined(__APPLE__) && !defined(__Userspace__) +/* temporary workaround for Apple listen() issue, no args used */ +#define SCTP_LISTEN_FIX 0x0000800c +#endif +#if defined(_WIN32) && !defined(__Userspace__) +/* workaround for Cygwin on Windows: returns the SOCKET handle */ +#define SCTP_GET_HANDLE 0x0000800d +#endif +/* Debug things that need to be purged */ +#define SCTP_SET_INITIAL_DBG_SEQ 0x00009f00 + +/* JRS - Supported congestion control modules for pluggable + * congestion control + */ +/* Standard TCP Congestion Control */ +#define SCTP_CC_RFC2581 0x00000000 +/* High Speed TCP Congestion Control (Floyd) */ +#define SCTP_CC_HSTCP 0x00000001 +/* HTCP Congestion Control */ +#define SCTP_CC_HTCP 0x00000002 +/* RTCC Congestion Control - RFC2581 plus */ +#define SCTP_CC_RTCC 0x00000003 + +#define 
SCTP_CC_OPT_RTCC_SETMODE 0x00002000 +#define SCTP_CC_OPT_USE_DCCC_ECN 0x00002001 +#define SCTP_CC_OPT_STEADY_STEP 0x00002002 + +#define SCTP_CMT_OFF 0 +#define SCTP_CMT_BASE 1 +#define SCTP_CMT_RPV1 2 +#define SCTP_CMT_RPV2 3 +#define SCTP_CMT_MPTCP 4 +#define SCTP_CMT_MAX SCTP_CMT_MPTCP + +/* RS - Supported stream scheduling modules for pluggable + * stream scheduling + */ +/* Default simple round-robin */ +#define SCTP_SS_DEFAULT 0x00000000 +/* Real round-robin */ +#define SCTP_SS_ROUND_ROBIN 0x00000001 +/* Real round-robin per packet */ +#define SCTP_SS_ROUND_ROBIN_PACKET 0x00000002 +/* Priority */ +#define SCTP_SS_PRIORITY 0x00000003 +/* Fair Bandwidth */ +#define SCTP_SS_FAIR_BANDWIDTH 0x00000004 +/* First-come, first-serve */ +#define SCTP_SS_FIRST_COME 0x00000005 + +/* fragment interleave constants + * setting must be one of these or + * EINVAL returned. + */ +#define SCTP_FRAG_LEVEL_0 0x00000000 +#define SCTP_FRAG_LEVEL_1 0x00000001 +#define SCTP_FRAG_LEVEL_2 0x00000002 + +/* + * user state values + */ +#define SCTP_CLOSED 0x0000 +#define SCTP_BOUND 0x1000 +#define SCTP_LISTEN 0x2000 +#define SCTP_COOKIE_WAIT 0x0002 +#define SCTP_COOKIE_ECHOED 0x0004 +#define SCTP_ESTABLISHED 0x0008 +#define SCTP_SHUTDOWN_SENT 0x0010 +#define SCTP_SHUTDOWN_RECEIVED 0x0020 +#define SCTP_SHUTDOWN_ACK_SENT 0x0040 +#define SCTP_SHUTDOWN_PENDING 0x0080 + +/* + * SCTP operational error codes (user visible) + */ +#define SCTP_CAUSE_NO_ERROR 0x0000 +#define SCTP_CAUSE_INVALID_STREAM 0x0001 +#define SCTP_CAUSE_MISSING_PARAM 0x0002 +#define SCTP_CAUSE_STALE_COOKIE 0x0003 +#define SCTP_CAUSE_OUT_OF_RESC 0x0004 +#define SCTP_CAUSE_UNRESOLVABLE_ADDR 0x0005 +#define SCTP_CAUSE_UNRECOG_CHUNK 0x0006 +#define SCTP_CAUSE_INVALID_PARAM 0x0007 +#define SCTP_CAUSE_UNRECOG_PARAM 0x0008 +#define SCTP_CAUSE_NO_USER_DATA 0x0009 +#define SCTP_CAUSE_COOKIE_IN_SHUTDOWN 0x000a +#define SCTP_CAUSE_RESTART_W_NEWADDR 0x000b +#define SCTP_CAUSE_USER_INITIATED_ABT 0x000c +#define 
SCTP_CAUSE_PROTOCOL_VIOLATION 0x000d + +/* Error causes from RFC5061 */ +#define SCTP_CAUSE_DELETING_LAST_ADDR 0x00a0 +#define SCTP_CAUSE_RESOURCE_SHORTAGE 0x00a1 +#define SCTP_CAUSE_DELETING_SRC_ADDR 0x00a2 +#define SCTP_CAUSE_ILLEGAL_ASCONF_ACK 0x00a3 +#define SCTP_CAUSE_REQUEST_REFUSED 0x00a4 + +/* Error causes from nat-draft */ +#define SCTP_CAUSE_NAT_COLLIDING_STATE 0x00b0 +#define SCTP_CAUSE_NAT_MISSING_STATE 0x00b1 + +/* Error causes from RFC4895 */ +#define SCTP_CAUSE_UNSUPPORTED_HMACID 0x0105 + +/* + * error cause parameters (user visible) + */ +struct sctp_gen_error_cause { + uint16_t code; + uint16_t length; + uint8_t info[]; +} SCTP_PACKED; + +struct sctp_error_cause { + uint16_t code; + uint16_t length; + /* optional cause-specific info may follow */ +} SCTP_PACKED; + +struct sctp_error_invalid_stream { + struct sctp_error_cause cause; /* code=SCTP_CAUSE_INVALID_STREAM */ + uint16_t stream_id; /* stream id of the DATA in error */ + uint16_t reserved; +} SCTP_PACKED; + +struct sctp_error_missing_param { + struct sctp_error_cause cause; /* code=SCTP_CAUSE_MISSING_PARAM */ + uint32_t num_missing_params; /* number of missing parameters */ + uint16_t type[]; +} SCTP_PACKED; + +struct sctp_error_stale_cookie { + struct sctp_error_cause cause; /* code=SCTP_CAUSE_STALE_COOKIE */ + uint32_t stale_time; /* time in usec of staleness */ +} SCTP_PACKED; + +struct sctp_error_out_of_resource { + struct sctp_error_cause cause; /* code=SCTP_CAUSE_OUT_OF_RESOURCES */ +} SCTP_PACKED; + +struct sctp_error_unresolv_addr { + struct sctp_error_cause cause; /* code=SCTP_CAUSE_UNRESOLVABLE_ADDR */ +} SCTP_PACKED; + +struct sctp_error_unrecognized_chunk { + struct sctp_error_cause cause; /* code=SCTP_CAUSE_UNRECOG_CHUNK */ + struct sctp_chunkhdr ch;/* header from chunk in error */ +} SCTP_PACKED; + +struct sctp_error_no_user_data { + struct sctp_error_cause cause; /* code=SCTP_CAUSE_NO_USER_DATA */ + uint32_t tsn; /* TSN of the empty data chunk */ +} SCTP_PACKED; + +struct 
sctp_error_auth_invalid_hmac { + struct sctp_error_cause cause; /* code=SCTP_CAUSE_UNSUPPORTED_HMACID */ + uint16_t hmac_id; +} SCTP_PACKED; + +/* + * Main SCTP chunk types we place these here so natd and f/w's in user land + * can find them. + */ +/************0x00 series ***********/ +#define SCTP_DATA 0x00 +#define SCTP_INITIATION 0x01 +#define SCTP_INITIATION_ACK 0x02 +#define SCTP_SELECTIVE_ACK 0x03 +#define SCTP_HEARTBEAT_REQUEST 0x04 +#define SCTP_HEARTBEAT_ACK 0x05 +#define SCTP_ABORT_ASSOCIATION 0x06 +#define SCTP_SHUTDOWN 0x07 +#define SCTP_SHUTDOWN_ACK 0x08 +#define SCTP_OPERATION_ERROR 0x09 +#define SCTP_COOKIE_ECHO 0x0a +#define SCTP_COOKIE_ACK 0x0b +#define SCTP_ECN_ECHO 0x0c +#define SCTP_ECN_CWR 0x0d +#define SCTP_SHUTDOWN_COMPLETE 0x0e +/* RFC4895 */ +#define SCTP_AUTHENTICATION 0x0f +/* EY nr_sack chunk id*/ +#define SCTP_NR_SELECTIVE_ACK 0x10 +/************0x40 series ***********/ +#define SCTP_IDATA 0x40 +/************0x80 series ***********/ +/* RFC5061 */ +#define SCTP_ASCONF_ACK 0x80 +/* draft-ietf-stewart-pktdrpsctp */ +#define SCTP_PACKET_DROPPED 0x81 +/* draft-ietf-stewart-strreset-xxx */ +#define SCTP_STREAM_RESET 0x82 + +/* RFC4820 */ +#define SCTP_PAD_CHUNK 0x84 +/************0xc0 series ***********/ +/* RFC3758 */ +#define SCTP_FORWARD_CUM_TSN 0xc0 +/* RFC5061 */ +#define SCTP_ASCONF 0xc1 +#define SCTP_IFORWARD_CUM_TSN 0xc2 + +/* ABORT and SHUTDOWN COMPLETE FLAG */ +#define SCTP_HAD_NO_TCB 0x01 + +/* Packet dropped flags */ +#define SCTP_FROM_MIDDLE_BOX SCTP_HAD_NO_TCB +#define SCTP_BADCRC 0x02 +#define SCTP_PACKET_TRUNCATED 0x04 + +/* Flag for ECN -CWR */ +#define SCTP_CWR_REDUCE_OVERRIDE 0x01 +#define SCTP_CWR_IN_SAME_WINDOW 0x02 + +#define SCTP_SAT_NETWORK_MIN 400 /* min ms for RTT to set satellite + * time */ +#define SCTP_SAT_NETWORK_BURST_INCR 2 /* how many times to multiply maxburst + * in sat */ + +/* Data Chunk Specific Flags */ +#define SCTP_DATA_FRAG_MASK 0x03 +#define SCTP_DATA_MIDDLE_FRAG 0x00 +#define SCTP_DATA_LAST_FRAG 
0x01 +#define SCTP_DATA_FIRST_FRAG 0x02 +#define SCTP_DATA_NOT_FRAG 0x03 +#define SCTP_DATA_UNORDERED 0x04 +#define SCTP_DATA_SACK_IMMEDIATELY 0x08 +/* ECN Nonce: SACK Chunk Specific Flags */ +#define SCTP_SACK_NONCE_SUM 0x01 + +/* CMT DAC algorithm SACK flag */ +#define SCTP_SACK_CMT_DAC 0x80 + +/* + * PCB flags (in sctp_flags bitmask). + * Note the features and flags are meant + * for use by netstat. + */ +#define SCTP_PCB_FLAGS_UDPTYPE 0x00000001 +#define SCTP_PCB_FLAGS_TCPTYPE 0x00000002 +#define SCTP_PCB_FLAGS_BOUNDALL 0x00000004 +#define SCTP_PCB_FLAGS_ACCEPTING 0x00000008 +#define SCTP_PCB_FLAGS_UNBOUND 0x00000010 +#define SCTP_PCB_FLAGS_SND_ITERATOR_UP 0x00000020 +#define SCTP_PCB_FLAGS_CLOSE_IP 0x00040000 +#define SCTP_PCB_FLAGS_WAS_CONNECTED 0x00080000 +#define SCTP_PCB_FLAGS_WAS_ABORTED 0x00100000 +/* TCP model support */ + +#define SCTP_PCB_FLAGS_CONNECTED 0x00200000 +#define SCTP_PCB_FLAGS_IN_TCPPOOL 0x00400000 +#define SCTP_PCB_FLAGS_DONT_WAKE 0x00800000 +#define SCTP_PCB_FLAGS_WAKEOUTPUT 0x01000000 +#define SCTP_PCB_FLAGS_WAKEINPUT 0x02000000 +#define SCTP_PCB_FLAGS_BOUND_V6 0x04000000 +#define SCTP_PCB_FLAGS_BLOCKING_IO 0x08000000 +#define SCTP_PCB_FLAGS_SOCKET_GONE 0x10000000 +#define SCTP_PCB_FLAGS_SOCKET_ALLGONE 0x20000000 +#define SCTP_PCB_FLAGS_SOCKET_CANT_READ 0x40000000 +#if defined(__Userspace__) +#define SCTP_PCB_FLAGS_BOUND_CONN 0x80000000 + +/* flags to copy to new PCB */ +#define SCTP_PCB_COPY_FLAGS (SCTP_PCB_FLAGS_BOUNDALL|\ + SCTP_PCB_FLAGS_WAKEINPUT|\ + SCTP_PCB_FLAGS_BOUND_V6|\ + SCTP_PCB_FLAGS_BOUND_CONN) +#else + +/* flags to copy to new PCB */ +#define SCTP_PCB_COPY_FLAGS (SCTP_PCB_FLAGS_BOUNDALL|\ + SCTP_PCB_FLAGS_WAKEINPUT|\ + SCTP_PCB_FLAGS_BOUND_V6) +#endif + +/* + * PCB Features (in sctp_features bitmask) + */ +#define SCTP_PCB_FLAGS_DO_NOT_PMTUD 0x0000000000000001 +#define SCTP_PCB_FLAGS_EXT_RCVINFO 0x0000000000000002 /* deprecated */ +#define SCTP_PCB_FLAGS_DONOT_HEARTBEAT 0x0000000000000004 +#define 
SCTP_PCB_FLAGS_FRAG_INTERLEAVE 0x0000000000000008 +#define SCTP_PCB_FLAGS_INTERLEAVE_STRMS 0x0000000000000010 +#define SCTP_PCB_FLAGS_DO_ASCONF 0x0000000000000020 +#define SCTP_PCB_FLAGS_AUTO_ASCONF 0x0000000000000040 +/* socket options */ +#define SCTP_PCB_FLAGS_NODELAY 0x0000000000000100 +#define SCTP_PCB_FLAGS_AUTOCLOSE 0x0000000000000200 +#define SCTP_PCB_FLAGS_RECVDATAIOEVNT 0x0000000000000400 /* deprecated */ +#define SCTP_PCB_FLAGS_RECVASSOCEVNT 0x0000000000000800 +#define SCTP_PCB_FLAGS_RECVPADDREVNT 0x0000000000001000 +#define SCTP_PCB_FLAGS_RECVPEERERR 0x0000000000002000 +#define SCTP_PCB_FLAGS_RECVSENDFAILEVNT 0x0000000000004000 /* deprecated */ +#define SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT 0x0000000000008000 +#define SCTP_PCB_FLAGS_ADAPTATIONEVNT 0x0000000000010000 +#define SCTP_PCB_FLAGS_PDAPIEVNT 0x0000000000020000 +#define SCTP_PCB_FLAGS_AUTHEVNT 0x0000000000040000 +#define SCTP_PCB_FLAGS_STREAM_RESETEVNT 0x0000000000080000 +#define SCTP_PCB_FLAGS_NO_FRAGMENT 0x0000000000100000 +#define SCTP_PCB_FLAGS_EXPLICIT_EOR 0x0000000000400000 +#define SCTP_PCB_FLAGS_NEEDS_MAPPED_V4 0x0000000000800000 +#define SCTP_PCB_FLAGS_MULTIPLE_ASCONFS 0x0000000001000000 +#define SCTP_PCB_FLAGS_PORTREUSE 0x0000000002000000 +#define SCTP_PCB_FLAGS_DRYEVNT 0x0000000004000000 +#define SCTP_PCB_FLAGS_RECVRCVINFO 0x0000000008000000 +#define SCTP_PCB_FLAGS_RECVNXTINFO 0x0000000010000000 +#define SCTP_PCB_FLAGS_ASSOC_RESETEVNT 0x0000000020000000 +#define SCTP_PCB_FLAGS_STREAM_CHANGEEVNT 0x0000000040000000 +#define SCTP_PCB_FLAGS_RECVNSENDFAILEVNT 0x0000000080000000 + +/*- + * mobility_features parameters (by micchie).Note + * these features are applied against the + * sctp_mobility_features flags.. not the sctp_features + * flags. 
+ */ +#define SCTP_MOBILITY_BASE 0x00000001 +#define SCTP_MOBILITY_FASTHANDOFF 0x00000002 +#define SCTP_MOBILITY_PRIM_DELETED 0x00000004 + +/* Smallest PMTU allowed when disabling PMTU discovery */ +#define SCTP_SMALLEST_PMTU 512 +/* Largest PMTU allowed when disabling PMTU discovery */ +#define SCTP_LARGEST_PMTU 65536 + +#if defined(_WIN32) +#pragma pack(pop) +#endif +#undef SCTP_PACKED + +/* This dictates the size of the packet + * collection buffer. This only applies + * if SCTP_PACKET_LOGGING is enabled in + * your config. + */ +#define SCTP_PACKET_LOG_SIZE 65536 + +/* Maximum delays and such a user can set for options that + * take ms. + */ +#define SCTP_MAX_SACK_DELAY 500 /* per RFC4960 */ +#define SCTP_MAX_HB_INTERVAL 14400000 /* 4 hours in ms */ +#define SCTP_MIN_COOKIE_LIFE 1000 /* 1 second in ms */ +#define SCTP_MAX_COOKIE_LIFE 3600000 /* 1 hour in ms */ + +/* Types of logging/KTR tracing that can be enabled via the + * sysctl net.inet.sctp.sctp_logging. You must also enable + * SUBSYS tracing. + * Note that you must have the SCTP option in the kernel + * to enable these as well. 
+ */ +#define SCTP_BLK_LOGGING_ENABLE 0x00000001 +#define SCTP_CWND_MONITOR_ENABLE 0x00000002 +#define SCTP_CWND_LOGGING_ENABLE 0x00000004 +#define SCTP_FLIGHT_LOGGING_ENABLE 0x00000020 +#define SCTP_FR_LOGGING_ENABLE 0x00000040 +#define SCTP_LOCK_LOGGING_ENABLE 0x00000080 +#define SCTP_MAP_LOGGING_ENABLE 0x00000100 +#define SCTP_MBCNT_LOGGING_ENABLE 0x00000200 +#define SCTP_MBUF_LOGGING_ENABLE 0x00000400 +#define SCTP_NAGLE_LOGGING_ENABLE 0x00000800 +#define SCTP_RECV_RWND_LOGGING_ENABLE 0x00001000 +#define SCTP_RTTVAR_LOGGING_ENABLE 0x00002000 +#define SCTP_SACK_LOGGING_ENABLE 0x00004000 +#define SCTP_SACK_RWND_LOGGING_ENABLE 0x00008000 +#define SCTP_SB_LOGGING_ENABLE 0x00010000 +#define SCTP_STR_LOGGING_ENABLE 0x00020000 +#define SCTP_WAKE_LOGGING_ENABLE 0x00040000 +#define SCTP_LOG_MAXBURST_ENABLE 0x00080000 +#define SCTP_LOG_RWND_ENABLE 0x00100000 +#define SCTP_LOG_SACK_ARRIVALS_ENABLE 0x00200000 +#define SCTP_LTRACE_CHUNK_ENABLE 0x00400000 +#define SCTP_LTRACE_ERROR_ENABLE 0x00800000 +#define SCTP_LAST_PACKET_TRACING 0x01000000 +#define SCTP_THRESHOLD_LOGGING 0x02000000 +#define SCTP_LOG_AT_SEND_2_SCTP 0x04000000 +#define SCTP_LOG_AT_SEND_2_OUTQ 0x08000000 +#define SCTP_LOG_TRY_ADVANCE 0x10000000 + +#endif /* !_NETINET_SCTP_H_ */ diff --git a/src/iftraf.c b/src/iftraf.c index a03277402..5d36e6657 100644 --- a/src/iftraf.c +++ b/src/iftraf.c @@ -678,7 +678,8 @@ static int iftraf_pkt_deliver(int af, struct rte_mbuf *mbuf, struct netif_port * struct rte_ipv4_hdr *ip4h = ip4_hdr(mbuf); if (unlikely(ip4h->next_proto_id != IPPROTO_TCP && - ip4h->next_proto_id != IPPROTO_UDP)) { + ip4h->next_proto_id != IPPROTO_UDP && + ip4h->next_proto_id != IPPROTO_SCTP)) { RTE_LOG(DEBUG, IFTRAF, "%s: unspported proto[core: %d, proto: %d]\n", __func__, cid, ip4h->next_proto_id); @@ -740,7 +741,8 @@ static int iftraf_pkt_deliver(int af, struct rte_mbuf *mbuf, struct netif_port * uint8_t ip6nxt = ip6h->ip6_nxt; if (unlikely(ip6nxt != IPPROTO_TCP && - ip6nxt != IPPROTO_UDP)) { + 
ip6nxt != IPPROTO_UDP && + ip6nxt != IPPROTO_SCTP)) { RTE_LOG(DEBUG, IFTRAF, "%s: unspported proto[core: %d, proto: %d]\n", __func__, cid, ip6nxt); diff --git a/src/ipvs/ip_vs_conn.c b/src/ipvs/ip_vs_conn.c index e8deb0576..f40bfb470 100644 --- a/src/ipvs/ip_vs_conn.c +++ b/src/ipvs/ip_vs_conn.c @@ -30,6 +30,7 @@ #include "ipvs/synproxy.h" #include "ipvs/proto_tcp.h" #include "ipvs/proto_udp.h" +#include "ipvs/proto_sctp.h" #include "ipvs/proto_icmp.h" #include "parser/parser.h" #include "ctrl.h" @@ -517,7 +518,9 @@ void dp_vs_conn_set_timeout(struct dp_vs_conn *conn, struct dp_vs_proto *pp) /* set proper timeout */ if ((conn->proto == IPPROTO_TCP && conn->state == DPVS_TCP_S_ESTABLISHED) - || conn->proto == IPPROTO_UDP) { + || conn->proto == IPPROTO_UDP + || (conn->proto == IPPROTO_SCTP && + conn->state == DPVS_SCTP_S_ESTABLISHED)) { conn_timeout = dp_vs_conn_get_timeout(conn); if (conn_timeout > 0) { @@ -1308,6 +1311,39 @@ static inline char* get_conn_state_name(uint16_t proto, uint16_t state) break; } break; + case IPPROTO_SCTP: + switch (state) { + case DPVS_SCTP_S_NONE: + return "SCTP_NONE"; + case DPVS_SCTP_S_INIT1: + return "SCTP_INIT1"; + case DPVS_SCTP_S_INIT: + return "SCTP_INIT"; + case DPVS_SCTP_S_COOKIE_SENT: + return "SCTP_COOKIE_SENT"; + case DPVS_SCTP_S_COOKIE_REPLIED: + return "SCTP_COOKIE_REPLIED"; + case DPVS_SCTP_S_COOKIE_WAIT: + return "SCTP_COOKIE_WAIT"; + case DPVS_SCTP_S_COOKIE: + return "SCTP_COOKIE"; + case DPVS_SCTP_S_COOKIE_ECHOED: + return "SCTP_COOKIE_ECHOED"; + case DPVS_SCTP_S_ESTABLISHED: + return "SCTP_ESTABLISHED"; + case DPVS_SCTP_S_SHUTDOWN_SENT: + return "SCTP_SHUTDOWN_SENT"; + case DPVS_SCTP_S_SHUTDOWN_RECEIVED: + return "SCTP_SHUTDOWN_RECEIVED"; + case DPVS_SCTP_S_SHUTDOWN_ACK_SENT: + return "SCTP_SHUTDOWN_ACK_SENT"; + case DPVS_SCTP_S_REJECTED: + return "SCTP_REJECTED"; + case DPVS_SCTP_S_CLOSED: + return "SCTP_CLOSED"; + default: + return "SCTP_UNKNOWN"; + } case IPPROTO_ICMP: case IPPROTO_ICMPV6: switch (state) { diff 
--git a/src/ipvs/ip_vs_dest.c b/src/ipvs/ip_vs_dest.c index 8564c6ebe..a3a01fbe9 100644 --- a/src/ipvs/ip_vs_dest.c +++ b/src/ipvs/ip_vs_dest.c @@ -394,7 +394,7 @@ static void dest_inhibit_logging(const struct dp_vs_dest *dest, const char *msg) RTE_LOG(INFO, SERVICE, "[cid %02d, %s, svc %s:%d, rs %s:%d, weight %d, inhibited %s," " down_notice_recvd %d, inhibit_duration %ds, origin_weight %d] %s\n", cid, - dest->proto == IPPROTO_TCP ? "tcp" : "udp", + dest->proto == IPPROTO_TCP ? "tcp" : dest->proto == IPPROTO_UDP ? "udp" : "sctp", inet_ntop(dest->svc->af, &dest->svc->addr, str_vaddr, sizeof(str_vaddr)) ? str_vaddr : "::", ntohs(dest->svc->port), inet_ntop(dest->af, &dest->addr, str_daddr, sizeof(str_daddr)) ? str_daddr : "::", @@ -409,7 +409,7 @@ static void dest_inhibit_logging(const struct dp_vs_dest *dest, const char *msg) } else { RTE_LOG(DEBUG, SERVICE, "[cid %02d, %s, svc %s:%d, rs %s:%d, weight %d, inhibited %s, warm_up_count %d] %s\n", cid, - dest->proto == IPPROTO_TCP ? "tcp" : "udp", + dest->proto == IPPROTO_TCP ? "tcp" : dest->proto == IPPROTO_UDP ? "udp" : "sctp", inet_ntop(dest->svc->af, &dest->svc->addr, str_vaddr, sizeof(str_vaddr)) ? str_vaddr : "::", ntohs(dest->svc->port), inet_ntop(dest->af, &dest->addr, str_daddr, sizeof(str_daddr)) ? 
str_daddr : "::", diff --git a/src/ipvs/ip_vs_laddr.c b/src/ipvs/ip_vs_laddr.c index 4f15005a9..fa04e4258 100644 --- a/src/ipvs/ip_vs_laddr.c +++ b/src/ipvs/ip_vs_laddr.c @@ -175,7 +175,8 @@ int dp_vs_laddr_bind(struct dp_vs_conn *conn, struct dp_vs_service *svc) if (!conn || !conn->dest || !svc) return EDPVS_INVAL; - if (svc->proto != IPPROTO_TCP && svc->proto != IPPROTO_UDP) + if (svc->proto != IPPROTO_TCP && svc->proto != IPPROTO_UDP && + svc->proto != IPPROTO_SCTP) return EDPVS_NOTSUPP; if (dp_vs_conn_is_template(conn)) return EDPVS_OK; diff --git a/src/ipvs/ip_vs_nat64.c b/src/ipvs/ip_vs_nat64.c index e9a827b60..b0ff237a6 100644 --- a/src/ipvs/ip_vs_nat64.c +++ b/src/ipvs/ip_vs_nat64.c @@ -34,6 +34,7 @@ int mbuf_6to4(struct rte_mbuf *mbuf, */ if (ip6h->ip6_nxt != IPPROTO_TCP && ip6h->ip6_nxt != IPPROTO_UDP && + ip6h->ip6_nxt != IPPROTO_SCTP && ip6h->ip6_nxt != IPPROTO_ICMPV6 && ip6h->ip6_nxt != IPPROTO_OPT) { return EDPVS_NOTSUPP; diff --git a/src/ipvs/ip_vs_proto.c b/src/ipvs/ip_vs_proto.c index bb5baa5ff..9d7f1e422 100644 --- a/src/ipvs/ip_vs_proto.c +++ b/src/ipvs/ip_vs_proto.c @@ -71,6 +71,7 @@ struct dp_vs_proto *dp_vs_proto_lookup(uint8_t proto) extern struct dp_vs_proto dp_vs_proto_udp; extern struct dp_vs_proto dp_vs_proto_tcp; +extern struct dp_vs_proto dp_vs_proto_sctp; extern struct dp_vs_proto dp_vs_proto_icmp; extern struct dp_vs_proto dp_vs_proto_icmp6; @@ -88,6 +89,11 @@ int dp_vs_proto_init(void) goto tcp_error; } + if ((err = proto_register(&dp_vs_proto_sctp)) != EDPVS_OK) { + RTE_LOG(ERR, IPVS, "%s: fail to register SCTP\n", __func__); + goto sctp_error; + } + if ((err = proto_register(&dp_vs_proto_icmp6)) != EDPVS_OK) { RTE_LOG(ERR, IPVS, "%s: fail to register ICMPV6\n", __func__); goto icmp6_error; @@ -103,6 +109,8 @@ int dp_vs_proto_init(void) icmp_error: proto_unregister(&dp_vs_proto_icmp6); icmp6_error: + proto_unregister(&dp_vs_proto_sctp); +sctp_error: proto_unregister(&dp_vs_proto_tcp); tcp_error: proto_unregister(&dp_vs_proto_udp); 
@@ -117,6 +125,9 @@ int dp_vs_proto_term(void) if (proto_unregister(&dp_vs_proto_icmp6) != EDPVS_OK) RTE_LOG(ERR, IPVS, "%s: fail to unregister ICMPV6\n", __func__); + if (proto_unregister(&dp_vs_proto_sctp) != EDPVS_OK) + RTE_LOG(ERR, IPVS, "%s: fail to unregister SCTP\n", __func__); + if (proto_unregister(&dp_vs_proto_tcp) != EDPVS_OK) RTE_LOG(ERR, IPVS, "%s: fail to unregister TCP\n", __func__); diff --git a/src/ipvs/ip_vs_proto_sctp.c b/src/ipvs/ip_vs_proto_sctp.c new file mode 100644 index 000000000..28b067ae1 --- /dev/null +++ b/src/ipvs/ip_vs_proto_sctp.c @@ -0,0 +1,558 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "conf/common.h" +#include "dpdk.h" +#include "mbuf.h" +#include "ipv6.h" +#include "route6.h" +#include "neigh.h" +#include "ipvs/ipvs.h" +#include "ipvs/proto.h" +#include "ipvs/proto_sctp.h" +#include "ipvs/conn.h" +#include "ipvs/service.h" +#include "ipvs/dest.h" +#include "ipvs/synproxy.h" +#include "ipvs/blklst.h" +#include "ipvs/whtlst.h" +#include "parser/parser.h" +#include "rte_hash_crc.h" + +/* + * Compute the SCTP checksum in network byte order for a given mbuf chain m + * which contains an SCTP packet starting at offset. + * Since this function is also called by ipfw, don't assume that + * it is compiled on a kernel with SCTP support. 
+ */ +static inline uint32_t sctp_calculate_cksum(struct rte_mbuf *mbuf, + int32_t offset) +{ + int len; + uint32_t _old, _new; + + len = mbuf->data_len; + + struct sctphdr *sh = + rte_pktmbuf_mtod_offset(mbuf, struct sctphdr *, offset); + + _old = sh->checksum; + + sh->checksum = 0; + + _new = ~rte_hash_crc(rte_pktmbuf_mtod_offset(mbuf, const void *, + offset), + len - offset, ~(uint32_t)0); + + sh->checksum = _old; + + return _new; +} + +static int sctp_csum_check(struct dp_vs_proto *proto, int af, + struct rte_mbuf *mbuf); + +static struct dp_vs_conn *sctp_conn_lookup(struct dp_vs_proto *proto, + const struct dp_vs_iphdr *iph, + struct rte_mbuf *mbuf, int *direct, + bool reverse, bool *drop, + lcoreid_t *peer_cid) +{ + struct sctphdr *sh, _sctph; + struct sctp_chunkhdr *sch, _schunkh; + struct dp_vs_conn *conn; + assert(proto && iph && mbuf); + + sh = mbuf_header_pointer(mbuf, iph->len, sizeof(_sctph), &_sctph); + if (unlikely(!sh)) + return NULL; + + sch = mbuf_header_pointer(mbuf, iph->len + sizeof(_sctph), + sizeof(_schunkh), &_schunkh); + if (unlikely(!sch)) + return NULL; + + if (dp_vs_blklst_lookup(iph->af, iph->proto, &iph->daddr, sh->dest_port, + &iph->saddr)) { + *drop = true; + return NULL; + } + + if (!dp_vs_whtlst_allow(iph->af, iph->proto, &iph->daddr, sh->dest_port, + &iph->saddr)) { + *drop = true; + return NULL; + } + + conn = dp_vs_conn_get(iph->af, iph->proto, &iph->saddr, &iph->daddr, + sh->src_port, sh->dest_port, direct, reverse); + + /* + * L2 confirm neighbour + * pkt in from client confirm neighbour to client + * pkt out from rs confirm neighbour to rs + */ + if (conn != NULL) { + if ((*direct == DPVS_CONN_DIR_INBOUND) && conn->out_dev && + (!inet_is_addr_any(tuplehash_in(conn).af, + &conn->out_nexthop))) { + neigh_confirm(tuplehash_in(conn).af, &conn->out_nexthop, + conn->out_dev); + } else if ((*direct == DPVS_CONN_DIR_OUTBOUND) && + conn->in_dev && + (!inet_is_addr_any(tuplehash_out(conn).af, + &conn->in_nexthop))) { + 
neigh_confirm(tuplehash_out(conn).af, &conn->in_nexthop, + conn->in_dev); + } + } else { + struct dp_vs_redirect *r; + + r = dp_vs_redirect_get(iph->af, iph->proto, &iph->saddr, + &iph->daddr, sh->src_port, + sh->dest_port); + if (r) { + *peer_cid = r->cid; + } + } + + return conn; +} + +static int sctp_conn_schedule(struct dp_vs_proto *proto, + const struct dp_vs_iphdr *iph, + struct rte_mbuf *mbuf, struct dp_vs_conn **conn, + int *verdict) +{ + struct sctphdr *sh, _sctph; + struct dp_vs_service *svc; + + assert(proto && iph && mbuf && conn && verdict); + + sh = mbuf_header_pointer(mbuf, iph->len, sizeof(_sctph), &_sctph); + if (unlikely(!sh)) { + *verdict = INET_DROP; + return EDPVS_INVPKT; + } + + svc = dp_vs_service_lookup(iph->af, iph->proto, &iph->daddr, + sh->dest_port, 0, mbuf, NULL, rte_lcore_id()); + if (!svc) { + *verdict = INET_ACCEPT; + return EDPVS_NOSERV; + } + + *conn = dp_vs_schedule(svc, iph, mbuf, false); + if (!*conn) { + *verdict = INET_DROP; + return EDPVS_RESOURCE; + } + + return EDPVS_OK; +} + +static void sctp_nat_csum(struct rte_mbuf *mbuf, struct sctphdr *sctph, + unsigned int offset) +{ + sctph->checksum = sctp_calculate_cksum(mbuf, offset); +} + +static int sctp_fnat_in_handler(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, struct rte_mbuf *mbuf) +{ + struct sctphdr *sh; + + /* af/mbuf may be changed for nat64 which in af is ipv6 and out is ipv4 */ + int af = tuplehash_out(conn).af; + int iphdrlen = ((AF_INET6 == af) ? 
ip6_hdrlen(mbuf) : ip4_hdrlen(mbuf)); + + if (mbuf_may_pull(mbuf, iphdrlen + sizeof(*sh)) != 0) + return EDPVS_INVPKT; + + sh = rte_pktmbuf_mtod_offset(mbuf, struct sctphdr *, iphdrlen); + + /* Some checks before mangling */ + if (sctp_csum_check(proto, af, mbuf)) + return EDPVS_INVAL; + + /* L4 translation */ + sh->src_port = conn->lport; + sh->dest_port = conn->dport; + + sctp_nat_csum(mbuf, sh, iphdrlen); + + return EDPVS_OK; +} + +static int sctp_fnat_out_handler(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, struct rte_mbuf *mbuf) +{ + struct sctphdr *sh; + + /* af/mbuf may be changed for nat64 which in af is ipv6 and out is ipv4 */ + int af = tuplehash_in(conn).af; + int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf) : ip4_hdrlen(mbuf)); + + if (mbuf_may_pull(mbuf, iphdrlen + sizeof(*sh)) != 0) + return EDPVS_INVPKT; + + sh = rte_pktmbuf_mtod_offset(mbuf, struct sctphdr *, iphdrlen); + + /* Some checks before mangling */ + if (sctp_csum_check(proto, af, mbuf)) + return EDPVS_INVAL; + + /* L4 translation */ + sh->src_port = conn->vport; + sh->dest_port = conn->cport; + + sctp_nat_csum(mbuf, sh, iphdrlen); + + return EDPVS_OK; +} + +static int sctp_nat_in_handler(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, struct rte_mbuf *mbuf) +{ + struct sctphdr *sh; + int af = conn->af; + int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf) : ip4_hdrlen(mbuf)); + + if (mbuf_may_pull(mbuf, iphdrlen + sizeof(*sh)) != 0) + return EDPVS_INVPKT; + + sh = rte_pktmbuf_mtod_offset(mbuf, struct sctphdr *, iphdrlen); + + /* Some checks before mangling */ + if (sctp_csum_check(proto, af, mbuf)) + return EDPVS_INVAL; + + /* Only update csum if we really have to */ + sh->dest_port = conn->dport; + sctp_nat_csum(mbuf, sh, iphdrlen); + + return EDPVS_OK; +} + +static int sctp_nat_out_handler(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, struct rte_mbuf *mbuf) +{ + struct sctphdr *sh; + int af = conn->af; + int iphdrlen = ((AF_INET6 == af) ? 
ip6_hdrlen(mbuf) : ip4_hdrlen(mbuf)); + + if (mbuf_may_pull(mbuf, iphdrlen + sizeof(*sh)) != 0) + return EDPVS_INVPKT; + + sh = rte_pktmbuf_mtod_offset(mbuf, struct sctphdr *, iphdrlen); + + /* Some checks before mangling */ + if (sctp_csum_check(proto, af, mbuf)) + return EDPVS_INVAL; + + /* Only update csum if we really have to */ + sh->src_port = conn->vport; + sctp_nat_csum(mbuf, sh, iphdrlen); + + return EDPVS_OK; +} + +static int sctp_csum_check(struct dp_vs_proto *proto, int af, + struct rte_mbuf *mbuf) +{ + struct sctphdr *sh; + uint32_t cmp, val; + int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf) : ip4_hdrlen(mbuf)); + + sh = rte_pktmbuf_mtod_offset(mbuf, struct sctphdr *, iphdrlen); + cmp = sh->checksum; + val = sctp_calculate_cksum(mbuf, iphdrlen); + + if (val != cmp) { + /* CRC failure, dump it. */ + RTE_LOG(WARNING, IPVS, "Failed checksum for %d %s %p!\n", af, + proto->name, mbuf); + return EDPVS_INVAL; + } + return EDPVS_OK; +} + +/* RFC 2960, 3.2 Chunk Field Descriptions */ +static __u8 sctp_events[] = { + [SCTP_DATA] = DPVS_SCTP_DATA, + [SCTP_INITIATION] = DPVS_SCTP_INIT, + [SCTP_INITIATION_ACK] = DPVS_SCTP_INIT_ACK, + [SCTP_SELECTIVE_ACK] = DPVS_SCTP_DATA, + [SCTP_HEARTBEAT_REQUEST] = DPVS_SCTP_DATA, + [SCTP_HEARTBEAT_ACK] = DPVS_SCTP_DATA, + [SCTP_ABORT_ASSOCIATION] = DPVS_SCTP_ABORT, + [SCTP_SHUTDOWN] = DPVS_SCTP_SHUTDOWN, + [SCTP_SHUTDOWN_ACK] = DPVS_SCTP_SHUTDOWN_ACK, + [SCTP_OPERATION_ERROR] = DPVS_SCTP_ERROR, + [SCTP_COOKIE_ECHO] = DPVS_SCTP_COOKIE_ECHO, + [SCTP_COOKIE_ACK] = DPVS_SCTP_COOKIE_ACK, + [SCTP_ECN_ECHO] = DPVS_SCTP_DATA, + [SCTP_ECN_CWR] = DPVS_SCTP_DATA, + [SCTP_SHUTDOWN_COMPLETE] = DPVS_SCTP_SHUTDOWN_COMPLETE, +}; + +/* SCTP States: + * See RFC 2960, 4. 
SCTP Association State Diagram + * + * New states (not in diagram): + * - INIT1 state: use shorter timeout for dropped INIT packets + * - REJECTED state: use shorter timeout if INIT is rejected with ABORT + * - INIT, COOKIE_SENT, COOKIE_REPLIED, COOKIE states: for better debugging + * + * The states are as seen in real server. In the diagram, INIT1, INIT, + * COOKIE_SENT and COOKIE_REPLIED processing happens in CLOSED state. + * + * States as per packets from client (C) and server (S): + * + * Setup of client connection: + * DPVS_SCTP_S_INIT1: First C:INIT sent, wait for S:INIT-ACK + * DPVS_SCTP_S_INIT: Next C:INIT sent, wait for S:INIT-ACK + * DPVS_SCTP_S_COOKIE_SENT: S:INIT-ACK sent, wait for C:COOKIE-ECHO + * DPVS_SCTP_S_COOKIE_REPLIED: C:COOKIE-ECHO sent, wait for S:COOKIE-ACK + * + * Setup of server connection: + * DPVS_SCTP_S_COOKIE_WAIT: S:INIT sent, wait for C:INIT-ACK + * DPVS_SCTP_S_COOKIE: C:INIT-ACK sent, wait for S:COOKIE-ECHO + * DPVS_SCTP_S_COOKIE_ECHOED: S:COOKIE-ECHO sent, wait for C:COOKIE-ACK + */ + +#define sNO DPVS_SCTP_S_NONE +#define sI1 DPVS_SCTP_S_INIT1 +#define sIN DPVS_SCTP_S_INIT +#define sCS DPVS_SCTP_S_COOKIE_SENT +#define sCR DPVS_SCTP_S_COOKIE_REPLIED +#define sCW DPVS_SCTP_S_COOKIE_WAIT +#define sCO DPVS_SCTP_S_COOKIE +#define sCE DPVS_SCTP_S_COOKIE_ECHOED +#define sES DPVS_SCTP_S_ESTABLISHED +#define sSS DPVS_SCTP_S_SHUTDOWN_SENT +#define sSR DPVS_SCTP_S_SHUTDOWN_RECEIVED +#define sSA DPVS_SCTP_S_SHUTDOWN_ACK_SENT +#define sRJ DPVS_SCTP_S_REJECTED +#define sCL DPVS_SCTP_S_CLOSED + +static const __u8 sctp_states[DPVS_DIR_LAST][DPVS_SCTP_EVENT_LAST][DPVS_SCTP_S_LAST] = { + { /* INPUT */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */ { sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL }, +/* i */ { sI1, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN }, +/* i_a */ { sCW, sCW, sCW, sCS, sCR, sCO, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL }, +/* c_e */ { sCR, sIN, sIN, 
sCR, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL }, +/* c_a */ { sES, sI1, sIN, sCS, sCR, sCW, sCO, sES, sES, sSS, sSR, sSA, sRJ, sCL }, +/* s */ { sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL }, +/* s_a */ { sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sCL, sSR, sCL, sRJ, sCL }, +/* s_c */ { sCL, sCL, sCL, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sCL, sRJ, sCL }, +/* err */ { sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCL, sES, sSS, sSR, sSA, sRJ, sCL }, +/* ab */ { sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, + }, + { /* OUTPUT */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */ { sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL }, +/* i */ { sCW, sCW, sCW, sCW, sCW, sCW, sCW, sCW, sES, sCW, sCW, sCW, sCW, sCW }, +/* i_a */ { sCS, sCS, sCS, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL }, +/* c_e */ { sCE, sCE, sCE, sCE, sCE, sCE, sCE, sCE, sES, sSS, sSR, sSA, sRJ, sCL }, +/* c_a */ { sES, sES, sES, sES, sES, sES, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL }, +/* s */ { sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSR, sSA, sRJ, sCL }, +/* s_a */ { sSA, sSA, sSA, sSA, sSA, sCW, sCO, sCE, sES, sSA, sSA, sSA, sRJ, sCL }, +/* s_c */ { sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL }, +/* err */ { sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL }, +/* ab */ { sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, + }, + { /* INPUT-ONLY */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */ { sES, sI1, sIN, sCS, sCR, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL }, +/* i */ { sI1, sIN, sIN, sIN, sIN, sIN, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN }, +/* i_a */ { sCE, sCE, sCE, sCE, sCE, sCE, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL }, +/* c_e */ { sES, sES, sES, sES, sES, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL }, +/* c_a */ { sES, sI1, sIN, sES, sES, sCW, sES, sES, sES, 
sSS, sSR, sSA, sRJ, sCL }, +/* s */ { sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL }, +/* s_a */ { sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sCL, sCL, sSR, sCL, sRJ, sCL }, +/* s_c */ { sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sCL, sCL, sRJ, sCL }, +/* err */ { sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL }, +/* ab */ { sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, + }, +}; + +#define DPVS_SCTP_MAX_RTO (60 + 1) + +/* Timeout table[state] */ +static int sctp_timeouts[DPVS_SCTP_S_LAST + 1] = { + [DPVS_SCTP_S_NONE] = 2, + [DPVS_SCTP_S_INIT1] = (0 + 3 + 1), + [DPVS_SCTP_S_INIT] = DPVS_SCTP_MAX_RTO, + [DPVS_SCTP_S_COOKIE_SENT] = DPVS_SCTP_MAX_RTO, + [DPVS_SCTP_S_COOKIE_REPLIED] = DPVS_SCTP_MAX_RTO, + [DPVS_SCTP_S_COOKIE_WAIT] = DPVS_SCTP_MAX_RTO, + [DPVS_SCTP_S_COOKIE] = DPVS_SCTP_MAX_RTO, + [DPVS_SCTP_S_COOKIE_ECHOED] = DPVS_SCTP_MAX_RTO, + [DPVS_SCTP_S_ESTABLISHED] = 15 * 60, + [DPVS_SCTP_S_SHUTDOWN_SENT] = DPVS_SCTP_MAX_RTO, + [DPVS_SCTP_S_SHUTDOWN_RECEIVED] = DPVS_SCTP_MAX_RTO, + [DPVS_SCTP_S_SHUTDOWN_ACK_SENT] = DPVS_SCTP_MAX_RTO, + [DPVS_SCTP_S_REJECTED] = (0 + 3 + 1), + [DPVS_SCTP_S_CLOSED] = DPVS_SCTP_MAX_RTO, + [DPVS_SCTP_S_LAST] = 2, +}; + +static const char *sctp_state_name_table[DPVS_SCTP_S_LAST + 1] = { + [DPVS_SCTP_S_NONE] = "NONE", + [DPVS_SCTP_S_INIT1] = "INIT1", + [DPVS_SCTP_S_INIT] = "INIT", + [DPVS_SCTP_S_COOKIE_SENT] = "C-SENT", + [DPVS_SCTP_S_COOKIE_REPLIED] = "C-REPLIED", + [DPVS_SCTP_S_COOKIE_WAIT] = "C-WAIT", + [DPVS_SCTP_S_COOKIE] = "COOKIE", + [DPVS_SCTP_S_COOKIE_ECHOED] = "C-ECHOED", + [DPVS_SCTP_S_ESTABLISHED] = "ESTABLISHED", + [DPVS_SCTP_S_SHUTDOWN_SENT] = "S-SENT", + [DPVS_SCTP_S_SHUTDOWN_RECEIVED] = "S-RECEIVED", + [DPVS_SCTP_S_SHUTDOWN_ACK_SENT] = "S-ACK-SENT", + [DPVS_SCTP_S_REJECTED] = "REJECTED", + [DPVS_SCTP_S_CLOSED] = "CLOSED", + [DPVS_SCTP_S_LAST] = "BUG!", +}; + +static const char *sctp_state_name(int state) +{ + if (state >= DPVS_SCTP_S_LAST) + 
return "ERR!"; + if (sctp_state_name_table[state]) + return sctp_state_name_table[state]; + return "?"; +} + +static int sctp_state_trans(struct dp_vs_proto *proto, struct dp_vs_conn *conn, + struct rte_mbuf *mbuf, int dir) +{ + struct sctp_chunkhdr _sctpch, *sch; + unsigned char chunk_type; + int event, next_state; + int iphdrlen, cofs; + assert(proto && conn && mbuf); + + iphdrlen = + ((AF_INET6 == conn->af) ? ip6_hdrlen(mbuf) : ip4_hdrlen(mbuf)); + + cofs = iphdrlen + sizeof(struct sctphdr); + sch = mbuf_header_pointer(mbuf, cofs, sizeof(_sctpch), &_sctpch); + if (!sch) + return EDPVS_INVPKT; + + chunk_type = sch->chunk_type; + /* + * Section 3: Multiple chunks can be bundled into one SCTP packet + * up to the MTU size, except for the INIT, INIT ACK, and + * SHUTDOWN COMPLETE chunks. These chunks MUST NOT be bundled with + * any other chunk in a packet. + * + * Section 3.3.7: DATA chunks MUST NOT be bundled with ABORT. Control + * chunks (except for INIT, INIT ACK, and SHUTDOWN COMPLETE) MAY be + * bundled with an ABORT, but they MUST be placed before the ABORT + * in the SCTP packet or they will be ignored by the receiver. + */ + if ((sch->chunk_type == SCTP_COOKIE_ECHO) || + (sch->chunk_type == SCTP_COOKIE_ACK)) { + int clen = ntohs(sch->chunk_length); + + if (clen >= sizeof(_sctpch)) { + sch = mbuf_header_pointer(mbuf, + cofs + RTE_ALIGN(clen, 4), + sizeof(_sctpch), &_sctpch); + if (sch && sch->chunk_type == SCTP_ABORT_ASSOCIATION) + chunk_type = sch->chunk_type; + } + } + + event = (chunk_type < sizeof(sctp_events)) ? sctp_events[chunk_type] : + DPVS_SCTP_DATA; + + next_state = sctp_states[dir][event][conn->state]; + + if (next_state != conn->state) { + struct dp_vs_dest *dest = conn->dest; + +#ifdef CONFIG_DPVS_IPVS_DEBUG + RTE_LOG(DEBUG, IPVS, + "%s %s %X:%d->" + "%X:%d state: %s->%s conn->refcnt:%d\n", + proto->name, + ((dir == DPVS_CONN_DIR_OUTBOUND) ?
"output " : + "input "), + inet_addr_fold(tuplehash_out(conn).af, &conn->daddr), /* real-server address goes with dport; NOTE(review): assumes conn->daddr is the RS address - confirm */ + ntohs(conn->dport), + inet_addr_fold(conn->af, &conn->caddr), + ntohs(conn->cport), sctp_state_name(conn->state), + sctp_state_name(next_state), + rte_atomic32_read(&conn->refcnt)); +#endif + if (dest) { + if (!(conn->flags & DPVS_CONN_F_INACTIVE) && + (next_state != DPVS_SCTP_S_ESTABLISHED)) { + rte_atomic32_dec(&dest->actconns); + rte_atomic32_inc(&dest->inactconns); + conn->flags |= DPVS_CONN_F_INACTIVE; + } else if ((conn->flags & DPVS_CONN_F_INACTIVE) && + (next_state == DPVS_SCTP_S_ESTABLISHED)) { + rte_atomic32_inc(&dest->actconns); + rte_atomic32_dec(&dest->inactconns); + conn->flags &= ~DPVS_CONN_F_INACTIVE; + } + } + conn->old_state = conn->state; + conn->state = next_state; + } + dp_vs_conn_set_timeout(conn, proto); + return EDPVS_OK; +} + +static int sctp_conn_expire(struct dp_vs_proto *proto, struct dp_vs_conn *conn) +{ + if (conn && conn->prot_data) + rte_free(conn->prot_data); + + return EDPVS_OK; +} + +static int sctp_conn_expire_quiescent(struct dp_vs_conn *conn) +{ + dp_vs_conn_expire_now(conn); + + return EDPVS_OK; +} + +static int sctp_init(struct dp_vs_proto *proto) +{ + if (!proto) + return EDPVS_INVAL; + + proto->timeout_table = sctp_timeouts; + + return EDPVS_OK; +} + +static int sctp_exit(struct dp_vs_proto *proto) +{ + return EDPVS_OK; +} + +struct dp_vs_proto dp_vs_proto_sctp = { + .name = "SCTP", + .proto = IPPROTO_SCTP, + .init = sctp_init, + .exit = sctp_exit, + .conn_sched = sctp_conn_schedule, + .conn_lookup = sctp_conn_lookup, + .conn_expire = sctp_conn_expire, + .conn_expire_quiescent = sctp_conn_expire_quiescent, + .nat_in_handler = sctp_nat_in_handler, + .nat_out_handler = sctp_nat_out_handler, + .fnat_in_handler = sctp_fnat_in_handler, + .fnat_out_handler = sctp_fnat_out_handler, + .snat_in_handler = sctp_nat_in_handler, + .snat_out_handler = sctp_nat_out_handler, + .state_trans = sctp_state_trans, + .state_name = sctp_state_name, +}; diff --git
a/src/ipvs/ip_vs_service.c b/src/ipvs/ip_vs_service.c index 44907dcd0..974f8527c 100644 --- a/src/ipvs/ip_vs_service.c +++ b/src/ipvs/ip_vs_service.c @@ -913,7 +913,8 @@ static int dp_vs_service_set(sockoptid_t opt, const void *user, size_t len) } if (usvc.proto != IPPROTO_TCP && usvc.proto != IPPROTO_UDP && - usvc.proto != IPPROTO_ICMP && usvc.proto != IPPROTO_ICMPV6) { + usvc.proto != IPPROTO_SCTP && usvc.proto != IPPROTO_ICMP && + usvc.proto != IPPROTO_ICMPV6) { RTE_LOG(ERR, SERVICE, "%s: protocol not support.\n", __func__); return EDPVS_INVAL; } diff --git a/src/tc/cls_match.c b/src/tc/cls_match.c index ab7ec357d..8f5bcf6ec 100644 --- a/src/tc/cls_match.c +++ b/src/tc/cls_match.c @@ -25,6 +25,7 @@ #include #include #include +#include "sctp/sctp.h" #include "netif.h" #include "vlan.h" #include "tc/tc.h" @@ -54,6 +55,7 @@ static int match_classify(struct tc_cls *cls, struct rte_mbuf *mbuf, struct ip6_hdr *ip6h = NULL; struct tcphdr *th; struct udphdr *uh; + struct sctphdr *sh; uint8_t l4_proto = 0; int offset = sizeof(*eh); __be16 pkt_type = eh->ether_type; @@ -175,6 +177,17 @@ static int match_classify(struct tc_cls *cls, struct rte_mbuf *mbuf, dport = uh->dest; break; + case IPPROTO_SCTP: + if (mbuf_may_pull(mbuf, offset + sizeof(struct sctphdr)) != 0) { + err = TC_ACT_SHOT; + goto done; + } + + sh = rte_pktmbuf_mtod_offset(mbuf, struct sctphdr *, offset); + sport = sh->src_port; + dport = sh->dest_port; + break; + default: /* priv->proto is not assigned */ goto match; } diff --git a/tools/dpip/cls.c b/tools/dpip/cls.c index cab722a68..f38d7e63f 100644 --- a/tools/dpip/cls.c +++ b/tools/dpip/cls.c @@ -49,7 +49,7 @@ static void cls_help(void) " PATTERN := comma seperated of tokens below,\n" " { PROTO | SRANGE | DRANGE | IIF | OIF }\n" " CHILD_QSCH := child qsch handle of the qsch cls attached.\n" - " PROTO := \"{ tcp | udp }\"\n" + " PROTO := \"{ tcp | sctp | udp }\"\n" " SRANGE := \"from=RANGE\"\n" " DRANGE := \"to=RANGE\"\n" " RANGE := 
ADDR[-ADDR][:PORT[-PORT]]\n" diff --git a/tools/ipvsadm/ipvsadm.c b/tools/ipvsadm/ipvsadm.c index dd234691e..0b6fd76e5 100644 --- a/tools/ipvsadm/ipvsadm.c +++ b/tools/ipvsadm/ipvsadm.c @@ -329,6 +329,7 @@ enum { TAG_SORT, TAG_NO_SORT, TAG_PERSISTENCE_ENGINE, + TAG_SCTP_SERVICE, TAG_SOCKPAIR, TAG_HASH_TARGET, TAG_CPU, @@ -432,6 +433,8 @@ static int parse_dest_check(const char *optarg, struct dest_check_configs *conf) conf->types |= DEST_HC_TCP; } else if (!strcmp(optarg, "udp")) { conf->types |= DEST_HC_UDP; + } else if (!strcmp(optarg, "sctp")) { + conf->types |= DEST_HC_SCTP; } else if (!strcmp(optarg, "ping")) { conf->types |= DEST_HC_PING; } else if (!strcmp(optarg, "default")) { @@ -504,6 +507,8 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce, NULL, NULL }, { "udp-service", 'u', POPT_ARG_STRING, &optarg, 'u', NULL, NULL }, + { "sctp-service", '\0', POPT_ARG_STRING, &optarg, + TAG_SCTP_SERVICE, NULL, NULL }, { "icmp-service", 'q', POPT_ARG_STRING, &optarg, 'q', NULL, NULL }, { "icmpv6-service", '1', POPT_ARG_STRING, &optarg, '1', @@ -668,11 +673,14 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce, case 'u': case 'q': case '1': + case TAG_SCTP_SERVICE: set_option(options, OPT_SERVICE); if (c == 't') { ce->dpvs_svc.proto = IPPROTO_TCP; } else if (c == 'u') { ce->dpvs_svc.proto = IPPROTO_UDP; + } else if (c == TAG_SCTP_SERVICE) { + ce->dpvs_svc.proto = IPPROTO_SCTP; } else if (c == 'q') { ce->dpvs_svc.proto = IPPROTO_ICMP; } else if (c == '1') { /*a~Z is out. ipvsadm is really not friendly here*/ @@ -1379,7 +1387,7 @@ parse_service(char *buf, dpvs_service_compat_t *dpvs_svc) /* * Get sockpair from the arguments. 
* sockpair := PROTO:SIP:SPORT:TIP:TPORT - * PROTO := [tcp|udp] + * PROTO := [tcp|udp|sctp] * SIP,TIP := dotted-decimal ip address or square-blacketed ip6 address * SPORT,TPORT := range(0, 65535) */ @@ -1402,6 +1410,8 @@ parse_sockpair(char *buf, ipvs_sockpair_t *sockpair) proto = IPPROTO_TCP; else if (strncmp(pos, "udp", 3) == 0) proto = IPPROTO_UDP; + else if (strncmp(pos, "sctp", 4) == 0) + proto = IPPROTO_SCTP; else return 0; @@ -1474,7 +1484,7 @@ parse_sockpair(char *buf, ipvs_sockpair_t *sockpair) /* * comma separated parameters list, all fields is used to match packets. * - * proto := tcp | udp | icmp |icmpv6 + * proto := tcp | udp | sctp | icmp |icmpv6 * src-range := RANGE * dst-range := RANGE * iif := IFNAME @@ -1510,6 +1520,8 @@ static int parse_match_snat(const char *buf, dpvs_service_compat_t *dpvs_svc) dpvs_svc->proto = IPPROTO_TCP; } else if (strcmp(val, "udp") == 0) { dpvs_svc->proto = IPPROTO_UDP; + } else if (strcmp(val, "sctp") == 0) { + dpvs_svc->proto = IPPROTO_SCTP; } else if (strcmp(val, "icmp") == 0) { dpvs_svc->proto = IPPROTO_ICMP; } else if (strcmp(val, "icmpv6") == 0) { @@ -1685,6 +1697,7 @@ static void usage_exit(const char *program, const int exit_status) "Options:\n" " --tcp-service -t service-address service-address is host[:port]\n" " --udp-service -u service-address service-address is host[:port]\n" + " --sctp-service service-address service-address is host[:port]\n" " --icmp-service -q service-address service-address is host[:port]\n" " --icmpv6-service -1 service-address service-address is host[:port]\n" " --fwmark-service -f fwmark fwmark is an integer greater than zero\n" @@ -1728,7 +1741,7 @@ static void usage_exit(const char *program, const int exit_status) " --cpu cpu_index specifi cpu (lcore) index to show, 0 for master worker\n" " --expire-quiescent expire the quiescent connections timely whose realserver went down\n" " --dest-check CHECK_CONF config health check, inhibit scheduling to failed backends\n" - " 
CHECK_CONF:=disable|default(passive)|DETAIL(passive)|tcp|udp|ping, DETAIL:=UPDOWN|DOWNONLY\n" + " CHECK_CONF:=disable|default(passive)|DETAIL(passive)|tcp|udp|sctp|ping, DETAIL:=UPDOWN|DOWNONLY\n" " UPDOWN:=down_retry,up_confirm,down_wait,inhibit_min-inhibit_max, for example, the default is 1,1,3s,5-3600s\n" " DOWNONLY:=down_retry,down_wait, for example, --dest-check=1,3s\n" " --laddr -z local-ip local IP\n" @@ -1786,6 +1799,8 @@ static void print_conn_entry(const ipvs_conn_entry_t *conn_entry, snprintf(proto_str, sizeof(proto_str), "%s", "tcp"); else if (conn_entry->proto == IPPROTO_UDP) snprintf(proto_str, sizeof(proto_str), "%s", "udp"); + else if (conn_entry->proto == IPPROTO_SCTP) + snprintf(proto_str, sizeof(proto_str), "%s", "sctp"); else if (conn_entry->proto == IPPROTO_ICMP) snprintf(proto_str, sizeof(proto_str), "%s", "icmp"); else if (conn_entry->proto == IPPROTO_ICMPV6) @@ -2034,6 +2049,8 @@ print_service_entry(dpvs_service_compat_t *se, unsigned int format) proto = "-t"; else if (se->proto == IPPROTO_UDP) proto = "-u"; + else if (se->proto == IPPROTO_SCTP) + proto = "--sctp-service"; else proto = "-q"; @@ -2043,6 +2060,8 @@ print_service_entry(dpvs_service_compat_t *se, unsigned int format) proto = "TCP"; else if (se->proto == IPPROTO_UDP) proto = "UDP"; + else if (se->proto == IPPROTO_SCTP) + proto = "SCTP"; else if (se->proto == IPPROTO_ICMP) proto = "ICMP"; else @@ -2063,6 +2082,8 @@ print_service_entry(dpvs_service_compat_t *se, unsigned int format) proto = "tcp"; else if (se->proto == IPPROTO_UDP) proto = "udp"; + else if (se->proto == IPPROTO_SCTP) + proto = "sctp"; else if (se->proto == IPPROTO_ICMP) proto = "icmp"; else @@ -2203,6 +2224,8 @@ print_service_entry(dpvs_service_compat_t *se, unsigned int format) strcat(buf, "tcp,"); if (se->check_conf.types & DEST_HC_UDP) strcat(buf, "udp,"); + if (se->check_conf.types & DEST_HC_SCTP) + strcat(buf, "sctp,"); if (se->check_conf.types & DEST_HC_PING) strcat(buf, "ping,"); *strrchr(buf, ',') = '\0'; 
@@ -2409,6 +2432,9 @@ static void print_service_and_blklsts(struct dp_vs_blklst_conf *blklst) case IPPROTO_UDP: snprintf(proto, sizeof(proto), "%s", "UDP"); break; + case IPPROTO_SCTP: + snprintf(proto, sizeof(proto), "%s", "SCTP"); + break; case IPPROTO_ICMP: snprintf(proto, sizeof(proto), "%s", "ICMP"); break; @@ -2509,6 +2535,9 @@ static void print_service_and_whtlsts(struct dp_vs_whtlst_conf *whtlst) case IPPROTO_UDP: snprintf(proto, sizeof(proto), "%s", "UDP"); break; + case IPPROTO_SCTP: + snprintf(proto, sizeof(proto), "%s", "SCTP"); + break; case IPPROTO_ICMP: snprintf(proto, sizeof(proto), "%s", "ICMP"); break; @@ -2706,6 +2735,9 @@ int service_to_port(const char *name, unsigned short proto) else if (proto == IPPROTO_UDP && (service = getservbyname(name, "udp")) != NULL) return ntohs((unsigned short) service->s_port); + else if (proto == IPPROTO_SCTP + && (service = getservbyname(name, "sctp")) != NULL) + return ntohs((unsigned short) service->s_port); else if (proto == IPPROTO_ICMP && (service = getservbyname(name, "icmp")) != NULL) return ntohs((unsigned short) service->s_port); @@ -2727,6 +2759,9 @@ static char * port_to_service(unsigned short port, unsigned short proto) else if (proto == IPPROTO_UDP && (service = getservbyport(htons(port), "udp")) != NULL) return service->s_name; + else if (proto == IPPROTO_SCTP && + (service = getservbyport(htons(port), "sctp")) != NULL) + return service->s_name; else if (proto == IPPROTO_ICMP && (service = getservbyport(htons(port), "icmp")) != NULL) return service->s_name; diff --git a/tools/keepalived/keepalived/check/check_data.c b/tools/keepalived/keepalived/check/check_data.c index ba3ece87e..1fd928e82 100644 --- a/tools/keepalived/keepalived/check/check_data.c +++ b/tools/keepalived/keepalived/check/check_data.c @@ -1128,6 +1128,9 @@ char *dump_vs_match(const virtual_server_t *vs) case IPPROTO_UDP: snprintf(vs_str, sizeof(vs_str) - 1, "%s", "udp"); break; + case IPPROTO_SCTP: + snprintf(vs_str, sizeof(vs_str) - 
1, "%s", "sctp"); + break; case IPPROTO_ICMP: snprintf(vs_str, sizeof(vs_str) - 1, "%s", "icmp"); break;