diff --git a/.gitignore b/.gitignore index b85211335..8be1e7227 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,4 @@ Module.symvers modules.order *.ko *.mod.c +.ycm_extra_conf.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..88f4c6a7e --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,20 @@ +# Contributing + +We are really glad you're reading this, because we need volunteer developers to enhance this open source project. Please feel free to submit issues, bug fixes, or new features. We really appreciate any contribution you make to the project. + +How do you contribute to the project? In general, we follow the "fork-and-pull" [Git workflow](https://nvie.com/posts/a-successful-git-branching-model/). Two main branches with infinite lifetime exist: **master** and **devel**. We recommend that you follow this workflow if you want to submit a patch to the project. + +* S1. **Fork** the repo on GitHub. +* S2. **Clone** the project to your own machine. +* S3. **Checkout** the `devel` branch of the project on your own machine. +* S4. **Commit** changes on your own branch. +* S5. **Push** your work back up to your fork. +* S6. Submit a **Pull Request** so that we can review your changes. + +NOTE: Be sure to merge the latest from "upstream" before making a pull request! + +# Coding conventions + +Basically, coding style should be consistent across the whole project. We recommend using the [Linux kernel coding style](https://www.kernel.org/doc/html/v4.10/process/coding-style.html#). + +NOTE: As for indentation, we use 4-char indents, not 8-char indents. This is different from the Linux kernel coding style. 
diff --git a/conf/dpvs.bond.conf.sample b/conf/dpvs.bond.conf.sample index 4f919f1ec..d933f7823 100755 --- a/conf/dpvs.bond.conf.sample +++ b/conf/dpvs.bond.conf.sample @@ -25,7 +25,7 @@ netif_defs { rx { queue_number 8 descriptor_number 1024 - rss tcp + rss all } tx { queue_number 8 @@ -39,7 +39,7 @@ netif_defs { rx { queue_number 8 descriptor_number 1024 - rss tcp + rss all } tx { queue_number 8 @@ -54,7 +54,7 @@ netif_defs { rx { queue_number 8 descriptor_number 1024 - rss tcp + rss all } tx { queue_number 8 @@ -68,7 +68,7 @@ netif_defs { rx { queue_number 8 descriptor_number 1024 - rss tcp + rss all } tx { queue_number 8 diff --git a/conf/dpvs.conf.items b/conf/dpvs.conf.items index 4c5f5fb51..165fd016a 100755 --- a/conf/dpvs.conf.items +++ b/conf/dpvs.conf.items @@ -26,12 +26,17 @@ netif_defs { #max_burst_size 32 queue_number 6 <16, 0-16> descriptor_number 256 <256, 16-8192> - rss ip + rss all } tx { queue_number 6 <16, 0-16> descriptor_number 512 <512, 16-8192> } + fdir { + mode perfect + pballoc 64k <64k, 64k|128k|256k> + status matched + } ! promisc_mode ! kni_name dpdk0.kni } @@ -41,7 +46,7 @@ netif_defs { #max_burst_size 32 queue_number 4 descriptor_number 128 - rss tcp + rss all } tx { queue_number 4 @@ -137,6 +142,7 @@ neigh_defs { ! dpvs ipv4 config ipv4_defs { + forwarding off default_ttl 64 <64, 0-255> fragment { bucket_number 4096 <4096, 32-65536> @@ -146,6 +152,22 @@ ipv4_defs { } } +! dpvs ipv6 config +ipv6_defs { + disable off + forwarding off + route6 { + method "hlist" <"hlist"/"lpm"> + recycle_time 10 <10, 1-36000> + lpm { + lpm6_max_rules 1024 <1024, 16-2147483647> + lpm6_num_tbl8s 65536 <65536, 16-2147483647> + rt6_array_size 65536 <65536, 16-2147483647> + rt6_hash_bucket 256 <256, 2-2147483647> + } + } +} + ! 
control plane config ctrl_defs { lcore_msg { @@ -221,3 +243,4 @@ ipvs_defs { sa_pool { pool_hash_size 16 <16, 1-128> } + diff --git a/conf/dpvs.conf.sample b/conf/dpvs.conf.sample index ba84b977b..f8fb8950d 100755 --- a/conf/dpvs.conf.sample +++ b/conf/dpvs.conf.sample @@ -13,25 +13,30 @@ ! global config global_defs { log_level WARNING -! log_file /var/log/dpvs.log + ! log_file /var/log/dpvs.log } ! netif config netif_defs { - pktpool_size 2097151 + pktpool_size 1048575 pktpool_cache 256 device dpdk0 { rx { queue_number 8 descriptor_number 1024 - rss tcp + rss all } tx { queue_number 8 descriptor_number 1024 } - ! promisc_mode + fdir { + mode perfect + pballoc 64k + status matched + } + ! promisc_mode kni_name dpdk0.kni } @@ -39,24 +44,29 @@ netif_defs { rx { queue_number 8 descriptor_number 1024 - rss tcp + rss all } tx { queue_number 8 descriptor_number 1024 } - ! promisc_mode + fdir { + mode perfect + pballoc 64k + status matched + } + ! promisc_mode kni_name dpdk1.kni } -! bonding bond0 { -! mode 0 -! slave dpdk0 -! slave dpdk1 -! primary dpdk0 -! kni_name bond0.kni -! } -} + ! bonding bond0 { + ! mode 0 + ! slave dpdk0 + ! slave dpdk1 + ! primary dpdk0 + ! kni_name bond0.kni + !} + ! worker config (lcores) worker_defs { @@ -217,7 +227,7 @@ neigh_defs { ! dpvs ipv4 config ipv4_defs { - ipv4_forward off ! set this to on, dpvs will forward packets that NOT hit rules directly + forwarding off ! set this to on, dpvs will forward packets that NOT hit rules directly default_ttl 64 fragment { bucket_number 4096 @@ -227,12 +237,22 @@ ipv4_defs { } } +! dpvs ipv6 config +ipv6_defs { + disable off + forwarding off + route6 { + method "hlist" + recycle_time 10 + } +} + ! control plane config ctrl_defs { lcore_msg { ring_size 4096 multicast_queue_length 256 - sync_msg_timeout_us 2000 + sync_msg_timeout_us 20000 } ipc_msg { unix_domain /var/run/dpvs_ctrl @@ -245,12 +265,12 @@ ipvs_defs { conn_pool_size 2097152 conn_pool_cache 256 conn_init_timeout 3 - ! 
expire_quiescent_template - ! fast_xmit_close + ! expire_quiescent_template + ! fast_xmit_close } udp { - defence_udp_drop + ! defence_udp_drop timeout { normal 300 last 3 @@ -258,7 +278,7 @@ ipvs_defs { } tcp { - defence_tcp_drop + ! defence_tcp_drop timeout { none 2 established 90 @@ -278,19 +298,19 @@ ipvs_defs { mss 1452 ttl 63 sack - ! wscale - ! timestamp + ! wscale + ! timestamp } - ! defer_rs_syn + ! defer_rs_syn rs_syn_max_retry 3 ack_storm_thresh 10 max_ack_saved 3 conn_reuse_state { close time_wait - ! fin_wait - ! close_wait - ! last_ack + ! fin_wait + ! close_wait + ! last_ack } } } diff --git a/conf/dpvs.conf.single-bond.sample b/conf/dpvs.conf.single-bond.sample index ac498fee4..db95307d3 100755 --- a/conf/dpvs.conf.single-bond.sample +++ b/conf/dpvs.conf.single-bond.sample @@ -25,28 +25,38 @@ netif_defs { rx { queue_number 8 descriptor_number 1024 - rss tcp + rss all } tx { queue_number 8 descriptor_number 1024 } - ! promisc_mode - ! kni_name dpdk0.kni + fdir { + mode perfect + pballoc 64k + status matched + } + ! promisc_mode + ! kni_name dpdk0.kni } device dpdk2 { rx { queue_number 8 descriptor_number 1024 - rss tcp + rss all } tx { queue_number 8 descriptor_number 1024 } - ! promisc_mode - ! kni_name dpdk2.kni + fdir { + mode perfect + pballoc 64k + status matched + } + ! promisc_mode + ! kni_name dpdk2.kni } bonding bond0 { @@ -71,8 +81,8 @@ worker_defs { port bond0 { rx_queue_ids 0 tx_queue_ids 0 - isol_rx_cpu_ids 9 - isol_rxq_ring_sz 1048576 + ! isol_rx_cpu_ids 9 + ! isol_rxq_ring_sz 1048576 } } @@ -82,8 +92,8 @@ worker_defs { port bond0 { rx_queue_ids 1 tx_queue_ids 1 - isol_rx_cpu_ids 10 - isol_rxq_ring_sz 1048576 + ! isol_rx_cpu_ids 10 + ! isol_rxq_ring_sz 1048576 } } @@ -93,8 +103,8 @@ worker_defs { port bond0 { rx_queue_ids 2 tx_queue_ids 2 - isol_rx_cpu_ids 11 - isol_rxq_ring_sz 1048576 + ! isol_rx_cpu_ids 11 + ! 
isol_rxq_ring_sz 1048576 } } @@ -104,8 +114,8 @@ worker_defs { port bond0 { rx_queue_ids 3 tx_queue_ids 3 - isol_rx_cpu_ids 12 - isol_rxq_ring_sz 1048576 + ! isol_rx_cpu_ids 12 + ! isol_rxq_ring_sz 1048576 } } @@ -115,8 +125,8 @@ worker_defs { port bond0 { rx_queue_ids 4 tx_queue_ids 4 - isol_rx_cpu_ids 13 - isol_rxq_ring_sz 1048576 + ! isol_rx_cpu_ids 13 + ! isol_rxq_ring_sz 1048576 } } @@ -126,8 +136,8 @@ worker_defs { port bond0 { rx_queue_ids 5 tx_queue_ids 5 - isol_rx_cpu_ids 14 - isol_rxq_ring_sz 1048576 + ! isol_rx_cpu_ids 14 + ! isol_rxq_ring_sz 1048576 } } @@ -137,8 +147,8 @@ worker_defs { port bond0 { rx_queue_ids 6 tx_queue_ids 6 - isol_rx_cpu_ids 15 - isol_rxq_ring_sz 1048576 + ! isol_rx_cpu_ids 15 + ! isol_rxq_ring_sz 1048576 } } @@ -148,8 +158,8 @@ worker_defs { port bond0 { rx_queue_ids 7 tx_queue_ids 7 - isol_rx_cpu_ids 16 - isol_rxq_ring_sz 1048576 + ! isol_rx_cpu_ids 16 + ! isol_rxq_ring_sz 1048576 } } } @@ -168,6 +178,7 @@ neigh_defs { ! dpvs ipv4 config ipv4_defs { + forwarding off default_ttl 64 fragment { bucket_number 4096 @@ -177,12 +188,22 @@ ipv4_defs { } } +! dpvs ipv6 config +ipv6_defs { + disable off + forwarding off + route6 { + method "hlist" + recycle_time 10 + } +} + ! control plane config ctrl_defs { lcore_msg { ring_size 4096 multicast_queue_length 256 - sync_msg_timeout_us 2000 + sync_msg_timeout_us 20000 } ipc_msg { unix_domain /var/run/dpvs_ctrl @@ -200,7 +221,7 @@ ipvs_defs { } udp { - defence_udp_drop + ! defence_udp_drop timeout { normal 300 last 3 @@ -208,7 +229,7 @@ ipvs_defs { } tcp { - defence_tcp_drop + ! defence_tcp_drop timeout { none 2 established 90 @@ -228,19 +249,19 @@ ipvs_defs { mss 1452 ttl 63 sack - ! wscale - ! timestamp + ! wscale + ! timestamp } - ! defer_rs_syn + ! defer_rs_syn rs_syn_max_retry 3 ack_storm_thresh 10 max_ack_saved 3 conn_reuse_state { close time_wait - ! fin_wait - ! close_wait - ! last_ack + ! fin_wait + ! close_wait + ! 
last_ack } } } diff --git a/conf/dpvs.conf.single-nic.sample b/conf/dpvs.conf.single-nic.sample index d627c1aa7..da3191967 100755 --- a/conf/dpvs.conf.single-nic.sample +++ b/conf/dpvs.conf.single-nic.sample @@ -13,7 +13,7 @@ ! global config global_defs { log_level WARNING -! log_file /var/log/dpvs.log + ! log_file /var/log/dpvs.log } ! netif config @@ -25,13 +25,18 @@ netif_defs { rx { queue_number 8 descriptor_number 1024 - rss tcp + rss all } tx { queue_number 8 descriptor_number 1024 } - ! promisc_mode + fdir { + mode perfect + pballoc 64k + status matched + } + ! promisc_mode kni_name dpdk0.kni } } @@ -147,6 +152,7 @@ neigh_defs { ! dpvs ipv4 config ipv4_defs { + forwarding off default_ttl 64 fragment { bucket_number 4096 @@ -156,12 +162,22 @@ ipv4_defs { } } +! dpvs ipv6 config +ipv6_defs { + disable off + forwarding off + route6 { + method "hlist" + recycle_time 10 + } +} + ! control plane config ctrl_defs { lcore_msg { ring_size 4096 multicast_queue_length 256 - sync_msg_timeout_us 2000 + sync_msg_timeout_us 20000 } ipc_msg { unix_domain /var/run/dpvs_ctrl @@ -174,12 +190,12 @@ ipvs_defs { conn_pool_size 2097152 conn_pool_cache 256 conn_init_timeout 3 - ! expire_quiescent_template - ! fast_xmit_close + ! expire_quiescent_template + ! fast_xmit_close } udp { - defence_udp_drop + ! defence_udp_drop timeout { normal 300 last 3 @@ -187,7 +203,7 @@ ipvs_defs { } tcp { - defence_tcp_drop + ! defence_tcp_drop timeout { none 2 established 90 @@ -207,19 +223,19 @@ ipvs_defs { mss 1452 ttl 63 sack - ! wscale - ! timestamp + ! wscale + ! timestamp } - ! defer_rs_syn + ! defer_rs_syn rs_syn_max_retry 3 ack_storm_thresh 10 max_ack_saved 3 conn_reuse_state { close time_wait - ! fin_wait - ! close_wait - ! last_ack + ! fin_wait + ! close_wait + ! 
last_ack } } } diff --git a/doc/tutorial.md b/doc/tutorial.md index 752c636d5..65b5cc670 100644 --- a/doc/tutorial.md +++ b/doc/tutorial.md @@ -13,6 +13,7 @@ DPVS Tutorial * [Tunnel Mode(one-arm)](#tunnel) * [NAT Mode(one-arm)](#nat) * [SNAT Mode (two-arm)](#snat) +* [IPv6 Support](#ipv6_support) * [Virtual devices](#virt-dev) - [Bonding Device](#vdev-bond) - [VLAN Device](#vdev-vlan) @@ -765,6 +766,140 @@ host$ ping www.iqiyi.com host$ curl www.iqiyi.com ``` + + +# IPv6 Support + +DPVS supports IPv6 since version 1.7-0. You can configure IPv6 FullNAT just as you would for IPv4: + +```bash +#!/bin/sh - + +# add VIP to WAN interface +./dpip addr add 2001::1/128 dev dpdk1 + +# route for WAN/LAN access +# add routes for other network or default route if needed. +./dpip route -6 add 2001::/64 dev dpdk1 + +# add service to forwarding, scheduling mode is RR. +# use ipvsadm --help for more info. +./ipvsadm -A -t [2001::1]:80 -s rr + +# add two RS for service, forwarding mode is FNAT (-b) +./ipvsadm -a -t [2001::1]:80 -r 2001::3 -b +./ipvsadm -a -t [2001::1]:80 -r 2001::4 -b + +# add at least one Local-IP (LIP) for FNAT on LAN interface +./ipvsadm --add-laddr -z 2001::2 -t [2001::1]:80 -F dpdk0 +``` +You can check what you have configured with the same commands as for IPv4; only the route command differs: + +```bash +$./dpip route -6 show +inet6 2001::1/128 dev dpdk0 mtu 1500 scope host +inet6 2001::2/128 dev dpdk0 mtu 1500 scope host +inet6 2001::/64 dev dpdk0 mtu 1500 scope link +``` + +You can configure IPv6 OSPF like this: + +```bash +$ cat /etc/quagga/ospf6d.conf # may be installed in another path +log file /var/log/quagga/ospf6.log +log stdout +log syslog +password **** +enable password **** +interface dpdk1.kni + ipv6 ospf6 network point-to-point + ipv6 ospf6 hello-interval 10 + ipv6 ospf6 dead-interval 40 +! 
+router ospf6 + router-id 192.168.100.200 + area 0.0.0.0 range 2001::1/64 # announce VIP + area 0.0.0.0 range fec0::172:10:10:11/127 # announce inter-connection network + interface dpdk1.kni area 0.0.0.0 +! +``` + +If you prefer keepalived, you can configure it like this: +``` +$ cat /etc/keepalived/keepalived.conf +! Configuration File for keepalived + +global_defs { + notification_email { + foo@example.com + } + notification_email_from bar@example.com + smtp_server 1.2.3.4 + smtp_connect_timeout 60 + router_id DPVS_DEVEL +} + +local_address_group laddr_g1 { + 2001::2 dpdk0 # use DPDK interface +} + +# +# VRRP section +# +vrrp_instance VI_1 { + state MASTER # master + interface dpdk0.kni # should be kni interface, and IPv4 should be configured for vrrp + dpdk_interface dpdk0 # should be DPDK interface + virtual_router_id 123 # VID should be unique in network + priority 100 # master's priority is bigger than worker + advert_int 1 + authentication { + auth_type PASS + auth_pass **** + } + + virtual_ipaddress { + 2001::1 + } +} + +# +# Virtual Server Section +# +virtual_server_group 2001-1-80 { + 2001::1 80 +} + +virtual_server group 2001-1-80 { + delay_loop 3 + lb_algo rr # scheduling algorithm Round-Robin + lb_kind FNAT # Forwarding Mode Full-NAT + protocol TCP # Protocol TCP + + laddr_group_name laddr_g1 # Local IP group-ID + + real_server 2001::3 80 { # real-server + weight 100 + inhibit_on_failure + TCP_CHECK { # health check + nb_sock_retry 2 + connect_timeout 3 + connect_port 80 + } + } + + real_server 2001::4 80 { # real-server + weight 100 + inhibit_on_failure + TCP_CHECK { # health check + nb_sock_retry 2 + connect_timeout 3 + connect_port 80 + } + } +} +``` + # Virtual Devices diff --git a/include/common.h b/include/common.h index e1e3c3512..66f69df2b 100644 --- a/include/common.h +++ b/include/common.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -134,4 +135,18 @@ ssize_t writen(int fd, const void *vptr, size_t n); /* send 
"n" bytes to a descriptor */ ssize_t sendn(int fd, const void *vptr, size_t n, int flags); +static inline char *strupr(char *str) { + char *s; + for (s = str; *s != '\0'; s++) + *s = toupper(*s); + return str; +} + +static inline char *strlwr(char *str) { + char *s; + for (s = str; *s != '\0'; s++) + *s = tolower(*s); + return str; +} + #endif /* __DPVS_COMMON_H__ */ diff --git a/include/conf/conn.h b/include/conf/conn.h index 3752b4c43..c7ffd3511 100644 --- a/include/conf/conn.h +++ b/include/conf/conn.h @@ -52,26 +52,26 @@ struct ip_vs_sockpair { uint16_t proto; __be16 sport; __be16 tport; - __be32 sip; - __be32 tip; + union inet_addr sip; + union inet_addr tip; }; typedef struct ip_vs_sockpair ipvs_sockpair_t; struct ip_vs_conn_entry { - uint16_t af; - uint16_t proto; - __be32 caddr; - __be32 vaddr; - __be32 laddr; - __be32 daddr; - uint16_t cport; - uint16_t vport; - uint16_t lport; - uint16_t dport; - uint32_t timeout; - uint8_t lcoreid; - char state[16]; + uint16_t af; + uint16_t proto; + union inet_addr caddr; + union inet_addr vaddr; + union inet_addr laddr; + union inet_addr daddr; + uint16_t cport; + uint16_t vport; + uint16_t lport; + uint16_t dport; + uint32_t timeout; + uint8_t lcoreid; + char state[16]; }; typedef struct ip_vs_conn_entry ipvs_conn_entry_t; diff --git a/include/conf/ipv6.h b/include/conf/ipv6.h new file mode 100644 index 000000000..4f893ed1c --- /dev/null +++ b/include/conf/ipv6.h @@ -0,0 +1,39 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2017 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +/** + * IPv6 protocol control plane. + * + * Lei Chen , initial, Jul 2018. + */ +#ifndef __DPVS_IPV6_CONF_H__ +#define __DPVS_IPV6_CONF_H__ +#include "inet.h" + +enum { + /* set */ + SOCKOPT_IP6_SET = 1100, + /* get */ + SOCKOPT_IP6_STATS, +}; + +struct ip6_stats_param { + struct inet_stats stats; + struct inet_stats stats_cpus[DPVS_MAX_LCORE]; +} __attribute__((__packed__)); + +#endif /* __DPVS_IPV6_CONF_H__ */ diff --git a/include/conf/laddr.h b/include/conf/laddr.h index 12117e6da..4a39ff99c 100644 --- a/include/conf/laddr.h +++ b/include/conf/laddr.h @@ -35,6 +35,7 @@ enum { }; struct dp_vs_laddr_entry { + int af; union inet_addr addr; uint64_t nport_conflict; uint32_t nconns; diff --git a/include/conf/neigh.h b/include/conf/neigh.h index dcd50eecc..fca935c79 100644 --- a/include/conf/neigh.h +++ b/include/conf/neigh.h @@ -20,6 +20,7 @@ #include #include +#include "inet.h" enum { /* get */ @@ -40,20 +41,45 @@ enum { }; struct dp_vs_neigh_conf { - int af; - uint8_t flag; - uint32_t state; - union inet_addr ip_addr; + int af; + uint8_t flag; + uint32_t state; + union inet_addr ip_addr; struct ether_addr eth_addr; - uint32_t que_num; - char ifname[IFNAMSIZ]; + uint32_t que_num; + char ifname[IFNAMSIZ]; + uint8_t cid; }__attribute__((__packed__)); struct dp_vs_neigh_conf_array { - int n_neigh; + int neigh_nums; struct dp_vs_neigh_conf addrs[0]; }__attribute__((__packed__)); +#define sNNO DPVS_NUD_S_NONE +#define sNSD DPVS_NUD_S_SEND +#define sNRE DPVS_NUD_S_REACHABLE +#define sNPR DPVS_NUD_S_PROBE +#define sNDE DPVS_NUD_S_DELAY + +#define DPVS_NUD_S_KEEP DPVS_NUD_S_MAX +#define sNKP DPVS_NUD_S_KEEP /*Keep state and do not reset timer*/ + +static const char *nud_state_names[] = { + [DPVS_NUD_S_NONE] = 
"NONE", + [DPVS_NUD_S_SEND] = "SEND", + [DPVS_NUD_S_REACHABLE] = "REACHABLE", + [DPVS_NUD_S_PROBE] = "PROBE", + [DPVS_NUD_S_DELAY] = "DELAY", +}; + +static inline const char *nud_state_name(int state) +{ + if (state >= DPVS_NUD_S_KEEP) + return "ERR!"; + return nud_state_names[state] ? nud_state_names[state] :""; +} + #define NEIGHBOUR_HASHED 0x01 #define NEIGHBOUR_STATIC 0x02 diff --git a/include/conf/netif.h b/include/conf/netif.h index 9721d5a26..ed4e6c813 100644 --- a/include/conf/netif.h +++ b/include/conf/netif.h @@ -18,6 +18,7 @@ #ifndef __NETIF_CONF_H__ #define __NETIF_CONF_H__ #include +#include #define NETIF_MAX_PORTS 4096 @@ -25,7 +26,6 @@ * All types defined here must be the same as in dpdk.h, * error would occur otherwise */ -#define DEVICE_NAME_MAX_LEN 32 #define RTE_ETHDEV_QUEUE_STAT_CNTRS 16 #define NETIF_MAX_BOND_SLAVES 32 @@ -92,7 +92,7 @@ typedef struct netif_lcore_stats_get struct port_id_name { portid_t id; - char name[DEVICE_NAME_MAX_LEN]; + char name[IFNAMSIZ]; } __attribute__((__packed__)); /* all nics in use */ diff --git a/include/conf/route6.h b/include/conf/route6.h new file mode 100644 index 000000000..b85dee151 --- /dev/null +++ b/include/conf/route6.h @@ -0,0 +1,56 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2018 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#ifndef __DPVS_ROUTE6_CONF_H__ +#define __DPVS_ROUTE6_CONF_H__ + +#include "flow.h" + +enum { + /* set */ + SOCKOPT_SET_ROUTE6_ADD_DEL = 6300, + SOCKOPT_SET_ROUTE6_FLUSH, + + /* get */ + SOCKOPT_GET_ROUTE6_SHOW = 6300, +}; + +enum { + RT6_OPS_GET = 1, + RT6_OPS_ADD, + RT6_OPS_DEL, + RT6_OPS_FLUSH, +}; + +struct dp_vs_route6_conf { + int ops; + struct rt6_prefix dst; + struct rt6_prefix src; + struct rt6_prefix prefsrc; + char ifname[IFNAMSIZ]; + struct in6_addr gateway; + uint32_t mtu; + uint32_t flags; +} __attribute__((__packed__)); + +struct dp_vs_route6_conf_array { + int nroute; + struct dp_vs_route6_conf routes[0]; +} __attribute__((__packed__)); + +#endif /* __DPVS_ROUTE6_CONF_H__ */ diff --git a/include/ctrl.h b/include/ctrl.h index ca76eb56f..291105204 100644 --- a/include/ctrl.h +++ b/include/ctrl.h @@ -190,6 +190,9 @@ int msg_type_table_print(char *buf, int len); /* debug */ #define MSG_TYPE_TC_STATS 13 #define MSG_TYPE_CONN_GET 14 #define MSG_TYPE_CONN_GET_ALL 15 +#define MSG_TYPE_IPV6_STATS 16 +#define MSG_TYPE_ROUTE6 17 +#define MSG_TYPE_NEIGH_GET 18 #define SOCKOPT_VERSION_MAJOR 1 #define SOCKOPT_VERSION_MINOR 0 diff --git a/include/flow.h b/include/flow.h new file mode 100644 index 000000000..11017c940 --- /dev/null +++ b/include/flow.h @@ -0,0 +1,125 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2018 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ +/** + * flow for IPv4/IPv6 route lookup. + * Linux Kernel is referred. + * + * Lei Chen , initial, Jul 2018. + */ +#ifndef __DPVS_FLOW_H__ +#define __DPVS_FLOW_H__ + +#ifdef __DPVS__ +#include "common.h" +#include "netif.h" +#include "inet.h" +#endif + +/* linux:include/uapi/route.h */ +#define RTF_UP 0x0001 /* route usable */ +#define RTF_GATEWAY 0x0002 /* destination is a gateway */ +#define RTF_HOST 0x0004 /* host entry (net otherwise) */ +#define RTF_REINSTATE 0x0008 /* reinstate route after tmout */ +#define RTF_DYNAMIC 0x0010 /* created dyn. (by redirect) */ +#define RTF_MODIFIED 0x0020 /* modified dyn. (by redirect) */ +#define RTF_MTU 0x0040 /* specific MTU for this route */ +#define RTF_MSS RTF_MTU /* Compatibility :-( */ +#define RTF_WINDOW 0x0080 /* per route window clamping */ +#define RTF_IRTT 0x0100 /* Initial round trip time */ +#define RTF_REJECT 0x0200 /* Reject route */ + +/* dpvs defined. */ +#define RTF_FORWARD 0x0400 +#define RTF_LOCALIN 0x0800 +#define RTF_DEFAULT 0x1000 +#define RTF_KNI 0X2000 + +struct rt6_prefix { + struct in6_addr addr; + int plen; +}; + +#ifdef __DPVS__ +/* common flow info of upper layer (l4) */ +union flow_ul { + struct { + __be16 dport; + __be16 sport; + } ports; + + struct { + __u8 type; + __u8 code; + } icmpt; + + __be32 gre_key; +}; + +/* common flow info */ +struct flow_common { + struct netif_port *flc_oif; + struct netif_port *flc_iif; + uint8_t flc_tos; + uint8_t flc_proto; + uint8_t flc_scope; + uint8_t flc_ttl; + uint32_t flc_mark; + uint32_t flc_flags; +}; + +struct flow4 { + struct flow_common __fl_common; +#define fl4_oif __fl_common.flc_oif +#define fl4_iif __fl_common.flc_iif +#define fl4_tos __fl_common.flc_tos +#define fl4_proto __fl_common.flc_proto +#define fl4_scope __fl_common.flc_scope +#define fl4_ttl __fl_common.flc_ttl +#define fl4_mark __fl_common.flc_mark +#define fl4_flags __fl_common.flc_flags + + struct in_addr fl4_saddr; + struct in_addr fl4_daddr; + + union flow_ul __fl_ul; 
+#define fl4_sport __fl_ul.ports.sport +#define fl4_dport __fl_ul.ports.dport +}; + +struct flow6 { + struct flow_common __fl_common; +#define fl6_oif __fl_common.flc_oif +#define fl6_iif __fl_common.flc_iif +#define fl6_tos __fl_common.flc_tos +#define fl6_proto __fl_common.flc_proto +#define fl6_scope __fl_common.flc_scope +#define fl6_ttl __fl_common.flc_ttl +#define fl6_mark __fl_common.flc_mark +#define fl6_flags __fl_common.flc_flags + + struct in6_addr fl6_daddr; + struct in6_addr fl6_saddr; + __be32 fl6_flow; + + union flow_ul __fl_ul; +#define fl6_sport __fl_ul.ports.sport +#define fl6_dport __fl_ul.ports.dport +}; + +#endif /* __DPVS__ */ +#endif /* __DPVS_FLOW_H__ */ diff --git a/include/icmp6.h b/include/icmp6.h new file mode 100644 index 000000000..993ea0b33 --- /dev/null +++ b/include/icmp6.h @@ -0,0 +1,15 @@ +#ifndef __DPVS_ICMPV6_H__ +#define __DPVS_ICMPV6_H__ + +#include +#include + +#define icmp6h_id(icmp6h) ((icmp6h)->icmp6_dataun.icmp6_un_data16[0]) +void icmp6_send(struct rte_mbuf *imbuf, int type, int code, uint32_t info); +uint16_t icmp6_csum(struct ip6_hdr *iph, struct icmp6_hdr *ich); +void icmp6_send_csum(struct ip6_hdr *shdr, struct icmp6_hdr *ich); + +int icmpv6_init(void); +int icmpv6_term(void); + +#endif /* __DPVS_ICMPV6_H__ */ diff --git a/include/inet.h b/include/inet.h index 9f61e4179..bdb7a1759 100644 --- a/include/inet.h +++ b/include/inet.h @@ -25,6 +25,10 @@ #include #include "common.h" +#define INET_DEF_TTL 64 + +#define INET_MAX_PROTS 256 /* cannot change */ + union inet_addr { struct in_addr in; struct in6_addr in6; @@ -42,35 +46,104 @@ struct inet_addr_range { __be16 max_port; }; +struct inet_stats { + uint64_t inpkts; /* InReceives */ + uint64_t inoctets; /* InOctets */ + uint64_t indelivers; /* InDelivers */ + uint64_t outforwdatagrams; /* OutForwDatagrams */ + uint64_t outpkts; /* OutRequests */ + uint64_t outoctets; /* OutOctets */ + uint64_t inhdrerrors; /* InHdrErrors */ + uint64_t intoobigerrors; /* InTooBigErrors */ 
+ uint64_t innoroutes; /* InNoRoutes */ + uint64_t inaddrerrors; /* InAddrErrors */ + uint64_t inunknownprotos; /* InUnknownProtos */ + uint64_t intruncatedpkts; /* InTruncatedPkts */ + uint64_t indiscards; /* InDiscards */ + uint64_t outdiscards; /* OutDiscards */ + uint64_t outnoroutes; /* OutNoRoutes */ + uint64_t reasmtimeout; /* ReasmTimeout */ + uint64_t reasmreqds; /* ReasmReqds */ + uint64_t reasmoks; /* ReasmOKs */ + uint64_t reasmfails; /* ReasmFails */ + uint64_t fragoks; /* FragOKs */ + uint64_t fragfails; /* FragFails */ + uint64_t fragcreates; /* FragCreates */ + uint64_t inmcastpkts; /* InMcastPkts */ + uint64_t outmcastpkts; /* OutMcastPkts */ + uint64_t inbcastpkts; /* InBcastPkts */ + uint64_t outbcastpkts; /* OutBcastPkts */ + uint64_t inmcastoctets; /* InMcastOctets */ + uint64_t outmcastoctets; /* OutMcastOctets */ + uint64_t inbcastoctets; /* InBcastOctets */ + uint64_t outbcastoctets; /* OutBcastOctets */ + uint64_t csumerrors; /* InCsumErrors */ + uint64_t noectpkts; /* InNoECTPkts */ + uint64_t ect1pkts; /* InECT1Pkts */ + uint64_t ect0pkts; /* InECT0Pkts */ + uint64_t cepkts; /* InCEPkts */ +}; + static inline const char *inet_proto_name(uint8_t proto) { const static char *proto_names[256] = { - [IPPROTO_TCP] = "TCP", - [IPPROTO_UDP] = "UDP", - [IPPROTO_ICMP] = "ICMP", + [IPPROTO_TCP] = "TCP", + [IPPROTO_UDP] = "UDP", + [IPPROTO_ICMP] = "ICMP", + [IPPROTO_ICMPV6] = "ICMPV6", }; return proto_names[proto] ? 
proto_names[proto] : ""; } +static inline uint32_t inet_addr_fold(int af, const union inet_addr *addr) +{ + uint32_t addr_fold = 0; + + if (af == AF_INET) { + addr_fold = addr->in.s_addr; + } else if (af == AF_INET6) { + addr_fold = addr->in6.s6_addr32[0] ^ addr->in6.s6_addr32[1] ^ + addr->in6.s6_addr32[2] ^ addr->in6.s6_addr32[3]; + } else { + return 0; + } + + return addr_fold; +} + /* ip1[-ip2][:port1[-port2]] */ -static inline int inet_addr_range_parse(int af, const char *param, - struct inet_addr_range *range) +static inline int inet_addr_range_parse(const char *param, + struct inet_addr_range *range, + int *af) { - char _param[256], *ips, *ports; + char _param[256], *ips, *ports = NULL; char *ip1, *ip2, *port1, *port2; - if (af != AF_INET) - return EDPVS_NOTSUPP; - if (strlen(param) == 0) return EDPVS_OK; /* return asap */ snprintf(_param, sizeof(_param), "%s", param); - ports = strrchr(_param, ':'); - if (ports) - *ports++ = '\0'; + ips = _param; + if (_param[0] == '[') { + ips++; + ports = strrchr(_param, ']'); + if (ports == NULL) + return EDPVS_INVAL; + *ports++ = '\0'; + if (*ports == ':') + *ports++ = '\0'; + else + return EDPVS_INVAL; + } + + /* judge ipv4 */ + if (strrchr(_param, ':') == strchr(_param, ':')) { + ports = strrchr(_param, ':'); + if (ports) + *ports++ = '\0'; + } ip1 = ips; ip2 = strrchr(ips, '-'); @@ -88,14 +161,25 @@ static inline int inet_addr_range_parse(int af, const char *param, memset(range, 0, sizeof(*range)); - if (strlen(ip1) && inet_pton(AF_INET, ip1, &range->min_addr.in) <= 0) - return EDPVS_INVAL; - - if (ip2 && strlen(ip2)) { - if (inet_pton(AF_INET, ip2, &range->max_addr.in) <= 0) - return EDPVS_INVAL; + if (strlen(ip1) && inet_pton(AF_INET6, ip1, &range->min_addr.in6) > 0) { + if (ip2 && strlen(ip2)) { + if (inet_pton(AF_INET6, ip2, &range->max_addr.in6) <= 0) + return EDPVS_INVAL; + } else { + range->max_addr = range->min_addr; + } + *af = AF_INET6; } else { - range->max_addr = range->min_addr; + if (strlen(ip1) && 
inet_pton(AF_INET, ip1, &range->min_addr.in) <= 0) + return EDPVS_INVAL; + + if (ip2 && strlen(ip2)) { + if (inet_pton(AF_INET, ip2, &range->max_addr.in) <= 0) + return EDPVS_INVAL; + } else { + range->max_addr = range->min_addr; + } + *af = AF_INET; } if (port1 && strlen(port1)) @@ -121,11 +205,105 @@ static inline int inet_addr_range_dump(int af, snprintf(min_port, sizeof(min_port), "%u", ntohs(range->min_port)); snprintf(max_port, sizeof(max_port), "%u", ntohs(range->max_port)); - return snprintf(buf, size, "%s-%s:%s-%s", + if (af == AF_INET) + return snprintf(buf, size, "%s-%s:%s-%s", + min_ip, max_ip, min_port, max_port); + return snprintf(buf, size, "[%s-%s]:%s-%s", min_ip, max_ip, min_port, max_port); } +static inline void inet_stats_dump(const char *title, const char *prefix, + const struct inet_stats *st) +{ + if (!st) + return; + + if (title) + printf("%s\n", title); + + printf("%s%-18s %lu\n", prefix ? : "", "InReceives:", st->inpkts); + printf("%s%-18s %lu\n", prefix ? : "", "InOctets:", st->inoctets); + printf("%s%-18s %lu\n", prefix ? : "", "InDelivers:", st->indelivers); + printf("%s%-18s %lu\n", prefix ? : "", "OutForwDatagrams:", st->outforwdatagrams); + printf("%s%-18s %lu\n", prefix ? : "", "OutRequests:", st->outpkts); + printf("%s%-18s %lu\n", prefix ? : "", "OutOctets:", st->outoctets); + printf("%s%-18s %lu\n", prefix ? : "", "InHdrErrors:", st->inhdrerrors); + printf("%s%-18s %lu\n", prefix ? : "", "InTooBigErrors:", st->intoobigerrors); + printf("%s%-18s %lu\n", prefix ? : "", "InNoRoutes:", st->innoroutes); + printf("%s%-18s %lu\n", prefix ? : "", "InAddrErrors:", st->inaddrerrors); + printf("%s%-18s %lu\n", prefix ? : "", "InUnknownProtos:", st->inunknownprotos); + printf("%s%-18s %lu\n", prefix ? : "", "InTruncatedPkts:", st->intruncatedpkts); + printf("%s%-18s %lu\n", prefix ? : "", "InDiscards:", st->indiscards); + printf("%s%-18s %lu\n", prefix ? : "", "OutDiscards:", st->outdiscards); + printf("%s%-18s %lu\n", prefix ? 
: "", "OutNoRoutes:", st->outnoroutes); + printf("%s%-18s %lu\n", prefix ? : "", "ReasmTimeout:", st->reasmtimeout); + printf("%s%-18s %lu\n", prefix ? : "", "ReasmReqds:", st->reasmreqds); + printf("%s%-18s %lu\n", prefix ? : "", "ReasmOKs:", st->reasmoks); + printf("%s%-18s %lu\n", prefix ? : "", "ReasmFails:", st->reasmfails); + printf("%s%-18s %lu\n", prefix ? : "", "FragOKs:", st->fragoks); + printf("%s%-18s %lu\n", prefix ? : "", "FragFails:", st->fragfails); + printf("%s%-18s %lu\n", prefix ? : "", "FragCreates:", st->fragcreates); + printf("%s%-18s %lu\n", prefix ? : "", "InMcastPkts:", st->inmcastpkts); + printf("%s%-18s %lu\n", prefix ? : "", "OutMcastPkts:", st->outmcastpkts); + printf("%s%-18s %lu\n", prefix ? : "", "InBcastPkts:", st->inbcastpkts); + printf("%s%-18s %lu\n", prefix ? : "", "OutBcastPkts:", st->outbcastpkts); + printf("%s%-18s %lu\n", prefix ? : "", "InMcastOctets:", st->inmcastoctets); + printf("%s%-18s %lu\n", prefix ? : "", "OutMcastOctets:", st->outmcastoctets); + printf("%s%-18s %lu\n", prefix ? : "", "InBcastOctets:", st->inbcastoctets); + printf("%s%-18s %lu\n", prefix ? : "", "OutBcastOctets:", st->outbcastoctets); + printf("%s%-18s %lu\n", prefix ? : "", "InCsumErrors:", st->csumerrors); + printf("%s%-18s %lu\n", prefix ? : "", "InNoECTPkts:", st->noectpkts); + printf("%s%-18s %lu\n", prefix ? : "", "InECT1Pkts:", st->ect1pkts); + printf("%s%-18s %lu\n", prefix ? : "", "InECT0Pkts:", st->ect0pkts); + printf("%s%-18s %lu\n", prefix ? 
: "", "InCEPkts:", st->cepkts); +} + #ifdef __DPVS__ +#include "dpdk.h" +#include "netif.h" +/* + * Inet Hooks + */ +enum { + INET_HOOK_PRE_ROUTING, + INET_HOOK_LOCAL_IN, + INET_HOOK_FORWARD, + INET_HOOK_LOCAL_OUT, + INET_HOOK_POST_ROUTING, + INET_HOOK_NUMHOOKS, +}; + +struct inet_hook_state { + unsigned int hook; +} __rte_cache_aligned; + +enum { + INET_DROP = 0, + INET_ACCEPT, + INET_STOLEN, + INET_REPEAT, + INET_STOP, + INET_VERDICT_NUM, +}; + +typedef int (*inet_hook_fn)(void *priv, struct rte_mbuf *mbuf, + const struct inet_hook_state *state); + +struct inet_hook_ops { + inet_hook_fn hook; + unsigned int hooknum; + int af; + void *priv; + int priority; + + struct list_head list; +}; + +struct netif_port; + +int INET_HOOK(int af, unsigned int hook, struct rte_mbuf *mbuf, + struct netif_port *in, struct netif_port *out, + int (*okfn)(struct rte_mbuf *mbuf)); int inet_init(void); int inet_term(void); @@ -147,11 +325,14 @@ bool inet_addr_same_net(int af, uint8_t plen, const union inet_addr *addr1, const union inet_addr *addr2); -int inet_addr_range_parse(int af, const char *param, - struct inet_addr_range *range); - int inet_addr_range_dump(int af, const struct inet_addr_range *range, char *buf, size_t size); + +int inet_register_hooks(struct inet_hook_ops *reg, size_t n); +int inet_unregister_hooks(struct inet_hook_ops *reg, size_t n); + +void inet_stats_add(struct inet_stats *stats, const struct inet_stats *diff); + #endif /* __DPVS__ */ #endif /* __DPVS_INET_H__ */ diff --git a/include/inetaddr.h b/include/inetaddr.h index 6f78327d4..4e7fce77b 100644 --- a/include/inetaddr.h +++ b/include/inetaddr.h @@ -28,13 +28,24 @@ struct inet_device { struct netif_port *dev; - int af; struct list_head ifa_list; /* inet_ifaddr list */ + struct list_head ifm_list; /* inet_ifmcaddr list*/ rte_atomic32_t ifa_cnt; rte_atomic32_t refcnt; }; -/** +/* + * no timer, release me by inet_ifaddr + */ +struct inet_ifmcaddr { + struct list_head d_list; + struct inet_device *idev; + union 
inet_addr addr; + uint32_t flags; + rte_atomic32_t refcnt; +}; + +/* * do not support peer address now. */ struct inet_ifaddr { @@ -42,6 +53,7 @@ struct inet_ifaddr { struct list_head h_list; /* global hash, key is addr */ struct inet_device *idev; + int af; union inet_addr addr; /* primary address of iface */ uint8_t plen; union inet_addr mask; @@ -99,6 +111,14 @@ static inline void inet_addr_ifa_put(struct inet_ifaddr *ifa) rte_atomic32_dec(&ifa->refcnt); } +bool inet_chk_mcast_addr(int af, struct netif_port *dev, + const union inet_addr *group, + const union inet_addr *src); + +void inet_ifaddr_dad_failure(struct inet_ifaddr *ifa); + +int idev_add_mcast_init(struct netif_port *dev); + int inet_addr_init(void); int inet_addr_term(void); diff --git a/include/ipv4.h b/include/ipv4.h index 534053f11..40f4aeab3 100644 --- a/include/ipv4.h +++ b/include/ipv4.h @@ -19,13 +19,12 @@ #define __DPVS_IPV4_H__ #include #include "common.h" +#include "inet.h" #include "netif.h" #include "route.h" #define IPPROTO_OSPF 89 /* OSPF protocol */ -#define INET_DEF_TTL 64 - int ipv4_init(void); int ipv4_term(void); @@ -62,50 +61,9 @@ enum { IP_DEFRAG_VS_FWD, }; -/* - * Inet Hooks - */ -enum { - INET_HOOK_PRE_ROUTING, - INET_HOOK_LOCAL_IN, - INET_HOOK_FORWARD, - INET_HOOK_LOCAL_OUT, - INET_HOOK_POST_ROUTING, - INET_HOOK_NUMHOOKS, -}; - -struct inet_hook_state { - unsigned int hook; -} __rte_cache_aligned; - -enum { - INET_DROP = 0, - INET_ACCEPT, - INET_STOLEN, - INET_REPEAT, - INET_STOP, - INET_VERDICT_NUM, -}; - -typedef int (*inet_hook_fn)(void *priv, struct rte_mbuf *mbuf, - const struct inet_hook_state *state); - -struct inet_hook_ops { - inet_hook_fn hook; - unsigned int hooknum; - void *priv; - int priority; - - struct list_head list; -}; - int ipv4_register_hooks(struct inet_hook_ops *ops, size_t n); int ipv4_unregister_hooks(struct inet_hook_ops *ops, size_t n); -int INET_HOOK(unsigned int hook, struct rte_mbuf *mbuf, - struct netif_port *in, struct netif_port *out, - int 
(*okfn)(struct rte_mbuf *mbuf)); - /* * Statistics */ @@ -153,44 +111,9 @@ extern rte_spinlock_t ip4_stats_lock; #define IP4_UPD_PO_STATS(field, val) #endif -struct ip4_stats { - uint64_t inpkts; /* InReceives */ - uint64_t inoctets; /* InOctets */ - uint64_t indelivers; /* InDelivers */ - uint64_t outforwdatagrams; /* OutForwDatagrams */ - uint64_t outpkts; /* OutRequests */ - uint64_t outoctets; /* OutOctets */ - uint64_t inhdrerrors; /* InHdrErrors */ - uint64_t intoobigerrors; /* InTooBigErrors */ - uint64_t innoroutes; /* InNoRoutes */ - uint64_t inaddrerrors; /* InAddrErrors */ - uint64_t inunknownprotos; /* InUnknownProtos */ - uint64_t intruncatedpkts; /* InTruncatedPkts */ - uint64_t indiscards; /* InDiscards */ - uint64_t outdiscards; /* OutDiscards */ - uint64_t outnoroutes; /* OutNoRoutes */ - uint64_t reasmtimeout; /* ReasmTimeout */ - uint64_t reasmreqds; /* ReasmReqds */ - uint64_t reasmoks; /* ReasmOKs */ - uint64_t reasmfails; /* ReasmFails */ - uint64_t fragoks; /* FragOKs */ - uint64_t fragfails; /* FragFails */ - uint64_t fragcreates; /* FragCreates */ - uint64_t inmcastpkts; /* InMcastPkts */ - uint64_t outmcastpkts; /* OutMcastPkts */ - uint64_t inbcastpkts; /* InBcastPkts */ - uint64_t outbcastpkts; /* OutBcastPkts */ - uint64_t inmcastoctets; /* InMcastOctets */ - uint64_t outmcastoctets; /* OutMcastOctets */ - uint64_t inbcastoctets; /* InBcastOctets */ - uint64_t outbcastoctets; /* OutBcastOctets */ - uint64_t csumerrors; /* InCsumErrors */ - uint64_t noectpkts; /* InNoECTPkts */ - uint64_t ect1pkts; /* InECT1Pkts */ - uint64_t ect0pkts; /* InECT0Pkts */ - uint64_t cepkts; /* InCEPkts */ -} __rte_cache_aligned; +typedef struct inet_stats ip4_stats; +struct ip4_stats; int ipv4_get_stats(struct ip4_stats *stats); int ip4_defrag(struct rte_mbuf *mbuf, int user); diff --git a/include/ipv6.h b/include/ipv6.h new file mode 100644 index 000000000..7313c6d18 --- /dev/null +++ b/include/ipv6.h @@ -0,0 +1,126 @@ +/* + * DPVS is a software load 
balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2017 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +/** + * IPv6 protocol for "lite stack". + * Linux Kernel net/ipv6/ is referred. + * + * Lei Chen , initial, Jul 2018. + */ + +#ifndef __DPVS_IPV6_H__ +#define __DPVS_IPV6_H__ + +#include +#include "rte_mbuf.h" +#include "linux_ipv6.h" +#include "flow.h" + +#define IPV6 +#define RTE_LOGTYPE_IPV6 RTE_LOGTYPE_USER1 + +/* + * helper functions + */ +static inline struct ip6_hdr *ip6_hdr(const struct rte_mbuf *mbuf) +{ + /* can only be invoked at L3: the start of the mbuf data is read as the IPv6 header */ + return rte_pktmbuf_mtod(mbuf, struct ip6_hdr *); +} + +static inline bool ip6_is_frag(struct ip6_hdr *ip6h) +{ + return (ip6h->ip6_nxt == IPPROTO_FRAGMENT); +} + +enum { + INET6_PROTO_F_NONE = 0x01, + INET6_PROTO_F_FINAL = 0x02, +}; + +/* + * inet6_protocol: + * to process IPv6 upper-layer protocol or ext-header. + * + * @handler + * protocol handler; it consumes the pkt or returns the next-header. + * + * 1. if return > 0, it's always "nexthdr", + * no matter if proto is final or not. + * 2. if return == 0, the pkt is consumed. + * 3. should not return < 0, or it'll be ignored. + * 4. mbuf->l3_len must be updated by the handler + * to the value as ext-header length.
+ * + * @flags: INET6_PROTO_F_XXX + */ +struct inet6_protocol { + int (*handler)(struct rte_mbuf *mbuf); + unsigned int flags; +}; + +int ipv6_init(void); +int ipv6_term(void); + +int ipv6_xmit(struct rte_mbuf *mbuf, struct flow6 *fl6); +int ip6_output(struct rte_mbuf *mbuf); + +int ip6_local_out(struct rte_mbuf *mbuf); + +int ipv6_register_hooks(struct inet_hook_ops *ops, size_t n); +int ipv6_unregister_hooks(struct inet_hook_ops *ops, size_t n); + +int ipv6_register_protocol(struct inet6_protocol *prot, + unsigned char protocol); +int ipv6_unregister_protocol(struct inet6_protocol *prot, + unsigned char protocol); + +int ipv6_stats_cpu(struct inet_stats *stats); + +void install_ipv6_keywords(void); +void ipv6_keyword_value_init(void); + +/* control plane */ +int ipv6_ctrl_init(void); +int ipv6_ctrl_term(void); + +/* extension header and options. */ +int ipv6_exthdrs_init(void); +void ipv6_exthdrs_term(void); +int ipv6_parse_hopopts(struct rte_mbuf *mbuf); +int ip6_skip_exthdr(const struct rte_mbuf *imbuf, int start, + __u8 *nexthdrp); +/* get ipv6 header length, including extension header length. */ +int ip6_hdrlen(const struct rte_mbuf *mbuf); + +/* + * Exthdr supported checksum function for upper layer protocol. + * @param ol_flags + * The ol_flags of the associated mbuf. + * @param exthdrlen + * The IPv6 fixed header length plus the extension header length. + * @param l4_proto + * The L4 protocol type, i.e. IPPROTO_TCP, IPPROTO_UDP, IPPROTO_ICMP + * @return + * The non-complemented checksum to set in the L4 header. 
+ */ +uint16_t ip6_phdr_cksum(struct ip6_hdr*, uint64_t ol_flags, + uint32_t exthdrlen, uint8_t l4_proto); +uint16_t ip6_udptcp_cksum(struct ip6_hdr*, const void *l4_hdr, + uint32_t exthdrlen, uint8_t l4_proto); + +#endif /* __DPVS_IPV6_H__ */ diff --git a/include/ipvs/conn.h b/include/ipvs/conn.h index 957e4f861..28db75daf 100644 --- a/include/ipvs/conn.h +++ b/include/ipvs/conn.h @@ -164,6 +164,7 @@ int dp_vs_conn_term(void); struct dp_vs_conn * dp_vs_conn_new(struct rte_mbuf *mbuf, + const struct dp_vs_iphdr *iph, struct dp_vs_conn_param *param, struct dp_vs_dest *dest, uint32_t flags); diff --git a/include/ipvs/dest.h b/include/ipvs/dest.h index daf08fd03..94a7f6e0c 100644 --- a/include/ipvs/dest.h +++ b/include/ipvs/dest.h @@ -17,9 +17,7 @@ */ #ifndef __DPVS_DEST_H__ #define __DPVS_DEST_H__ -#include "common.h" -#include "list.h" -#include "dpdk.h" + #include "ipvs/service.h" /* must consistent with IP_VS_CONN_F_XXX (libipvs-2.6/ip_vs.h) */ @@ -39,6 +37,10 @@ enum { DPVS_DEST_F_OVERLOAD = 0x1<<1, }; +#ifdef __DPVS__ +#include "common.h" +#include "list.h" +#include "dpdk.h" struct dp_vs_dest { struct list_head n_list; /* for the dests in the service */ @@ -78,34 +80,37 @@ struct dp_vs_dest { unsigned conn_timeout; /* conn timeout copied from svc*/ unsigned limit_proportion; /* limit copied from svc*/ } __rte_cache_aligned; +#endif struct dp_vs_dest_conf { /* destination server address */ - union inet_addr addr; - uint16_t port; + int af; + union inet_addr addr; + uint16_t port; enum dpvs_fwd_mode fwdmode; /* real server options */ - unsigned conn_flags; /* connection flags */ - int weight; /* destination weight */ + unsigned conn_flags; /* connection flags */ + int weight; /* destination weight */ /* thresholds for active connections */ - uint32_t max_conn; /* upper threshold */ - uint32_t min_conn; /* lower threshold */ + uint32_t max_conn; /* upper threshold */ + uint32_t min_conn; /* lower threshold */ }; struct dp_vs_dest_entry { - uint32_t addr; /* 
destination address */ - uint16_t port; - unsigned conn_flags; /* connection flags */ - int weight; /* destination weight */ + int af; + union inet_addr addr; /* destination address */ + uint16_t port; + unsigned conn_flags; /* connection flags */ + int weight; /* destination weight */ - uint32_t max_conn; /* upper threshold */ - uint32_t min_conn; /* lower threshold */ + uint32_t max_conn; /* upper threshold */ + uint32_t min_conn; /* lower threshold */ - uint32_t actconns; /* active connections */ - uint32_t inactconns; /* inactive connections */ - uint32_t persistconns; /* persistent connections */ + uint32_t actconns; /* active connections */ + uint32_t inactconns; /* inactive connections */ + uint32_t persistconns; /* persistent connections */ /* statistics */ struct dp_vs_stats stats; @@ -113,10 +118,11 @@ struct dp_vs_dest_entry { struct dp_vs_get_dests { /* which service: user fills in these */ - uint16_t proto; - uint32_t addr; /* virtual address */ - uint16_t port; - uint32_t fwmark; /* firwall mark of service */ + int af; + uint16_t proto; + union inet_addr addr; /* virtual address */ + uint16_t port; + uint32_t fwmark; /* firwall mark of service */ /* number of real servers */ unsigned int num_dests; @@ -131,17 +137,18 @@ struct dp_vs_get_dests { }; struct dp_vs_dest_user{ - uint32_t addr; - uint16_t port; + int af; + union inet_addr addr; + uint16_t port; - unsigned conn_flags; - int weight; + unsigned conn_flags; + int weight; - uint32_t max_conn; - uint32_t min_conn; + uint32_t max_conn; + uint32_t min_conn; }; - +#ifdef __DPVS__ int dp_vs_new_dest(struct dp_vs_service *svc, struct dp_vs_dest_conf *udest, struct dp_vs_dest **dest_p); @@ -175,5 +182,6 @@ int dp_vs_get_dest_entries(const struct dp_vs_service *svc, int dp_vs_dest_init(void); int dp_vs_dest_term(void); +#endif #endif /* __DPVS_DEST_H__ */ diff --git a/include/ipvs/laddr.h b/include/ipvs/laddr.h index 28d974c47..9dbc830c7 100644 --- a/include/ipvs/laddr.h +++ b/include/ipvs/laddr.h @@ 
-24,9 +24,9 @@ int dp_vs_laddr_bind(struct dp_vs_conn *conn, struct dp_vs_service *svc); int dp_vs_laddr_unbind(struct dp_vs_conn *conn); -int dp_vs_laddr_add(struct dp_vs_service *svc, const union inet_addr *addr, +int dp_vs_laddr_add(struct dp_vs_service *svc, int af, const union inet_addr *addr, const char *ifname); -int dp_vs_laddr_del(struct dp_vs_service *svc, const union inet_addr *addr); +int dp_vs_laddr_del(struct dp_vs_service *svc, int af, const union inet_addr *addr); int dp_vs_laddr_flush(struct dp_vs_service *svc); int dp_vs_laddr_init(void); diff --git a/include/ipvs/proto.h b/include/ipvs/proto.h index 8d0d4b71e..2a364e3c1 100644 --- a/include/ipvs/proto.h +++ b/include/ipvs/proto.h @@ -24,6 +24,8 @@ #include "ipvs/conn.h" struct dp_vs_conn; +#define IPV6_ADDR_LEN_IN_BYTES 16 +#define IPV4_ADDR_LEN_IN_BYTES 4 struct dp_vs_proto { char *name; diff --git a/include/ipvs/proto_tcp.h b/include/ipvs/proto_tcp.h index 6354ef4f0..541f38bd7 100644 --- a/include/ipvs/proto_tcp.h +++ b/include/ipvs/proto_tcp.h @@ -18,6 +18,8 @@ #ifndef __DP_VS_PROTO_TCP_H__ #define __DP_VS_PROTO_TCP_H__ +#include + enum { TCP_OPT_EOL = 0, TCP_OPT_NOP = 1, @@ -31,7 +33,8 @@ enum { #define TCP_OLEN_MSS 4 #define TCP_OLEN_TIMESTAMP 10 -#define TCP_OLEN_ADDR 8 +#define TCP_OLEN_IP4_ADDR 8 +#define TCP_OLEN_IP6_ADDR 20 #define TCP_OLEN_TSTAMP_ALIGNED 12 #define TCP_OLEN_SACK_BASE 2 @@ -44,12 +47,25 @@ enum { (((tm_spec).tv_sec % 100) * 1000000 + \ ((tm_spec).tv_nsec / 1000)) -/* now IPv4 only */ +struct tcpopt_ip4_addr { + uint8_t opcode; + uint8_t opsize; + __be16 port; + struct in_addr addr; +} __attribute__((__packed__)); + +struct tcpopt_ip6_addr { + uint8_t opcode; + uint8_t opsize; + __be16 port; + struct in6_addr addr; +} __attribute__((__packed__)); + struct tcpopt_addr { uint8_t opcode; uint8_t opsize; - uint16_t port; - uint32_t addr; + __be16 port; + uint8_t addr[16]; } __attribute__((__packed__)); enum { @@ -85,6 +101,7 @@ struct tcp_state { struct tcphdr *tcp_hdr(const 
struct rte_mbuf *mbuf); void tcp4_send_csum(struct ipv4_hdr *iph, struct tcphdr *th); +void tcp6_send_csum(struct ipv6_hdr *iph, struct tcphdr *th); struct rte_mempool *get_mbuf_pool(const struct dp_vs_conn *conn, int dir); void install_proto_tcp_keywords(void); void tcp_keyword_value_init(void); diff --git a/include/ipvs/proto_udp.h b/include/ipvs/proto_udp.h index bf76fc605..e407882e8 100644 --- a/include/ipvs/proto_udp.h +++ b/include/ipvs/proto_udp.h @@ -18,9 +18,11 @@ #ifndef __DP_VS_PROTO_UDP_H__ #define __DP_VS_PROTO_UDP_H__ +#include + enum { - DPVS_UDP_S_NORMAL = 0, - DPVS_UDP_S_LAST + DPVS_UDP_S_NORMAL = 0, + DPVS_UDP_S_LAST }; extern int g_defence_udp_drop; @@ -28,4 +30,7 @@ extern int g_defence_udp_drop; void install_proto_udp_keywords(void); void udp_keyword_value_init(void); +void udp4_send_csum(struct ipv4_hdr *iph, struct udphdr *uh); +void udp6_send_csum(struct ipv6_hdr *iph, struct udphdr *uh); + #endif diff --git a/include/ipvs/service.h b/include/ipvs/service.h index 5a2f65c26..34385cb2e 100644 --- a/include/ipvs/service.h +++ b/include/ipvs/service.h @@ -17,16 +17,21 @@ */ #ifndef __DPVS_SVC_H__ #define __DPVS_SVC_H__ + #include #include +#include "match.h" +#include "ipvs/stats.h" +#include "ipvs/dest.h" +#include "inet.h" + +#define DP_VS_SCHEDNAME_MAXLEN 16 + +#ifdef __DPVS__ #include "list.h" #include "dpdk.h" -#include "inet.h" #include "netif.h" -#include "match.h" #include "ipvs/ipvs.h" -#include "ipvs/stats.h" -#include "ipvs/dest.h" #include "ipvs/sched.h" #define RTE_LOGTYPE_SERVICE RTE_LOGTYPE_USER3 @@ -37,8 +42,6 @@ #define DP_VS_SVC_F_SIP_HASH 0x0100 /* sip hash target */ #define DP_VS_SVC_F_QID_HASH 0x0200 /* quic cid hash target */ -#define DP_VS_SCHEDNAME_MAXLEN 16 - rte_rwlock_t __dp_vs_svc_lock; /* virtual service */ @@ -87,7 +90,7 @@ struct dp_vs_service { /* ... flags, timer ... 
*/ } __rte_cache_aligned; - +#endif struct dp_vs_service_conf { /* virtual service addresses */ @@ -109,8 +112,9 @@ struct dp_vs_service_conf { }; struct dp_vs_service_entry { + int af; uint16_t proto; - uint32_t addr; + union inet_addr addr; uint16_t port; uint32_t fwmark; @@ -138,7 +142,34 @@ struct dp_vs_get_services { struct dp_vs_service_entry entrytable[0]; }; +struct dp_vs_service_user{ + int af; + uint16_t proto; + union inet_addr addr; + uint16_t port; + uint32_t fwmark; + + char sched_name[DP_VS_SCHEDNAME_MAXLEN]; + unsigned flags; + unsigned timeout; + unsigned conn_timeout; + uint32_t netmask; + unsigned bps; + unsigned limit_proportion; + + char srange[256]; + char drange[256]; + char iifname[IFNAMSIZ]; + char oifname[IFNAMSIZ]; +}; + +struct dp_vs_getinfo { + unsigned int version; + unsigned int size; + unsigned int num_services; +}; +#ifdef __DPVS__ int dp_vs_service_init(void); int dp_vs_service_term(void); @@ -157,7 +188,7 @@ dp_vs_service_lookup(int af, uint16_t protocol, const struct rte_mbuf *mbuf, const struct dp_vs_match *match); -int dp_vs_match_parse(int af, const char *srange, const char *drange, +int dp_vs_match_parse(const char *srange, const char *drange, const char *iifname, const char *oifname, struct dp_vs_match *match); @@ -190,33 +221,6 @@ int dp_vs_zero_service(struct dp_vs_service *svc); int dp_vs_zero_all(void); - -struct dp_vs_service_user{ - uint16_t proto; - uint32_t addr; - uint16_t port; - uint32_t fwmark; - - char sched_name[DP_VS_SCHEDNAME_MAXLEN]; - unsigned flags; - unsigned timeout; - unsigned conn_timeout; - uint32_t netmask; - unsigned bps; - unsigned limit_proportion; - - char srange[256]; - char drange[256]; - char iifname[IFNAMSIZ]; - char oifname[IFNAMSIZ]; -}; - -struct dp_vs_getinfo { - unsigned int version; - unsigned int size; - unsigned int num_services; -}; - enum{ DPVS_SO_SET_FLUSH = 200, DPVS_SO_SET_ZERO, @@ -246,5 +250,6 @@ enum{ sizeof(struct dp_vs_dest_user)) #define DPVS_WAIT_WHILE(expr) while(expr){;} 
+#endif #endif /* __DPVS_SVC_H__ */ diff --git a/include/ipvs/stats.h b/include/ipvs/stats.h index d131501ca..665ae04f1 100644 --- a/include/ipvs/stats.h +++ b/include/ipvs/stats.h @@ -19,9 +19,6 @@ #define __DPVS_STATS_H__ #include #include "ipvs/service.h" -#include "dpdk.h" - -struct dp_vs_conn; struct dp_vs_stats { uint64_t conns; @@ -37,6 +34,10 @@ struct dp_vs_stats { uint32_t outbps; }; +#ifdef __DPVS__ +#include "dpdk.h" + +struct dp_vs_conn; /* statistics for FULLNAT and SYNPROXY */ enum dp_vs_estats_type { @@ -112,15 +113,6 @@ int dp_vs_new_stats(struct dp_vs_stats **p); void dp_vs_del_stats(struct dp_vs_stats *p); void dp_vs_zero_stats(struct dp_vs_stats* stats); int dp_vs_copy_stats(struct dp_vs_stats* dst, struct dp_vs_stats* src); - -#if 0 -/*rate control code*/ -int dp_vs_rate_init(void); -void dp_vs_rate_cleanup(void); -void dp_vs_new_rate(struct dp_vs_service *svc); -void dp_vs_kill_rate(struct dp_vs_service *svc); -void dp_vs_zero_rate(struct dp_vs_service *svc); -__u64 dp_vs_rate_bps(struct dp_vs_rate *rate); #endif #endif /* __DPVS_STATS_H__ */ diff --git a/include/ipvs/synproxy.h b/include/ipvs/synproxy.h index 59b1e0b25..412e981d7 100644 --- a/include/ipvs/synproxy.h +++ b/include/ipvs/synproxy.h @@ -102,7 +102,7 @@ int dp_vs_synproxy_ack_rcv(int af, struct rte_mbuf *mbuf, /* Syn-proxy step 3 logic: receive rs's Syn/Ack. 
*/ int dp_vs_synproxy_synack_rcv(struct rte_mbuf *mbuf, struct dp_vs_conn *cp, - struct dp_vs_proto *pp, int ihl, int *verdict); + struct dp_vs_proto *pp, int th_offset, int *verdict); /* Syn-proxy conn reuse logic: receive client's Ack */ int dp_vs_synproxy_reuse_conn(int af, struct rte_mbuf *mbuf, diff --git a/include/linux_ipv6.h b/include/linux_ipv6.h new file mode 100644 index 000000000..3c42fbc6f --- /dev/null +++ b/include/linux_ipv6.h @@ -0,0 +1,636 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +/* + * modified from + * linux:include/net/ipv6.h + * linux:net/ipv6/addrconf_core.c + * + * Authors: + * Pedro Roque + */ +#ifndef __LINUX_IPV6_H__ +#define __LINUX_IPV6_H__ +#include +#include +#include +#include "common.h" +#ifdef __DPVS__ +#include "inetaddr.h" +#endif + +#define IPV6_MAXPLEN 65535 +#define IPV6_MIN_MTU 1280 + +/* + * NextHeader field of IPv6 header + */ +#define NEXTHDR_HOP 0 /* Hop-by-hop option header. */ +#define NEXTHDR_TCP 6 /* TCP segment. */ +#define NEXTHDR_UDP 17 /* UDP message. */ +#define NEXTHDR_IPV6 41 /* IPv6 in IPv6 */ +#define NEXTHDR_ROUTING 43 /* Routing header. */ +#define NEXTHDR_FRAGMENT 44 /* Fragmentation/reassembly header. */ +#define NEXTHDR_GRE 47 /* GRE header. */ +#define NEXTHDR_ESP 50 /* Encapsulating security payload. */ +#define NEXTHDR_AUTH 51 /* Authentication header. */ +#define NEXTHDR_ICMP 58 /* ICMP for IPv6. */ +#define NEXTHDR_NONE 59 /* No next header */ +#define NEXTHDR_DEST 60 /* Destination options header.
*/ +#define NEXTHDR_SCTP 132 /* SCTP message. */ +#define NEXTHDR_MOBILITY 135 /* Mobility header. */ + +#define NEXTHDR_MAX 255 + +#define IPV6_DEFAULT_HOPLIMIT 64 +#define IPV6_DEFAULT_MCASTHOPS 1 + +/* + * Addr type + * + * type - unicast | multicast + * scope - local | site | global + * v4 - compat + * v4mapped + * any + * loopback + */ + +#define IPV6_ADDR_ANY 0x0000U + +#define IPV6_ADDR_UNICAST 0x0001U +#define IPV6_ADDR_MULTICAST 0x0002U + +#define IPV6_ADDR_LOOPBACK 0x0010U +#define IPV6_ADDR_LINKLOCAL 0x0020U +#define IPV6_ADDR_SITELOCAL 0x0040U + +#define IPV6_ADDR_COMPATv4 0x0080U + +#define IPV6_ADDR_SCOPE_MASK 0x00f0U + +#define IPV6_ADDR_MAPPED 0x1000U + +#define IPV6_ADDR_RESERVED 0x2000U /* reserved address space */ + +/* + * Addr scopes + */ +#define IPV6_ADDR_MC_SCOPE(a) \ + ((a)->s6_addr[1] & 0x0f) /* nonstandard */ +#define __IPV6_ADDR_SCOPE_INVALID -1 +#define IPV6_ADDR_SCOPE_NODELOCAL 0x01 +#define IPV6_ADDR_SCOPE_LINKLOCAL 0x02 +#define IPV6_ADDR_SCOPE_SITELOCAL 0x05 +#define IPV6_ADDR_SCOPE_ORGLOCAL 0x08 +#define IPV6_ADDR_SCOPE_GLOBAL 0x0e + +/* + * Addr flags + */ +#define IPV6_ADDR_MC_FLAG_TRANSIENT(a) \ + ((a)->s6_addr[1] & 0x10) +#define IPV6_ADDR_MC_FLAG_PREFIX(a) \ + ((a)->s6_addr[1] & 0x20) +#define IPV6_ADDR_MC_FLAG_RENDEZVOUS(a) \ + ((a)->s6_addr[1] & 0x40) + +/* + * choose an appropriate source address (RFC3484) + */ +enum { + IPV6_SADDR_RULE_INIT = 0, + IPV6_SADDR_RULE_LOCAL, + IPV6_SADDR_RULE_SCOPE, + IPV6_SADDR_RULE_PREFERRED, +#ifdef CONFIG_IPV6_MIP6 + IPV6_SADDR_RULE_HOA, +#endif + IPV6_SADDR_RULE_OIF, + IPV6_SADDR_RULE_LABEL, +#ifdef CONFIG_IPV6_PRIVACY + IPV6_SADDR_RULE_PRIVACY, +#endif + IPV6_SADDR_RULE_ORCHID, + IPV6_SADDR_RULE_PREFIX, + IPV6_SADDR_RULE_MAX +}; + +#ifdef __DPVS__ +/* struct help for src select */ +struct ipv6_saddr_score { + int rule; + int addr_type; + struct inet_ifaddr *ifa; + bool scorebits[IPV6_SADDR_RULE_MAX]; + int scopedist; + int matchlen; +}; + +struct ipv6_saddr_dst { + const struct in6_addr 
*addr; + struct inet_device *idev; + int scope; +}; +#endif + +/** + * from linux:net/ipv6/addrconf_core.c + */ +#define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16) + +#define IN6ADDR_LINKLOCAL_ALLNODES_INIT \ + { { { 0xff,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1 } } } +#define IN6ADDR_LINKLOCAL_ALLROUTERS_INIT \ + { { { 0xff,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2 } } } + +static const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; +static const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; + +static inline unsigned int ipv6_addr_scope2type(unsigned int scope) +{ + switch (scope) { + case IPV6_ADDR_SCOPE_NODELOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_NODELOCAL) | + IPV6_ADDR_LOOPBACK); + case IPV6_ADDR_SCOPE_LINKLOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL) | + IPV6_ADDR_LINKLOCAL); + case IPV6_ADDR_SCOPE_SITELOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL) | + IPV6_ADDR_SITELOCAL); + } + return IPV6_ADDR_SCOPE_TYPE(scope); +} + +static inline int __ipv6_addr_type(const struct in6_addr *addr) +{ + __be32 st; + + st = addr->s6_addr32[0]; + + /* Consider all addresses with the first three bits different of + 000 and 111 as unicasts. 
+ */ + if ((st & htonl(0xE0000000)) != htonl(0x00000000) && + (st & htonl(0xE0000000)) != htonl(0xE0000000)) + return (IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); + + if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) { + /* multicast */ + /* addr-select 3.1 */ + return (IPV6_ADDR_MULTICAST | + ipv6_addr_scope2type(IPV6_ADDR_MC_SCOPE(addr))); + } + + if ((st & htonl(0xFFC00000)) == htonl(0xFE800000)) + return (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.1 */ + if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000)) + return (IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL)); /* addr-select 3.1 */ + if ((st & htonl(0xFE000000)) == htonl(0xFC000000)) + return (IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* RFC 4193 */ + + if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) { + if (addr->s6_addr32[2] == 0) { + if (addr->s6_addr32[3] == 0) + return IPV6_ADDR_ANY; + + if (addr->s6_addr32[3] == htonl(0x00000001)) + return (IPV6_ADDR_LOOPBACK | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.4 */ + + return (IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ + } + + if (addr->s6_addr32[2] == htonl(0x0000ffff)) + return (IPV6_ADDR_MAPPED | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ + } + + return (IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.4 */ +} + +static inline int ipv6_addr_type(const struct in6_addr *addr) +{ + return __ipv6_addr_type(addr) & 0xffff; +} + +static inline int ipv6_addr_scope(const struct in6_addr *addr) +{ + return __ipv6_addr_type(addr) & IPV6_ADDR_SCOPE_MASK; +} + +static inline int __ipv6_addr_src_scope(int type) +{ + return (type == IPV6_ADDR_ANY) ? 
__IPV6_ADDR_SCOPE_INVALID : (type >> 16); +} + +static inline int ipv6_addr_src_scope(const struct in6_addr *addr) +{ + return __ipv6_addr_src_scope(__ipv6_addr_type(addr)); +} + +static inline bool __ipv6_addr_needs_scope_id(int type) +{ + return type & IPV6_ADDR_LINKLOCAL || + (type & IPV6_ADDR_MULTICAST && + (type & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL))); +} + +static inline __u32 ipv6_iface_scope_id(const struct in6_addr *addr, int iface) +{ + return __ipv6_addr_needs_scope_id(__ipv6_addr_type(addr)) ? iface : 0; +} + +static inline int ipv6_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2) +{ + return memcmp(a1, a2, sizeof(struct in6_addr)); +} + +static inline bool +ipv6_masked_addr_cmp(const struct in6_addr *a1, const struct in6_addr *m, + const struct in6_addr *a2) +{ + return !!(((a1->s6_addr32[0] ^ a2->s6_addr32[0]) & m->s6_addr32[0]) | + ((a1->s6_addr32[1] ^ a2->s6_addr32[1]) & m->s6_addr32[1]) | + ((a1->s6_addr32[2] ^ a2->s6_addr32[2]) & m->s6_addr32[2]) | + ((a1->s6_addr32[3] ^ a2->s6_addr32[3]) & m->s6_addr32[3])); +} + +static inline void ipv6_addr_prefix(struct in6_addr *pfx, + const struct in6_addr *addr, + int plen) +{ + /* caller must guarantee 0 <= plen <= 128 */ + int o = plen >> 3, + b = plen & 0x7; + + memset(pfx->s6_addr, 0, sizeof(pfx->s6_addr)); + memcpy(pfx->s6_addr, addr, o); + if (b != 0) + pfx->s6_addr[o] = addr->s6_addr[o] & (0xff00 >> b); +} + +static inline void ipv6_addr_prefix_copy(struct in6_addr *addr, + const struct in6_addr *pfx, + int plen) +{ + /* caller must guarantee 0 <= plen <= 128 */ + int o = plen >> 3, + b = plen & 0x7; + + memcpy(addr->s6_addr, pfx, o); + if (b != 0) { + addr->s6_addr[o] &= ~(0xff00 >> b); + addr->s6_addr[o] |= (pfx->s6_addr[o] & (0xff00 >> b)); + } +} + +static inline bool ipv6_addr_equal(const struct in6_addr *a1, + const struct in6_addr *a2) +{ + return ((a1->s6_addr32[0] ^ a2->s6_addr32[0]) | + (a1->s6_addr32[1] ^ a2->s6_addr32[1]) | + (a1->s6_addr32[2] ^ a2->s6_addr32[2]) | + 
(a1->s6_addr32[3] ^ a2->s6_addr32[3])) == 0; +} + +static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, + const struct in6_addr *addr2, + unsigned int prefixlen) +{ + const __be32 *a1 = addr1->s6_addr32; + const __be32 *a2 = addr2->s6_addr32; + unsigned int pdw, pbi; + + /* check complete u32 in prefix */ + pdw = prefixlen >> 5; + if (pdw && memcmp(a1, a2, pdw << 2)) + return false; + + /* check incomplete u32 in prefix */ + pbi = prefixlen & 0x1f; + if (pbi && ((a1[pdw] ^ a2[pdw]) & htonl((0xffffffff) << (32 - pbi)))) + return false; + + return true; +} + +static inline bool ipv6_addr_any(const struct in6_addr *a) +{ + return (a->s6_addr32[0] | a->s6_addr32[1] | + a->s6_addr32[2] | a->s6_addr32[3]) == 0; +} + +static inline bool ipv6_addr_loopback(const struct in6_addr *a) +{ + return (a->s6_addr32[0] | a->s6_addr32[1] | + a->s6_addr32[2] | (a->s6_addr32[3] ^ htonl(1))) == 0; +} + +static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) +{ + return ( + (unsigned long)(a->s6_addr32[0] | a->s6_addr32[1]) | + (unsigned long)(a->s6_addr32[2] ^ + htonl(0x0000ffff))) == 0UL; +} + +static inline bool ipv6_addr_orchid(const struct in6_addr *a) +{ + return (a->s6_addr32[0] & htonl(0xfffffff0)) == htonl(0x20010010); +} + +static inline bool ipv6_addr_is_multicast(const struct in6_addr *addr) +{ + return (addr->s6_addr32[0] & htonl(0xFF000000)) == htonl(0xFF000000); +} + +static inline void ipv6_addr_set(struct in6_addr *addr, + uint32_t w1, uint32_t w2, + uint32_t w3, uint32_t w4) +{ + addr->s6_addr32[0] = w1; + addr->s6_addr32[1] = w2; + addr->s6_addr32[2] = w3; + addr->s6_addr32[3] = w4; +} + +static inline void ipv6_addr_copy(struct in6_addr *a1, + const struct in6_addr *a2) +{ + memcpy(a1, a2, sizeof(struct in6_addr)); +} + +static inline void addrconf_addr_solict_mult(const struct in6_addr *addr, + struct in6_addr *solicited) +{ + ipv6_addr_set(solicited, + htonl(0xFF020000), 0, + htonl(0x1), + htonl(0xFF000000) | addr->s6_addr32[3]); +} + +/* 
net/addrconf.h */ +static inline bool ipv6_addr_is_ll_all_nodes(const struct in6_addr *addr) +{ + return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | + addr->s6_addr32[1] | addr->s6_addr32[2] | + (addr->s6_addr32[3] ^ htonl(0x00000001))) == 0; +} + +static inline bool ipv6_addr_is_ll_all_routers(const struct in6_addr *addr) +{ + return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | + addr->s6_addr32[1] | addr->s6_addr32[2] | + (addr->s6_addr32[3] ^ htonl(0x00000002))) == 0; +} + +static inline bool ipv6_addr_is_isatap(const struct in6_addr *addr) +{ + return (addr->s6_addr32[2] | htonl(0x02000000)) == htonl(0x02005EFE); +} + +static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr) +{ + return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | + addr->s6_addr32[1] | + (addr->s6_addr32[2] ^ htonl(0x00000001)) | + (addr->s6_addr[12] ^ 0xff)) == 0; +} + +static inline int fls(unsigned int x) /* 1-based index of the highest set bit, 0 if x == 0; unsigned so the shifts below cannot overflow a signed int */ +{ + int r = 32; + + if (!x) + return 0; + if (!(x & 0xffff0000u)) { + x <<= 16; + r -= 16; + } + if (!(x & 0xff000000u)) { + x <<= 8; + r -= 8; + } + if (!(x & 0xf0000000u)) { + x <<= 4; + r -= 4; + } + if (!(x & 0xc0000000u)) { + x <<= 2; + r -= 2; + } + if (!(x & 0x80000000u)) { + x <<= 1; + r -= 1; + } + return r; +} + +static inline int __ipv6_addr_diff(const void *token1, const void *token2, int addrlen) +{ + const __be32 *a1 = token1, *a2 = token2; + int i; + + addrlen >>= 2; + + for (i = 0; i < addrlen; i++) { + __be32 xb = a1[i] ^ a2[i]; + if (xb) + return i * 32 + 32 - fls(ntohl(xb)); + } + + /* + * we should *never* get to this point since that + * would mean the addrs are equal + * + * However, we do get to it 8) And exactly, when + * addresses are equal 8) + * + * ip route add 1111::/128 via ... + * ip route add 1111::/64 via ... + * and we are here. + * + * Ideally, this function should stop comparison + * at prefix length. It does not, but it is still OK, + * if returned value is greater than prefix length.
+ * --ANK (980803) + */ + return (addrlen << 5); +} + +static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_addr *a2) +{ + return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr)); +} + +static inline int ipv6_saddr_preferred(int type) +{ + if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4| + IPV6_ADDR_LOOPBACK|IPV6_ADDR_RESERVED)) + return 1; + return 0; +} + +#ifdef __DPVS__ +/*functions below were edited from addrconf.c*/ + +/* + * 1. Prefer same address. (i.e. destination is local machine) + * 2. Prefer appropriate scope. (i.e. smallest scope shared with the destination) + * 3. Avoid deprecated addresses. + * 4. Prefer home addresses. (not support here!) + * 5. Prefer outgoing interface. (i.e. prefer an address on the interface we’re sending out of) + * 6. Prefer matching label. (not support here!) + * 7. Prefer public addresses. (not support here) + * 8. Use longest matching prefix. + */ +static inline int ipv6_get_saddr_eval(struct ipv6_saddr_score *score, + struct ipv6_saddr_dst *dst, + int i) +{ + int ret; + + if (i <= score->rule) { + switch (i) { + case IPV6_SADDR_RULE_SCOPE: + ret = score->scopedist; + break; + case IPV6_SADDR_RULE_PREFIX: + ret = score->matchlen; + break; + default: + ret = score->scorebits[i]; + } + goto out; + } + + switch (i) { + case IPV6_SADDR_RULE_INIT: + /* Rule 0: remember if hiscore is not ready yet */ + ret = !!score->ifa; + break; + case IPV6_SADDR_RULE_LOCAL: + /* Rule 1: Prefer same address */ + ret = ipv6_addr_equal(&score->ifa->addr.in6, dst->addr); + break; + case IPV6_SADDR_RULE_SCOPE: + /* Rule 2: Prefer appropriate scope */ + ret = __ipv6_addr_src_scope(score->addr_type); + if (ret >= dst->scope) + ret = -ret; + else + ret -= 128; + score->scopedist = ret; + break; + case IPV6_SADDR_RULE_PREFERRED: + /* Rule 3: Avoid deprecated and optimistic addresses */ + ret = ipv6_saddr_preferred(score->addr_type) || + !(score->ifa->flags & (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC)); + break; + case 
IPV6_SADDR_RULE_OIF: + /* Rule 5: Prefer outgoing interface */ + ret = (!dst->idev || dst->idev == score->ifa->idev); + break; + case IPV6_SADDR_RULE_ORCHID: + /* Rule 8-: Prefer ORCHID vs ORCHID or + * non-ORCHID vs non-ORCHID + */ + ret = !(ipv6_addr_orchid(&score->ifa->addr.in6) ^ + ipv6_addr_orchid(dst->addr)); + break; + case IPV6_SADDR_RULE_PREFIX: + /* Rule 8: Use longest matching prefix */ + score->matchlen = ret = ipv6_addr_diff(&score->ifa->addr.in6, + dst->addr); + break; + default: + ret = 0; + } + + if (ret) + score->scorebits[i] = 1; + score->rule = i; + +out: + return ret; +} + +/* call me by lock */ +static inline int ipv6_addr_select(struct inet_device *idev, + const union inet_addr *daddr, + union inet_addr *saddr) +{ + struct ipv6_saddr_score scores[2]; + struct ipv6_saddr_score *score = &scores[0], *hiscore = &scores[1]; + struct ipv6_saddr_dst dst; + int dst_type; + struct inet_ifaddr *ifa; + int i; + + dst_type = __ipv6_addr_type(&daddr->in6); + dst.addr = &daddr->in6; + dst.idev = idev; + dst.scope = __ipv6_addr_src_scope(dst_type); + + hiscore->rule = -1; + hiscore->ifa = NULL; + + list_for_each_entry(ifa, &idev->ifa_list, d_list) { + + if (ifa->flags & IFA_F_TENTATIVE) + continue; + + score->ifa = ifa; + score->addr_type = __ipv6_addr_type(&score->ifa->addr.in6); + + if (unlikely(score->addr_type == IPV6_ADDR_ANY || + score->addr_type & IPV6_ADDR_MULTICAST)) + continue; + + score->rule = -1; + memset(score->scorebits, 0, sizeof(bool) * IPV6_SADDR_RULE_MAX); + + for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) { + int minihiscore, miniscore; + + minihiscore = ipv6_get_saddr_eval(hiscore, &dst, i); + miniscore = ipv6_get_saddr_eval(score, &dst, i); + + if (minihiscore > miniscore) { + break; + } else if (minihiscore < miniscore) { + struct ipv6_saddr_score *temscore; + temscore = score; + score = hiscore; + hiscore = temscore; + break; + } + } + } + + if (!hiscore->ifa) + return EDPVS_NOTEXIST; + + *saddr = hiscore->ifa->addr; + return EDPVS_OK; +} 
+#endif + +#endif /* __LINUX_IPV6_H__ */ diff --git a/include/match.h b/include/match.h index 614109a87..f47e08d0f 100644 --- a/include/match.h +++ b/include/match.h @@ -27,8 +27,8 @@ #include "inet.h" struct dp_vs_match { - /* TODO: add af, proto, ... */ - + /* TODO: add proto, ... */ + int af; /* range is more flexible than prefix. */ struct inet_addr_range srange; /* source range */ struct inet_addr_range drange; /* dest range */ @@ -61,13 +61,13 @@ static inline int parse_match(const char *pattern, uint8_t *proto, } else if (strncmp(tok, "from=", strlen("from=")) == 0) { tok += strlen("from="); - err = inet_addr_range_parse(AF_INET, tok, &match->srange); + err = inet_addr_range_parse(tok, &match->srange, &match->af); if (err != EDPVS_OK) return err; } else if (strncmp(tok, "to=", strlen("to=")) == 0) { tok += strlen("to="); - err = inet_addr_range_parse(AF_INET, tok, &match->drange); + err = inet_addr_range_parse(tok, &match->drange, &match->af); if (err != EDPVS_OK) return err; } else if (strncmp(tok, "iif=", strlen("iif=")) == 0) { @@ -99,13 +99,13 @@ static inline char *dump_match(uint8_t proto, const struct dp_vs_match *match, if (memcmp(&match->srange, &zero_range, sizeof(zero_range)) != 0) { left -= snprintf(buf + strlen(buf), left, ",from="); - left -= inet_addr_range_dump(AF_INET, &match->srange, + left -= inet_addr_range_dump(match->af, &match->srange, buf + strlen(buf), left); } if (memcmp(&match->drange, &zero_range, sizeof(zero_range)) != 0) { left -= snprintf(buf + strlen(buf), left, ",to="); - left -= inet_addr_range_dump(AF_INET, &match->drange, + left -= inet_addr_range_dump(match->af, &match->drange, buf + strlen(buf), left); } diff --git a/include/ndisc.h b/include/ndisc.h new file mode 100644 index 000000000..6b0ee9968 --- /dev/null +++ b/include/ndisc.h @@ -0,0 +1,32 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2017 iQIYI (www.iqiyi.com). + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef __DPVS_NDISC_H__ +#define __DPVS_NDISC_H__ + +#include "neigh.h" + +int ndisc_rcv(struct rte_mbuf *mbuf, + struct netif_port *dev); + +void ndisc_send_dad(struct netif_port *dev, + const struct in6_addr* solicit); + +void ndisc_solicit(struct neighbour_entry *neigh, + const struct in6_addr *saddr); + +#endif /* __DPVS_NDISC_H__ */ diff --git a/include/neigh.h b/include/neigh.h index d81d6a691..a7814dcc4 100644 --- a/include/neigh.h +++ b/include/neigh.h @@ -22,6 +22,9 @@ #include #include #include +#include +#include +#include #include #include @@ -40,16 +43,71 @@ #include "list.h" #include "timer.h" #include "netif.h" +#include "linux_ipv6.h" + +#define RTE_LOGTYPE_NEIGHBOUR RTE_LOGTYPE_USER2 +#define NEIGH_TAB_BITS 8 +#define NEIGH_TAB_SIZE (1 << NEIGH_TAB_BITS) +#define NEIGH_TAB_MASK (NEIGH_TAB_SIZE - 1) + +struct neighbour_entry { + int af; + struct list_head neigh_list; + union inet_addr ip_addr; + struct ether_addr eth_addr; + struct netif_port *port; + struct dpvs_timer timer; + struct list_head queue_list; + uint32_t que_num; + uint32_t state; + uint32_t ts; + uint8_t flag; +} __rte_cache_aligned; + +enum param_kind { + NEIGH_ENTRY, + NEIGH_PARAM +}; + +/* + * no matter which kind of ip_addr, just use 32 bit to hash + * since neighbour table is not a large table + */ +static inline unsigned int neigh_hashkey(int af, + const union inet_addr *ip_addr, + struct netif_port *port) { + return rte_be_to_cpu_32(inet_addr_fold(af, ip_addr)) \ + 
& NEIGH_TAB_MASK; +} + +void neigh_entry_state_trans(struct neighbour_entry *neighbour, int idx); + +struct neighbour_entry *neigh_lookup_entry(int af, const union inet_addr *key, + const struct netif_port *port, + unsigned int hashkey); + +void neigh_send_mbuf_cach(struct neighbour_entry *neighbour); + +int neigh_edit(struct neighbour_entry *neighbour, + struct ether_addr *eth_addr); int neigh_init(void); -int neigh_term(void); -#define RTE_LOGTYPE_NEIGHBOUR RTE_LOGTYPE_USER2 +int neigh_term(void); void neigh_keyword_value_init(void); + void install_neighbor_keywords(void); -int neigh_resolve_output(struct in_addr *nexhop, struct rte_mbuf *mbuf, struct netif_port *port); +int neigh_output(int af, + union inet_addr *nexhop, + struct rte_mbuf *mbuf, + struct netif_port *port); + +struct neighbour_entry *neigh_add_table(int af, const union inet_addr *ipaddr, + const struct ether_addr *eth_addr, + struct netif_port *port, + unsigned int hashkey, int flag); int neigh_gratuitous_arp(struct in_addr *src, struct netif_port *port); @@ -57,7 +115,18 @@ int neigh_resolve_input(struct rte_mbuf *mbuf, struct netif_port *port); void neigh_process_ring(void *arg); -void neigh_confirm(struct in_addr nexthop, struct netif_port *port); +void neigh_confirm(int af, union inet_addr *nexthop, struct netif_port *port); + +int neigh_sync_core(const void *param, bool add_del, enum param_kind kind); + +static inline void ipv6_mac_mult(const struct in6_addr *mult_target, + struct ether_addr *mult_eth) +{ + uint8_t *w = (uint8_t *)mult_eth; + w[0] = 0x33; + w[1] = 0x33; + rte_memcpy(&w[2], &mult_target->s6_addr32[3], 4); +} /* ethSwap(u16_t * to, u16_t * from) - Swap two 16 bit values */ static __inline__ void diff --git a/include/netif.h b/include/netif.h index ad1308dcc..df87389fb 100644 --- a/include/netif.h +++ b/include/netif.h @@ -17,6 +17,7 @@ */ #ifndef __DPVS_NETIF_H__ #define __DPVS_NETIF_H__ +#include #include "list.h" #include "dpdk.h" #include "inetaddr.h" @@ -168,7 +169,6 @@ 
typedef enum { } eth_type_t; /************************ data type for NIC ****************************/ -#define DEVICE_NAME_MAX_LEN 32 typedef enum { PORT_TYPE_GENERAL, PORT_TYPE_BOND_MASTER, @@ -179,7 +179,7 @@ typedef enum { } port_type_t; struct netif_kni { - char name[DEVICE_NAME_MAX_LEN]; + char name[IFNAMSIZ]; struct rte_kni *kni; struct ether_addr addr; struct dpvs_timer kni_rtnl_timer; @@ -240,7 +240,7 @@ struct netif_hw_addr_list { }; struct netif_port { - char name[DEVICE_NAME_MAX_LEN]; /* device name */ + char name[IFNAMSIZ]; /* device name */ portid_t id; /* device id */ port_type_t type; /* device type */ uint16_t flag; /* device flag */ @@ -261,7 +261,6 @@ struct netif_port { struct list_head list; /* device list node hashed by id */ struct list_head nlist; /* device list node hashed by name */ struct inet_device *in_ptr; - struct inet_device *in6_ptr; struct netif_kni kni; /* kni device */ union netif_bond *bond; /* bonding conf */ struct vlan_info *vlan_info; /* VLANs info for real device */ @@ -292,6 +291,8 @@ int netif_unregister_pkt(struct pkt_type *pt); /**************************** port API ******************************/ int netif_fdir_filter_set(struct netif_port *port, enum rte_filter_op opcode, const struct rte_eth_fdir_filter *fdir_flt); +void netif_mask_fdir_filter(int af, const struct netif_port *port, + struct rte_eth_fdir_filter *filt); struct netif_port* netif_port_get(portid_t id); /* port_conf can be NULL for default port configure */ int netif_print_port_conf(const struct rte_eth_conf *port_conf, char *buf, int *len); diff --git a/include/route.h b/include/route.h index 20886fbfa..d78ca4b72 100644 --- a/include/route.h +++ b/include/route.h @@ -25,6 +25,7 @@ #include "list.h" #include "netif.h" #include "common.h" +#include "flow.h" struct route_entry { uint8_t netmask; @@ -39,38 +40,6 @@ struct route_entry { rte_atomic32_t refcnt; }; -struct flow4 { - struct in_addr saddr; - struct in_addr daddr; - uint16_t sport; - uint16_t 
dport; - struct netif_port *oif; - struct netif_port *iif; - uint8_t tos; - uint8_t proto; - uint8_t scope; - uint8_t ttl; - uint32_t mark; - uint32_t flags; -}; - -#define RTF_UP 0x0001 /* route usable */ -#define RTF_GATEWAY 0x0002 /* destination is a gateway */ -#define RTF_HOST 0x0004 /* host entry (net otherwise) */ -#define RTF_REINSTATE 0x0008 /* reinstate route after tmout */ -#define RTF_DYNAMIC 0x0010 /* created dyn. (by redirect) */ -#define RTF_MODIFIED 0x0020 /* modified dyn. (by redirect) */ -#define RTF_MTU 0x0040 /* specific MTU for this route */ -#define RTF_MSS RTF_MTU /* Compatibility :-( */ -#define RTF_WINDOW 0x0080 /* per route window clamping */ -#define RTF_IRTT 0x0100 /* Initial round trip time */ -#define RTF_REJECT 0x0200 /* Reject route */ - -#define RTF_FORWARD 0x0400 -#define RTF_LOCALIN 0x0800 -#define RTF_DEFAULT 0x1000 -#define RTF_KNI 0X2000 - struct route_entry *route4_local(uint32_t src, struct netif_port *port); struct route_entry *route_out_local_lookup(uint32_t dest); diff --git a/include/route6.h b/include/route6.h new file mode 100644 index 000000000..cfe7c576d --- /dev/null +++ b/include/route6.h @@ -0,0 +1,132 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2018 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +/** + * IPv6 route. 
+ */ +#ifndef __DPVS_ROUTE6_H__ +#define __DPVS_ROUTE6_H__ + +#include +#include "flow.h" +#include "conf/route6.h" + +//#define DPVS_ROUTE6_DEBUG +#define RTE_LOGTYPE_RT6 RTE_LOGTYPE_USER1 +#define RT6_METHOD_NAME_SZ 32 + +struct route6 { + struct rt6_prefix rt6_dst; + struct rt6_prefix rt6_src; + struct rt6_prefix rt6_prefsrc; + struct in6_addr rt6_gateway; + struct netif_port *rt6_dev; + uint32_t rt6_mtu; + uint32_t rt6_flags; /* RTF_XXX */ + + /* private members */ + uint32_t arr_idx; /* lpm6 array index */ + struct list_head hnode; /* hash list node */ + rte_atomic32_t refcnt; +}; + +struct route6 *route6_input(const struct rte_mbuf *mbuf, struct flow6 *fl6); +struct route6 *route6_output(const struct rte_mbuf *mbuf, struct flow6 *fl6); +int route6_get(struct route6 *rt); +int route6_put(struct route6 *rt); + +int route6_init(void); +int route6_term(void); + +int route6_add(const struct in6_addr *dest, int plen, uint32_t flags, + const struct in6_addr *gw, struct netif_port *dev, + const struct in6_addr *src, uint32_t mtu); + +int route6_del(const struct in6_addr *dest, int plen, uint32_t flags, + const struct in6_addr *gw, struct netif_port *dev, + const struct in6_addr *src, uint32_t mtu); + +/* for route6_xxx.c only */ +void route6_free(struct route6*); + +static inline int dump_rt6_prefix(const struct rt6_prefix *rt6_p, char *buf, int len) +{ + size_t rlen; + + if (!inet_ntop(AF_INET6, &rt6_p->addr, buf, len)) + return 0; + + rlen = strlen(buf); + rlen += snprintf(buf+rlen, len-rlen, "/%d", rt6_p->plen); + + return rlen; +} + +struct route6_method { + char name[RT6_METHOD_NAME_SZ]; + struct list_head lnode; + int (*rt6_setup_lcore)(void *); + int (*rt6_destroy_lcore)(void *); + uint32_t (*rt6_count)(void); + int (*rt6_add_lcore)(const struct dp_vs_route6_conf *); + int (*rt6_del_lcore)(const struct dp_vs_route6_conf *); + struct route6* (*rt6_get)(const struct dp_vs_route6_conf *); + struct route6* (*rt6_input)(const struct rte_mbuf *, struct flow6 *); + 
struct route6* (*rt6_output)(const struct rte_mbuf *, struct flow6 *); + struct dp_vs_route6_conf_array* (*rt6_dump)( + const struct dp_vs_route6_conf *rt6_cfg, + size_t *nbytes); +}; + +int route6_method_register(struct route6_method *rt6_mtd); +int route6_method_unregister(struct route6_method *rt6_mtd); + +static inline void rt6_fill_with_cfg(struct route6 *rt6, + const struct dp_vs_route6_conf *cf) +{ + memset(rt6, 0, sizeof(struct route6)); + + rt6->rt6_dst = cf->dst; + rt6->rt6_src = cf->src; + rt6->rt6_prefsrc = cf->prefsrc; + rt6->rt6_dev = netif_port_get_by_name(cf->ifname); + rt6->rt6_gateway = cf->gateway; + rt6->rt6_flags = cf->flags; + rt6->rt6_mtu = cf->mtu; + if (!cf->mtu && rt6->rt6_dev) + rt6->rt6_mtu = rt6->rt6_dev->mtu; +} + +static inline void rt6_fill_cfg(struct dp_vs_route6_conf *cf, + const struct route6 *rt6) +{ + memset(cf, 0, sizeof(struct dp_vs_route6_conf)); + /* cf is zeroed above: ifname stays "" without a device and keeps its final NUL otherwise */ + cf->dst = rt6->rt6_dst; + cf->src = rt6->rt6_src; + cf->prefsrc = rt6->rt6_prefsrc; + if (rt6->rt6_dev) /* may be NULL; rt6_fill_with_cfg() guards against that as well */ + strncpy(cf->ifname, rt6->rt6_dev->name, sizeof(cf->ifname) - 1); + cf->gateway = rt6->rt6_gateway; + cf->mtu = rt6->rt6_mtu; + cf->flags = rt6->rt6_flags; +} + +void install_route6_keywords(void); +void route6_keyword_value_init(void); + +#endif /* __DPVS_ROUTE6_H__ */ diff --git a/include/route6_hlist.h b/include/route6_hlist.h new file mode 100644 index 000000000..abdaa7c49 --- /dev/null +++ b/include/route6_hlist.h @@ -0,0 +1,24 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2018 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef __DPVS_ROUTE6_HLIST_H__ +#define __DPVS_ROUTE6_HLIST_H__ + +int route6_hlist_init(void); +int route6_hlist_term(void); + +#endif /* __DPVS_ROUTE6_HLIST_H__ */ diff --git a/include/route6_lpm.h b/include/route6_lpm.h new file mode 100644 index 000000000..47c75b76e --- /dev/null +++ b/include/route6_lpm.h @@ -0,0 +1,27 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2018 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef __DPVS_ROUTE6_LPM_H__ +#define __DPVS_ROUTE6_LPM_H__ + +int route6_lpm_init(void); +int route6_lpm_term(void); + +void route6_lpm_keyword_value_init(void); +void install_rt6_lpm_keywords(void); + +#endif /* __DPVS_ROUTE6_LPM_H__ */ diff --git a/include/sa_pool.h b/include/sa_pool.h index b1991c67a..7a136aebb 100644 --- a/include/sa_pool.h +++ b/include/sa_pool.h @@ -21,7 +21,7 @@ * for multi-core app, the traffic comes back of local initiated * connection need reach original CPU core. there are several * ways to achieve the goal. one is to calc RSS the same way of - * NIC to select the currect CPU for connect. + * NIC to select the correct CPU for connect. 
* * the way we use is based on Flow-Director (fdir), allocate * local source (e.g., ) for each CPU core in advance. @@ -59,11 +59,13 @@ int sa_pool_destroy(struct inet_ifaddr *ifa); * @dev and @daddr is optional, * note: if @daddr is used, it must be the same for sa_fetch and sa_release. */ -int sa_fetch(struct netif_port *dev, const struct sockaddr_in *daddr, - struct sockaddr_in *saddr); +int sa_fetch(int af, struct netif_port *dev, + const struct sockaddr_storage *daddr, + struct sockaddr_storage *saddr); -int sa_release(const struct netif_port *dev, const struct sockaddr_in *daddr, - const struct sockaddr_in *saddr); +int sa_release(const struct netif_port *dev, + const struct sockaddr_storage *daddr, + const struct sockaddr_storage *saddr); int sa_pool_stats(const struct inet_ifaddr *ifa, struct sa_pool_stats *stats); diff --git a/include/uoa.h b/include/uoa.h index f587f6323..6236fe054 100644 --- a/include/uoa.h +++ b/include/uoa.h @@ -32,15 +32,23 @@ #endif /* avoid IANA ip options */ -#define IPOPT_UOA (31|IPOPT_CONTROL) -#define IPOLEN_UOA sizeof(struct ipopt_uoa) +#define IPOPT_UOA (31 | IPOPT_CONTROL) +#define IPOLEN_UOA_IPV4 (sizeof(struct ipopt_uoa) + 4) +#define IPOLEN_UOA_IPV6 (sizeof(struct ipopt_uoa) + 16) -/* UOA IP option */ +/* + * UOA IP option + * @op_code: operation code + * @op_len: length of struct ipopt_uoa + real op_addr (v4/v6) length + * i.e. IPOLEN_UOA_IPV4 or IPOLEN_UOA_IPV6 + * @op_port: port number + * @op_addr: real ipv4 or ipv6 address following it + */ struct ipopt_uoa { __u8 op_code; __u8 op_len; __be16 op_port; - __be32 op_addr; + __u8 op_addr[0]; } __attribute__((__packed__)); /* per-cpu statistics */ @@ -124,8 +132,9 @@ struct uoa_param_map { * : Options : * +---------------+---------------+---------------+--------------+ * - * Ver. Version, now 0x1 (1). - * Rsvd. Reserved bits, must be zero. + * Ver. Version, now 0x1 (1) for ipv4 address family, OPPHDR_IPV4 + * 0x2 (2) for ipv6 address family, OPPHDR_IPV6 + * Rsvd.
Reserved bits, must be zero. * Protocol Next level protocol, e.g., IPPROTO_UDP. * Length Length of fixed header and options, not include payloads. * Options Compatible with IPv4 options, including IPOPT_UOA. @@ -133,6 +142,9 @@ struct uoa_param_map { #define IPPROTO_OPT 0xf8 /* 248 */ +#define OPPHDR_IPV6 0x02 +#define OPPHDR_IPV4 0x01 + /* OPtion Protocol header */ struct opphdr { #if defined(__LITTLE_ENDIAN_BITFIELD) || (__BYTE_ORDER == __LITTLE_ENDIAN) diff --git a/kmod/toa/Makefile b/kmod/toa/Makefile new file mode 100644 index 000000000..0a091b6ba --- /dev/null +++ b/kmod/toa/Makefile @@ -0,0 +1,26 @@ +obj-m += toa.o + +ifeq ($(KERNDIR), ) +KDIR := /lib/modules/$(shell uname -r)/build +else +KDIR := $(KERNDIR) +endif +PWD := $(shell pwd) + +ccflags-y := -DTOA_IPV6_ENABLE + +ifeq ($(DEBUG), 1) +ccflags-y += -g -O0 +endif + +all: + $(MAKE) -C $(KDIR) M=$(PWD) modules + +clean: + $(MAKE) -C $(KDIR) M=$(PWD) modules clean + +install: + if [ -d "$(INSDIR)" ]; then \ + install -m 664 toa.ko $(INSDIR)/toa.ko; \ + fi + diff --git a/kmod/toa/toa.c b/kmod/toa/toa.c new file mode 100644 index 000000000..63c0709ac --- /dev/null +++ b/kmod/toa/toa.c @@ -0,0 +1,575 @@ +#include "toa.h" + +/* + * TOA: Address is a new TCP Option + * Address include ip+port, Now support IPV4 and IPV6 + */ + +unsigned long sk_data_ready_addr = 0; + +#define TOA_NIPQUAD_FMT "%u.%u.%u.%u" + +#define TOA_NIPQUAD(addr) \ + ((unsigned char *)&addr)[0], \ + ((unsigned char *)&addr)[1], \ + ((unsigned char *)&addr)[2], \ + ((unsigned char *)&addr)[3] + +#ifdef TOA_IPV6_ENABLE +#define TOA_NIP6_FMT "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x" + +#define TOA_NIP6(addr) \ + ntohs((addr).s6_addr16[0]), \ + ntohs((addr).s6_addr16[1]), \ + ntohs((addr).s6_addr16[2]), \ + ntohs((addr).s6_addr16[3]), \ + ntohs((addr).s6_addr16[4]), \ + ntohs((addr).s6_addr16[5]), \ + ntohs((addr).s6_addr16[6]), \ + ntohs((addr).s6_addr16[7]) + +static struct proto_ops *inet6_stream_ops_p = NULL; +static struct 
inet_connection_sock_af_ops *ipv6_specific_p = NULL; + +typedef struct sock *(*syn_recv_sock_func_pt)( + struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst); +static syn_recv_sock_func_pt tcp_v6_syn_recv_sock_org_pt = NULL; +#endif + +/* + * Statistics of toa in proc /proc/net/toa_stats + */ + +struct toa_stats_entry toa_stats[] = { + TOA_STAT_ITEM("syn_recv_sock_toa", SYN_RECV_SOCK_TOA_CNT), + TOA_STAT_ITEM("syn_recv_sock_no_toa", SYN_RECV_SOCK_NO_TOA_CNT), + TOA_STAT_ITEM("getname_toa_ok", GETNAME_TOA_OK_CNT), + TOA_STAT_ITEM("getname_toa_mismatch", GETNAME_TOA_MISMATCH_CNT), + TOA_STAT_ITEM("getname_toa_bypass", GETNAME_TOA_BYPASS_CNT), + TOA_STAT_ITEM("getname_toa_empty", GETNAME_TOA_EMPTY_CNT), +#ifdef TOA_IPV6_ENABLE + TOA_STAT_ITEM("ip6_address_alloc", IP6_ADDR_ALLOC_CNT), + TOA_STAT_ITEM("ip6_address_free", IP6_ADDR_FREE_CNT), +#endif + TOA_STAT_END +}; + +DEFINE_TOA_STAT(struct toa_stat_mib, ext_stats); + +/* + * Funcs for toa hooks + */ + +/* Parse TCP options in skb, try to get client ip, port + * @param skb [in] received skb, it should be a ack/get-ack packet. + * @return NULL if we don't get client ip/port; + * value of toa_data in ret_ptr if we get client ip/port. 
+ */ +static void *get_toa_data(int af, struct sk_buff *skb) +{ + struct tcphdr *th; + int length; + unsigned char *ptr; + + TOA_DBG("get_toa_data called\n"); + + if (NULL != skb) { + th = tcp_hdr(skb); + length = (th->doff * 4) - sizeof(struct tcphdr); + ptr = (unsigned char *) (th + 1); + + while (length > 0) { + int opcode = *ptr++; + int opsize; + switch (opcode) { + case TCPOPT_EOL: + return NULL; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return NULL; + if (opsize > length) + /* don't parse partial options */ + return NULL; + if (TCPOPT_TOA == opcode && + TCPOLEN_IP4_TOA == opsize) { + + struct toa_ip4_data tdata; + void *ret_ptr = NULL; + + memcpy(&tdata, ptr - 2, sizeof(tdata)); + TOA_DBG("af = %d, find toa data: ip = " + TOA_NIPQUAD_FMT", port = %u\n", + af, + TOA_NIPQUAD(tdata.ip), + ntohs(tdata.port)); + if (af == AF_INET) { + memcpy(&ret_ptr, &tdata, + sizeof(ret_ptr)); /* IPv4: the option bytes are packed into the pointer value itself, nothing is allocated */ + TOA_DBG("coded ip4 toa data: %p\n", + ret_ptr); + return ret_ptr; + } +#ifdef TOA_IPV6_ENABLE + else if (af == AF_INET6) { /* IPv4 TOA option seen for AF_INET6: stored below as a v4-mapped address */ + struct toa_ip6_data *ptr_toa_ip6 = + kmalloc(sizeof(struct toa_ip6_data), GFP_ATOMIC); + if (!ptr_toa_ip6) { + return NULL; + } + ptr_toa_ip6->opcode = opcode; + ptr_toa_ip6->opsize = TCPOLEN_IP6_TOA; + ipv6_addr_set(&ptr_toa_ip6->in6_addr, 0, 0, htonl(0x0000FFFF), tdata.ip); + ptr_toa_ip6->port = tdata.port; /* kmalloc() does not zero memory: the client port must be copied explicitly */ + TOA_DBG("coded ip6 toa data: %p\n", + ptr_toa_ip6); + TOA_INC_STATS(ext_stats, IP6_ADDR_ALLOC_CNT); + return ptr_toa_ip6; + } +#endif + } + +#ifdef TOA_IPV6_ENABLE + if (TCPOPT_TOA == opcode && + TCPOLEN_IP6_TOA == opsize && + af == AF_INET6) { + struct toa_ip6_data *ptr_toa_ip6 = + kmalloc(sizeof(struct toa_ip6_data), GFP_ATOMIC); + if (!ptr_toa_ip6) { + return NULL; + } + memcpy(ptr_toa_ip6, ptr - 2, sizeof(struct toa_ip6_data)); + + TOA_DBG("find toa_v6 data : ip = " + TOA_NIP6_FMT", port = %u," + " coded ip6 toa data: %p\n", + TOA_NIP6(ptr_toa_ip6->in6_addr), + ntohs(ptr_toa_ip6->port), +
ptr_toa_ip6); + TOA_INC_STATS(ext_stats, IP6_ADDR_ALLOC_CNT); + return ptr_toa_ip6; + } +#endif + ptr += opsize - 2; + length -= opsize; + } + } + } + return NULL; +} + +/* get client ip from socket + * @param sock [in] the socket to getpeername() or getsockname() + * @param uaddr [out] the place to put client ip, port + * @param uaddr_len [out] lenth of @uaddr + * @peer [in] if(peer), try to get remote address; if(!peer), + * try to get local address + * @return return what the original inet_getname() returns. + */ +static int +inet_getname_toa(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + int retval = 0; + struct sock *sk = sock->sk; + struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; + struct toa_ip4_data tdata; + + TOA_DBG("inet_getname_toa called, sk->sk_user_data is %p\n", + sk->sk_user_data); + + /* call orginal one */ + retval = inet_getname(sock, uaddr, uaddr_len, peer); + + /* set our value if need */ + if (retval == 0 && NULL != sk->sk_user_data && peer) { + if (sk_data_ready_addr == (unsigned long) sk->sk_data_ready) { + memcpy(&tdata, &sk->sk_user_data, sizeof(tdata)); + if (TCPOPT_TOA == tdata.opcode && + TCPOLEN_IP4_TOA == tdata.opsize) { + TOA_INC_STATS(ext_stats, GETNAME_TOA_OK_CNT); + TOA_DBG("inet_getname_toa: set new sockaddr, ip " + TOA_NIPQUAD_FMT" -> "TOA_NIPQUAD_FMT + ", port %u -> %u\n", + TOA_NIPQUAD(sin->sin_addr.s_addr), + TOA_NIPQUAD(tdata.ip), ntohs(sin->sin_port), + ntohs(tdata.port)); + sin->sin_port = tdata.port; + sin->sin_addr.s_addr = tdata.ip; + } else { /* sk_user_data doesn't belong to us */ + TOA_INC_STATS(ext_stats, + GETNAME_TOA_MISMATCH_CNT); + TOA_DBG("inet_getname_toa: invalid toa data, " + "ip "TOA_NIPQUAD_FMT" port %u opcode %u " + "opsize %u\n", + TOA_NIPQUAD(tdata.ip), ntohs(tdata.port), + tdata.opcode, tdata.opsize); + } + } else { + TOA_INC_STATS(ext_stats, GETNAME_TOA_BYPASS_CNT); + } + } else { /* no need to get client ip */ + TOA_INC_STATS(ext_stats, GETNAME_TOA_EMPTY_CNT); 
+ } + + return retval; +} + +#ifdef TOA_IPV6_ENABLE +static int +inet6_getname_toa(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + int retval = 0; + struct sock *sk = sock->sk; + struct sockaddr_in6 *sin = (struct sockaddr_in6 *) uaddr; + struct toa_ip6_data* t_ip6_data_ptr; + + TOA_DBG("inet6_getname_toa called, sk->sk_user_data is %p\n", + sk->sk_user_data); + + /* call orginal one */ + retval = inet6_getname(sock, uaddr, uaddr_len, peer); + + /* set our value if need */ + if (retval == 0 && NULL != sk->sk_user_data && peer) { + if (sk_data_ready_addr == (unsigned long) sk->sk_data_ready) { + t_ip6_data_ptr = sk->sk_user_data; + if (TCPOPT_TOA == t_ip6_data_ptr->opcode && + TCPOLEN_IP6_TOA == t_ip6_data_ptr->opsize) { + TOA_INC_STATS(ext_stats, GETNAME_TOA_OK_CNT); + TOA_DBG("inet6_getname_toa: set new sockaddr, ip " + TOA_NIP6_FMT" -> "TOA_NIP6_FMT + ", port %u -> %u\n", + TOA_NIP6(sin->sin6_addr), + TOA_NIP6(t_ip6_data_ptr->in6_addr), + ntohs(sin->sin6_port), + ntohs(t_ip6_data_ptr->port)); + sin->sin6_port = t_ip6_data_ptr->port; + sin->sin6_addr = t_ip6_data_ptr->in6_addr; + } else { /* sk_user_data doesn't belong to us */ + TOA_INC_STATS(ext_stats, + GETNAME_TOA_MISMATCH_CNT); + } + } else { + TOA_INC_STATS(ext_stats, GETNAME_TOA_BYPASS_CNT); + } + } else { /* no need to get client ip */ + TOA_INC_STATS(ext_stats, GETNAME_TOA_EMPTY_CNT); + } + + return retval; +} + +static inline int +get_kernel_ipv6_symbol(void) +{ + inet6_stream_ops_p = + (struct proto_ops *)kallsyms_lookup_name("inet6_stream_ops"); + if (inet6_stream_ops_p == NULL) { + TOA_INFO("CPU [%u] kallsyms_lookup_name cannot find symbol inet6_stream_ops\n", + smp_processor_id()); + + return -1; + } + ipv6_specific_p = + (struct inet_connection_sock_af_ops *)kallsyms_lookup_name("ipv6_specific"); + if (ipv6_specific_p == NULL) { + TOA_INFO("CPU [%u] kallsyms_lookup_name cannot find symbol ipv6_specific\n", + smp_processor_id()); + return -1; + } + 
tcp_v6_syn_recv_sock_org_pt = + (syn_recv_sock_func_pt)kallsyms_lookup_name("tcp_v6_syn_recv_sock"); + if (tcp_v6_syn_recv_sock_org_pt == NULL) { + TOA_INFO("CPU [%u] kallsyms_lookup_name cannot find symbol tcp_v6_syn_recv_sock\n", + smp_processor_id()); + return -1; + } + return 0; +} +#endif + +/* The three way handshake has completed - we got a valid synack - + * now create the new socket. + * We need to save toa data into the new socket. + * @param sk [out] the socket + * @param skb [in] the ack/ack-get packet + * @param req [in] the open request for this connection + * @param dst [out] route cache entry + * @return NULL if fail new socket if succeed. + */ +static struct sock * +tcp_v4_syn_recv_sock_toa(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, struct dst_entry *dst) +{ + struct sock *newsock = NULL; + + TOA_DBG("tcp_v4_syn_recv_sock_toa called\n"); + + /* call orginal one */ + newsock = tcp_v4_syn_recv_sock(sk, skb, req, dst); + + /* set our value if need */ + if (NULL != newsock && NULL == newsock->sk_user_data) { + newsock->sk_user_data = get_toa_data(AF_INET, skb); + if (NULL != newsock->sk_user_data) + TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_TOA_CNT); + else + TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_NO_TOA_CNT); + + TOA_DBG("tcp_v4_syn_recv_sock_toa: set " + "sk->sk_user_data to %p\n", + newsock->sk_user_data); + } + return newsock; +} + +#ifdef TOA_IPV6_ENABLE +static void +tcp_v6_sk_destruct_toa(struct sock *sk) { + if (sk->sk_user_data) { + kfree(sk->sk_user_data); + sk->sk_user_data = NULL; + TOA_INC_STATS(ext_stats, IP6_ADDR_FREE_CNT); + } + inet_sock_destruct(sk); +} + +static struct sock * +tcp_v6_syn_recv_sock_toa(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, struct dst_entry *dst) +{ + struct sock *newsock = NULL; + + TOA_DBG("tcp_v6_syn_recv_sock_toa called\n"); + + /* call orginal one */ + newsock = tcp_v6_syn_recv_sock_org_pt(sk, skb, req, dst); + + /* set our value if need */ + if (NULL != newsock && 
NULL == newsock->sk_user_data) { + newsock->sk_user_data = get_toa_data(AF_INET6, skb); + if (NULL != newsock->sk_user_data) { + newsock->sk_destruct = tcp_v6_sk_destruct_toa; + TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_TOA_CNT); + } else { + TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_NO_TOA_CNT); + } + + TOA_DBG("tcp_v6_syn_recv_sock_toa: set " + "sk->sk_user_data to %p\n", + newsock->sk_user_data); + } + return newsock; +} + + +#endif + +/* + * HOOK FUNCS + */ + +/* replace the functions with our functions */ +static inline int +hook_toa_functions(void) +{ + /* hook inet_getname for ipv4 */ + struct proto_ops *inet_stream_ops_p = + (struct proto_ops *)&inet_stream_ops; + /* hook tcp_v4_syn_recv_sock for ipv4 */ + struct inet_connection_sock_af_ops *ipv4_specific_p = + (struct inet_connection_sock_af_ops *)&ipv4_specific; + + inet_stream_ops_p->getname = inet_getname_toa; + TOA_INFO("CPU [%u] hooked inet_getname <%p> --> <%p>\n", + smp_processor_id(), inet_getname, inet_stream_ops_p->getname); + + ipv4_specific_p->syn_recv_sock = tcp_v4_syn_recv_sock_toa; + TOA_INFO("CPU [%u] hooked tcp_v4_syn_recv_sock <%p> --> <%p>\n", + smp_processor_id(), tcp_v4_syn_recv_sock, + ipv4_specific_p->syn_recv_sock); + +#ifdef TOA_IPV6_ENABLE + inet6_stream_ops_p->getname = inet6_getname_toa; + TOA_INFO("CPU [%u] hooked inet6_getname <%p> --> <%p>\n", + smp_processor_id(), inet6_getname, inet6_stream_ops_p->getname); + + ipv6_specific_p->syn_recv_sock = tcp_v6_syn_recv_sock_toa; + TOA_INFO("CPU [%u] hooked tcp_v6_syn_recv_sock <%p> --> <%p>\n", + smp_processor_id(), tcp_v6_syn_recv_sock_org_pt, + ipv6_specific_p->syn_recv_sock); +#endif + + return 0; +} + +/* replace the functions to original ones */ +static int +unhook_toa_functions(void) +{ + /* unhook inet_getname for ipv4 */ + struct proto_ops *inet_stream_ops_p = + (struct proto_ops *)&inet_stream_ops; + /* unhook tcp_v4_syn_recv_sock for ipv4 */ + struct inet_connection_sock_af_ops *ipv4_specific_p = + (struct 
inet_connection_sock_af_ops *)&ipv4_specific; + + inet_stream_ops_p->getname = inet_getname; + TOA_INFO("CPU [%u] unhooked inet_getname\n", + smp_processor_id()); + + ipv4_specific_p->syn_recv_sock = tcp_v4_syn_recv_sock; + TOA_INFO("CPU [%u] unhooked tcp_v4_syn_recv_sock\n", + smp_processor_id()); + +#ifdef TOA_IPV6_ENABLE + if (inet6_stream_ops_p) { + inet6_stream_ops_p->getname = inet6_getname; + TOA_INFO("CPU [%u] unhooked inet6_getname\n", + smp_processor_id()); + } + if (ipv6_specific_p) { + ipv6_specific_p->syn_recv_sock = tcp_v6_syn_recv_sock_org_pt; + TOA_INFO("CPU [%u] unhooked tcp_v6_syn_recv_sock\n", + smp_processor_id()); + } +#endif + + return 0; +} + +/* + * Statistics of toa in proc /proc/net/toa_stats + */ +static int toa_stats_show(struct seq_file *seq, void *v) +{ + int i, j, cpu_nr; + + /* print CPU first */ + seq_printf(seq, " "); + cpu_nr = num_possible_cpus(); + for (i = 0; i < cpu_nr; i++) + if (cpu_online(i)) + seq_printf(seq, "CPU%d ", i); + seq_putc(seq, '\n'); + + i = 0; + while (NULL != toa_stats[i].name) { + seq_printf(seq, "%-25s:", toa_stats[i].name); + for (j = 0; j < cpu_nr; j++) { + if (cpu_online(j)) { + seq_printf(seq, "%10lu ", *( + ((unsigned long *) per_cpu_ptr( + ext_stats, j)) + toa_stats[i].entry + )); + } + } + seq_putc(seq, '\n'); + i++; + } + return 0; +} + +static int toa_stats_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, toa_stats_show, NULL); +} + +static const struct file_operations toa_stats_fops = { + .owner = THIS_MODULE, + .open = toa_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * TOA module init and destory + */ +#if LINUX_VERSION_CODE >=KERNEL_VERSION(3,9,0) +static struct proc_dir_entry *proc_net_fops_create(struct net *net, + const char *name, mode_t mode, const struct file_operations *fops) +{ + return proc_create(name, mode, net->proc_net, fops); +} + +static void proc_net_remove(struct net *net, const char *name) +{ + 
remove_proc_entry(name, net->proc_net);
+}
+#endif
+
+
+/*
+ * module init
+ *
+ * Allocates the per-CPU statistics, creates the "toa_stats" proc entry,
+ * resolves the kernel symbols the hooks depend on and finally installs
+ * the hooks.
+ *
+ * Returns 0 on success or a negative errno on failure. A positive value
+ * must not be returned: the module loader only treats negative values as
+ * an init failure, so the module would stay loaded with its resources
+ * already released.
+ */
+static int __init
+toa_init(void)
+{
+
+    TOA_INFO("TOA " TOA_VERSION " by pukong.wjm\n");
+
+    /* alloc statistics array for toa */
+    ext_stats = alloc_percpu(struct toa_stat_mib);
+    if (NULL == ext_stats)
+        return -ENOMEM;
+    proc_net_fops_create(&init_net, "toa_stats", 0, &toa_stats_fops);
+
+    /* get the address of function sock_def_readable
+     * so later we can know whether the sock is for rpc, tux or others
+     */
+    sk_data_ready_addr = kallsyms_lookup_name("sock_def_readable");
+    TOA_INFO("CPU [%u] sk_data_ready_addr = "
+        "kallsyms_lookup_name(sock_def_readable) = %lu\n",
+        smp_processor_id(), sk_data_ready_addr);
+    if (0 == sk_data_ready_addr) {
+        TOA_INFO("cannot find sock_def_readable.\n");
+        goto err;
+    }
+
+#ifdef TOA_IPV6_ENABLE
+    if (0 != get_kernel_ipv6_symbol()) {
+        TOA_INFO("get ipv6 struct from kernel fail.\n");
+        goto err;
+    }
+#endif
+
+    /* hook funcs for parse and get toa */
+    hook_toa_functions();
+
+    TOA_INFO("toa loaded\n");
+    return 0;
+
+err:
+    /* undo the proc entry and the per-CPU allocation made above */
+    proc_net_remove(&init_net, "toa_stats");
+    if (NULL != ext_stats) {
+        free_percpu(ext_stats);
+        ext_stats = NULL;
+    }
+
+    /* both failure paths are "required kernel symbol not found" */
+    return -ENOENT;
+}
+
+/* module cleanup*/
+static void __exit
+toa_exit(void)
+{
+    unhook_toa_functions();
+    synchronize_net();
+
+    proc_net_remove(&init_net, "toa_stats");
+    if (NULL != ext_stats) {
+        free_percpu(ext_stats);
+        ext_stats = NULL;
+    }
+    TOA_INFO("toa unloaded\n");
+}
+
+module_init(toa_init);
+module_exit(toa_exit);
+MODULE_LICENSE("GPL");
diff --git a/kmod/toa/toa.h b/kmod/toa/toa.h
new file mode 100644
index 000000000..ebda933e0
--- /dev/null
+++ b/kmod/toa/toa.h
@@ -0,0 +1,105 @@
+#ifndef __NET__TOA_H__
+#define __NET__TOA_H__
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define TOA_VERSION "1.0.0.0"
+
+//#define TOA_DEBUG_ENABLE
+//#define TOA_IPV6_ENABLE
+
+#ifdef
TOA_DEBUG_ENABLE +#define TOA_DBG(msg...) \ + do { \ + printk(KERN_DEBUG "[DEBUG] TOA: " msg); \ + } while (0) +#else +#define TOA_DBG(msg...) +#endif + +#define TOA_INFO(msg...) \ + do { \ + if (net_ratelimit()) \ + printk(KERN_INFO "TOA: " msg); \ + } while (0) + + +#define TCPOPT_TOA 254 + +/* MUST be 4n !!!! */ +#define TCPOLEN_IP4_TOA 8 /* |opcode|size|ip+port| = 1 + 1 + 6 */ +#define TCPOLEN_IP6_TOA 20 /* |opcode|size|ip_of_v6+port| = 1 + 1 + 18 */ + +/* MUST be 4 bytes alignment */ +struct toa_ip4_data { + __u8 opcode; + __u8 opsize; + __u16 port; + __u32 ip; +}; + +struct toa_ip6_data{ + __u8 opcode; + __u8 opsize; + __u16 port; + struct in6_addr in6_addr; +}; + + +/* statistics about toa in proc /proc/net/toa_stat */ +enum { + SYN_RECV_SOCK_TOA_CNT = 1, + SYN_RECV_SOCK_NO_TOA_CNT, + GETNAME_TOA_OK_CNT, + GETNAME_TOA_MISMATCH_CNT, + GETNAME_TOA_BYPASS_CNT, + GETNAME_TOA_EMPTY_CNT, +#ifdef TOA_IPV6_ENABLE + IP6_ADDR_ALLOC_CNT, + IP6_ADDR_FREE_CNT, +#endif + TOA_STAT_LAST +}; + +struct toa_stats_entry { + char *name; + int entry; +}; + +#define TOA_STAT_ITEM(_name, _entry) { \ + .name = _name, \ + .entry = _entry, \ +} + +#define TOA_STAT_END { \ + NULL, \ + 0, \ +} + +struct toa_stat_mib { + unsigned long mibs[TOA_STAT_LAST]; +}; + +#define DEFINE_TOA_STAT(type, name) \ + __typeof__(type) *name +#define TOA_INC_STATS(mib, field) \ + (per_cpu_ptr(mib, smp_processor_id())->mibs[field]++) + +#endif diff --git a/uoa/Makefile b/kmod/uoa/Makefile similarity index 100% rename from uoa/Makefile rename to kmod/uoa/Makefile diff --git a/uoa/example/.gitignore b/kmod/uoa/example/.gitignore similarity index 100% rename from uoa/example/.gitignore rename to kmod/uoa/example/.gitignore diff --git a/uoa/example/make.sh b/kmod/uoa/example/make.sh similarity index 100% rename from uoa/example/make.sh rename to kmod/uoa/example/make.sh diff --git a/uoa/example/opp.c b/kmod/uoa/example/opp.c similarity index 99% rename from uoa/example/opp.c rename to kmod/uoa/example/opp.c 
index b7ad9b964..cb5795cd5 100644 --- a/uoa/example/opp.c +++ b/kmod/uoa/example/opp.c @@ -125,7 +125,7 @@ int main(int argc, char *argv[]) /* uoa option */ uoa = (void *)opph->options; uoa->op_code = IPOPT_UOA; - uoa->op_len = IPOLEN_UOA; + uoa->op_len = IPOLEN_UOA_IPV4; uoa->op_port = htons(atoi(argv[4])); if (inet_pton(AF_INET, argv[3], &uoa->op_addr) <= 0) { diff --git a/kmod/uoa/example/udp_serv.c b/kmod/uoa/example/udp_serv.c new file mode 100644 index 000000000..cfb0117ef --- /dev/null +++ b/kmod/uoa/example/udp_serv.c @@ -0,0 +1,175 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2018 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +/* + * Example UDP server to get real client IP/port by UOA. + * + * raychen@qiyi.com, Mar 2018, initial. 
+ * yuwenchao@qiyi.com, Sep 25, add ipv6 support
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "common.h" /* for __u8, __be16, __be32, __u64 only,
+                       just define them if not want common.h */
+#include "uoa.h"
+
+#define MAX_SUPP_AF 2
+#define MAX_EPOLL_EVENTS 2
+#define SA struct sockaddr
+#define SERV_PORT 6000
+
+/*
+ * Receive one datagram from @fd, print its sender and payload, and echo
+ * the payload back to the sender. For IPv4 peers the real client address
+ * is additionally looked up with getsockopt(UOA_SO_GET_LOOKUP).
+ *
+ * @efd: epoll descriptor of the caller (not used by this function).
+ * @fd:  readable UDP socket.
+ *
+ * Exits the process if recvfrom() fails.
+ */
+void handle_reply(int efd, int fd)
+{
+    struct sockaddr_storage peer;
+    struct sockaddr_in *sin = NULL;
+    struct sockaddr_in6 *sin6 = NULL;
+    char buff[4096], from[64];
+    struct uoa_param_map map;
+    socklen_t len, mlen;
+    int n;
+
+    len = sizeof(peer);
+    /* keep one byte free: the payload is NUL-terminated below, so a
+     * full-size datagram must not fill the whole buffer. */
+    n = recvfrom(fd, buff, sizeof(buff) - 1, 0, (SA *)&peer, &len);
+    if (n < 0) {
+        perror("recvfrom failed\n");
+        exit(1);
+    }
+    buff[n]='\0';
+
+    if (((SA *)&peer)->sa_family == AF_INET) {
+        sin = (struct sockaddr_in *)&peer;
+        inet_ntop(AF_INET, &sin->sin_addr.s_addr, from, sizeof(from));
+        printf("Receive %d bytes from %s:%d -- %s\n",
+               n, from, ntohs(sin->sin_port), buff);
+        /*
+         * get real client address:
+         *
+         * note: src/dst is for original pkt, so peer is
+         * "orginal" source, instead of local. wildcard
+         * lookup for daddr (or local IP) is supported.
+ * */ + memset(&map, 0, sizeof(map)); + map.saddr = sin->sin_addr.s_addr; + map.sport = sin->sin_port; + map.daddr = htonl(INADDR_ANY); + map.dport = htons(SERV_PORT); + mlen = sizeof(map); + if (getsockopt(fd, IPPROTO_IP, UOA_SO_GET_LOOKUP, &map, &mlen) == 0) { + inet_ntop(AF_INET, &map.real_saddr, from, sizeof(from)); + printf(" real client %s:%d\n", from, ntohs(map.real_sport)); + } + + len = sizeof(peer); + sendto(fd, buff, n, 0, (SA *)&peer, len); + } else { /* AF_INET6 */ + sin6 = (struct sockaddr_in6 *)&peer; + inet_ntop(AF_INET6, &sin6->sin6_addr, from, sizeof(from)); + printf("Receive %d bytes from %s:%d -- %s\n", + n, from, ntohs(sin6->sin6_port), buff); + + /* Todo: IPv6 uoa support */ + + sendto(fd, buff, n, 0, (SA *)&peer, len); + } +} + +int main(int argc, char *argv[]) +{ + int i, sockfd[MAX_SUPP_AF]; + int epfd, nfds; + int enable = 1; + struct epoll_event events[MAX_EPOLL_EVENTS]; + struct sockaddr_in local; + struct sockaddr_in6 local6; + + if ((sockfd[0] = socket(AF_INET, SOCK_DGRAM, 0)) < 0) { + perror("Fail to create INET socket!\n"); + exit(1); + } + + if ((sockfd[1] = socket(AF_INET6, SOCK_DGRAM, 0)) < 0) { + perror("Fail to create INET6 socket!"); + exit(1); + } + + if ((epfd = epoll_create1(0)) < 0) { + perror("Fail to create epoll fd!\n"); + exit(1); + } + + for (i = 0; i < MAX_SUPP_AF; i++) { + setsockopt(sockfd[i], SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable)); + setsockopt(sockfd[i], SOL_SOCKET, SO_REUSEPORT, &enable, sizeof(enable)); + } + + memset(&local, 0, sizeof(struct sockaddr_in)); + local.sin_family = AF_INET; + local.sin_port = htons(SERV_PORT); + local.sin_addr.s_addr = htonl(INADDR_ANY); + + if (bind(sockfd[0], (struct sockaddr *)&local, sizeof(local)) != 0) { + perror("Fail to bind INET socket!\n"); + exit(1); + } + + memset(&local6, 0, sizeof(struct sockaddr_in6)); + local6.sin6_family = AF_INET6; + local6.sin6_port = htons(SERV_PORT); + local6.sin6_addr = in6addr_any; + + if (bind(sockfd[1], (struct sockaddr 
*)&local6, sizeof(local6)) != 0) { + perror("Fail to bind INET6 socket!\n"); + exit(1); + } + + for (i = 0; i < MAX_SUPP_AF; i++) { + struct epoll_event ev; + memset(&ev, 0, sizeof(ev)); + ev.events = EPOLLIN | EPOLLERR; + ev.data.fd = sockfd[i]; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd[i], &ev) != 0) { + fprintf(stderr, "epoll_ctl add failed for sockfd[%d]\n", i); + exit(1); + } + } + + while (1) { + nfds = epoll_wait(epfd, events, 2, -1); + if (nfds == -1) { + perror("epoll_wait failed\n"); + exit(1); + } + + for (i = 0; i < nfds; i++) { + handle_reply(epfd, events[i].data.fd); + } + } + + for (i = 0; i < MAX_SUPP_AF; i++) + close(sockfd[i]); + + exit(0); +} diff --git a/kmod/uoa/example/uperf.c b/kmod/uoa/example/uperf.c new file mode 100644 index 000000000..3d165de6a --- /dev/null +++ b/kmod/uoa/example/uperf.c @@ -0,0 +1,567 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2018 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +/* + * UDP client for performance (high concurrency) test. + * + * raychen@qiyi.com, Mar 2018, initial. 
+ */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SA struct sockaddr + +#define DEF_SERV_PORT 6000 +#define DEF_MAX_CONN 2000 /* per worker */ +#define DEF_DURATION 10 /* seconds */ +#define DEF_PKT_SIZE 1000 /* bytes */ +#define DEF_DUMP_INTV 1 /* seconds */ + +struct config { + int max_conn; /* max conn per worker */ + int duration; /* test duration in seconds */ + int pkt_size; /* packet size in bytes */ + int interval; /* dump interval seconds */ + int af; + struct sockaddr_storage servaddr; /* server address */ +}; + +struct stats { + uint64_t tot_conns; + uint64_t conns; + uint64_t pkts_sent; + uint64_t pkts_recv; + uint64_t bytes_sent; + uint64_t bytes_recv; + uint64_t errors; +}; + +struct worker { + int cpu; + pid_t pid; + struct config conf; + struct stats stats; + char *sndbuf; + char *rcvbuf; +}; + +static cpu_set_t cpuset; /* cpu for workers */ +static sig_atomic_t quit_test = 0; /* for master */ +static sig_atomic_t quit_client = 0; /* for worker */ + +static struct worker workers[CPU_SETSIZE] = {}; + +static void usage(const char *prog) +{ + fprintf(stderr, "Usage:\n"); + fprintf(stderr, " %s [OPTIONS] host[:port]\n", prog); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -c CPUMASK workers' CPU mask in hex format.\n"); + fprintf(stderr, " -m MAXCONN connection per worker (CPU).\n"); + fprintf(stderr, " -t DRUATION test duration in second.\n"); + fprintf(stderr, " -s SIZE packet size (payload) in byte.\n"); + fprintf(stderr, " -i INTERVAL print interval in second.\n"); + fprintf(stderr, " -h show this help info.\n"); + fprintf(stderr, "Examples:\n"); + fprintf(stderr, " %s 127.0.0.1\n", prog); + fprintf(stderr, " %s [2001::1]\n", prog); + fprintf(stderr, " %s -c 1f 1.1.1.1:1234\n", prog); + fprintf(stderr, " %s -c 3 [2002::12:1]:5320\n", prog); + 
fprintf(stderr, " %s -c f -m 1000 -t 10 -s 10 2.2.2.2:5000\n", prog); +} + +static void sig_quit(int signo) +{ + quit_test = 1; +} + +static void sig_quit_client(int signo) +{ + quit_client = 1; +} + +static int hexstr_to_cpuset(const char *hex, cpu_set_t *set) +{ + const char *c; + unsigned long long mask; + int cpu; + + if (!hex || !set) + return -1; + + for (c = hex; *c != '\0'; c++) { + if (!isxdigit(*c)) + return -1; + } + + CPU_ZERO(set); + mask = strtoull(hex, NULL, 16); + + for (cpu = 0; cpu < sizeof(mask) * 8; cpu++) { + if (mask & (0x1LL<servaddr; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&conf->servaddr; + + prog = strrchr(argv[0], '/'); + if (prog) + *prog++ = '\0'; + else + prog = argv[0]; + + CPU_ZERO(&cpuset); + memset(conf, 0, sizeof(*conf)); + conf->max_conn = DEF_MAX_CONN; + conf->duration = DEF_DURATION; + conf->pkt_size = DEF_PKT_SIZE; + conf->interval = DEF_DUMP_INTV; + + if (argc <= 1) { + usage(prog); + exit(0); + } + + while ((opt = getopt_long(argc, argv, "hc:m:t:s:i:", + opts, NULL)) != -1) { + switch (opt) { + case 'h': + usage(prog); + exit(0); + case 'c': + if (hexstr_to_cpuset(optarg, &cpuset) != 0) { + fprintf(stderr, "Bad CPU mask: %s\n", optarg); + exit(1); + } + break; + case 'm': + conf->max_conn = atoi(optarg); + break; + case 't': + conf->duration = atoi(optarg); + if (conf->duration <= 0) { + fprintf(stderr, "Invalid duration.\n"); + exit(1); + } + break; + case 's': + conf->pkt_size = atoi(optarg); + break; + case 'i': + conf->interval = atoi(optarg); + if (conf->interval <= 0) { + fprintf(stderr, "Invalid interval.\n"); + exit(1); + } + break; + case '?': + default: + fprintf(stderr, "Invalid option: %s\n", argv[optind]); + return -1; + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + fprintf(stderr, "Missing server IP address.\n"); + exit(1); + } + + host = argv[0]; + port = argv[0]; + + if (index(host, '[') && index(host, ']')) { + host = strchr(host, '['); + port = strchr(host, ']'); + host++; + 
*port++ = '\0'; + } + + port = strrchr(port, ':'); + if (port) + *port++ = '\0'; + + if (port) { + if (atoi(port) <= 0 || atoi(port) >= 65535) { + fprintf(stderr, "Invalid port: %s\n", port); + exit(1); + } + } + + if (inet_pton(AF_INET6, host, &sin6->sin6_addr) == 1) { + sin6->sin6_family = conf->af = AF_INET6; + if (port) + sin6->sin6_port = htons(atoi(port)); + else + sin6->sin6_port = htons(DEF_SERV_PORT); + } else if (inet_pton(AF_INET, host, &sin->sin_addr.s_addr) == 1) { + sin->sin_family = conf->af = AF_INET; + if (port) + sin->sin_port = htons(atoi(port)); + else + sin->sin_port = htons(DEF_SERV_PORT); + } else { + fprintf(stderr, "Invalid host IP: %s\n", host); + exit(1); + } + + return 0; +} + +static inline void dump_stats(int cpu, const struct stats *st) +{ + printf("[% 2d] %5"PRIu64" %8"PRIu64" %8"PRIu64" %12"PRIu64" %12"PRIu64" %8"PRIu64" %8"PRIu64"\n", + cpu, st->conns, st->pkts_recv, st->pkts_sent, + st->bytes_recv, st->bytes_sent, st->errors, st->tot_conns); +} + +static int udp_new_conn(int epfd, struct worker *wk) +{ + int sockfd; + struct epoll_event ev; + socklen_t salen; + + if (wk->conf.af == AF_INET6) + salen = sizeof(struct sockaddr_in6); + else + salen = sizeof(struct sockaddr_in); + + sockfd = socket(wk->conf.af, SOCK_DGRAM, 0); + if (sockfd < 0) { + perror("socket"); + return -1; + } + + /* use connect to receive ICMP port unreachable. 
*/
+    if (connect(sockfd, (SA *)&wk->conf.servaddr, salen) != 0) {
+        perror("connect");
+        close(sockfd);
+        return -1;
+    }
+
+    if (send(sockfd, wk->sndbuf, wk->conf.pkt_size, 0) != wk->conf.pkt_size) {
+        perror("send");
+        close(sockfd);
+        return -1;
+    }
+
+    wk->stats.pkts_sent++;
+    wk->stats.bytes_sent += wk->conf.pkt_size;
+
+    fcntl(sockfd, F_SETFL, fcntl(sockfd, F_GETFL, 0) | O_NONBLOCK);
+
+    memset(&ev, 0, sizeof(ev));
+    ev.events = EPOLLIN | EPOLLERR;
+    ev.data.fd = sockfd;
+    if (epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev) != 0) {
+        perror("epoll_ctl");
+        close(sockfd);
+        return -1;
+    }
+
+    wk->stats.conns++;
+    wk->stats.tot_conns++;
+
+    return 0;
+}
+
+/*
+ * Handle an epoll event on one "connection" socket.
+ *
+ * Reads the reply with a non-blocking recv(). A transient failure (EINTR
+ * or EAGAIN) leaves the socket registered so the reply can be picked up
+ * on a later event. Any other outcome finishes the connection: the socket
+ * is removed from @epfd and closed, the live connection count is
+ * decremented, and either the receive counters (success) or the error
+ * counter (failure) are updated.
+ */
+static void udp_handle_reply(int epfd, int fd, struct worker *wk)
+{
+    int n;
+
+    n = recv(fd, wk->rcvbuf, wk->conf.pkt_size, 0);
+
+    if (n < 0) {
+        /* we're nonblock recv: either errno means "try again later".
+         * The two values can never match at once, so test with OR. */
+        if (errno == EINTR || errno == EAGAIN)
+            return;
+
+        wk->stats.errors++;
+    }
+
+    epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL);
+    close(fd);
+
+    wk->stats.conns--;
+    if (n >= 0) {
+        wk->stats.pkts_recv++;
+        wk->stats.bytes_recv += n;
+    }
+}
+
+static void udp_client(struct worker *wk)
+{
+    int epfd, nfds, timeo, i;
+    struct epoll_event *events;
+    struct config *conf = &wk->conf;
+    struct stats *stats = &wk->stats;
+    struct timespec ts_start, ts_now, ts_elapse, ts_dump;
+
+    events = malloc(conf->max_conn * sizeof(struct epoll_event));
+    if (!events) {
+        fprintf(stderr, "%s: no memory\n", __func__);
+        exit(1);
+    }
+
+    wk->sndbuf = malloc(conf->pkt_size);
+    wk->rcvbuf = malloc(conf->pkt_size);
+    if (!wk->sndbuf || !wk->rcvbuf) {
+        fprintf(stderr, "%s: no memory\n", __func__);
+        exit(1);
+    }
+
+    /* generate random alpha string for UDP payload. */
+    for (i = 0; i < conf->pkt_size; i++)
+        wk->sndbuf[i] = 'A' + (random() % 26);
+
+    /*
+     * each socket send one packet and receive a reply,
+     * try to create "connections" until max_conn reached.
+     *
+     * use epoll to avoid block on recv reply.
+ */ + epfd = epoll_create1(0); + if (epfd < 0) { + perror("epoll_create1"); + exit(1); + } + + signal(SIGQUIT, sig_quit_client); + + memset(stats, 0, sizeof(*stats)); + clock_gettime(CLOCK_MONOTONIC_COARSE, &ts_start); + clock_gettime(CLOCK_MONOTONIC_COARSE, &ts_dump); + + /* 0123 01234 01234567 01234567 012345678901 012345678901 01234567 01234567 */ + printf("CPU%d conns ipackets opackets ibytes obytes errors tot-conn\n", wk->cpu); + + /* main loop */ + while (1) { + if (quit_test || quit_client) + break; + + /* try create conn as much as possible */ + while (stats->conns < conf->max_conn) + udp_new_conn(epfd, wk); + + clock_gettime(CLOCK_MONOTONIC_COARSE, &ts_now); + timespec_sub(&ts_now, &ts_start, &ts_elapse); + + /* stop test if duration reached. */ + if (ts_elapse.tv_sec >= conf->duration) + break; + + /* decide wait timeout for MIN(interval, duration_remain). + * calculate in ms */ + timeo = (conf->duration - ts_elapse.tv_sec) * 1000 \ + - ts_elapse.tv_nsec / 1000000; + timeo = (timeo <= conf->interval * 1000) ? 
timeo : + conf->interval * 1000; + + /* dump stats with interval */ + timespec_sub(&ts_now, &ts_dump, &ts_elapse); + if (ts_elapse.tv_sec >= conf->interval) { + dump_stats(wk->cpu, stats); + ts_dump = ts_now; + } + + nfds = epoll_wait(epfd, events, conf->max_conn, timeo); + if (nfds == -1) { + perror("epoll_wait"); + exit(1); + } + + for (i = 0; i < nfds; i++) { + udp_handle_reply(epfd, events[i].data.fd, wk); + } + } + + clock_gettime(CLOCK_MONOTONIC_COARSE, &ts_now); + timespec_sub(&ts_now, &ts_start, &ts_elapse); + + dump_stats(wk->cpu, stats); + + printf("[%2d] --------\n", wk->cpu); + printf("[%2d] Summary: total connection %"PRIu64", errors %"PRIu64" duration %lu.%03lu\n", + wk->cpu, stats->tot_conns, stats->errors, ts_elapse.tv_sec, ts_elapse.tv_nsec / 1000000); + printf("[%2d] RX %lu pps %lu B/s, TX %lu pps %lu B/s\n", wk->cpu, + stats->pkts_recv * 1000 / (ts_elapse.tv_sec * 1000 + ts_elapse.tv_nsec / 1000000), + stats->bytes_recv * 1000 / (ts_elapse.tv_sec * 1000 + ts_elapse.tv_nsec / 1000000), + stats->pkts_sent * 1000 / (ts_elapse.tv_sec * 1000 + ts_elapse.tv_nsec / 1000000), + stats->bytes_sent * 1000 / (ts_elapse.tv_sec * 1000 + ts_elapse.tv_nsec / 1000000)); + + /* exiting, nothing need to release. 
*/
+    return;
+}
+
+/*
+ * Fork one worker process bound to @cpu.
+ *
+ * The parent records the child's pid in workers[cpu] and returns 0.
+ * The child pins itself to @cpu, runs udp_client() and then exits; it
+ * never returns to the caller.
+ *
+ * Returns 0 in the parent on success, -1 if fork() fails.
+ */
+static int new_worker(const int cpu, const struct config *conf)
+{
+    pid_t pid;
+
+    workers[cpu].cpu = cpu;
+    workers[cpu].conf = *conf;
+
+    pid = fork();
+
+    if (pid > 0) { /* master */
+        workers[cpu].pid = pid;
+    } else if (pid == 0) { /* worker */
+        cpu_set_t set;
+
+        CPU_ZERO(&set);
+        CPU_SET(cpu, &set);
+        /* second argument is the size of the mask in BYTES, not the
+         * number of CPUs it can describe (CPU_SETSIZE). */
+        if (sched_setaffinity(getpid(), sizeof(set), &set) != 0)
+            perror("sched_setaffinity");
+
+        udp_client(&workers[cpu]);
+
+        /* udp_client() returned normally: the test finished. */
+        exit(0); /* never return */
+    } else {
+        fprintf(stderr, "%s: fail to fork worker\n", __func__);
+        return -1;
+    }
+
+    return 0;
+}
+
+int main(int argc, char *argv[])
+{
+    int cpu;
+    int num_workers = 0;
+    struct config conf;
+    struct rlimit limit;
+
+    if (parse_args(argc, argv, &conf) != 0)
+        exit(1);
+
+    /* example only, pls use sigaction */
+    signal(SIGINT, sig_quit);
+
+    /* extend open-file limit as needed. */
+    if (getrlimit(RLIMIT_OFILE, &limit) == 0) {
+        limit.rlim_cur = limit.rlim_max;
+        if (setrlimit(RLIMIT_OFILE, &limit) != 0)
+            perror("setrlimit(OFILE)");
+    }
+
+    /* standalone mode ? */
+    if (CPU_COUNT(&cpuset) == 0) {
+        struct worker *wk = &workers[0];
+
+        /* master itself is worker (client) */
+        memset(wk, 0, sizeof(*wk));
+        wk->cpu = 0;
+        wk->pid = getpid();
+        wk->conf = conf;
+
+        udp_client(wk);
+        exit(0);
+    }
+
+    /*
+     * master/worker mode.
+     * let worker to performe test.
+     */
+    for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+        if (!CPU_ISSET(cpu, &cpuset))
+            continue;
+
+        if (new_worker(cpu, &conf) == 0)
+            num_workers++;
+    }
+
+    /* abort test if no worker created !
*/ + if (!num_workers) + exit(1); + + /* wait all workers exit or user stop the test */ + while (num_workers) { + while (waitpid(-1, NULL, WNOHANG) > 0) + num_workers--; + + /* kill all workers if user stop test */ + if (quit_test) { + for (cpu = 0; cpu < CPU_SETSIZE; cpu++) { + if (workers[cpu].pid == 0) + continue; + + kill(workers[cpu].pid, SIGQUIT); + } + + quit_test = 0; + } + + sleep(1); + } + + printf("Test stopped!\n"); + exit(0); +} diff --git a/uoa/uoa.c b/kmod/uoa/uoa.c similarity index 96% rename from uoa/uoa.c rename to kmod/uoa/uoa.c index df10f025d..39a039c6d 100644 --- a/uoa/uoa.c +++ b/kmod/uoa/uoa.c @@ -306,7 +306,7 @@ static inline void uoa_map_hash(struct uoa_map *um) um->sport == cur->sport && um->dport == cur->dport) { /* update */ - memcpy(&cur->optuoa, &um->optuoa, IPOLEN_UOA); + memcpy(&cur->optuoa, &um->optuoa, IPOLEN_UOA_IPV4); mod_timer(&cur->timer, jiffies + uoa_map_timeout * HZ); @@ -504,8 +504,8 @@ static int uoa_so_get(struct sock *sk, int cmd, void __user *user, int *len) uoa_map_dump(um, "hit:"); if (likely(um->optuoa.op_code == IPOPT_UOA && - um->optuoa.op_len == IPOLEN_UOA)) { - map.real_saddr = um->optuoa.op_addr; + um->optuoa.op_len == IPOLEN_UOA_IPV4)) { + memcpy(&map.real_saddr, um->optuoa.op_addr, sizeof(map.real_saddr)); map.real_sport = um->optuoa.op_port; UOA_STATS_INC(success); err = 0; @@ -624,7 +624,7 @@ static struct uoa_map *uoa_parse_ipopt(unsigned char *optptr, int optlen, if (unlikely(optlen < 2 || optlen > l)) goto out; /* invalid */ - if (*optptr == IPOPT_UOA && optlen == IPOLEN_UOA) { + if (*optptr == IPOPT_UOA && optlen == IPOLEN_UOA_IPV4) { UOA_STATS_INC(uoa_got); um = kmem_cache_alloc(uoa_map_cache, GFP_ATOMIC); @@ -639,7 +639,7 @@ static struct uoa_map *uoa_parse_ipopt(unsigned char *optptr, int optlen, um->sport = sport; um->dport = dport; - memcpy(&um->optuoa, optptr, IPOLEN_UOA); + memcpy(&um->optuoa, optptr, IPOLEN_UOA_IPV4); UOA_STATS_INC(uoa_saved); return um; @@ -727,6 +727,18 @@ static struct 
uoa_map *uoa_opp_rcv(struct iphdr *iph, struct sk_buff *skb) */ skb_set_transport_header(skb, ip_hdrlen(skb) + opplen); + /* Old kernel like 2.6.32 use "iph->ihl" rather "skb->transport_header" + * to get UDP header offset. The UOA private protocol data should be + * erased here, but this should move skb data and harm perfomance. As a + * compromise, we convert the private protocol data into NOP IP option + * data if possible.*/ + if (iph->ihl + (opplen >> 2) < 16) { + iph->ihl = (iph->ihl) + (opplen >> 2); + memset(opph, opplen, IPOPT_NOOP); + } else { + pr_warn("IP header has no room to convert uoa data into option.\n"); + } + /* need change it to parse transport layer */ iph->protocol = opph->protocol; ip_send_check(iph); diff --git a/uoa/uoa.md b/kmod/uoa/uoa.md similarity index 100% rename from uoa/uoa.md rename to kmod/uoa/uoa.md diff --git a/src/cfgfile.c b/src/cfgfile.c index ccd80d397..6dea928cd 100644 --- a/src/cfgfile.c +++ b/src/cfgfile.c @@ -26,6 +26,7 @@ #include "neigh.h" #include "ipv4.h" #include "ipv4_frag.h" +#include "ipv6.h" #include "ctrl.h" #include "sa_pool.h" #include "ipvs/conn.h" @@ -51,6 +52,8 @@ static void keyword_value_init(void) udp_keyword_value_init(); tcp_keyword_value_init(); synproxy_keyword_value_init(); + + ipv6_keyword_value_init(); } static vector_t install_keywords(void) @@ -84,6 +87,8 @@ static vector_t install_keywords(void) install_proto_udp_keywords(); install_sublevel_end(); + install_ipv6_keywords(); + return g_keywords; } diff --git a/src/dpdk.mk b/src/dpdk.mk index a7f078c1b..77e6d4585 100644 --- a/src/dpdk.mk +++ b/src/dpdk.mk @@ -44,5 +44,5 @@ LIBS += -Wl,--no-as-needed -fvisibility=default \ -Wl,--whole-archive -lrte_hash -lrte_kvargs -Wl,-lrte_mbuf -lrte_eal \ -Wl,-lrte_mempool -lrte_ring -lrte_cmdline -lrte_cfgfile -lrte_kni \ -lrte_mempool_ring -lrte_timer -lrte_net -Wl,-lrte_pmd_virtio \ - -lrte_pci -lrte_bus_pci -lrte_bus_vdev \ + -lrte_pci -lrte_bus_pci -lrte_bus_vdev -lrte_lpm \ -Wl,--no-whole-archive -lrt 
-lm -ldl -lcrypto diff --git a/src/icmp.c b/src/icmp.c index 3a3c178e5..1de9bee36 100644 --- a/src/icmp.c +++ b/src/icmp.c @@ -72,11 +72,11 @@ static int icmp_echo(struct rte_mbuf *mbuf) ich->icmp_cksum = (csum == 0xffff) ? csum : ~csum; memset(&fl4, 0, sizeof(struct flow4)); - fl4.daddr.s_addr = iph->src_addr; - fl4.saddr.s_addr = iph->dst_addr; - fl4.oif = netif_port_get(mbuf->port); - fl4.proto = IPPROTO_ICMP; - fl4.tos = iph->type_of_service; + fl4.fl4_daddr.s_addr = iph->src_addr; + fl4.fl4_saddr.s_addr = iph->dst_addr; + fl4.fl4_oif = netif_port_get(mbuf->port); + fl4.fl4_proto = IPPROTO_ICMP; + fl4.fl4_tos = iph->type_of_service; return ipv4_xmit(mbuf, &fl4); @@ -220,17 +220,17 @@ void icmp_send(struct rte_mbuf *imbuf, int type, int code, uint32_t info) | IPTOS_PREC_INTERNETCONTROL) : iph->type_of_service; memset(&fl4, 0, sizeof(struct flow4)); - fl4.daddr.s_addr = iph->src_addr; - fl4.saddr = saddr; - fl4.oif = netif_port_get(imbuf->port); - fl4.proto = IPPROTO_ICMP; - fl4.tos = tos; - if (!fl4.oif) { + fl4.fl4_daddr.s_addr = iph->src_addr; + fl4.fl4_saddr = saddr; + fl4.fl4_oif = netif_port_get(imbuf->port); + fl4.fl4_proto = IPPROTO_ICMP; + fl4.fl4_tos = tos; + if (!fl4.fl4_oif) { RTE_LOG(DEBUG, ICMP, "%s: no output iface.\n", __func__); return; } - mbuf = rte_pktmbuf_alloc(fl4.oif->mbuf_pool); + mbuf = rte_pktmbuf_alloc(fl4.fl4_oif->mbuf_pool); if (!mbuf) { RTE_LOG(DEBUG, ICMP, "%s: no memory.\n", __func__); return; @@ -249,7 +249,7 @@ void icmp_send(struct rte_mbuf *imbuf, int type, int code, uint32_t info) icmph->un.gateway = info; /* not good */ /* copy as much as we can without exceeding 576 (min-MTU) */ - room = fl4.oif->mtu > 576 ? 576 : fl4.oif->mtu; + room = fl4.fl4_oif->mtu > 576 ? 
576 : fl4.fl4_oif->mtu; room -= sizeof(struct ipv4_hdr); room -= sizeof(struct icmphdr); diff --git a/src/inet.c b/src/inet.c index f929c63fc..afe56c626 100644 --- a/src/inet.c +++ b/src/inet.c @@ -17,17 +17,66 @@ */ #include #include +#include #include +#include "inet.h" #include "ipv4.h" +#include "ipv6.h" #include "route.h" +#include "route6.h" #include "neigh.h" -#include "inet.h" #include "icmp.h" +#include "icmp6.h" #include "inetaddr.h" #define INET #define RTE_LOGTYPE_INET RTE_LOGTYPE_USER1 +static struct list_head inet_hooks[INET_HOOK_NUMHOOKS]; +static rte_rwlock_t inet_hook_lock; + +static struct list_head inet6_hooks[INET_HOOK_NUMHOOKS]; +static rte_rwlock_t inet6_hook_lock; + +static inline struct list_head *af_inet_hooks(int af, size_t num) +{ + assert((af == AF_INET || af == AF_INET6) && num < INET_HOOK_NUMHOOKS); + + if (af == AF_INET) + return &inet_hooks[num]; + else + return &inet6_hooks[num]; +} + +static inline rte_rwlock_t *af_inet_hook_lock(int af) +{ + assert(af == AF_INET || af == AF_INET6); + + if (af == AF_INET) + return &inet_hook_lock; + else + return &inet6_hook_lock; +} + +static int inet_hook_init(void) +{ + int i; + + rte_rwlock_init(&inet_hook_lock); + rte_rwlock_write_lock(&inet_hook_lock); + for (i = 0; i < NELEMS(inet_hooks); i++) + INIT_LIST_HEAD(&inet_hooks[i]); + rte_rwlock_write_unlock(&inet_hook_lock); + + rte_rwlock_init(&inet6_hook_lock); + rte_rwlock_write_lock(&inet6_hook_lock); + for (i = 0; i < NELEMS(inet6_hooks); i++) + INIT_LIST_HEAD(&inet6_hooks[i]); + rte_rwlock_write_unlock(&inet6_hook_lock); + + return EDPVS_OK; +} + int inet_init(void) { int err; @@ -36,10 +85,18 @@ int inet_init(void) return err; if ((err = route_init()) != 0) return err; + if ((err = route6_init()) != 0) + return err; + if ((err = inet_hook_init()) != 0) + return err; if ((err = ipv4_init()) != 0) return err; + if ((err = ipv6_init()) != 0) + return err; if ((err = icmp_init()) != 0) return err; + if ((err = icmpv6_init()) != 0) + return 
err; if ((err = inet_addr_init()) != 0) return err; @@ -52,10 +109,16 @@ int inet_term(void) if ((err = inet_addr_term()) != 0) return err; + if ((err = icmpv6_term()) != 0) + return err; if ((err = icmp_term()) != 0) return err; + if ((err = ipv6_term()) != 0) + return err; if ((err = ipv4_term()) != 0) return err; + if ((err = route6_term()) != 0) + return err; if ((err = route_term()) != 0) return err; if ((err = neigh_term()) != 0) @@ -130,7 +193,176 @@ bool inet_addr_same_net(int af, uint8_t plen, case AF_INET: mask = htonl(~((0x1<<(32-plen)) - 1)); return !((addr1->in.s_addr^addr2->in.s_addr)&mask); + case AF_INET6: + return ipv6_prefix_equal(&addr1->in6, &addr2->in6, plen); default: return false; } } + +static int __inet_register_hooks(struct list_head *head, + struct inet_hook_ops *reg) +{ + struct inet_hook_ops *elem; + + /* check if exist */ + list_for_each_entry(elem, head, list) { + if (elem == reg) { + RTE_LOG(ERR, INET, "%s: hook already exist\n", __func__); + return EDPVS_EXIST; /* error ? 
*/ + } + } + + list_for_each_entry(elem, head, list) { + if (reg->priority < elem->priority) + break; + } + list_add(&reg->list, elem->list.prev); + + return EDPVS_OK; +} + +int INET_HOOK(int af, unsigned int hook, struct rte_mbuf *mbuf, + struct netif_port *in, struct netif_port *out, + int (*okfn)(struct rte_mbuf *mbuf)) +{ + struct list_head *hook_list; + struct inet_hook_ops *ops; + struct inet_hook_state state; + int verdict = INET_ACCEPT; + + state.hook = hook; + hook_list = af_inet_hooks(af, hook); + + rte_rwlock_read_lock(af_inet_hook_lock(af)); + + ops = list_entry(hook_list, struct inet_hook_ops, list); + + if (!list_empty(hook_list)) { + verdict = INET_ACCEPT; + list_for_each_entry_continue(ops, hook_list, list) { +repeat: + verdict = ops->hook(ops->priv, mbuf, &state); + if (verdict != INET_ACCEPT) { + if (verdict == INET_REPEAT) + goto repeat; + break; + } + } + } + + rte_rwlock_read_unlock(af_inet_hook_lock(af)); + + if (verdict == INET_ACCEPT || verdict == INET_STOP) { + return okfn(mbuf); + } else if (verdict == INET_DROP) { + rte_pktmbuf_free(mbuf); + return EDPVS_DROP; + } else { /* INET_STOLEN */ + return EDPVS_OK; + } +} + +int inet_register_hooks(struct inet_hook_ops *reg, size_t n) +{ + int af; + size_t i, err; + struct list_head *hook_list; + assert(reg); + + for (i = 0; i < n; i++) { + af = reg[i].af; + if (reg[i].hooknum >= INET_HOOK_NUMHOOKS || !reg[i].hook) { + err = EDPVS_INVAL; + goto rollback; + } + hook_list = af_inet_hooks(af, reg[i].hooknum); + + rte_rwlock_write_lock(af_inet_hook_lock(af)); + err = __inet_register_hooks(hook_list, &reg[i]); + rte_rwlock_write_unlock(af_inet_hook_lock(af)); + + if (err != EDPVS_OK) + goto rollback; + } + + return EDPVS_OK; + +rollback: + inet_unregister_hooks(reg, n); + return err; +} + +int inet_unregister_hooks(struct inet_hook_ops *reg, size_t n) +{ + int af; + size_t i; + struct inet_hook_ops *elem, *next; + struct list_head *hook_list; + assert(reg); + + for (i = 0; i < n; i++) { + af = reg[i].af; + 
if (reg[i].hooknum >= INET_HOOK_NUMHOOKS) { + RTE_LOG(WARNING, INET, "%s: bad hook number\n", __func__); + continue; /* return error ? */ + } + hook_list = af_inet_hooks(af, reg[i].hooknum); + +#ifdef CONFIG_DPVS_IPV4_INET_HOOK + rte_rwlock_write_lock(&inet_hook_lock); +#endif + list_for_each_entry_safe(elem, next, hook_list, list) { + if (elem == &reg[i]) { + list_del(&elem->list); + break; + } + } +#ifdef CONFIG_DPVS_IPV4_INET_HOOK + rte_rwlock_write_unlock(&inet_hook_lock); +#endif + if (&elem->list == hook_list) + RTE_LOG(WARNING, INET, "%s: hook not found\n", __func__); + } + + return EDPVS_OK; +} + +void inet_stats_add(struct inet_stats *stats, const struct inet_stats *diff) +{ + stats->inpkts += diff->inpkts; + stats->inoctets += diff->inoctets; + stats->indelivers += diff->indelivers; + stats->outforwdatagrams += diff->outforwdatagrams; + stats->outpkts += diff->outpkts; + stats->outoctets += diff->outoctets; + stats->inhdrerrors += diff->inhdrerrors; + stats->intoobigerrors += diff->intoobigerrors; + stats->innoroutes += diff->innoroutes; + stats->inaddrerrors += diff->inaddrerrors; + stats->inunknownprotos += diff->inunknownprotos; + stats->intruncatedpkts += diff->intruncatedpkts; + stats->indiscards += diff->indiscards; + stats->outdiscards += diff->outdiscards; + stats->outnoroutes += diff->outnoroutes; + stats->reasmtimeout += diff->reasmtimeout; + stats->reasmreqds += diff->reasmreqds; + stats->reasmoks += diff->reasmoks; + stats->reasmfails += diff->reasmfails; + stats->fragoks += diff->fragoks; + stats->fragfails += diff->fragfails; + stats->fragcreates += diff->fragcreates; + stats->inmcastpkts += diff->inmcastpkts; + stats->outmcastpkts += diff->outmcastpkts; + stats->inbcastpkts += diff->inbcastpkts; + stats->outbcastpkts += diff->outbcastpkts; + stats->inmcastoctets += diff->inmcastoctets; + stats->outmcastoctets += diff->outmcastoctets; + stats->inbcastoctets += diff->inbcastoctets; + stats->outbcastoctets += diff->outbcastoctets; + 
stats->csumerrors += diff->csumerrors; + stats->noectpkts += diff->noectpkts; + stats->ect1pkts += diff->ect1pkts; + stats->ect0pkts += diff->ect0pkts; + stats->cepkts += diff->cepkts; +} diff --git a/src/inetaddr.c b/src/inetaddr.c index 6c9429574..299a70078 100644 --- a/src/inetaddr.c +++ b/src/inetaddr.c @@ -27,7 +27,11 @@ #include "ctrl.h" #include "sa_pool.h" #include "inetaddr.h" +#include "neigh.h" +#include "netif_addr.h" #include "conf/inetaddr.h" +#include "route6.h" +#include "ndisc.h" #define IFA #define RTE_LOGTYPE_IFA RTE_LOGTYPE_USER1 @@ -36,6 +40,12 @@ #define INET_ADDR_HSIZE_SHIFT 8 #define INET_ADDR_HSIZE (1U << INET_ADDR_HSIZE_SHIFT) +enum ifaddr_timer_t +{ + INET_NONE, + INET_DAD +}; + static struct list_head in_addr_tab[INET_ADDR_HSIZE]; static rte_rwlock_t in_addr_lock; static rte_atomic32_t in_addr_cnt; @@ -91,13 +101,13 @@ static void ifa_set_lifetime(struct inet_ifaddr *ifa, static struct inet_ifaddr *__ifa_lookup(struct inet_device *idev, const union inet_addr *addr, - uint8_t plen) + uint8_t plen, int af) { struct inet_ifaddr *ifa; list_for_each_entry(ifa, &idev->ifa_list, d_list) { - if ((!plen || ifa->plen == plen) - && inet_addr_equal(idev->af, &ifa->addr, addr)) { + if ((!plen || ifa->plen == plen) && ifa->af == af + && inet_addr_equal(ifa->af, &ifa->addr, addr)) { return ifa; } } @@ -130,11 +140,11 @@ static inline void ___ifa_remove(struct inet_ifaddr *ifa) /* make lookup and remove atmomic, also cancel the timer */ static int __ifa_remove(struct inet_device *idev, const union inet_addr *addr, - uint8_t plen, struct inet_ifaddr **ifa) + uint8_t plen, struct inet_ifaddr **ifa, int af) { struct inet_ifaddr *ent; - if ((ent = __ifa_lookup(idev, addr, plen)) == NULL) + if ((ent = __ifa_lookup(idev, addr, plen, af)) == NULL) return EDPVS_NOTEXIST; if (rte_atomic32_read(&ent->refcnt) > 2) @@ -147,7 +157,7 @@ static int __ifa_remove(struct inet_device *idev, const union inet_addr *addr, return EDPVS_OK; } -static int ifa_add_route(struct 
inet_ifaddr *ifa) +static int __ifa_add_route4(struct inet_ifaddr *ifa) { int err; union inet_addr net; @@ -161,7 +171,7 @@ static int ifa_add_route(struct inet_ifaddr *ifa) if (ifa->plen == 32) return EDPVS_OK; - err = inet_addr_net(ifa->idev->af, &ifa->addr, &ifa->mask, &net); + err = inet_addr_net(ifa->af, &ifa->addr, &ifa->mask, &net); if (err != EDPVS_OK) goto errout; @@ -179,7 +189,50 @@ static int ifa_add_route(struct inet_ifaddr *ifa) return err; } -static int ifa_del_route(struct inet_ifaddr *ifa) +static int __ifa_add_route6(struct inet_ifaddr *ifa) +{ + int err; + struct in6_addr net; + + err = route6_add(&ifa->addr.in6, 128, RTF_LOCALIN, + &in6addr_any, ifa->idev->dev, + &in6addr_any, ifa->idev->dev->mtu); + + if (err != EDPVS_OK && err != EDPVS_EXIST) + return err; + + if (ifa->plen == 128) + return EDPVS_OK; + + ipv6_addr_prefix(&net, &ifa->addr.in6, ifa->plen); + + err = route6_add(&net, ifa->plen, RTF_FORWARD, + &in6addr_any, ifa->idev->dev, + &in6addr_any, ifa->idev->dev->mtu); + + if (err != EDPVS_OK && err != EDPVS_EXIST) + goto errout; + + return EDPVS_OK; + +errout: + route6_del(&ifa->addr.in6, 128, RTF_LOCALIN, + &in6addr_any, ifa->idev->dev, + &in6addr_any, ifa->idev->dev->mtu); + return err; +} + +static int ifa_add_route(struct inet_ifaddr *ifa) +{ + if (ifa->af == AF_INET) + return __ifa_add_route4(ifa); + else if(ifa->af == AF_INET6) + return __ifa_add_route6(ifa); + else + return EDPVS_NOTSUPP; +} + +static int __ifa_del_route4(struct inet_ifaddr *ifa) { int err; union inet_addr net; @@ -192,7 +245,7 @@ static int ifa_del_route(struct inet_ifaddr *ifa) if (ifa->plen == 32) return EDPVS_OK; - err = inet_addr_net(ifa->idev->af, &ifa->addr, &ifa->mask, &net); + err = inet_addr_net(ifa->af, &ifa->addr, &ifa->mask, &net); if (err != EDPVS_OK) RTE_LOG(WARNING, IFA, "%s: fail to delete route", __func__); @@ -204,6 +257,266 @@ static int ifa_del_route(struct inet_ifaddr *ifa) return EDPVS_OK; } +static int __ifa_del_route6(struct inet_ifaddr 
*ifa) +{ + int err; + struct in6_addr net; + + err = route6_del(&ifa->addr.in6, 128, RTF_LOCALIN, + &in6addr_any, ifa->idev->dev, + &in6addr_any, ifa->idev->dev->mtu); + if (err != EDPVS_OK && err != EDPVS_NOTEXIST) + RTE_LOG(WARNING, IFA, "%s: fail to delete route", __func__); + + if (ifa->plen == 128) + return EDPVS_OK; + + ipv6_addr_prefix(&net, &ifa->addr.in6, ifa->plen); + + err = route6_del(&net, ifa->plen, RTF_FORWARD, + &in6addr_any, ifa->idev->dev, + &in6addr_any, ifa->idev->dev->mtu); + if (err != EDPVS_OK && err != EDPVS_NOTEXIST) + RTE_LOG(WARNING, IFA, "%s: fail to delete route", __func__); + + return EDPVS_OK; +} + +static int ifa_del_route(struct inet_ifaddr *ifa) +{ + if (ifa->af == AF_INET) + return __ifa_del_route4(ifa); + else if(ifa->af == AF_INET6) + return __ifa_del_route6(ifa); + else + return EDPVS_NOTSUPP; +} + +static struct inet_ifmcaddr *__imc_lookup( int af, const struct inet_device *idev, + const union inet_addr *maddr) +{ + struct inet_ifmcaddr *imc; + + list_for_each_entry(imc, &idev->ifm_list, d_list) { + if (inet_addr_equal(af, &imc->addr, maddr)) { + return imc; + } + } + + return NULL; +} + +static int idev_mc_add(int af, struct inet_device *idev, + const union inet_addr *maddr) +{ + struct inet_ifmcaddr *imc; + + imc = __imc_lookup(af, idev, maddr); + if (imc) { + rte_atomic32_inc(&imc->refcnt); + return EDPVS_OK; + } + + imc = rte_calloc(NULL, 1, sizeof(struct inet_ifmcaddr), RTE_CACHE_LINE_SIZE); + if (!imc) { + return EDPVS_NOMEM; + } + + imc->idev = idev; + memcpy(&imc->addr, maddr, sizeof(*maddr)); + list_add(&imc->d_list, &idev->ifm_list); + rte_atomic32_set(&imc->refcnt, 1); + + return EDPVS_OK; +} + +static int idev_mc_del(int af, struct inet_device *idev, + const union inet_addr *maddr) +{ + struct inet_ifmcaddr *imc; + + imc = __imc_lookup(af, idev, maddr); + if (!imc) { + return EDPVS_NOTEXIST; + } + + rte_atomic32_dec(&imc->refcnt); + if (rte_atomic32_read(&imc->refcnt) < 1) { + list_del(&imc->d_list); + 
rte_free(imc); + } + return EDPVS_OK; +} + +/* support ipv6 only, and not support source filter */ +static int ifa_add_del_mcast(struct inet_ifaddr *ifa, bool add) +{ + union inet_addr iaddr; + struct ether_addr eaddr; + int err = 0; + + if (ifa->af != AF_INET6) + return EDPVS_OK; + + memset(&iaddr, 0, sizeof(iaddr)); + memset(&eaddr, 0, sizeof(eaddr)); + + addrconf_addr_solict_mult(&ifa->addr.in6, &iaddr.in6); + ipv6_mac_mult(&iaddr.in6, &eaddr); + + if (add) { + err = idev_mc_add(ifa->af, ifa->idev, &iaddr); + if (err) + return err; + + err = netif_mc_add(ifa->idev->dev, &eaddr); + if (err) { + /* rollback */ + idev_mc_del(ifa->af, ifa->idev, &iaddr); + return err; + } + } else { + err = idev_mc_del(ifa->af, ifa->idev, &iaddr); + if (err) + return err; + + err = netif_mc_del(ifa->idev->dev, &eaddr); + if (err) { + /* rollback */ + idev_mc_add(ifa->af, ifa->idev, &iaddr); + return err; + } + } + + return err; +} + +static int inet_ifaddr_dad_completed(void *arg) +{ + struct inet_ifaddr *ifa = arg; + + rte_rwlock_write_lock(&in_addr_lock); + ifa->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); + rte_rwlock_write_unlock(&in_addr_lock); + + return DTIMER_STOP; +} + +/* change timer callback, refer to 'addrconf_mod_timer' */ +static void inet_ifaddr_mod_timer(struct inet_ifaddr *ifa, + enum ifaddr_timer_t what, + struct timeval *when) +{ + dpvs_timer_cancel(&ifa->timer, true); + + switch (what) { + case INET_DAD: + dpvs_timer_sched(&ifa->timer, when, inet_ifaddr_dad_completed, + ifa, true); + break; + /* TODO: other timer support */ + default: + break; + } +} + +static void inet_ifaddr_dad_stop(struct inet_ifaddr *ifa, int dad_failed) +{ + rte_rwlock_write_lock(&in_addr_lock); + if (ifa->flags & IFA_F_PERMANENT) { + if (dad_failed && ifa->flags & IFA_F_TENTATIVE) + ifa->flags |= IFA_F_DADFAILED; + dpvs_timer_cancel(&ifa->timer, true); + rte_rwlock_write_unlock(&in_addr_lock); + } else if (ifa->flags & IFA_F_TEMPORARY) { + /* TODO: support privacy addr */ 
+ RTE_LOG(ERR, IFA, "%s: Not support privacy addr\n", __func__); + rte_rwlock_write_unlock(&in_addr_lock); + } else { + inet_addr_del(AF_INET6, ifa->idev->dev, &ifa->addr, ifa->plen); + rte_rwlock_write_unlock(&in_addr_lock); + } +} + +/* recv DAD: change ifa's state */ +void inet_ifaddr_dad_failure(struct inet_ifaddr *ifa) +{ + inet_ifaddr_dad_stop(ifa, 1); +} + +/* call me by lock */ +static void inet_ifaddr_dad_start(struct inet_ifaddr *ifa) +{ + struct timeval tv; + + if (ifa->flags & IFA_F_NODAD || + !(ifa->flags & IFA_F_TENTATIVE)) { + ifa->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); + return; + } + + tv.tv_sec = 3; + tv.tv_usec = 0; + + ifa->flags |= IFA_F_TENTATIVE | IFA_F_OPTIMISTIC; + inet_ifaddr_mod_timer(ifa, INET_DAD, &tv); + ndisc_send_dad(ifa->idev->dev, &ifa->addr.in6); +} + +/* + * no need to rollback, dpvs can not start successfully; + * should not be init in 'inetaddr_init'; + * because multicast address should be added after port_start + */ +int idev_add_mcast_init(struct netif_port *dev) +{ + struct inet_device *idev; + struct ether_addr eaddr_nodes, eaddr_routers; + union inet_addr all_nodes, all_routers; + int err = 0; + + idev = dev_get_idev(dev); + + memset(&eaddr_nodes, 0, sizeof(eaddr_nodes)); + memset(&eaddr_routers, 0, sizeof(eaddr_routers)); + + memcpy(&all_nodes, &in6addr_linklocal_allnodes, sizeof(all_nodes)); + memcpy(&all_routers, &in6addr_linklocal_allrouters, sizeof(all_routers)); + + ipv6_mac_mult(&all_nodes.in6, &eaddr_nodes); + ipv6_mac_mult(&all_routers.in6, &eaddr_routers); + + rte_rwlock_write_lock(&in_addr_lock); + err = idev_mc_add(AF_INET6, idev, &all_nodes); + if (err != EDPVS_OK) + goto errout; + err = netif_mc_add(idev->dev, &eaddr_nodes); + if (err != EDPVS_OK) + goto free_idev_nodes; + err = idev_mc_add(AF_INET6, idev, &all_routers); + if (err != EDPVS_OK) + goto free_netif_nodes; + err = netif_mc_add(idev->dev, &eaddr_routers); + if (err != EDPVS_OK) + goto free_idev_routers; + + 
rte_rwlock_write_unlock(&in_addr_lock); + idev_put(idev); + + return EDPVS_OK; + +free_idev_routers: + idev_mc_del(AF_INET6, idev, &all_routers); +free_netif_nodes: + netif_mc_del(idev->dev, &eaddr_nodes); +free_idev_nodes: + idev_mc_del(AF_INET6, idev, &all_nodes); +errout: + rte_rwlock_write_unlock(&in_addr_lock); + idev_put(idev); + return err; +} + static int ifa_expire(void *arg) { struct inet_ifaddr *ifa = arg; @@ -231,6 +544,7 @@ static int ifa_expire(void *arg) dpvs_timer_cancel(&ifa->timer, true); if (ifa->flags & IFA_F_SAPOOL) sa_pool_destroy(ifa); + ifa_add_del_mcast(ifa, false); ifa_del_route(ifa); idev_put(ifa->idev); rte_atomic32_dec(&ifa->idev->ifa_cnt); @@ -261,7 +575,7 @@ static int ifa_add_set(int af, const struct netif_port *dev, rte_rwlock_write_lock(&in_addr_lock); - ifa = __ifa_lookup(idev, addr, plen); + ifa = __ifa_lookup(idev, addr, plen, af); if (ifa && create) { err = EDPVS_EXIST; goto errout; @@ -277,18 +591,27 @@ static int ifa_add_set(int af, const struct netif_port *dev, goto errout; } + ifa->af = af; ifa->idev = idev; ifa->addr = *addr; ifa->plen = plen; ifa->flags = flags; - inet_plen_to_mask(af, plen, &ifa->mask); + + if (af == AF_INET) + inet_plen_to_mask(af, plen, &ifa->mask); + dpvs_time_now(&ifa->cstemp, true); rte_atomic32_init(&ifa->refcnt); + /* set mult*/ + err = ifa_add_del_mcast(ifa, true); + if (err != EDPVS_OK) + goto free_ifa; + /* set routes for local and network */ err = ifa_add_route(ifa); if (err != EDPVS_OK) - goto free_ifa; + goto del_mc; err = __ifa_insert(idev, ifa); if (err != EDPVS_OK) @@ -330,6 +653,16 @@ static int ifa_add_set(int af, const struct netif_port *dev, } } + /* TODO: support privacy addr, don't need it now */ + if (af == AF_INET6) { + assert(ifa->flags & IFA_F_PERMANENT); + } + + if ((af == AF_INET6) && (ifa->flags & IFA_F_PERMANENT)) { + ifa->flags |= IFA_F_TENTATIVE|IFA_F_OPTIMISTIC; + inet_ifaddr_dad_start(ifa); + } + rte_rwlock_write_unlock(&in_addr_lock); idev_put(idev); return EDPVS_OK; @@ 
-338,6 +671,8 @@ static int ifa_add_set(int af, const struct netif_port *dev, ___ifa_remove(ifa); del_route: ifa_del_route(ifa); +del_mc: + ifa_add_del_mcast(ifa, false); free_ifa: rte_free(ifa); errout: @@ -381,11 +716,12 @@ int inet_addr_del(int af, struct netif_port *dev, return EDPVS_RESOURCE; rte_rwlock_write_lock(&in_addr_lock); - err = __ifa_remove(idev, addr, plen, &ifa); + err = __ifa_remove(idev, addr, plen, &ifa, af); if (err == EDPVS_OK) { dpvs_timer_cancel(&ifa->timer, true); if (ifa->flags & IFA_F_SAPOOL) sa_pool_destroy(ifa); + ifa_add_del_mcast(ifa, false); ifa_del_route(ifa); idev_put(ifa->idev); rte_free(ifa); @@ -429,6 +765,7 @@ int inet_addr_flush(int af, struct netif_port *dev) dpvs_timer_cancel(&ifa->timer, true); if (ifa->flags & IFA_F_SAPOOL) sa_pool_destroy(ifa); + ifa_add_del_mcast(ifa, false); ifa_del_route(ifa); idev_put(ifa->idev); rte_free(ifa); @@ -463,11 +800,11 @@ struct netif_port *inet_addr_get_iface(int af, union inet_addr *addr) struct netif_port *dev; #ifdef INET_ADDR_LOCK - rte_rwlock_read_lock(&in_addr_lock); + rte_rwlock_read_lock(&in_addr_lock); #endif - dev = __inet_addr_get_iface(af, addr); + dev = __inet_addr_get_iface(af, addr); #ifdef INET_ADDR_LOCK - rte_rwlock_read_unlock(&in_addr_lock); + rte_rwlock_read_unlock(&in_addr_lock); #endif return dev; @@ -480,70 +817,111 @@ void inet_addr_select(int af, const struct netif_port *dev, struct inet_device *idev = dev_get_idev(dev); struct inet_ifaddr *ifa; - if (af == AF_INET) + if (!addr || !idev) + return; + + if (af == AF_INET) { addr->in.s_addr = htonl(INADDR_ANY); - else { + } else if (af == AF_INET6) { addr->in6 = in6addr_any; - return; /* not support IPv6 now */ - } - - if (!idev) + } else { + idev_put(idev); return; + } rte_rwlock_read_lock(&in_addr_lock); /* for each primary address */ - list_for_each_entry(ifa, &idev->ifa_list, d_list) { - if (ifa->flags & IFA_F_SECONDARY) - continue; - if (ifa->scope > scope) - continue; - if (!dst || inet_addr_same_net(af, 
ifa->plen, dst, &ifa->addr)) { + if (af == AF_INET) { + list_for_each_entry(ifa, &idev->ifa_list, d_list) { + if ((ifa->flags & IFA_F_SECONDARY) || + (ifa->flags & IFA_F_TENTATIVE)) + continue; + if (ifa->scope > scope) + continue; + if (!dst || inet_addr_same_net(af, ifa->plen, dst, &ifa->addr)) { + *addr = ifa->addr; + break; + } + + /* save it and may have better choise later */ *addr = ifa->addr; - break; } - - /* save it and may have better choise later */ - *addr = ifa->addr; + } else if (af == AF_INET6) { + ipv6_addr_select(idev, dst, addr); } /* should we use other interface's address ? */ rte_rwlock_read_unlock(&in_addr_lock); + idev_put(idev); return; } struct inet_ifaddr *inet_addr_ifa_get(int af, const struct netif_port *dev, union inet_addr *addr) { - struct inet_ifaddr *ifa = NULL; - struct inet_device *idev = NULL; - - assert(af == AF_INET && addr); + struct inet_ifaddr *ifa = NULL; + struct inet_device *idev = NULL; + assert(addr); #ifdef INET_ADDR_LOCK - rte_rwlock_write_lock(&in_addr_lock); + rte_rwlock_write_lock(&in_addr_lock); #endif - if (!dev) { - dev = __inet_addr_get_iface(AF_INET, addr); - if (!dev) - goto out; - } + if (!dev) { + dev = __inet_addr_get_iface(af, addr); + if (!dev) + goto out; + } - idev = dev_get_idev(dev); - assert(idev); + idev = dev_get_idev(dev); + assert(idev); - ifa = __ifa_lookup(idev, addr, 0); - if (!ifa) - goto out; + ifa = __ifa_lookup(idev, addr, 0, af); + if (!ifa) + goto out; - rte_atomic32_inc(&ifa->refcnt); + rte_atomic32_inc(&ifa->refcnt); out: #ifdef INET_ADDR_LOCK - rte_rwlock_write_unlock(&in_addr_lock); + rte_rwlock_write_unlock(&in_addr_lock); #endif - if (idev) - idev_put(idev); - return ifa; + if (idev) + idev_put(idev); + return ifa; +} + +/* support ipv6 only, refer linux:ipv6_chk_mcast_addr */ +bool inet_chk_mcast_addr(int af, struct netif_port *dev, + const union inet_addr *group, + const union inet_addr *src) +{ + struct inet_device *idev = NULL; + struct inet_ifmcaddr *imc; + int ret = false; 
+ + if (af != AF_INET6) + return true; + + idev = dev_get_idev(dev); + + if (idev) { + rte_rwlock_read_lock(&in_addr_lock); + + imc = __imc_lookup(af, idev, group); + if (imc){ + if (src && !ipv6_addr_any(&src->in6)) { + /* TODO: check source-specific multicast (SSM) if @src is assigned */ + ret = true; + } else { + ret = true; + } + } + + rte_rwlock_read_unlock(&in_addr_lock); + idev_put(idev); + } + + return ret; } /** @@ -639,7 +1017,9 @@ static int ifa_sockopt_get(sockoptid_t opt, const void *conf, size_t size, if (opt != SOCKOPT_GET_IFADDR_SHOW) return EDPVS_NOTSUPP; - if (param->af != AF_INET && param->af != AF_UNSPEC) + if (param->af != AF_INET && + param->af != AF_UNSPEC && + param->af != AF_INET6) return EDPVS_NOTSUPP; if (strlen(param->ifname)) { @@ -680,7 +1060,7 @@ static int ifa_sockopt_get(sockoptid_t opt, const void *conf, size_t size, list_for_each_entry(ifa, &idev->ifa_list, d_list) { if (off >= naddr) break; - ifa_fill_param(idev->af, &array->addrs[off++], ifa); + ifa_fill_param(ifa->af, &array->addrs[off++], ifa); } idev_put(idev); @@ -689,7 +1069,7 @@ static int ifa_sockopt_get(sockoptid_t opt, const void *conf, size_t size, list_for_each_entry(ifa, &in_addr_tab[hash], h_list) { if (off >= naddr) break; - ifa_fill_param(AF_INET, &array->addrs[off++], ifa); + ifa_fill_param(ifa->af, &array->addrs[off++], ifa); } } } diff --git a/src/ip_tunnel.c b/src/ip_tunnel.c index 15fcabb19..69a87ec2b 100644 --- a/src/ip_tunnel.c +++ b/src/ip_tunnel.c @@ -119,11 +119,11 @@ static int tunnel_bind_dev(struct netif_port *dev) if (tiph->daddr) { struct route_entry *rt; struct flow4 fl4 = { - .proto = tiph->protocol, - .daddr.s_addr = tiph->daddr, - .saddr.s_addr = tiph->saddr, - .tos = tiph->tos, - .oif = tnl->link, + .fl4_proto = tiph->protocol, + .fl4_daddr.s_addr = tiph->daddr, + .fl4_saddr.s_addr = tiph->saddr, + .fl4_tos = tiph->tos, + .fl4_oif = tnl->link, }; rt = route4_output(&fl4); @@ -828,11 +828,11 @@ int ip_tunnel_xmit(struct rte_mbuf *mbuf, struct 
netif_port *dev, rt = connected ? tnl->rt_cache : NULL; if (!rt) { /* not connected or no route cache */ - fl4.proto = proto; - fl4.daddr.s_addr = dip; - fl4.saddr.s_addr = tiph->saddr; - fl4.tos = tos; - fl4.oif = tnl->link; + fl4.fl4_proto = proto; + fl4.fl4_daddr.s_addr = dip; + fl4.fl4_saddr.s_addr = tiph->saddr; + fl4.fl4_tos = tos; + fl4.fl4_oif = tnl->link; rt = route4_output(&fl4); if (!rt) { diff --git a/src/ipv4.c b/src/ipv4.c index d4da4930f..e5e1a9282 100644 --- a/src/ipv4.c +++ b/src/ipv4.c @@ -30,8 +30,6 @@ #define IPV4 #define RTE_LOGTYPE_IPV4 RTE_LOGTYPE_USER1 -#define INET_MAX_PROTS 256 /* cannot change */ - #define IPV4_FORWARD_DEF false static bool ipv4_forward_switch = IPV4_FORWARD_DEF; @@ -56,7 +54,7 @@ static void ipv4_default_ttl_handler(vector_t tokens) FREE_PTR(str); } -static void ipv4_forward_handler(vector_t tokens) +static void ipv4_forwarding_handler(vector_t tokens) { char *str = set_value(tokens); assert(str); @@ -64,6 +62,11 @@ static void ipv4_forward_handler(vector_t tokens) ipv4_forward_switch = true; else if (strcasecmp(str, "off") == 0) ipv4_forward_switch = false; + else + RTE_LOG(WARNING, IPV4, "invalid ipv4:forwarding %s\n", str); + + RTE_LOG(INFO, IPV4, "ipv4:forwarding = %s\n", ipv4_forward_switch ? "on" : "off"); + FREE_PTR(str); } @@ -74,26 +77,16 @@ void ipv4_keyword_value_init(void) inet_def_ttl = INET_DEF_TTL; } /* KW_TYPE_NORMAL keyword */ + ipv4_forward_switch = false; } void install_ipv4_keywords(void) { install_keyword_root("ipv4_defs", NULL); install_keyword("default_ttl", ipv4_default_ttl_handler, KW_TYPE_INIT); - install_keyword("ipv4_forward", ipv4_forward_handler, KW_TYPE_INIT); + install_keyword("forwarding", ipv4_forwarding_handler, KW_TYPE_NORMAL); } -static struct list_head inet_hooks[INET_HOOK_NUMHOOKS]; -/** - * if remove this inet_hook_lock for performance, - * it assume all hook registeration are done - * during initialization, there's no race condition - * at that time. and never changed after that. 
- */ -#ifdef CONFIG_DPVS_IPV4_INET_HOOK -static rte_rwlock_t inet_hook_lock; -#endif - static const struct inet_protocol *inet_prots[INET_MAX_PROTS]; static rte_spinlock_t inet_prot_lock; /* to see if rwlock is better */ @@ -108,51 +101,6 @@ struct ip4_stats ip4_statistics; rte_spinlock_t ip4_stats_lock; #endif -int INET_HOOK(unsigned int hook, struct rte_mbuf *mbuf, - struct netif_port *in, struct netif_port *out, - int (*okfn)(struct rte_mbuf *mbuf)) -{ - struct list_head *hook_list; - struct inet_hook_ops *ops; - struct inet_hook_state state; - int verdict = INET_ACCEPT; - - state.hook = hook; - hook_list = &inet_hooks[hook]; - -#ifdef CONFIG_DPVS_IPV4_INET_HOOK - rte_rwlock_read_lock(&inet_hook_lock); -#endif - - ops = list_entry(hook_list, struct inet_hook_ops, list); - - if (!list_empty(hook_list)) { - verdict = INET_ACCEPT; - list_for_each_entry_continue(ops, hook_list, list) { -repeat: - verdict = ops->hook(ops->priv, mbuf, &state); - if (verdict != INET_ACCEPT) { - if (verdict == INET_REPEAT) - goto repeat; - break; - } - } - } - -#ifdef CONFIG_DPVS_IPV4_INET_HOOK - rte_rwlock_read_unlock(&inet_hook_lock); -#endif - - if (verdict == INET_ACCEPT || verdict == INET_STOP) { - return okfn(mbuf); - } else if (verdict == INET_DROP) { - rte_pktmbuf_free(mbuf); - return EDPVS_DROP; - } else { /* INET_STOLEN */ - return EDPVS_OK; - } -} - #ifdef CONFIG_DPVS_IPV4_DEBUG static void ip4_dump_hdr(const struct ipv4_hdr *iph, portid_t port) { @@ -260,7 +208,7 @@ static int ipv4_local_in(struct rte_mbuf *mbuf) } } - return INET_HOOK(INET_HOOK_LOCAL_IN, mbuf, + return INET_HOOK(AF_INET, INET_HOOK_LOCAL_IN, mbuf, netif_port_get(mbuf->port), NULL, ipv4_local_in_fin); } @@ -290,7 +238,7 @@ static int ipv4_output_fin2(struct rte_mbuf *mbuf) /* reuse @userdata/@udata64 for prio (used by tc:pfifo_fast) */ mbuf->udata64 = ((ip4_hdr(mbuf)->type_of_service >> 1) & 15); - err = neigh_resolve_output(&nexthop, mbuf, rt->port); + err = neigh_output(AF_INET, (union inet_addr *)&nexthop, 
mbuf, rt->port); route4_put(rt); return err; } @@ -312,7 +260,7 @@ int ipv4_output(struct rte_mbuf *mbuf) IP4_UPD_PO_STATS(out, mbuf->pkt_len); - return INET_HOOK(INET_HOOK_POST_ROUTING, mbuf, + return INET_HOOK(AF_INET, INET_HOOK_POST_ROUTING, mbuf, NULL, rt->port, ipv4_output_fin); } @@ -357,7 +305,7 @@ static int ipv4_forward(struct rte_mbuf *mbuf) iph->hdr_checksum = (uint16_t)(csum + (csum >= 0xffff)); iph->time_to_live--; - return INET_HOOK(INET_HOOK_FORWARD, mbuf, + return INET_HOOK(AF_INET, INET_HOOK_FORWARD, mbuf, netif_port_get(mbuf->port), rt->port, ipv4_forward_fin); drop: @@ -474,7 +422,8 @@ static int ipv4_rcv(struct rte_mbuf *mbuf, struct netif_port *port) ip4_dump_hdr(iph, mbuf->port); #endif - return INET_HOOK(INET_HOOK_PRE_ROUTING, mbuf, port, NULL, ipv4_rcv_fin); + return INET_HOOK(AF_INET, INET_HOOK_PRE_ROUTING, + mbuf, port, NULL, ipv4_rcv_fin); csum_error: IP4_INC_STATS(csumerrors); @@ -503,16 +452,6 @@ int ipv4_init(void) for (i = 0; i < IP4_IDENTS_SZ; i++) rte_atomic32_set(&ip4_idents[i], (uint32_t)random()); -#ifdef CONFIG_DPVS_IPV4_INET_HOOK - rte_rwlock_init(&inet_hook_lock); - rte_rwlock_write_lock(&inet_hook_lock); -#endif - for (i = 0; i < NELEMS(inet_hooks); i++) - INIT_LIST_HEAD(&inet_hooks[i]); -#ifdef CONFIG_DPVS_IPV4_INET_HOOK - rte_rwlock_write_unlock(&inet_hook_lock); -#endif - rte_spinlock_init(&inet_prot_lock); rte_spinlock_lock(&inet_prot_lock); for (i = 0; i < NELEMS(inet_prots); i++) @@ -576,7 +515,8 @@ int ipv4_local_out(struct rte_mbuf *mbuf) } else { ip4_send_csum(iph); } - return INET_HOOK(INET_HOOK_LOCAL_OUT, mbuf, NULL, rt->port, ipv4_output); + return INET_HOOK(AF_INET, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt->port, ipv4_output); } int ipv4_xmit(struct rte_mbuf *mbuf, const struct flow4 *fl4) @@ -584,7 +524,7 @@ int ipv4_xmit(struct rte_mbuf *mbuf, const struct flow4 *fl4) struct route_entry *rt; struct ipv4_hdr *iph; - if (!mbuf || !fl4 || fl4->daddr.s_addr == htonl(INADDR_ANY)) { + if (!mbuf || !fl4 || 
fl4->fl4_saddr.s_addr == htonl(INADDR_ANY)) { if (mbuf) rte_pktmbuf_free(mbuf); return EDPVS_INVAL; @@ -609,19 +549,19 @@ int ipv4_xmit(struct rte_mbuf *mbuf, const struct flow4 *fl4) /* build the IP header */ iph->version_ihl = ((4 << 4) | 5); - iph->type_of_service = fl4->tos; + iph->type_of_service = fl4->fl4_tos; iph->fragment_offset = 0; - iph->time_to_live = fl4->ttl ? fl4->ttl : INET_DEF_TTL; - iph->next_proto_id = fl4->proto; - iph->src_addr = fl4->saddr.s_addr; /* route will not fill fl4.saddr */ - iph->dst_addr = fl4->daddr.s_addr; + iph->time_to_live = fl4->fl4_ttl ? fl4->fl4_ttl : INET_DEF_TTL; + iph->next_proto_id = fl4->fl4_proto; + iph->src_addr = fl4->fl4_saddr.s_addr; /* route will not fill fl4.saddr */ + iph->dst_addr = fl4->fl4_daddr.s_addr; iph->packet_id = ip4_select_id(iph); if (iph->src_addr == htonl(INADDR_ANY)) { union inet_addr saddr; - inet_addr_select(AF_INET, rt->port, (union inet_addr *)&fl4->daddr, - fl4->scope, &saddr); + inet_addr_select(AF_INET, rt->port, (union inet_addr *)&fl4->fl4_daddr, + fl4->fl4_scope, &saddr); iph->src_addr = saddr.in.s_addr; } @@ -657,90 +597,3 @@ int ipv4_unregister_protocol(struct inet_protocol *prot, return err; } - -static int __inet_register_hooks(struct list_head *head, - struct inet_hook_ops *reg) -{ - struct inet_hook_ops *elem; - - /* check if exist */ - list_for_each_entry(elem, head, list) { - if (elem == reg) { - RTE_LOG(ERR, IPV4, "%s: hook already exist\n", __func__); - return EDPVS_EXIST; /* error ? 
*/ - } - } - - list_for_each_entry(elem, head, list) { - if (reg->priority < elem->priority) - break; - } - list_add(&reg->list, elem->list.prev); - - return EDPVS_OK; -} - -int ipv4_register_hooks(struct inet_hook_ops *reg, size_t n) -{ - size_t i, err; - struct list_head *hook_list; - assert(reg); - - for (i = 0; i < n; i++) { - if (reg[i].hooknum >= INET_HOOK_NUMHOOKS || !reg[i].hook) { - err = EDPVS_INVAL; - goto rollback; - } - hook_list = &inet_hooks[reg[i].hooknum]; - -#ifdef CONFIG_DPVS_IPV4_INET_HOOK - rte_rwlock_write_lock(&inet_hook_lock); -#endif - err = __inet_register_hooks(hook_list, &reg[i]); -#ifdef CONFIG_DPVS_IPV4_INET_HOOK - rte_rwlock_write_unlock(&inet_hook_lock); -#endif - - if (err != EDPVS_OK) - goto rollback; - } - - return EDPVS_OK; - -rollback: - ipv4_unregister_hooks(reg, n); - return err; -} - -int ipv4_unregister_hooks(struct inet_hook_ops *reg, size_t n) -{ - size_t i; - struct inet_hook_ops *elem, *next; - struct list_head *hook_list; - assert(reg); - - for (i = 0; i < n; i++) { - if (reg[i].hooknum >= INET_HOOK_NUMHOOKS) { - RTE_LOG(WARNING, IPV4, "%s: bad hook number\n", __func__); - continue; /* return error ? */ - } - hook_list = &inet_hooks[reg[i].hooknum]; - -#ifdef CONFIG_DPVS_IPV4_INET_HOOK - rte_rwlock_write_lock(&inet_hook_lock); -#endif - list_for_each_entry_safe(elem, next, hook_list, list) { - if (elem == &reg[i]) { - list_del(&elem->list); - break; - } - } -#ifdef CONFIG_DPVS_IPV4_INET_HOOK - rte_rwlock_write_unlock(&inet_hook_lock); -#endif - if (&elem->list == hook_list) - RTE_LOG(WARNING, IPV4, "%s: hook not found\n", __func__); - } - - return EDPVS_OK; -} diff --git a/src/ipv6/icmp6.c b/src/ipv6/icmp6.c new file mode 100644 index 000000000..2a4d12bba --- /dev/null +++ b/src/ipv6/icmp6.c @@ -0,0 +1,338 @@ +/* just for testing IPv6, not real ICMPv6 implementation. 
*/
+#include
+#include "ipv6.h"
+#include "common.h"
+#include "icmp6.h"
+#include "ndisc.h"
+
+#define ICMP6
+#define RTE_LOGTYPE_ICMP6 RTE_LOGTYPE_USER1
+
+#ifdef CONFIG_DPVS_ICMP_DEBUG
+/* Debug helper: print lcore, port and the ICMPv6 type/code of @mbuf to stderr. */
+static void icmp6_dump_hdr(const struct rte_mbuf *mbuf)
+{
+    struct icmp6_hdr *ich = rte_pktmbuf_mtod(mbuf, struct icmp6_hdr *);
+    lcoreid_t lcore = rte_lcore_id();
+
+    /*
+     * struct icmp6_hdr fields are icmp6_type/icmp6_code (the names every
+     * other function in this file uses), not icmp_type/icmp_code.
+     */
+    fprintf(stderr, "lcore %d port %d icmp type %u code %u\n",
+            lcore, mbuf->port, ich->icmp6_type, ich->icmp6_code);
+
+    return;
+}
+#endif
+
+/*
+ * Compute the ICMPv6 checksum of the message that starts at @ich.
+ *
+ * The length is @iph's payload length minus any bytes lying between the
+ * end of the fixed header and @ich. A zeroed pseudo header carrying only
+ * next-header, length, source and destination is passed to
+ * rte_ipv6_phdr_cksum(). A computed value of 0 is returned as 0xffff.
+ */
+uint16_t icmp6_csum(struct ip6_hdr *iph, struct icmp6_hdr *ich)
+{
+    uint32_t csum, l4_len;
+    struct ip6_hdr hdr;
+
+    /* must be linear !! */
+    l4_len = ntohs(iph->ip6_plen);
+    if ((void *)ich != (void *)(iph + 1))
+        l4_len -= (void *)ich - (void *)(iph+1);
+
+    memset(&hdr, 0, sizeof(struct ip6_hdr));
+    hdr.ip6_nxt = IPPROTO_ICMPV6;
+    hdr.ip6_plen = htons(l4_len);
+    hdr.ip6_src = iph->ip6_src;
+    hdr.ip6_dst = iph->ip6_dst;
+
+    csum = rte_raw_cksum(ich, l4_len);
+    csum += rte_ipv6_phdr_cksum((struct ipv6_hdr *)&hdr, 0);
+
+    csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
+    csum = (~csum) & 0xffff;
+    if (csum == 0)
+        csum = 0xffff;
+
+    return csum;
+}
+
+/*
+ * Fill in ich->icmp6_cksum for an outgoing message.
+ *
+ * @shdr supplies the pseudo-header fields. Its ip6_plen is read with
+ * ntohs(), so the caller must store it in network byte order.
+ */
+void icmp6_send_csum(struct ip6_hdr *shdr, struct icmp6_hdr *ich)
+{
+    uint32_t csum, l4_len;
+
+    /* the checksum field itself must be zero while summing */
+    ich->icmp6_cksum = 0;
+
+    l4_len = ntohs(shdr->ip6_plen);
+
+    csum = rte_raw_cksum(ich, l4_len);
+    csum += rte_ipv6_phdr_cksum((struct ipv6_hdr *)shdr, 0);
+
+    csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
+    csum = (~csum) & 0xffff;
+    if (csum == 0)
+        csum = 0xffff;
+
+    ich->icmp6_cksum = csum;
+}
+
+/*
+ * copy from kernel
+ * an inline helper for the "simple" if statement below
+ * checks if parameter problem report is caused by an
+ * unrecognized IPv6 option that has the Option Type
+ * highest-order two bits set to 10
+ */
+
+static bool icmp6_opt_unrec(struct rte_mbuf *imbuf, uint32_t offset)
+{
+    uint8_t *op, optval;
+
+    op = mbuf_header_pointer(imbuf, offset, sizeof(optval), &optval);
+    if (!op)
+        return true;
+
return (*op & 0xC0) == 0x80;
+}
+
+/*
+ * Figure out, may we reply to this packet with icmp error.
+ *
+ * We do not reply, if:
+ *  - it was icmp error message.
+ *  - it is truncated, so that it is known, that protocol is ICMPV6
+ *    (i.e. in the middle of some exthdr)
+ *
+ *  --ANK (980726)
+ *
+ * Returns 1 when no ICMPv6 error may be generated for @imbuf, 0 otherwise.
+ */
+
+static int icmp6_is_ineligible(struct rte_mbuf *imbuf)
+{
+    /*
+     * Offset of the first header after the fixed IPv6 header. This must be
+     * the size of struct ip6_hdr; sizeof(ip6_hdr) named the ip6_hdr()
+     * accessor instead and so produced a wrong starting offset.
+     */
+    int ptr = sizeof(struct ip6_hdr);
+    __u8 nexthdr;
+
+    if (mbuf_may_pull(imbuf, ptr) != 0) {
+        return 1;
+    }
+
+    /* read the fixed header only once it is known to be present */
+    nexthdr = ip6_hdr(imbuf)->ip6_nxt;
+
+    ptr = ip6_skip_exthdr(imbuf, ptr, &nexthdr);
+    if (ptr < 0)
+        return 0;
+
+    if (nexthdr == IPPROTO_ICMPV6) {
+        __u8 _type, *tp;
+        tp = mbuf_header_pointer(imbuf,
+                ptr + offsetof(struct icmp6_hdr, icmp6_type),
+                sizeof(_type), &_type);
+        /* unreadable type, or a type without the info-message bit: no reply */
+        if (tp == NULL ||
+            !(*tp & ICMP6_INFOMSG_MASK))
+            return 1;
+    }
+    return 0;
+}
+
+/*
+ * Build and transmit an ICMPv6 message in response to @imbuf.
+ *
+ * @imbuf is input (original) IP packet to trigger ICMP; it is not freed here.
+ * @type, @code: ICMPv6 type and code to put in the new message.
+ * @info: type specific value, stored via htonl() in the icmp6_pptr field.
+ */
+void icmp6_send(struct rte_mbuf *imbuf, int type, int code, uint32_t info)
+{
+    struct ip6_hdr *iph = ip6_hdr(imbuf);
+    eth_type_t etype = imbuf->packet_type; /* FIXME: use other field ? */
+    struct in6_addr *saddr = NULL;
+    struct ip6_hdr shdr; /* IPv6 header for sending packet */
+    struct rte_mbuf *mbuf;
+    struct icmp6_hdr *ich;
+    struct flow6 fl6;
+    struct inet_ifaddr *ifa;
+    int room, err;
+    int addr_type = 0;
+
+    /* reply from the original destination when it is one of our addresses */
+    ifa = inet_addr_ifa_get(AF_INET6, netif_port_get(imbuf->port),
+                            (union inet_addr *)&iph->ip6_dst);
+    if (ifa) {
+        saddr = &iph->ip6_dst;
+        inet_addr_ifa_put(ifa);
+    }
+
+    addr_type = ipv6_addr_type(&iph->ip6_dst);
+
+    /*
+     * when the original ipv6 dst is l2/l3 mcast, just deal ICMP6_PACKET_TOO_BIG and
+     * ICMP6_PARAM_PROB's unrecognize IPv6 option.
+ */ + if (addr_type & IPV6_ADDR_MULTICAST || etype != ETH_PKT_HOST) { + if (type != ICMP6_PACKET_TOO_BIG && + !(type == ICMP6_PARAM_PROB && + code == ICMP6_PARAMPROB_OPTION && + (icmp6_opt_unrec(imbuf, info)))) { + + RTE_LOG(DEBUG, ICMP6, + "%s: l2 broadcast or l3 multicast don't support the error.\n", + __func__); + return; + } + saddr = NULL; + } + + addr_type = ipv6_addr_type(&iph->ip6_src); + /* + * Must not send error if the source does not uniquely + * identify a single node (RFC2463 Section 2.4). + * We check unspecified / multicast addresses here, + * and anycast addresses will be checked later. + */ + if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { + RTE_LOG(DEBUG, ICMP6, "icmpv6_send: addr_any/mcast source\n"); + return; + } + + /* + * In icmp6_send, never answer to a ICMP packet except the type of ICMP6_INFOMSG_MASK. + */ + if (icmp6_is_ineligible(imbuf)) { + RTE_LOG(DEBUG, ICMP6, "icmpv6_send: no reply to icmp error\n"); + return; + } + + memset(&shdr, 0, sizeof(struct ip6_hdr)); + memset(&fl6, 0, sizeof(fl6)); + shdr.ip6_nxt = IPPROTO_ICMPV6; + shdr.ip6_dst = fl6.fl6_daddr = iph->ip6_src; + + fl6.fl6_proto = IPPROTO_ICMPV6; + fl6.fl6_oif = netif_port_get(imbuf->port); + if (saddr) { + shdr.ip6_src = fl6.fl6_saddr = *saddr; + } else { + inet_addr_select(AF_INET6, fl6.fl6_oif, + (union inet_addr *)&fl6.fl6_daddr, fl6.fl6_scope, + (union inet_addr *)&fl6.fl6_saddr); + shdr.ip6_src = fl6.fl6_saddr; + } + + mbuf = rte_pktmbuf_alloc(fl6.fl6_oif->mbuf_pool); + if (!mbuf) { + RTE_LOG(DEBUG, ICMP6, "%s: no memory.\n", __func__); + return; + } + assert(rte_pktmbuf_headroom(mbuf) >= 128); /* for L2/L3 */ + ich = (struct icmp6_hdr*)rte_pktmbuf_append(mbuf, sizeof(struct icmp6_hdr));; + if (!ich) { + RTE_LOG(DEBUG, ICMP6, "%s: no room in mbuf.\n", __func__); + rte_pktmbuf_free(mbuf); + return; + } + ich->icmp6_type = type; + ich->icmp6_code = code; + ich->icmp6_pptr = htonl(info); //use icmp6_pptr for store + + /* copy as much as we can 
without exceeding min-MTU */
+    room = min_t(int, fl6.fl6_oif->mtu, IPV6_MIN_MTU);
+    room -= sizeof(struct ip6_hdr);
+    room -= sizeof(struct icmp6_hdr);
+    room = min_t(int, imbuf->data_len, room);
+
+    if (!rte_pktmbuf_append(mbuf, room)) {
+        RTE_LOG(DEBUG, ICMP6, "%s: no room in mbuf.\n", __func__);
+        rte_pktmbuf_free(mbuf);
+        return;
+    }
+
+    mbuf_copy_bits(imbuf, 0, ich + 1, room);
+
+    /*
+     * icmp6_send_csum() reads ip6_plen with ntohs() and hands the header to
+     * rte_ipv6_phdr_cksum(), so the length must be in network byte order
+     * (as icmp6_echo_reply() already does).
+     */
+    shdr.ip6_plen = htons(room + sizeof(struct icmp6_hdr));
+    icmp6_send_csum(&shdr, ich);
+
+    if ((err = ipv6_xmit(mbuf, &fl6)) != EDPVS_OK) {
+        RTE_LOG(DEBUG, ICMP6, "%s: ipv6_xmit: %s.\n",
+                __func__, dpvs_strerror(err));
+    }
+    return;
+}
+
+/*
+ * Turn the echo request held in @mbuf into an echo reply in place and send
+ * it back to the requester through ipv6_xmit().
+ *
+ * @iph: IPv6 header of the request, @ich: its ICMPv6 header.
+ * Returns whatever ipv6_xmit() returns.
+ */
+static int icmp6_echo_reply(struct rte_mbuf *mbuf, struct ip6_hdr *iph,
+                            struct icmp6_hdr *ich)
+{
+    struct ip6_hdr shdr; /* IPv6 header for sending packet */
+    uint32_t icmp_len;
+    struct flow6 fl6;
+
+    /* must be linear !! */
+    icmp_len = ntohs(iph->ip6_plen);
+    if ((void *)ich != (void *)(iph + 1))
+        icmp_len -= (void *)ich - (void *)(iph+1);
+
+    /* reply */
+    ich->icmp6_type = ICMP6_ECHO_REPLY;
+
+    memset(&shdr, 0, sizeof(struct ip6_hdr));
+    memset(&fl6, 0, sizeof(struct flow6));
+
+    shdr.ip6_nxt = IPPROTO_ICMPV6;
+    shdr.ip6_plen = htons(icmp_len);
+    shdr.ip6_dst = fl6.fl6_daddr = iph->ip6_src;
+
+    fl6.fl6_proto = IPPROTO_ICMPV6;
+    fl6.fl6_oif = netif_port_get(mbuf->port);
+
+    /* answer from the address that was pinged unless it was multicast */
+    if (!ipv6_addr_is_multicast(&iph->ip6_dst)) {
+        shdr.ip6_src = fl6.fl6_saddr = iph->ip6_dst;
+    } else {
+        inet_addr_select(AF_INET6, fl6.fl6_oif,
+                         (union inet_addr *)&fl6.fl6_daddr, fl6.fl6_scope,
+                         (union inet_addr *)&fl6.fl6_saddr);
+        shdr.ip6_src = fl6.fl6_saddr;
+    }
+
+    icmp6_send_csum(&shdr, ich);
+
+    return ipv6_xmit(mbuf, &fl6);
+}
+
+/*
+ * ICMPv6 input handler. Expects mbuf->userdata to point at the packet's
+ * IPv6 header and the mbuf data to start at the ICMPv6 header.
+ */
+static int icmp6_rcv(struct rte_mbuf *mbuf)
+{
+    struct ip6_hdr *iph = mbuf->userdata;
+    struct icmp6_hdr *ich;
+
+    assert(iph);
+
+    if (mbuf_may_pull(mbuf, sizeof(struct icmp6_hdr)) != 0)
+        goto drop;
+
+    ich = rte_pktmbuf_mtod(mbuf, struct icmp6_hdr *);
+    if (unlikely(!ich))
+        goto drop;
+
+    if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0)
+ goto drop; + + if (icmp6_csum(iph, ich) != 0xffff) + goto drop; + +#ifdef CONFIG_DPVS_ICMP_DEBUG + icmp6_dump_hdr(mbuf); +#endif + switch (ich->icmp6_type) { + case ICMP6_ECHO_REQUEST: + return icmp6_echo_reply(mbuf, iph, ich); + + case ND_ROUTER_SOLICIT: + case ND_ROUTER_ADVERT: + case ND_NEIGHBOR_SOLICIT: + case ND_NEIGHBOR_ADVERT: + case ND_REDIRECT: + return ndisc_rcv(mbuf, netif_port_get(mbuf->port)); + + default : + return EDPVS_KNICONTINUE; + } + +drop: + rte_pktmbuf_free(mbuf); + return EDPVS_INVPKT; +} + +static struct inet6_protocol icmp6_proto = { + .handler = icmp6_rcv, + .flags = INET6_PROTO_F_FINAL, +}; + +int icmpv6_init(void) +{ + ipv6_register_protocol(&icmp6_proto, IPPROTO_ICMPV6); + return 0; +} + +int icmpv6_term(void) +{ + ipv6_unregister_protocol(&icmp6_proto, IPPROTO_ICMPV6); + return 0; +} diff --git a/src/ipv6/ipv6.c b/src/ipv6/ipv6.c new file mode 100644 index 000000000..d3f451ba4 --- /dev/null +++ b/src/ipv6/ipv6.c @@ -0,0 +1,894 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2018 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +/** + * IPv6 protocol for "lite stack". + * Linux Kernel net/ipv6/ is referred. + * + * Lei Chen , initial, Jul 2018. 
+ */
+#include
+#include
+#include
+#include
+#include
+#include "common.h"
+#include "mbuf.h"
+#include "inet.h"
+#include "ipv6.h"
+#include "route6.h"
+#include "parser/parser.h"
+#include "neigh.h"
+#include "icmp6.h"
+
+/*
+ * IPv6 inet hooks
+ */
+static const struct inet6_protocol *inet6_prots[INET_MAX_PROTS];
+static rte_rwlock_t inet6_prot_lock;
+
+/*
+ * IPv6 configures with default values.
+ */
+static bool conf_ipv6_forwarding = false;
+static bool conf_ipv6_disable = false;
+
+/*
+ * IPv6 statistics
+ */
+static RTE_DEFINE_PER_LCORE(struct inet_stats, ip6_stats);
+#define this_ip6_stats RTE_PER_LCORE(ip6_stats)
+
+#define IP6_INC_STATS(__f__) \
+    do { \
+        this_ip6_stats.__f__++; \
+    } while (0)
+
+#define IP6_DEC_STATS(__f__) \
+    do { \
+        this_ip6_stats.__f__--; \
+    } while (0)
+
+#define IP6_ADD_STATS(__f__, val) \
+    do { \
+        this_ip6_stats.__f__ += (val); \
+    } while (0)
+
+/* bump both the "<name>pkts" and "<name>octets" counters */
+#define IP6_UPD_PO_STATS(__f__, val) \
+    do { \
+        this_ip6_stats.__f__##pkts ++; \
+        this_ip6_stats.__f__##octets += (val); \
+    } while (0)
+
+/*
+ * internal functions
+ */
+
+/* Initialise the protocol-table lock and clear every handler slot. */
+static void ip6_prot_init(void)
+{
+    int i;
+
+    rte_rwlock_init(&inet6_prot_lock);
+    rte_rwlock_write_lock(&inet6_prot_lock);
+
+    for (i = 0; i < NELEMS(inet6_prots); i++)
+        inet6_prots[i] = NULL;
+
+    rte_rwlock_write_unlock(&inet6_prot_lock);
+}
+
+/*
+ * Config handler for the "forwarding" keyword: accepts "on"/"off"
+ * (case-insensitive); any other value is warned about and ignored.
+ */
+static void ip6_conf_forward(vector_t tokens)
+{
+    char *str = set_value(tokens);
+
+    assert(str);
+
+    if (strcasecmp(str, "on") == 0)
+        conf_ipv6_forwarding = true;
+    else if (strcasecmp(str, "off") == 0)
+        conf_ipv6_forwarding = false;
+    else
+        RTE_LOG(WARNING, IPV6, "invalid ipv6:forwarding %s\n", str);
+
+    RTE_LOG(INFO, IPV6, "ipv6:forwarding = %s\n", conf_ipv6_forwarding ? "on" : "off");
+
+    FREE_PTR(str);
+}
+
+/*
+ * Config handler for the "disable" keyword: accepts "on"/"off"
+ * (case-insensitive); any other value is warned about and ignored.
+ */
+static void ip6_conf_disable(vector_t tokens)
+{
+    char *str = set_value(tokens);
+
+    assert(str);
+
+    if (strcasecmp(str, "on") == 0)
+        conf_ipv6_disable = true;
+    else if (strcasecmp(str, "off") == 0)
+        conf_ipv6_disable = false;
+    else
+        RTE_LOG(WARNING, IPV6, "invalid ipv6:disable %s\n", str);
+
+    /* terminate the log line, as ip6_conf_forward() does */
+    RTE_LOG(INFO, IPV6, "ipv6:disable = %s\n", conf_ipv6_disable ? "on" : "off");
+
+    FREE_PTR(str);
+}
+
+/* refer linux:ip6_input_finish() */
+static int ip6_local_in_fin(struct rte_mbuf *mbuf)
+{
+    uint8_t nexthdr;
+    int (*handler)(struct rte_mbuf *mbuf) = NULL;
+    bool is_final, have_final = false;
+    const struct inet6_protocol *prot;
+    struct ip6_hdr *hdr = ip6_hdr(mbuf);
+    int ret = EDPVS_INVAL;
+
+    /*
+     * release route info saved in @userdata
+     * and set it to IPv6 fixed header for upper layer.
+     */
+    if (!ipv6_addr_is_multicast(&hdr->ip6_dst)) {
+        struct route6 *rt = mbuf->userdata;
+        if (rt) {
+            route6_put(rt);
+            mbuf->userdata = NULL;
+        }
+    }
+
+    mbuf->userdata = (void *)hdr;
+    nexthdr = hdr->ip6_nxt;
+
+    /* parse extension headers */
+resubmit:
+    /*
+     * l3_len is not the transport header length.
+     * we just borrow it to save info for each step when processing
+     * fixed header and extension header.
+     *
+     * l3_len is initially the fix header size (ipv6_rcv),
+     * and being set to ext-header size by each non-final protocol.
+ */ + if (rte_pktmbuf_adj(mbuf, mbuf->l3_len) == NULL) + goto discard; + +resubmit_final: + rte_rwlock_read_lock(&inet6_prot_lock); + + prot = inet6_prots[nexthdr]; + if (unlikely(!prot)) { + /* no proto, kni may like it.*/ + rte_rwlock_read_unlock(&inet6_prot_lock); + IP6_INC_STATS(inunknownprotos); + goto kni; + } + + is_final = (prot->flags & INET6_PROTO_F_FINAL); + + if (have_final) { + /* final proto don't allow encap non-final */ + if (!is_final) { + rte_rwlock_read_unlock(&inet6_prot_lock); + goto discard; + } + } else if (is_final) { + have_final = true; + + /* check mcast, if failed, kni may like it. */ + if (ipv6_addr_is_multicast(&hdr->ip6_dst) && + !inet_chk_mcast_addr(AF_INET6, netif_port_get(mbuf->port), + (union inet_addr *)&hdr->ip6_dst, + (union inet_addr *)&hdr->ip6_src)) { + rte_rwlock_read_unlock(&inet6_prot_lock); + goto kni; + } + } + + handler = prot->handler; + + /* tunnel may try lock again, need release lock */ + rte_rwlock_read_unlock(&inet6_prot_lock); + + assert(handler); + ret = handler(mbuf); + + /* + * 1. if return > 0, it's always "nexthdr", + * no matter if proto is final or not. + * 2. if return == 0, the pkt is consumed. + * 3. should not return < 0, or it'll be ignored. + * 4. mbuf->l3_len must be adjusted by handler. 
+ */ + if (ret > 0) { + nexthdr = ret; + + if (is_final) + goto resubmit_final; + else + goto resubmit; + } else { + IP6_INC_STATS(indelivers); + } + + return ret; + +kni: + return EDPVS_KNICONTINUE; + +discard: + IP6_INC_STATS(indiscards); + rte_pktmbuf_free(mbuf); + return EDPVS_INVAL; +} + +static int ip6_local_in(struct rte_mbuf *mbuf) +{ + return INET_HOOK(AF_INET6, INET_HOOK_LOCAL_IN, mbuf, + netif_port_get(mbuf->port), NULL, ip6_local_in_fin); +} + +static int ip6_mc_local_in(struct rte_mbuf *mbuf) +{ + struct ip6_hdr *iph = ip6_hdr(mbuf); + + IP6_UPD_PO_STATS(inmcast, mbuf->pkt_len); + + if (inet_chk_mcast_addr(AF_INET6, netif_port_get(mbuf->port), + (union inet_addr *)&iph->ip6_dst, NULL)) + return ip6_local_in(mbuf); + else + return EDPVS_KNICONTINUE; /* not drop */ +} + +static inline struct in6_addr *ip6_rt_nexthop(struct route6 *rt, + struct in6_addr *daddr) +{ + if (ipv6_addr_any(&rt->rt6_gateway)) + return daddr; + else + return &rt->rt6_gateway; +} + +static inline unsigned int ip6_mtu_forward(struct route6 *rt) +{ + if (rt->rt6_mtu) + return rt->rt6_mtu; + else if (rt->rt6_dev && rt->rt6_dev->mtu) + return rt->rt6_dev->mtu; + else + return IPV6_MIN_MTU; +} + +static int ip6_fragment(struct rte_mbuf *mbuf, uint32_t mtu, + int (*out)(struct rte_mbuf *)) +{ + struct route6 *rt = mbuf->userdata; + + /* TODO: */ + + IP6_INC_STATS(fragfails); + route6_put(rt); + rte_pktmbuf_free(mbuf); + return EDPVS_FRAG; +} + +static int ip6_output_fin2(struct rte_mbuf *mbuf) +{ + struct ip6_hdr *hdr = ip6_hdr(mbuf); + struct route6 *rt = NULL; + struct in6_addr *nexthop; + struct netif_port *dev; + int err; + + if (ipv6_addr_is_multicast(&hdr->ip6_dst)) { + IP6_UPD_PO_STATS(outmcast, mbuf->pkt_len); + + if (IPV6_ADDR_MC_SCOPE(&hdr->ip6_dst) <= IPV6_ADDR_SCOPE_NODELOCAL) { + IP6_INC_STATS(outdiscards); + rte_pktmbuf_free(mbuf); + return EDPVS_INVAL; + } + + dev = mbuf->userdata; + /* only support linklocal! 
*/ + nexthop = &hdr->ip6_dst; + + } else { + rt = mbuf->userdata; + dev = rt->rt6_dev; + nexthop = ip6_rt_nexthop(rt, &hdr->ip6_dst); + } + mbuf->packet_type = ETHER_TYPE_IPv6; + + err = neigh_output(AF_INET6, (union inet_addr *)nexthop, mbuf, dev); + + if (rt) + route6_put(rt); + + return err; +} + +static int ip6_output_fin(struct rte_mbuf *mbuf) +{ + uint16_t mtu; + struct ip6_hdr *hdr = ip6_hdr(mbuf); + + if (ipv6_addr_is_multicast(&hdr->ip6_dst)) + mtu = ((struct netif_port *)mbuf->userdata)->mtu; + else + mtu = ((struct route6 *)mbuf->userdata)->rt6_mtu; + + if (mbuf->pkt_len > mtu) + return ip6_fragment(mbuf, mtu, ip6_output_fin2); + else + return ip6_output_fin2(mbuf); +} + +int ip6_output(struct rte_mbuf *mbuf) +{ + struct netif_port *dev; + struct route6 *rt = NULL; + struct ip6_hdr *hdr = ip6_hdr(mbuf); + + if (ipv6_addr_is_multicast(&hdr->ip6_dst)) { + dev = mbuf->userdata; + } else { + rt = mbuf->userdata; + dev = rt->rt6_dev; + } + + IP6_UPD_PO_STATS(out, mbuf->pkt_len); + mbuf->port = dev->id; + + if (unlikely(conf_ipv6_disable)) { + IP6_INC_STATS(outdiscards); + if (rt) + route6_put(rt); + rte_pktmbuf_free(mbuf); + return EDPVS_OK; + } + + return INET_HOOK(AF_INET6, INET_HOOK_POST_ROUTING, mbuf, NULL, + dev, ip6_output_fin); +} + +int ip6_local_out(struct rte_mbuf *mbuf) +{ + struct netif_port *dev; + struct ip6_hdr *hdr = ip6_hdr(mbuf); + + if (ipv6_addr_is_multicast(&hdr->ip6_dst)) + dev = mbuf->userdata; + else + dev = ((struct route6 *)mbuf->userdata)->rt6_dev; + + return INET_HOOK(AF_INET6, INET_HOOK_LOCAL_OUT, mbuf, NULL, + dev, ip6_output); +} + +static int ip6_forward_fin(struct rte_mbuf *mbuf) +{ + IP6_INC_STATS(outforwdatagrams); + IP6_ADD_STATS(outoctets, mbuf->pkt_len); + + return ip6_output(mbuf); +} + +static int ip6_forward(struct rte_mbuf *mbuf) +{ + struct ip6_hdr *hdr = ip6_hdr(mbuf); + struct route6 *rt = mbuf->userdata; + int addrtype; + uint32_t mtu; + + if (!conf_ipv6_forwarding) + goto error; + + if (mbuf->packet_type != 
ETH_PKT_HOST)
+        goto drop;
+
+    /* not support forward multicast */
+    if (ipv6_addr_is_multicast(&hdr->ip6_dst))
+        goto error;
+
+    if (hdr->ip6_hlim <= 1) {
+        mbuf->port = rt->rt6_dev->id;
+        icmp6_send(mbuf, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_TRANSIT, 0);
+        IP6_INC_STATS(inhdrerrors);
+        goto drop;
+    }
+
+    /* security critical */
+    addrtype = ipv6_addr_type(&hdr->ip6_src);
+
+    if (addrtype == IPV6_ADDR_ANY ||
+        addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
+        goto error;
+
+    if (addrtype & IPV6_ADDR_LINKLOCAL) {
+        icmp6_send(mbuf, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_BEYONDSCOPE, 0);
+        goto error;
+    }
+
+    /* is packet too big ? */
+    mtu = ip6_mtu_forward(rt);
+    if (mtu < IPV6_MIN_MTU)
+        mtu = IPV6_MIN_MTU;
+
+    if (mbuf->pkt_len > mtu) {
+        mbuf->port = rt->rt6_dev->id;
+        icmp6_send(mbuf, ICMP6_PACKET_TOO_BIG, 0, mtu);
+
+        IP6_INC_STATS(intoobigerrors);
+        IP6_INC_STATS(fragfails);
+        goto drop;
+    }
+
+    /* decrease TTL */
+    hdr->ip6_hlim--;
+
+    return INET_HOOK(AF_INET6, INET_HOOK_FORWARD, mbuf,
+                     netif_port_get(mbuf->port), rt->rt6_dev, ip6_forward_fin);
+
+error:
+    IP6_INC_STATS(inaddrerrors);
+drop:
+    /*
+     * The route saved in @userdata is only released further down the
+     * output path (ip6_output_fin2 / ip6_output). On these early exits
+     * the packet never gets there, so drop the reference here before
+     * the mbuf goes away.
+     */
+    if (rt)
+        route6_put(rt);
+    rte_pktmbuf_free(mbuf);
+    return EDPVS_INVAL;
+}
+
+/*
+ * Look up the input route for @mbuf using a flow built from its receiving
+ * port and the destination, source and next-header of its IPv6 header.
+ * Returns the result of route6_input().
+ */
+static struct route6 *ip6_route_input(struct rte_mbuf *mbuf)
+{
+    struct ip6_hdr *hdr = ip6_hdr(mbuf);
+    struct flow6 fl6 = {
+        .fl6_iif    = netif_port_get(mbuf->port),
+        .fl6_daddr  = hdr->ip6_dst,
+        .fl6_saddr  = hdr->ip6_src,
+        .fl6_proto  = hdr->ip6_nxt,
+    };
+
+    return route6_input(mbuf, &fl6);
+}
+
+/*
+ * Final step of ip6_rcv() (passed to the PRE_ROUTING hook): multicast goes
+ * to ip6_mc_local_in(), everything else is routed and dispatched on the
+ * route flags.
+ */
+static int ip6_rcv_fin(struct rte_mbuf *mbuf)
+{
+    struct route6 *rt = NULL;
+    eth_type_t etype = mbuf->packet_type;
+    struct ip6_hdr *iph = ip6_hdr(mbuf);
+
+    if (ipv6_addr_type(&iph->ip6_dst) & IPV6_ADDR_MULTICAST)
+        return ip6_mc_local_in(mbuf);
+
+    rt = ip6_route_input(mbuf);
+    if (!rt) {
+        IP6_INC_STATS(innoroutes);
+        goto kni;
+    }
+
+    /*
+     * @userdata is used for route info in L3.
+ * someday, we may use extended mbuf if have more L3 info + * then route need to be saved into mbuf. + */ + mbuf->userdata = (void *)rt; + + if (rt->rt6_flags & RTF_LOCALIN) { + return ip6_local_in(mbuf); + } else if (rt->rt6_flags & RTF_FORWARD) { + /* pass multi-/broad-cast to kni */ + if (etype != ETH_PKT_HOST) + goto kni; + + return ip6_forward(mbuf); + } + + IP6_INC_STATS(innoroutes); + + /* to kni */ + +kni: + if (rt) { + route6_put(rt); + mbuf->userdata = NULL; + } + return EDPVS_KNICONTINUE; +} + +static int ip6_rcv(struct rte_mbuf *mbuf, struct netif_port *dev) +{ + const struct ip6_hdr *hdr; + uint32_t pkt_len, tot_len; + eth_type_t etype = mbuf->packet_type; + + if (unlikely(etype == ETH_PKT_OTHERHOST || !dev)) { + rte_pktmbuf_free(mbuf); + return EDPVS_DROP; + } + + IP6_UPD_PO_STATS(in, mbuf->pkt_len); + + if (unlikely(conf_ipv6_disable)) { + IP6_INC_STATS(indiscards); + goto drop; + } + + if (unlikely(mbuf_may_pull(mbuf, sizeof(*hdr)) != 0)) + goto err; + + hdr = ip6_hdr(mbuf); + + if (unlikely(((hdr->ip6_vfc&0xf0)>>4) != 6)) + goto err; + + /* + * we do not have loopback dev for DPVS at all, + * as RFC4291, loopback must be send/recv from lo dev. + * so let's drop all pkt with loopback address. + */ + if (ipv6_addr_loopback(&hdr->ip6_src) || + ipv6_addr_loopback(&hdr->ip6_dst)) + goto err; + + /* + * RFC4291 Errata ID: 3480 + * interface-local scope is useful only for loopback transmission of + * multicast but we do not have loopback dev. + */ + if (ipv6_addr_is_multicast(&hdr->ip6_dst) && + IPV6_ADDR_MC_SCOPE(&hdr->ip6_dst) == 1) + goto err; + + /* + * drop unicast encapsulated in link-layer multicast/broadcast. + * kernel is configurable, so need we ? 
*/
+    if (!ipv6_addr_is_multicast(&hdr->ip6_dst) &&
+        (etype == ETH_PKT_BROADCAST || etype == ETH_PKT_MULTICAST))
+        goto err;
+
+    /* RFC4291 2.7 */
+    if (ipv6_addr_is_multicast(&hdr->ip6_dst) &&
+        IPV6_ADDR_MC_SCOPE(&hdr->ip6_dst) == 0)
+        goto err;
+
+    /*
+     * RFC4291 2.7
+     * source address must not be multicast.
+     */
+    if (ipv6_addr_is_multicast(&hdr->ip6_src))
+        goto err;
+
+    pkt_len = ntohs(hdr->ip6_plen);
+    tot_len = pkt_len + sizeof(*hdr);
+
+    /* check pkt_len, note it's zero if jumbo payload option is present. */
+    if (pkt_len || hdr->ip6_nxt != NEXTHDR_HOP) {
+        if (tot_len > mbuf->pkt_len) {
+            IP6_INC_STATS(intruncatedpkts);
+            goto drop;
+        }
+
+        /* strip any bytes beyond the length the header announces */
+        if (mbuf->pkt_len > tot_len) {
+            if (rte_pktmbuf_trim(mbuf, mbuf->pkt_len - tot_len) != 0)
+                goto err;
+        }
+    }
+
+    /*
+     * now @l3_len record fix header only,
+     * it may change, when parsing extension headers.
+     * @userdata is used to save route info in L3.
+     */
+    mbuf->l3_len = sizeof(*hdr);
+    mbuf->userdata = NULL;
+
+    /* hop-by-hop option header */
+    if (hdr->ip6_nxt == NEXTHDR_HOP) {
+        if (ipv6_parse_hopopts(mbuf) != EDPVS_OK)
+            goto err;
+    }
+
+    return INET_HOOK(AF_INET6, INET_HOOK_PRE_ROUTING, mbuf,
+                     dev, NULL, ip6_rcv_fin);
+
+err:
+    IP6_INC_STATS(inhdrerrors);
+drop:
+    rte_pktmbuf_free(mbuf);
+    return EDPVS_DROP;
+}
+
+/* packet-type hook handing IPv6 frames to ip6_rcv(); .type is set in ipv6_init() */
+static struct pkt_type ip6_pkt_type = {
+    /*.type = */
+    .func = ip6_rcv,
+    .port = NULL,
+};
+
+/*
+ * IPv6 APIs
+ */
+
+/*
+ * Bring up the IPv6 layer: protocol table, extension-header handlers,
+ * packet-type registration and control plane, in that order.
+ *
+ * Returns EDPVS_OK, or the error of the first step that failed after
+ * undoing the steps that had already succeeded.
+ */
+int ipv6_init(void)
+{
+    int err;
+
+    ip6_prot_init();
+
+    err = ipv6_exthdrs_init();
+    if (err)
+        return err;
+
+    /* htons, cpu_to_be16 not work when struct initialization :( */
+    ip6_pkt_type.type = htons(ETHER_TYPE_IPv6);
+
+    err = netif_register_pkt(&ip6_pkt_type);
+    if (err)
+        goto reg_pkt_err;
+
+    err = ipv6_ctrl_init();
+    if (err)
+        goto ctrl_err;
+
+    return EDPVS_OK;
+
+    /*
+     * Unwind in reverse order of setup. A control-plane failure must undo
+     * the packet registration and then fall through to the ext-header
+     * teardown; a registration failure only needs the latter.
+     */
+ctrl_err:
+    netif_unregister_pkt(&ip6_pkt_type);
+reg_pkt_err:
+    ipv6_exthdrs_term();
+
+    return err;
+}
+
+/*
+ * Tear down the IPv6 layer in reverse order of ipv6_init(); stops and
+ * returns the error at the first step that fails.
+ */
+int ipv6_term(void)
+{
+    int err;
+
+    err = ipv6_ctrl_term();
+    if (err)
+        return err;
+
+    err
= netif_unregister_pkt(&ip6_pkt_type); + if (err) + return err; + + ipv6_exthdrs_term(); + + return EDPVS_OK; +} + +int ipv6_xmit(struct rte_mbuf *mbuf, struct flow6 *fl6) +{ + struct route6 *rt = NULL; + struct ip6_hdr *hdr; + struct netif_port *dev; + + if (unlikely(!mbuf || !fl6 || ipv6_addr_any(&fl6->fl6_daddr))) { + if (mbuf) + rte_pktmbuf_free(mbuf); + return EDPVS_INVAL; + } + + /* TODO: to support jumbo packet */ + if (mbuf->pkt_len > IPV6_MAXPLEN) { + IP6_INC_STATS(outdiscards); + rte_pktmbuf_free(mbuf); + return EDPVS_NOROOM; + } + + if (unlikely(ipv6_addr_is_multicast(&fl6->fl6_daddr))) { + /* only support linklocal now */ + if (IPV6_ADDR_MC_SCOPE(&fl6->fl6_daddr) + != IPV6_ADDR_SCOPE_LINKLOCAL) { + IP6_INC_STATS(outnoroutes); + rte_pktmbuf_free(mbuf); + return EDPVS_NOTSUPP; + } + assert(fl6->fl6_oif); + mbuf->userdata = (void *)fl6->fl6_oif; + dev = fl6->fl6_oif; + + } else { + /* route decision */ + rt = route6_output(mbuf, fl6); + if (!rt) { + IP6_INC_STATS(outnoroutes); + rte_pktmbuf_free(mbuf); + return EDPVS_NOROUTE; + } + mbuf->userdata = (void *)rt; + dev = rt->rt6_dev; + } + + hdr = (void *)rte_pktmbuf_prepend(mbuf, sizeof(*hdr)); + if (unlikely(!hdr)) { + if (rt) + route6_put(rt); + rte_pktmbuf_free(mbuf); + IP6_INC_STATS(outdiscards); + return EDPVS_NOROOM; + } + + memset(hdr, 0, sizeof(*hdr)); + hdr->ip6_vfc = 0x60; + hdr->ip6_flow |= htonl(((uint64_t)fl6->fl6_tos<<20) | \ + (ntohl(fl6->fl6_flow)&0xfffffUL)); + hdr->ip6_plen = htons(mbuf->pkt_len - sizeof(*hdr)); + hdr->ip6_nxt = fl6->fl6_proto; + hdr->ip6_hlim = fl6->fl6_ttl ? 
: INET_DEF_TTL; + hdr->ip6_src = fl6->fl6_saddr; + hdr->ip6_dst = fl6->fl6_daddr; + + if (ipv6_addr_any(&hdr->ip6_src) && + hdr->ip6_nxt != IPPROTO_ICMPV6) { + union inet_addr saddr; + + inet_addr_select(AF_INET6, dev, (void *)&fl6->fl6_daddr, + fl6->fl6_scope, &saddr); + hdr->ip6_src = saddr.in6; + } + + return ip6_local_out(mbuf); +} + +int ipv6_register_protocol(struct inet6_protocol *prot, + unsigned char protocol) +{ + int err = EDPVS_OK; + + rte_rwlock_write_lock(&inet6_prot_lock); + if (inet6_prots[protocol]) + err = EDPVS_EXIST; + else + inet6_prots[protocol] = prot; + rte_rwlock_write_unlock(&inet6_prot_lock); + + return err; +} + +int ipv6_unregister_protocol(struct inet6_protocol *prot, + unsigned char protocol) +{ + int err = EDPVS_OK; + + rte_rwlock_write_lock(&inet6_prot_lock); + if (inet6_prots[protocol] != prot) + err = EDPVS_NOTEXIST; + else + inet6_prots[protocol] = NULL; + rte_rwlock_write_unlock(&inet6_prot_lock); + + return err; +} + +int ipv6_stats_cpu(struct inet_stats *stats) +{ + if (!stats) + return EDPVS_INVAL; + + memcpy(stats, &this_ip6_stats, sizeof(*stats)); + + return EDPVS_OK; +} + +/* + * configure file + */ +void ipv6_keyword_value_init(void) +{ + if (dpvs_state_get() == DPVS_STATE_INIT) { + /* KW_TYPE_INIT keyword */ + } + /* KW_TYPE NORMAL keyword */ + conf_ipv6_forwarding = false; + conf_ipv6_disable = false; + + route6_keyword_value_init(); +} + +void install_ipv6_keywords(void) +{ + install_keyword_root("ipv6_defs", NULL); + install_keyword("forwarding", ip6_conf_forward, KW_TYPE_NORMAL); + install_keyword("disable", ip6_conf_disable, KW_TYPE_NORMAL); + + install_route6_keywords(); +} + +/* + * ip6_hdrlen: get ip6 header length, including extension header length + */ +int ip6_hdrlen(const struct rte_mbuf *mbuf) { + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + uint8_t ip6nxt = ip6h->ip6_nxt; + int ip6_hdrlen = ip6_skip_exthdr(mbuf, sizeof(struct ip6_hdr), &ip6nxt); + + /* ip6_skip_exthdr may return -1 */ + return (ip6_hdrlen >= 0) 
? ip6_hdrlen : sizeof(struct ip6_hdr); +} + +/* + * "ip6_phdr_cksum" is a upgraded version of DPDK routine "rte_ipv6_phdr_cksum" + * to support IPv6 extension headers (RFC 2460). + * */ +uint16_t ip6_phdr_cksum(struct ip6_hdr *ip6h, uint64_t ol_flags, + uint32_t exthdrlen, uint8_t l4_proto) +{ + uint16_t csum; + uint8_t ip6nxt = ip6h->ip6_nxt; + uint32_t ip6plen = ip6h->ip6_plen; + struct in6_addr ip6dst = ip6h->ip6_dst; + + ip6h->ip6_nxt = l4_proto; + + /* length of L4 header plus L4 data */ + ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) + + sizeof(struct ip6_hdr) - exthdrlen); + + /* ip6_dst translation for NEXTHDR_ROUTING exthdrs */ + if (unlikely(ip6nxt == NEXTHDR_ROUTING)) { + struct ip6_rthdr0 *rh = (struct ip6_rthdr0 *)(ip6h + 1); + if (likely(rh->ip6r0_segleft > 0)) + ip6h->ip6_dst = rh->ip6r0_addr[rh->ip6r0_segleft - 1]; + } + /*FIXME: what if NEXTHDR_ROUTING is not the first exthdr? */ + + csum = rte_ipv6_phdr_cksum((struct ipv6_hdr *)ip6h, ol_flags); + + /* restore original ip6h header */ + ip6h->ip6_nxt = ip6nxt; + ip6h->ip6_plen = ip6plen; + if (unlikely(ip6nxt == NEXTHDR_ROUTING)) + ip6h->ip6_dst = ip6dst; + + return csum; +} + +/* + * "ip6_udptcp_cksum" is a upgraded version of DPDK routine "rte_ipv6_udptcp_cksum" + * to support IPv6 extension headers (RFC 2460). 
+ * */ +uint16_t ip6_udptcp_cksum(struct ip6_hdr *ip6h, const void *l4_hdr, + uint32_t exthdrlen, uint8_t l4_proto) +{ + uint16_t csum; + uint8_t ip6nxt = ip6h->ip6_nxt; + uint32_t ip6plen = ip6h->ip6_plen; + struct in6_addr ip6dst = ip6h->ip6_dst; + + ip6h->ip6_nxt = l4_proto; + + /* length of L4 header plus L4 data */ + ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) + + sizeof(struct ip6_hdr) - exthdrlen); + + /* ip6_dst translation for NEXTHDR_ROUTING exthdrs */ + if (unlikely(ip6nxt == NEXTHDR_ROUTING)) { + struct ip6_rthdr0 *rh = (struct ip6_rthdr0 *)(ip6h + 1); + if (likely(rh->ip6r0_segleft > 0)) + ip6h->ip6_dst = rh->ip6r0_addr[rh->ip6r0_segleft - 1]; + } + /*FIXME: what if NEXTHDR_ROUTING is not the first exthdr? */ + + csum = rte_ipv6_udptcp_cksum((struct ipv6_hdr *)ip6h, l4_hdr); + + /* restore original ip6h header */ + ip6h->ip6_nxt = ip6nxt; + ip6h->ip6_plen = ip6plen; + if (unlikely(ip6nxt == NEXTHDR_ROUTING)) + ip6h->ip6_dst = ip6dst; + + return csum; +} diff --git a/src/ipv6/ipv6_ctrl.c b/src/ipv6/ipv6_ctrl.c new file mode 100644 index 000000000..5c6abb47d --- /dev/null +++ b/src/ipv6/ipv6_ctrl.c @@ -0,0 +1,155 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2018 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +/** + * IPv6 control plane. + * + * Lei Chen , initial, Jul 2018. 
+ */ +#include +#include "common.h" +#include "dpdk.h" +#include "inet.h" +#include "ipv6.h" +#include "conf/ipv6.h" +#include "ctrl.h" + +static int ip6_msg_get_stats(struct dpvs_msg *msg) +{ + int err; + struct inet_stats *stats; + assert(msg); + + stats = rte_zmalloc(NULL, sizeof(*stats), 0); + if (!stats) + return EDPVS_NOMEM; + + err = ipv6_stats_cpu(stats); + if (err != EDPVS_OK) { + rte_free(stats); + return err; + } + + msg->reply.len = sizeof(*stats); + msg->reply.data = stats; + + return EDPVS_OK; +} + +static int ip6_sockopt_set(sockoptid_t opt, const void *in, size_t inlen) +{ + return EDPVS_NOTSUPP; +} + +static int ip6_sockopt_get(sockoptid_t opt, const void *conf, size_t size, + void **out, size_t *outsize) +{ + struct ip6_stats_param *param; + struct dpvs_msg *req, *reply; + struct dpvs_multicast_queue *replies = NULL; + int err; + + if (opt != SOCKOPT_IP6_STATS) + return EDPVS_NOTSUPP; + + if (!out || !outsize) + return EDPVS_INVAL; + + /* ask each worker lcore for stats by msg */ + req = msg_make(MSG_TYPE_IPV6_STATS, 0, DPVS_MSG_MULTICAST, + rte_lcore_id(), 0, NULL); + if (!req) + return EDPVS_NOMEM; + + /* including per-lcore and total statistics. 
*/
+    param = rte_zmalloc(NULL, sizeof(struct ip6_stats_param), 0);
+    if (!param) {
+        msg_destroy(&req);
+        return EDPVS_NOMEM;
+    }
+
+    err = multicast_msg_send(req, 0, &replies);
+    if (err != EDPVS_OK) {
+        RTE_LOG(ERR, IPV6, "%s: send msg: %s\n", __func__, dpvs_strerror(err));
+        msg_destroy(&req);
+        rte_free(param);
+        return err;
+    }
+
+    /* handle each reply */
+    list_for_each_entry(reply, &replies->mq, mq_node) {
+        struct inet_stats *stats = (struct inet_stats *)reply->data;
+
+        /* accumulate into the total and keep the per-lcore copy */
+        inet_stats_add(&param->stats, stats);
+        param->stats_cpus[reply->cid] = *stats;
+    }
+
+    /* on success the result buffer is handed to the caller through @out */
+    *out = param;
+    *outsize = sizeof(*param);
+
+    msg_destroy(&req);
+    return EDPVS_OK;
+}
+
+/* per-lcore statistics request, answered by ip6_msg_get_stats() */
+static struct dpvs_msg_type ip6_stats_msg = {
+    .type = MSG_TYPE_IPV6_STATS,
+    .unicast_msg_cb = ip6_msg_get_stats,
+};
+
+/* sockopt ranges served by this file: one set id, one get id */
+static struct dpvs_sockopts ip6_sockopts = {
+    .version = SOCKOPT_VERSION,
+    .set_opt_min = SOCKOPT_IP6_SET,
+    .set_opt_max = SOCKOPT_IP6_SET,
+    .set = ip6_sockopt_set,
+
+    .get_opt_min = SOCKOPT_IP6_STATS,
+    .get_opt_max = SOCKOPT_IP6_STATS,
+    .get = ip6_sockopt_get,
+};
+
+/*
+ * Register the IPv6 sockopts and the statistics message type.
+ * If the message type cannot be registered the sockopts are unregistered
+ * again and the error is returned.
+ */
+int ipv6_ctrl_init(void)
+{
+    int err;
+
+    err = sockopt_register(&ip6_sockopts);
+    if (err != EDPVS_OK)
+        return err;
+
+    err = msg_type_mc_register(&ip6_stats_msg);
+    if (err != EDPVS_OK) {
+        RTE_LOG(ERR, IPV6, "%s: fail to register msg\n", __func__);
+        sockopt_unregister(&ip6_sockopts);
+        return err;
+    }
+
+    return EDPVS_OK;
+}
+
+/*
+ * Unregister the statistics message type and the sockopts.
+ * Failures are only logged; the function always returns EDPVS_OK.
+ */
+int ipv6_ctrl_term(void)
+{
+    int err;
+
+    err = msg_type_mc_unregister(&ip6_stats_msg);
+    if (err != EDPVS_OK)
+        RTE_LOG(WARNING, IPV6, "%s: fail to unregister msg\n", __func__);
+
+    err = sockopt_unregister(&ip6_sockopts);
+    if (err != EDPVS_OK)
+        RTE_LOG(WARNING, IPV6, "%s: fail to unregister sockopt\n", __func__);
+
+    return EDPVS_OK;
+}
diff --git a/src/ipv6/ipv6_exthdrs.c b/src/ipv6/ipv6_exthdrs.c
new file mode 100644
index 000000000..73d02ca1b
--- /dev/null
+++ b/src/ipv6/ipv6_exthdrs.c
@@ -0,0 +1,191 @@
+/*
+ * DPVS is a software load balancer (Virtual Server) based on DPDK.
+ *
+ * Copyright (C) 2018 iQIYI (www.iqiyi.com).
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+/**
+ * IPv6 protocol for "lite stack".
+ * Linux Kernel net/ipv6/exthdrs.c is referred.
+ *
+ * Lei Chen , initial, Jul 2018.
+ */
+#include
+#include "ipv6.h"
+
+/*
+ * Tell whether @nexthdr names an extension header (non-zero) or an
+ * upper-layer protocol (zero).
+ */
+static int ip6_ext_hdr(__u8 nexthdr)
+{
+    switch (nexthdr) {
+    case NEXTHDR_HOP:
+    case NEXTHDR_ROUTING:
+    case NEXTHDR_FRAGMENT:
+    case NEXTHDR_AUTH:
+    case NEXTHDR_NONE:
+    case NEXTHDR_DEST:
+        return 1;
+    default:
+        return 0;
+    }
+}
+
+/*
+ * Walk the chain of extension headers and report where it ends.
+ *
+ * @imbuf: packet message buffer
+ * @start: offset of the first extension header, basically
+ *         sizeof(struct ip6_hdr)
+ * @nexthdrp: in: the 'ip6_nxt' value of the ipv6 basic header
+ *            out: the next-header value at which the walk stopped
+ *
+ * @return offset reached by the walk, or -1 when a "no next header" is met
+ *         or a header cannot be read from the mbuf.  The walk also stops,
+ *         without skipping it, at a fragment header whose offset field
+ *         (with the low three bits masked off) is non-zero.
+ */
+int ip6_skip_exthdr(const struct rte_mbuf *imbuf, int start, __u8 *nexthdrp)
+{
+    __u8 proto = *nexthdrp;
+    int offset = start;
+
+    while (ip6_ext_hdr(proto)) {
+        struct ip6_ext ext_buf, *ext;
+        int ext_len;
+
+        if (proto == NEXTHDR_NONE)
+            return -1;
+
+        ext = mbuf_header_pointer(imbuf, offset, sizeof(ext_buf), &ext_buf);
+        if (ext == NULL)
+            return -1;
+
+        switch (proto) {
+        case NEXTHDR_FRAGMENT: {
+            __be16 frag_buf, *frag;
+
+            frag = mbuf_header_pointer(imbuf,
+                                       offset + offsetof(struct ip6_frag, ip6f_offlg),
+                                       sizeof(frag_buf),
+                                       &frag_buf);
+            if (frag == NULL)
+                return -1;
+
+            /* stop here: proto and offset stay on the fragment header */
+            if (ntohs(*frag) & ~0x7)
+                goto finished;
+
+            ext_len = 8;
+            break;
+        }
+        case NEXTHDR_AUTH:
+            /* length field counts 4-byte units here */
+            ext_len = (ext->ip6e_len + 2) << 2;
+            break;
+        default:
+            /* length field counts 8-byte units beyond the first 8 bytes */
+            ext_len = (ext->ip6e_len + 1) << 3;
+            break;
+        }
+
+        proto = ext->ip6e_nxt;
+        offset += ext_len;
+    }
+
+finished:
+    *nexthdrp = proto;
+    return offset;
+}
+
+/*
+ * Placeholder handler for an extension header: it only validates that the
+ * header is present, records its length and reports the next header.
+ *
+ * Returns the next-header value, or -1 after freeing @mbuf when the header
+ * cannot be pulled, the destination is multicast, or the packet type is not
+ * ETH_PKT_HOST.
+ */
+static int ip6_dummy_hdr_rcv(struct rte_mbuf *mbuf)
+{
+    struct ip6_hdr *ip6h = mbuf->userdata;
+    struct ip6_ext *ext;
+
+    /* the fixed first 8 bytes must be there before the length is read */
+    if (mbuf_may_pull(mbuf, 8) != 0)
+        goto drop;
+
+    ext = rte_pktmbuf_mtod(mbuf, struct ip6_ext *);
+
+    if (mbuf_may_pull(mbuf, 8 + (ext->ip6e_len<<3)) != 0 ||
+        ipv6_addr_is_multicast(&ip6h->ip6_dst) ||
+        mbuf->packet_type != ETH_PKT_HOST)
+        goto drop;
+
+    /* the header content itself is not interpreted */
+
+    /* l3_len carries the length of the current header only */
+    mbuf->l3_len = 8 + (ext->ip6e_len<<3);
+    return ext->ip6e_nxt;
+
+drop:
+    rte_pktmbuf_free(mbuf);
+    return -1;
+}
+
+/* Routing header: not interpreted yet (TODO), parsed like a dummy header. */
+static int ip6_rthdr_rcv(struct rte_mbuf *mbuf)
+{
+    return ip6_dummy_hdr_rcv(mbuf);
+}
+
+/* Destination options: not interpreted yet (TODO), parsed like a dummy header. */
+static int ip6_destopt_rcv(struct rte_mbuf *mbuf)
+{
+    return ip6_dummy_hdr_rcv(mbuf);
+}
+
+/* "No next header": there is no payload, so the packet is simply consumed. */
+static int ip6_nodata_rcv(struct rte_mbuf *mbuf)
+{
+    rte_pktmbuf_free(mbuf);
+    return 0;
+}
+
+static struct inet6_protocol rthdr_proto = {
+    .handler = ip6_rthdr_rcv,
+};
+
+static struct inet6_protocol destopt_proto = {
+    .handler = ip6_destopt_rcv,
+};
+
+static struct inet6_protocol nodata_proto = {
+    .handler = ip6_nodata_rcv,
+};
+
+/*
+ * Register the routing, destination-option and no-next-header handlers.
+ * If one registration fails, the ones already made are unregistered in
+ * reverse order and the error is returned.
+ */
+int ipv6_exthdrs_init(void)
+{
+    int rc;
+
+    rc = ipv6_register_protocol(&rthdr_proto, IPPROTO_ROUTING);
+    if (rc)
+        return rc;
+
+    rc = ipv6_register_protocol(&destopt_proto, IPPROTO_DSTOPTS);
+    if (rc) {
+        ipv6_unregister_protocol(&rthdr_proto, IPPROTO_ROUTING);
+        return rc;
+    }
+
+    rc = ipv6_register_protocol(&nodata_proto, IPPROTO_NONE);
+    if (rc) {
+        ipv6_unregister_protocol(&destopt_proto, IPPROTO_DSTOPTS);
+        ipv6_unregister_protocol(&rthdr_proto, IPPROTO_ROUTING);
+        return rc;
+    }
+
+    return EDPVS_OK;
+}
+
+/* Unregister the three handlers in the reverse order of registration. */
+void ipv6_exthdrs_term(void)
+{
+    ipv6_unregister_protocol(&nodata_proto, IPPROTO_NONE);
+    ipv6_unregister_protocol(&destopt_proto, IPPROTO_DSTOPTS);
+    ipv6_unregister_protocol(&rthdr_proto, IPPROTO_ROUTING);
+}
+
+/* Hop-by-hop options are not parsed yet (TODO); always EDPVS_OK. */
+int ipv6_parse_hopopts(struct rte_mbuf *mbuf)
+{
+    return EDPVS_OK;
+}
diff --git a/src/ipv6/ndisc.c b/src/ipv6/ndisc.c
new file mode 100644
index 000000000..16cd0e378
--- /dev/null
+++ b/src/ipv6/ndisc.c
@@ -0,0 +1,544 @@
+/*
+ * DPVS is a software load balancer (Virtual Server) based on DPDK.
+ *
+ * Copyright (C) 2017 iQIYI (www.iqiyi.com).
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/*
+ * Linux Kernel net/ipv6/ndisc.c is referred.
+ * Wang Qing
+ */
+#include
+#include
+#include
+#include
+#include
+
+#include "conf/neigh.h"
+#include "neigh.h"
+#include "common.h"
+#include "ipv6.h"
+#include "ndisc.h"
+#include "icmp6.h"
+
+/* size of an ND option carrying @len data bytes: 2 header bytes plus the
+ * data, rounded up to a multiple of 8 */
+#define NDISC_OPT_SPACE(len) (((len)+2+7)&~7)
+
+/* ICMPv6 header, target address and trailing options of an NS/NA message */
+struct nd_msg {
+    struct icmp6_hdr icmph;
+    struct in6_addr target;
+    uint8_t opt[0];
+};
+
+/*
+ * netinet/icmp6.h define ND_OPT by '#define', ND_OPT_MAX is not defined.
+ * kernel define ND_OPT_ARRAY_MAX by enum, just define 256 here.
+ */
+#define __ND_OPT_ARRAY_MAX 256
+
+/* result of ndisc_parse_options(): one slot per option type */
+struct ndisc_options {
+    struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX];
+    struct nd_opt_hdr *nd_useropts;
+    struct nd_opt_hdr *nd_useropts_end;
+};
+
+#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LINKADDR]
+#define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LINKADDR]
+#define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFORMATION]
+#define nd_opts_pi_end nd_opt_array[0] //__ND_OPT_PREFIX_INFO_END
+#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECTED_HEADER]
+#define nd_opts_mtu nd_opt_array[ND_OPT_MTU]
+
+/* ipv6 neighbour */
+
+/*
+ * Return the link-layer address carried by option @p, or NULL when the
+ * option length does not match the size of an option holding dev->addr.
+ */
+static inline uint8_t *ndisc_opt_addr_data(struct nd_opt_hdr *p,
+                                           struct netif_port *dev)
+{
+    uint8_t *lladdr = (uint8_t *)(p + 1);
+    int lladdrlen = p->nd_opt_len << 3;
+
+    /* support ether_addr only */
+    if (lladdrlen != NDISC_OPT_SPACE(sizeof(dev->addr)))
+        return NULL;
+
+    return lladdr;
+}
+
+/*
+ * Append an address option (type, length, @data, zero padding) to @mbuf.
+ *
+ * The whole padded option, NDISC_OPT_SPACE(@data_len) bytes, is reserved
+ * before anything is written, so the padding can never land outside the
+ * appended area.  @opt is kept for interface compatibility only; the option
+ * is written where rte_pktmbuf_append() puts it.
+ *
+ * Returns the address just past the option, or NULL when the mbuf has no
+ * room for it.
+ */
+static uint8_t *ndisc_fill_addr_option(struct rte_mbuf *mbuf,
+                                       uint8_t *opt, int type,
+                                       void *data, int data_len)
+{
+    int space = NDISC_OPT_SPACE(data_len);
+    int pad = space - (data_len + 2);
+
+    opt = (uint8_t *)rte_pktmbuf_append(mbuf, space);
+    if (!opt)
+        return NULL;
+
+    opt[0] = type;
+    opt[1] = space >> 3;    /* length is expressed in units of 8 bytes */
+    memcpy(opt + 2, data, data_len);
+
+    /* clear space(after option) left */
+    if (pad > 0)
+        memset(opt + 2 + data_len, 0, pad);
+
+    return opt + space;
+}
+
+/*
+ * Index the options found in the @opt_len bytes at @opt into @ndopts.
+ *
+ * Returns @ndopts, or NULL on bad arguments, a truncated option or an
+ * option of length zero.  Duplicated and unsupported options are logged
+ * and skipped.
+ */
+static struct ndisc_options *ndisc_parse_options(uint8_t *opt, int opt_len,
+                                                 struct ndisc_options *ndopts)
+{
+    struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt;
+
+    if (!nd_opt || opt_len < 0 || !ndopts)
+        return NULL;
+
+    memset(ndopts, 0, sizeof(*ndopts));
+
+    while (opt_len) {
+        int l;
+
+        if (opt_len < (int)sizeof(struct nd_opt_hdr))
+            return NULL;
+        l = nd_opt->nd_opt_len << 3;
+
+        /* l == 0 would never advance; l > opt_len runs past the buffer */
+        if (opt_len < l || l == 0)
+            return NULL;
+
+        switch (nd_opt->nd_opt_type) {
+        case ND_OPT_SOURCE_LINKADDR:
+        case ND_OPT_TARGET_LINKADDR:
+        case ND_OPT_MTU:
+        case ND_OPT_REDIRECTED_HEADER:
+            if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
+                RTE_LOG(ERR, NEIGHBOUR, "[%s] duplicated ND6 option found: "
+                        "type=%d\n", __func__, nd_opt->nd_opt_type);
+            } else {
+                ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt;
+            }
+            break;
+
+        case ND_OPT_PREFIX_INFORMATION:
+            /* remember the first and the last prefix option */
+            ndopts->nd_opts_pi_end = nd_opt;
+            if (!ndopts->nd_opt_array[nd_opt->nd_opt_type])
+                ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt;
+            break;
+
+        default:
+            RTE_LOG(ERR, NEIGHBOUR, "[%s] unsupported option ignored: type=%d, "
+                    "len=%d\n", __func__, nd_opt->nd_opt_type, nd_opt->nd_opt_len);
+            break;
+        }
+
+        opt_len -= l;
+        nd_opt = (struct nd_opt_hdr *)((uint8_t *)nd_opt + l);
+    }
+
+    return ndopts;
+}
+
+/*
+ * Build an ICMPv6 neighbour discovery message in a new mbuf taken from
+ * dev->mbuf_pool: @icmp6h, then @target (if not NULL), then a link-layer
+ * address option of type @llinfo holding dev->addr (if @llinfo is non-zero).
+ * The ICMPv6 checksum is computed over a pseudo header made of @saddr and
+ * @daddr.
+ *
+ * Returns the mbuf, or NULL when allocation fails or the mbuf has no room.
+ */
+static struct rte_mbuf *ndisc_build_mbuf(struct netif_port *dev,
+                                         const struct in6_addr *daddr,
+                                         const struct in6_addr *saddr,
+                                         const struct icmp6_hdr *icmp6h,
+                                         const struct in6_addr *target,
+                                         int llinfo)
+{
+    struct rte_mbuf *mbuf;
+    struct icmp6_hdr *icmp6hdr;
+    struct ipv6_hdr iph;
+    int len;
+    uint8_t *opt;
+
+    len = sizeof(*icmp6h) + (target ? sizeof(*target) : 0);
+
+    if (llinfo)
+        len += NDISC_OPT_SPACE(sizeof(dev->addr));
+
+    mbuf = rte_pktmbuf_alloc(dev->mbuf_pool);
+    if (!mbuf) {
+        RTE_LOG(ERR, NEIGHBOUR, "mbuf_pool alloc failed\n");
+        return NULL;
+    }
+
+    icmp6hdr = (struct icmp6_hdr *)rte_pktmbuf_append(mbuf, sizeof(*icmp6h));
+    if (!icmp6hdr)
+        goto no_room;
+    rte_memcpy(icmp6hdr, icmp6h, sizeof(*icmp6h));
+
+    opt = rte_pktmbuf_mtod_offset(mbuf, uint8_t *, sizeof(*icmp6h));
+
+    if (target) {
+        opt = (uint8_t *)rte_pktmbuf_append(mbuf, sizeof(*target));
+        if (!opt)
+            goto no_room;
+        rte_memcpy((struct in6_addr *)opt, target, sizeof(*target));
+        opt += sizeof(*target);
+    }
+
+    if (llinfo &&
+        !ndisc_fill_addr_option(mbuf, opt, llinfo, &dev->addr, sizeof(dev->addr)))
+        goto no_room;
+
+    /* checksum */
+    iph.payload_len = htons(len);
+    iph.proto = IPPROTO_ICMPV6;
+    rte_memcpy(&iph.src_addr, saddr, sizeof(*saddr));
+    rte_memcpy(&iph.dst_addr, daddr, sizeof(*daddr));
+    icmp6hdr->icmp6_cksum = 0;
+    icmp6hdr->icmp6_cksum = rte_ipv6_udptcp_cksum(&iph, icmp6hdr);
+
+    return mbuf;
+
+no_room:
+    RTE_LOG(ERR, NEIGHBOUR, "[%s] no room left in mbuf\n", __func__);
+    rte_pktmbuf_free(mbuf);
+    return NULL;
+}
+
+/*
+ * Send a neighbour advertisement for @solicited_addr to @daddr through @dev.
+ * @solicited and @override select the NA flags; @inc_opt adds the target
+ * link-layer address option.  Nothing is sent when @solicited_addr is not
+ * an address configured on @dev.
+ */
+static void ndisc_send_na(struct netif_port *dev,
+                          const struct in6_addr *daddr,
+                          const struct in6_addr *solicited_addr,
+                          int solicited, int override, int inc_opt)
+{
+    struct inet_ifaddr *ifa;
+    const struct in6_addr *src_addr;
+    struct rte_mbuf *mbuf;
+    struct icmp6_hdr icmp6h;
+    struct flow6 fl6;
+
+    /* solicited_addr is not always src_addr, just not support now */
+    ifa = inet_addr_ifa_get(AF_INET6, dev, (union inet_addr *)solicited_addr);
+    if (ifa) {
+        src_addr = solicited_addr;
+        inet_addr_ifa_put(ifa);
+    } else {
+        RTE_LOG(ERR, NEIGHBOUR, "Find no src addr to send na\n");
+        return;
+    }
+
+    memset(&icmp6h, 0, sizeof(icmp6h));
+    icmp6h.icmp6_type = ND_NEIGHBOR_ADVERT;
+    if (solicited)
+        icmp6h.icmp6_pptr |= ND_NA_FLAG_SOLICITED;
+    if (override)
+        icmp6h.icmp6_pptr |= ND_NA_FLAG_OVERRIDE;
+
+    /*ndisc*/
+    mbuf = ndisc_build_mbuf(dev, daddr, src_addr, &icmp6h, solicited_addr,
+                            inc_opt ? ND_OPT_TARGET_LINKADDR : 0);
+    if (!mbuf)
+        return;
+
+    memset(&fl6, 0, sizeof(fl6));
+    fl6.fl6_oif = dev;
+    fl6.fl6_saddr = *src_addr;
+    fl6.fl6_daddr = *daddr;
+    fl6.fl6_proto = IPPROTO_ICMPV6;
+    fl6.fl6_ttl = 255;
+
+    ipv6_xmit(mbuf, &fl6);
+}
+
+/*
+ * Send a neighbour solicitation for @solicit to @daddr, sourced from @saddr.
+ * The source link-layer address option is added unless @saddr is the
+ * unspecified address.  A NULL @saddr is not supported: it is logged and
+ * nothing is sent.
+ */
+/* saddr can be 0 in ns for dad in addrconf_dad_timer */
+static void ndisc_send_ns(struct netif_port *dev,
+                          const struct in6_addr *solicit,
+                          const struct in6_addr *daddr,
+                          const struct in6_addr *saddr)
+{
+    struct rte_mbuf *mbuf;
+    struct icmp6_hdr icmp6h;
+    struct flow6 fl6;
+
+    if (saddr == NULL) {
+        /* in route module */
+        RTE_LOG(ERR, NEIGHBOUR, "Find no src addr to send ns, "
+                "not support yet\n");
+        return;
+    }
+
+    memset(&icmp6h, 0, sizeof(icmp6h));
+    icmp6h.icmp6_type = ND_NEIGHBOR_SOLICIT;
+
+    mbuf = ndisc_build_mbuf(dev, daddr, saddr, &icmp6h, solicit,
+                            !ipv6_addr_any(saddr) ? ND_OPT_SOURCE_LINKADDR : 0);
+    if (!mbuf)
+        return;
+
+    memset(&fl6, 0, sizeof(fl6));
+    fl6.fl6_oif = dev;
+    fl6.fl6_saddr = *saddr;
+    fl6.fl6_daddr = *daddr;
+    fl6.fl6_proto = IPPROTO_ICMPV6;
+    fl6.fl6_ttl = 255;
+
+    ipv6_xmit(mbuf, &fl6);
+}
+
+/*
+ * Send the DAD probe for @solicit: an NS to its solicited-node multicast
+ * address, sourced from the unspecified address.
+ */
+void ndisc_send_dad(struct netif_port *dev,
+                    const struct in6_addr* solicit)
+{
+    struct in6_addr mcaddr;
+    addrconf_addr_solict_mult(solicit, &mcaddr);
+    ndisc_send_ns(dev, solicit, &mcaddr, &in6addr_any);
+}
+
+/*
+ * Solicit the address of @neigh on its port: an NS sent from @saddr to the
+ * solicited-node multicast address of the neighbour's IPv6 address.
+ */
+void ndisc_solicit(struct neighbour_entry *neigh,
+                   const struct in6_addr *saddr)
+{
+    struct in6_addr mcaddr;
+    struct netif_port *dev = neigh->port;
+    struct in6_addr *target = &neigh->ip_addr.in6;
+
+    addrconf_addr_solict_mult(target, &mcaddr);
+    ndisc_send_ns(dev, target, &mcaddr, saddr);
+}
+
+/*
+ * Handle a received neighbour solicitation.
+ *
+ * Returns EDPVS_DROP for malformed packets, EDPVS_NOMEM when the neighbour
+ * entry cannot be created, and EDPVS_KNICONTINUE otherwise.
+ */
+static int ndisc_recv_ns(struct rte_mbuf *mbuf, struct netif_port *dev)
+{
+    uint8_t *lladdr = NULL;
+    struct ndisc_options ndopts;
+    struct neighbour_entry *neigh;
+    struct inet_ifaddr *ifa;
+    struct nd_msg *msg;
+    int inc = 0;
+    int hashkey = 0;
+    uint32_t ndoptlen = 0;
+
+    struct in6_addr *saddr = &((struct ip6_hdr *)mbuf->userdata)->ip6_src;
+    struct in6_addr *daddr = &((struct ip6_hdr *)mbuf->userdata)->ip6_dst;
+
+    /* an NS sourced from the unspecified address is a DAD probe */
+    int dad = ipv6_addr_any(saddr);
+
+    if (mbuf_may_pull(mbuf, sizeof(struct nd_msg)))
+        return EDPVS_DROP;
+
+    /* read the message and the option length only once the pull succeeded */
+    msg = rte_pktmbuf_mtod(mbuf, struct nd_msg *);
+    ndoptlen = mbuf->data_len - offsetof(struct nd_msg, opt);
+
+    if (ipv6_addr_is_multicast(&msg->target)) {
+        RTE_LOG(ERR, NEIGHBOUR, "[%s] NS: muticast target address\n", __func__);
+        return EDPVS_DROP;
+    }
+
+    if (dad && !ipv6_addr_is_solict_mult(daddr)) {
+        RTE_LOG(ERR, NEIGHBOUR, "[%s] NS: bad DAD packet\n", __func__);
+        return EDPVS_DROP;
+    }
+
+    if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
+        RTE_LOG(ERR, NEIGHBOUR, "[%s] NS: invalid ND packet\n", __func__);
+        return EDPVS_DROP;
+    }
+
+    ifa = inet_addr_ifa_get(AF_INET6, dev, (union inet_addr *)&msg->target);
+    if (!ifa) {
+        RTE_LOG(ERR, NEIGHBOUR, "[%s] RECVNs: dpvs is not the target!\n", __func__);
+        return EDPVS_KNICONTINUE;
+    }
+
+    if (ndopts.nd_opts_src_lladdr) {
+        lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev);
+        if (!lladdr) {
+            RTE_LOG(ERR, NEIGHBOUR, "[%s] NS: invalid link-layer address\n", __func__);
+            inet_addr_ifa_put(ifa);
+            return EDPVS_DROP;
+        }
+        /*
+         * RFC2461 7.1.1:
+         * IP source address should not be unspecified address in NS
+         * if there is source link-layer address option in the message
+         */
+        if (dad) {
+            RTE_LOG(ERR, NEIGHBOUR, "[%s] NS: bad DAD packet (link-layer address option)\n",
+                    __func__);
+            inet_addr_ifa_put(ifa);
+            return EDPVS_DROP;
+        }
+    } else {
+        /* ignore mbuf without opt */
+        inet_addr_ifa_put(ifa);
+        return EDPVS_KNICONTINUE;
+    }
+
+    inc = ipv6_addr_is_multicast(daddr);
+
+    /*
+     * dad response src_addr should be link local, daddr should be multi ff02::1
+     * optimistic addr not support
+     */
+    if (dad) {
+        if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC)) {
+            RTE_LOG(ERR, NEIGHBOUR, "ICMPv6 NS: someone try to solicit our address.\n");
+            inet_ifaddr_dad_failure(ifa);
+            inet_addr_ifa_put(ifa);
+            return EDPVS_KNICONTINUE;
+        }
+        ndisc_send_na(dev, &in6addr_linklocal_allnodes, &msg->target, 0, 1, 1);
+        inet_addr_ifa_put(ifa);
+        return EDPVS_KNICONTINUE;
+    }
+
+    inet_addr_ifa_put(ifa);
+
+    /* update/create neighbour */
+    hashkey = neigh_hashkey(AF_INET6, (union inet_addr *)saddr, dev);
+    neigh = neigh_lookup_entry(AF_INET6, (union inet_addr *)saddr, dev, hashkey);
+    if (neigh && !(neigh->flag & NEIGHBOUR_STATIC)) {
+        neigh_edit(neigh, (struct ether_addr *)lladdr);
+        neigh_entry_state_trans(neigh, 1);
+        neigh_sync_core(neigh, 1, NEIGH_ENTRY);
+    } else {
+        neigh = neigh_add_table(AF_INET6, (union inet_addr *)saddr,
+                                (struct ether_addr *)lladdr, dev, hashkey, 0);
+        if (!neigh){
+            RTE_LOG(ERR, NEIGHBOUR, "[%s] add neighbour wrong\n", __func__);
+            return EDPVS_NOMEM;
+        }
+        neigh_entry_state_trans(neigh, 1);
+        neigh_sync_core(neigh, 1, NEIGH_ENTRY);
+    }
+    neigh_send_mbuf_cach(neigh);
+
+    ndisc_send_na(dev, saddr, &msg->target,
+                  1, inc, inc);
+
+    return EDPVS_KNICONTINUE;
+}
+
+/*
+ * Handle a received neighbour advertisement.
+ *
+ * The neighbour entry is keyed by the advertised target address: the hash
+ * key, the lookup and the insertion all use msg->target, so an update finds
+ * the entry that a later lookup of the target will hit.
+ *
+ * Returns EDPVS_DROP for malformed packets, EDPVS_NOMEM when the neighbour
+ * entry cannot be created, and EDPVS_KNICONTINUE otherwise.
+ */
+static int ndisc_recv_na(struct rte_mbuf *mbuf, struct netif_port *dev)
+{
+    uint8_t *lladdr = NULL;
+    struct ndisc_options ndopts;
+    struct neighbour_entry *neigh;
+    struct inet_ifaddr *ifa;
+    struct nd_msg *msg;
+    uint32_t ndoptlen;
+    int hashkey;
+
+    struct in6_addr *daddr = &((struct ip6_hdr *)mbuf->userdata)->ip6_dst;
+
+    if (mbuf_may_pull(mbuf, sizeof(struct nd_msg))) {
+        RTE_LOG(ERR, NEIGHBOUR, "ICMPv6 NA: packet too short.\n");
+        return EDPVS_DROP;
+    }
+
+    /*
+     * Compute the option length only after the pull check, as ndisc_recv_ns
+     * does: doing the unsigned subtraction on a shorter data_len would wrap.
+     */
+    msg = rte_pktmbuf_mtod(mbuf, struct nd_msg *);
+    ndoptlen = mbuf->data_len - offsetof(struct nd_msg, opt);
+
+    if (ipv6_addr_is_multicast(&msg->target)) {
+        RTE_LOG(ERR, NEIGHBOUR, "ICMPv6 NA: target address is multicast.\n");
+        return EDPVS_DROP;
+    }
+
+    if (ipv6_addr_is_multicast(daddr) && (msg->icmph.icmp6_pptr & ND_NA_FLAG_SOLICITED)) {
+        RTE_LOG(ERR, NEIGHBOUR, "ICMPv6 NA: solicited NA is multicast.\n");
+        return EDPVS_DROP;
+    }
+
+    if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
+        RTE_LOG(ERR, NEIGHBOUR, "ICMPv6 NA: invalid ND option.\n");
+        return EDPVS_DROP;
+    }
+
+    ifa = inet_addr_ifa_get(AF_INET6, dev, (union inet_addr *)&msg->target);
+    if (ifa) {
+        RTE_LOG(ERR, NEIGHBOUR, "ICMPv6 NA: someone advertises our address.\n");
+        if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC)) {
+            inet_ifaddr_dad_failure(ifa);
+        }
+        inet_addr_ifa_put(ifa);
+        return EDPVS_KNICONTINUE;
+    }
+
+    if (ndopts.nd_opts_tgt_lladdr) {
+        lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev);
+        if (!lladdr) {
+            RTE_LOG(ERR, NEIGHBOUR, "ICMPv6 NA: invalid link-layer address length.\n");
+            return EDPVS_DROP;
+        }
+    } else {
+        /* ignore mbuf without opt */
+        return EDPVS_KNICONTINUE;
+    }
+
+    /* notice: override flag ignored */
+    hashkey = neigh_hashkey(AF_INET6, (union inet_addr *)&msg->target, dev);
+    neigh = neigh_lookup_entry(AF_INET6, (union inet_addr *)&msg->target, dev, hashkey);
+    if (neigh && !(neigh->flag & NEIGHBOUR_STATIC)) {
+        neigh_edit(neigh, (struct ether_addr *)lladdr);
+        neigh_entry_state_trans(neigh, 1);
+        neigh_sync_core(neigh, 1, NEIGH_ENTRY);
+    } else {
+        neigh = neigh_add_table(AF_INET6, (union inet_addr *)&msg->target,
+                                (struct ether_addr *)lladdr, dev, hashkey, 0);
+        if (!neigh) {
+            RTE_LOG(ERR, NEIGHBOUR, "[%s] add neighbour wrong\n", __func__);
+            return EDPVS_NOMEM;
+        }
+        neigh_entry_state_trans(neigh, 1);
+        neigh_sync_core(neigh, 1, NEIGH_ENTRY);
+    }
+    neigh_send_mbuf_cach(neigh);
+
+    return EDPVS_KNICONTINUE;
+}
+
+/*
+ * Entry point for neighbour discovery messages.
+ *
+ * Checks the hop limit (must be 255) and the ICMPv6 code (must be 0), then
+ * dispatches NS and NA; router solicit/advert, redirect and unknown types
+ * are passed on with EDPVS_KNICONTINUE.  For every other result the mbuf is
+ * freed here before the result is returned.
+ */
+int ndisc_rcv(struct rte_mbuf *mbuf, struct netif_port *dev)
+{
+    struct nd_msg *msg;
+    int ret;
+    struct ip6_hdr *ipv6_hdr = mbuf->userdata;
+
+    if (mbuf_may_pull(mbuf, sizeof(struct icmp6_hdr)) != 0) {
+        ret = EDPVS_NOMEM;
+        goto free;
+    }
+
+    msg = (struct nd_msg *)rte_pktmbuf_mtod(mbuf, struct nd_msg *);
+
+    if (ipv6_hdr->ip6_hlim != 255) {
+        RTE_LOG(ERR, NEIGHBOUR, "[%s] invalid hop-limit\n", __func__);
+        ret = EDPVS_INVAL;
+        goto free;
+    }
+
+    if (msg->icmph.icmp6_code != 0) {
+        RTE_LOG(ERR, NEIGHBOUR, "[%s] invalid ICMPv6_code:%d\n", __func__,
+                msg->icmph.icmp6_code);
+        ret = EDPVS_INVAL;
+        goto free;
+    }
+
+    switch (msg->icmph.icmp6_type) {
+    case ND_NEIGHBOR_SOLICIT:
+        ret = ndisc_recv_ns(mbuf, dev);
+        break;
+
+    case ND_NEIGHBOR_ADVERT:
+        ret = ndisc_recv_na(mbuf, dev);
+        break;
+
+    /* not support yet */
+    case ND_ROUTER_SOLICIT:
+    case ND_ROUTER_ADVERT:
+    case ND_REDIRECT:
+        ret = EDPVS_KNICONTINUE;
+        break;
+    default:
+        ret = EDPVS_KNICONTINUE;
+        break;
+    }
+
+    /* ipv6 handler should consume mbuf */
+    if (ret != EDPVS_KNICONTINUE)
+        goto free;
+
+    return EDPVS_KNICONTINUE;
+
+free:
+    rte_pktmbuf_free(mbuf);
+    return ret;
+}
+
diff --git a/src/ipv6/route6.c b/src/ipv6/route6.c
new file mode 100644
index 000000000..de036a9a8
--- /dev/null
+++ b/src/ipv6/route6.c
@@ -0,0 +1,528 @@
+/*
+ * DPVS is a software load balancer (Virtual Server) based on DPDK.
+ *
+ * Copyright (C) 2018 iQIYI (www.iqiyi.com).
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#include
+#include "route6.h"
+#include "linux_ipv6.h"
+#include "ctrl.h"
+#include "route6_lpm.h"
+#include "route6_hlist.h"
+#include "parser/parser.h"
+
+#define this_rt6_dustbin (RTE_PER_LCORE(rt6_dbin))
+#define RT6_RECYCLE_TIME_DEF 10
+#define RT6_RECYCLE_TIME_MAX 36000
+#define RT6_RECYCLE_TIME_MIN 1
+
+/* selected route6 implementation, looked up by g_rt6_name in route6_init() */
+static struct route6_method *g_rt6_method = NULL;
+static char g_rt6_name[RT6_METHOD_NAME_SZ] = "hlist";
+/* all registered route6 methods */
+struct list_head g_rt6_list;
+
+/* route6 recycle list */
+struct rt6_dustbin {
+    struct list_head routes;
+    struct dpvs_timer tm;
+};
+
+static int g_rt6_recycle_time = RT6_RECYCLE_TIME_DEF;
+static RTE_DEFINE_PER_LCORE(struct rt6_dustbin, rt6_dbin);
+
+/* Replace the address of @rt6_p by its prefix of rt6_p->plen bits. */
+static inline void rt6_zero_prefix_tail(struct rt6_prefix *rt6_p)
+{
+    struct in6_addr addr6;
+
+    ipv6_addr_prefix(&addr6, &rt6_p->addr, rt6_p->plen);
+    memcpy(&rt6_p->addr, &addr6, sizeof(addr6));
+}
+
+/* Copy @src to @dst and mask the destination prefix of the copy. */
+static void rt6_cfg_zero_prefix_tail(const struct dp_vs_route6_conf *src,
+                                     struct dp_vs_route6_conf *dst)
+{
+    memcpy(dst, src, sizeof(*dst));
+
+    rt6_zero_prefix_tail(&dst->dst);
+    /* do not change dst->src, dst->prefsrc */
+}
+
+/*
+ * Add @rt6_mtd to the list of route6 methods.
+ * Returns EDPVS_INVAL for a NULL or unnamed method and EDPVS_EXIST when a
+ * method of the same name is already registered.
+ */
+int route6_method_register(struct route6_method *rt6_mtd)
+{
+    struct route6_method *rnode;
+
+    if (!rt6_mtd || strlen(rt6_mtd->name) == 0)
+        return EDPVS_INVAL;
+
+    list_for_each_entry(rnode, &g_rt6_list, lnode) {
+        if (strncmp(rt6_mtd->name, rnode->name, sizeof(rnode->name)) == 0)
+            return EDPVS_EXIST;
+    }
+
+    list_add_tail(&rt6_mtd->lnode, &g_rt6_list);
+    return EDPVS_OK;
+}
+
+/* Remove @rt6_mtd from the list of route6 methods. */
+int route6_method_unregister(struct route6_method *rt6_mtd)
+{
+    if (!rt6_mtd)
+        return EDPVS_INVAL;
+    list_del(&rt6_mtd->lnode);
+    return EDPVS_OK;
+}
+
+/* Find a registered method by name; NULL when there is none. */
+static struct route6_method *rt6_method_get(const char *name)
+{
+    struct route6_method *rnode;
+
+    list_for_each_entry(rnode, &g_rt6_list, lnode)
+        if (strcmp(rnode->name, name) == 0)
+            return rnode;
+
+    return NULL;
+}
+
+/*
+ * Periodic timer callback scheduled by rt6_setup_lcore(): frees the routes
+ * of this lcore's dustbin whose reference count has dropped to 1 or below.
+ */
+static int rt6_recycle(void *arg)
+{
+    struct route6 *rt6, *next;
+#ifdef DPVS_ROUTE6_DEBUG
+    char buf[64];
+#endif
+    list_for_each_entry_safe(rt6, next, &this_rt6_dustbin.routes, hnode) {
+        if (rte_atomic32_read(&rt6->refcnt) <= 1) {
+            list_del(&rt6->hnode);
+#ifdef DPVS_ROUTE6_DEBUG
+            dump_rt6_prefix(&rt6->rt6_dst, buf, sizeof(buf));
+            RTE_LOG(DEBUG, RT6, "[%d] %s: delete dustbin route %s->%s\n", rte_lcore_id(),
+                    __func__, buf, rt6->rt6_dev ? rt6->rt6_dev->name : "");
+#endif
+            rte_free(rt6);
+        }
+    }
+
+    return EDPVS_OK;
+}
+
+/*
+ * Release @rt6: freed at once unless its reference count is above 1, in
+ * which case it is parked in this lcore's dustbin for rt6_recycle().
+ */
+void route6_free(struct route6 *rt6)
+{
+    if (unlikely(rte_atomic32_read(&rt6->refcnt) > 1))
+        list_add_tail(&rt6->hnode, &this_rt6_dustbin.routes);
+    else
+        rte_free(rt6);
+}
+
+/*
+ * Per-lcore setup: initialise the dustbin, schedule rt6_recycle() every
+ * g_rt6_recycle_time seconds, then run the method's own lcore setup.
+ */
+static int rt6_setup_lcore(void *arg)
+{
+    int err;
+    bool global;
+    struct timeval tv;
+
+    tv.tv_sec = g_rt6_recycle_time;
+    tv.tv_usec = 0;
+    global = (rte_lcore_id() == rte_get_master_lcore());
+
+    INIT_LIST_HEAD(&this_rt6_dustbin.routes);
+    err = dpvs_timer_sched_period(&this_rt6_dustbin.tm, &tv, rt6_recycle, NULL, global);
+    if (err != EDPVS_OK)
+        return err;
+
+    return g_rt6_method->rt6_setup_lcore(arg);
+}
+
+/*
+ * Per-lcore teardown: free the dustbin routes whose reference count is 1 or
+ * below, then run the method's own lcore teardown.
+ */
+static int rt6_destroy_lcore(void *arg)
+{
+    struct route6 *rt6, *next;
+
+    list_for_each_entry_safe(rt6, next, &this_rt6_dustbin.routes, hnode) {
+        if (rte_atomic32_read(&rt6->refcnt) <= 1) { /* need judge refcnt here? */
+            list_del(&rt6->hnode);
+            rte_free(rt6);
+        }
+    }
+
+    return g_rt6_method->rt6_destroy_lcore(arg);
+}
+
+/* Input route lookup, delegated to the selected method. */
+struct route6 *route6_input(const struct rte_mbuf *mbuf, struct flow6 *fl6)
+{
+    return g_rt6_method->rt6_input(mbuf, fl6);
+}
+
+/* Output route lookup, delegated to the selected method. */
+struct route6 *route6_output(const struct rte_mbuf *mbuf, struct flow6 *fl6)
+{
+    return g_rt6_method->rt6_output(mbuf, fl6);
+}
+
+/* Take a reference on @rt; EDPVS_INVAL when @rt is NULL. */
+int route6_get(struct route6 *rt)
+{
+    if (!rt)
+        return EDPVS_INVAL;
+    rte_atomic32_inc(&rt->refcnt);
+    return EDPVS_OK;
+}
+
+/* Drop a reference on @rt; EDPVS_INVAL when @rt is NULL. */
+int route6_put(struct route6 *rt)
+{
+    if (!rt)
+        return EDPVS_INVAL;
+    rte_atomic32_dec(&rt->refcnt);
+    return EDPVS_OK;
+}
+
+/* thin wrappers around the selected method */
+static struct route6 *rt6_get(const struct dp_vs_route6_conf *rt6_cfg)
+{
+    return g_rt6_method->rt6_get(rt6_cfg);
+}
+
+static int rt6_add_lcore(const struct dp_vs_route6_conf *rt6_cfg)
+{
+    return g_rt6_method->rt6_add_lcore(rt6_cfg);
+}
+
+static int rt6_del_lcore(const struct dp_vs_route6_conf *rt6_cfg)
+{
+    return g_rt6_method->rt6_del_lcore(rt6_cfg);
+}
+
+/*
+ * Apply the add/del operation of @cf on the calling (master) lcore, then
+ * multicast it as a MSG_TYPE_ROUTE6 message.
+ *
+ * Returns EDPVS_EXIST / EDPVS_NOTEXIST when the route is already / not
+ * present, EDPVS_INVAL for another operation, or the first error met.
+ */
+/* called on master */
+static int rt6_add_del(const struct dp_vs_route6_conf *cf)
+{
+    int err;
+    struct dpvs_msg *msg;
+    lcoreid_t cid;
+
+    cid = rte_lcore_id();
+    assert(cid == rte_get_master_lcore());
+
+    /* for master */
+    switch (cf->ops) {
+    case RT6_OPS_ADD:
+        if (rt6_get(cf) != NULL)
+            return EDPVS_EXIST;
+        err = rt6_add_lcore(cf);
+        break;
+    case RT6_OPS_DEL:
+        if (rt6_get(cf) == NULL)
+            return EDPVS_NOTEXIST;
+        err = rt6_del_lcore(cf);
+        break;
+    default:
+        return EDPVS_INVAL;
+    }
+    if (err != EDPVS_OK) {
+        RTE_LOG(ERR, RT6, "%s: fail to add/del route on master -- %s!\n",
+                __func__, dpvs_strerror(err));
+        return err;
+    }
+
+    /* for slaves */
+    msg = msg_make(MSG_TYPE_ROUTE6, 0, DPVS_MSG_MULTICAST, cid,
+                   sizeof(struct dp_vs_route6_conf), cf);
+    if (unlikely(msg == NULL)) {
+        err = EDPVS_NOMEM;
+        goto slave_fail;
+    }
+
+    err = multicast_msg_send(msg, 0, NULL);
+    if (err != EDPVS_OK) {
+        msg_destroy(&msg);
+        goto slave_fail;
+    }
+    msg_destroy(&msg);
+
+    return EDPVS_OK;
+
+slave_fail:
+    RTE_LOG(ERR, RT6, "%s: fail to add/del route on slaves -- %s\n",
+            __func__, dpvs_strerror(err));
+    return err;
+}
+
+/*
+ * Build a route configuration from the arguments and add (@add true) or
+ * delete it through rt6_add_del().
+ *
+ * @dest and @dev are required (EDPVS_INVAL otherwise).  A NULL @gw or @src
+ * leaves the corresponding address all-zero in the configuration.
+ */
+static int __route6_add_del(const struct in6_addr *dest, int plen, uint32_t flags,
+                            const struct in6_addr *gw, struct netif_port *dev,
+                            const struct in6_addr *src, uint32_t mtu, bool add)
+{
+    struct dp_vs_route6_conf cf;
+
+    if (!dest || !dev)
+        return EDPVS_INVAL;
+
+    memset(&cf, 0, sizeof(cf));
+    if (add)
+        cf.ops = RT6_OPS_ADD;
+    else
+        cf.ops = RT6_OPS_DEL;
+    cf.dst.addr = *dest;
+    cf.dst.plen = plen;
+    cf.flags = flags;
+    if (gw)
+        cf.gateway = *gw;
+    snprintf(cf.ifname, sizeof(cf.ifname), "%s", dev->name);
+    if (src)
+        cf.src.addr = *src;
+    cf.src.plen = plen;
+    cf.mtu = mtu;
+
+    rt6_zero_prefix_tail(&cf.dst);
+
+    return rt6_add_del(&cf);
+}
+
+/* Add a route; see __route6_add_del(). */
+int route6_add(const struct in6_addr *dest, int plen, uint32_t flags,
+               const struct in6_addr *gw, struct netif_port *dev,
+               const struct in6_addr *src, uint32_t mtu)
+{
+    return __route6_add_del(dest, plen, flags, gw, dev, src, mtu, true);
+}
+
+/* Delete a route; see __route6_add_del(). */
+int route6_del(const struct in6_addr *dest, int plen, uint32_t flags,
+               const struct in6_addr *gw, struct netif_port *dev,
+               const struct in6_addr *src, uint32_t mtu)
+{
+    return __route6_add_del(dest, plen, flags, gw, dev, src, mtu, false);
+}
+
+/*
+ * Callback for MSG_TYPE_ROUTE6 messages: applies the add/del carried by the
+ * message on the receiving lcore.  Messages whose length is not exactly
+ * sizeof(struct dp_vs_route6_conf) are rejected with EDPVS_INVAL.
+ */
+static int rt6_msg_process_cb(struct dpvs_msg *msg)
+{
+    struct dp_vs_route6_conf *cf;
+
+    assert(msg && msg->data);
+    if (msg->len != sizeof(struct dp_vs_route6_conf)) {
+        RTE_LOG(WARNING, RT6, "%s: invalid route6 msg!\n", __func__);
+        return EDPVS_INVAL;
+    }
+
+    cf = (struct dp_vs_route6_conf *)msg->data;
+    switch (cf->ops) {
+    case RT6_OPS_GET:
+        /* to be supported */
+        return EDPVS_NOTSUPP;
+    case RT6_OPS_ADD:
+        return rt6_add_lcore(cf);
+    case RT6_OPS_DEL:
+        return rt6_del_lcore(cf);
+    default:
+        RTE_LOG(WARNING, RT6, "%s: unsupported operation for route6 msg -- %d!\n",
+                __func__, cf->ops);
+        return EDPVS_NOTSUPP;
+    }
+}
+
+/*
+ * Sanity-check a configuration: known operation, prefix lengths within
+ * 0..128 and an interface name that resolves to a port.
+ */
+static bool rt6_conf_check(const struct dp_vs_route6_conf *rt6_cfg)
+{
+    if (!rt6_cfg)
+        return false;
+
+    if (rt6_cfg->ops < RT6_OPS_GET || rt6_cfg->ops > RT6_OPS_FLUSH)
+        return false;
+
+    if (rt6_cfg->dst.plen > 128 || rt6_cfg->dst.plen < 0)
+        return false;
+
+    if (rt6_cfg->src.plen > 128 || rt6_cfg->src.plen < 0)
+        return false;
+
+    if (rt6_cfg->prefsrc.plen > 128 || rt6_cfg->prefsrc.plen < 0)
+        return false;
+
+    if (netif_port_get_by_name(rt6_cfg->ifname) == NULL)
+        return false;
+
+    return true;
+}
+
+/*
+ * Sockopt "set" handler.  The input must hold at least one
+ * struct dp_vs_route6_conf; it is validated before any field is used.
+ * Only SOCKOPT_SET_ROUTE6_ADD_DEL is implemented.
+ */
+static int rt6_sockopt_set(sockoptid_t opt, const void *in, size_t inlen)
+{
+    const struct dp_vs_route6_conf *rt6_cfg_in = in;
+    struct dp_vs_route6_conf rt6_cfg;
+
+    /* the length check comes first: rt6_conf_check() reads the fields */
+    if (!in || inlen < sizeof(*rt6_cfg_in) || !rt6_conf_check(rt6_cfg_in)) {
+        RTE_LOG(INFO, RT6, "%s: invalid route6 sockopt!\n", __func__);
+        return EDPVS_INVAL;
+    }
+
+    rt6_cfg_zero_prefix_tail(rt6_cfg_in, &rt6_cfg);
+
+    switch (opt) {
+    case SOCKOPT_SET_ROUTE6_ADD_DEL:
+        return rt6_add_del(&rt6_cfg);
+    case SOCKOPT_SET_ROUTE6_FLUSH:
+        return EDPVS_NOTSUPP;
+    default:
+        return EDPVS_NOTSUPP;
+    }
+}
+
+/*
+ * Sockopt "get" handler: returns the dump produced by the selected method.
+ * *outlen is forced to 0 when the method returns no buffer.
+ */
+static int rt6_sockopt_get(sockoptid_t opt, const void *in, size_t inlen,
+                           void **out, size_t *outlen)
+{
+    if (!out || !outlen)
+        return EDPVS_INVAL;
+
+    *out = g_rt6_method->rt6_dump(in, outlen);
+    if (*out == NULL)
+        *outlen = 0;
+    return EDPVS_OK;
+}
+
+static struct dpvs_sockopts route6_sockopts = {
+    .version = SOCKOPT_VERSION,
+    .set_opt_min = SOCKOPT_SET_ROUTE6_ADD_DEL,
+    .set_opt_max = SOCKOPT_SET_ROUTE6_FLUSH,
+    .set = rt6_sockopt_set,
+    .get_opt_min = SOCKOPT_GET_ROUTE6_SHOW,
+    .get_opt_max = SOCKOPT_GET_ROUTE6_SHOW,
+    .get = rt6_sockopt_get,
+};
+
+static void rt6_method_init(void)
+{
+    /* register all route6 method here! */
+    route6_lpm_init();
+    route6_hlist_init();
+}
+
+static void rt6_method_term(void)
+{
+    /* clean up all route6 method here! */
+    route6_lpm_term();
+    route6_hlist_term();
+}
+
+/*
+ * Initialise the route6 module: select the method named by g_rt6_name, run
+ * the per-lcore setup on every lcore, then register the route6 message type
+ * and sockopts.  If the sockopts cannot be registered, the message type is
+ * unregistered again before the error is returned.
+ */
+int route6_init(void)
+{
+    int err;
+    lcoreid_t cid;
+    struct dpvs_msg_type msg_type;
+
+    INIT_LIST_HEAD(&g_rt6_list);
+
+    rt6_method_init();
+    g_rt6_method = rt6_method_get(g_rt6_name);
+    if (!g_rt6_method) {
+        RTE_LOG(ERR, RT6, "%s: rt6 method '%s' not found!\n",
+                __func__, g_rt6_name);
+        return EDPVS_NOTEXIST;
+    }
+
+    rte_eal_mp_remote_launch(rt6_setup_lcore, NULL, CALL_MASTER);
+    RTE_LCORE_FOREACH_SLAVE(cid) {
+        if ((err = rte_eal_wait_lcore(cid)) < 0) {
+            RTE_LOG(ERR, RT6, "%s: fail to setup rt6 on lcore%d -- %s\n",
+                    __func__, cid, dpvs_strerror(err));
+            return EDPVS_DPDKAPIFAIL;
+        }
+    }
+
+    memset(&msg_type, 0, sizeof(struct dpvs_msg_type));
+    msg_type.type = MSG_TYPE_ROUTE6;
+    msg_type.mode = DPVS_MSG_MULTICAST;
+    msg_type.cid = rte_lcore_id();
+    msg_type.unicast_msg_cb = rt6_msg_process_cb;
+    err = msg_type_mc_register(&msg_type);
+    if (err != EDPVS_OK) {
+        RTE_LOG(ERR, RT6, "%s: fail to register route6 msg!\n", __func__);
+        return err;
+    }
+
+    if ((err = sockopt_register(&route6_sockopts)) != EDPVS_OK) {
+        RTE_LOG(ERR, RT6, "%s: fail to register route6 sockopt!\n", __func__);
+        /* do not leave the message type registered without its sockopts */
+        msg_type_mc_unregister(&msg_type);
+        return err;
+    }
+
+    return EDPVS_OK;
+}
+
+/*
+ * Tear the route6 module down.  Failures are only logged; the function
+ * always returns EDPVS_OK.
+ */
+int route6_term(void)
+{
+    int err;
+    lcoreid_t cid;
+    struct dpvs_msg_type msg_type;
+
+    rt6_method_term();
+
+    if ((err = sockopt_unregister(&route6_sockopts)) != EDPVS_OK)
+        RTE_LOG(WARNING, RT6, "%s: fail to unregister route6 sockopt!\n", __func__);
+
+    memset(&msg_type, 0, sizeof(struct dpvs_msg_type));
+    msg_type.type = MSG_TYPE_ROUTE6;
+    msg_type.mode = DPVS_MSG_MULTICAST;
+    msg_type.cid = rte_lcore_id();
+    msg_type.unicast_msg_cb = rt6_msg_process_cb;
+    err = msg_type_mc_unregister(&msg_type);
+    if (err != EDPVS_OK)
+        RTE_LOG(WARNING, RT6, "%s:fail to unregister route6 msg!\n", __func__);
+
+    rte_eal_mp_remote_launch(rt6_destroy_lcore, NULL, CALL_MASTER);
+    RTE_LCORE_FOREACH_SLAVE(cid) {
+        if ((err = rte_eal_wait_lcore(cid)) < 0) {
+            RTE_LOG(WARNING, RT6, "%s: fail to destroy rt6 on lcore%d -- %s\n",
+                    __func__, cid, dpvs_strerror(err));
+        }
+    }
+
+    return EDPVS_OK;
+}
+
+/* config file */
+
+/* "method" keyword: accepts "hlist" or "lpm", anything else selects "hlist". */
+static void rt6_method_handler(vector_t tokens)
+{
+    char *str = set_value(tokens);
+    assert(str);
+    if (!strcmp(str, "hlist") || !strcmp(str, "lpm")) {
+        RTE_LOG(INFO, RT6, "route6:method = %s\n", str);
+        snprintf(g_rt6_name, sizeof(g_rt6_name), "%s", str);
+    } else {
+        RTE_LOG(WARNING, RT6, "invalid route6:method %s, using default %s\n",
+                str, "hlist");
+        snprintf(g_rt6_name, sizeof(g_rt6_name), "%s", "hlist");
+    }
+
+    FREE_PTR(str);
+}
+
+/*
+ * "recycle_time" keyword: values outside
+ * [RT6_RECYCLE_TIME_MIN, RT6_RECYCLE_TIME_MAX] select the default.
+ */
+static void rt6_recycle_time_handler(vector_t tokens)
+{
+    char *str = set_value(tokens);
+    int recycle_time;
+
+    assert(str);
+    recycle_time = atoi(str);
+    if (recycle_time > RT6_RECYCLE_TIME_MAX || recycle_time < RT6_RECYCLE_TIME_MIN) {
+        RTE_LOG(WARNING, RT6, "invalid ipv6:route:recycle_time %s, using default %d\n",
+                str, RT6_RECYCLE_TIME_DEF);
+        g_rt6_recycle_time = RT6_RECYCLE_TIME_DEF;
+    } else {
+        RTE_LOG(INFO, RT6, "ipv6:route:recycle_time = %d\n", recycle_time);
+        g_rt6_recycle_time = recycle_time;
+    }
+
+    FREE_PTR(str);
+}
+
+/* Reset the keyword values to their defaults. */
+void route6_keyword_value_init(void)
+{
+    if (dpvs_state_get() == DPVS_STATE_INIT) {
+        /* KW_TYPE_INIT keyword */
+        snprintf(g_rt6_name, sizeof(g_rt6_name), "%s", "hlist");
+    }
+    /* KW_TYPE_NORMAL keyword */
+    g_rt6_recycle_time = RT6_RECYCLE_TIME_DEF;
+
+    route6_lpm_keyword_value_init();
+}
+
+/* Install the "route6" keyword block with its sub-keywords. */
+void install_route6_keywords(void)
+{
+    install_keyword("route6", NULL, KW_TYPE_NORMAL);
+    install_sublevel();
+    install_keyword("method", rt6_method_handler, KW_TYPE_INIT);
+    install_keyword("recycle_time", rt6_recycle_time_handler, KW_TYPE_NORMAL);
+    install_rt6_lpm_keywords();
+    install_sublevel_end();
+}
diff --git a/src/ipv6/route6_hlist.c b/src/ipv6/route6_hlist.c
new file mode 100644
index 000000000..3c6b3c796
--- /dev/null
+++ b/src/ipv6/route6_hlist.c
@@ -0,0 +1,388 @@
+/*
+ * DPVS is a software load balancer (Virtual Server) based on DPDK.
+ * + * Copyright (C) 2018 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include "route6.h" +#include "route6_hlist.h" +#include "linux_ipv6.h" + +#define RT6_HLIST_MAX_BUCKET_BITS 10 +#define RT6_HLIST_MAX_BUCKETS (1U<nbuckets; i++) { + list_for_each_entry_safe(rt6, rnext, &hlist->hlist[i], hnode) { + list_del(&rt6->hnode); + route6_free(rt6); + hlist->nroutes--; + this_rt6_nroutes--; + } + } + assert(hlist->nroutes == 0); + rte_free(hlist); + } + + assert(this_rt6_nroutes == 0); + return EDPVS_OK; +} + +static uint32_t rt6_hlist_count(void) +{ + return g_nroutes; +} + +static int rt6_hlist_buckets(int plen) +{ + /* caller should ensure 0 <= plen <= 128 */ + if (plen < RT6_HLIST_MAX_BUCKET_BITS) + return (1U << plen); + else + return RT6_HLIST_MAX_BUCKETS; +} + +static inline int rt6_hlist_hashkey(const struct in6_addr *addr, int plen, int nbuckets) +{ + struct in6_addr pfx; + + ipv6_addr_prefix(&pfx, addr, plen); + return rte_jhash_32b((const uint32_t *)&pfx, 4, 0) % nbuckets; +} + +static inline bool rt6_match(const struct route6 *rt6, const struct dp_vs_route6_conf *cf) +{ + /* Note: Do not use `ipv6_masked_addr_cmp` here for performance consideration + * here. We ensure the route6 entry is masked when added to route table. 
*/ + if (ipv6_addr_cmp(&rt6->rt6_dst.addr, &cf->dst.addr) != 0) + return false; + if (rt6->rt6_dst.plen != cf->dst.plen) + return false; + if (rt6->rt6_dev && strlen(cf->ifname) != 0) { + struct netif_port *dev; + dev = netif_port_get_by_name(cf->ifname); + if (!dev || dev->id != rt6->rt6_dev->id) + return false; + } + + /* other fields to be checked? */ + + return true; +} + +static struct route6 *__rt6_hlist_get(const struct dp_vs_route6_conf *cf, + struct rt6_hlist **phlist) +{ + int hashkey; + struct rt6_hlist *hlist; + struct route6 *rt6; + + list_for_each_entry(hlist, &this_rt6_htable, node) { + if (hlist->plen > cf->dst.plen) + continue; + if (hlist->plen < cf->dst.plen) + break; + hashkey = rt6_hlist_hashkey(&cf->dst.addr, hlist->plen, hlist->nbuckets); + list_for_each_entry(rt6, &hlist->hlist[hashkey], hnode) { + if (rt6_match(rt6, cf)) { + if (phlist) + *phlist = hlist; + return rt6; + } + } + } + + return NULL; +} + +static inline struct route6 *rt6_hlist_get(const struct dp_vs_route6_conf *cf) +{ + return __rt6_hlist_get(cf, NULL); +} + +static int rt6_hlist_add_lcore(const struct dp_vs_route6_conf *cf) +{ + struct rt6_hlist *hlist = NULL; + struct route6 *rt6; + int hashkey; + bool hlist_exist = false; +#ifdef DPVS_ROUTE6_DEBUG + char buf[64]; +#endif + + if (rt6_hlist_get(cf)) + return EDPVS_EXIST; + + list_for_each_entry(hlist, &this_rt6_htable, node) { + if (hlist->plen <= cf->dst.plen) { + if (hlist->plen == cf->dst.plen) + hlist_exist = true; + break; + } + } + + if (!hlist_exist) { /* hlist for this prefix not exist, create it! 
*/ + int i, nbuckets, size; + struct rt6_hlist *new_hlist; + + nbuckets = rt6_hlist_buckets(cf->dst.plen); + size = sizeof(struct rt6_hlist) + nbuckets * sizeof(struct list_head); + new_hlist = rte_zmalloc_socket("rt6_hlist", size, 0, rte_socket_id()); + if (unlikely(!new_hlist)) { + RTE_LOG(ERR, RT6, "[%d] %s: fail to alloc rt6_hlist\n", + rte_lcore_id(), __func__); + return EDPVS_NOMEM; + } + + new_hlist->plen = cf->dst.plen; + new_hlist->nbuckets = nbuckets; + new_hlist->nroutes = 0; + for (i = 0; i < nbuckets; i++) + INIT_LIST_HEAD(&new_hlist->hlist[i]); + + /* add new_hlist to plen-sorted htable */ + __list_add(&new_hlist->node, hlist->node.prev, &hlist->node); + +#ifdef DPVS_ROUTE6_DEBUG + RTE_LOG(DEBUG, RT6, "[%d] %s: new rt6_hlist: plen=%d, nbuckets=%d\n", + rte_lcore_id(), __func__, new_hlist->plen, new_hlist->nbuckets); +#endif + + hlist = new_hlist; /* replace current hlist with new_hlist */ + } + + /* create route6 entry and hash it into current hlist */ + rt6 = rte_zmalloc_socket("rt6_entry", sizeof(struct route6), 0, rte_socket_id()); + if (unlikely(!rt6)) { + RTE_LOG(ERR, RT6, "[%d] %s: fail to alloc rt6_entry!\n", + rte_lcore_id(), __func__); + if (hlist->nroutes == 0) { + list_del(&hlist->node); + rte_free(hlist); + } + return EDPVS_NOMEM; + } + + rt6_fill_with_cfg(rt6, cf); + rte_atomic32_set(&rt6->refcnt, 1); + + hashkey = rt6_hlist_hashkey(&cf->dst.addr, cf->dst.plen, hlist->nbuckets); + list_add_tail(&rt6->hnode, &hlist->hlist[hashkey]); + hlist->nroutes++; + this_rt6_nroutes++; + +#ifdef DPVS_ROUTE6_DEBUG + dump_rt6_prefix(&rt6->rt6_dst, buf, sizeof(buf)); + RTE_LOG(DEBUG, RT6, "[%d] %s: new route6 node: %s->%s plen=%d, hashkey=%d/%d\n", + rte_lcore_id(), __func__, buf, cf->ifname, hlist->plen, + hashkey, hlist->nbuckets); +#endif + + return EDPVS_OK; +} + +static int rt6_hlist_del_lcore(const struct dp_vs_route6_conf *cf) +{ + struct route6 *rt6; + struct rt6_hlist *hlist = NULL; +#ifdef DPVS_ROUTE6_DEBUG + char buf[64]; +#endif + + rt6 = 
__rt6_hlist_get(cf, &hlist); + if (!rt6) + return EDPVS_NOTEXIST; + +#ifdef DPVS_ROUTE6_DEBUG + dump_rt6_prefix(&rt6->rt6_dst, buf, sizeof(buf)); + RTE_LOG(DEBUG, RT6, "[%d] %s: del route6 node: %s->%s\n", + rte_lcore_id(), __func__, buf, cf->ifname); +#endif + list_del(&rt6->hnode); + route6_free(rt6); + + assert(hlist != NULL); + hlist->nroutes--; + this_rt6_nroutes--; + + if (hlist->nroutes == 0) { +#ifdef DPVS_ROUTE6_DEBUG + RTE_LOG(DEBUG, RT6, "[%d] %s: del rt6_hlist: plen=%d, nbuckets=%d\n", + rte_lcore_id(), __func__, hlist->plen, hlist->nbuckets); +#endif + list_del(&hlist->node); + rte_free(hlist); + } + + return EDPVS_OK; +} + +static inline bool +rt6_hlist_flow_match(const struct route6 *rt6, const struct flow6 *fl6) +{ + if (rt6->rt6_dst.plen < 128) { + if (!ipv6_prefix_equal(&fl6->fl6_daddr, &rt6->rt6_dst.addr, rt6->rt6_dst.plen)) + return false; + } else { + if (!ipv6_addr_equal(&fl6->fl6_daddr, &rt6->rt6_dst.addr)) + return false; + } + + if (fl6->fl6_oif && rt6->rt6_dev && (fl6->fl6_oif->id != rt6->rt6_dev->id)) + return false; + + if ((!ipv6_addr_any(&rt6->rt6_src.addr)) && (ipv6_addr_equal(&rt6->rt6_src.addr, + &fl6->fl6_saddr)) != true) + return false; + + /* anything else to check ? 
*/ + + return true; +} + +static struct route6 *rt6_hlist_lookup(const struct rte_mbuf *mbuf, struct flow6 *fl6) +{ + struct rt6_hlist *hlist; + struct route6 *rt6; + int hashkey; + + list_for_each_entry(hlist, &this_rt6_htable, node) { + hashkey = rt6_hlist_hashkey(&fl6->fl6_daddr, hlist->plen, hlist->nbuckets); + list_for_each_entry(rt6, &hlist->hlist[hashkey], hnode) { + if (rt6_hlist_flow_match(rt6, fl6)) { + rte_atomic32_inc(&rt6->refcnt); + return rt6; + } + } + } + + return NULL; +} + +static struct route6 *rt6_hlist_input(const struct rte_mbuf *mbuf, struct flow6 *fl6) +{ + return rt6_hlist_lookup(mbuf, fl6); +} + +static struct route6 *rt6_hlist_output(const struct rte_mbuf *mbuf, struct flow6 *fl6) +{ + return rt6_hlist_lookup(mbuf, fl6); +} + +static struct dp_vs_route6_conf_array* + rt6_hlist_dump(const struct dp_vs_route6_conf *cf, size_t *nbytes) +{ + int i, off; + struct rt6_hlist *hlist; + struct route6 *entry; + struct dp_vs_route6_conf_array *rt6_arr; + struct netif_port *dev = NULL; + + if (cf && strlen(cf->ifname) > 0) { + dev = netif_port_get_by_name(cf->ifname); + if (!dev) { + RTE_LOG(WARNING, RT6, "%s: route6 device %s not found!\n", + __func__, cf->ifname); + return NULL; + } + } + + *nbytes = sizeof(struct dp_vs_route6_conf_array) + + g_nroutes * sizeof(struct dp_vs_route6_conf); + rt6_arr = rte_zmalloc_socket("rt6_sockopt_get", *nbytes, 0, rte_socket_id()); + if (unlikely(!rt6_arr)) + return NULL; + + off = 0; + list_for_each_entry(hlist, &this_rt6_htable, node) { + for (i = 0; i < hlist->nbuckets; i++) { + list_for_each_entry(entry, &hlist->hlist[i], hnode) { + if (off >= g_nroutes) + goto out; + if (dev && (!entry->rt6_dev || dev->id != entry->rt6_dev->id)) + continue; /* a route with no device cannot match a device filter */ + rt6_fill_cfg(&rt6_arr->routes[off++], entry); + } + } + } + +out: + if (off < g_nroutes) + *nbytes = sizeof(struct dp_vs_route6_conf_array) + + off * sizeof(struct dp_vs_route6_conf); + rt6_arr->nroute = off; + + return rt6_arr; +} + +static struct route6_method rt6_hlist_method = { + .name 
= "hlist", + .rt6_setup_lcore = rt6_hlist_setup_lcore, + .rt6_destroy_lcore = rt6_hlist_destroy_lcore, + .rt6_count = rt6_hlist_count, + .rt6_add_lcore = rt6_hlist_add_lcore, + .rt6_del_lcore = rt6_hlist_del_lcore, + .rt6_get = rt6_hlist_get, + .rt6_input = rt6_hlist_input, + .rt6_output = rt6_hlist_output, + .rt6_dump = rt6_hlist_dump, +}; + +int route6_hlist_init(void) +{ + return route6_method_register(&rt6_hlist_method); +} + +int route6_hlist_term(void) +{ + return route6_method_unregister(&rt6_hlist_method); +} diff --git a/src/ipv6/route6_lpm.c b/src/ipv6/route6_lpm.c new file mode 100644 index 000000000..1b5ee8833 --- /dev/null +++ b/src/ipv6/route6_lpm.c @@ -0,0 +1,589 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2018 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +/* Notice: + * The DPDK LPM6 API 'rte_lpm6_delete' is very slow because + * it memset a big memory(several millions of bytes) within + * its implementation. If dpvs is used under the environment + * that routes deletion is frequent, LPM6 is not recommended! 
+ */ + +#include +#include +#include "route6.h" +#include "linux_ipv6.h" +#include "route6_lpm.h" +#include "parser/parser.h" + +#define LPM6_CONF_MAX_RULES_DEF 1024 +#define LPM6_CONF_NUM_TBL8S_DEF (1<<16) + +#define RT6_ARRAY_SIZE_DEF (1<<16) +#define RT6_HASH_BUCKET_DEF (1<<8) + +#define this_lpm6_struct (RTE_PER_LCORE(dpvs_lpm6_struct)) +#define this_rt6_array (RTE_PER_LCORE(dpvs_rt6_array)) +#define this_rt6_hash (RTE_PER_LCORE(dpvs_rt6_hash)) +#define this_rt6_default (RTE_PER_LCORE(dpvs_rt6_default)) + +/* DPDK LPM6 can store 4 bytes route information(i.e. an uint32_t integer) at most. + * But DPVS route has more information needed to store: dest/source IP, gateway, + * mtu, outgoing device...To solve the problem, an indexed route array is used. + * */ +struct rt6_array { + uint32_t num; /* total entry number */ + uint32_t cursor; /* positon of lastest insert, for fast search */ + void *entries[0]; /* route entry array, each elem is a pointer to route6 */ +}; +#define g_nroutes (this_rt6_array->num) + +static uint8_t g_lcore_number = 0; +static uint64_t g_lcore_mask = 0; + +static uint32_t g_lpm6_conf_max_rules = LPM6_CONF_MAX_RULES_DEF; +static uint32_t g_lpm6_conf_num_tbl8s = LPM6_CONF_NUM_TBL8S_DEF; +static uint32_t g_rt6_array_size = RT6_ARRAY_SIZE_DEF; +static uint32_t g_rt6_hash_bucket = RT6_HASH_BUCKET_DEF; + +static RTE_DEFINE_PER_LCORE(struct rte_lpm6*, dpvs_lpm6_struct); +static RTE_DEFINE_PER_LCORE(struct rt6_array*, dpvs_rt6_array); +static RTE_DEFINE_PER_LCORE(struct route6*, dpvs_rt6_default); /*lpm6 not support ::/0 */ + +/* Why need hash lists while using LPM6? + * LPM6 can help find the best match route rule, but cannot find any route rule we want. + * For example, assume there exists two rules with rt6_prefix + * FE80::0/16 --> rt6_array::entries[0] + * FE80::0/64 --> rt6_array::entries[1] + * LPM6 lookup would never hit the first rule using 'rte_lpm6_lookup'. 
+ * So we cannot obtain the first rule when the control plane need to add/del/modify it. + * Thus a hash list is needed for route6 control plane. Actually, hash list is not needed + * for data plane. We use per-lcore struct just for convenience. + */ +static RTE_DEFINE_PER_LCORE(struct list_head*, dpvs_rt6_hash); + +static inline bool rt6_default(const struct rt6_prefix *rt6_p) +{ + return ipv6_addr_any(&rt6_p->addr) && (rt6_p->plen == 0); +} + +static inline int rt6_find_free_array_idx(uint32_t *idx) +{ + uint32_t ii; + if (unlikely(this_rt6_array == NULL)) + return EDPVS_NOTEXIST; + if (this_rt6_array->num >= g_rt6_array_size) + return EDPVS_NOROOM; + for (ii = (this_rt6_array->cursor+1) % g_rt6_array_size; + ii != this_rt6_array->cursor; + ii = (ii+1) % g_rt6_array_size) { + if (this_rt6_array->entries[ii] == NULL) { + *idx = ii; + return EDPVS_OK; + } + } + return EDPVS_INVAL; +} + +static inline int rt6_hash_key(const struct rt6_prefix *rt6_p) +{ + return rte_jhash_32b((const uint32_t *)&rt6_p->addr, 4, + rt6_p->plen) % g_rt6_hash_bucket; +} + +static int rt6_lpm_setup_lcore(void *arg) +{ + char name[64]; + int i, ret; + lcoreid_t cid = rte_lcore_id(); + int socketid = rte_socket_id(); + + struct rte_lpm6_config config = { + .max_rules = g_lpm6_conf_max_rules, + .number_tbl8s = g_lpm6_conf_num_tbl8s, + .flags = 0, + }; + + if ((!(g_lcore_mask & (1<hnode); + route6_free(entry); + } + } + + if (this_rt6_array) { + rte_free(this_rt6_array); + this_rt6_array = NULL; + } + + if (this_rt6_hash) { + rte_free(this_rt6_hash); + this_rt6_hash = NULL; + } + + if (this_lpm6_struct) { + rte_lpm6_free(this_lpm6_struct); + this_lpm6_struct = NULL; + } + + return EDPVS_OK; +} + +static struct route6 *rt6_lpm_lookup(const struct in6_addr *addr) +{ + uint32_t idx; + struct route6 *rt6; + + if (rte_lpm6_lookup(this_lpm6_struct, (uint8_t*)addr, &idx) != 0) + return this_rt6_default; + + assert(idx >= 0 && idx < g_rt6_array_size); + rt6 = this_rt6_array->entries[idx]; + + return 
rt6; +} + +static struct route6 *rt6_lpm_input(const struct rte_mbuf *mbuf, struct flow6 *fl6) +{ + struct route6 *rt6; + + rt6 = rt6_lpm_lookup(&fl6->fl6_daddr); + if (!rt6) + return NULL; + + /* FIXME: search hash list for detailed match ? */ + if (rt6->rt6_dev && fl6->fl6_oif && rt6->rt6_dev->id != fl6->fl6_oif->id) + goto miss; + if (!ipv6_addr_any(&rt6->rt6_src.addr) && !ipv6_addr_any(&fl6->fl6_saddr) + && !ipv6_addr_equal(&rt6->rt6_src.addr, &fl6->fl6_saddr)) + goto miss; + + rte_atomic32_inc(&rt6->refcnt); + return rt6; + +miss: + /* refcnt is only incremented on the success path above, */ + /* so a rejected route holds no reference to put here. */ + return NULL; +} + +static struct route6 *rt6_lpm_output(const struct rte_mbuf *mbuf, struct flow6 *fl6) +{ + struct route6 *rt6; + + rt6 = rt6_lpm_lookup(&fl6->fl6_daddr); + if (!rt6) + return NULL; + + /* FIXME: search hash list for detailed match ? */ + if (rt6->rt6_dev && fl6->fl6_oif && rt6->rt6_dev->id != fl6->fl6_oif->id) + goto miss; + if (!ipv6_addr_any(&rt6->rt6_src.addr) && !ipv6_addr_any(&fl6->fl6_saddr) + && !ipv6_addr_equal(&rt6->rt6_src.addr, &fl6->fl6_saddr)) + goto miss; + + rte_atomic32_inc(&rt6->refcnt); + return rt6; + +miss: + /* refcnt is only incremented on the success path above, */ + /* so a rejected route holds no reference to put here. */ + return NULL; +} + +/* Note slaves have the same rt6_hash table with master. 
Call me on master only + * if you just need to find the route entry specified by 'rt6_cfg' is configured.*/ +static struct route6* rt6_lpm_get(const struct dp_vs_route6_conf *rt6_cfg) +{ + int hashkey; + struct route6 *entry, *next; + + hashkey = rt6_hash_key(&rt6_cfg->dst); + list_for_each_entry_safe(entry, next, &this_rt6_hash[hashkey], hnode) { + if (entry->rt6_dst.plen == rt6_cfg->dst.plen && + ipv6_prefix_equal(&entry->rt6_dst.addr, &rt6_cfg->dst.addr, + rt6_cfg->dst.plen)) { + /* Do not match rt6_cfg->ifname, because LPM6 does not support + * the same rt6_prefix with different ifname */ + return entry; + } + } + + if (rt6_cfg->dst.plen == 0 && ipv6_addr_any(&rt6_cfg->dst.addr)) + return this_rt6_default; + + return NULL; +} + +static int rt6_add_lcore_default(const struct dp_vs_route6_conf *rt6_cfg) +{ + struct route6 *entry; + + if (this_rt6_default) + return EDPVS_EXIST; + + entry = rte_zmalloc_socket("rt6_entry", sizeof(struct route6), 0, rte_socket_id()); + if (unlikely(entry == NULL)) + return EDPVS_NOMEM; + + /* 'rt6_cfg' has been verified by 'rt6_default' */ + rt6_fill_with_cfg(entry, rt6_cfg); + rte_atomic32_set(&entry->refcnt, 1); + this_rt6_default = entry; + +#ifdef DPVS_RT6_DEBUG + RTE_LOG(DEBUG, RT6, "[%d] %s(default via dev %s)->this_rt6_default OK!\n", + rte_lcore_id(), __func__, rt6_cfg->ifname); +#endif + + return EDPVS_OK; +} + +static int rt6_del_lcore_default(const struct dp_vs_route6_conf *rt6_cfg) +{ + + if (!this_rt6_default) + return EDPVS_NOTEXIST; + + /* 'rt6_cfg' has been verified by 'rt6_default' */ + route6_free(this_rt6_default); + this_rt6_default = NULL; + +#ifdef DPVS_RT6_DEBUG + RTE_LOG(DEBUG, RT6, "[%d] %s(default via dev %s)->this_rt6_default OK!\n", + rte_lcore_id(), __func__, rt6_cfg->ifname); +#endif + + return EDPVS_OK; +} + +static int rt6_lpm_add_lcore(const struct dp_vs_route6_conf *rt6_cfg) +{ + uint32_t idx; + int hashkey, ret; + char buf[64]; + struct route6 *entry; + + assert(rt6_cfg != NULL); + + if 
(rt6_default(&rt6_cfg->dst)) + return rt6_add_lcore_default(rt6_cfg); + + ret = rt6_find_free_array_idx(&idx); + if (unlikely(ret != EDPVS_OK)) + goto rt6_add_fail; + + entry = rte_zmalloc_socket("rt6_entry", sizeof(struct route6), 0, rte_socket_id()); + if (unlikely(entry == NULL)) { + ret = EDPVS_NOMEM; + goto rt6_add_fail; + } + rt6_fill_with_cfg(entry, rt6_cfg); + rte_atomic32_set(&entry->refcnt, 1); + + ret = rte_lpm6_add(this_lpm6_struct, (uint8_t*)&entry->rt6_dst.addr, + (uint8_t)entry->rt6_dst.plen, idx); + if (unlikely(ret < 0)) { + ret = EDPVS_DPDKAPIFAIL; + goto rt6_lpm_fail; + } + + entry->arr_idx = idx; + this_rt6_array->num++; + this_rt6_array->cursor = idx; + this_rt6_array->entries[idx] = entry; + hashkey = rt6_hash_key(&entry->rt6_dst); + list_add_tail(&entry->hnode, &this_rt6_hash[hashkey]); + +#ifdef DPVS_RT6_DEBUG + dump_rt6_prefix(&rt6_cfg->dst, buf, sizeof(buf)); + RTE_LOG(DEBUG, RT6, "[%d] %s(%s via dev %s)->rt6_hash[%d]:rt6_array[%d] OK!" + " %d routes exist.\n", rte_lcore_id(), __func__, buf, + rt6_cfg->ifname, hashkey, idx, this_rt6_array->num); +#endif + return EDPVS_OK; + +rt6_lpm_fail: + rte_free(entry); +rt6_add_fail: + dump_rt6_prefix(&rt6_cfg->dst, buf, sizeof(buf)); + RTE_LOG(ERR, RT6, "%s[%d]: rte_lpm6_add %s failed -- %s!\n", __func__, + rte_lcore_id(), buf, dpvs_strerror(ret)); + return ret; +} + +static int rt6_lpm_del_lcore(const struct dp_vs_route6_conf *rt6_cfg) +{ + int hashkey, ret; + struct route6 *entry, *next; +#ifdef DPVS_RT6_DEBUG + char buf[64]; +#endif + + assert(rt6_cfg != NULL); + + if (rt6_default(&rt6_cfg->dst)) + return rt6_del_lcore_default(rt6_cfg); + + hashkey = rt6_hash_key(&rt6_cfg->dst); + list_for_each_entry_safe(entry, next, &this_rt6_hash[hashkey], hnode) { + if (entry->rt6_dst.plen == rt6_cfg->dst.plen && + strcmp(rt6_cfg->ifname, entry->rt6_dev->name) == 0 && + ipv6_prefix_equal(&entry->rt6_dst.addr, &rt6_cfg->dst.addr, + rt6_cfg->dst.plen)) { + /* hit! 
route source is not checked */ + ret = rte_lpm6_delete(this_lpm6_struct, (uint8_t *)&entry->rt6_dst.addr, + (uint8_t)entry->rt6_dst.plen); + if (unlikely(ret < 0)) { + /* rte_lpm6_delete return OK even if no satisfied route exists, + * but fail if duplicated routes exist */ + char buf[256]; + dump_rt6_prefix(&entry->rt6_dst, buf, sizeof(buf)); + RTE_LOG(ERR, RT6, "[%d]%s: rte_lpm6_delete(%s) failed!\n", + rte_lcore_id(), __func__, buf); + return EDPVS_DPDKAPIFAIL; + } +#ifdef DPVS_RT6_DEBUG + dump_rt6_prefix(&rt6_cfg->dst, buf, sizeof(buf)); + RTE_LOG(DEBUG, RT6, "[%d] %s(%s via dev %s)->rt6_hash[%d]:rt6_array[%d] OK!" + " %d routes left.\n", rte_lcore_id(), __func__, buf, + rt6_cfg->ifname, hashkey, entry->arr_idx, this_rt6_array->num-1); +#endif + list_del(&entry->hnode); + this_rt6_array->entries[entry->arr_idx] = NULL; + this_rt6_array->num--; + route6_free(entry); + /* no further search */ + break; + } + } + return EDPVS_OK; +} + +static struct dp_vs_route6_conf_array *rt6_lpm_dump( + const struct dp_vs_route6_conf *rt6_cfg, size_t *nbytes) +{ + int i, off; + struct route6 *entry; + struct dp_vs_route6_conf_array *rt6_arr; + struct netif_port *dev = NULL; + + if (rt6_cfg && (strlen(rt6_cfg->ifname) > 0)) { + dev = netif_port_get_by_name(rt6_cfg->ifname); + if (!dev) { + RTE_LOG(WARNING, RT6, "%s: route6 device %s not found!\n", + __func__, rt6_cfg->ifname); + return NULL; + } + } + + if (this_rt6_default) + *nbytes = sizeof(struct dp_vs_route6_conf_array) +\ + (g_nroutes+1) * sizeof(struct dp_vs_route6_conf); + else + *nbytes = sizeof(struct dp_vs_route6_conf_array) +\ + (g_nroutes) * sizeof(struct dp_vs_route6_conf); + rt6_arr = rte_zmalloc_socket("rt6_sockopt_get", *nbytes, 0, rte_socket_id()); + if (unlikely(!rt6_arr)) + return NULL; + + off = 0; + for (i = 0; i < g_rt6_hash_bucket; i++) { + list_for_each_entry(entry, &this_rt6_hash[i], hnode) { + if (off >= g_nroutes) + break; + if (dev && dev->id != entry->rt6_dev->id) + continue; + 
rt6_fill_cfg(&rt6_arr->routes[off++], entry); + } + } + + if (this_rt6_default && off <= g_nroutes+1) + rt6_fill_cfg(&rt6_arr->routes[off++], this_rt6_default); + + if (off < g_nroutes) + *nbytes = sizeof(struct dp_vs_route6_conf_array)+\ + off * sizeof(struct dp_vs_route6_conf); + rt6_arr->nroute = off; + + return rt6_arr; +} + +static struct route6_method rt6_lpm_method = { + .name = "lpm", + .rt6_setup_lcore = rt6_lpm_setup_lcore, + .rt6_destroy_lcore = rt6_lpm_destroy_lcore, + .rt6_add_lcore = rt6_lpm_add_lcore, + .rt6_del_lcore = rt6_lpm_del_lcore, + .rt6_get = rt6_lpm_get, + .rt6_input = rt6_lpm_input, + .rt6_output = rt6_lpm_output, + .rt6_dump = rt6_lpm_dump, +}; + +int route6_lpm_init(void) +{ + netif_get_slave_lcores(&g_lcore_number, &g_lcore_mask); + return route6_method_register(&rt6_lpm_method); +} + +int route6_lpm_term(void) +{ + return route6_method_unregister(&rt6_lpm_method); +} + +/* config file */ +static void rt6_lpm6_max_rules_handler(vector_t tokens) +{ + char *str = set_value(tokens); + uint32_t lpm6_max_rules = atoi(str); + + if (lpm6_max_rules < 16 || lpm6_max_rules > 2147483647) { + RTE_LOG(WARNING, RT6, "invalid route6:lpm6_max_rules %s, " + "using default %d\n", str, LPM6_CONF_MAX_RULES_DEF); + g_lpm6_conf_max_rules = LPM6_CONF_MAX_RULES_DEF; + } else { + RTE_LOG(INFO, RT6, "route6:lpm6_max_rules = %d\n", lpm6_max_rules); + g_lpm6_conf_max_rules = lpm6_max_rules; + } + + FREE_PTR(str); +} + +static void rt6_lpm6_num_tbl8s_handler(vector_t tokens) +{ + char *str = set_value(tokens); + uint32_t lpm6_num_tbl8s = atoi(str); + + if (lpm6_num_tbl8s < 16 || lpm6_num_tbl8s > 2147483647) { + RTE_LOG(WARNING, RT6, "invalid route6:lpm6_num_tbl8s %s, " + "using default %d\n", str, LPM6_CONF_NUM_TBL8S_DEF); + g_lpm6_conf_num_tbl8s = LPM6_CONF_NUM_TBL8S_DEF; + } else { + RTE_LOG(INFO, RT6, "route6:lpm6_num_tbl8s = %d\n", lpm6_num_tbl8s); + g_lpm6_conf_num_tbl8s = lpm6_num_tbl8s; + } + + FREE_PTR(str); +} + +static void rt6_array_size_handler(vector_t 
tokens) +{ + char *str = set_value(tokens); + uint32_t array_size = atoi(str); + + if (array_size < 16 || array_size > 2147483647) { + RTE_LOG(WARNING, RT6, "invalid route6:array_size %s, " + "using default %d\n", str, RT6_ARRAY_SIZE_DEF); + g_rt6_array_size = RT6_ARRAY_SIZE_DEF; + } else { + RTE_LOG(INFO, RT6, "route6:array_size = %d\n", array_size); + g_rt6_array_size = array_size; + } + + FREE_PTR(str); +} + +static void rt6_hash_bucket_handler(vector_t tokens) +{ + char *str = set_value(tokens); + uint32_t hash_buckets = atoi(str); + + if (hash_buckets < 16 || hash_buckets > 2147483647) { + RTE_LOG(WARNING, RT6, "invalid route6:hash_bucket %s, " + "using default %d\n", str, RT6_HASH_BUCKET_DEF); + g_rt6_hash_bucket = RT6_HASH_BUCKET_DEF; + } else { + RTE_LOG(INFO, RT6, "route6:hash_bucket = %d\n", hash_buckets); + g_rt6_hash_bucket = hash_buckets; + } + + FREE_PTR(str); +} + +void route6_lpm_keyword_value_init(void) +{ + if (dpvs_state_get() == DPVS_STATE_INIT) { + /* KW_TYPE_INIT keyword */ + g_lpm6_conf_max_rules = LPM6_CONF_MAX_RULES_DEF; + g_lpm6_conf_num_tbl8s = LPM6_CONF_NUM_TBL8S_DEF; + g_rt6_array_size = RT6_ARRAY_SIZE_DEF; + g_rt6_hash_bucket = RT6_HASH_BUCKET_DEF; + } +} + +void install_rt6_lpm_keywords(void) +{ + install_keyword("lpm", NULL, KW_TYPE_INIT); + install_sublevel(); + install_keyword("lpm6_max_rules", rt6_lpm6_max_rules_handler, KW_TYPE_INIT); + install_keyword("lpm6_num_tbl8s", rt6_lpm6_num_tbl8s_handler, KW_TYPE_INIT); + install_keyword("rt6_array_size", rt6_array_size_handler, KW_TYPE_INIT); + install_keyword("rt6_hash_bucket", rt6_hash_bucket_handler, KW_TYPE_INIT); + install_sublevel_end(); +} diff --git a/src/ipvs/ip_vs_conhash.c b/src/ipvs/ip_vs_conhash.c index 04c8f9452..1339b1f39 100644 --- a/src/ipvs/ip_vs_conhash.c +++ b/src/ipvs/ip_vs_conhash.c @@ -16,30 +16,43 @@ * */ +#include #include "ipv4.h" +#include "ipv6.h" #include "libconhash/conhash.h" #include "ipvs/conhash.h" #define REPLICA 160 - #define 
QUIC_PACKET_8BYTE_CONNECTION_ID (1 << 3) - -/* QUIC CID hash target for quic* - * QUIC CID(qid) should be configured in UDP service*/ -static int get_quic_hash_target(const struct rte_mbuf *mbuf, uint64_t *quic_cid) +/* + * QUIC CID hash target for quic* + * QUIC CID(qid) should be configured in UDP service + */ +static int get_quic_hash_target(int af, const struct rte_mbuf *mbuf, + uint64_t *quic_cid) { uint8_t pub_flags; + uint32_t udphoff; char *quic_data; uint32_t quic_len; - quic_len = ip4_hdrlen(mbuf) + sizeof(struct udp_hdr) + \ - sizeof(pub_flags) + sizeof(*quic_cid); + if (af == AF_INET6) { + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + uint8_t ip6nxt = ip6h->ip6_nxt; + udphoff = ip6_skip_exthdr(mbuf, sizeof(struct ip6_hdr), &ip6nxt); + } + else + udphoff = ip4_hdrlen(mbuf); + + quic_len = udphoff + sizeof(struct udp_hdr) + + sizeof(pub_flags) + sizeof(*quic_cid); if (mbuf_may_pull((struct rte_mbuf *)mbuf, quic_len) != 0) return EDPVS_NOTEXIST; - quic_data = rte_pktmbuf_mtod_offset(mbuf, char *, ip4_hdrlen(mbuf) + sizeof(struct udp_hdr)); + quic_data = rte_pktmbuf_mtod_offset(mbuf, char *, + udphoff + sizeof(struct udp_hdr)); pub_flags = *((uint8_t *)quic_data); if ((pub_flags & QUIC_PACKET_8BYTE_CONNECTION_ID) == 0) { @@ -54,9 +67,19 @@ static int get_quic_hash_target(const struct rte_mbuf *mbuf, uint64_t *quic_cid) } /*source ip hash target*/ -static int get_sip_hash_target(const struct rte_mbuf *mbuf, uint32_t *sip) +static int get_sip_hash_target(int af, const struct rte_mbuf *mbuf, + uint32_t *addr_fold) { - *sip = ip4_hdr(mbuf)->src_addr; + if (af == AF_INET) { + *addr_fold = ip4_hdr(mbuf)->src_addr; + } else if (af == AF_INET6) { + struct in6_addr *saddr = &ip6_hdr(mbuf)->ip6_src; + *addr_fold = saddr->s6_addr32[0]^saddr->s6_addr32[1]^ + saddr->s6_addr32[2]^saddr->s6_addr32[3]; + } else { + return EDPVS_NOTSUPP; + } + return EDPVS_OK; } @@ -66,7 +89,7 @@ dp_vs_conhash_get(struct dp_vs_service *svc, struct conhash_s *conhash, { char str[40] = {0}; 
uint64_t quic_cid; - uint32_t sip; + uint32_t addr_fold; const struct node_s *node; if (svc->flags & DP_VS_SVC_F_QID_HASH) { @@ -75,17 +98,17 @@ dp_vs_conhash_get(struct dp_vs_service *svc, struct conhash_s *conhash, return NULL; } /* try to get CID for hash target first, then source IP. */ - if (EDPVS_OK == get_quic_hash_target(mbuf, &quic_cid)) { + if (EDPVS_OK == get_quic_hash_target(svc->af, mbuf, &quic_cid)) { snprintf(str, sizeof(str), "%lu", quic_cid); - } else if (EDPVS_OK == get_sip_hash_target(mbuf, &sip)) { - snprintf(str, sizeof(str), "%u", sip); + } else if (EDPVS_OK == get_sip_hash_target(svc->af, mbuf, &addr_fold)) { + snprintf(str, sizeof(str), "%u", addr_fold); } else { return NULL; } } else if (svc->flags & DP_VS_SVC_F_SIP_HASH) { - if (EDPVS_OK == get_sip_hash_target(mbuf, &sip)) { - snprintf(str, sizeof(str), "%u", sip); + if (EDPVS_OK == get_sip_hash_target(svc->af, mbuf, &addr_fold)) { + snprintf(str, sizeof(str), "%u", addr_fold); } else { return NULL; } @@ -107,13 +130,13 @@ dp_vs_conhash_assign(struct dp_vs_service *svc) { struct dp_vs_dest *dest; struct node_s *p_node; + uint32_t addr_fold; int weight = 0; char str[40]; list_for_each_entry(dest, &svc->dests, n_list) { weight = rte_atomic16_read(&dest->weight); if (weight > 0) { - p_node = rte_zmalloc("p_node", sizeof(struct node_s), RTE_CACHE_LINE_SIZE); if (p_node == NULL) { return EDPVS_NOMEM; @@ -122,7 +145,8 @@ dp_vs_conhash_assign(struct dp_vs_service *svc) rte_atomic32_inc(&dest->refcnt); p_node->data = dest; - snprintf(str, sizeof(str), "%u%d", dest->addr.in.s_addr, dest->port); + addr_fold = inet_addr_fold(dest->af, &dest->addr); + snprintf(str, sizeof(str), "%u%d", addr_fold, dest->port); conhash_set_node(p_node, str, weight*REPLICA); conhash_add_node(svc->sched_data, p_node); @@ -140,7 +164,7 @@ static void node_fini(struct node_s *node) rte_atomic32_dec(&(((struct dp_vs_dest *)(node->data))->refcnt)); node->data = NULL; } - + rte_free(node); } diff --git a/src/ipvs/ip_vs_conn.c 
b/src/ipvs/ip_vs_conn.c index d78019aee..470899fad 100644 --- a/src/ipvs/ip_vs_conn.c +++ b/src/ipvs/ip_vs_conn.c @@ -20,6 +20,7 @@ #include "common.h" #include "inet.h" #include "ipv4.h" +#include "ipv6.h" #include "sa_pool.h" #include "ipvs/ipvs.h" #include "ipvs/conn.h" @@ -91,11 +92,22 @@ static inline uint32_t conn_hashkey(int af, const union inet_addr *saddr, uint16_t sport, const union inet_addr *daddr, uint16_t dport) { - return rte_jhash_3words((uint32_t)saddr->in.s_addr, - (uint32_t)daddr->in.s_addr, - ((uint32_t)sport) << 16 | (uint32_t)dport, - dp_vs_conn_rnd) - & DPVS_CONN_TAB_MASK; + if (AF_INET == af) + return rte_jhash_3words((uint32_t)saddr->in.s_addr, + (uint32_t)daddr->in.s_addr, + ((uint32_t)sport) << 16 | (uint32_t)dport, + dp_vs_conn_rnd) & DPVS_CONN_TAB_MASK; + + if (AF_INET6 == af) { + uint32_t vect[9]; + vect[0] = ((uint32_t)sport) << 16 | (uint32_t)dport; + memcpy(&vect[1], &saddr->in6, 16); + memcpy(&vect[5], &daddr->in6, 16); + return rte_jhash_32b(vect, 9, dp_vs_conn_rnd) & DPVS_CONN_TAB_MASK; + } + + RTE_LOG(WARNING, IPVS, "%s: hashing unsupported protocol %d\n", __func__, af); + return 0; } static inline int __conn_hash(struct dp_vs_conn *conn, @@ -357,6 +369,7 @@ static int conn_expire(void *priv) struct dp_vs_synproxy_ack_pakcet *ack_mbuf, *t_ack_mbuf; struct rte_mempool *pool; assert(conn); + assert(conn->af == AF_INET || conn->af == AF_INET6); /* set proper timeout */ unsigned conn_timeout = 0; @@ -373,8 +386,7 @@ static int conn_expire(void *priv) conn->timeout.tv_sec = pp->timeout_table[conn->state]; else conn->timeout.tv_sec = 60; - } - else if (pp && pp->timeout_table) + } else if (pp && pp->timeout_table) conn->timeout.tv_sec = pp->timeout_table[conn->state]; else conn->timeout.tv_sec = 60; @@ -423,8 +435,6 @@ static int conn_expire(void *priv) /* refcnt == 1 means we are the only referer. * no one is using the conn and it's timed out. 
*/ if (rte_atomic32_read(&conn->refcnt) == 1) { - struct dp_vs_proto *proto = dp_vs_proto_lookup(conn->proto); - if (conn->flags & DPVS_CONN_F_TEMPLATE) dpvs_timer_cancel(&conn->timer, true); else @@ -434,24 +444,40 @@ static int conn_expire(void *priv) if (conn->control) dp_vs_control_del(conn); - if (proto && proto->conn_expire) - proto->conn_expire(proto, conn); + if (pp && pp->conn_expire) + pp->conn_expire(pp, conn); if (conn->dest->fwdmode == DPVS_FWD_MODE_SNAT - && conn->proto != IPPROTO_ICMP) { - struct sockaddr_in daddr, saddr; - - memset(&daddr, 0, sizeof(daddr)); - daddr.sin_family = AF_INET; - daddr.sin_addr = conn->caddr.in; - daddr.sin_port = conn->cport; - + && conn->proto != IPPROTO_ICMP + && conn->proto != IPPROTO_ICMPV6) { + struct sockaddr_storage saddr, daddr; memset(&saddr, 0, sizeof(saddr)); - saddr.sin_family = AF_INET; - saddr.sin_addr = conn->vaddr.in; - saddr.sin_port = conn->vport; - - sa_release(conn->out_dev, &daddr, &saddr); + memset(&daddr, 0, sizeof(daddr)); + if (AF_INET == conn->af) { + struct sockaddr_in *daddr4 = (struct sockaddr_in *)&daddr; /* must alias daddr (filled with caddr/cport), not saddr */ + struct sockaddr_in *saddr4 = (struct sockaddr_in *)&saddr; + + daddr4->sin_family = AF_INET; + daddr4->sin_addr = conn->caddr.in; + daddr4->sin_port = conn->cport; + + saddr4->sin_family = AF_INET; + saddr4->sin_addr = conn->vaddr.in; + saddr4->sin_port = conn->vport; + } else { /* AF_INET6 */ + struct sockaddr_in6 *daddr6 = (struct sockaddr_in6 *)&daddr; + struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)&saddr; + + daddr6->sin6_family = AF_INET6; + daddr6->sin6_addr = conn->caddr.in6; + daddr6->sin6_port = conn->cport; + + saddr6->sin6_family = AF_INET6; + saddr6->sin6_addr = conn->vaddr.in6; + saddr6->sin6_port = conn->vport; + } + sa_release(conn->out_dev, (struct sockaddr_storage *)&daddr, + (struct sockaddr_storage *)&saddr); } conn_unbind_dest(conn); @@ -524,19 +550,40 @@ static void conn_flush(void) conn_unhash(conn); if (conn->dest->fwdmode == DPVS_FWD_MODE_SNAT && - 
conn->proto != IPPROTO_ICMP) { - struct sockaddr_in daddr, saddr; - + conn->proto != IPPROTO_ICMP && + conn->proto != IPPROTO_ICMPV6) { + struct sockaddr_storage daddr, saddr; memset(&daddr, 0, sizeof(daddr)); - daddr.sin_family = AF_INET; - daddr.sin_addr = conn->caddr.in; - daddr.sin_port = conn->cport; - memset(&saddr, 0, sizeof(saddr)); - saddr.sin_family = AF_INET; - saddr.sin_addr = conn->vaddr.in; - saddr.sin_port = conn->vport; - sa_release(conn->out_dev, &daddr, &saddr); + + if (AF_INET == conn->af) { + struct sockaddr_in *daddr4 = (struct sockaddr_in *)&daddr; + struct sockaddr_in *saddr4 = (struct sockaddr_in *)&saddr; + + daddr4->sin_family = AF_INET; + daddr4->sin_addr = conn->caddr.in; + daddr4->sin_port = conn->cport; + + saddr4->sin_family = AF_INET; + saddr4->sin_addr = conn->vaddr.in; + saddr4->sin_port = conn->vport; + } else if (AF_INET6 == conn->af) { + struct sockaddr_in6 *daddr6 = (struct sockaddr_in6 *)&daddr; + struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)&saddr; + + daddr6->sin6_family = AF_INET6; + daddr6->sin6_addr = conn->caddr.in6; + daddr6->sin6_port = conn->cport; + + saddr6->sin6_family = AF_INET6; + saddr6->sin6_addr = conn->vaddr.in6; + saddr6->sin6_port = conn->vport; /* saddr pairs vaddr with vport, as in the IPv4 branch */ + } else { + RTE_LOG(WARNING, IPVS, "%s: conn address family %d " + "not supported!\n", __func__, conn->af); + } + sa_release(conn->out_dev, (struct sockaddr_storage *)&daddr, + (struct sockaddr_storage *)&saddr); } conn_unbind_dest(conn); @@ -556,9 +603,10 @@ static void conn_flush(void) #endif } -struct dp_vs_conn * dp_vs_conn_new(struct rte_mbuf *mbuf, - struct dp_vs_conn_param *param, - struct dp_vs_dest *dest, uint32_t flags) +struct dp_vs_conn *dp_vs_conn_new(struct rte_mbuf *mbuf, + const struct dp_vs_iphdr *iph, + struct dp_vs_conn_param *param, + struct dp_vs_dest *dest, uint32_t flags) { struct dp_vs_conn *new; struct conn_tuple_hash *t; @@ -579,19 +627,20 @@ struct dp_vs_conn * dp_vs_conn_new(struct rte_mbuf *mbuf, if ((flags & 
DPVS_CONN_F_TEMPLATE) || param->ct_dport != 0) rport = param->ct_dport; else if (dest->fwdmode == DPVS_FWD_MODE_SNAT) { - if (unlikely(param->proto == IPPROTO_ICMP)) { + if (unlikely(param->proto == IPPROTO_ICMP || + param->proto == IPPROTO_ICMPV6)) { rport = param->vport; } else { - ports = mbuf_header_pointer(mbuf, ip4_hdrlen(mbuf), - sizeof(_ports), _ports); + ports = mbuf_header_pointer(mbuf, iph->len, sizeof(_ports), _ports); if (unlikely(!ports)) { RTE_LOG(WARNING, IPVS, "%s: no memory\n", __func__); goto errout; } rport = ports[0]; } - } else + } else { rport = dest->port; + } /* init inbound conn tuple hash */ t = &tuplehash_in(new); @@ -610,9 +659,9 @@ struct dp_vs_conn * dp_vs_conn_new(struct rte_mbuf *mbuf, t->af = param->af; t->proto = param->proto; if (dest->fwdmode == DPVS_FWD_MODE_SNAT) - t->saddr.in.s_addr = ip4_hdr(mbuf)->src_addr; + t->saddr = iph->saddr; else - t->saddr = dest->addr; + t->saddr = dest->addr; t->sport = rport; t->daddr = *param->caddr; /* non-FNAT */ t->dport = param->cport; /* non-FNAT */ @@ -627,15 +676,21 @@ struct dp_vs_conn * dp_vs_conn_new(struct rte_mbuf *mbuf, new->vport = param->vport; new->laddr = *param->caddr; /* non-FNAT */ new->lport = param->cport; /* non-FNAT */ - if (dest->fwdmode == DPVS_FWD_MODE_SNAT) - new->daddr.in.s_addr = ip4_hdr(mbuf)->src_addr; - else + if (dest->fwdmode == DPVS_FWD_MODE_SNAT) { + new->daddr = iph->saddr; + } else { new->daddr = dest->addr; + } new->dport = rport; /* neighbour confirm cache */ - new->in_nexthop.in.s_addr = htonl(INADDR_ANY); - new->out_nexthop.in.s_addr = htonl(INADDR_ANY); + if (AF_INET == param->af) { + new->in_nexthop.in.s_addr = htonl(INADDR_ANY); + new->out_nexthop.in.s_addr = htonl(INADDR_ANY); + } else if (AF_INET6 == param->af) { + new->in_nexthop.in6 = in6addr_any; + new->out_nexthop.in6 = in6addr_any; + } new->in_dev = NULL; new->out_dev = NULL; @@ -680,11 +735,11 @@ struct dp_vs_conn * dp_vs_conn_new(struct rte_mbuf *mbuf, rte_atomic32_set(&new->syn_retry_max, 
0); rte_atomic32_set(&new->dup_ack_cnt, 0); if ((flags & DPVS_CONN_F_SYNPROXY) && !(flags & DPVS_CONN_F_TEMPLATE)) { - struct tcphdr _tcph, *th; + struct tcphdr _tcph, *th = NULL; struct dp_vs_synproxy_ack_pakcet *ack_mbuf; struct dp_vs_proto *pp; - th = mbuf_header_pointer(mbuf, ip4_hdrlen(mbuf), sizeof(_tcph), &_tcph); + th = mbuf_header_pointer(mbuf, iph->len, sizeof(_tcph), &_tcph); if (!th) { RTE_LOG(ERR, IPVS, "%s: get tcphdr failed\n", __func__); goto unbind_laddr; @@ -830,8 +885,7 @@ struct dp_vs_conn *dp_vs_ct_in_get(int af, uint16_t proto, conn = tuplehash_to_conn(tuphash); if (tuphash->sport == sport && tuphash->dport == dport && inet_addr_equal(af, &tuphash->saddr, saddr) - && inet_addr_equal(proto == IPPROTO_IP ? AF_UNSPEC : af, - &tuphash->daddr, daddr) + && inet_addr_equal(af, &tuphash->daddr, daddr) && conn->flags & DPVS_CONN_F_TEMPLATE && tuphash->proto == proto && tuphash->af == af) { @@ -1027,6 +1081,7 @@ static inline char* get_conn_state_name(uint16_t proto, uint16_t state) } break; case IPPROTO_ICMP: + case IPPROTO_ICMPV6: switch (state) { case DPVS_ICMP_S_NORMAL: return "ICMP_NORMAL"; @@ -1052,10 +1107,17 @@ static inline void sockopt_fill_conn_entry(const struct dp_vs_conn *conn, entry->lcoreid = rte_lcore_id(); snprintf(entry->state, sizeof(entry->state), "%s", get_conn_state_name(conn->proto, conn->state)); - entry->caddr = conn->caddr.in.s_addr; - entry->vaddr = conn->vaddr.in.s_addr; - entry->laddr = conn->laddr.in.s_addr; - entry->daddr = conn->daddr.in.s_addr; + if (AF_INET == conn->af) { + entry->caddr.in = conn->caddr.in; + entry->vaddr.in = conn->vaddr.in; + entry->laddr.in = conn->laddr.in; + entry->daddr.in = conn->daddr.in; + } else if (AF_INET6 == conn->af) { + entry->caddr.in6 = conn->caddr.in6; + entry->vaddr.in6 = conn->vaddr.in6; + entry->laddr.in6 = conn->laddr.in6; + entry->daddr.in6 = conn->daddr.in6; + } entry->cport = conn->cport; entry->vport = conn->vport; entry->lport = conn->lport; @@ -1066,19 +1128,16 @@ static 
inline void sockopt_fill_conn_entry(const struct dp_vs_conn *conn, static int sockopt_conn_get_specified(const struct ip_vs_conn_req *conn_req, struct ip_vs_conn_array *conn_arr) { - union inet_addr sip, tip; struct dp_vs_conn *conn; struct dpvs_msg *msg, *rmsg; struct dpvs_multicast_queue *mcq; struct ip_vs_conn_array *resp_conn; int res; - sip.in.s_addr = conn_req->sockpair.sip; - tip.in.s_addr = conn_req->sockpair.tip; - if (conn_req->flag & GET_IPVS_CONN_FLAG_TEMPLATE) { conn = dp_vs_ct_in_get(conn_req->sockpair.af, conn_req->sockpair.proto, - &sip, &tip, conn_req->sockpair.sport, conn_req->sockpair.tport); + &conn_req->sockpair.sip, &conn_req->sockpair.tip, + conn_req->sockpair.sport, conn_req->sockpair.tport); if (unlikely(conn != NULL)) { /* hit persist conn */ sockopt_fill_conn_entry(conn, &conn_arr->array[0]); conn_arr->nconns = 1; @@ -1180,7 +1239,7 @@ static int sockopt_conn_get_all(const struct ip_vs_conn_req *conn_req, assert(got == MAX_CTRL_CONN_GET_ENTRIES); conn_arr->nconns = got; - /* low chance that all done here, we assign GET_IPVS_CONN_RESL_MORE + /* small chance that all done here, we assign GET_IPVS_CONN_RESL_MORE * flag for simplicity here anyway */ conn_arr->resl = GET_IPVS_CONN_RESL_OK | GET_IPVS_CONN_RESL_MORE; conn_arr->curcid = cid; @@ -1332,7 +1391,6 @@ static struct dpvs_sockopts conn_sockopts = { static int conn_get_msgcb_slave(struct dpvs_msg *msg) { const struct ip_vs_conn_req *conn_req; - union inet_addr sip, tip; struct dp_vs_conn *conn; int dir, reply_len; struct ip_vs_conn_array *reply_data; @@ -1340,19 +1398,17 @@ static int conn_get_msgcb_slave(struct dpvs_msg *msg) assert(msg->len == sizeof(struct ip_vs_conn_req)); conn_req = (struct ip_vs_conn_req *)&msg->data[0]; - sip.in.s_addr = conn_req->sockpair.sip; - tip.in.s_addr = conn_req->sockpair.tip; - /* templates are global, it should never found here */ if (conn_req->flag & GET_IPVS_CONN_FLAG_TEMPLATE) return EDPVS_INVAL; conn = dp_vs_conn_get(conn_req->sockpair.af, 
conn_req->sockpair.proto, - &sip, &tip, conn_req->sockpair.sport, conn_req->sockpair.tport, &dir, 0); + &conn_req->sockpair.sip, &conn_req->sockpair.tip, + conn_req->sockpair.sport, conn_req->sockpair.tport, &dir, 0); if (!conn) { conn = dp_vs_conn_get(conn_req->sockpair.af, conn_req->sockpair.proto, - &sip, &tip, conn_req->sockpair.sport, - conn_req->sockpair.tport, &dir, 1); + &conn_req->sockpair.sip, &conn_req->sockpair.tip, + conn_req->sockpair.sport, conn_req->sockpair.tport, &dir, 1); } if (unlikely(conn != NULL)) diff --git a/src/ipvs/ip_vs_core.c b/src/ipvs/ip_vs_core.c index b8073b49e..33a3134b5 100644 --- a/src/ipvs/ip_vs_core.c +++ b/src/ipvs/ip_vs_core.c @@ -17,9 +17,12 @@ */ #include #include +#include #include "common.h" #include "ipv4.h" +#include "ipv6.h" #include "icmp.h" +#include "icmp6.h" #include "sa_pool.h" #include "ipvs/ipvs.h" #include "ipvs/conn.h" @@ -32,19 +35,26 @@ #include "ipvs/synproxy.h" #include "ipvs/blklst.h" #include "ipvs/proto_udp.h" +#include "route6.h" -#define icmp4_id(icmph) (((icmph)->un).echo.id) - -static inline int dp_vs_fill_iphdr(int af, const struct rte_mbuf *mbuf, +static inline int dp_vs_fill_iphdr(int af, struct rte_mbuf *mbuf, struct dp_vs_iphdr *iph) { if (af == AF_INET) { - const struct ipv4_hdr *ip4h = ip4_hdr(mbuf); + struct ipv4_hdr *ip4h = ip4_hdr(mbuf); iph->af = AF_INET; iph->len = ip4_hdrlen(mbuf); iph->proto = ip4h->next_proto_id; iph->saddr.in.s_addr = ip4h->src_addr; iph->daddr.in.s_addr = ip4h->dst_addr; + } else if (af == AF_INET6) { + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + uint8_t ip6nxt = ip6h->ip6_nxt; + iph->af = AF_INET6; + iph->len = ip6_skip_exthdr(mbuf, sizeof(struct ip6_hdr), &ip6nxt); + iph->proto = ip6nxt; + iph->saddr.in6 = ip6h->ip6_src; + iph->daddr.in6 = ip6h->ip6_dst; } else { return EDPVS_NOTSUPP; } @@ -75,8 +85,8 @@ static struct dp_vs_conn *dp_vs_sched_persist(struct dp_vs_service *svc, conn_flags = (is_synproxy_on ? 
DPVS_CONN_F_SYNPROXY : 0); if (svc->af == AF_INET6) { - RTE_LOG(ERR, IPVS, "%s: IPv6 is not supported!\n", __func__); - return NULL; + /* FIXME: Is it OK to use svc->netmask as IPv6 prefix length ? */ + ipv6_addr_prefix_copy(&snet.in6, &iph->saddr.in6, svc->netmask); } else { snet.in.s_addr = iph->saddr.in.s_addr & svc->netmask; } @@ -109,7 +119,7 @@ static struct dp_vs_conn *dp_vs_sched_persist(struct dp_vs_service *svc, dp_vs_conn_fill_param(iph->af, iph->proto, &snet, &iph->daddr, 0, ports[1], 0, &param); - ct = dp_vs_conn_new(mbuf, &param, dest, conn_flags | DPVS_CONN_F_TEMPLATE); + ct = dp_vs_conn_new(mbuf, iph, &param, dest, conn_flags | DPVS_CONN_F_TEMPLATE); if(unlikely(NULL == ct)) return NULL; @@ -133,7 +143,7 @@ static struct dp_vs_conn *dp_vs_sched_persist(struct dp_vs_service *svc, dp_vs_conn_fill_param(iph->af, iph->proto, &snet, &iph->daddr, 0, 0, 0, &param); - ct = dp_vs_conn_new(mbuf, &param, dest, conn_flags | DPVS_CONN_F_TEMPLATE); + ct = dp_vs_conn_new(mbuf, iph, &param, dest, conn_flags | DPVS_CONN_F_TEMPLATE); if(unlikely(NULL == ct)) return NULL; @@ -149,7 +159,7 @@ static struct dp_vs_conn *dp_vs_sched_persist(struct dp_vs_service *svc, dp_vs_conn_fill_param(iph->af, iph->proto, &iph->saddr, &iph->daddr, ports[0], ports[1], dport, &param); - conn = dp_vs_conn_new(mbuf, &param, dest, conn_flags); + conn = dp_vs_conn_new(mbuf, iph, &param, dest, conn_flags); if (unlikely(NULL == conn)) { dp_vs_conn_put(ct); return NULL; @@ -164,6 +174,100 @@ static struct dp_vs_conn *dp_vs_sched_persist(struct dp_vs_service *svc, return conn; } +static struct dp_vs_conn *dp_vs_snat_schedule(struct dp_vs_dest *dest, + const struct dp_vs_iphdr *iph, + uint16_t *ports, + struct rte_mbuf *mbuf) +{ + int err; + struct dp_vs_conn *conn; + struct dp_vs_conn_param param; + struct sockaddr_storage daddr, saddr; + uint16_t _ports[2]; + + if (unlikely(iph->proto == IPPROTO_ICMP)) { + struct icmphdr *ich, _icmph; + ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph), &_icmph); + if (!ich) + return NULL; + 
+ _ports[0] = icmp4_id(ich); + _ports[1] = ich->type << 8 | ich->code; + + /* ID may conflict for different hosts, + * do we need to use an ID pool ? */ + dp_vs_conn_fill_param(iph->af, iph->proto, + &iph->daddr, &dest->addr, + _ports[1], _ports[0], + 0, &param); + } else if (unlikely(iph->proto == IPPROTO_ICMPV6)) { + struct icmp6_hdr *ic6h, _ic6hp; + ic6h = mbuf_header_pointer(mbuf, iph->len, sizeof(_ic6hp), &_ic6hp); + if (!ic6h) + return NULL; + + _ports[0] = icmp6h_id(ic6h); + _ports[1] = ic6h->icmp6_type << 8 | ic6h->icmp6_code; + + dp_vs_conn_fill_param(iph->af, iph->proto, + &iph->daddr, &dest->addr, + _ports[1], _ports[0], + 0, &param); + } else { + /* we cannot inherit dest (host's src port), + * that may conflict for different hosts, + * and using dest->port is a worse choice. */ + if (iph->af == AF_INET) { + struct sockaddr_in *daddr4 = (struct sockaddr_in *)&daddr; + struct sockaddr_in *saddr4 = (struct sockaddr_in *)&saddr; + + memset(&daddr, 0, sizeof(daddr)); + daddr4->sin_family = AF_INET; + daddr4->sin_addr = iph->daddr.in; + daddr4->sin_port = ports[1]; + + memset(&saddr, 0, sizeof(saddr)); + saddr4->sin_family = AF_INET; + saddr4->sin_addr = dest->addr.in; + saddr4->sin_port = 0; + + err = sa_fetch(AF_INET, NULL, &daddr, &saddr); + if (err != 0) + return NULL; + dp_vs_conn_fill_param(AF_INET, iph->proto, &iph->daddr, &dest->addr, + ports[1], saddr4->sin_port, 0, &param); + } else { /* AF_INET6 */ + struct sockaddr_in6 *daddr6 = (struct sockaddr_in6 *)&daddr; + struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)&saddr; + + memset(&daddr, 0, sizeof(daddr)); + daddr6->sin6_family = AF_INET6; + daddr6->sin6_addr = iph->daddr.in6; + daddr6->sin6_port = ports[1]; + + memset(&saddr, 0, sizeof(saddr)); + saddr6->sin6_family = AF_INET6; + saddr6->sin6_addr = dest->addr.in6; + saddr6->sin6_port = 0; + + err = sa_fetch(AF_INET6, NULL, &daddr, &saddr); + if (err != 0) + return NULL; + dp_vs_conn_fill_param(AF_INET6, iph->proto, &iph->daddr, &dest->addr, + ports[1], saddr6->sin6_port, 0, 
&param); + } + } + + conn = dp_vs_conn_new(mbuf, iph, &param, dest, 0); + if (!conn && iph->proto != IPPROTO_ICMP && iph->proto != IPPROTO_ICMPV6) + sa_release(NULL, &daddr, &saddr); /* ICMP/ICMPv6 paths never called sa_fetch() */ + if (!conn) + return NULL; + + dp_vs_stats_conn(conn); + return conn; +} + /* select an RS by service's scheduler and create a connection */ struct dp_vs_conn *dp_vs_schedule(struct dp_vs_service *svc, const struct dp_vs_iphdr *iph, @@ -174,8 +278,6 @@ struct dp_vs_conn *dp_vs_schedule(struct dp_vs_service *svc, struct dp_vs_dest *dest; struct dp_vs_conn *conn; struct dp_vs_conn_param param; - struct sockaddr_in daddr, saddr; - int err; assert(svc && iph && mbuf); @@ -196,80 +298,46 @@ struct dp_vs_conn *dp_vs_schedule(struct dp_vs_service *svc, return NULL; } - if (dest->fwdmode == DPVS_FWD_MODE_SNAT) { - if (unlikely(iph->proto == IPPROTO_ICMP)) { - struct icmphdr *ich, _icmph; - ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph), &_icmph); - if (!ich) - return NULL; - - ports = _ports; - _ports[0] = icmp4_id(ich); - _ports[1] = ich->type << 8 | ich->code; - - /* ID may confict for diff host, - * need we use ID pool ? */ - dp_vs_conn_fill_param(iph->af, iph->proto, - &iph->daddr, &dest->addr, - ports[1], ports[0], - 0, &param); - } else { - /* we cannot inherit dest (host's src port), - * that may confict for diff hosts, - * and using dest->port is worse choice. 
*/ - memset(&daddr, 0, sizeof(daddr)); - daddr.sin_family = AF_INET; - daddr.sin_addr = iph->daddr.in; - daddr.sin_port = ports[1]; - memset(&saddr, 0, sizeof(saddr)); - saddr.sin_family = AF_INET; - saddr.sin_addr = dest->addr.in; - saddr.sin_port = 0; - - err = sa_fetch(NULL, &daddr, &saddr); - if (err != 0) { -#ifdef CONFIG_DPVS_MBUF_DEBUG - dp_vs_mbuf_dump("sa_fetch failed.", iph->af, mbuf); -#endif - return NULL; - } - - dp_vs_conn_fill_param(iph->af, iph->proto, - &iph->daddr, &dest->addr, - ports[1], saddr.sin_port, - 0, &param); - } + if (dest->fwdmode == DPVS_FWD_MODE_SNAT) + return dp_vs_snat_schedule(dest, iph, ports, mbuf); + + if (unlikely(iph->proto == IPPROTO_ICMP)) { + struct icmphdr *ich, _icmph; + ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph), &_icmph); + if (!ich) + return NULL; + + ports = _ports; + _ports[0] = icmp4_id(ich); + _ports[1] = ich->type << 8 | ich->code; + + dp_vs_conn_fill_param(iph->af, iph->proto, + &iph->saddr, &iph->daddr, + ports[0], ports[1], 0, &param); + } else if (unlikely(iph->proto == IPPROTO_ICMPV6)) { + struct icmp6_hdr *ic6h, _ic6hp; + ic6h = mbuf_header_pointer(mbuf, iph->len, sizeof(_ic6hp), &_ic6hp); + if (!ic6h) + return NULL; + + ports = _ports; + _ports[0] = icmp6h_id(ic6h); + _ports[1] = ic6h->icmp6_type << 8 | ic6h->icmp6_code; + + dp_vs_conn_fill_param(iph->af, iph->proto, + &iph->saddr, &iph->daddr, + ports[0], ports[1], + 0, &param); /* same tuple order as the ICMPv4 branch; SNAT returned above */ } else { - if (unlikely(iph->proto == IPPROTO_ICMP)) { - struct icmphdr *ich, _icmph; - ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph), &_icmph); - if (!ich) - return NULL; - - ports = _ports; - _ports[0] = icmp4_id(ich); - _ports[1] = ich->type << 8 | ich->code; - - dp_vs_conn_fill_param(iph->af, iph->proto, - &iph->saddr, &iph->daddr, - ports[0], ports[1], 0, &param); - } else { - dp_vs_conn_fill_param(iph->af, iph->proto, - &iph->saddr, &iph->daddr, - ports[0], ports[1], 0, &param); - } + dp_vs_conn_fill_param(iph->af, iph->proto, + &iph->saddr, &iph->daddr, + ports[0], 
ports[1], 0, &param); } - conn = dp_vs_conn_new(mbuf, &param, dest, + conn = dp_vs_conn_new(mbuf, iph, &param, dest, is_synproxy_on ? DPVS_CONN_F_SYNPROXY : 0); - if (!conn) { - if (dest->fwdmode == DPVS_FWD_MODE_SNAT && iph->proto != IPPROTO_ICMP) - sa_release(NULL, &daddr, &saddr); -#ifdef CONFIG_DPVS_MBUF_DEBUG - dp_vs_mbuf_dump("create conn failed.", iph->af, mbuf); -#endif + if (!conn) return NULL; - } dp_vs_stats_conn(conn); return conn; @@ -277,8 +345,8 @@ struct dp_vs_conn *dp_vs_schedule(struct dp_vs_service *svc, /* return verdict INET_XXX */ static int xmit_outbound(struct rte_mbuf *mbuf, - struct dp_vs_proto *prot, - struct dp_vs_conn *conn) + struct dp_vs_proto *prot, + struct dp_vs_conn *conn) { int err; assert(mbuf && prot && conn); @@ -286,7 +354,8 @@ static int xmit_outbound(struct rte_mbuf *mbuf, if (dp_vs_stats_out(conn, mbuf)) { dp_vs_conn_put(conn); return INET_DROP; - } + } + if (!conn->packet_out_xmit) { RTE_LOG(WARNING, IPVS, "%s: missing out_xmit\n", __func__); dp_vs_conn_put(conn); @@ -304,8 +373,8 @@ static int xmit_outbound(struct rte_mbuf *mbuf, /* return verdict INET_XXX */ static int xmit_inbound(struct rte_mbuf *mbuf, - struct dp_vs_proto *prot, - struct dp_vs_conn *conn) + struct dp_vs_proto *prot, + struct dp_vs_conn *conn) { int err; assert(mbuf && prot && conn); @@ -340,9 +409,9 @@ static int xmit_inbound(struct rte_mbuf *mbuf, } /* mbuf should be consumed here. 
*/ -static int xmit_outbound_icmp(struct rte_mbuf *mbuf, - struct dp_vs_proto *prot, - struct dp_vs_conn *conn) +static int __xmit_outbound_icmp4(struct rte_mbuf *mbuf, + struct dp_vs_proto *prot, + struct dp_vs_conn *conn) { struct flow4 fl4; struct route_entry *rt = NULL; @@ -362,9 +431,9 @@ static int xmit_outbound_icmp(struct rte_mbuf *mbuf, } memset(&fl4, 0, sizeof(struct flow4)); - fl4.daddr = conn->caddr.in; - fl4.saddr = conn->vaddr.in; - fl4.tos = iph->type_of_service; + fl4.fl4_daddr = conn->caddr.in; + fl4.fl4_saddr = conn->vaddr.in; + fl4.fl4_tos = iph->type_of_service; rt = route4_output(&fl4); if (!rt) { rte_pktmbuf_free(mbuf); @@ -387,13 +456,75 @@ static int xmit_outbound_icmp(struct rte_mbuf *mbuf, /* translation for outer L3, ICMP, and inner L3 and L4 */ dp_vs_xmit_icmp(mbuf, prot, conn, DPVS_CONN_DIR_OUTBOUND); - return INET_HOOK(INET_HOOK_LOCAL_OUT, mbuf, NULL, rt->port, ipv4_output); + return INET_HOOK(AF_INET, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt->port, ipv4_output); } /* mbuf should be consumed here. */ -static int xmit_inbound_icmp(struct rte_mbuf *mbuf, - struct dp_vs_proto *prot, - struct dp_vs_conn *conn) +static int __xmit_outbound_icmp6(struct rte_mbuf *mbuf, + struct dp_vs_proto *prot, + struct dp_vs_conn *conn) +{ + struct flow6 fl6; + struct route6 *rt6 = NULL; + + /* no translation needed for DR/TUN. 
*/ + if (conn->dest->fwdmode != DPVS_FWD_MODE_FNAT && + conn->dest->fwdmode != DPVS_FWD_MODE_NAT && + conn->dest->fwdmode != DPVS_FWD_MODE_SNAT) { + if (!conn->packet_out_xmit) { + RTE_LOG(WARNING, IPVS, "%s: missing packet_out_xmit\n", __func__); + rte_pktmbuf_free(mbuf); + return EDPVS_NOTSUPP; + } + + return conn->packet_out_xmit(prot, conn, mbuf); + } + + memset(&fl6, 0, sizeof(struct flow6)); + fl6.fl6_daddr = conn->caddr.in6; + fl6.fl6_saddr = conn->vaddr.in6; + rt6 = route6_output(mbuf, &fl6); + if (!rt6) { + rte_pktmbuf_free(mbuf); + return EDPVS_NOROUTE; + } + + if (mbuf->pkt_len > rt6->rt6_mtu) { + icmp6_send(mbuf, ICMP6_PACKET_TOO_BIG, 0, htonl(rt6->rt6_mtu)); + route6_put(rt6); /* drop the route reference only after its MTU was read */ + rte_pktmbuf_free(mbuf); + return EDPVS_FRAG; + } + + if (unlikely(mbuf->userdata != NULL)) + route6_put((struct route6 *)mbuf->userdata); + mbuf->userdata = rt6; + + /* translation for outer L3, ICMP, and inner L3 and L4 */ + dp_vs_xmit_icmp(mbuf, prot, conn, DPVS_CONN_DIR_OUTBOUND); + + return INET_HOOK(AF_INET6, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt6->rt6_dev, ip6_output); +} + +static int xmit_outbound_icmp(struct rte_mbuf *mbuf, + struct dp_vs_proto *prot, + struct dp_vs_conn *conn) +{ + int af = conn->af; + + assert(af == AF_INET || af == AF_INET6); + if (af == AF_INET) + return __xmit_outbound_icmp4(mbuf, prot, conn); + else + return __xmit_outbound_icmp6(mbuf, prot, conn); +} + +/* mbuf should be consumed here. 
*/ +static int __xmit_inbound_icmp4(struct rte_mbuf *mbuf, + struct dp_vs_proto *prot, + struct dp_vs_conn *conn) { struct flow4 fl4; struct route_entry *rt = NULL; @@ -413,9 +544,9 @@ static int xmit_inbound_icmp(struct rte_mbuf *mbuf, } memset(&fl4, 0, sizeof(struct flow4)); - fl4.daddr = conn->daddr.in; - fl4.saddr = conn->laddr.in; - fl4.tos = iph->type_of_service; + fl4.fl4_daddr = conn->daddr.in; + fl4.fl4_saddr = conn->laddr.in; + fl4.fl4_tos = iph->type_of_service; rt = route4_output(&fl4); if (!rt) { rte_pktmbuf_free(mbuf); @@ -438,11 +569,75 @@ static int xmit_inbound_icmp(struct rte_mbuf *mbuf, /* translation for outer L3, ICMP, and inner L3 and L4 */ dp_vs_xmit_icmp(mbuf, prot, conn, DPVS_CONN_DIR_INBOUND); - return INET_HOOK(INET_HOOK_LOCAL_OUT, mbuf, NULL, rt->port, ipv4_output); + return INET_HOOK(AF_INET, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt->port, ipv4_output); +} + + +/* mbuf should be consumed here. */ +static int __xmit_inbound_icmp6(struct rte_mbuf *mbuf, + struct dp_vs_proto *prot, + struct dp_vs_conn *conn) +{ + struct flow6 fl6; + struct route6 *rt6 = NULL; + + /* no translation needed for DR/TUN. 
*/ + if (conn->dest->fwdmode != DPVS_FWD_MODE_NAT && + conn->dest->fwdmode != DPVS_FWD_MODE_FNAT && + conn->dest->fwdmode != DPVS_FWD_MODE_SNAT) { + if (!conn->packet_xmit) { + RTE_LOG(WARNING, IPVS, "%s: missing packet_xmit\n", __func__); + rte_pktmbuf_free(mbuf); + return EDPVS_NOTSUPP; + } + + return conn->packet_xmit(prot, conn, mbuf); + } + + memset(&fl6, 0, sizeof(struct flow6)); + fl6.fl6_daddr = conn->daddr.in6; + fl6.fl6_saddr = conn->laddr.in6; + rt6 = route6_output(mbuf, &fl6); + if (!rt6) { + rte_pktmbuf_free(mbuf); + return EDPVS_NOROUTE; + } + + if (mbuf->pkt_len > rt6->rt6_mtu) { + icmp6_send(mbuf, ICMP6_PACKET_TOO_BIG, 0, htonl(rt6->rt6_mtu)); + route6_put(rt6); /* drop the route reference only after its MTU was read */ + rte_pktmbuf_free(mbuf); + return EDPVS_FRAG; + } + + if (unlikely(mbuf->userdata != NULL)) + route6_put((struct route6 *)mbuf->userdata); + mbuf->userdata = rt6; + + /* translation for outer L3, ICMP, and inner L3 and L4 */ + dp_vs_xmit_icmp(mbuf, prot, conn, DPVS_CONN_DIR_INBOUND); + + return INET_HOOK(AF_INET6, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt6->rt6_dev, ip6_output); +} + +static int xmit_inbound_icmp(struct rte_mbuf *mbuf, + struct dp_vs_proto *prot, + struct dp_vs_conn *conn) +{ + int af = conn->af; + + assert(af == AF_INET || af == AF_INET6); + + if (af == AF_INET) + return __xmit_inbound_icmp4(mbuf, prot, conn); + else + return __xmit_inbound_icmp6(mbuf, prot, conn); } /* return verdict INET_XXX */ -static int dp_vs_in_icmp(struct rte_mbuf *mbuf, int *related) +static int __dp_vs_in_icmp4(struct rte_mbuf *mbuf, int *related) { struct icmphdr *ich, _icmph; struct ipv4_hdr *iph = ip4_hdr(mbuf); @@ -520,6 +715,9 @@ static int dp_vs_in_icmp(struct rte_mbuf *mbuf, int *related) return INET_DROP; } + // re-fetch IP header and Icmp address + iph = ip4_hdr(mbuf); + ich = (struct icmphdr*)((void*)iph + ip4_hdrlen(mbuf)); if (rte_raw_cksum(ich, mbuf->pkt_len - ip4_hdrlen(mbuf)) != 0xffff) { RTE_LOG(DEBUG, IPVS, "%s: bad checksum\n", __func__); dp_vs_conn_put_no_reset(conn); @@ -547,31 
+745,167 @@ static int dp_vs_in_icmp(struct rte_mbuf *mbuf, int *related) } /* return verdict INET_XXX */ -static int dp_vs_in(void *priv, struct rte_mbuf *mbuf, - const struct inet_hook_state *state) +static int __dp_vs_in_icmp6(struct rte_mbuf *mbuf, int *related) +{ + struct icmp6_hdr *ic6h, _icmp6h; + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + struct ip6_hdr *cip6h, _cip6h; + struct dp_vs_iphdr dcip6h; + struct dp_vs_proto *prot; + struct dp_vs_conn *conn; + int off, ic6h_off, dir, err; + bool drop = false; + uint8_t nexthdr = ip6h->ip6_nxt; +#ifdef CONFIG_DPVS_IPVS_DEBUG + char src_addr_buff[64], dst_addr_buff[64]; +#endif + + *related = 0; /* not related until found matching conn */ + + // don't suppurt frag now + if (unlikely(ip6_is_frag(ip6h))) { + RTE_LOG(WARNING, IPVS, "%s: ip packet is frag.\n", __func__); + return INET_DROP; + } + + off = sizeof(struct ip6_hdr); + off = ip6_skip_exthdr(mbuf, off, &nexthdr); + if (off < 0 || nexthdr != IPPROTO_ICMPV6) { + RTE_LOG(WARNING, IPVS, "%s: off or nexthdr is illegal. off is %d, nexthdr is %u.\n", + __func__, off, nexthdr); + return INET_DROP; + } + + ic6h_off = off; + ic6h = mbuf_header_pointer(mbuf, off, sizeof(_icmp6h), &_icmp6h); + if (unlikely(!ic6h)) + return INET_DROP; + +#ifdef CONFIG_DPVS_IPVS_DEBUG + inet_ntop(AF_INET6, &ip6h->ip6_src, src_addr_buff, sizeof(src_addr_buff)); + inet_ntop(AF_INET6, &ip6h->ip6_dst, dst_addr_buff, sizeof(dst_addr_buff)); + RTE_LOG(DEBUG, IPVS, "ICMP6 (%d,%d) %s->%s\n", + ic6h->icmp6_type, ntohs(icmp6h_id(ic6h)), src_addr_buff, dst_addr_buff); +#endif + + /* support these related error types only, + * others either not support or not related. 
+ */ + if (ic6h->icmp6_type != ICMP6_DST_UNREACH + && ic6h->icmp6_type != ICMP6_PACKET_TOO_BIG + && ic6h->icmp6_type != ICMP6_TIME_EXCEEDED) + return INET_ACCEPT; + + /* inner (contained) IP header */ + off += sizeof(struct icmp6_hdr); + cip6h = mbuf_header_pointer(mbuf, off, sizeof(_cip6h), &_cip6h); + if (unlikely(!cip6h)) + return INET_ACCEPT; + + if (unlikely(ip6_is_frag(cip6h))) { + RTE_LOG(WARNING, IPVS, "%s: frag needed.\n", __func__); + return INET_ACCEPT; + } + + /* + * lookup conn with inner IP pkt. + * it need to move mbuf.data_off to inner IP pkt, + * and restore it later. although it looks strange. + */ + rte_pktmbuf_adj(mbuf, off); + if (mbuf_may_pull(mbuf, sizeof(struct ip6_hdr)) != 0) + return INET_DROP; + dp_vs_fill_iphdr(AF_INET6, mbuf, &dcip6h); + + /* conn stays NULL when the inner protocol has no handler */ + prot = dp_vs_proto_lookup(dcip6h.proto); + conn = prot ? prot->conn_lookup(prot, &dcip6h, mbuf, &dir, true, &drop) : NULL; + + /* recover mbuf.data_off to outer IP header before any INET_ACCEPT, + * so an accepted mbuf leaves here with the outer header in place. */ + rte_pktmbuf_prepend(mbuf, off); + + if (!conn) + return INET_ACCEPT; + + /* so the ICMP is related to existing conn */ + *related = 1; + + if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) { + RTE_LOG(WARNING, IPVS, "%s: may_pull icmp error\n", __func__); + dp_vs_conn_put_no_reset(conn); + return INET_DROP; + } + + /* + * check checksum + * re-fetch IP header and Icmp address + */ + ip6h = ip6_hdr(mbuf); + ic6h = (struct icmp6_hdr *)((void*)(ip6h) + ic6h_off); + if (icmp6_csum(ip6h, ic6h) != 0xffff) { + RTE_LOG(DEBUG, IPVS, "%s: bad checksum\n", __func__); + dp_vs_conn_put_no_reset(conn); + return INET_DROP; + } + + if (dp_vs_stats_in(conn, mbuf)) { + dp_vs_conn_put(conn); + return INET_DROP; + } + /* note + * 1. the direction of inner IP pkt is reversed with ICMP pkt. + * 2. but we use (@reverse == true) for prot->conn_lookup() + * as a result, @dir is same with icmp packet. 
*/ + if (dir == DPVS_CONN_DIR_INBOUND) + err = xmit_inbound_icmp(mbuf, prot, conn); + else + err = xmit_outbound_icmp(mbuf, prot, conn); + if (err != EDPVS_OK) + RTE_LOG(WARNING, IPVS, "%s: xmit icmp error: %s\n", + __func__, dpvs_strerror(err)); + + dp_vs_conn_put_no_reset(conn); + return INET_STOLEN; +} + +static int dp_vs_in_icmp(int af, struct rte_mbuf *mbuf, int *related) +{ + *related = 0; + switch (af) { + case AF_INET: + return __dp_vs_in_icmp4(mbuf, related); + case AF_INET6: + return __dp_vs_in_icmp6(mbuf, related); + } + return INET_ACCEPT; +} + +/* return verdict INET_XXX + * af from mbuf->l3_type? No! The field is rewritten by netif and conflicts with + * m.packet_type(an union), so using a wrapper to get af. + * */ +static int __dp_vs_in(void *priv, struct rte_mbuf *mbuf, + const struct inet_hook_state *state, int af) { struct dp_vs_iphdr iph; struct dp_vs_proto *prot; struct dp_vs_conn *conn; - int dir, af, verdict, err, related; + int dir, verdict, err, related; bool drop = false; eth_type_t etype = mbuf->packet_type; /* FIXME: use other field ? */ assert(mbuf && state); - /* cannot use mbuf->l3_type which is conflict with m.packet_type - * or using wrapper to avoid af check here */ - /* af = mbuf->l3_type == htons(ETHER_TYPE_IPv4) ? AF_INET : AF_INET6; */ - af = AF_INET; - if (unlikely(etype != ETH_PKT_HOST)) return INET_ACCEPT; if (dp_vs_fill_iphdr(af, mbuf, &iph) != EDPVS_OK) return INET_ACCEPT; - if (unlikely(iph.proto == IPPROTO_ICMP)) { + if (unlikely(iph.proto == IPPROTO_ICMP || + iph.proto == IPPROTO_ICMPV6)) { /* handle related ICMP error to existing conn */ - verdict = dp_vs_in_icmp(mbuf, &related); + verdict = dp_vs_in_icmp(af, mbuf, &related); if (related || verdict != INET_ACCEPT) return verdict; /* let unrelated and valid ICMP goes down, @@ -594,7 +928,7 @@ static int dp_vs_in(void *priv, struct rte_mbuf *mbuf, * case frags in same flow are not occur in same lcore, a global lock is * needed, which is not a good idea. 
*/ - if (ip4_is_frag(ip4_hdr(mbuf))) { + if (af == AF_INET && ip4_is_frag(ip4_hdr(mbuf))) { RTE_LOG(DEBUG, IPVS, "%s: frag not support.\n", __func__); return INET_DROP; } @@ -646,7 +980,7 @@ static int dp_vs_in(void *priv, struct rte_mbuf *mbuf, } else { /* Syn-proxy 3 logic: receive syn-ack from rs */ if (dp_vs_synproxy_synack_rcv(mbuf, conn, prot, - ip4_hdrlen(mbuf), &verdict) == 0) { + iph.len, &verdict) == 0) { dp_vs_stats_out(conn, mbuf); dp_vs_conn_put(conn); return verdict; @@ -668,14 +1002,24 @@ static int dp_vs_in(void *priv, struct rte_mbuf *mbuf, return xmit_outbound(mbuf, prot, conn); } -static int dp_vs_pre_routing(void *priv, struct rte_mbuf *mbuf, - const struct inet_hook_state *state) +static int dp_vs_in(void *priv, struct rte_mbuf *mbuf, + const struct inet_hook_state *state) +{ + return __dp_vs_in(priv, mbuf, state, AF_INET); +} + +static int dp_vs_in6(void *priv, struct rte_mbuf *mbuf, + const struct inet_hook_state *state) +{ + return __dp_vs_in(priv, mbuf, state, AF_INET6); +} + +static int __dp_vs_pre_routing(void *priv, struct rte_mbuf *mbuf, + const struct inet_hook_state *state, int af) { struct dp_vs_iphdr iph; - int af; struct dp_vs_service *svc; - af = AF_INET; if (EDPVS_OK != dp_vs_fill_iphdr(af, mbuf, &iph)) return INET_ACCEPT; @@ -706,17 +1050,43 @@ static int dp_vs_pre_routing(void *priv, struct rte_mbuf *mbuf, return INET_ACCEPT; } +static int dp_vs_pre_routing(void *priv, struct rte_mbuf *mbuf, + const struct inet_hook_state *state) +{ + return __dp_vs_pre_routing(priv, mbuf, state, AF_INET); +} + +static int dp_vs_pre_routing6(void *priv, struct rte_mbuf *mbuf, + const struct inet_hook_state *state) +{ + return __dp_vs_pre_routing(priv, mbuf, state, AF_INET6); +} + static struct inet_hook_ops dp_vs_ops[] = { { + .af = AF_INET, .hook = dp_vs_in, .hooknum = INET_HOOK_PRE_ROUTING, .priority = 100, }, { + .af = AF_INET, .hook = dp_vs_pre_routing, .hooknum = INET_HOOK_PRE_ROUTING, .priority = 99, }, + { + .af = AF_INET6, + .hook = 
dp_vs_in6, + .hooknum = INET_HOOK_PRE_ROUTING, + .priority = 100, + }, + { + .af = AF_INET6, + .hook = dp_vs_pre_routing6, + .hooknum = INET_HOOK_PRE_ROUTING, + .priority = 99, + }, }; int dp_vs_init(void) @@ -768,7 +1138,7 @@ int dp_vs_init(void) RTE_LOG(ERR, IPVS, "fail to init stats: %s\n", dpvs_strerror(err)); goto err_stats; } - err = ipv4_register_hooks(dp_vs_ops, NELEMS(dp_vs_ops)); + err = inet_register_hooks(dp_vs_ops, NELEMS(dp_vs_ops)); if (err != EDPVS_OK) { RTE_LOG(ERR, IPVS, "fail to register hooks: %s\n", dpvs_strerror(err)); goto err_hooks; @@ -780,9 +1150,9 @@ int dp_vs_init(void) err_hooks: dp_vs_stats_term(); err_stats: - dp_vs_service_term(); -err_blklst: dp_vs_blklst_term(); +err_blklst: + dp_vs_service_term(); err_serv: dp_vs_sched_term(); err_sched: @@ -801,7 +1171,7 @@ int dp_vs_term(void) { int err; - err = ipv4_unregister_hooks(dp_vs_ops, NELEMS(dp_vs_ops)); + err = inet_unregister_hooks(dp_vs_ops, NELEMS(dp_vs_ops)); if (err != EDPVS_OK) RTE_LOG(ERR, IPVS, "fail to unregister hooks: %s\n", dpvs_strerror(err)); diff --git a/src/ipvs/ip_vs_dest.c b/src/ipvs/ip_vs_dest.c index 819a86cad..a69d3bc61 100644 --- a/src/ipvs/ip_vs_dest.c +++ b/src/ipvs/ip_vs_dest.c @@ -17,33 +17,33 @@ */ #include #include +#include #include "inet.h" #include "ipvs/service.h" #include "ipvs/dest.h" #include "ipvs/sched.h" #include "ipvs/laddr.h" #include "ipvs/conn.h" -#include -/** +/* * locks - * */ + */ static rte_rwlock_t __dp_vs_rs_lock; -/*** +/* * hash table for rs - ***/ + */ #define DP_VS_RTAB_BITS 4 #define DP_VS_RTAB_SIZE (1 << DP_VS_RTAB_BITS) #define DP_VS_RTAB_MASK (DP_VS_RTAB_SIZE - 1) static struct list_head dp_vs_rtable[DP_VS_RTAB_SIZE]; -/** +/* * Trash for destinations - * **/ + */ struct list_head dp_vs_dest_trash = LIST_HEAD_INIT(dp_vs_dest_trash); @@ -52,10 +52,14 @@ static inline unsigned dp_vs_rs_hashkey(int af, uint32_t port) { register unsigned porth = ntohs(port); - uint32_t addr_fold = addr->in.s_addr; + uint32_t addr_fold; -#ifdef 
CONFIG_IP_VS_IPV6 -#endif + addr_fold = inet_addr_fold(af, addr); + + if (!addr_fold) { + RTE_LOG(DEBUG, SERVICE, "%s: IP proto not support.\n", __func__); + return 0; + } return (ntohl(addr_fold) ^ (porth >> DP_VS_RTAB_BITS) ^ porth) & DP_VS_RTAB_MASK; @@ -83,7 +87,8 @@ static int dp_vs_rs_unhash(struct dp_vs_dest *dest) struct dp_vs_dest *dp_vs_lookup_dest(struct dp_vs_service *svc, - const union inet_addr *daddr, uint16_t dport) + const union inet_addr *daddr, + uint16_t dport) { struct dp_vs_dest *dest; @@ -123,7 +128,8 @@ struct dp_vs_dest *dp_vs_find_dest(int af, const union inet_addr *daddr, * scheduling. */ struct dp_vs_dest *dp_vs_trash_get_dest(struct dp_vs_service *svc, - const union inet_addr *daddr, uint16_t dport) + const union inet_addr *daddr, + uint16_t dport) { struct dp_vs_dest *dest, *nxt; @@ -170,16 +176,14 @@ void dp_vs_trash_cleanup(void) } static void __dp_vs_update_dest(struct dp_vs_service *svc, - struct dp_vs_dest *dest, struct dp_vs_dest_conf *udest) + struct dp_vs_dest *dest, + struct dp_vs_dest_conf *udest) { int conn_flags; rte_atomic16_set(&dest->weight, udest->weight); conn_flags = udest->conn_flags | DPVS_CONN_F_INACTIVE; -#ifdef CONFIG_IP_VS_IPV6 -#endif - rte_rwlock_write_lock(&__dp_vs_rs_lock); dp_vs_rs_hash(dest); rte_rwlock_write_unlock(&__dp_vs_rs_lock); @@ -207,8 +211,9 @@ static void __dp_vs_update_dest(struct dp_vs_service *svc, } -int dp_vs_new_dest(struct dp_vs_service *svc, struct dp_vs_dest_conf *udest, - struct dp_vs_dest **dest_p) +int dp_vs_new_dest(struct dp_vs_service *svc, + struct dp_vs_dest_conf *udest, + struct dp_vs_dest **dest_p) { int size; struct dp_vs_dest *dest; @@ -526,7 +531,8 @@ int dp_vs_get_dest_entries(const struct dp_vs_service *svc, if(count >= get->num_dests) break; memset(&entry, 0, sizeof(entry)); - entry.addr = dest->addr.in.s_addr; + entry.af = dest->af; + entry.addr = dest->addr; entry.port = dest->port; entry.conn_flags = dest->fwdmode; entry.weight = rte_atomic16_read(&dest->weight); diff 
--git a/src/ipvs/ip_vs_laddr.c b/src/ipvs/ip_vs_laddr.c index d845d9da9..f04100d90 100644 --- a/src/ipvs/ip_vs_laddr.c +++ b/src/ipvs/ip_vs_laddr.c @@ -98,6 +98,7 @@ /* laddr is configured with service instead of lcore */ struct dp_vs_laddr { + int af; struct list_head list; /* svc->laddr_list elem */ union inet_addr addr; rte_atomic32_t refcnt; @@ -163,7 +164,8 @@ int dp_vs_laddr_bind(struct dp_vs_conn *conn, struct dp_vs_service *svc) { struct dp_vs_laddr *laddr = NULL; int i; - struct sockaddr_in dsin, ssin = {0}; + uint16_t sport = 0; + struct sockaddr_storage dsin, ssin; if (!conn || !conn->dest || !svc) return EDPVS_INVAL; @@ -189,18 +191,32 @@ int dp_vs_laddr_bind(struct dp_vs_conn *conn, struct dp_vs_service *svc) return EDPVS_RESOURCE; } - memset(&dsin, 0, sizeof(struct sockaddr_in)); - dsin.sin_family = svc->af; - dsin.sin_addr = conn->daddr.in; - dsin.sin_port = conn->dport; - - memset(&ssin, 0, sizeof(struct sockaddr_in)); - ssin.sin_family = svc->af; - ssin.sin_addr = laddr->addr.in; + memset(&dsin, 0, sizeof(struct sockaddr_storage)); + memset(&ssin, 0, sizeof(struct sockaddr_storage)); + + if (laddr->af == AF_INET) { + struct sockaddr_in *daddr, *saddr; + daddr = (struct sockaddr_in *)&dsin; + daddr->sin_family = laddr->af; + daddr->sin_addr = conn->daddr.in; + daddr->sin_port = conn->dport; + saddr = (struct sockaddr_in *)&ssin; + saddr->sin_family = laddr->af; + saddr->sin_addr = laddr->addr.in; + } else { + struct sockaddr_in6 *daddr, *saddr; + daddr = (struct sockaddr_in6 *)&dsin; + daddr->sin6_family = laddr->af; + daddr->sin6_addr = conn->daddr.in6; + daddr->sin6_port = conn->dport; + saddr = (struct sockaddr_in6 *)&ssin; + saddr->sin6_family = laddr->af; + saddr->sin6_addr = laddr->addr.in6; + } - if (sa_fetch(laddr->iface, &dsin, &ssin) != EDPVS_OK) { + if (sa_fetch(laddr->af, laddr->iface, &dsin, &ssin) != EDPVS_OK) { char buf[64]; - if (inet_ntop(conn->af, &laddr->addr, buf, sizeof(buf)) == NULL) + if (inet_ntop(laddr->af, &laddr->addr, 
buf, sizeof(buf)) == NULL) snprintf(buf, sizeof(buf), "::"); #ifdef CONFIG_DPVS_IPVS_DEBUG @@ -211,11 +227,13 @@ int dp_vs_laddr_bind(struct dp_vs_conn *conn, struct dp_vs_service *svc) continue; } + sport = (laddr->af == AF_INET ? (((struct sockaddr_in *)&ssin)->sin_port) + : (((struct sockaddr_in6 *)&ssin)->sin6_port)); break; } rte_rwlock_write_unlock(&svc->laddr_lock); - if (!laddr || ssin.sin_port == 0) { + if (!laddr || sport == 0) { #ifdef CONFIG_DPVS_IPVS_DEBUG RTE_LOG(ERR, IPVS, "%s: [%d] no lport available !!\n", __func__, rte_lcore_id()); @@ -229,9 +247,9 @@ int dp_vs_laddr_bind(struct dp_vs_conn *conn, struct dp_vs_service *svc) /* overwrite related fields in out-tuplehash and conn */ conn->laddr = laddr->addr; - conn->lport = ssin.sin_port; + conn->lport = sport; tuplehash_out(conn).daddr = laddr->addr; - tuplehash_out(conn).dport = ssin.sin_port; + tuplehash_out(conn).dport = sport; conn->local = laddr; return EDPVS_OK; @@ -239,7 +257,7 @@ int dp_vs_laddr_bind(struct dp_vs_conn *conn, struct dp_vs_service *svc) int dp_vs_laddr_unbind(struct dp_vs_conn *conn) { - struct sockaddr_in dsin, ssin; + struct sockaddr_storage dsin, ssin; if (conn->flags & DPVS_CONN_F_TEMPLATE) return EDPVS_OK; @@ -247,15 +265,31 @@ int dp_vs_laddr_unbind(struct dp_vs_conn *conn) if (!conn->local) return EDPVS_OK; /* not FNAT ? 
*/ - memset(&dsin, 0, sizeof(struct sockaddr_in)); - dsin.sin_family = conn->af; - dsin.sin_addr = conn->daddr.in; - dsin.sin_port = conn->dport; + memset(&dsin, 0, sizeof(struct sockaddr_storage)); + memset(&ssin, 0, sizeof(struct sockaddr_storage)); + + if (conn->local->af == AF_INET) { + struct sockaddr_in *daddr, *saddr; + daddr = (struct sockaddr_in *)&dsin; + daddr->sin_family = conn->local->af; + daddr->sin_addr = conn->daddr.in; + daddr->sin_port = conn->dport; + saddr = (struct sockaddr_in *)&ssin; + saddr->sin_family = conn->local->af; + saddr->sin_addr = conn->laddr.in; + saddr->sin_port = conn->lport; + } else { + struct sockaddr_in6 *daddr, *saddr; + daddr = (struct sockaddr_in6 *)&dsin; + daddr->sin6_family = conn->local->af; + daddr->sin6_addr = conn->daddr.in6; + daddr->sin6_port = conn->dport; + saddr = (struct sockaddr_in6 *)&ssin; + saddr->sin6_family = conn->local->af; + saddr->sin6_addr = conn->laddr.in6; + saddr->sin6_port = conn->lport; + } - memset(&ssin, 0, sizeof(struct sockaddr_in)); - ssin.sin_family = conn->af; - ssin.sin_addr = conn->laddr.in; - ssin.sin_port = conn->lport; sa_release(conn->local->iface, &dsin, &ssin); rte_atomic32_dec(&conn->local->conn_counts); @@ -265,7 +299,8 @@ int dp_vs_laddr_unbind(struct dp_vs_conn *conn) return EDPVS_OK; } -int dp_vs_laddr_add(struct dp_vs_service *svc, const union inet_addr *addr, +int dp_vs_laddr_add(struct dp_vs_service *svc, + int af, const union inet_addr *addr, const char *ifname) { struct dp_vs_laddr *new, *curr; @@ -278,6 +313,7 @@ int dp_vs_laddr_add(struct dp_vs_service *svc, const union inet_addr *addr, if (!new) return EDPVS_NOMEM; + new->af = af; new->addr = *addr; rte_atomic32_init(&new->refcnt); rte_atomic32_init(&new->conn_counts); @@ -291,7 +327,7 @@ int dp_vs_laddr_add(struct dp_vs_service *svc, const union inet_addr *addr, rte_rwlock_write_lock(&svc->laddr_lock); list_for_each_entry(curr, &svc->laddr_list, list) { - if (inet_addr_equal(svc->af, &curr->addr, &new->addr)) { + 
if (af == curr->af && inet_addr_equal(af, &curr->addr, &new->addr)) { rte_rwlock_write_unlock(&svc->laddr_lock); rte_free(new); return EDPVS_EXIST; @@ -305,7 +341,7 @@ int dp_vs_laddr_add(struct dp_vs_service *svc, const union inet_addr *addr, return EDPVS_OK; } -int dp_vs_laddr_del(struct dp_vs_service *svc, const union inet_addr *addr) +int dp_vs_laddr_del(struct dp_vs_service *svc, int af, const union inet_addr *addr) { struct dp_vs_laddr *laddr, *next; int err = EDPVS_NOTEXIST; @@ -315,7 +351,7 @@ int dp_vs_laddr_del(struct dp_vs_service *svc, const union inet_addr *addr) rte_rwlock_write_lock(&svc->laddr_lock); list_for_each_entry_safe(laddr, next, &svc->laddr_list, list) { - if (!inet_addr_equal(svc->af, &laddr->addr, addr)) + if (!((af == laddr->af) && inet_addr_equal(af, &laddr->addr, addr))) continue; /* found */ @@ -366,6 +402,7 @@ static int dp_vs_laddr_getall(struct dp_vs_service *svc, i = 0; list_for_each_entry(laddr, &svc->laddr_list, list) { assert(i < *naddr); + (*addrs)[i].af = laddr->af; (*addrs)[i].addr = laddr->addr; (*addrs)[i].nconns = rte_atomic32_read(&laddr->conn_counts); i++; @@ -396,7 +433,7 @@ int dp_vs_laddr_flush(struct dp_vs_service *svc) } else { char buf[64]; - if (inet_ntop(svc->af, &laddr->addr, buf, sizeof(buf)) == NULL) + if (inet_ntop(laddr->af, &laddr->addr, buf, sizeof(buf)) == NULL) snprintf(buf, sizeof(buf), "::"); RTE_LOG(DEBUG, IPVS, "%s: laddr %s is in use.\n", __func__, buf); @@ -421,9 +458,9 @@ static int laddr_sockopt_set(sockoptid_t opt, const void *conf, size_t size) if (!conf && size < sizeof(*laddr_conf)) return EDPVS_INVAL; - if (dp_vs_match_parse(laddr_conf->af, laddr_conf->srange, - laddr_conf->drange, laddr_conf->iifname, - laddr_conf->oifname, &match) != EDPVS_OK) + if (dp_vs_match_parse(laddr_conf->srange, laddr_conf->drange, + laddr_conf->iifname, laddr_conf->oifname, + &match) != EDPVS_OK) return EDPVS_INVAL; svc = dp_vs_service_lookup(laddr_conf->af, laddr_conf->proto, @@ -434,10 +471,11 @@ static int 
laddr_sockopt_set(sockoptid_t opt, const void *conf, size_t size) switch (opt) { case SOCKOPT_SET_LADDR_ADD: - err = dp_vs_laddr_add(svc, &laddr_conf->laddr, laddr_conf->ifname); + err = dp_vs_laddr_add(svc, laddr_conf->af, &laddr_conf->laddr, + laddr_conf->ifname); break; case SOCKOPT_SET_LADDR_DEL: - err = dp_vs_laddr_del(svc, &laddr_conf->laddr); + err = dp_vs_laddr_del(svc, laddr_conf->af, &laddr_conf->laddr); break; case SOCKOPT_SET_LADDR_FLUSH: err = dp_vs_laddr_flush(svc); @@ -465,9 +503,9 @@ static int laddr_sockopt_get(sockoptid_t opt, const void *conf, size_t size, if (!conf && size < sizeof(*laddr_conf)) return EDPVS_INVAL; - if (dp_vs_match_parse(laddr_conf->af, laddr_conf->srange, - laddr_conf->drange, laddr_conf->iifname, - laddr_conf->oifname, &match) != EDPVS_OK) + if (dp_vs_match_parse(laddr_conf->srange, laddr_conf->drange, + laddr_conf->iifname, laddr_conf->oifname, + &match) != EDPVS_OK) return EDPVS_INVAL; @@ -497,6 +535,7 @@ static int laddr_sockopt_get(sockoptid_t opt, const void *conf, size_t size, laddrs->nladdrs = naddr; for (i = 0; i < naddr; i++) { + laddrs->laddrs[i].af = addrs[i].af; laddrs->laddrs[i].addr = addrs[i].addr; /* TODO: nport_conflict & nconns */ laddrs->laddrs[i].nport_conflict = 0; diff --git a/src/ipvs/ip_vs_proto.c b/src/ipvs/ip_vs_proto.c index 6460dae2a..88bfc4ec0 100644 --- a/src/ipvs/ip_vs_proto.c +++ b/src/ipvs/ip_vs_proto.c @@ -72,6 +72,7 @@ struct dp_vs_proto *dp_vs_proto_lookup(uint8_t proto) extern struct dp_vs_proto dp_vs_proto_udp; extern struct dp_vs_proto dp_vs_proto_tcp; extern struct dp_vs_proto dp_vs_proto_icmp; +extern struct dp_vs_proto dp_vs_proto_icmp6; int dp_vs_proto_init(void) { @@ -87,6 +88,11 @@ int dp_vs_proto_init(void) goto tcp_error; } + if ((err = proto_register(&dp_vs_proto_icmp6)) != EDPVS_OK) { + RTE_LOG(ERR, IPVS, "%s: fail to register ICMPV6\n", __func__); + goto icmp6_error; + } + if ((err = proto_register(&dp_vs_proto_icmp)) != EDPVS_OK) { RTE_LOG(ERR, IPVS, "%s: fail to register 
ICMP\n", __func__); goto icmp_error; @@ -95,6 +101,8 @@ int dp_vs_proto_init(void) return EDPVS_OK; icmp_error: + proto_unregister(&dp_vs_proto_icmp6); +icmp6_error: proto_unregister(&dp_vs_proto_tcp); tcp_error: proto_unregister(&dp_vs_proto_udp); @@ -106,6 +114,9 @@ int dp_vs_proto_term(void) if (proto_unregister(&dp_vs_proto_icmp) != EDPVS_OK) RTE_LOG(ERR, IPVS, "%s: fail to unregister ICMP\n", __func__); + if (proto_unregister(&dp_vs_proto_icmp6) != EDPVS_OK) + RTE_LOG(ERR, IPVS, "%s: fail to unregister ICMPV6\n", __func__); + if (proto_unregister(&dp_vs_proto_tcp) != EDPVS_OK) RTE_LOG(ERR, IPVS, "%s: fail to unregister TCP\n", __func__); diff --git a/src/ipvs/ip_vs_proto_icmp.c b/src/ipvs/ip_vs_proto_icmp.c index c2f4fdcd5..2fc29e439 100644 --- a/src/ipvs/ip_vs_proto_icmp.c +++ b/src/ipvs/ip_vs_proto_icmp.c @@ -22,10 +22,13 @@ */ #include #include +#include #include "dpdk.h" #include "common.h" #include "inet.h" #include "ipv4.h" +#include "ipv6.h" +#include "icmp6.h" #include "ipvs/ipvs.h" #include "ipvs/proto.h" #include "ipvs/proto_icmp.h" @@ -57,6 +60,8 @@ * - ip_vs_nat_xmit() or ip_vs_out_snat_xmit() * - handle_response() * + * + For ICMPv6 messages in SNAT/DNAT/FULLNAT, checksum should be recalculated. + * * + For ICMP-Error, which includes original IP packet as payload: * Those embedded IPs are not be handled here IPVS core. 
*/ @@ -72,11 +77,21 @@ static int icmp_conn_sched(struct dp_vs_proto *proto, struct dp_vs_conn **conn, int *verdict) { - struct icmphdr *ich, _icmph; + void *ich = NULL; struct dp_vs_service *svc; + int af = iph->af; assert(proto && iph && mbuf && conn && verdict); - ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph), &_icmph); + if (AF_INET6 == af) { + struct icmp6_hdr _icmph6; + ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph6), + (void *)&_icmph6); + } else { + struct icmphdr _icmph; + ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph), + (void *)&_icmph); + } + if (unlikely(!ich)) { *verdict = INET_DROP; return EDPVS_INVPKT; @@ -102,34 +117,66 @@ static int icmp_conn_sched(struct dp_vs_proto *proto, } static const uint8_t invmap[] = { - [ICMP_ECHO] = ICMP_ECHOREPLY + 1, - [ICMP_ECHOREPLY] = ICMP_ECHO + 1, - [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, - [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, - [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, - [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, - [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, - [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 + [ICMP_ECHO] = ICMP_ECHOREPLY + 1, + [ICMP_ECHOREPLY] = ICMP_ECHO + 1, + [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, + [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, + [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, + [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, + [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, + [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 }; static bool icmp_invert_type(uint8_t *type, uint8_t orig) { - if (orig >= sizeof(invmap) || !invmap[orig]) - return false; + if (orig >= sizeof(invmap) || !invmap[orig]) + return false; - *type = invmap[orig] - 1; - return true; + *type = invmap[orig] - 1; + return true; +} + +/* + * inverse map for icmp6 + * for example: + * invmap6[ICMP6_ECHO_REPLY] - 1 => ICMP6_ECHO_REQUEST + 1 - 1 + * => ICMP6_ECHO_REQUEST + * and + * invmap6[ICMP6_ECHO_REQUEST] - 1 => ICMP6_ECHO_REPLY + 1 - 1 + * => ICMP6_ECHO_REPLY + */ +static const uint8_t invmap6[] = { + 
[ICMP6_ECHO_REPLY] = ICMP6_ECHO_REQUEST + 1, + [ICMP6_ECHO_REQUEST] = ICMP6_ECHO_REPLY + 1 +}; + +/* + * icmp6_invert_type: invert type used for icmpv6 + * @type: original icmp6 type + * @return true or false + */ +static bool icmp6_invert_type(uint8_t *type, uint8_t orig) { + if (orig >= sizeof(invmap6) || !invmap6[orig]) { + return false; + } + *type = invmap6[orig] - 1; + return true; } static bool is_icmp_reply(uint8_t type) { - if (type == ICMP_ECHOREPLY - || type == ICMP_TIMESTAMPREPLY - || type == ICMP_INFO_REPLY - || type == ICMP_ADDRESSREPLY) - return true; - else - return false; + if (type == ICMP_ECHOREPLY || type == ICMP_TIMESTAMPREPLY || + type == ICMP_INFO_REPLY || type == ICMP_ADDRESSREPLY) + return true; + else + return false; +} + +static bool is_icmp6_reply(uint8_t type) { + if (type == ICMP6_ECHO_REPLY) { + return true; + } + return false; } static struct dp_vs_conn *icmp_conn_lookup(struct dp_vs_proto *proto, @@ -137,29 +184,77 @@ static struct dp_vs_conn *icmp_conn_lookup(struct dp_vs_proto *proto, struct rte_mbuf *mbuf, int *direct, bool reverse, bool *drop) { - struct icmphdr *ich, _icmph; + void *ich = NULL; __be16 sport, dport; /* dummy ports */ uint8_t type; + int af = iph->af; + /* true icmp type/code, used for v4/v6 */ + uint8_t icmp_type = 0; + uint8_t icmp_code = 0; assert(proto && iph && mbuf); - ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph), &_icmph); - if (unlikely(!ich)) - return NULL; - - if (!is_icmp_reply(ich->type)) { - sport = ich->un.echo.id; - dport = ich->type << 8 | ich->code; - } else if (icmp_invert_type(&type, ich->type)) { - sport = type << 8 | ich->code; - dport = ich->un.echo.id; + if (AF_INET6 == af) { + struct icmp6_hdr _icmph6; + ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph6), + (void *)&_icmph6); + if (unlikely(!ich)) + return NULL; + /* icmp v6 */ + icmp_type = ((struct icmp6_hdr *)ich)->icmp6_type; + icmp_code = ((struct icmp6_hdr *)ich)->icmp6_code; + if (! 
is_icmp6_reply(icmp_type)) { + sport = ((struct icmp6_hdr *)ich)->icmp6_id; + dport = icmp_type << 8 | icmp_code; + } else if (icmp6_invert_type(&type, icmp_type)) { + sport = type << 8 | icmp_code; + dport = ((struct icmp6_hdr *)ich)->icmp6_id; + } else { + return NULL; + } } else { - return NULL; + struct icmphdr _icmph; + ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph), + (void *)&_icmph); + if (unlikely(!ich)) + return NULL; + /* icmp v4 */ + icmp_type = ((struct icmphdr *)ich)->type; + icmp_code = ((struct icmphdr *)ich)->code; + if (!is_icmp_reply(icmp_type)) { + sport = ((struct icmphdr *)ich)->un.echo.id; + dport = icmp_type << 8 | icmp_code; + } else if (icmp_invert_type(&type, icmp_type)) { + sport = type << 8 | icmp_code; + dport = ((struct icmphdr *)ich)->un.echo.id; + } else { + return NULL; + } } return dp_vs_conn_get(iph->af, iph->proto, &iph->saddr, &iph->daddr, sport, dport, direct, reverse); } +static int icmp6_csum_handler(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, struct rte_mbuf *mbuf) +{ + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + struct icmp6_hdr *ich; + uint8_t ip6nxt = ip6h->ip6_nxt; + int offset = ip6_skip_exthdr(mbuf, sizeof(struct ip6_hdr), &ip6nxt); + + if (unlikely(mbuf_may_pull(mbuf, offset + sizeof(struct icmp6_hdr)) != 0)) + return EDPVS_INVPKT; + + ich = rte_pktmbuf_mtod_offset(mbuf, struct icmp6_hdr *, offset); + if (unlikely(!ich)) + return EDPVS_INVPKT; + + icmp6_send_csum(ip6h, ich); + + return EDPVS_OK; +} + static int icmp_state_trans(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf, int dir) { @@ -175,3 +270,17 @@ struct dp_vs_proto dp_vs_proto_icmp = { .conn_lookup = icmp_conn_lookup, .state_trans = icmp_state_trans, }; + +struct dp_vs_proto dp_vs_proto_icmp6 = { + .name = "ICMPV6", + .proto = IPPROTO_ICMPV6, + .conn_sched = icmp_conn_sched, + .conn_lookup = icmp_conn_lookup, + .nat_in_handler = icmp6_csum_handler, + .nat_out_handler = icmp6_csum_handler, + .fnat_in_handler = 
icmp6_csum_handler, + .fnat_out_handler = icmp6_csum_handler, + .snat_in_handler = icmp6_csum_handler, + .snat_out_handler = icmp6_csum_handler, + .state_trans = icmp_state_trans, +}; diff --git a/src/ipvs/ip_vs_proto_tcp.c b/src/ipvs/ip_vs_proto_tcp.c index ce6e724c7..3a78639d7 100644 --- a/src/ipvs/ip_vs_proto_tcp.c +++ b/src/ipvs/ip_vs_proto_tcp.c @@ -20,6 +20,8 @@ #include "common.h" #include "dpdk.h" #include "ipv4.h" +#include "ipv6.h" +#include "route6.h" #include "neigh.h" #include "ipvs/ipvs.h" #include "ipvs/proto.h" @@ -94,24 +96,65 @@ static struct tcp_state tcp_states[] = { static uint32_t tcp_secret; -/* if tcp header will be modified mbuf_header_pointer() cannot be used. */ +/* + * tcp_hdr: get the pointer to tcp header + * @af: address family + * @mbuf: message buffer from DPDK + * @return pointer to the tcp header + * + * if tcp header will be modified mbuf_header_pointer() cannot be used + */ inline struct tcphdr *tcp_hdr(const struct rte_mbuf *mbuf) { - int ip4hlen = ip4_hdrlen(mbuf); + int iphdrlen; + unsigned char version, *verp; + + verp = rte_pktmbuf_mtod(mbuf, unsigned char*); + version = (*verp >> 4) & 0xf; + + if (4 == version) { + iphdrlen = ip4_hdrlen(mbuf); + } else if (6 == version) { + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + uint8_t ip6nxt = ip6h->ip6_nxt; + iphdrlen = ip6_skip_exthdr(mbuf, sizeof(struct ip6_hdr), &ip6nxt); + if (iphdrlen < 0) + return NULL; + } else { + return NULL; + } /* do not support frags */ - if (unlikely(mbuf->data_len < ip4hlen + sizeof(struct tcphdr))) + if (unlikely(mbuf->data_len < iphdrlen + sizeof(struct tcphdr))) return NULL; - return rte_pktmbuf_mtod_offset(mbuf, struct tcphdr *, ip4hlen); + return rte_pktmbuf_mtod_offset(mbuf, struct tcphdr *, iphdrlen); } +/* + * tcp4_send_csum: compute checksum for tcp/udp ipv4 + * @iph: pointer to ipv4 header + * @th: pointer to the beginning of the L4 header + * @return void + */ inline void tcp4_send_csum(struct ipv4_hdr *iph, struct tcphdr *th) { th->check = 0; 
th->check = rte_ipv4_udptcp_cksum(iph, th); } +/* + * tcp6_send_csum: compute checksum for tcp ipv6 + * @iph: pointer to ipv6 header in dpdk ipv6_hdr format + * @th: pointer to the beginning of the L4 header + * @return void + */ +inline void tcp6_send_csum(struct ipv6_hdr *iph, struct tcphdr *th) { + th->check = 0; + th->check = ip6_udptcp_cksum((struct ip6_hdr *)iph, th, + (void *)th - (void *)iph, IPPROTO_TCP); +} + static inline uint32_t seq_scale(uint32_t seq) { struct timespec now; @@ -215,12 +258,15 @@ static inline int tcp_in_add_toa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, { uint32_t mtu; struct tcpopt_addr *toa; + uint32_t tcp_opt_len; + uint8_t *p, *q, *tail; struct route_entry *rt; - if (unlikely(conn->af != AF_INET)) + if (unlikely(conn->af != AF_INET && conn->af != AF_INET6)) return EDPVS_NOTSUPP; + tcp_opt_len = conn->af == AF_INET ? TCP_OLEN_IP4_ADDR : TCP_OLEN_IP6_ADDR; /* * check if we can add the new option */ @@ -234,14 +280,16 @@ static inline int tcp_in_add_toa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, return EDPVS_NOROUTE; } - if (unlikely(mbuf->pkt_len > (mtu - sizeof(struct tcpopt_addr)))) { - RTE_LOG(DEBUG, IPVS, "add toa: need fragment.\n"); + if (unlikely(mbuf->pkt_len > (mtu - tcp_opt_len))) { + RTE_LOG(DEBUG, IPVS, "add toa: need fragment, tcp opt len : %u.\n", + tcp_opt_len); return EDPVS_FRAG; } /* maximum TCP header is 60, and 40 for options */ - if (unlikely((60 - (tcph->doff << 2)) < sizeof(struct tcpopt_addr))) { - RTE_LOG(DEBUG, IPVS, "add toa: no TCP header room.\n"); + if (unlikely((60 - (tcph->doff << 2)) < tcp_opt_len)) { + RTE_LOG(DEBUG, IPVS, "add toa: no TCP header room, tcp opt len : %u.\n", + tcp_opt_len); return EDPVS_NOROOM; } @@ -249,9 +297,10 @@ static inline int tcp_in_add_toa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, * have to pull all bits in segments for later operation. 
*/ if (unlikely(mbuf_may_pull(mbuf, mbuf->pkt_len) != 0)) return EDPVS_INVPKT; - tail = (uint8_t *)rte_pktmbuf_append(mbuf, sizeof(struct tcpopt_addr)); + tail = (uint8_t *)rte_pktmbuf_append(mbuf, tcp_opt_len); if (unlikely(!tail)) { - RTE_LOG(DEBUG, IPVS, "add toa: no mbuf tail room.\n"); + RTE_LOG(DEBUG, IPVS, "add toa: no mbuf tail room, tcp opt len : %u.\n", + tcp_opt_len); return EDPVS_NOROOM; } @@ -263,7 +312,7 @@ static inline int tcp_in_add_toa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, * @p is last data byte, * @q is new position of last data byte */ p = tail - 1; - q = p + sizeof(struct tcpopt_addr); + q = p + tcp_opt_len; while (p >= ((uint8_t *)tcph + sizeof(struct tcphdr))) { *q = *p; p--, q--; @@ -272,16 +321,29 @@ static inline int tcp_in_add_toa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, /* insert toa right after TCP basic header */ toa = (struct tcpopt_addr *)(tcph + 1); toa->opcode = TCP_OPT_ADDR; - toa->opsize = TCP_OLEN_ADDR; + toa->opsize = tcp_opt_len; toa->port = conn->cport; - toa->addr = conn->caddr.in.s_addr; + + if (conn->af == AF_INET) { + struct tcpopt_ip4_addr *toa_ip4 = (struct tcpopt_ip4_addr *)(tcph + 1); + toa_ip4->addr = conn->caddr.in; + } + else { + struct tcpopt_ip6_addr *toa_ip6 = (struct tcpopt_ip6_addr *)(tcph + 1); + toa_ip6->addr = conn->caddr.in6; + } + /* reset tcp header length */ - tcph->doff += sizeof(struct tcpopt_addr) >> 2; + tcph->doff += tcp_opt_len >> 2; + /* reset ip header total length */ - ip4_hdr(mbuf)->total_length = - htons(ntohs(ip4_hdr(mbuf)->total_length) - + sizeof(struct tcpopt_addr)); + if (conn->af == AF_INET) + ip4_hdr(mbuf)->total_length = + htons(ntohs(ip4_hdr(mbuf)->total_length) + tcp_opt_len); + else + ip6_hdr(mbuf)->ip6_plen = + htons(ntohs(ip6_hdr(mbuf)->ip6_plen) + tcp_opt_len); /* tcp csum will be recalc later, * so as IP hdr csum since iph.tot_len has been chagned. 
*/ @@ -308,11 +370,17 @@ static void tcp_out_save_seq(struct rte_mbuf *mbuf, conn->rs_end_ack = th->ack_seq; } -static void tcp_out_adjust_mss(struct tcphdr *tcph) +static void tcp_out_adjust_mss(int af, struct tcphdr *tcph) { unsigned char *ptr; int length; + if (unlikely(af != AF_INET && af != AF_INET6)) { + RTE_LOG(DEBUG, IPVS, "adjust mss: unknow af, af : %d.\n", + af); + return ; + } + ptr = (unsigned char *)(tcph + 1); length = (tcph->doff << 2) - sizeof(struct tcphdr); @@ -335,7 +403,7 @@ static void tcp_out_adjust_mss(struct tcphdr *tcph) if ((opcode == TCP_OPT_MSS) && (opsize == TCP_OLEN_MSS)) { uint16_t in_mss = ntohs(*(__be16 *) ptr); - in_mss -= TCP_OLEN_ADDR; + in_mss -= (af == AF_INET ? TCP_OLEN_IP4_ADDR : TCP_OLEN_IP6_ADDR); /* set mss, 16bit */ *((uint16_t *) ptr) = htons(in_mss); @@ -532,11 +600,11 @@ tcp_conn_lookup(struct dp_vs_proto *proto, const struct dp_vs_iphdr *iph, if (conn != NULL) { if (th->ack) { if ((*direct == DPVS_CONN_DIR_INBOUND) && conn->out_dev - && (conn->out_nexthop.in.s_addr != htonl(INADDR_ANY))) { - neigh_confirm(conn->out_nexthop.in, conn->out_dev); + && (!inet_is_addr_any(conn->af, &conn->out_nexthop))) { + neigh_confirm(conn->af, &conn->out_nexthop, conn->out_dev); } else if ((*direct == DPVS_CONN_DIR_OUTBOUND) && conn->in_dev - && (conn->in_nexthop.in.s_addr != htonl(INADDR_ANY))) { - neigh_confirm(conn->in_nexthop.in, conn->in_dev); + && (!inet_is_addr_any(conn->af, &conn->in_nexthop))) { + neigh_confirm(conn->af, &conn->in_nexthop, conn->in_dev); } } } @@ -550,16 +618,17 @@ static int tcp_fnat_in_handler(struct dp_vs_proto *proto, struct tcphdr *th; struct route_entry *rt = mbuf->userdata; struct netif_port *dev = NULL; - int ip4hlen = ip4_hdrlen(mbuf); - - if (mbuf_may_pull(mbuf, ip4hlen + sizeof(*th)) != 0) + int af = conn->af; + int iphdrlen = ((AF_INET6 == af) ? 
ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); + + if (mbuf_may_pull(mbuf, iphdrlen + sizeof(*th)) != 0) return EDPVS_INVPKT; th = tcp_hdr(mbuf); if (unlikely(!th)) return EDPVS_INVPKT; - if (mbuf_may_pull(mbuf, ip4hlen + (th->doff<<2)) != 0) + if (mbuf_may_pull(mbuf, iphdrlen + (th->doff << 2)) != 0) return EDPVS_INVPKT; /* @@ -594,14 +663,26 @@ static int tcp_fnat_in_handler(struct dp_vs_proto *proto, dev = conn->in_dev; if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD))) { - mbuf->l4_len = ntohs(ip4_hdr(mbuf)->total_length) - ip4hlen; - mbuf->l3_len = ip4hlen; - mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4); - th->check = rte_ipv4_phdr_cksum(ip4_hdr(mbuf), mbuf->ol_flags); + if (AF_INET6 == af) { + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + mbuf->l3_len = iphdrlen; + mbuf->l4_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr) - iphdrlen; + mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IPV6); + th->check = ip6_phdr_cksum(ip6h, mbuf->ol_flags, iphdrlen, IPPROTO_TCP); + } else { + mbuf->l4_len = ntohs(ip4_hdr(mbuf)->total_length) - iphdrlen; + mbuf->l3_len = iphdrlen; + mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4); + th->check = rte_ipv4_phdr_cksum(ip4_hdr(mbuf), mbuf->ol_flags); + } } else { if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) return EDPVS_INVPKT; - tcp4_send_csum(ip4_hdr(mbuf), th); + if (AF_INET6 == af) { + tcp6_send_csum((struct ipv6_hdr *)ip6_hdr(mbuf), th); + } else { + tcp4_send_csum(ip4_hdr(mbuf), th); + } } return EDPVS_OK; @@ -613,16 +694,17 @@ static int tcp_fnat_out_handler(struct dp_vs_proto *proto, struct tcphdr *th; struct route_entry *rt = mbuf->userdata; struct netif_port *dev = NULL; - int ip4hlen = ip4_hdrlen(mbuf); + int af = conn->af; + int iphdrlen = ((AF_INET6 == af) ? 
ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); - if (mbuf_may_pull(mbuf, ip4hlen + sizeof(*th)) != 0) + if (mbuf_may_pull(mbuf, iphdrlen + sizeof(*th)) != 0) return EDPVS_INVPKT; th = tcp_hdr(mbuf); if (unlikely(!th)) return EDPVS_INVPKT; - if (mbuf_may_pull(mbuf, ip4hlen + (th->doff<<2)) != 0) + if (mbuf_may_pull(mbuf, iphdrlen + (th->doff<<2)) != 0) return EDPVS_INVPKT; /* save last seq/ack from RS for RST when conn expire */ @@ -633,7 +715,7 @@ static int tcp_fnat_out_handler(struct dp_vs_proto *proto, th->dest = conn->cport; if (th->syn && th->ack) - tcp_out_adjust_mss(th); + tcp_out_adjust_mss(af, th); /* adjust ACK/SACK from RS since inbound SEQ is changed */ if (tcp_out_adjust_seq(conn, th) != EDPVS_OK) @@ -648,14 +730,26 @@ static int tcp_fnat_out_handler(struct dp_vs_proto *proto, dev = conn->out_dev; if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD))) { - mbuf->l4_len = ntohs(ip4_hdr(mbuf)->total_length) - ip4hlen; - mbuf->l3_len = ip4hlen; - mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4); - th->check = rte_ipv4_phdr_cksum(ip4_hdr(mbuf), mbuf->ol_flags); + if (AF_INET6 == af) { + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + mbuf->l3_len = iphdrlen; + mbuf->l4_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr) - iphdrlen; + mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IPV6); + th->check = ip6_phdr_cksum(ip6h, mbuf->ol_flags, iphdrlen, IPPROTO_TCP); + } else { + mbuf->l4_len = ntohs(ip4_hdr(mbuf)->total_length) - iphdrlen; + mbuf->l3_len = iphdrlen; + mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4); + th->check = rte_ipv4_phdr_cksum(ip4_hdr(mbuf), mbuf->ol_flags); + } } else { if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) return EDPVS_INVPKT; - tcp4_send_csum(ip4_hdr(mbuf), th); + if (AF_INET6 == af) { + tcp6_send_csum((struct ipv6_hdr *)ip6_hdr(mbuf), th); + } else { + tcp4_send_csum(ip4_hdr(mbuf), th); + } } return EDPVS_OK; @@ -665,18 +759,19 @@ static int tcp_snat_in_handler(struct dp_vs_proto *proto, 
struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { struct tcphdr *th; - int ip4hlen = ip4_hdrlen(mbuf); struct netif_port *dev = NULL; struct route_entry *rt = mbuf->userdata; + int af = conn->af; + int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); - if (mbuf_may_pull(mbuf, ip4hlen + sizeof(*th)) != 0) + if (mbuf_may_pull(mbuf, iphdrlen + sizeof(*th)) != 0) return EDPVS_INVPKT; th = tcp_hdr(mbuf); if (unlikely(!th)) return EDPVS_INVPKT; - if (mbuf_may_pull(mbuf, ip4hlen + (th->doff<<2)) != 0) + if (mbuf_may_pull(mbuf, iphdrlen + (th->doff << 2)) != 0) return EDPVS_INVPKT; /* L4 translation */ @@ -688,14 +783,26 @@ static int tcp_snat_in_handler(struct dp_vs_proto *proto, /* leverage HW TX TCP csum offload if possible */ if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD))) { - mbuf->l4_len = ntohs(ip4_hdr(mbuf)->total_length) - ip4hlen; - mbuf->l3_len = ip4hlen; - mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4); - th->check = rte_ipv4_phdr_cksum(ip4_hdr(mbuf), mbuf->ol_flags); + if (AF_INET6 == af) { + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + mbuf->l3_len = iphdrlen; + mbuf->l4_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr) - iphdrlen; + mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IPV6); + th->check = ip6_phdr_cksum(ip6h, mbuf->ol_flags, iphdrlen, IPPROTO_TCP); + } else { + mbuf->l4_len = ntohs(ip4_hdr(mbuf)->total_length) - iphdrlen; + mbuf->l3_len = iphdrlen; + mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4); + th->check = rte_ipv4_phdr_cksum(ip4_hdr(mbuf), mbuf->ol_flags); + } } else { if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) return EDPVS_INVPKT; - tcp4_send_csum(ip4_hdr(mbuf), th); + if (AF_INET6 == af) { + tcp6_send_csum((struct ipv6_hdr *)ip6_hdr(mbuf), th); + } else { + tcp4_send_csum(ip4_hdr(mbuf), th); + } } return EDPVS_OK; @@ -705,18 +812,19 @@ static int tcp_snat_out_handler(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { struct tcphdr 
*th; - int ip4hlen = ip4_hdrlen(mbuf); struct netif_port *dev = NULL; struct route_entry *rt = mbuf->userdata; + int af = conn->af; + int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); - if (mbuf_may_pull(mbuf, ip4hlen + sizeof(*th)) != 0) + if (mbuf_may_pull(mbuf, iphdrlen + sizeof(*th)) != 0) return EDPVS_INVPKT; th = tcp_hdr(mbuf); if (unlikely(!th)) return EDPVS_INVPKT; - if (mbuf_may_pull(mbuf, ip4hlen + (th->doff<<2)) != 0) + if (mbuf_may_pull(mbuf, iphdrlen + (th->doff << 2)) != 0) return EDPVS_INVPKT; /* L4 translation */ @@ -728,14 +836,26 @@ static int tcp_snat_out_handler(struct dp_vs_proto *proto, /* leverage HW TX TCP csum offload if possible */ if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD))) { - mbuf->l4_len = ntohs(ip4_hdr(mbuf)->total_length) - ip4hlen; - mbuf->l3_len = ip4hlen; - mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4); - th->check = rte_ipv4_phdr_cksum(ip4_hdr(mbuf), mbuf->ol_flags); + if (AF_INET6 == af) { + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + mbuf->l3_len = iphdrlen; + mbuf->l4_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr) - iphdrlen; + mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IPV6); + th->check = ip6_phdr_cksum(ip6h, mbuf->ol_flags, iphdrlen, IPPROTO_TCP); + } else { + mbuf->l4_len = ntohs(ip4_hdr(mbuf)->total_length) - iphdrlen; + mbuf->l3_len = iphdrlen; + mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4); + th->check = rte_ipv4_phdr_cksum(ip4_hdr(mbuf), mbuf->ol_flags); + } } else { if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) return EDPVS_INVPKT; - tcp4_send_csum(ip4_hdr(mbuf), th); + if (AF_INET6 == af) { + tcp6_send_csum((struct ipv6_hdr *)ip6_hdr(mbuf), th); + } else { + tcp4_send_csum(ip4_hdr(mbuf), th); + } } return EDPVS_OK; @@ -772,12 +892,14 @@ static int tcp_state_trans(struct dp_vs_proto *proto, struct dp_vs_conn *conn, assert(proto && conn && mbuf); struct dp_vs_dest *dest = conn->dest; unsigned conn_timeout = 0; + int af = 
conn->af; #ifdef CONFIG_DPVS_IPVS_DEBUG char dbuf[64], cbuf[64]; const char *daddr, *caddr; #endif - th = mbuf_header_pointer(mbuf, ip4_hdrlen(mbuf), sizeof(_tcph), &_tcph); + int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); + th = mbuf_header_pointer(mbuf, iphdrlen, sizeof(_tcph), &_tcph); if (unlikely(!th)) return EDPVS_INVPKT; if (dest->fwdmode == DPVS_FWD_MODE_DR || dest->fwdmode == DPVS_FWD_MODE_TUNNEL) @@ -850,33 +972,54 @@ static int tcp_state_trans(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mempool *get_mbuf_pool(const struct dp_vs_conn *conn, int dir) { - struct flow4 fl4; struct netif_port *dev; - struct route_entry *rt = NULL; /* we need oif for correct rte_mempoll, * most likely oif is conn->in/out_dev (fast-xmit), * if not, determine output device by route. */ dev = ((dir == DPVS_CONN_DIR_INBOUND) ? conn->in_dev : conn->out_dev); if (unlikely(!dev)) { - memset(&fl4, 0, sizeof(struct flow4)); - if (dir == DPVS_CONN_DIR_INBOUND) { - fl4.saddr = conn->laddr.in; - fl4.daddr = conn->daddr.in; - fl4.sport = conn->lport; - fl4.dport = conn->dport; - } else { - fl4.saddr = conn->vaddr.in; - fl4.daddr = conn->caddr.in; - fl4.sport = conn->vport; - fl4.dport = conn->cport; + if (AF_INET == conn->af) { + struct route_entry *rt = NULL; + struct flow4 fl4; + memset(&fl4, 0, sizeof(struct flow4)); + if (dir == DPVS_CONN_DIR_INBOUND) { + fl4.fl4_saddr = conn->laddr.in; + fl4.fl4_daddr = conn->daddr.in; + fl4.fl4_sport = conn->lport; + fl4.fl4_dport = conn->dport; + } else { + fl4.fl4_saddr = conn->vaddr.in; + fl4.fl4_daddr = conn->caddr.in; + fl4.fl4_sport = conn->vport; + fl4.fl4_dport = conn->cport; + } + fl4.fl4_proto = IPPROTO_TCP; + if ((rt = route4_output(&fl4)) == NULL) + return NULL; + dev = rt->port; + route4_put(rt); + } else { /* AF_INET6 */ + struct route6 *rt6 = NULL; + struct flow6 fl6; + memset(&fl6, 0, sizeof(struct flow6)); + if (dir == DPVS_CONN_DIR_INBOUND) { + fl6.fl6_saddr = conn->laddr.in6; + 
fl6.fl6_daddr = conn->daddr.in6; + fl6.fl6_sport = conn->lport; + fl6.fl6_dport = conn->dport; + } else { + fl6.fl6_saddr = conn->vaddr.in6; + fl6.fl6_daddr = conn->caddr.in6; + fl6.fl6_sport = conn->vport; + fl6.fl6_dport = conn->cport; + } + fl6.fl6_proto = IPPROTO_TCP; + if ((rt6 = route6_output(NULL, &fl6)) == NULL) + return NULL; + dev = rt6->rt6_dev; + route6_put(rt6); } - - fl4.proto = IPPROTO_TCP; - if ((rt = route4_output(&fl4)) == NULL) - return NULL; - dev = rt->port; - route4_put(rt); } return dev->mbuf_pool; diff --git a/src/ipvs/ip_vs_proto_udp.c b/src/ipvs/ip_vs_proto_udp.c index 7070c856d..e8c9dcf2d 100644 --- a/src/ipvs/ip_vs_proto_udp.c +++ b/src/ipvs/ip_vs_proto_udp.c @@ -17,10 +17,12 @@ */ #include #include -#include +#include #include "common.h" #include "dpdk.h" #include "ipv4.h" +#include "ipv6.h" +#include "route6.h" #include "ipvs/ipvs.h" #include "ipvs/proto.h" #include "ipvs/proto_udp.h" @@ -59,6 +61,19 @@ static int udp_timeouts[DPVS_UDP_S_LAST + 1] = { [DPVS_UDP_S_LAST] = 2, }; +inline void udp4_send_csum(struct ipv4_hdr *iph, struct udphdr *uh) +{ + uh->check = 0; + uh->check = rte_ipv4_udptcp_cksum(iph, uh); +} + +inline void udp6_send_csum(struct ipv6_hdr *iph, struct udphdr *uh) +{ + uh->check = 0; + uh->check = ip6_udptcp_cksum((struct ip6_hdr *)iph, uh, + (void *)uh - (void *)iph, IPPROTO_UDP); +} + static int udp_conn_sched(struct dp_vs_proto *proto, const struct dp_vs_iphdr *iph, struct rte_mbuf *mbuf, @@ -140,10 +155,18 @@ udp_conn_lookup(struct dp_vs_proto *proto, * UDP has no ack, we don't know pkt from client is response or not * UDP can only confirm neighbour to RS */ + int af = iph->af; if (conn != NULL) { - if ((*direct == DPVS_CONN_DIR_OUTBOUND) && conn->in_dev - && (conn->in_nexthop.in.s_addr != htonl(INADDR_ANY))){ - neigh_confirm(conn->in_nexthop.in, conn->in_dev); + if (AF_INET6 == af) { + if ((*direct == DPVS_CONN_DIR_OUTBOUND) && conn->in_dev + && !ipv6_addr_any(&conn->in_nexthop.in6)) { + neigh_confirm(AF_INET6, 
&conn->in_nexthop, conn->in_dev); + } + } else { + if ((*direct == DPVS_CONN_DIR_OUTBOUND) && conn->in_dev + && (conn->in_nexthop.in.s_addr != htonl(INADDR_ANY))) { + neigh_confirm(AF_INET, &conn->in_nexthop, conn->in_dev); + } } } @@ -168,16 +191,16 @@ static int udp_state_trans(struct dp_vs_proto *proto, struct dp_vs_conn *conn, static int send_standalone_uoa(const struct dp_vs_conn *conn, const struct rte_mbuf *ombuf, - const struct iphdr *oiph, + const void *oiph, const struct udphdr *ouh, enum uoa_mode mode) { struct rte_mbuf *mbuf = NULL; - struct route_entry *rt; - struct iphdr *iph; + void *iph; struct udphdr *uh; struct ipopt_uoa *uoa = NULL; struct opphdr *opp; + int af = conn->af; assert(conn && ombuf && oiph && ouh && ombuf->userdata); @@ -189,29 +212,55 @@ static int send_standalone_uoa(const struct dp_vs_conn *conn, if (unlikely(!mbuf)) return EDPVS_NOMEM; + int ipolen_uoa = (AF_INET6 == af) ? IPOLEN_UOA_IPV6 : IPOLEN_UOA_IPV4; + /* don't copy any ip options from oiph, is it ok ? 
*/ - iph = (void *)rte_pktmbuf_append(mbuf, sizeof(struct iphdr)); - if (unlikely(!iph)) - goto no_room; - iph->version = 4; - iph->tos = oiph->tos; - iph->id = ip4_select_id((struct ipv4_hdr *)iph); - iph->frag_off = 0; - iph->ttl = oiph->ttl; - iph->saddr = conn->laddr.in.s_addr; - iph->daddr = conn->daddr.in.s_addr; + if (AF_INET6 == af) { + iph = (void *)rte_pktmbuf_append(mbuf, sizeof(struct ip6_hdr)); + if (unlikely(!iph)) + goto no_room; + ((struct ip6_hdr *)iph)->ip6_ctlun + = ((struct ip6_hdr *)oiph)->ip6_ctlun; + memcpy(&((struct ip6_hdr *)iph)->ip6_src, + &((struct ip6_hdr *)oiph)->ip6_src, + IPV6_ADDR_LEN_IN_BYTES); + memcpy(&((struct ip6_hdr *)iph)->ip6_dst, + &((struct ip6_hdr *)oiph)->ip6_dst, + IPV6_ADDR_LEN_IN_BYTES); + } else { + iph = (void *)rte_pktmbuf_append(mbuf, sizeof(struct iphdr)); + if (unlikely(!iph)) + goto no_room; + ((struct iphdr *)iph)->version = 4; + ((struct iphdr *)iph)->tos = ((struct iphdr *)oiph)->tos; + ((struct iphdr *)iph)->id = ip4_select_id((struct ipv4_hdr *)iph); + ((struct iphdr *)iph)->frag_off = 0; + ((struct iphdr *)iph)->ttl = ((struct iphdr *)oiph)->ttl; + ((struct iphdr *)iph)->saddr = conn->laddr.in.s_addr; + ((struct iphdr *)iph)->daddr = conn->daddr.in.s_addr; + } if (mode == UOA_M_IPO) { - iph->ihl = (sizeof(struct iphdr) + IPOLEN_UOA) / 4; - iph->tot_len = htons(sizeof(*iph) + IPOLEN_UOA + sizeof(*uh)); - iph->protocol = oiph->protocol; /* should always UDP */ - - uoa = (void *)rte_pktmbuf_append(mbuf, IPOLEN_UOA); - } else { /* UOA_M_OPP */ - iph->ihl = sizeof(struct iphdr) / 4; - iph->tot_len = \ - htons(sizeof(*iph) + sizeof(*opp) + sizeof(*uoa) + sizeof(*uh)); - iph->protocol = IPPROTO_OPT; + /* only ipv4 support and use this ip option mode */ + ((struct iphdr *)iph)->ihl = + (sizeof(struct iphdr) + IPOLEN_UOA_IPV4) / 4; + ((struct iphdr *)iph)->tot_len = + htons(sizeof(*iph) + IPOLEN_UOA_IPV4 + sizeof(*uh)); + ((struct iphdr *)iph)->protocol = ((struct iphdr *)oiph)->protocol; + + uoa = (void 
*)rte_pktmbuf_append(mbuf, ipolen_uoa); + } else { + /* UOA_M_OPP */ + if (AF_INET6 == af) { + ((struct ip6_hdr *)iph)->ip6_plen = + sizeof(*opp) + sizeof(*uoa) + sizeof(*uh); + ((struct ip6_hdr *)iph)->ip6_nxt = IPPROTO_OPT; + } else { + ((struct iphdr *)iph)->ihl = sizeof(struct iphdr) / 4; + ((struct iphdr *)iph)->tot_len = htons(sizeof(struct iphdr) + + sizeof(*opp) + sizeof(*uoa) + sizeof(*uh)); + ((struct iphdr *)iph)->protocol = IPPROTO_OPT; + } /* option-proto */ opp = (void *)rte_pktmbuf_append(mbuf, sizeof(*opp)); @@ -219,39 +268,61 @@ static int send_standalone_uoa(const struct dp_vs_conn *conn, goto no_room; memset(opp, 0, sizeof(*opp)); - opp->version = 0x01; - opp->protocol = oiph->protocol; - opp->length = htons(sizeof(*opp) + sizeof(*uoa)); + if (AF_INET6 == af) { + opp->version = OPPHDR_IPV6; + opp->protocol = IPPROTO_UDP; /* set to IPPROTO_UDP */ + } else { + opp->version = OPPHDR_IPV4; + opp->protocol = IPPROTO_UDP; + } + opp->length = htons(sizeof(*opp) + ipolen_uoa); - uoa = (void *)rte_pktmbuf_append(mbuf, sizeof(*uoa)); + uoa = (void *)rte_pktmbuf_append(mbuf, ipolen_uoa); } /* UOA option */ if (unlikely(!uoa)) goto no_room; - uoa->op_code = IPOPT_UOA; - uoa->op_len = IPOLEN_UOA; - uoa->op_port = ouh->source; - uoa->op_addr = oiph->saddr; + memset(uoa, 0, ipolen_uoa); + uoa->op_code = IPOPT_UOA; + uoa->op_len = ipolen_uoa; + uoa->op_port = ouh->source; + /* fix uoa->op_addr */ + if (AF_INET6 == af) { + memcpy(&uoa->op_addr, &((struct ip6_hdr *)oiph)->ip6_src, + IPV6_ADDR_LEN_IN_BYTES); + } else { + memcpy(&uoa->op_addr, &((struct iphdr *)oiph)->saddr, + IPV4_ADDR_LEN_IN_BYTES); + } /* udp header */ uh = (void *)rte_pktmbuf_append(mbuf, sizeof(struct udphdr)); if (unlikely(!uh)) goto no_room; - uh->source = conn->lport; - uh->dest = conn->dport; - uh->len = htons(sizeof(struct udphdr)); /* empty payload */ + + memset(uh, 0, sizeof(struct udphdr)); + uh->source = conn->lport; + uh->dest = conn->dport; + uh->len = htons(sizeof(struct udphdr)); 
/* empty payload */ /* udp checksum */ - uh->check = 0; /* rte_ipv4_udptcp_cksum fails if opp inserted. */ + uh->check = 0; /* rte_ipv4_udptcp_cksum fails if opp inserted. */ /* ip checksum will calc later */ - mbuf->userdata = rt = (struct route_entry *)ombuf->userdata; - route4_get(rt); - - return ipv4_local_out(mbuf); + if (AF_INET6 == af) { + struct route6 *rt6; + mbuf->userdata = rt6 = (struct route6*)ombuf->userdata; + route6_get(rt6); + return ip6_local_out(mbuf); + } else { /* IPv4 */ + struct route_entry *rt; + mbuf->userdata = rt = (struct route_entry *)ombuf->userdata; + route4_get(rt); + return ipv4_local_out(mbuf); + } no_room: if (mbuf) @@ -265,8 +336,9 @@ static int insert_ipopt_uoa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, struct iphdr *niph = NULL; struct ipopt_uoa *optuoa; - if ((ip4_hdrlen(mbuf) + sizeof(struct ipopt_uoa) > MAX_IPOPTLEN) || - (mbuf->pkt_len + sizeof(struct ipopt_uoa) > mtu)) + if ((ip4_hdrlen(mbuf) + sizeof(struct ipopt_uoa) > + sizeof(struct iphdr) + MAX_IPOPTLEN) + || (mbuf->pkt_len + sizeof(struct ipopt_uoa) > mtu)) goto standalone_uoa; /* @@ -276,7 +348,7 @@ static int insert_ipopt_uoa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, * otherwise move left parts (IP opts, UDP hdr and payloads). 
*/ if (likely(ntohs(iph->tot_len) >= (sizeof(struct iphdr) * 2))) { - niph = (struct iphdr *)rte_pktmbuf_prepend(mbuf, IPOLEN_UOA); + niph = (struct iphdr *)rte_pktmbuf_prepend(mbuf, IPOLEN_UOA_IPV4); if (unlikely(!niph)) goto standalone_uoa; @@ -290,23 +362,23 @@ static int insert_ipopt_uoa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, if (unlikely(mbuf_may_pull(mbuf, mbuf->pkt_len) != 0)) goto standalone_uoa; - ptr = (void *)rte_pktmbuf_append(mbuf, IPOLEN_UOA); + ptr = (void *)rte_pktmbuf_append(mbuf, IPOLEN_UOA_IPV4); if (unlikely(!ptr)) goto standalone_uoa; - memmove((void *)(iph + 1) + IPOLEN_UOA, iph + 1, + memmove((void *)(iph + 1) + IPOLEN_UOA_IPV4, iph + 1, ntohs(iph->tot_len) - sizeof(struct iphdr)); - uh = (void *)uh + IPOLEN_UOA; + uh = (void *)uh + IPOLEN_UOA_IPV4; } optuoa = (struct ipopt_uoa *)(niph + 1); optuoa->op_code = IPOPT_UOA; - optuoa->op_len = IPOLEN_UOA; + optuoa->op_len = IPOLEN_UOA_IPV4; optuoa->op_port = uh->source; - optuoa->op_addr = niph->saddr; + memcpy(&optuoa->op_addr, &niph->saddr, IPV4_ADDR_LEN_IN_BYTES); - niph->ihl += IPOLEN_UOA / 4; - niph->tot_len = htons(ntohs(niph->tot_len) + IPOLEN_UOA); + niph->ihl += IPOLEN_UOA_IPV4 / 4; + niph->tot_len = htons(ntohs(niph->tot_len) + IPOLEN_UOA_IPV4); /* UDP/IP checksum will recalc later*/ return EDPVS_OK; @@ -315,27 +387,54 @@ static int insert_ipopt_uoa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, return send_standalone_uoa(conn, mbuf, iph, uh, UOA_M_IPO); } +/* + * insert_opp_uoa: insert IPPROTO_OPT with uoa + * + * @iph: pointer to ip header, type of void * + * will be cast to struct iphdr * or struct ip6_hdr * according to af + * @uh: pointer to udp header + * @return insertion status + */ static int insert_opp_uoa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, - struct iphdr *iph, struct udphdr *uh, int mtu) + void *iph, struct udphdr *uh, int mtu) { - struct iphdr *niph; - struct opphdr *opph; - struct ipopt_uoa *uoa; + void *niph; + struct opphdr *opph = NULL; + struct 
ipopt_uoa *uoa = NULL; + int af = conn->af; + int iphdrlen = 0, iptot_len = 0, ipolen_uoa = 0; + if (AF_INET6 == af) { + /* + * iphdrlen: ipv6 total header length = basic header length (40 B) + + * ext header length + * iptot_len: ipv6 total length = basic header length (40 B) + + * payload length(including ext header) + */ + iphdrlen = ip6_hdrlen(mbuf); + iptot_len = sizeof(struct ip6_hdr) + + ntohs(((struct ip6_hdr *)iph)->ip6_plen); + ipolen_uoa = IPOLEN_UOA_IPV6; + } else { + iphdrlen = ip4_hdrlen(mbuf); + iptot_len = ntohs(((struct iphdr *)iph)->tot_len); + ipolen_uoa = IPOLEN_UOA_IPV4; + } - if (mbuf->pkt_len + sizeof(*opph) + IPOLEN_UOA > mtu) + if (mbuf->pkt_len + sizeof(*opph) + ipolen_uoa > mtu) goto standalone_uoa; /* - * new protocol in inserted after IPv4 header (including existing + * new protocol is inserted after IPv4/v6 header (including existing * options), and before UDP header. so unlike "ipo" mode, do not * need handle IPOPT_END coincide issue. */ - if (likely(ntohs(iph->tot_len) >= (iph->ihl<<2) * 2)) { - niph = (void *)rte_pktmbuf_prepend(mbuf, sizeof(*opph) + IPOLEN_UOA); + + if (likely(iptot_len >= iphdrlen * 2)) { + niph = (void *)rte_pktmbuf_prepend(mbuf, sizeof(*opph) + ipolen_uoa); if (unlikely(!niph)) goto standalone_uoa; - memmove(niph, iph, iph->ihl << 2); + memmove(niph, iph, iphdrlen); } else { unsigned char *ptr; @@ -344,32 +443,54 @@ static int insert_opp_uoa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, if (unlikely(mbuf_may_pull(mbuf, mbuf->pkt_len) != 0)) goto standalone_uoa; - ptr = (void *)rte_pktmbuf_append(mbuf, sizeof(*opph) + IPOLEN_UOA); + ptr = (void *)rte_pktmbuf_append(mbuf, sizeof(*opph) + ipolen_uoa); if (unlikely(!ptr)) goto standalone_uoa; - memmove((void *)iph + (iph->ihl << 2) + sizeof(*opph) + IPOLEN_UOA, - (void *)iph + (iph->ihl << 2), - ntohs(iph->tot_len) - (iph->ihl << 2)); + memmove((void *)iph + iphdrlen + sizeof(*opph) + ipolen_uoa, + (void *)iph + iphdrlen, + iptot_len - iphdrlen); - uh = (void *)uh 
+ sizeof(*opph) + IPOLEN_UOA; + uh = (void *)uh + sizeof(*opph) + ipolen_uoa; } - opph = (struct opphdr *)((void *)niph + (niph->ihl << 2)); + opph = (struct opphdr *)((void *)niph + iphdrlen); memset(opph, 0, sizeof(*opph)); - opph->version = 0x1; - opph->protocol = niph->protocol; - opph->length = htons(sizeof(*opph) + IPOLEN_UOA); - uoa = (void *)opph->options; - uoa->op_code = IPOPT_UOA; - uoa->op_len = IPOLEN_UOA; - uoa->op_port = uh->source; - uoa->op_addr = niph->saddr; + if (AF_INET6 == af) { + /* version 2 for ipv6 address family */ + uint8_t nexthdr = ((struct ip6_hdr *)niph)->ip6_nxt; + ip6_skip_exthdr(mbuf, sizeof(struct ip6_hdr), &nexthdr); + opph->version = OPPHDR_IPV6; + opph->protocol = nexthdr; + } else { + /* version 1 for ipv4 address family */ + opph->version = OPPHDR_IPV4; + opph->protocol = ((struct iphdr *)niph)->protocol; + } + opph->length = htons(sizeof(*opph) + ipolen_uoa); - niph->protocol = IPPROTO_OPT; - niph->tot_len = htons(ntohs(niph->tot_len) + sizeof(*opph) + IPOLEN_UOA); - /* UDP/IP checksum will recalc later*/ + uoa = (void *)opph->options; + memset(uoa, 0, sizeof(struct ipopt_uoa)); + uoa->op_code = IPOPT_UOA; + uoa->op_len = ipolen_uoa; + uoa->op_port = uh->source; + if (AF_INET6 == af) { + memcpy(&uoa->op_addr, &((struct ip6_hdr *)niph)->ip6_src, + IPV6_ADDR_LEN_IN_BYTES); + ((struct ip6_hdr *)niph)->ip6_nxt = IPPROTO_OPT; + /* Update ipv6 payload length */ + ((struct ip6_hdr *)niph)->ip6_plen = + htons(ntohs(((struct ip6_hdr *)niph)->ip6_plen) + + sizeof(*opph) + ipolen_uoa); + } else { + memcpy(&uoa->op_addr, &((struct iphdr *)niph)->saddr, + IPV4_ADDR_LEN_IN_BYTES); + ((struct iphdr *)niph)->protocol = IPPROTO_OPT; + /* UDP/IP checksum will recalc later*/ + ((struct iphdr *)niph)->tot_len = + htons(iptot_len + sizeof(*opph) + ipolen_uoa); + } return EDPVS_OK; @@ -380,10 +501,13 @@ static int insert_opp_uoa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, static int udp_insert_uoa(struct dp_vs_conn *conn, struct rte_mbuf 
*mbuf, struct conn_uoa *uoa) { - struct iphdr *iph = (struct iphdr *)ip4_hdr(mbuf); - struct route_entry *rt = NULL; + void *rt = NULL; struct udphdr *uh = NULL; - int err; + void *iph = NULL; + int af = conn->af; + int iphdrlen = 0; + int err = EDPVS_OK; + int mtu; /* already send enough UOA */ if (uoa->state == UOA_S_DONE) @@ -396,15 +520,25 @@ static int udp_insert_uoa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, return EDPVS_OK; } - /* get udp header before any 'standalone_uoa' */ - uh = rte_pktmbuf_mtod_offset(mbuf, struct udphdr *, ip4_hdrlen(mbuf)); - rt = mbuf->userdata; if (!rt) { RTE_LOG(ERR, IPVS, "%s: no route\n", __func__); return EDPVS_INVPKT; } + if (AF_INET6 == af) { + mtu = ((struct route6*)rt)->rt6_mtu; + iph = ip6_hdr(mbuf); + iphdrlen = ip6_hdrlen(mbuf); + } else { + mtu = ((struct route_entry*) rt)->mtu; + iph = (struct iphdr *)ip4_hdr(mbuf); + iphdrlen = ip4_hdrlen(mbuf); + } + + /* get udp header before any 'standalone_uoa' */ + uh = rte_pktmbuf_mtod_offset(mbuf, struct udphdr *, iphdrlen); + /* * send standalone (empty-payload) UDP/IP pkt with UOA if * no room in IP header or exceeding MTU. @@ -416,16 +550,21 @@ static int udp_insert_uoa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, * is coincide, the IPOPT_END should already exist. 
*/ switch (g_uoa_mode) { - case UOA_M_IPO: - err = insert_ipopt_uoa(conn, mbuf, iph, uh, rt->mtu); - break; - - case UOA_M_OPP: - err = insert_opp_uoa(conn, mbuf, iph, uh, rt->mtu); - break; - - default: - return EDPVS_INVAL; + case UOA_M_IPO: + /* only ipv4 support ipopt mode */ + if (AF_INET == af) { + err = insert_ipopt_uoa(conn, mbuf, (struct iphdr *)iph, uh, mtu); + } else { + RTE_LOG(WARNING, IPVS, "fail to send UOA: %s\n", dpvs_strerror(err)); + } + break; + + case UOA_M_OPP: + err = insert_opp_uoa(conn, mbuf, iph, uh, mtu); + break; + + default: + return EDPVS_INVAL; } if (err == EDPVS_OK) @@ -441,28 +580,41 @@ static int udp_fnat_in_handler(struct dp_vs_proto *proto, struct rte_mbuf *mbuf) { struct udp_hdr *uh = NULL; - struct iphdr *iph = (void *)ip4_hdr(mbuf); struct opphdr *opp = NULL; + void *iph = NULL; + int af = conn->af; + int iphdrlen = 0; + uint8_t nxt_proto; + + if (AF_INET6 == af) { + iph = ip6_hdr(mbuf); + iphdrlen = ip6_hdrlen(mbuf); + /* need found the last ip6_nxt of the ext header */ + uint8_t nexthdr = ((struct ip6_hdr *)iph)->ip6_nxt; + ip6_skip_exthdr(mbuf, sizeof(struct ip6_hdr), &nexthdr); + nxt_proto = nexthdr; + } else { + iph = ip4_hdr(mbuf); + iphdrlen = ip4_hdrlen(mbuf); + nxt_proto = ((struct iphdr *)iph)->protocol; + } /* cannot use mbuf_header_pointer() */ - if (unlikely(mbuf->data_len < ip4_hdrlen(mbuf) + sizeof(struct udp_hdr))) + if (unlikely(mbuf->data_len < iphdrlen + sizeof(struct udp_hdr))) return EDPVS_INVPKT; - if (iph->protocol == IPPROTO_UDP) { - uh = (void *)iph + ip4_hdrlen(mbuf); - } else if (iph->protocol == IPPROTO_OPT) { - opp = (void *)iph + ip4_hdrlen(mbuf); - - uh = (void *)opp + ntohs(opp->length); + if (nxt_proto == IPPROTO_UDP) { + uh = (struct udp_hdr *)(iph + iphdrlen); + } else if (nxt_proto == IPPROTO_OPT) { + opp = (struct opphdr *)(iph + iphdrlen); + uh = (struct udp_hdr *)((void *)opp + ntohs(opp->length)); } if (unlikely(!uh)) return EDPVS_INVPKT; - uh->src_port = conn->lport; - uh->dst_port = 
conn->dport; - - uh->dgram_cksum = 0; + uh->src_port = conn->lport; + uh->dst_port = conn->dport; /* * XXX: UDP pseudo header need UDP length, but the common helper function @@ -474,8 +626,17 @@ static int udp_fnat_in_handler(struct dp_vs_proto *proto, * However, UDP checksum is not mandatory, to make things easier, when OPP * header exist, we just not calc UDP checksum. */ - if (!opp) - uh->dgram_cksum = rte_ipv4_udptcp_cksum(ip4_hdr(mbuf), uh); + if (!opp) { + if (AF_INET6 == af) { + udp6_send_csum((struct ipv6_hdr *)ip6_hdr(mbuf), (struct udphdr*)uh); + } else { + udp4_send_csum(ip4_hdr(mbuf), (struct udphdr *)uh); + } + } + /* FIXME: + * 1. IPv6 UDP checksum is a must, packets with OPP header also need checksum. + * 2. UDP checksum offload is to be supported. + */ return EDPVS_OK; } @@ -485,19 +646,24 @@ static int udp_fnat_out_handler(struct dp_vs_proto *proto, struct rte_mbuf *mbuf) { struct udp_hdr *uh; + int af = conn->af; + int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); /* cannot use mbuf_header_pointer() */ - if (unlikely(mbuf->data_len < ip4_hdrlen(mbuf) + sizeof(struct udp_hdr))) + if (unlikely(mbuf->data_len < iphdrlen + sizeof(struct udp_hdr))) return EDPVS_INVPKT; - uh = rte_pktmbuf_mtod_offset(mbuf, struct udp_hdr *, ip4_hdrlen(mbuf)); + uh = rte_pktmbuf_mtod_offset(mbuf, struct udp_hdr *, iphdrlen); if (unlikely(!uh)) return EDPVS_INVPKT; - uh->src_port = conn->vport; - uh->dst_port = conn->cport; + uh->src_port = conn->vport; + uh->dst_port = conn->cport; - uh->dgram_cksum = 0; - uh->dgram_cksum = rte_ipv4_udptcp_cksum(ip4_hdr(mbuf), uh); + if (AF_INET6 == af) { + udp6_send_csum((struct ipv6_hdr *)ip6_hdr(mbuf), (struct udphdr*)uh); + } else { + udp4_send_csum(ip4_hdr(mbuf), (struct udphdr *)uh); + } return EDPVS_OK; } @@ -519,18 +685,23 @@ static int udp_snat_in_handler(struct dp_vs_proto *proto, struct rte_mbuf *mbuf) { struct udp_hdr *uh; + int af = conn->af; + int iphdrlen = ((AF_INET6 == af) ? 
ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); /* cannot use mbuf_header_pointer() */ - if (unlikely(mbuf->data_len < ip4_hdrlen(mbuf) + sizeof(struct udp_hdr))) + if (unlikely(mbuf->data_len < iphdrlen + sizeof(struct udp_hdr))) return EDPVS_INVPKT; - uh = rte_pktmbuf_mtod_offset(mbuf, struct udp_hdr *, ip4_hdrlen(mbuf)); + uh = rte_pktmbuf_mtod_offset(mbuf, struct udp_hdr *, iphdrlen); if (unlikely(!uh)) return EDPVS_INVPKT; uh->dst_port = conn->dport; - uh->dgram_cksum = 0; - uh->dgram_cksum = rte_ipv4_udptcp_cksum(ip4_hdr(mbuf), uh); + if (AF_INET6 == af) { + udp6_send_csum((struct ipv6_hdr *)ip6_hdr(mbuf), (struct udphdr*)uh); + } else { + udp4_send_csum(ip4_hdr(mbuf), (struct udphdr *)uh); + } return EDPVS_OK; } @@ -540,18 +711,23 @@ static int udp_snat_out_handler(struct dp_vs_proto *proto, struct rte_mbuf *mbuf) { struct udp_hdr *uh; + int af = conn->af; + int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); /* cannot use mbuf_header_pointer() */ - if (unlikely(mbuf->data_len < ip4_hdrlen(mbuf) + sizeof(struct udp_hdr))) + if (unlikely(mbuf->data_len < iphdrlen + sizeof(struct udp_hdr))) return EDPVS_INVPKT; - uh = rte_pktmbuf_mtod_offset(mbuf, struct udp_hdr *, ip4_hdrlen(mbuf)); + uh = rte_pktmbuf_mtod_offset(mbuf, struct udp_hdr *, iphdrlen); if (unlikely(!uh)) return EDPVS_INVPKT; uh->src_port = conn->vport; - uh->dgram_cksum = 0; - uh->dgram_cksum = rte_ipv4_udptcp_cksum(ip4_hdr(mbuf), uh); + if (AF_INET6 == af) { + udp6_send_csum((struct ipv6_hdr *)ip6_hdr(mbuf), (struct udphdr*)uh); + } else { + udp4_send_csum(ip4_hdr(mbuf), (struct udphdr *)uh); + } return EDPVS_OK; } diff --git a/src/ipvs/ip_vs_rr.c b/src/ipvs/ip_vs_rr.c index 3fdf14ae4..962feeb4f 100644 --- a/src/ipvs/ip_vs_rr.c +++ b/src/ipvs/ip_vs_rr.c @@ -34,7 +34,7 @@ static int dp_vs_rr_update_svc(struct dp_vs_service *svc) * Round-Robin Scheduling */ static struct dp_vs_dest *dp_vs_rr_schedule(struct dp_vs_service *svc, - const struct rte_mbuf *mbuf) + const struct rte_mbuf *mbuf) { 
struct list_head *p, *q; struct dp_vs_dest *dest; diff --git a/src/ipvs/ip_vs_sched.c b/src/ipvs/ip_vs_sched.c index 80c5faec7..6ea1193ae 100644 --- a/src/ipvs/ip_vs_sched.c +++ b/src/ipvs/ip_vs_sched.c @@ -36,7 +36,7 @@ static rte_rwlock_t __dp_vs_sched_lock; * Bind a service with a scheduler */ int dp_vs_bind_scheduler(struct dp_vs_service *svc, - struct dp_vs_scheduler *scheduler) + struct dp_vs_scheduler *scheduler) { int ret; diff --git a/src/ipvs/ip_vs_service.c b/src/ipvs/ip_vs_service.c index f0f99f260..7156d574c 100644 --- a/src/ipvs/ip_vs_service.c +++ b/src/ipvs/ip_vs_service.c @@ -19,6 +19,7 @@ #include #include "inet.h" #include "ipv4.h" +#include "ipv6.h" #include "ipvs/service.h" #include "ipvs/dest.h" #include "ipvs/sched.h" @@ -26,6 +27,7 @@ #include "ipvs/blklst.h" #include "ctrl.h" #include "route.h" +#include "route6.h" #include "netif.h" #include "assert.h" #include "neigh.h" @@ -47,8 +49,15 @@ static struct list_head dp_vs_svc_match_list; static inline unsigned dp_vs_svc_hashkey(int af, unsigned proto, const union inet_addr *addr) { - /* now IPv4 only */ - uint32_t addr_fold = addr->in.s_addr; + uint32_t addr_fold; + + addr_fold = inet_addr_fold(af, addr); + + if (!addr_fold) { + RTE_LOG(DEBUG, SERVICE, "%s: IP proto not support.\n", __func__); + return 0; + } + return (proto ^ rte_be_to_cpu_32(addr_fold)) & DP_VS_SVC_TAB_MASK; } @@ -104,7 +113,8 @@ static int dp_vs_svc_unhash(struct dp_vs_service *svc) } struct dp_vs_service *__dp_vs_service_get(int af, uint16_t protocol, - const union inet_addr *vaddr, uint16_t vport) + const union inet_addr *vaddr, + uint16_t vport) { unsigned hash; struct dp_vs_service *svc; @@ -146,21 +156,28 @@ static inline bool __svc_in_range(int af, const union inet_addr *addr, __be16 port, const struct inet_addr_range *range) { - if (unlikely(af != AF_INET)) + if (unlikely((af == AF_INET) && + (ntohl(range->min_addr.in.s_addr) > ntohl(range->max_addr.in.s_addr)))) return false; - if 
(unlikely(ntohl(range->min_addr.in.s_addr) > \ - ntohl(range->max_addr.in.s_addr))) + if (unlikely((af == AF_INET6) && + ipv6_addr_cmp(&range->min_addr.in6, &range->max_addr.in6) > 0)) return false; if (unlikely(ntohs(range->min_port) > ntohs(range->max_port))) return false; /* if both min/max are zero, means need not check. */ - if (range->max_addr.in.s_addr != htonl(INADDR_ANY)) { - if (ntohl(addr->in.s_addr) < ntohl(range->min_addr.in.s_addr) || - ntohl(addr->in.s_addr) > ntohl(range->max_addr.in.s_addr)) - return false; + if (!inet_is_addr_any(af, &range->max_addr)) { + if (af == AF_INET) { + if (ntohl(addr->in.s_addr) < ntohl(range->min_addr.in.s_addr) || + ntohl(addr->in.s_addr) > ntohl(range->max_addr.in.s_addr)) + return false; + } else { + if (ipv6_addr_cmp(&range->min_addr.in6, &addr->in6) > 0 || + ipv6_addr_cmp(&range->max_addr.in6, &addr->in6) < 0) + return false; + } } if (range->max_port != 0) { @@ -173,7 +190,7 @@ static inline bool __svc_in_range(int af, } static struct dp_vs_service * -__dp_vs_svc_match_get(int af, const struct rte_mbuf *mbuf) +__dp_vs_svc_match_get4(const struct rte_mbuf *mbuf) { struct route_entry *rt = mbuf->userdata; struct ipv4_hdr *iph = ip4_hdr(mbuf); /* ipv4 only */ @@ -188,36 +205,112 @@ __dp_vs_svc_match_get(int af, const struct rte_mbuf *mbuf) if (!ports) return NULL; + /* snat is handled at pre-routing to check if oif + * is match perform route here. 
*/ + if (rt) { + if ((rt->flag & RTF_KNI) || (rt->flag & RTF_LOCALIN)) + return NULL; + oif = rt->port->id; + } else { + rt = route4_input(mbuf, &daddr.in, &saddr.in, + iph->type_of_service, + netif_port_get(mbuf->port)); + if (!rt) + return NULL; + if ((rt->flag & RTF_KNI) || (rt->flag & RTF_LOCALIN)) { + route4_put(rt); + return NULL; + } + oif = rt->port->id; + route4_put(rt); + } + list_for_each_entry(svc, &dp_vs_svc_match_list, m_list) { struct dp_vs_match *m = svc->match; struct netif_port *idev, *odev; assert(m); - /* snat is handled at pre-routing to check if oif - * is match perform route here. */ - if (strlen(m->oifname)) { - if (!rt) { - rt = route4_input(mbuf, &daddr.in, &saddr.in, - iph->type_of_service, - netif_port_get(mbuf->port)); - if (!rt) - return NULL; - - /* set mbuf->userdata to @rt as side-effect is not good! - * although route will done again when out-xmit. */ - oif = rt->port->id; - route4_put(rt); - } else { - oif = rt->port->id; - } + if (!strlen(m->oifname)) + oif = NETIF_PORT_ID_ALL; + + idev = netif_port_get_by_name(m->iifname); + odev = netif_port_get_by_name(m->oifname); + + if (svc->af == AF_INET && svc->proto == iph->next_proto_id && + __svc_in_range(AF_INET, &saddr, ports[0], &m->srange) && + __svc_in_range(AF_INET, &daddr, ports[1], &m->drange) && + (!idev || idev->id == mbuf->port) && + (!odev || odev->id == oif) + ) { + rte_atomic32_inc(&svc->usecnt); + return svc; + } + } + + return NULL; +} + +static struct dp_vs_service * +__dp_vs_svc_match_get6(const struct rte_mbuf *mbuf) +{ + struct route6 *rt = mbuf->userdata; + struct ip6_hdr *iph = ip6_hdr(mbuf); + uint8_t ip6nxt = iph->ip6_nxt; + struct dp_vs_service *svc; + union inet_addr saddr, daddr; + __be16 _ports[2], *ports; + portid_t oif = NETIF_PORT_ID_ALL; + + struct flow6 fl6 = { + .fl6_iif = NULL, + .fl6_daddr = iph->ip6_dst, + .fl6_saddr = iph->ip6_src, + .fl6_proto = iph->ip6_nxt, + }; + + saddr.in6 = iph->ip6_src; + daddr.in6 = iph->ip6_dst; + ports = 
mbuf_header_pointer(mbuf, ip6_hdrlen(mbuf), sizeof(_ports), _ports); + if (!ports) + return NULL; + + /* snat is handled at pre-routing to check if oif + * is match perform route here. */ + if (rt) { + if ((rt->rt6_flags & RTF_KNI) || (rt->rt6_flags & RTF_LOCALIN)) + return NULL; + oif = rt->rt6_dev->id; + } else { + rt = route6_input(mbuf, &fl6); + if (!rt) + return NULL; + + /* set mbuf->userdata to @rt as side-effect is not good! + * although route will done again when out-xmit. */ + if ((rt->rt6_flags & RTF_KNI) || (rt->rt6_flags & RTF_LOCALIN)) { + route6_put(rt); + return NULL; } + oif = rt->rt6_dev->id; + route6_put(rt); + } + + list_for_each_entry(svc, &dp_vs_svc_match_list, m_list) { + struct dp_vs_match *m = svc->match; + struct netif_port *idev, *odev; + assert(m); + + if (!strlen(m->oifname)) + oif = NETIF_PORT_ID_ALL; idev = netif_port_get_by_name(m->iifname); odev = netif_port_get_by_name(m->oifname); - if (svc->af == af && svc->proto == iph->next_proto_id && - __svc_in_range(af, &saddr, ports[0], &m->srange) && - __svc_in_range(af, &daddr, ports[1], &m->drange) && + ip6_skip_exthdr(mbuf, sizeof(struct ip6_hdr), &ip6nxt); + + if (svc->af == AF_INET6 && svc->proto == ip6nxt && + __svc_in_range(AF_INET6, &saddr, ports[0], &m->srange) && + __svc_in_range(AF_INET6, &daddr, ports[1], &m->drange) && (!idev || idev->id == mbuf->port) && (!odev || odev->id == oif) ) { @@ -229,7 +322,18 @@ __dp_vs_svc_match_get(int af, const struct rte_mbuf *mbuf) return NULL; } -int dp_vs_match_parse(int af, const char *srange, const char *drange, +static struct dp_vs_service * +__dp_vs_svc_match_get(int af, const struct rte_mbuf *mbuf) +{ + if (af == AF_INET) + return __dp_vs_svc_match_get4(mbuf); + else if (af == AF_INET6) + return __dp_vs_svc_match_get6(mbuf); + else + return NULL; +} + +int dp_vs_match_parse(const char *srange, const char *drange, const char *iifname, const char *oifname, struct dp_vs_match *match) { @@ -238,13 +342,13 @@ int dp_vs_match_parse(int af, 
const char *srange, const char *drange, memset(match, 0, sizeof(*match)); if (srange && strlen(srange)) { - err = inet_addr_range_parse(AF_INET, srange, &match->srange); + err = inet_addr_range_parse(srange, &match->srange, &match->af); if (err != EDPVS_OK) return err; } if (drange && strlen(drange)) { - err = inet_addr_range_parse(AF_INET, drange, &match->drange); + err = inet_addr_range_parse(drange, &match->drange, &match->af); if (err != EDPVS_OK) return err; } @@ -310,7 +414,7 @@ struct dp_vs_service *dp_vs_service_lookup(int af, uint16_t protocol, struct dp_vs_service *dp_vs_lookup_vip(int af, uint16_t protocol, - const union inet_addr *vaddr) + const union inet_addr *vaddr) { struct dp_vs_service *svc; unsigned hash; @@ -353,15 +457,15 @@ void __dp_vs_unbind_svc(struct dp_vs_dest *dest) } int dp_vs_add_service(struct dp_vs_service_conf *u, - struct dp_vs_service **svc_p) + struct dp_vs_service **svc_p) { int ret = 0; int size; struct dp_vs_scheduler *sched = NULL; struct dp_vs_service *svc = NULL; - if (!u->fwmark && !u->addr.in.s_addr && !u->port && - is_empty_match(&u->match)) { + if (!u->fwmark && inet_is_addr_any(u->af, &u->addr) + && !u->port && is_empty_match(&u->match)) { RTE_LOG(ERR, SERVICE, "%s: adding empty servive\n", __func__); return EDPVS_INVAL; } @@ -419,8 +523,8 @@ int dp_vs_add_service(struct dp_vs_service_conf *u, ret = dp_vs_new_stats(&(svc->stats)); if(ret) goto out_err; - if(svc->af == AF_INET) - dp_vs_num_services++; + + dp_vs_num_services++; rte_rwlock_write_lock(&__dp_vs_svc_lock); dp_vs_svc_hash(svc); @@ -457,12 +561,10 @@ dp_vs_edit_service(struct dp_vs_service *svc, struct dp_vs_service_conf *u) } old_sched = sched; -#ifdef CONFIG_IP_VS_IPV6 if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { - ret = -EINVAL; + ret = EDPVS_INVAL; goto out; } -#endif rte_rwlock_write_lock(&__dp_vs_svc_lock); @@ -511,12 +613,9 @@ dp_vs_edit_service(struct dp_vs_service *svc, struct dp_vs_service_conf *u) } } - out_unlock: +out_unlock: 
rte_rwlock_write_unlock(&__dp_vs_svc_lock); -#ifdef CONFIG_IP_VS_IPV6 - out: -#endif - +out: return ret; } @@ -526,8 +625,7 @@ static void __dp_vs_del_service(struct dp_vs_service *svc) struct dp_vs_dest *dest, *nxt; /* Count only IPv4 services for old get/setsockopt interface */ - if (svc->af == AF_INET) - dp_vs_num_services--; + dp_vs_num_services--; /* Unbind scheduler */ dp_vs_unbind_scheduler(svc); @@ -587,8 +685,9 @@ dp_vs_copy_service(struct dp_vs_service_entry *dst, struct dp_vs_service *src) struct dp_vs_match *m; memset(dst, 0, sizeof(*dst)); + dst->af = src->af; dst->proto = src->proto; - dst->addr = src->addr.in.s_addr; + dst->addr = src->addr; dst->port = src->port; dst->fwmark = src->fwmark; snprintf(dst->sched_name, sizeof(dst->sched_name), @@ -606,8 +705,8 @@ dp_vs_copy_service(struct dp_vs_service_entry *dst, struct dp_vs_service *src) if (!m) return err; - inet_addr_range_dump(AF_INET, &m->srange, dst->srange, sizeof(dst->srange)); - inet_addr_range_dump(AF_INET, &m->drange, dst->drange, sizeof(dst->drange)); + inet_addr_range_dump(m->af, &m->srange, dst->srange, sizeof(dst->srange)); + inet_addr_range_dump(m->af, &m->drange, dst->drange, sizeof(dst->drange)); snprintf(dst->iifname, sizeof(dst->iifname), "%s", m->iifname); snprintf(dst->oifname, sizeof(dst->oifname), "%s", m->oifname); @@ -624,8 +723,6 @@ int dp_vs_get_service_entries(const struct dp_vs_get_services *get, for (idx = 0; idx < DP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &dp_vs_svc_table[idx], s_list){ - if (svc->af != AF_INET) - continue; if (count >= get->num_services) goto out; ret = dp_vs_copy_service(&uptr->entrytable[count], svc); @@ -637,9 +734,6 @@ int dp_vs_get_service_entries(const struct dp_vs_get_services *get, for (idx = 0; idx < DP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &dp_vs_svc_fwm_table[idx], f_list) { - /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET) - continue; if (count >= get->num_services) goto out; ret = 
dp_vs_copy_service(&uptr->entrytable[count], svc); @@ -650,8 +744,6 @@ int dp_vs_get_service_entries(const struct dp_vs_get_services *get, } list_for_each_entry(svc, &dp_vs_svc_match_list, m_list) { - if (svc->af != AF_INET) - continue; if (count >= get->num_services) goto out; ret = dp_vs_copy_service(&uptr->entrytable[count], svc); @@ -773,9 +865,10 @@ int dp_vs_zero_all(void) static int dp_vs_copy_usvc_compat(struct dp_vs_service_conf *conf, struct dp_vs_service_user *user) { - conf->af = AF_INET; + int err; + conf->af = user->af; conf->protocol = user->proto; - conf->addr.in.s_addr = user->addr; + conf->addr = user->addr; conf->port = user->port; conf->fwmark = user->fwmark; @@ -789,27 +882,32 @@ static int dp_vs_copy_usvc_compat(struct dp_vs_service_conf *conf, conf->bps = user->bps; conf->limit_proportion = user->limit_proportion; - return dp_vs_match_parse(AF_INET, user->srange, user->drange, - user->iifname, user->oifname, &conf->match); + err = dp_vs_match_parse(user->srange, user->drange, + user->iifname, user->oifname, &conf->match); + if (conf->match.af) + conf->af = conf->match.af; + + return err; } static void dp_vs_copy_udest_compat(struct dp_vs_dest_conf *udest, struct dp_vs_dest_user *udest_compat) { - udest->addr.in.s_addr = udest_compat->addr; - udest->port = udest_compat->port; - udest->fwdmode = udest_compat->conn_flags;//make sure fwdmode and conn_flags are the same + udest->af = udest_compat->af; + udest->addr = udest_compat->addr; + udest->port = udest_compat->port; + udest->fwdmode = udest_compat->conn_flags;//make sure fwdmode and conn_flags are the same udest->conn_flags = udest_compat->conn_flags; - udest->weight = udest_compat->weight; - udest->max_conn = udest_compat->max_conn; - udest->min_conn = udest_compat->min_conn; + udest->weight = udest_compat->weight; + udest->max_conn = udest_compat->max_conn; + udest->min_conn = udest_compat->min_conn; } static int gratuitous_arp_send_vip(struct in_addr *vip) { struct route_entry 
*local_route; - local_route = route_out_local_lookup(vip->s_addr); + local_route = route_out_local_lookup(vip->s_addr); if(local_route){ neigh_gratuitous_arp(&local_route->dest, local_route->port); route4_put(local_route); @@ -835,8 +933,8 @@ static int dp_vs_set_svc(sockoptid_t opt, const void *user, size_t len) } if (opt == DPVS_SO_SET_FLUSH) return dp_vs_flush(); - memcpy(arg, user, len); + memcpy(arg, user, len); usvc_compat = (struct dp_vs_service_user *)arg; udest_compat = (struct dp_vs_dest_user *)(usvc_compat + 1); @@ -845,7 +943,8 @@ static int dp_vs_set_svc(sockoptid_t opt, const void *user, size_t len) return ret; if (opt == DPVS_SO_SET_ZERO) { - if(!usvc.fwmark && !usvc.addr.in.s_addr && !usvc.port && + if(!inet_is_addr_any(usvc.af, &usvc.addr) && + !usvc.fwmark && !usvc.port && is_empty_match(&usvc.match) ) { return dp_vs_zero_all(); @@ -853,12 +952,12 @@ static int dp_vs_set_svc(sockoptid_t opt, const void *user, size_t len) } if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP && - usvc.protocol != IPPROTO_ICMP) { + usvc.protocol != IPPROTO_ICMP && usvc.protocol != IPPROTO_ICMPV6) { RTE_LOG(ERR, SERVICE, "%s: protocol not support.\n", __func__); return EDPVS_INVAL; } - if (usvc.addr.in.s_addr || usvc.port) + if (!inet_is_addr_any(usvc.af, &usvc.addr) || usvc.port) svc = __dp_vs_service_get(usvc.af, usvc.protocol, &usvc.addr, usvc.port); else if (usvc.fwmark) @@ -945,18 +1044,17 @@ static int dp_vs_get_svc(sockoptid_t opt, const void *user, size_t len, void **o { struct dp_vs_get_services *get, *output; int size; - get = (struct dp_vs_get_services*)user; + get = (struct dp_vs_get_services *)user; size = sizeof(*get) + \ sizeof(struct dp_vs_service_entry) * (get->num_services); - //memcpy(&get, user, size); - if(len != size){ + if(len != sizeof(*get)){ *outlen = 0; return EDPVS_INVAL; } - output = rte_zmalloc("get_services", len, 0); + output = rte_zmalloc("get_services", size, 0); if (unlikely(NULL == output)) return EDPVS_NOMEM; - 
memcpy(output, get, size); + memcpy(output, get, sizeof(*get)); ret = dp_vs_get_service_entries(get, output); *out = output; *outlen = size; @@ -969,23 +1067,23 @@ static int dp_vs_get_svc(sockoptid_t opt, const void *user, size_t len, void **o union inet_addr addr; entry = (struct dp_vs_service_entry *)user; - addr.in.s_addr = entry->addr; + addr = entry->addr; if(entry->fwmark) svc = __dp_vs_svc_fwm_get(AF_INET, entry->fwmark); - else if (entry->addr || entry->port) - svc = __dp_vs_service_get(AF_INET, entry->proto, + else if (!inet_is_addr_any(entry->af, &entry->addr) || entry->port) + svc = __dp_vs_service_get(entry->af, entry->proto, &addr, entry->port); else { struct dp_vs_match match; - ret = dp_vs_match_parse(AF_INET, entry->srange, - entry->drange, entry->iifname, - entry->oifname, &match); + ret = dp_vs_match_parse(entry->srange, entry->drange, + entry->iifname, entry->oifname, + &match); if (ret != EDPVS_OK) return ret; if (!is_empty_match(&match)) { - svc = __dp_vs_svc_match_find(AF_INET, entry->proto, + svc = __dp_vs_svc_match_find(match.af, entry->proto, &match); } } @@ -1014,32 +1112,32 @@ static int dp_vs_get_svc(sockoptid_t opt, const void *user, size_t len, void **o int size; get = (struct dp_vs_get_dests *)user; size = sizeof(*get) + sizeof(struct dp_vs_dest_entry) * get->num_dests; - if(len != size){ + if(len != sizeof(*get)){ *outlen = 0; return EDPVS_INVAL; } - addr.in.s_addr = get->addr; + addr = get->addr; output = rte_zmalloc("get_services", size, 0); if (unlikely(NULL == output)) return EDPVS_NOMEM; - memcpy(output, get, size); + memcpy(output, get, sizeof(*get)); if(get->fwmark) - svc = __dp_vs_svc_fwm_get(AF_INET, get->fwmark); - else if (addr.in.s_addr || get->port) - svc = __dp_vs_service_get(AF_INET, get->proto, &addr, + svc = __dp_vs_svc_fwm_get(get->af, get->fwmark); + else if (!inet_is_addr_any(get->af, &addr) || get->port) + svc = __dp_vs_service_get(get->af, get->proto, &addr, get->port); else { struct dp_vs_match match; - ret = 
dp_vs_match_parse(AF_INET, get->srange, - get->drange, get->iifname, - get->oifname, &match); + ret = dp_vs_match_parse(get->srange, get->drange, + get->iifname, get->oifname, + &match); if (ret != EDPVS_OK) return ret; if (!is_empty_match(&match)) { - svc = __dp_vs_svc_match_find(AF_INET, get->proto, + svc = __dp_vs_svc_match_find(match.af, get->proto, &match); } } diff --git a/src/ipvs/ip_vs_synproxy.c b/src/ipvs/ip_vs_synproxy.c index 145911c77..b1c9b1bb6 100644 --- a/src/ipvs/ip_vs_synproxy.c +++ b/src/ipvs/ip_vs_synproxy.c @@ -25,6 +25,7 @@ #include "ipvs/synproxy.h" #include "timer.h" #include "ipv4.h" +#include "ipv6.h" #include "ipvs/proto.h" #include "ipvs/proto_tcp.h" #include "ipvs/blklst.h" @@ -78,7 +79,7 @@ static struct dpvs_timer g_second_timer; * syncookies using digest function from openssl libray, * a little difference from kernel, which uses md5_transform * */ -static uint32_t g_net_secret[2]; +static uint32_t g_net_secret[2][MD5_LBLOCK]; static struct dpvs_timer g_minute_timer; static rte_atomic32_t g_minute_count; @@ -119,8 +120,10 @@ int dp_vs_synproxy_init(void) char ack_mbufpool_name[32]; struct timeval tv; - g_net_secret[0] = (uint32_t)random(); - g_net_secret[1] = (uint32_t)random(); + for (i = 0; i < MD5_LBLOCK; i++) { + g_net_secret[0][i] = (uint32_t)random(); + g_net_secret[1][i] = (uint32_t)random(); + } rte_atomic32_set(&g_minute_count, (uint32_t)random()); tv.tv_sec = 60; /* one minute timer */ @@ -136,8 +139,11 @@ int dp_vs_synproxy_init(void) DP_VS_SYNPROXY_ACK_CACHE_SIZE, 0, NULL, NULL, NULL, NULL, i, 0); - if (!dp_vs_synproxy_ack_mbufpool[i]) + if (!dp_vs_synproxy_ack_mbufpool[i]) { + for (i = i - 1; i >= 0; i--) + rte_mempool_free(dp_vs_synproxy_ack_mbufpool[i]); return EDPVS_NOMEM; + } } #ifdef CONFIG_SYNPROXY_DEBUG @@ -166,8 +172,10 @@ int dp_vs_synproxy_term(void) #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((uint32_t)1 << COOKIEBITS) - 1) -static uint32_t cookie_hash(uint32_t saddr, uint32_t daddr, 
uint16_t sport, - uint16_t dport, uint32_t count, int c) +static uint32_t +cookie_hash(uint32_t saddr, uint32_t daddr, + uint16_t sport, uint16_t dport, + uint32_t count, int c) { unsigned char hash[MD5_DIGEST_LENGTH]; uint32_t data[5]; @@ -177,7 +185,7 @@ static uint32_t cookie_hash(uint32_t saddr, uint32_t daddr, uint16_t sport, data[1] = daddr; data[2] = (sport << 16) + dport; data[3] = count; - data[4] = g_net_secret[c]; + data[4] = g_net_secret[c][0]; MD5((unsigned char *)data, sizeof(data), hash); memcpy(&hvalue, hash, sizeof(hvalue)); @@ -185,10 +193,11 @@ static uint32_t cookie_hash(uint32_t saddr, uint32_t daddr, uint16_t sport, return hvalue; } -static uint32_t secure_tcp_syn_cookie( - uint32_t saddr, uint32_t daddr, - uint16_t sport, uint16_t dport, - uint32_t sseq, uint32_t count, uint32_t data) +static uint32_t +secure_tcp_syn_cookie(uint32_t saddr, uint32_t daddr, + uint16_t sport, uint16_t dport, + uint32_t sseq, uint32_t count, + uint32_t data) { /* * Compute the secure sequence number. @@ -204,10 +213,12 @@ static uint32_t secure_tcp_syn_cookie( ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) & COOKIEMASK)); } -static uint32_t check_tcp_syn_cookie(uint32_t cookie, - uint32_t saddr, uint32_t daddr, - uint16_t sport, uint16_t dport, - uint32_t sseq, uint32_t count, uint32_t maxdiff) +static uint32_t +check_tcp_syn_cookie(uint32_t cookie, + uint32_t saddr, uint32_t daddr, + uint16_t sport, uint16_t dport, + uint32_t sseq, uint32_t count, + uint32_t maxdiff) { /* * This retrieves the small "data" value from the syncookie. 
@@ -232,6 +243,66 @@ static uint32_t check_tcp_syn_cookie(uint32_t cookie, & COOKIEMASK; /* Leaving the data behind */ } +static uint32_t +cookie_hash_v6(const struct in6_addr *saddr, + const struct in6_addr *daddr, + uint16_t sport, uint16_t dport, + uint32_t count, int c) +{ + int i; + uint32_t hvalue, data[MD5_LBLOCK]; + unsigned char hash[MD5_DIGEST_LENGTH]; + + for (i = 0; i < 4; i++) + data[i] = g_net_secret[c][i] + ((uint32_t *)saddr)[i]; + for (i = 4; i < 8; i++) + data[i] = g_net_secret[c][i] + ((uint32_t *)daddr)[i-4]; + + data[8] = g_net_secret[c][8] + ((sport << 16) + dport); + data[9] = g_net_secret[c][9] + count; + + for (i = 10; i < MD5_LBLOCK; i++) + data[i] = g_net_secret[c][i]; + + MD5((unsigned char*)data, sizeof(data), hash); + memcpy(&hvalue, hash, sizeof(hvalue)); + + return hvalue; +} + +static uint32_t +secure_tcp_syn_cookie_v6(const struct in6_addr *saddr, + const struct in6_addr *daddr, + uint16_t sport, uint16_t dport, + uint32_t sseq, uint32_t count, + uint32_t data) +{ + return (cookie_hash_v6(saddr, daddr, sport, dport, 0, 0) + + sseq + (count << COOKIEBITS) + + ((cookie_hash_v6(saddr, daddr, sport, dport, count, 1) + + data) & COOKIEMASK)); +} + +static uint32_t +check_tcp_syn_cookie_v6(uint32_t cookie, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + uint16_t sport, uint16_t dport, + uint32_t sseq, uint32_t count, + uint32_t maxdiff) +{ + uint32_t diff; + + cookie -= cookie_hash_v6(saddr, daddr, sport, dport, 0, 0) + sseq; + + diff = (count - (cookie >> COOKIEBITS)) & ((uint32_t) -1 >> COOKIEBITS); + if (diff >= maxdiff) + return (uint32_t) -1; + + return (cookie - cookie_hash_v6(saddr, daddr, sport, dport, + count - diff, 1)) & COOKIEMASK; +} + /* This table has to be sorted and terminated with (uint16_t)-1. * XXX generate a better table. * Unresolved Issues: HIPPI with a 64K MSS is not well supported. 
@@ -272,11 +343,12 @@ static uint16_t const msstab[] = { * [19-16] snd_wscale * [15-12] MSSIND */ -static uint32_t syn_proxy_cookie_v4_init_sequence(struct rte_mbuf *mbuf, - struct dp_vs_synproxy_opt *opts) +static uint32_t +syn_proxy_cookie_v4_init_sequence(struct rte_mbuf *mbuf, + const struct tcphdr *th, + struct dp_vs_synproxy_opt *opts) { const struct iphdr *iph = (struct iphdr*)ip4_hdr(mbuf); - const struct tcphdr *th = tcp_hdr(mbuf); int mssind; const uint16_t mss = opts->mss_clamp; uint32_t data; @@ -296,13 +368,39 @@ static uint32_t syn_proxy_cookie_v4_init_sequence(struct rte_mbuf *mbuf, rte_atomic32_read(&g_minute_count), data); } +static uint32_t +syn_proxy_cookie_v6_init_sequence(struct rte_mbuf *mbuf, + const struct tcphdr *th, + struct dp_vs_synproxy_opt *opts) +{ + const struct ip6_hdr *ip6h = ip6_hdr(mbuf); + int mssind; + const uint16_t mss = opts->mss_clamp; + uint32_t data; + + /* XXX sort msstab[] by probability? Binary serarch? */ + for (mssind = 0; mss > msstab[mssind + 1]; mssind++) + ; + opts->mss_clamp = msstab[mssind] + 1; + + data = ((mssind & 0x0f) << DP_VS_SYNPROXY_MSS_BITS); + data |= opts->sack_ok << DP_VS_SYNPROXY_SACKOK_BIT; + data |= opts->tstamp_ok << DP_VS_SYNPROXY_TSOK_BIT; + data |= ((opts->snd_wscale & 0xf) << DP_VS_SYNPROXY_SND_WSCALE_BITS); + + return secure_tcp_syn_cookie_v6(&ip6h->ip6_src, &ip6h->ip6_dst, + th->source, th->dest, ntohl(th->seq), + rte_atomic32_read(&g_minute_count), data); +} + /* * When syn_proxy_cookie_v4_init_sequence is used, we check cookie as follow: * 1. mssind check. * 2. 
get sack/timestamp/wscale options */ -static int syn_proxy_v4_cookie_check(struct rte_mbuf *mbuf, uint32_t cookie, - struct dp_vs_synproxy_opt *opt) +static int +syn_proxy_v4_cookie_check(struct rte_mbuf *mbuf, uint32_t cookie, + struct dp_vs_synproxy_opt *opt) { const struct iphdr *iph = (struct iphdr*)ip4_hdr(mbuf); const struct tcphdr *th = tcp_hdr(mbuf); @@ -337,6 +435,42 @@ static int syn_proxy_v4_cookie_check(struct rte_mbuf *mbuf, uint32_t cookie, return 0; } +static int +syn_proxy_v6_cookie_check(struct rte_mbuf *mbuf, uint32_t cookie, + struct dp_vs_synproxy_opt *opt) +{ + const struct ip6_hdr *ip6h = ip6_hdr(mbuf); + const struct tcphdr *th = tcp_hdr(mbuf); + + uint32_t seq = ntohl(th->seq) - 1; + uint32_t mssind; + uint32_t res = check_tcp_syn_cookie_v6(cookie, &ip6h->ip6_src, &ip6h->ip6_dst, + th->source, th->dest, seq, rte_atomic32_read(&g_minute_count), + DP_VS_SYNPROXY_COUNTER_TRIES); + + if ((uint32_t) -1 == res) /* count is invalid, g_minute_count' >> g_minute_count */ + return 0; + + mssind = (res & DP_VS_SYNPROXY_MSS_MASK) >> DP_VS_SYNPROXY_MSS_BITS; + + memset(opt, 0, sizeof(struct dp_vs_synproxy_opt)); + if ((mssind < NUM_MSS) && ((res & DP_VS_SYNPROXY_OTHER_MASK) == 0)) { + opt->mss_clamp = msstab[mssind] + 1; + opt->sack_ok = (res & DP_VS_SYNPROXY_SACKOK_MASK) >> DP_VS_SYNPROXY_SACKOK_BIT; + opt->tstamp_ok = (res & DP_VS_SYNPROXY_TSOK_MASK) >> DP_VS_SYNPROXY_TSOK_BIT; + opt->snd_wscale = (res & DP_VS_SYNPROXY_SND_WSCALE_MASK) + >> DP_VS_SYNPROXY_SND_WSCALE_BITS; + if (opt->snd_wscale > 0 && opt->snd_wscale <= DP_VS_SYNPROXY_WSCALE_MAX) + opt->wscale_ok = 1; + else if (opt->snd_wscale == 0) + opt->wscale_ok = 0; + else + return 0; + + return 1; + } + return 0; +} /* * Synproxy implementation @@ -444,27 +578,29 @@ static void syn_proxy_parse_set_opts(struct rte_mbuf *mbuf, struct tcphdr *th, * 5) compute iphdr and tcp check (HW xmit checksum offload not support for syn). 
*/ static void syn_proxy_reuse_mbuf(int af, struct rte_mbuf *mbuf, - struct dp_vs_synproxy_opt *opt) + struct tcphdr *th, + struct dp_vs_synproxy_opt *opt) { uint32_t isn; - uint32_t tmpaddr; uint16_t tmpport; - struct iphdr *iph; - struct tcphdr *th; - int ip4hlen; + int iphlen; - iph = (struct iphdr*)ip4_hdr(mbuf); - ip4hlen = ip4_hdrlen(mbuf); - th = tcp_hdr(mbuf); + if (AF_INET6 == af) + iphlen = sizeof(struct ip6_hdr); + else + iphlen = ip4_hdrlen(mbuf); - if (mbuf_may_pull(mbuf, ip4hlen + (th->doff<< 2)) != 0) + if (mbuf_may_pull(mbuf, iphlen + (th->doff << 2)) != 0) return; /* deal with tcp options */ syn_proxy_parse_set_opts(mbuf, th, opt); /* get cookie */ - isn = syn_proxy_cookie_v4_init_sequence(mbuf, opt); + if (AF_INET6 == af) + isn = syn_proxy_cookie_v6_init_sequence(mbuf, th, opt); + else + isn = syn_proxy_cookie_v4_init_sequence(mbuf, th, opt); /* set syn-ack flag */ ((uint8_t *)th)[13] = 0x12; @@ -479,28 +615,50 @@ static void syn_proxy_reuse_mbuf(int af, struct rte_mbuf *mbuf, th->seq = htonl(isn); /* exchage addresses */ - tmpaddr = iph->saddr; - iph->saddr = iph->daddr; - iph->daddr = tmpaddr; - - iph->ttl = dp_vs_synproxy_ctrl_synack_ttl; - iph->tos = 0; - - /* compute checksum */ - if (likely(mbuf->ol_flags & PKT_TX_TCP_CKSUM)) { - mbuf->l3_len = ip4hlen; - mbuf->l4_len = ntohs(ip4_hdr(mbuf)->total_length) - ip4hlen; - th->check = rte_ipv4_phdr_cksum(ip4_hdr(mbuf), mbuf->ol_flags); + if (AF_INET6 == af) { + struct in6_addr tmpaddr; + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + + tmpaddr = ip6h->ip6_src; + ip6h->ip6_src = ip6h->ip6_dst; + ip6h->ip6_dst = tmpaddr; + ip6h->ip6_hlim = dp_vs_synproxy_ctrl_synack_ttl; + + if (likely(mbuf->ol_flags & PKT_TX_TCP_CKSUM)) { + mbuf->l3_len = (void *)th - (void *)ip6h; + mbuf->l4_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr) - mbuf->l3_len; + th->check = ip6_phdr_cksum(ip6h, mbuf->ol_flags, mbuf->l3_len, IPPROTO_TCP); + } else { + if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) + return; + 
tcp6_send_csum((struct ipv6_hdr*)ip6h, th); + } } else { - if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) - return; - tcp4_send_csum((struct ipv4_hdr*)iph, th); - } + uint32_t tmpaddr; + struct iphdr *iph = (struct iphdr*)ip4_hdr(mbuf); + + tmpaddr = iph->saddr; + iph->saddr = iph->daddr; + iph->daddr = tmpaddr; + iph->ttl = dp_vs_synproxy_ctrl_synack_ttl; + iph->tos = 0; + + /* compute checksum */ + if (likely(mbuf->ol_flags & PKT_TX_TCP_CKSUM)) { + mbuf->l3_len = iphlen; + mbuf->l4_len = ntohs(iph->tot_len) - iphlen; + th->check = rte_ipv4_phdr_cksum((struct ipv4_hdr*)iph, mbuf->ol_flags); + } else { + if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) + return; + tcp4_send_csum((struct ipv4_hdr*)iph, th); + } - if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM)) - iph->check = 0; - else - ip4_send_csum((struct ipv4_hdr*)iph); + if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM)) + iph->check = 0; + else + ip4_send_csum((struct ipv4_hdr*)iph); + } } /* Syn-proxy step 1 logic: receive client's Syn. @@ -547,7 +705,6 @@ int dp_vs_synproxy_syn_rcv(int af, struct rte_mbuf *mbuf, if (dp_vs_blklst_lookup(iph->proto, &iph->daddr, th->dest, &iph->saddr)) { goto syn_rcv_out; } - } else { if (svc) dp_vs_service_put(svc); @@ -570,11 +727,15 @@ int dp_vs_synproxy_syn_rcv(int af, struct rte_mbuf *mbuf, __func__, mbuf->port); goto syn_rcv_out; } - if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD))) - mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4); + if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD))) { + if (af == AF_INET) + mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4); + else + mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IPV6); + } /* reuse mbuf */ - syn_proxy_reuse_mbuf(af, mbuf, &tcp_opt); + syn_proxy_reuse_mbuf(af, mbuf, th, &tcp_opt); /* set L2 header and send the packet out * It is noted that "ipv4_xmit" should not used here, @@ -659,8 +820,6 @@ static int syn_proxy_send_rs_syn(int af, const struct tcphdr *th, 
int tcp_hdr_size; struct rte_mbuf *syn_mbuf, *syn_mbuf_cloned; struct rte_mempool *pool; - struct iphdr *ack_iph; - struct iphdr *syn_iph; struct tcphdr *syn_th; if (!cp->packet_xmit) { @@ -709,25 +868,51 @@ static int syn_proxy_send_rs_syn(int af, const struct tcphdr *th, syn_th->urg = 0; syn_proxy_syn_build_options((uint32_t *)(syn_th + 1), opt); - /* Reserve space for ipv4 header */ - syn_iph = (struct iphdr *)rte_pktmbuf_prepend(syn_mbuf, sizeof(struct ipv4_hdr)); - if (!syn_iph) { - rte_pktmbuf_free(syn_mbuf); - //RTE_LOG(WARNING, IPVS, "%s:%s\n", __func__, dpvs_strerror(EDPVS_NOROOM)); - return EDPVS_NOROOM; - } + if (AF_INET6 == af) { + struct ip6_hdr *ack_ip6h; + struct ip6_hdr *syn_ip6h; - ack_iph = (struct iphdr *)ip4_hdr(mbuf); - *((uint16_t *) syn_iph) = htons((4 << 12) | (5 << 8) | (ack_iph->tos & 0x1E)); - syn_iph->tot_len = htons(syn_mbuf->pkt_len); - syn_iph->frag_off = htons(IPV4_HDR_DF_FLAG); - syn_iph->ttl = 64; - syn_iph->protocol = IPPROTO_TCP; - syn_iph->saddr = ack_iph->saddr; - syn_iph->daddr = ack_iph->daddr; + /* Reserve space for ipv6 header */ + syn_ip6h = (struct ip6_hdr *)rte_pktmbuf_prepend(syn_mbuf, + sizeof(struct ip6_hdr)); + if (!syn_ip6h) { + rte_pktmbuf_free(syn_mbuf); + //RTE_LOG(WARNING, IPVS, "%s:%s\n", __func__, dpvs_strerror(EDPVS_NOROOM)); + return EDPVS_NOROOM; + } - /* checksum is done by fnat_in_handler */ - syn_iph->check = 0; + ack_ip6h = (struct ip6_hdr *)ip6_hdr(mbuf); + + syn_ip6h->ip6_vfc = 0x60; /* IPv6 */ + syn_ip6h->ip6_src = ack_ip6h->ip6_src; + syn_ip6h->ip6_dst = ack_ip6h->ip6_dst; + syn_ip6h->ip6_plen = htons(tcp_hdr_size); + syn_ip6h->ip6_nxt = NEXTHDR_TCP; + syn_ip6h->ip6_hlim = IPV6_DEFAULT_HOPLIMIT; + } else { + struct iphdr *ack_iph; + struct iphdr *syn_iph; + + /* Reserve space for ipv4 header */ + syn_iph = (struct iphdr *)rte_pktmbuf_prepend(syn_mbuf, sizeof(struct ipv4_hdr)); + if (!syn_iph) { + rte_pktmbuf_free(syn_mbuf); + //RTE_LOG(WARNING, IPVS, "%s:%s\n", __func__, 
dpvs_strerror(EDPVS_NOROOM)); + return EDPVS_NOROOM; + } + + ack_iph = (struct iphdr *)ip4_hdr(mbuf); + *((uint16_t *) syn_iph) = htons((4 << 12) | (5 << 8) | (ack_iph->tos & 0x1E)); + syn_iph->tot_len = htons(syn_mbuf->pkt_len); + syn_iph->frag_off = htons(IPV4_HDR_DF_FLAG); + syn_iph->ttl = 64; + syn_iph->protocol = IPPROTO_TCP; + syn_iph->saddr = ack_iph->saddr; + syn_iph->daddr = ack_iph->daddr; + + /* checksum is done by fnat_in_handler */ + syn_iph->check = 0; + } /* Save syn_mbuf if syn retransmission is on */ if (dp_vs_synproxy_ctrl_syn_retry > 0) { @@ -744,6 +929,8 @@ static int syn_proxy_send_rs_syn(int af, const struct tcphdr *th, rte_atomic32_set(&cp->syn_retry_max, dp_vs_synproxy_ctrl_syn_retry); } + /* TODO: Save info for fast_response_xmit */ + /* Count in the syn packet */ dp_vs_stats_in(cp, mbuf); @@ -781,7 +968,12 @@ int dp_vs_synproxy_ack_rcv(int af, struct rte_mbuf *mbuf, return 0; } - res_cookie_check = syn_proxy_v4_cookie_check(mbuf, ntohl(th->ack_seq) - 1, &opt); + if (AF_INET6 == af) + res_cookie_check = syn_proxy_v6_cookie_check(mbuf, + ntohl(th->ack_seq) - 1, &opt); + else + res_cookie_check = syn_proxy_v4_cookie_check(mbuf, + ntohl(th->ack_seq) - 1, &opt); if (!res_cookie_check) { /* Update statistics */ dp_vs_estats_inc(SYNPROXY_BAD_ACK); @@ -832,7 +1024,8 @@ int dp_vs_synproxy_ack_rcv(int af, struct rte_mbuf *mbuf, } /* Update out2in sack seqs */ -static inline void syn_proxy_filter_opt_outin(struct tcphdr *th, struct dp_vs_seq *sp_seq) +static inline void +syn_proxy_filter_opt_outin(struct tcphdr *th, struct dp_vs_seq *sp_seq) { unsigned char *ptr; int length = (th->doff * 4) - sizeof(struct tcphdr); @@ -872,7 +1065,8 @@ static inline void syn_proxy_filter_opt_outin(struct tcphdr *th, struct dp_vs_se if (TCPOPT_SACK == opcode && opsize >= (TCP_OLEN_SACK_BASE + TCP_OLEN_SACK_PERBLOCK) && !((opsize - TCP_OLEN_SACK_BASE) % TCP_OLEN_SACK_PERBLOCK)) { - for (i = 0; i < (opsize - TCP_OLEN_SACK_BASE); i += TCP_OLEN_SACK_PERBLOCK) { + for (i = 
0; i < (opsize - TCP_OLEN_SACK_BASE); + i += TCP_OLEN_SACK_PERBLOCK) { tmp = (uint32_t *)(ptr + i); old_ack_seq = ntohl(*tmp); *tmp = htonl((uint32_t) (old_ack_seq - sp_seq->delta)); @@ -918,7 +1112,7 @@ void dp_vs_synproxy_dnat_handler(struct tcphdr *tcph, struct dp_vs_seq *sp_seq) /* Syn-proxy step 3 logic: receive rs's Syn/Ack. * Update syn_proxy_seq.delta and send stored ack mbufs to rs. */ int dp_vs_synproxy_synack_rcv(struct rte_mbuf *mbuf, struct dp_vs_conn *cp, - struct dp_vs_proto *pp, int ihl, int *verdict) + struct dp_vs_proto *pp, int th_offset, int *verdict) { struct tcphdr _tcph, *th; struct dp_vs_synproxy_ack_pakcet *tmbuf, *tmbuf2; @@ -926,7 +1120,7 @@ int dp_vs_synproxy_synack_rcv(struct rte_mbuf *mbuf, struct dp_vs_conn *cp, struct dp_vs_dest *dest = cp->dest; unsigned conn_timeout = 0; - th = mbuf_header_pointer(mbuf, ihl, sizeof(_tcph), &_tcph); + th = mbuf_header_pointer(mbuf, th_offset, sizeof(_tcph), &_tcph); if (unlikely(!th)) { *verdict = INET_DROP; return 0; @@ -972,7 +1166,7 @@ int dp_vs_synproxy_synack_rcv(struct rte_mbuf *mbuf, struct dp_vs_conn *cp, #endif } - // ip_vs_synproxy_save_fast_xmit_info ? + /* TODO: ip_vs_synproxy_save_fast_xmit_info ? 
*/ /* Free stored syn mbuf, no need for retransmition any more */ if (cp->syn_mbuf) { @@ -1115,7 +1309,12 @@ int dp_vs_synproxy_reuse_conn(int af, struct rte_mbuf *mbuf, (cp->flags & DPVS_CONN_F_SYNPROXY) && (!th->syn && th->ack && !th->rst && !th->fin) && (cp->syn_proxy_seq.isn != htonl((uint32_t)(ntohl(th->ack_seq) - 1)))) { - res_cookie_check = syn_proxy_v4_cookie_check(mbuf, ntohl(th->ack_seq) - 1, &opt); + if (AF_INET6 == af) + res_cookie_check = syn_proxy_v6_cookie_check(mbuf, + ntohl(th->ack_seq) - 1, &opt); + else + res_cookie_check = syn_proxy_v4_cookie_check(mbuf, + ntohl(th->ack_seq) - 1, &opt); if (!res_cookie_check) { /* Update statistics */ dp_vs_estats_inc(SYNPROXY_BAD_ACK); @@ -1152,9 +1351,10 @@ int dp_vs_synproxy_reuse_conn(int af, struct rte_mbuf *mbuf, cp->timeout.tv_sec = 0; } - if (unlikely(EDPVS_OK != (ret = syn_proxy_send_rs_syn(af, th, cp, mbuf, pp, &opt)))) { - RTE_LOG(ERR, IPVS, "%s: syn_proxy_send_rs_syn failed when reuse conn -- %s\n", - __func__, dpvs_strerror(ret)); + if (unlikely(EDPVS_OK != (ret = syn_proxy_send_rs_syn(af, th, cp, + mbuf, pp, &opt)))) { + RTE_LOG(ERR, IPVS, "%s: syn_proxy_send_rs_syn failed when reuse conn" + " -- %s\n", __func__, dpvs_strerror(ret)); /* Release conn immediately */ cp->timeout.tv_sec = 0; } diff --git a/src/ipvs/ip_vs_wlc.c b/src/ipvs/ip_vs_wlc.c index 70b369077..e5751f7d7 100644 --- a/src/ipvs/ip_vs_wlc.c +++ b/src/ipvs/ip_vs_wlc.c @@ -24,7 +24,7 @@ static inline unsigned int dp_vs_wlc_dest_overhead(struct dp_vs_dest *dest) } static struct dp_vs_dest *dp_vs_wlc_schedule(struct dp_vs_service *svc, - const struct rte_mbuf *mbuf) + const struct rte_mbuf *mbuf) { struct dp_vs_dest *dest, *least; unsigned int loh, doh; @@ -51,7 +51,7 @@ static struct dp_vs_dest *dp_vs_wlc_schedule(struct dp_vs_service *svc, /* * Find the destination with the least load. 
*/ - nextstage: +nextstage: list_for_each_entry_continue(dest, &svc->dests, n_list) { if (dest->flags & DPVS_DEST_F_OVERLOAD) continue; diff --git a/src/ipvs/ip_vs_wrr.c b/src/ipvs/ip_vs_wrr.c index 90319aa17..bf4395856 100644 --- a/src/ipvs/ip_vs_wrr.c +++ b/src/ipvs/ip_vs_wrr.c @@ -118,10 +118,10 @@ static int dp_vs_wrr_update_svc(struct dp_vs_service *svc) } /* - * Weighted Round-Robin Scheduling + * Weighted Round-Robin Scheduling */ static struct dp_vs_dest *dp_vs_wrr_schedule(struct dp_vs_service *svc, - const struct rte_mbuf *mbuf) + const struct rte_mbuf *mbuf) { struct dp_vs_dest *dest; struct dp_vs_wrr_mark *mark = svc->sched_data; diff --git a/src/ipvs/ip_vs_xmit.c b/src/ipvs/ip_vs_xmit.c index c9fb581c8..be2282ce6 100644 --- a/src/ipvs/ip_vs_xmit.c +++ b/src/ipvs/ip_vs_xmit.c @@ -16,11 +16,15 @@ * */ #include +#include #include #include "dpdk.h" #include "ipv4.h" +#include "ipv6.h" #include "route.h" +#include "route6.h" #include "icmp.h" +#include "icmp6.h" #include "neigh.h" #include "ipvs/xmit.h" #include "parser/parser.h" @@ -28,12 +32,13 @@ static bool fast_xmit_close = false; static bool xmit_ttl = false; -static int dp_vs_fast_xmit_fnat(struct dp_vs_proto *proto, - struct dp_vs_conn *conn, - struct rte_mbuf *mbuf) +static int __dp_vs_fast_xmit_fnat4(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) { - struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct ipv4_hdr *ip4h = ip4_hdr(mbuf); struct ether_hdr *eth; + uint16_t packet_type = ETHER_TYPE_IPv4; int err; if (unlikely(conn->in_dev == NULL)) @@ -53,12 +58,12 @@ static int dp_vs_fast_xmit_fnat(struct dp_vs_proto *proto, * re-fetch IP header * the offset may changed during pre-handler */ - iph = ip4_hdr(mbuf); + ip4h = ip4_hdr(mbuf); } - iph->hdr_checksum = 0; - iph->src_addr = conn->laddr.in.s_addr; - iph->dst_addr = conn->daddr.in.s_addr; + ip4h->hdr_checksum = 0; + ip4h->src_addr = conn->laddr.in.s_addr; + ip4h->dst_addr = conn->daddr.in.s_addr; 
if(proto->fnat_in_handler) { err = proto->fnat_in_handler(proto, conn, mbuf); @@ -67,17 +72,17 @@ static int dp_vs_fast_xmit_fnat(struct dp_vs_proto *proto, } if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM)) { - iph->hdr_checksum = 0; + ip4h->hdr_checksum = 0; } else { - ip4_send_csum(iph); + ip4_send_csum(ip4h); } eth = (struct ether_hdr *)rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct ether_hdr)); ether_addr_copy(&conn->in_dmac, ð->d_addr); ether_addr_copy(&conn->in_smac, ð->s_addr); - eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4); - mbuf->packet_type = ETHER_TYPE_IPv4; + eth->ether_type = rte_cpu_to_be_16(packet_type); + mbuf->packet_type = packet_type; err = netif_xmit(mbuf, conn->in_dev); if (err != EDPVS_OK) @@ -87,12 +92,75 @@ static int dp_vs_fast_xmit_fnat(struct dp_vs_proto *proto, return EDPVS_OK; } -static int dp_vs_fast_outxmit_fnat(struct dp_vs_proto *proto, - struct dp_vs_conn *conn, - struct rte_mbuf *mbuf) +static int __dp_vs_fast_xmit_fnat6(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) { - struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + struct ether_hdr *eth; + uint16_t packet_type = ETHER_TYPE_IPv6; + int err; + + if (unlikely(conn->in_dev == NULL)) + return EDPVS_NOROUTE; + + if (unlikely(is_zero_ether_addr(&conn->in_dmac) || + is_zero_ether_addr(&conn->in_smac))) + return EDPVS_NOTSUPP; + + /* pre-handler before translation */ + if (proto->fnat_in_pre_handler) { + err = proto->fnat_in_pre_handler(proto, conn, mbuf); + if (err != EDPVS_OK) + return err; + + /* + * re-fetch IP header + * the offset may changed during pre-handler + */ + ip6h = ip6_hdr(mbuf); + } + + ip6h->ip6_src = conn->laddr.in6; + ip6h->ip6_dst = conn->daddr.in6; + + if(proto->fnat_in_handler) { + err = proto->fnat_in_handler(proto, conn, mbuf); + if(err != EDPVS_OK) + return err; + } + + eth = (struct ether_hdr *)rte_pktmbuf_prepend(mbuf, + (uint16_t)sizeof(struct ether_hdr)); + 
ether_addr_copy(&conn->in_dmac, &eth->d_addr); + ether_addr_copy(&conn->in_smac, &eth->s_addr); + eth->ether_type = rte_cpu_to_be_16(packet_type); + mbuf->packet_type = packet_type; + + err = netif_xmit(mbuf, conn->in_dev); + if (err != EDPVS_OK) + RTE_LOG(DEBUG, IPVS, "%s: fail to netif_xmit.\n", __func__); + + /* must return OK since netif_xmit always consumes mbuf */ + return EDPVS_OK; +} + +static int dp_vs_fast_xmit_fnat(int af, + struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + return af == AF_INET ? __dp_vs_fast_xmit_fnat4(proto, conn, mbuf) + : __dp_vs_fast_xmit_fnat6(proto, conn, mbuf); +} + +static int __dp_vs_fast_outxmit_fnat4(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + struct ipv4_hdr *ip4h = ip4_hdr(mbuf); struct ether_hdr *eth; + uint16_t packet_type = ETHER_TYPE_IPv4; int err; if (unlikely(conn->out_dev == NULL)) @@ -112,12 +180,12 @@ static int dp_vs_fast_outxmit_fnat(struct dp_vs_proto *proto, * re-fetch IP header * the offset may changed during pre-handler */ - iph = ip4_hdr(mbuf); + ip4h = ip4_hdr(mbuf); } - iph->hdr_checksum = 0; - iph->src_addr = conn->vaddr.in.s_addr; - iph->dst_addr = conn->caddr.in.s_addr; + ip4h->hdr_checksum = 0; + ip4h->src_addr = conn->vaddr.in.s_addr; + ip4h->dst_addr = conn->caddr.in.s_addr; if(proto->fnat_out_handler) { err = proto->fnat_out_handler(proto, conn, mbuf); @@ -126,17 +194,17 @@ static int dp_vs_fast_outxmit_fnat(struct dp_vs_proto *proto, } if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM)) { - iph->hdr_checksum = 0; + ip4h->hdr_checksum = 0; } else { - ip4_send_csum(iph); + ip4_send_csum(ip4h); } eth = (struct ether_hdr *)rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct ether_hdr)); ether_addr_copy(&conn->out_dmac, &eth->d_addr); ether_addr_copy(&conn->out_smac, &eth->s_addr); - eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4); - mbuf->packet_type = ETHER_TYPE_IPv4; + eth->ether_type = rte_cpu_to_be_16(packet_type); + mbuf->packet_type = 
packet_type; err = netif_xmit(mbuf, conn->out_dev); if (err != EDPVS_OK) @@ -146,6 +214,68 @@ static int dp_vs_fast_outxmit_fnat(struct dp_vs_proto *proto, return EDPVS_OK; } +static int __dp_vs_fast_outxmit_fnat6(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + struct ether_hdr *eth; + uint16_t packet_type = ETHER_TYPE_IPv6; + int err; + + if (unlikely(conn->out_dev == NULL)) + return EDPVS_NOROUTE; + + if (unlikely(is_zero_ether_addr(&conn->out_dmac) || + is_zero_ether_addr(&conn->out_smac))) + return EDPVS_NOTSUPP; + + /* pre-handler before translation */ + if (proto->fnat_out_pre_handler) { + err = proto->fnat_out_pre_handler(proto, conn, mbuf); + if (err != EDPVS_OK) + return err; + + /* + * re-fetch IP header + * the offset may changed during pre-handler + */ + ip6h = ip6_hdr(mbuf); + } + + ip6h->ip6_src = conn->vaddr.in6; + ip6h->ip6_dst = conn->caddr.in6; + + if(proto->fnat_out_handler) { + err = proto->fnat_out_handler(proto, conn, mbuf); + if(err != EDPVS_OK) + return err; + } + + eth = (struct ether_hdr *)rte_pktmbuf_prepend(mbuf, + (uint16_t)sizeof(struct ether_hdr)); + ether_addr_copy(&conn->out_dmac, &eth->d_addr); + ether_addr_copy(&conn->out_smac, &eth->s_addr); + eth->ether_type = rte_cpu_to_be_16(packet_type); + mbuf->packet_type = packet_type; + + err = netif_xmit(mbuf, conn->out_dev); + if (err != EDPVS_OK) + RTE_LOG(DEBUG, IPVS, "%s: fail to netif_xmit.\n", __func__); + + /* must return OK since netif_xmit always consumes mbuf */ + return EDPVS_OK; +} + +static int dp_vs_fast_outxmit_fnat(int af, + struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + return af == AF_INET ? 
__dp_vs_fast_outxmit_fnat4(proto, conn, mbuf) + : __dp_vs_fast_outxmit_fnat6(proto, conn, mbuf); +} + /* * ARP_HDR_ETHER SUPPORT ONLY * save source mac(client) for output in conn as dest mac @@ -211,8 +341,8 @@ static void dp_vs_save_outxmit_info(struct rte_mbuf *mbuf, */ static void dp_vs_conn_cache_rt(struct dp_vs_conn *conn, struct route_entry *rt, bool in) { - if ((in && conn->in_dev && (conn->in_nexthop.in.s_addr == htonl(INADDR_ANY))) || - (!in && conn->out_dev && (conn->out_nexthop.in.s_addr == htonl(INADDR_ANY)))) + if ((in && conn->in_dev && (conn->in_nexthop.in.s_addr != htonl(INADDR_ANY))) || + (!in && conn->out_dev && (conn->out_nexthop.in.s_addr != htonl(INADDR_ANY)))) return; if (in) { @@ -233,9 +363,33 @@ static void dp_vs_conn_cache_rt(struct dp_vs_conn *conn, struct route_entry *rt, } } -int dp_vs_xmit_fnat(struct dp_vs_proto *proto, - struct dp_vs_conn *conn, - struct rte_mbuf *mbuf) +static void dp_vs_conn_cache_rt6(struct dp_vs_conn *conn, struct route6 *rt, bool in) +{ + if ((in && conn->in_dev && !ipv6_addr_any(&conn->in_nexthop.in6)) || + (!in && conn->out_dev && !ipv6_addr_any(&conn->out_nexthop.in6))) + return; + + if (in) { + conn->in_dev = rt->rt6_dev; + if (ipv6_addr_any(&rt->rt6_gateway)) { + conn->in_nexthop.in6 = conn->daddr.in6; + } else { + conn->in_nexthop.in6 = rt->rt6_gateway; + } + + } else { + conn->out_dev = rt->rt6_dev; + if (ipv6_addr_any(&rt->rt6_gateway)) { + conn->out_nexthop.in6 = conn->caddr.in6; + } else { + conn->out_nexthop.in6 = rt->rt6_gateway; + } + } +} + +static int __dp_vs_xmit_fnat4(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) { struct flow4 fl4; struct ipv4_hdr *iph = ip4_hdr(mbuf); @@ -244,7 +398,7 @@ int dp_vs_xmit_fnat(struct dp_vs_proto *proto, if (!fast_xmit_close && !(conn->flags & DPVS_CONN_F_NOFASTXMIT)) { dp_vs_save_xmit_info(mbuf, proto, conn); - if (!dp_vs_fast_xmit_fnat(proto, conn, mbuf)) { + if (!dp_vs_fast_xmit_fnat(AF_INET, proto, conn, mbuf)) { return 
EDPVS_OK; } } @@ -260,16 +414,15 @@ int dp_vs_xmit_fnat(struct dp_vs_proto *proto, } memset(&fl4, 0, sizeof(struct flow4)); - fl4.daddr = conn->daddr.in; - fl4.saddr = conn->laddr.in; - fl4.tos = iph->type_of_service; + fl4.fl4_daddr = conn->daddr.in; + fl4.fl4_saddr = conn->laddr.in; + fl4.fl4_tos = iph->type_of_service; rt = route4_output(&fl4); if (!rt) { err = EDPVS_NOROUTE; goto errout; } - /* * didn't cache the pointer to rt * or route can't be deleted when there is conn ref @@ -330,7 +483,8 @@ int dp_vs_xmit_fnat(struct dp_vs_proto *proto, ip4_send_csum(iph); } - return INET_HOOK(INET_HOOK_LOCAL_OUT, mbuf, NULL, rt->port, ipv4_output); + return INET_HOOK(AF_INET, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt->port, ipv4_output); errout: if (rt) @@ -339,9 +493,118 @@ int dp_vs_xmit_fnat(struct dp_vs_proto *proto, return err; } -int dp_vs_out_xmit_fnat(struct dp_vs_proto *proto, - struct dp_vs_conn *conn, - struct rte_mbuf *mbuf) +static int __dp_vs_xmit_fnat6(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + struct flow6 fl6; + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + struct route6 *rt6; + int err, mtu; + + if (!fast_xmit_close && !(conn->flags & DPVS_CONN_F_NOFASTXMIT)) { + dp_vs_save_xmit_info(mbuf, proto, conn); + if (!dp_vs_fast_xmit_fnat(AF_INET6, proto, conn, mbuf)) { + return EDPVS_OK; + } + } + + /* + * drop old route. just for safe, because + * FNAT is PRE_ROUTING, should not have route. + */ + if (unlikely(mbuf->userdata != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: FNAT have route %p ?\n", + __func__, mbuf->userdata); + route6_put((struct route6 *)mbuf->userdata); + } + + memset(&fl6, 0, sizeof(struct flow6)); + fl6.fl6_daddr = conn->daddr.in6; + fl6.fl6_saddr = conn->laddr.in6; + rt6 = route6_output(mbuf, &fl6); + if (!rt6) { + err = EDPVS_NOROUTE; + goto errout; + } + + /* + * didn't cache the pointer to rt6 + * or route can't be deleted when there is conn ref + * this is for neighbour confirm. 
+ */ + dp_vs_conn_cache_rt6(conn, rt6, true); + + // check mtu + mtu = rt6->rt6_mtu; + if (mbuf->pkt_len > mtu) { + RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); + icmp6_send(mbuf, ICMP6_PACKET_TOO_BIG, 0, mtu); + + err = EDPVS_FRAG; + goto errout; + } + + mbuf->userdata = rt6; + + /* after route lookup and before translation */ + if (xmit_ttl) { + if (unlikely(ip6h->ip6_hops <= 1)) { + icmp6_send(mbuf, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_TRANSIT, 0); + err = EDPVS_DROP; + goto errout; + } + + ip6h->ip6_hops--; + } + + /* pre-handler before translation */ + if (proto->fnat_in_pre_handler) { + err = proto->fnat_in_pre_handler(proto, conn, mbuf); + if (err != EDPVS_OK) + goto errout; + + /* + * re-fetch IP header + * the offset may changed during pre-handler + */ + ip6h = ip6_hdr(mbuf); + } + + /* L3 translation before l4 re-csum */ + ip6h->ip6_src = conn->laddr.in6; + ip6h->ip6_dst = conn->daddr.in6; + + /* L4 FNAT translation */ + if (proto->fnat_in_handler) { + err = proto->fnat_in_handler(proto, conn, mbuf); + if (err != EDPVS_OK) + goto errout; + } + + return INET_HOOK(AF_INET6, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt6->rt6_dev, ip6_output); + +errout: + if (rt6) + route6_put(rt6); + rte_pktmbuf_free(mbuf); + return err; +} + +int dp_vs_xmit_fnat(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + int af = conn->af; + assert(af == AF_INET || af == AF_INET6); + return af == AF_INET ? 
__dp_vs_xmit_fnat4(proto, conn, mbuf) + : __dp_vs_xmit_fnat6(proto, conn, mbuf); +} + +static int __dp_vs_out_xmit_fnat4(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) { struct flow4 fl4; struct ipv4_hdr *iph = ip4_hdr(mbuf); @@ -350,7 +613,7 @@ int dp_vs_out_xmit_fnat(struct dp_vs_proto *proto, if (!fast_xmit_close && !(conn->flags & DPVS_CONN_F_NOFASTXMIT)) { dp_vs_save_outxmit_info(mbuf, proto, conn); - if (!dp_vs_fast_outxmit_fnat(proto, conn, mbuf)) { + if (!dp_vs_fast_outxmit_fnat(AF_INET, proto, conn, mbuf)) { return EDPVS_OK; } } @@ -363,9 +626,9 @@ int dp_vs_out_xmit_fnat(struct dp_vs_proto *proto, route4_put((struct route_entry *)mbuf->userdata); memset(&fl4, 0, sizeof(struct flow4)); - fl4.daddr = conn->caddr.in; - fl4.saddr = conn->vaddr.in; - fl4.tos = iph->type_of_service; + fl4.fl4_daddr = conn->caddr.in; + fl4.fl4_saddr = conn->vaddr.in; + fl4.fl4_tos = iph->type_of_service; rt = route4_output(&fl4); if (!rt) { err = EDPVS_NOROUTE; @@ -432,7 +695,8 @@ int dp_vs_out_xmit_fnat(struct dp_vs_proto *proto, ip4_send_csum(iph); } - return INET_HOOK(INET_HOOK_LOCAL_OUT, mbuf, NULL, rt->port, ipv4_output); + return INET_HOOK(AF_INET, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt->port, ipv4_output); errout: if (rt) @@ -441,10 +705,114 @@ int dp_vs_out_xmit_fnat(struct dp_vs_proto *proto, return err; } +static int __dp_vs_out_xmit_fnat6(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + struct flow6 fl6; + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + struct route6 *rt6; + int err, mtu; + + if (!fast_xmit_close && !(conn->flags & DPVS_CONN_F_NOFASTXMIT)) { + dp_vs_save_outxmit_info(mbuf, proto, conn); + if (!dp_vs_fast_outxmit_fnat(AF_INET6, proto, conn, mbuf)) { + return EDPVS_OK; + } + } + + /* + * drop old route. just for safe, because + * FNAT is PRE_ROUTING, should not have route. 
+ */ + if (unlikely(mbuf->userdata != NULL)) + route6_put((struct route6 *)mbuf->userdata); + + memset(&fl6, 0, sizeof(struct flow6)); + fl6.fl6_daddr = conn->caddr.in6; + fl6.fl6_saddr = conn->vaddr.in6; + rt6 = route6_output(mbuf, &fl6); + if (!rt6) { + err = EDPVS_NOROUTE; + goto errout; + } + + /* + * didn't cache the pointer to rt + * or route can't be deleted when there is conn ref + * this is for neighbour confirm. + */ + dp_vs_conn_cache_rt6(conn, rt6, false); + + mtu = rt6->rt6_mtu; + if (mbuf->pkt_len > mtu) { + RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); + icmp6_send(mbuf, ICMP6_PACKET_TOO_BIG, 0, htonl(mtu)); + err = EDPVS_FRAG; + goto errout; + } + + mbuf->userdata = rt6; + + /* after route lookup and before translation */ + if (xmit_ttl) { + if (unlikely(ip6h->ip6_hops <= 1)) { + icmp6_send(mbuf, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_TRANSIT, 0); + err = EDPVS_DROP; + goto errout; + } + + ip6h->ip6_hops--; + } + + /* pre-handler before translation */ + if (proto->fnat_out_pre_handler) { + err = proto->fnat_out_pre_handler(proto, conn, mbuf); + if (err != EDPVS_OK) + goto errout; + + /* + * re-fetch IP header + * the offset may changed during pre-handler + */ + ip6h = ip6_hdr(mbuf); + } + + /* L3 translation before l4 re-csum */ + ip6h->ip6_src = conn->vaddr.in6; + ip6h->ip6_dst = conn->caddr.in6; + + /* L4 FNAT translation */ + if (proto->fnat_out_handler) { + err = proto->fnat_out_handler(proto, conn, mbuf); + if (err != EDPVS_OK) + goto errout; + } + + return INET_HOOK(AF_INET6, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt6->rt6_dev, ip6_output); + +errout: + if (rt6) + route6_put(rt6); + rte_pktmbuf_free(mbuf); + return err; +} + +int dp_vs_out_xmit_fnat(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + int af = conn->af; + assert(af == AF_INET || af == AF_INET6); + return af == AF_INET ? 
__dp_vs_out_xmit_fnat4(proto, conn, mbuf) + : __dp_vs_out_xmit_fnat6(proto, conn, mbuf); +} + /* mbuf's data should pointer to outer IP packet. */ -void dp_vs_xmit_icmp(struct rte_mbuf *mbuf, - struct dp_vs_proto *prot, - struct dp_vs_conn *conn, int dir) +static void __dp_vs_xmit_icmp4(struct rte_mbuf *mbuf, + struct dp_vs_proto *prot, + struct dp_vs_conn *conn, int dir) { struct ipv4_hdr *iph = ip4_hdr(mbuf); struct icmphdr *icmph = (struct icmphdr *) @@ -453,7 +821,7 @@ void dp_vs_xmit_icmp(struct rte_mbuf *mbuf, int fullnat = (conn->dest->fwdmode == DPVS_FWD_MODE_FNAT); uint16_t csum; - /* + /* * outer/inner L3 translation. */ if (fullnat) { @@ -478,7 +846,7 @@ void dp_vs_xmit_icmp(struct rte_mbuf *mbuf, ip4_send_csum(ciph); } - /* + /* * inner L4 translation. * * note it's no way to recalc inner csum to lack of data, @@ -524,9 +892,130 @@ void dp_vs_xmit_icmp(struct rte_mbuf *mbuf, return; } -int dp_vs_xmit_dr(struct dp_vs_proto *proto, - struct dp_vs_conn *conn, - struct rte_mbuf *mbuf) +/* mbuf's data should pointer to outer IP packet. */ +static void __dp_vs_xmit_icmp6(struct rte_mbuf *mbuf, + struct dp_vs_proto *prot, + struct dp_vs_conn *conn, int dir) +{ + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + struct icmp6_hdr *icmp6h; + struct ip6_hdr *cip6h; + int fullnat = (conn->dest->fwdmode == DPVS_FWD_MODE_FNAT); + uint8_t nexthdr = ip6h->ip6_nxt; + int offset = sizeof(*ip6h); + uint32_t csum, l4_len; + + offset = ip6_skip_exthdr(mbuf, offset, &nexthdr); + if (offset < 0) { + RTE_LOG(WARNING, IPVS, "%s: Get ipv6 payload fail, mbuf : %p \n", + __func__, mbuf); + return ; + } + + if (unlikely(nexthdr != IPPROTO_ICMPV6)) { + RTE_LOG(WARNING, IPVS, "%s: Get ipv6 payload isn't icmp, mbuf : %p \n", + __func__, mbuf); + return ; + } + + icmp6h = (struct icmp6_hdr *) + ((unsigned char *)ip6_hdr(mbuf) + offset); + cip6h = (struct ip6_hdr *)(icmp6h + 1); + /* + * outer/inner L3 translation. 
+ */ + if (fullnat) { + if (dir == DPVS_CONN_DIR_INBOUND) { + ip6h->ip6_src = conn->laddr.in6; + cip6h->ip6_dst = conn->laddr.in6; + } else { + ip6h->ip6_dst = conn->caddr.in6; + cip6h->ip6_src = conn->caddr.in6; + } + } + + if (dir == DPVS_CONN_DIR_INBOUND) { + ip6h->ip6_dst = conn->daddr.in6; + cip6h->ip6_src = conn->daddr.in6; + } else { + ip6h->ip6_src = conn->vaddr.in6; + cip6h->ip6_dst = conn->vaddr.in6; + } + + /* + * inner L4 translation. + * + * note it's no way to recalc inner csum to lack of data, + * actually it's not needed. + */ + offset += (sizeof(*icmp6h) + sizeof(*cip6h)); + nexthdr = cip6h->ip6_nxt; + offset = ip6_skip_exthdr(mbuf, offset, &nexthdr); + + if (offset > 0 + && (nexthdr == IPPROTO_TCP + || nexthdr == IPPROTO_UDP)) { + uint16_t *ports = (void *)ip6_hdr(mbuf) + offset; + + if (fullnat) { + if (dir == DPVS_CONN_DIR_INBOUND) { + ports[1] = conn->lport; + } else { + ports[0] = conn->cport; + /* seq adjustment (changed by FNAT) */ + if (nexthdr == IPPROTO_TCP) { + uint32_t *seq = (uint32_t *)ports + 1; + *seq = htonl(ntohl(*seq) - conn->fnat_seq.delta); + } + } + } + + if (dir == DPVS_CONN_DIR_INBOUND) { + ports[0] = conn->dport; + /* seq adjustment (changed by SynProxy) */ + if (nexthdr == IPPROTO_TCP) { + uint32_t *seq = (uint32_t *)ports + 1; + *seq = htonl(ntohl(*seq) - conn->syn_proxy_seq.delta); + } + } else { + ports[1] = conn->vport; + } + } + + /* + * ICMP recalc csum. + */ + icmp6h->icmp6_cksum = 0; + l4_len = ntohs(ip6h->ip6_plen); + csum = rte_raw_cksum(icmp6h, l4_len); + csum += rte_ipv6_phdr_cksum((struct ipv6_hdr *)ip6h, 0); + + csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff); + csum = (~csum) & 0xffff; + if (csum == 0) + csum = 0xffff; + + icmp6h->icmp6_cksum = csum; + + return; +} + +/* mbuf's data should pointer to outer IP packet. 
*/ +void dp_vs_xmit_icmp(struct rte_mbuf *mbuf, + struct dp_vs_proto *prot, + struct dp_vs_conn *conn, int dir) +{ + int af = conn->af; + + assert(af == AF_INET || af == AF_INET6); + + return af == AF_INET ? __dp_vs_xmit_icmp4(mbuf, prot, conn, dir) + : __dp_vs_xmit_icmp6(mbuf, prot, conn, dir); +} + +static int __dp_vs_xmit_dr4(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) { struct flow4 fl4; struct ipv4_hdr *iph = ip4_hdr(mbuf); @@ -540,9 +1029,9 @@ int dp_vs_xmit_dr(struct dp_vs_proto *proto, } memset(&fl4, 0, sizeof(struct flow4)); - fl4.daddr.s_addr = conn->daddr.in.s_addr; - fl4.saddr.s_addr = iph->src_addr; - fl4.tos = iph->type_of_service; + fl4.fl4_daddr.s_addr = conn->daddr.in.s_addr; + fl4.fl4_saddr.s_addr = iph->src_addr; + fl4.fl4_tos = iph->type_of_service; rt = route4_output(&fl4); if (!rt) { err = EDPVS_NOROUTE; @@ -562,7 +1051,7 @@ int dp_vs_xmit_dr(struct dp_vs_proto *proto, } mbuf->packet_type = ETHER_TYPE_IPv4; - err = neigh_resolve_output(&conn->daddr.in, mbuf, rt->port); + err = neigh_output(AF_INET, (union inet_addr *)&conn->daddr.in, mbuf, rt->port); route4_put(rt); return err; @@ -573,16 +1062,160 @@ int dp_vs_xmit_dr(struct dp_vs_proto *proto, return err; } -int dp_vs_xmit_snat(struct dp_vs_proto *proto, - struct dp_vs_conn *conn, - struct rte_mbuf *mbuf) +static int __dp_vs_xmit_dr6(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + struct flow6 fl6; + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + struct route6 *rt6; + int err, mtu; + + if (unlikely(mbuf->userdata != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: Already have route %p ?\n", + __func__, mbuf->userdata); + route6_put((struct route6 *)mbuf->userdata); + } + + memset(&fl6, 0, sizeof(struct flow6)); + fl6.fl6_daddr = conn->daddr.in6; + fl6.fl6_saddr = ip6h->ip6_src; + rt6 = route6_output(mbuf, &fl6); + if (!rt6) { + err = EDPVS_NOROUTE; + goto errout; + } + + /* dr xmit support cache of route to rs*/ + 
dp_vs_conn_cache_rt6(conn, rt6, true); + + mtu = rt6->rt6_mtu; + if (mbuf->pkt_len > mtu) { + RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); + icmp6_send(mbuf, ICMP6_PACKET_TOO_BIG, 0, htonl(mtu)); + err = EDPVS_FRAG; + goto errout; + } + + mbuf->packet_type = ETHER_TYPE_IPv6; + err = neigh_output(AF_INET6, (union inet_addr *)&conn->daddr.in6, mbuf, rt6->rt6_dev); + route6_put(rt6); + return err; + +errout: + if (rt6) + route6_put(rt6); + rte_pktmbuf_free(mbuf); + return err; +} + +int dp_vs_xmit_dr(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + int af = conn->af; + + assert(af == AF_INET || af == AF_INET6); + + return af == AF_INET ? __dp_vs_xmit_dr4(proto, conn, mbuf) + : __dp_vs_xmit_dr6(proto, conn, mbuf); +} + +static int __dp_vs_xmit_snat4(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + struct flow4 fl4; + struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct route_entry *rt; + int err, mtu; + + /* + * drop old route. just for safe, because + * inbound SNAT traffic is hooked at PRE_ROUTING, + * should not have route. + */ + if (unlikely(mbuf->userdata != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: SNAT have route %p ?\n", + __func__, mbuf->userdata); + route4_put((struct route_entry *)mbuf->userdata); + } + + /* + * hosts inside SNAT may belongs to diff net, + * let's route it. 
+ */ + memset(&fl4, 0, sizeof(struct flow4)); + fl4.fl4_daddr = conn->daddr.in; + fl4.fl4_saddr = conn->caddr.in; + fl4.fl4_tos = iph->type_of_service; + rt = route4_output(&fl4); + if (!rt) { + err = EDPVS_NOROUTE; + goto errout; + } + + dp_vs_conn_cache_rt(conn, rt, true); + + mtu = rt->mtu; + if (mbuf->pkt_len > mtu + && (iph->fragment_offset & htons(IPV4_HDR_DF_FLAG))) { + RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); + icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(mtu)); + err = EDPVS_FRAG; + goto errout; + } + + mbuf->userdata = rt; + + /* after route lookup and before translation */ + if (xmit_ttl) { + if (unlikely(iph->time_to_live <= 1)) { + icmp_send(mbuf, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); + err = EDPVS_DROP; + goto errout; + } + + iph->time_to_live--; + } + + /* L3 translation before l4 re-csum */ + iph->hdr_checksum = 0; + iph->dst_addr = conn->daddr.in.s_addr; + + /* L4 translation */ + if (proto->snat_in_handler) { + err = proto->snat_in_handler(proto, conn, mbuf); + if (err != EDPVS_OK) + goto errout; + } + + /* L3 re-checksum */ + if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM)) + iph->hdr_checksum = 0; + else + ip4_send_csum(iph); + + return INET_HOOK(AF_INET, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt->port, ipv4_output); + +errout: + if (rt) + route4_put(rt); + rte_pktmbuf_free(mbuf); + return err; +} + +static int __dp_vs_xmit_snat6(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) { - struct flow4 fl4; - struct ipv4_hdr *iph = ip4_hdr(mbuf); - struct route_entry *rt; + struct flow6 fl6; + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + struct route6 *rt6; int err, mtu; - /* + /* * drop old route. just for safe, because * inbound SNAT traffic is hooked at PRE_ROUTING, * should not have route. 
@@ -590,50 +1223,47 @@ int dp_vs_xmit_snat(struct dp_vs_proto *proto, if (unlikely(mbuf->userdata != NULL)) { RTE_LOG(WARNING, IPVS, "%s: SNAT have route %p ?\n", __func__, mbuf->userdata); - route4_put((struct route_entry *)mbuf->userdata); + route6_put((struct route6 *)mbuf->userdata); } - /* + /* * hosts inside SNAT may belongs to diff net, - * let's route it. + * let's route it. */ - memset(&fl4, 0, sizeof(struct flow4)); - fl4.daddr = conn->daddr.in; - fl4.saddr = conn->caddr.in; - fl4.tos = iph->type_of_service; - rt = route4_output(&fl4); - if (!rt) { + memset(&fl6, 0, sizeof(struct flow6)); + fl6.fl6_daddr = conn->daddr.in6; + fl6.fl6_saddr = conn->caddr.in6; + rt6 = route6_output(mbuf, &fl6); + if (!rt6) { err = EDPVS_NOROUTE; goto errout; } - dp_vs_conn_cache_rt(conn, rt, true); + dp_vs_conn_cache_rt6(conn, rt6, true); - mtu = rt->mtu; - if (mbuf->pkt_len > mtu - && (iph->fragment_offset & htons(IPV4_HDR_DF_FLAG))) { + mtu = rt6->rt6_mtu; + if (mbuf->pkt_len > mtu) { RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); - icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(mtu)); + icmp6_send(mbuf, ICMP6_PACKET_TOO_BIG, 0, htonl(mtu)); err = EDPVS_FRAG; goto errout; } - mbuf->userdata = rt; + mbuf->userdata = rt6; /* after route lookup and before translation */ if (xmit_ttl) { - if (unlikely(iph->time_to_live <= 1)) { - icmp_send(mbuf, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); + if (unlikely(ip6h->ip6_hops <= 1)) { + icmp6_send(mbuf, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_TRANSIT, 0); err = EDPVS_DROP; goto errout; } - iph->time_to_live--; + ip6h->ip6_hops--; } /* L3 translation before l4 re-csum */ - iph->hdr_checksum = 0; - iph->dst_addr = conn->daddr.in.s_addr; + ip6h->ip6_dst = conn->daddr.in6; /* L4 translation */ if (proto->snat_in_handler) { @@ -642,24 +1272,31 @@ int dp_vs_xmit_snat(struct dp_vs_proto *proto, goto errout; } - /* L3 re-checksum */ - if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM)) - iph->hdr_checksum = 0; - else - 
ip4_send_csum(iph); - - return INET_HOOK(INET_HOOK_LOCAL_OUT, mbuf, NULL, rt->port, ipv4_output); + return INET_HOOK(AF_INET6, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt6->rt6_dev, ip6_output); errout: - if (rt) - route4_put(rt); + if (rt6) + route6_put(rt6); rte_pktmbuf_free(mbuf); return err; } -int dp_vs_out_xmit_snat(struct dp_vs_proto *proto, - struct dp_vs_conn *conn, - struct rte_mbuf *mbuf) +int dp_vs_xmit_snat(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + int af = conn->af; + + assert(af == AF_INET || af == AF_INET6); + + return af == AF_INET ? __dp_vs_xmit_snat4(proto, conn, mbuf) + : __dp_vs_xmit_snat6(proto, conn, mbuf); +} + +static int __dp_vs_out_xmit_snat4(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) { int err; struct flow4 fl4; @@ -668,9 +1305,9 @@ int dp_vs_out_xmit_snat(struct dp_vs_proto *proto, if (!rt) { memset(&fl4, 0, sizeof(struct flow4)); - fl4.daddr = conn->caddr.in; - fl4.saddr = conn->vaddr.in; - fl4.tos = iph->type_of_service; + fl4.fl4_daddr = conn->caddr.in; + fl4.fl4_saddr = conn->vaddr.in; + fl4.fl4_tos = iph->type_of_service; rt = route4_output(&fl4); if (!rt) { err = EDPVS_NOROUTE; @@ -719,7 +1356,8 @@ int dp_vs_out_xmit_snat(struct dp_vs_proto *proto, else ip4_send_csum(iph); - return INET_HOOK(INET_HOOK_LOCAL_OUT, mbuf, NULL, rt->port, ipv4_output); + return INET_HOOK(AF_INET, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt->port, ipv4_output); errout: if (rt) @@ -818,9 +1456,83 @@ static int dp_vs_fast_outxmit_nat(struct dp_vs_proto *proto, return EDPVS_OK; } -int dp_vs_xmit_nat(struct dp_vs_proto *proto, - struct dp_vs_conn *conn, - struct rte_mbuf *mbuf) +static int __dp_vs_out_xmit_snat6(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + int err; + struct flow6 fl6; + struct route6 *rt6 = mbuf->userdata; + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + + if (!rt6) { + memset(&fl6, 0, sizeof(struct flow6)); + fl6.fl6_daddr = conn->caddr.in6; + 
fl6.fl6_saddr = conn->vaddr.in6; + rt6 = route6_output(mbuf, &fl6); + if (!rt6) { + err = EDPVS_NOROUTE; + goto errout; + } + + mbuf->userdata = rt6; + + dp_vs_conn_cache_rt6(conn, rt6, false); + } + + if (mbuf->pkt_len > rt6->rt6_mtu) { + RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); + icmp6_send(mbuf, ICMP6_PACKET_TOO_BIG, 0, htonl(rt6->rt6_mtu)); + err = EDPVS_FRAG; + goto errout; + } + + /* after route lookup and before translation */ + if (xmit_ttl) { + if (unlikely(ip6h->ip6_hops <= 1)) { + icmp6_send(mbuf, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_TRANSIT, 0); + err = EDPVS_DROP; + goto errout; + } + + ip6h->ip6_hops--; + } + + /* L3 translation before L4 re-csum */ + ip6h->ip6_src = conn->vaddr.in6; + + /* L4 translation */ + if (proto->snat_out_handler) { + err = proto->snat_out_handler(proto, conn, mbuf); + if (err != EDPVS_OK) + goto errout; + } + + return INET_HOOK(AF_INET6, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt6->rt6_dev, ip6_output); + +errout: + if (rt6) + route6_put(rt6); + rte_pktmbuf_free(mbuf); + return err; +} + +int dp_vs_out_xmit_snat(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + int af = conn->af; + + assert(af == AF_INET || af == AF_INET6); + + return af == AF_INET ? 
__dp_vs_out_xmit_snat4(proto, conn, mbuf) + : __dp_vs_out_xmit_snat6(proto, conn, mbuf); +} + +static int __dp_vs_xmit_nat4(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) { struct flow4 fl4; struct ipv4_hdr *iph = ip4_hdr(mbuf); @@ -845,9 +1557,9 @@ int dp_vs_xmit_nat(struct dp_vs_proto *proto, } memset(&fl4, 0, sizeof(struct flow4)); - fl4.daddr = conn->daddr.in; - fl4.saddr = conn->caddr.in; - fl4.tos = iph->type_of_service; + fl4.fl4_daddr = conn->daddr.in; + fl4.fl4_saddr = conn->caddr.in; + fl4.fl4_tos = iph->type_of_service; rt = route4_output(&fl4); if (!rt) { err = EDPVS_NOROUTE; @@ -860,6 +1572,8 @@ int dp_vs_xmit_nat(struct dp_vs_proto *proto, if (mbuf->pkt_len > mtu && (iph->fragment_offset & htons(IPV4_HDR_DF_FLAG))) { RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); + icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, + htonl(rt->mtu)); err = EDPVS_FRAG; goto errout; } @@ -894,7 +1608,8 @@ int dp_vs_xmit_nat(struct dp_vs_proto *proto, ip4_send_csum(iph); } - return INET_HOOK(INET_HOOK_LOCAL_OUT, mbuf, NULL, rt->port, ipv4_output); + return INET_HOOK(AF_INET, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt->port, ipv4_output); errout: if (rt) @@ -903,9 +1618,92 @@ int dp_vs_xmit_nat(struct dp_vs_proto *proto, return err; } -int dp_vs_out_xmit_nat(struct dp_vs_proto *proto, +static int __dp_vs_xmit_nat6(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + struct flow6 fl6; + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + struct route6 *rt6; + int err, mtu; + + /* + * drop old route. just for safe, because + * NAT is PREROUTING, should not have route. 
+ */ + if (unlikely(mbuf->userdata != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: NAT have route %p ?\n", + __func__, mbuf->userdata); + route6_put((struct route6*)mbuf->userdata); + } + + memset(&fl6, 0, sizeof(struct flow6)); + fl6.fl6_daddr = conn->daddr.in6; + fl6.fl6_saddr = conn->caddr.in6; + rt6 = route6_output(mbuf, &fl6); + if (!rt6) { + err = EDPVS_NOROUTE; + goto errout; + } + + dp_vs_conn_cache_rt6(conn, rt6, true); + + mtu = rt6->rt6_mtu; + if (mbuf->pkt_len > mtu) { + RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); + icmp6_send(mbuf, ICMP6_PACKET_TOO_BIG, 0, htonl(mtu)); + err = EDPVS_FRAG; + goto errout; + } + + mbuf->userdata = rt6; + + /* after route lookup and before translation */ + if (xmit_ttl) { + if (unlikely(ip6h->ip6_hops <= 1)) { + icmp6_send(mbuf, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_TRANSIT, 0); + err = EDPVS_DROP; + goto errout; + } + + ip6h->ip6_hops--; + } + + /* L3 translation before l4 re-csum */ + ip6h->ip6_dst = conn->daddr.in6; + + /* L4 NAT translation */ + if (proto->nat_in_handler) { + err = proto->nat_in_handler(proto, conn, mbuf); + if (err != EDPVS_OK) + goto errout; + } + + return INET_HOOK(AF_INET6, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt6->rt6_dev, ip6_output); + +errout: + if (rt6) + route6_put(rt6); + rte_pktmbuf_free(mbuf); + return err; +} + +int dp_vs_xmit_nat(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) +{ + int af = conn->af; + + assert(af == AF_INET || af == AF_INET6); + + return af == AF_INET ? 
__dp_vs_xmit_nat4(proto, conn, mbuf) + : __dp_vs_xmit_nat6(proto, conn, mbuf); +} + +static int __dp_vs_out_xmit_nat4(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) { struct flow4 fl4; struct ipv4_hdr *iph = ip4_hdr(mbuf); @@ -930,9 +1728,9 @@ int dp_vs_out_xmit_nat(struct dp_vs_proto *proto, } memset(&fl4, 0, sizeof(struct flow4)); - fl4.daddr = conn->caddr.in; - fl4.saddr = conn->vaddr.in; - fl4.tos = iph->type_of_service; + fl4.fl4_daddr = conn->caddr.in; + fl4.fl4_saddr = conn->vaddr.in; + fl4.fl4_tos = iph->type_of_service; rt = route4_output(&fl4); if (!rt) { err = EDPVS_NOROUTE; @@ -945,6 +1743,9 @@ int dp_vs_out_xmit_nat(struct dp_vs_proto *proto, if (mbuf->pkt_len > mtu && (iph->fragment_offset & htons(IPV4_HDR_DF_FLAG))) { RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); + icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, + htonl(rt->mtu)); + err = EDPVS_FRAG; goto errout; } @@ -979,7 +1780,8 @@ int dp_vs_out_xmit_nat(struct dp_vs_proto *proto, ip4_send_csum(iph); } - return INET_HOOK(INET_HOOK_LOCAL_OUT, mbuf, NULL, rt->port, ipv4_output); + return INET_HOOK(AF_INET, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt->port, ipv4_output); errout: if (rt) @@ -988,9 +1790,96 @@ int dp_vs_out_xmit_nat(struct dp_vs_proto *proto, return err; } -int dp_vs_xmit_tunnel(struct dp_vs_proto *proto, - struct dp_vs_conn *conn, - struct rte_mbuf *mbuf) +static int __dp_vs_out_xmit_nat6(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + struct flow6 fl6; + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + struct route6 *rt6; + int err, mtu; + + /* + * drop old route. just for safe, because + * NAT is PREROUTING, should not have route. 
+ */ + if (unlikely(mbuf->userdata != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: NAT have route %p ?\n", + __func__, mbuf->userdata); + route6_put((struct route6*)mbuf->userdata); + } + + memset(&fl6, 0, sizeof(struct flow6)); + fl6.fl6_daddr = conn->caddr.in6; + fl6.fl6_saddr = conn->vaddr.in6; + rt6 = route6_output(mbuf, &fl6); + if (!rt6) { + err = EDPVS_NOROUTE; + goto errout; + } + + dp_vs_conn_cache_rt6(conn, rt6, false); + + mtu = rt6->rt6_mtu; + if (mbuf->pkt_len > mtu) { + RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); + icmp6_send(mbuf, ICMP6_PACKET_TOO_BIG, 0, htonl(mtu)); + err = EDPVS_FRAG; + goto errout; + } + + mbuf->userdata = rt6; + + /* after route lookup and before translation */ + if (xmit_ttl) { + if (unlikely(ip6h->ip6_hops <= 1)) { + icmp6_send(mbuf, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_TRANSIT, 0); + err = EDPVS_DROP; + goto errout; + } + + ip6h->ip6_hops--; + } + + /* L3 translation before l4 re-csum */ + ip6h->ip6_src = conn->vaddr.in6; + + /* L4 NAT translation */ + if (proto->nat_out_handler) { + err = proto->nat_out_handler(proto, conn, mbuf); + if (err != EDPVS_OK) + goto errout; + } + + return INET_HOOK(AF_INET6, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt6->rt6_dev, ip6_output); + +errout: + if (rt6) + route6_put(rt6); + rte_pktmbuf_free(mbuf); + return err; +} + +int dp_vs_out_xmit_nat(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + int af = conn->af; + + assert(af == AF_INET || af == AF_INET6); + + return af == AF_INET ? __dp_vs_out_xmit_nat4(proto, conn, mbuf) + : __dp_vs_out_xmit_nat6(proto, conn, mbuf); +} + +/* + * IP-IP tunnel is used for IPv4 IPVS tunnel forwarding. + * `tunl0` should be configured up on RS. 
+ * */ +static int __dp_vs_xmit_tunnel4(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) { struct flow4 fl4; struct ipv4_hdr *new_iph, *old_iph = ip4_hdr(mbuf); @@ -1010,8 +1899,8 @@ int dp_vs_xmit_tunnel(struct dp_vs_proto *proto, } memset(&fl4, 0, sizeof(struct flow4)); - fl4.daddr = conn->daddr.in; - fl4.tos = tos; + fl4.fl4_daddr = conn->daddr.in; + fl4.fl4_tos = tos; rt = route4_output(&fl4); if (!rt) { err = EDPVS_NOROUTE; @@ -1033,6 +1922,8 @@ int dp_vs_xmit_tunnel(struct dp_vs_proto *proto, if (mbuf->pkt_len > mtu && df) { RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); + icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, + htonl(rt->mtu)); err = EDPVS_FRAG; goto errout; } @@ -1055,7 +1946,8 @@ int dp_vs_xmit_tunnel(struct dp_vs_proto *proto, ip4_send_csum(new_iph); } - return INET_HOOK(INET_HOOK_LOCAL_OUT, mbuf, NULL, rt->port, ipv4_output); + return INET_HOOK(AF_INET, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt->port, ipv4_output); errout: if (rt) @@ -1064,6 +1956,105 @@ int dp_vs_xmit_tunnel(struct dp_vs_proto *proto, return err; } +/* + * IPv6-IPv6 tunnel is used for IPv6 IPVS tunnel forwarding. + * `ip6tnl0` should be configured up on RS. + * */ +static int __dp_vs_xmit_tunnel6(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + struct flow6 fl6; + struct ip6_hdr *new_ip6h, *old_ip6h = ip6_hdr(mbuf); + struct route6 *rt6; + int err, mtu; + + /* + * drop old route. just for safe, because + * TUNNEL is PREROUTING, should not have route. 
+ */ + if (unlikely(mbuf->userdata != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: TUNNEL have route %p ?\n", + __func__, mbuf->userdata); + route6_put((struct route6*)mbuf->userdata); + } + + memset(&fl6, 0, sizeof(struct flow6)); + fl6.fl6_daddr = conn->daddr.in6; + rt6 = route6_output(mbuf, &fl6); + if (!rt6) { + err = EDPVS_NOROUTE; + goto errout; + } + + dp_vs_conn_cache_rt6(conn, rt6, true); + + mtu = rt6->rt6_mtu; + mbuf->userdata = rt6; + + new_ip6h = (struct ip6_hdr*)rte_pktmbuf_prepend(mbuf, sizeof(struct ip6_hdr)); + if (!new_ip6h) { + RTE_LOG(WARNING, IPVS, "%s: mbuf has not enough headroom" + " space for ipvs tunnel\n", __func__); + err = EDPVS_NOROOM; + goto errout; + } + + if (mbuf->pkt_len > mtu) { + RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); + icmp6_send(mbuf, ICMP6_PACKET_TOO_BIG, 0, htonl(mtu)); + err = EDPVS_FRAG; + goto errout; + } + + memset(new_ip6h, 0, sizeof(struct ip6_hdr)); + new_ip6h->ip6_flow = old_ip6h->ip6_flow; + new_ip6h->ip6_plen = htons(mbuf->pkt_len - sizeof(struct ip6_hdr)); + new_ip6h->ip6_nxt = IPPROTO_IPV6; + new_ip6h->ip6_hops = old_ip6h->ip6_hops; + + /* FIXME: How to set outter IP source ? + * 1. why not use `rt6->rt6_src.addr` ? + * `rt6->rt6_src` is not set due to src-validation in route6 + * 2. why not use `inet_addr_select` ? + * `inet_addr_select` return the vip as source(note the vip is configured on + * ip6tnl0), and has performance concerns because of locking. + * For a compromise, the original source IP is used. Routing problem may exist. 
+ */ + new_ip6h->ip6_src = old_ip6h->ip6_src; + + /* + new_ip6h->ip6_src = rt6->rt6_src.addr; + if (ipv6_addr_any(&new_ip6h->ip6_src)) + inet_addr_select(AF_INET6, rt6->rt6_dev, + (const union inet_addr*)&fl6.fl6_daddr, 0, + (union inet_addr*)&new_ip6h->ip6_src); + */ + + new_ip6h->ip6_dst = conn->daddr.in6; + + return INET_HOOK(AF_INET6, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt6->rt6_dev, ip6_output); + +errout: + if (rt6) + route6_put(rt6); + rte_pktmbuf_free(mbuf); + return err; +} + +int dp_vs_xmit_tunnel(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + int af = conn->af; + + assert(af == AF_INET || af == AF_INET6); + + return af == AF_INET ? __dp_vs_xmit_tunnel4(proto, conn, mbuf) + : __dp_vs_xmit_tunnel6(proto, conn, mbuf); +} + static void conn_fast_xmit_handler(vector_t tockens) { RTE_LOG(INFO, IPVS, "fast xmit OFF\n"); diff --git a/src/kni.c b/src/kni.c index 690f97d96..56d84a4a1 100644 --- a/src/kni.c +++ b/src/kni.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include "common.h" diff --git a/src/main.c b/src/main.c index 416666f4b..fccde36fd 100644 --- a/src/main.c +++ b/src/main.c @@ -34,6 +34,7 @@ #include "cfgfile.h" #include "ip_tunnel.h" #include "sys_time.h" +#include "route6.h" #define DPVS "dpvs" #define RTE_LOGTYPE_DPVS RTE_LOGTYPE_USER1 @@ -308,7 +309,7 @@ int main(int argc, char *argv[]) if ((err = ctrl_term()) != 0) RTE_LOG(ERR, DPVS, "Fail to term ctrl plane\n"); if ((err = netif_term()) != 0) - RTE_LOG(ERR, DPVS, "Fail to term route\n"); + RTE_LOG(ERR, DPVS, "Fail to term netif\n"); if ((err = cfgfile_term()) != 0) RTE_LOG(ERR, DPVS, "Fail to term configuration file: %s\n", dpvs_strerror(err)); diff --git a/src/neigh.c b/src/neigh.c index 3c248f894..e91036968 100644 --- a/src/neigh.c +++ b/src/neigh.c @@ -29,70 +29,37 @@ #include "common.h" #include "route.h" #include "ctrl.h" +#include "ndisc.h" #include "conf/neigh.h" -#define ARP_TAB_BITS 8 -#define ARP_TAB_SIZE (1 << 
ARP_TAB_BITS) -#define ARP_TAB_MASK (ARP_TAB_SIZE - 1) - -#define ARP_ENTRY_BUFF_SIZE_DEF 128 -#define ARP_ENTRY_BUFF_SIZE_MIN 16 -#define ARP_ENTRY_BUFF_SIZE_MAX 8192 +#define NEIGH_ENTRY_BUFF_SIZE_DEF 128 +#define NEIGH_ENTRY_BUFF_SIZE_MIN 16 +#define NEIGH_ENTRY_BUFF_SIZE_MAX 8192 #define DPVS_NEIGH_TIMEOUT_DEF 60 #define DPVS_NEIGH_TIMEOUT_MIN 1 #define DPVS_NEIGH_TIMEOUT_MAX 3600 -struct neighbour_entry { - struct list_head arp_list; - struct in_addr ip_addr; - struct ether_addr eth_addr; - struct netif_port *port; - struct dpvs_timer timer; - struct list_head queue_list; - uint32_t que_num; - uint32_t state; - uint32_t ts; - uint8_t flag; -} __rte_cache_aligned; +static int neigh_nums[DPVS_MAX_LCORE] = {0}; struct neighbour_mbuf_entry { - struct rte_mbuf *m; - struct list_head neigh_mbuf_list; + struct rte_mbuf *m; + struct list_head neigh_mbuf_list; } __rte_cache_aligned; struct raw_neigh { - struct in_addr ip_addr; + int af; + union inet_addr ip_addr; struct ether_addr eth_addr; struct netif_port *port; - bool add; - uint8_t flag; + bool add; + uint8_t flag; } __rte_cache_aligned; struct nud_state { int next_state[DPVS_NUD_S_MAX]; }; -#ifdef CONFIG_DPVS_NEIGH_DEBUG -static const char *nud_state_names[] = { - [DPVS_NUD_S_NONE] = "NONE", - [DPVS_NUD_S_SEND] = "SEND", - [DPVS_NUD_S_REACHABLE] = "REACHABLE", - [DPVS_NUD_S_PROBE] = "PROBE", - [DPVS_NUD_S_DELAY] = "DELAY", - [DPVS_NUD_S_MAX] = "BUG" -}; -#endif - -#define sNNO DPVS_NUD_S_NONE -#define sNSD DPVS_NUD_S_SEND -#define sNRE DPVS_NUD_S_REACHABLE -#define sNPR DPVS_NUD_S_PROBE -#define sNDE DPVS_NUD_S_DELAY - -#define DPVS_NUD_S_KEEP DPVS_NUD_S_MAX -#define sNKP DPVS_NUD_S_KEEP /*Keep state and do not reset timer*/ - static int nud_timeouts[DPVS_NUD_S_MAX] = { [DPVS_NUD_S_NONE] = 2, [DPVS_NUD_S_SEND] = 3, @@ -113,7 +80,7 @@ static struct nud_state nud_states[] = { #define NEIGH_PROCESS_MAC_RING_INTERVAL 100 /* params from config file */ -static int arp_unres_qlen = ARP_ENTRY_BUFF_SIZE_DEF; +static int 
arp_unres_qlen = NEIGH_ENTRY_BUFF_SIZE_DEF; static struct rte_ring *neigh_ring[DPVS_MAX_LCORE]; @@ -125,14 +92,14 @@ static void unres_qlen_handler(vector_t tokens) assert(str); unres_qlen = atoi(str); - if (unres_qlen >= ARP_ENTRY_BUFF_SIZE_MIN && - unres_qlen <= ARP_ENTRY_BUFF_SIZE_MAX) { + if (unres_qlen >= NEIGH_ENTRY_BUFF_SIZE_MIN && + unres_qlen <= NEIGH_ENTRY_BUFF_SIZE_MAX) { RTE_LOG(INFO, NEIGHBOUR, "arp_unres_qlen = %d\n", unres_qlen); arp_unres_qlen = unres_qlen; } else { RTE_LOG(WARNING, NEIGHBOUR, "invalid arp_unres_qlen config %s, using default " - "%d\n", str, ARP_ENTRY_BUFF_SIZE_DEF); - arp_unres_qlen = ARP_ENTRY_BUFF_SIZE_DEF; + "%d\n", str, NEIGH_ENTRY_BUFF_SIZE_DEF); + arp_unres_qlen = NEIGH_ENTRY_BUFF_SIZE_DEF; } FREE_PTR(str); @@ -149,8 +116,8 @@ static void timeout_handler(vector_t tokens) RTE_LOG(INFO, NEIGHBOUR, "arp_reachable_timeout = %d\n", timeout); nud_timeouts[DPVS_NUD_S_REACHABLE] = timeout; } else { - RTE_LOG(INFO, NEIGHBOUR, "invalid arp_reachable_timeout config %s, using default %d\n", - str, DPVS_NEIGH_TIMEOUT_DEF); + RTE_LOG(INFO, NEIGHBOUR, "invalid arp_reachable_timeout config %s, \ + using default %d\n", str, DPVS_NEIGH_TIMEOUT_DEF); nud_timeouts[DPVS_NUD_S_REACHABLE] = DPVS_NEIGH_TIMEOUT_DEF; } FREE_PTR(str); @@ -160,7 +127,7 @@ void neigh_keyword_value_init(void) { if (dpvs_state_get() == DPVS_STATE_INIT) { /* KW_TYPE_INIT keyword */ - arp_unres_qlen = ARP_ENTRY_BUFF_SIZE_DEF; + arp_unres_qlen = NEIGH_ENTRY_BUFF_SIZE_DEF; nud_timeouts[DPVS_NUD_S_REACHABLE] = DPVS_NEIGH_TIMEOUT_DEF; } /* KW_TYPE_NORMAL keyword */ @@ -173,18 +140,15 @@ void install_neighbor_keywords(void) install_keyword("timeout", timeout_handler, KW_TYPE_INIT); } -static int num_neighbours = 0; -static lcoreid_t g_cid = 0; static lcoreid_t master_cid = 0; -static struct list_head neigh_table[DPVS_MAX_LCORE][ARP_TAB_SIZE]; +static struct list_head neigh_table[DPVS_MAX_LCORE][NEIGH_TAB_SIZE]; -static struct raw_neigh* neigh_ring_clone_entry(const struct 
neighbour_entry* neighbour, bool add); +static struct raw_neigh* neigh_ring_clone_entry(const struct neighbour_entry* neighbour, + bool add); static int neigh_send_arp(struct netif_port *port, uint32_t src_ip, uint32_t dst_ip); -#ifdef CONFIG_DPVS_NEIGH_DEBUG - static inline char *eth_addr_itoa(const struct ether_addr *src, char *dst, size_t size) { snprintf(dst, size, "%02x:%02x:%02x:%02x:%02x:%02x", @@ -197,6 +161,8 @@ static inline char *eth_addr_itoa(const struct ether_addr *src, char *dst, size_ return dst; } + +#ifdef CONFIG_DPVS_NEIGH_DEBUG static void dump_arp_hdr(const char *msg, const struct arp_hdr *ah, portid_t port) { const struct arp_ipv4 *aip4; @@ -224,16 +190,12 @@ static inline void dump_arp_hdr(const char *msg, const struct arp_hdr *ah, porti { } #endif -static inline unsigned int neigh_hashkey(uint32_t ip_addr, struct netif_port *port) -{ - return rte_be_to_cpu_32(ip_addr)&ARP_TAB_MASK; -} static inline int neigh_hash(struct neighbour_entry *neighbour, unsigned int hashkey) { lcoreid_t cid = rte_lcore_id(); if(!(neighbour->flag & NEIGHBOUR_HASHED)){ - list_add(&neighbour->arp_list, &neigh_table[cid][hashkey]); + list_add(&neighbour->neigh_list, &neigh_table[cid][hashkey]); neighbour->flag |= NEIGHBOUR_HASHED; return EDPVS_OK; } @@ -245,29 +207,29 @@ static inline int neigh_unhash(struct neighbour_entry *neighbour) { int err; if((neighbour->flag & NEIGHBOUR_HASHED)){ - list_del(&neighbour->arp_list); + list_del(&neighbour->neigh_list); neighbour->flag &= ~NEIGHBOUR_HASHED; err = EDPVS_OK; } else { err = EDPVS_NOTEXIST; } if (unlikely(err == EDPVS_NOTEXIST)) - RTE_LOG(DEBUG, NEIGHBOUR, "%s: arp entry not hashed.\n", __func__); + RTE_LOG(DEBUG, NEIGHBOUR, "%s: neighbour entry not hashed.\n", __func__); return err; } -static inline bool neigh_key_cmp(const struct neighbour_entry *neighbour, - const void *key, const struct netif_port* port) +static inline bool neigh_key_cmp(int af, const struct neighbour_entry *neighbour, + const union inet_addr *key, 
const struct netif_port* port) { - return ((neighbour->ip_addr.s_addr == *(uint32_t*)key) - &&(neighbour->port->id==port->id)); + + return (inet_addr_equal(af, key, &neighbour->ip_addr)) && + (neighbour->port == port) && + (neighbour->af == af); } static int neigh_entry_expire(struct neighbour_entry *neighbour) { struct neighbour_mbuf_entry *mbuf, *mbuf_next; - struct raw_neigh *mac_param; - lcoreid_t cid = rte_lcore_id(); dpvs_timer_cancel(&neighbour->timer, false); @@ -280,38 +242,14 @@ static int neigh_entry_expire(struct neighbour_entry *neighbour) rte_free(mbuf); } - if (cid == g_cid) { - mac_param = neigh_ring_clone_entry(neighbour, 0); - if (mac_param) { - int ret = rte_ring_enqueue(neigh_ring[master_cid], mac_param); - if (unlikely(-EDQUOT == ret)) - RTE_LOG(WARNING, NEIGHBOUR, "%s: neigh ring quota exceeded\n", - __func__); - else if (ret < 0) { - rte_free(mac_param); - RTE_LOG(WARNING, NEIGHBOUR, "%s: neigh ring enqueue failed\n", - __func__); - } - } - else - RTE_LOG(WARNING, NEIGHBOUR, "%s: clone ring param faild\n", __func__); - } - rte_free(neighbour); + assert(cid != master_cid); + neigh_nums[cid]--; return DTIMER_STOP; } -#ifdef CONFIG_DPVS_NEIGH_DEBUG -static const char *nud_state_name(int state) -{ - if (state >= DPVS_NUD_S_KEEP) - return "ERR!"; - return nud_state_names[state] ? 
nud_state_names[state] :""; -} -#endif - -static void neigh_entry_state_trans(struct neighbour_entry *neighbour, int idx) +void neigh_entry_state_trans(struct neighbour_entry *neighbour, int idx) { struct timeval timeout; @@ -352,12 +290,14 @@ static int neighbour_timer_event(void *data) return DTIMER_OK; } -static struct neighbour_entry *neigh_lookup_entry(const void *key, const struct netif_port* port, unsigned int hashkey) +struct neighbour_entry *neigh_lookup_entry(int af, const union inet_addr *key, + const struct netif_port* port, + unsigned int hashkey) { struct neighbour_entry *neighbour; lcoreid_t cid = rte_lcore_id(); - list_for_each_entry(neighbour, &neigh_table[cid][hashkey], arp_list){ - if(neigh_key_cmp(neighbour, key, port)) { + list_for_each_entry(neighbour, &neigh_table[cid][hashkey], neigh_list){ + if(neigh_key_cmp(af, neighbour, key, port)) { return neighbour; } } @@ -365,71 +305,19 @@ static struct neighbour_entry *neigh_lookup_entry(const void *key, const struct return NULL; } -void neigh_confirm(struct in_addr nexthop, struct netif_port *port) -{ - struct neighbour_entry *neighbour; - unsigned int hashkey; - lcoreid_t cid = rte_lcore_id(); - /*find nexhop/neighbour to confirm, no matter whether it is the route in*/ - hashkey = neigh_hashkey(nexthop.s_addr, port); - list_for_each_entry(neighbour, &neigh_table[cid][hashkey], arp_list) { - if (neigh_key_cmp(neighbour, &nexthop.s_addr, port) && - !(neighbour->flag & NEIGHBOUR_STATIC)) { - neigh_entry_state_trans(neighbour, 2); - } - } -} - -static void neigh_arp_confirm(struct neighbour_entry *neighbour) -{ - union inet_addr saddr, daddr; - - memset(&saddr, 0, sizeof(saddr)); - daddr.in.s_addr = neighbour->ip_addr.s_addr; - inet_addr_select(AF_INET, neighbour->port, &daddr, 0, &saddr); - if (!saddr.in.s_addr) { - RTE_LOG(ERR, NEIGHBOUR, "[%s]no source ip\n", __func__); - } - - if (neigh_send_arp(neighbour->port, saddr.in.s_addr, - daddr.in.s_addr) != EDPVS_OK) { - RTE_LOG(ERR, NEIGHBOUR, "[%s] 
send arp failed\n", __func__); - } -} - -static int neigh_edit(struct neighbour_entry *neighbour, struct ether_addr* eth_addr, - unsigned int hashkey) +int neigh_edit(struct neighbour_entry *neighbour, struct ether_addr *eth_addr) { rte_memcpy(&neighbour->eth_addr, eth_addr, 6); - lcoreid_t cid = rte_lcore_id(); - - if ((g_cid == cid) && !(neighbour->flag & NEIGHBOUR_STATIC)) { - struct raw_neigh *mac_param; - mac_param = neigh_ring_clone_entry(neighbour, 1); - if (mac_param) { - int ret = rte_ring_enqueue(neigh_ring[master_cid], mac_param); - if (unlikely(-EDQUOT == ret)) - RTE_LOG(WARNING, NEIGHBOUR, "%s: neigh ring quota exceeded\n", - __func__); - else if (ret < 0) { - rte_free(mac_param); - RTE_LOG(WARNING, NEIGHBOUR, "%s: neigh ring enqueue failed\n", - __func__); - } - } - else - RTE_LOG(WARNING, NEIGHBOUR, "%s: clone ring param faild\n", __func__); - } return EDPVS_OK; } -static struct neighbour_entry * -neigh_add_table(uint32_t ipaddr, const struct ether_addr* eth_addr, - struct netif_port* port, unsigned int hashkey, int flag) +struct neighbour_entry *neigh_add_table(int af, const union inet_addr *ipaddr, + const struct ether_addr *eth_addr, + struct netif_port *port, + unsigned int hashkey, int flag) { struct neighbour_entry *new_neighbour=NULL; - struct in_addr *ip_addr = (struct in_addr*)&ipaddr; struct timeval delay; lcoreid_t cid = rte_lcore_id(); @@ -438,9 +326,10 @@ neigh_add_table(uint32_t ipaddr, const struct ether_addr* eth_addr, if(new_neighbour == NULL) return NULL; - rte_memcpy(&new_neighbour->ip_addr, ip_addr, - sizeof(struct in_addr)); + rte_memcpy(&new_neighbour->ip_addr, ipaddr, + sizeof(union inet_addr)); new_neighbour->flag = flag; + new_neighbour->af = af; if(eth_addr){ rte_memcpy(&new_neighbour->eth_addr, eth_addr, 6); @@ -457,48 +346,43 @@ neigh_add_table(uint32_t ipaddr, const struct ether_addr* eth_addr, INIT_LIST_HEAD(&new_neighbour->queue_list); - if (!(new_neighbour->flag & NEIGHBOUR_STATIC) && cid != master_cid) { + if 
(!(new_neighbour->flag & NEIGHBOUR_STATIC)) { dpvs_timer_sched(&new_neighbour->timer, &delay, neighbour_timer_event, new_neighbour, false); } - if ((g_cid == cid) && !(new_neighbour->flag & NEIGHBOUR_STATIC)) { - struct raw_neigh *mac_param; - mac_param = neigh_ring_clone_entry(new_neighbour, 1); - if (mac_param) { - int ret = rte_ring_enqueue(neigh_ring[master_cid], mac_param); - if (unlikely(-EDQUOT == ret)) - RTE_LOG(WARNING, NEIGHBOUR, "%s: neigh ring quota exceeded\n", - __func__); - else if (ret < 0) { - rte_free(mac_param); - RTE_LOG(WARNING, NETIF, "%s: neigh ring enqueue failed\n", - __func__); - } - } - else - RTE_LOG(WARNING, NEIGHBOUR, "%s: clone ring param faild\n", __func__); - } neigh_hash(new_neighbour, hashkey); + neigh_nums[cid]++; return new_neighbour; } /***********************fill mac hdr before send pkt************************************/ -static void neigh_fill_mac(struct neighbour_entry *neighbour, struct rte_mbuf *m) +static void neigh_fill_mac(struct neighbour_entry *neighbour, + struct rte_mbuf *m, + const struct in6_addr *target, + struct netif_port *port) { struct ether_hdr *eth; + struct ether_addr mult_eth; uint16_t pkt_type; m->l2_len = sizeof(struct ether_hdr); eth = (struct ether_hdr *)rte_pktmbuf_prepend(m, (uint16_t)sizeof(struct ether_hdr)); - ether_addr_copy(&neighbour->eth_addr,ð->d_addr); - ether_addr_copy(&neighbour->port->addr,ð->s_addr); + + if (!neighbour && target) { + ipv6_mac_mult(target, &mult_eth); + ether_addr_copy(&mult_eth, ð->d_addr); + } else { + ether_addr_copy(&neighbour->eth_addr, ð->d_addr); + } + + ether_addr_copy(&port->addr, ð->s_addr); pkt_type = (uint16_t)m->packet_type; eth->ether_type = rte_cpu_to_be_16(pkt_type); } -static void neigh_send_mbuf_cach(struct neighbour_entry *neighbour) +void neigh_send_mbuf_cach(struct neighbour_entry *neighbour) { struct neighbour_mbuf_entry *mbuf, *mbuf_next; struct rte_mbuf *m; @@ -507,17 +391,60 @@ static void neigh_send_mbuf_cach(struct neighbour_entry *neighbour) 
&neighbour->queue_list,neigh_mbuf_list){ list_del(&mbuf->neigh_mbuf_list); m = mbuf->m; - neigh_fill_mac(neighbour, m); + neigh_fill_mac(neighbour, m, NULL, neighbour->port); netif_xmit(m, neighbour->port); neighbour->que_num--; rte_free(mbuf); } } +void neigh_confirm(int af, union inet_addr *nexthop, struct netif_port *port) +{ + struct neighbour_entry *neighbour; + unsigned int hashkey; + lcoreid_t cid = rte_lcore_id(); + /*find nexhop/neighbour to confirm, no matter whether it is the route in*/ + hashkey = neigh_hashkey(af, nexthop, port); + list_for_each_entry(neighbour, &neigh_table[cid][hashkey], neigh_list) { + if (neigh_key_cmp(af, neighbour, nexthop, port) && + !(neighbour->flag & NEIGHBOUR_STATIC)) { + neigh_entry_state_trans(neighbour, 2); + } + } +} + +static void neigh_state_confirm(struct neighbour_entry *neighbour) +{ + union inet_addr saddr, daddr; + + memset(&saddr, 0, sizeof(saddr)); + + if (neighbour->af == AF_INET) { + daddr.in.s_addr = neighbour->ip_addr.in.s_addr; + inet_addr_select(AF_INET, neighbour->port, &daddr, 0, &saddr); + if (!saddr.in.s_addr) { + RTE_LOG(ERR, NEIGHBOUR, "[%s]no source ip\n", __func__); + } + + if (neigh_send_arp(neighbour->port, saddr.in.s_addr, + daddr.in.s_addr) != EDPVS_OK) { + RTE_LOG(ERR, NEIGHBOUR, "[%s] send arp failed\n", __func__); + } + } else if (neighbour->af == AF_INET6) { + /*to be continue*/ + ipv6_addr_copy(&daddr.in6, &neighbour->ip_addr.in6); + inet_addr_select(AF_INET6, neighbour->port, &daddr, 0, &saddr); + + if (ipv6_addr_any(&saddr.in6)) + RTE_LOG(ERR, NEIGHBOUR, "[%s]no source ip\n", __func__); + + ndisc_solicit(neighbour, &saddr.in6); + } +} +/*arp*/ int neigh_resolve_input(struct rte_mbuf *m, struct netif_port *port) { - struct arp_hdr *arp = rte_pktmbuf_mtod(m, struct arp_hdr *); struct ether_hdr *eth; @@ -554,13 +481,15 @@ int neigh_resolve_input(struct rte_mbuf *m, struct netif_port *port) } else if(arp->arp_op == htons(ARP_OP_REPLY)) { ipaddr = arp->arp_data.arp_sip; - hashkey = 
neigh_hashkey(ipaddr, port); - neighbour = neigh_lookup_entry(&ipaddr, port, hashkey); + hashkey = neigh_hashkey(AF_INET, (union inet_addr *)&ipaddr, port); + neighbour = neigh_lookup_entry(AF_INET, (union inet_addr *)&ipaddr, + port, hashkey); if (neighbour && !(neighbour->flag & NEIGHBOUR_STATIC)) { - neigh_edit(neighbour, &arp->arp_data.arp_sha, hashkey); + neigh_edit(neighbour, &arp->arp_data.arp_sha); neigh_entry_state_trans(neighbour, 1); } else { - neighbour = neigh_add_table(ipaddr, &arp->arp_data.arp_sha, port, hashkey, 0); + neighbour = neigh_add_table(AF_INET, (union inet_addr *)&ipaddr, + &arp->arp_data.arp_sha, port, hashkey, 0); if(!neighbour){ RTE_LOG(ERR, NEIGHBOUR, "[%s] add neighbour wrong\n", __func__); rte_pktmbuf_free(m); @@ -617,34 +546,40 @@ static int neigh_send_arp(struct netif_port *port, uint32_t src_ip, uint32_t dst memset(&arp[1], 0, 18); - dump_arp_hdr("send", arp, port->id); + dump_arp_hdr("send", arp, port->id); netif_xmit(m, port); return EDPVS_OK; } -int neigh_resolve_output(struct in_addr *nexhop, struct rte_mbuf *m, - struct netif_port *port) +int neigh_output(int af, union inet_addr *nexhop, + struct rte_mbuf *m, struct netif_port *port) { struct neighbour_entry *neighbour; struct neighbour_mbuf_entry *m_buf; unsigned int hashkey; - uint32_t nexhop_addr = nexhop->s_addr; if (port->flag & NETIF_PORT_FLAG_NO_ARP) return netif_xmit(m, port); - hashkey = neigh_hashkey(nexhop_addr, port); - neighbour = neigh_lookup_entry(&nexhop_addr, port, hashkey); + if (af == AF_INET6 && ipv6_addr_is_multicast((struct in6_addr *)nexhop)) { + neigh_fill_mac(NULL, m, (struct in6_addr *)nexhop, port); + return netif_xmit(m, port); + } + + hashkey = neigh_hashkey(af, nexhop, port); + neighbour = neigh_lookup_entry(af, nexhop, port, hashkey); if (neighbour) { if ((neighbour->state == DPVS_NUD_S_NONE) || (neighbour->state == DPVS_NUD_S_SEND)) { if (neighbour->que_num > arp_unres_qlen) { - /*don't need arp request now, - since neighbour will not be 
confirmed - and it will be released late*/ + /* + * don't need arp request now, + * since neighbour will not be confirmed + * and it will be released late + */ rte_pktmbuf_free(m); - RTE_LOG(ERR, NEIGHBOUR, "[%s] arp_unres_queue is full, drop packet\n", __func__); + RTE_LOG(ERR, NEIGHBOUR, "[%s] neigh_unres_queue is full, drop packet\n", __func__); return EDPVS_DROP; } m_buf = rte_zmalloc("neigh_new_mbuf", @@ -658,7 +593,7 @@ int neigh_resolve_output(struct in_addr *nexhop, struct rte_mbuf *m, neighbour->que_num++; if (neighbour->state == DPVS_NUD_S_NONE) { - neigh_arp_confirm(neighbour); + neigh_state_confirm(neighbour); neigh_entry_state_trans(neighbour, 0); } return EDPVS_OK; @@ -667,11 +602,11 @@ int neigh_resolve_output(struct in_addr *nexhop, struct rte_mbuf *m, (neighbour->state == DPVS_NUD_S_PROBE) || (neighbour->state == DPVS_NUD_S_DELAY)) { - neigh_fill_mac(neighbour, m); + neigh_fill_mac(neighbour, m, NULL, port); netif_xmit(m, neighbour->port); if (neighbour->state == DPVS_NUD_S_PROBE) { - neigh_arp_confirm(neighbour); + neigh_state_confirm(neighbour); neigh_entry_state_trans(neighbour, 0); } @@ -681,7 +616,7 @@ int neigh_resolve_output(struct in_addr *nexhop, struct rte_mbuf *m, return EDPVS_IDLE; } else{ - neighbour = neigh_add_table(nexhop_addr, NULL, port, hashkey, 0); + neighbour = neigh_add_table(af, nexhop, NULL, port, hashkey, 0); if(!neighbour){ RTE_LOG(ERR, NEIGHBOUR, "[%s] add neighbour wrong\n", __func__); rte_pktmbuf_free(m); @@ -702,7 +637,7 @@ int neigh_resolve_output(struct in_addr *nexhop, struct rte_mbuf *m, neighbour->que_num++; if (neighbour->state == DPVS_NUD_S_NONE) { - neigh_arp_confirm(neighbour); + neigh_state_confirm(neighbour); neigh_entry_state_trans(neighbour, 0); } @@ -734,20 +669,23 @@ static int neigh_ring_init(void) socket_id = rte_socket_id(); for (cid = 0; cid < DPVS_MAX_LCORE; cid++) { snprintf(name_buf, RTE_RING_NAMESIZE, "neigh_ring_c%d", cid); - neigh_ring[cid] = rte_ring_create(name_buf, MAC_RING_SIZE, socket_id, 
RING_F_SC_DEQ); + neigh_ring[cid] = rte_ring_create(name_buf, MAC_RING_SIZE, + socket_id, RING_F_SC_DEQ); if (neigh_ring[cid] == NULL) rte_panic("create ring:%s failed!\n", name_buf); } return EDPVS_OK; } -static struct raw_neigh* neigh_ring_clone_entry(const struct neighbour_entry* neighbour, bool add) +static struct raw_neigh* neigh_ring_clone_entry(const struct neighbour_entry* neighbour, + bool add) { struct raw_neigh* mac_param; mac_param = rte_zmalloc("mac_entry", sizeof(struct raw_neigh), RTE_CACHE_LINE_SIZE); if (mac_param == NULL) return NULL; - rte_memcpy(&mac_param->ip_addr, &neighbour->ip_addr, sizeof(struct in_addr)); + mac_param->af = neighbour->af; + rte_memcpy(&mac_param->ip_addr, &neighbour->ip_addr, sizeof(union inet_addr)); mac_param->flag = neighbour->flag & ~NEIGHBOUR_HASHED; mac_param->port = neighbour->port; mac_param->add = add; @@ -756,7 +694,8 @@ static struct raw_neigh* neigh_ring_clone_entry(const struct neighbour_entry* ne return mac_param; } -static struct raw_neigh* neigh_ring_clone_param(const struct dp_vs_neigh_conf *param, bool add) +static struct raw_neigh* neigh_ring_clone_param(const struct dp_vs_neigh_conf *param, + bool add) { struct netif_port *port; struct raw_neigh* mac_param; @@ -764,7 +703,8 @@ static struct raw_neigh* neigh_ring_clone_param(const struct dp_vs_neigh_conf *p mac_param = rte_zmalloc("mac_entry", sizeof(struct raw_neigh), RTE_CACHE_LINE_SIZE); if (mac_param == NULL) return NULL; - rte_memcpy(&mac_param->ip_addr, ¶m->ip_addr, sizeof(struct in_addr)); + mac_param->af = param->af; + rte_memcpy(&mac_param->ip_addr, ¶m->ip_addr, sizeof(union inet_addr)); mac_param->flag = param->flag | NEIGHBOUR_STATIC; mac_param->port = port; mac_param->add = add; @@ -772,6 +712,10 @@ static struct raw_neigh* neigh_ring_clone_param(const struct dp_vs_neigh_conf *p return mac_param; } +/* + *1, master core static neighbour sync slave core; + *2, ipv6 slave core sync slave core when recieve ns/na + */ void neigh_process_ring(void 
*arg) { struct raw_neigh *params[NETIF_MAX_PKT_BURST]; @@ -780,32 +724,37 @@ void neigh_process_ring(void *arg) struct neighbour_entry *neigh; struct raw_neigh *param; lcoreid_t cid = rte_lcore_id(); - nb_rb = rte_ring_dequeue_burst(neigh_ring[cid], (void **)params, NETIF_MAX_PKT_BURST, NULL); + nb_rb = rte_ring_dequeue_burst(neigh_ring[cid], (void **)params, + NETIF_MAX_PKT_BURST, NULL); if (nb_rb > 0) { int i; for (i = 0; i < nb_rb; i++) { param = params[i]; - hash = neigh_hashkey(param->ip_addr.s_addr, param->port); - neigh = neigh_lookup_entry(¶m->ip_addr.s_addr, param->port, hash); + hash = neigh_hashkey(param->af, ¶m->ip_addr, param->port); + neigh = neigh_lookup_entry(param->af, ¶m->ip_addr, + param->port, hash); if (param->add) { if (neigh) { - neigh_edit(neigh, ¶m->eth_addr, hash); + neigh_edit(neigh, ¶m->eth_addr); + if (!(param->flag & NEIGHBOUR_STATIC)) + neigh_entry_state_trans(neigh, 1); } else { - neigh = neigh_add_table(param->ip_addr.s_addr, ¶m->eth_addr, - param->port, hash, param->flag); - if ((cid == master_cid)&&(neigh)) { - num_neighbours++; - } + neigh = neigh_add_table(param->af, ¶m->ip_addr, + ¶m->eth_addr, param->port, + hash, param->flag); + if (!(param->flag & NEIGHBOUR_STATIC)) + neigh_entry_state_trans(neigh, 1); } - } - else { + + neigh_send_mbuf_cach(neigh); + + } else { if (neigh) { - if (!(neigh->flag & NEIGHBOUR_STATIC) && - (cid != master_cid)) + if (!(neigh->flag & NEIGHBOUR_STATIC)) dpvs_timer_cancel(&neigh->timer, false); - neigh_unhash(neigh); + struct neighbour_mbuf_entry *mbuf, *mbuf_next; list_for_each_entry_safe(mbuf, mbuf_next, &neigh->queue_list, neigh_mbuf_list) { @@ -814,8 +763,7 @@ void neigh_process_ring(void *arg) rte_free(mbuf); } rte_free(neigh); - if (cid == master_cid) - num_neighbours--; + neigh_nums[cid]--; } else RTE_LOG(WARNING, NEIGHBOUR, "%s: not exist\n", __func__); @@ -828,24 +776,88 @@ void neigh_process_ring(void *arg) /************************** used for dpip neighbour 
show***********************************/ static void neigh_fill_param(struct dp_vs_neigh_conf *param, - const struct neighbour_entry *entry) + const struct neighbour_entry *entry, + lcoreid_t cid) { - param->af = AF_INET; - param->ip_addr.in = entry->ip_addr; - param->flag = entry->flag; - ether_addr_copy(&entry->eth_addr,¶m->eth_addr); + param->af = entry->af; + param->ip_addr = entry->ip_addr; + param->flag = entry->flag; + ether_addr_copy(&entry->eth_addr, ¶m->eth_addr); param->que_num = entry->que_num; - param->state = entry->state; + param->state = entry->state; + param->cid = cid; + memcpy(¶m->ifname, entry->port->name, IFNAMSIZ); } -static int neigh_sockopt_get(sockoptid_t opt, const void *conf, size_t size, - void **out, size_t *outsize) +static void neigh_fill_array(struct netif_port *dev, lcoreid_t cid, + struct dp_vs_neigh_conf_array *array, + size_t neigh_nums) { - const struct dp_vs_neigh_conf *cf; - struct dp_vs_neigh_conf_array *array; - size_t hash, off; + int hash, off; struct neighbour_entry *entry; + + off = array->neigh_nums; + + if (dev) { + for (hash = 0; hash < NEIGH_TAB_SIZE; hash++) { + list_for_each_entry(entry, &neigh_table[cid][hash], neigh_list) { + if (dev == entry->port) { + if (off >= neigh_nums) { + RTE_LOG(WARNING, NEIGHBOUR, "%s: neigh num not match\n", __func__); + break; + } + + neigh_fill_param(&array->addrs[off++], entry, cid); + array->neigh_nums = off; + } + } + } + } else { + for (hash = 0; hash < NEIGH_TAB_SIZE; hash++) { + list_for_each_entry(entry, &neigh_table[cid][hash], neigh_list) { + if (off >= neigh_nums) { + RTE_LOG(WARNING, NEIGHBOUR, "%s: neigh num not match\n", __func__); + break; + } + + neigh_fill_param(&array->addrs[off++], entry, cid); + array->neigh_nums = off; + } + } + } +} + +static int get_neigh_uc_cb(struct dpvs_msg *msg) +{ + struct netif_port *dev = NULL; + struct dp_vs_neigh_conf_array *array; + int len; + lcoreid_t cid = rte_lcore_id(); + + if (msg->len) + dev = netif_port_get_by_name((char 
*)msg->data); + + len = sizeof(struct dp_vs_neigh_conf_array) + + sizeof(struct dp_vs_neigh_conf) * neigh_nums[cid]; + array = rte_zmalloc("neigh_array", len, RTE_CACHE_LINE_SIZE); + + neigh_fill_array(dev, cid, array, neigh_nums[cid]); + msg->reply.len = len; + msg->reply.data = (void *)array; + + return EDPVS_OK; +} + +static int neigh_sockopt_get(sockoptid_t opt, const void *conf, + size_t size, void **out, size_t *outsize) +{ + const struct dp_vs_neigh_conf *cf; + struct dp_vs_neigh_conf_array *array, *array_msg; + size_t off = 0; struct netif_port *port = NULL; + struct dpvs_msg *msg, *cur; + struct dpvs_multicast_queue *reply = NULL; + int neigh_nums_g = 0, err; if (conf && size >= sizeof(*cf)) cf = conf; @@ -861,32 +873,83 @@ static int neigh_sockopt_get(sockoptid_t opt, const void *conf, size_t size, } } + msg = msg_make(MSG_TYPE_NEIGH_GET, 0 , DPVS_MSG_MULTICAST, rte_lcore_id(), + port ? sizeof(port->name) : 0, port ? (&port->name) : NULL); + if (!msg) + return EDPVS_NOMEM; + err = multicast_msg_send(msg, 0, &reply); + if (err != EDPVS_OK) { + msg_destroy(&msg); + RTE_LOG(ERR, NEIGHBOUR, "%s: send message fail.\n", __func__); + return EDPVS_DROP; + } + + /* get neigh_num */ + list_for_each_entry(cur, &reply->mq, mq_node) { + array_msg = (struct dp_vs_neigh_conf_array *)(cur->data); + neigh_nums_g += array_msg->neigh_nums; + } *outsize = sizeof(struct dp_vs_neigh_conf_array) + \ - num_neighbours * sizeof(struct dp_vs_neigh_conf); + neigh_nums_g * sizeof(struct dp_vs_neigh_conf); *out = rte_calloc(NULL, 1, *outsize, RTE_CACHE_LINE_SIZE); - if (!(*out)) + if (!(*out)) { + msg_destroy(&msg); return EDPVS_NOMEM; - + } array = *out; - off = 0; - if (port) { - for (hash = 0; hash < ARP_TAB_SIZE; hash ++){ - list_for_each_entry(entry, &neigh_table[master_cid][hash], arp_list) { - if (port == entry->port) { - neigh_fill_param(&array->addrs[off++], entry); - } - } + /* copy neigh_array*/ + array->neigh_nums = neigh_nums_g; + list_for_each_entry(cur, &reply->mq, 
mq_node) { + array_msg = (struct dp_vs_neigh_conf_array *)(cur->data); + memcpy(&array->addrs[off], &array_msg->addrs, + array_msg->neigh_nums * sizeof(struct dp_vs_neigh_conf)); + off += array_msg->neigh_nums; + } + + msg_destroy(&msg); + + return EDPVS_OK; +} + +int neigh_sync_core(const void *param, bool add_del, enum param_kind kind) +{ + struct raw_neigh *mac_param; + int ret = 0; + lcoreid_t cid, i; + cid = rte_lcore_id(); + + for (i = 0; i < DPVS_MAX_LCORE; i++) { + if ((i == cid) || (!is_lcore_id_valid(i)) || (i == master_cid)) + continue; + switch (kind) { + case NEIGH_ENTRY: + mac_param = neigh_ring_clone_entry(param, add_del); + break; + case NEIGH_PARAM: + mac_param = neigh_ring_clone_param(param, add_del); + break; + default: + return EDPVS_NOTSUPP; } - } else { - for (hash = 0; hash < ARP_TAB_SIZE; hash ++){ - list_for_each_entry(entry, &neigh_table[master_cid][hash], arp_list) { - neigh_fill_param(&array->addrs[off++], entry); + + if (mac_param) { + ret = rte_ring_enqueue(neigh_ring[i], mac_param); + if (unlikely(-EDQUOT == ret)) { + RTE_LOG(WARNING, NEIGHBOUR, "%s: neigh ring quota exceeded\n", + __func__); + } else if (ret < 0) { + rte_free(mac_param); + RTE_LOG(WARNING, NEIGHBOUR, "%s: neigh ring enqueue failed\n", + __func__); + return EDPVS_DPDKAPIFAIL; } + } else { + RTE_LOG(WARNING, NEIGHBOUR, "%s: clone mac faild\n", __func__); + return EDPVS_NOMEM; } } - array->n_neigh = off; return EDPVS_OK; } @@ -895,21 +958,11 @@ static int neigh_sockopt_set(sockoptid_t opt, const void *conf, size_t size) { const struct dp_vs_neigh_conf *param = conf; struct netif_port *port; - struct neighbour_entry *neigh; - unsigned int hash; - struct neighbour_mbuf_entry *mbuf, *mbuf_next; - lcoreid_t cid, i; - cid = rte_lcore_id(); - struct raw_neigh *mac_param; - if (!conf || size < sizeof(*param)) return EDPVS_INVAL; - if (param->af != AF_INET) - return EDPVS_NOTSUPP; - - if (param->ip_addr.in.s_addr == htonl(INADDR_ANY)) + if (inet_is_addr_any(param->af, 
¶m->ip_addr)) return EDPVS_INVAL; port = netif_port_get_by_name(param->ifname); @@ -919,82 +972,19 @@ static int neigh_sockopt_set(sockoptid_t opt, const void *conf, size_t size) return EDPVS_INVAL; } - hash = neigh_hashkey(param->ip_addr.in.s_addr, port); - switch (opt) { case SOCKOPT_SET_NEIGH_ADD: - neigh = neigh_lookup_entry(¶m->ip_addr.in.s_addr, port, hash); - if (neigh) { - RTE_LOG(WARNING, NEIGHBOUR, "%s: already exist\n", __func__); - return EDPVS_EXIST; - } - - neigh = neigh_add_table(param->ip_addr.in.s_addr, ¶m->eth_addr, - port, hash, param->flag | NEIGHBOUR_STATIC); - - if (!neigh) { - RTE_LOG(WARNING, NEIGHBOUR, "%s: no memory\n", __func__); - return EDPVS_NOMEM; + if (EDPVS_OK != neigh_sync_core(param, 1, NEIGH_PARAM)) { + RTE_LOG(WARNING, NEIGHBOUR, "%s: sync failed\n", __func__); + return EDPVS_INVAL; } - - for(i = 0; i < DPVS_MAX_LCORE; i++) { - if ((i == cid) || (!is_lcore_id_valid(i))) - continue; - mac_param = neigh_ring_clone_param(param, 1); - if (mac_param) { - int ret = rte_ring_enqueue(neigh_ring[i], mac_param); - if (unlikely(-EDQUOT == ret)) - RTE_LOG(WARNING, NEIGHBOUR, "%s: neigh ring quota exceeded\n", - __func__); - else if (ret < 0) { - rte_free(mac_param); - RTE_LOG(WARNING, NEIGHBOUR, "%s: neigh ring enqueue failed\n", - __func__); - } - } - else - RTE_LOG(WARNING, NEIGHBOUR, "%s: clone mac faild\n", __func__); - } - - num_neighbours++; - break; case SOCKOPT_SET_NEIGH_DEL: - neigh = neigh_lookup_entry(¶m->ip_addr.in.s_addr, port, hash); - if (!neigh) { - RTE_LOG(WARNING, NEIGHBOUR, "%s: not exist\n", __func__); - return EDPVS_NOTEXIST; - } - - neigh_unhash(neigh); - list_for_each_entry_safe(mbuf, mbuf_next, - &neigh->queue_list, neigh_mbuf_list) { - list_del(&mbuf->neigh_mbuf_list); - rte_pktmbuf_free(mbuf->m); - rte_free(mbuf); - } - rte_free(neigh); - num_neighbours--; - - for(i = 0; i < DPVS_MAX_LCORE; i++) { - if ((i == cid) || (!is_lcore_id_valid(i))) - continue; - mac_param = neigh_ring_clone_param(param, 0); - if 
(mac_param) { - int ret = rte_ring_enqueue(neigh_ring[i], mac_param); - if (unlikely(-EDQUOT == ret)) - RTE_LOG(WARNING, NEIGHBOUR, "%s: neigh ring quota exceeded\n", - __func__); - else if (ret < 0) { - rte_free(mac_param); - RTE_LOG(WARNING, NEIGHBOUR, "%s: neigh ring enqueue failed\n", - __func__); - } - } - else - RTE_LOG(WARNING, NEIGHBOUR, "%s: clone mac faild\n", __func__); + if (EDPVS_OK != neigh_sync_core(param, 0, NEIGH_PARAM)) { + RTE_LOG(WARNING, NEIGHBOUR, "%s: sync failed\n", __func__); + return EDPVS_INVAL; } break; @@ -1023,26 +1013,14 @@ static int arp_init(void) { int i, j; int err; - uint64_t lcore_mask; - lcoreid_t cid; for (i = 0; i < DPVS_MAX_LCORE; i++) { - for (j = 0; j < ARP_TAB_SIZE; j++) { + for (j = 0; j < NEIGH_TAB_SIZE; j++) { INIT_LIST_HEAD(&neigh_table[i][j]); } } - /*choose one core to sync master*/ - netif_get_slave_lcores(NULL, &lcore_mask); - - for (cid = 0 ; cid < DPVS_MAX_LCORE; cid++) { - if (lcore_mask & (1L << cid)) { - g_cid = cid; - break; - } - } - master_cid = rte_lcore_id(); arp_pkt_type.type = rte_cpu_to_be_16(ETHER_TYPE_ARP); @@ -1066,17 +1044,41 @@ static int arp_init(void) return EDPVS_OK; } +static void register_stats_cb(void) +{ + struct dpvs_msg_type mt; + memset(&mt, 0 , sizeof(mt)); + mt.type = MSG_TYPE_NEIGH_GET; + mt.unicast_msg_cb = get_neigh_uc_cb; + mt.multicast_msg_cb = NULL; + assert(msg_type_mc_register(&mt) == 0); +} + +static void unregister_stats_cb(void) +{ + struct dpvs_msg_type mt; + memset(&mt, 0, sizeof(mt)); + mt.type = MSG_TYPE_NEIGH_GET; + mt.unicast_msg_cb = get_neigh_uc_cb; + mt.multicast_msg_cb = NULL; + assert(msg_type_mc_unregister(&mt) == 0); +} + int neigh_init(void) { if(EDPVS_NOMEM == arp_init()){ return EDPVS_NOMEM; } + register_stats_cb(); + return EDPVS_OK; } int neigh_term(void) { + unregister_stats_cb(); + return EDPVS_OK; } diff --git a/src/netif.c b/src/netif.c index 2635c77a1..20ed57d26 100644 --- a/src/netif.c +++ b/src/netif.c @@ -93,6 +93,10 @@ struct port_conf_stream { 
int tx_queue_nb; int tx_desc_nb; + enum rte_fdir_mode fdir_mode; + enum rte_fdir_pballoc_type fdir_pballoc; + enum rte_fdir_status_mode fdir_status; + bool promisc_mode; struct list_head port_list_node; @@ -257,6 +261,9 @@ static void device_handler(vector_t tokens) port_cfg->tx_desc_nb = NETIF_NB_TX_DESC_DEF; port_cfg->promisc_mode = false; strncpy(port_cfg->rss, "tcp", sizeof(port_cfg->rss)); + port_cfg->fdir_mode = RTE_FDIR_MODE_PERFECT; + port_cfg->fdir_pballoc = RTE_FDIR_PBALLOC_64K; + port_cfg->fdir_status = RTE_FDIR_REPORT_STATUS; list_add(&port_cfg->port_list_node, &port_list); } @@ -371,6 +378,102 @@ static void tx_desc_nb_handler(vector_t tokens) FREE_PTR(str); } +static void fdir_mode_handler(vector_t tokens) +{ + char *mode, *str = set_value(tokens); + struct port_conf_stream *current_device = list_entry(port_list.next, + struct port_conf_stream, port_list_node); + bool use_default = false; + assert(str); + + mode = strlwr(str); + + if (!strncmp(mode, "none", sizeof("none"))) + current_device->fdir_mode = RTE_FDIR_MODE_NONE; + else if (!strncmp(mode, "signature", sizeof("signature"))) + current_device->fdir_mode = RTE_FDIR_MODE_SIGNATURE; + else if (!strncmp(mode, "perfect", sizeof("perfect"))) + current_device->fdir_mode = RTE_FDIR_MODE_PERFECT; + else if (!strncmp(mode, "perfect_mac_vlan", sizeof("perfect_mac_vlan"))) + current_device->fdir_mode = RTE_FDIR_MODE_PERFECT_MAC_VLAN; + else if (!strncmp(mode, "perfect_tunnel", sizeof("perfect_tunnel"))) + current_device->fdir_mode = RTE_FDIR_MODE_PERFECT_TUNNEL; + else { + use_default = true; + current_device->fdir_mode = RTE_FDIR_MODE_PERFECT; + } + + if (use_default) + RTE_LOG(WARNING, NETIF, "invalid %s:fdir_mode '%s', " + "use default 'perfect'\n", current_device->name, mode); + else + RTE_LOG(INFO, NETIF, "%s:fdir_mode = %s\n", current_device->name, mode); + + FREE_PTR(str); +} + +static void fdir_pballoc_handler(vector_t tokens) +{ + char *pballoc, *str = set_value(tokens); + struct port_conf_stream 
*current_device = list_entry(port_list.next, + struct port_conf_stream, port_list_node); + bool use_default = false; + assert(str); + + pballoc = strlwr(str); + + if (!strncmp(pballoc, "64k", sizeof("64k"))) + current_device->fdir_pballoc = RTE_FDIR_PBALLOC_64K; + else if (!strncmp(pballoc, "128k", sizeof("128k"))) + current_device->fdir_pballoc = RTE_FDIR_PBALLOC_128K; + else if (!strncmp(pballoc, "256k", sizeof("256k"))) + current_device->fdir_pballoc = RTE_FDIR_PBALLOC_256K; + else { + use_default = true; + current_device->fdir_pballoc = RTE_FDIR_PBALLOC_64K; + } + + if (use_default) + RTE_LOG(WARNING, NETIF, "invalid %s:fdir_pballoc '%s', " + "use default '64k'\n", current_device->name, pballoc); + else + RTE_LOG(INFO, NETIF, "%s:fdir_pballoc = %s\n", + current_device->name, pballoc); + + FREE_PTR(str); +} + +static void fdir_status_handler(vector_t tokens) +{ + char *status, *str = set_value(tokens); + struct port_conf_stream *current_device = list_entry(port_list.next, + struct port_conf_stream, port_list_node); + bool use_default = false; + assert(str); + + status = strlwr(str); + + if (!strncmp(status, "close", sizeof("close"))) + current_device->fdir_status = RTE_FDIR_NO_REPORT_STATUS; + else if (!strncmp(status, "matched", sizeof("matched"))) + current_device->fdir_status = RTE_FDIR_REPORT_STATUS; + else if (!strncmp(status, "always", sizeof("always"))) + current_device->fdir_status = RTE_FDIR_REPORT_STATUS_ALWAYS; + else { + use_default = true; + current_device->fdir_status = RTE_FDIR_REPORT_STATUS; + } + + if (use_default) + RTE_LOG(WARNING, NETIF, "invalid %s:fdir_status '%s', " + "use default 'matched'\n", current_device->name, status); + else + RTE_LOG(INFO, NETIF, "%s:fdir_status = %s\n", + current_device->name, status); + + FREE_PTR(str); +} + static void promisc_mode_handler(vector_t tokens) { struct port_conf_stream *current_device = list_entry(port_list.next, @@ -736,6 +839,12 @@ void install_netif_keywords(void) install_keyword("queue_number", 
tx_queue_number_handler, KW_TYPE_INIT); install_keyword("descriptor_number", tx_desc_nb_handler, KW_TYPE_INIT); install_sublevel_end(); + install_keyword("fdir", NULL, KW_TYPE_INIT); + install_sublevel(); + install_keyword("mode", fdir_mode_handler, KW_TYPE_INIT); + install_keyword("pballoc", fdir_pballoc_handler, KW_TYPE_INIT); + install_keyword("status", fdir_status_handler, KW_TYPE_INIT); + install_sublevel_end(); install_keyword("promisc_mode", promisc_mode_handler, KW_TYPE_INIT); install_keyword("kni_name", kni_name_handler, KW_TYPE_INIT); install_sublevel_end(); @@ -2671,8 +2780,8 @@ struct netif_port *netif_alloc(size_t priv_size, const char *namefmt, return NULL; } dev->in_ptr->dev = dev; - dev->in_ptr->af = AF_INET; INIT_LIST_HEAD(&dev->in_ptr->ifa_list); + INIT_LIST_HEAD(&dev->in_ptr->ifm_list); if (tc_init_dev(dev) != EDPVS_OK) { RTE_LOG(ERR, NETIF, "%s: fail to init TC\n", __func__); @@ -2772,6 +2881,52 @@ static int dpdk_filter_supported(struct netif_port *dev, enum rte_filter_type fl return rte_eth_dev_filter_supported(dev->id, fltype); } +void netif_mask_fdir_filter(int af, const struct netif_port *port, + struct rte_eth_fdir_filter *filt) +{ + struct rte_eth_fdir_info fdir_info; + const struct rte_eth_fdir_masks *fmask; + union rte_eth_fdir_flow *flow = &filt->input.flow; + + if (rte_eth_dev_filter_ctrl(port->id, RTE_ETH_FILTER_FDIR, + RTE_ETH_FILTER_INFO, &fdir_info) < 0) { + RTE_LOG(WARNING, NETIF, "%s: Fail to fetch fdir info of %s !\n", + __func__, port->name); + return; + } + fmask = &fdir_info.mask; + + /* ipv4 flow */ + if (af == AF_INET) { + flow->ip4_flow.src_ip &= fmask->ipv4_mask.src_ip; + flow->ip4_flow.dst_ip &= fmask->ipv4_mask.dst_ip; + flow->ip4_flow.tos &= fmask->ipv4_mask.tos; + flow->ip4_flow.ttl &= fmask->ipv4_mask.ttl; + flow->ip4_flow.proto &= fmask->ipv4_mask.proto; + flow->tcp4_flow.src_port &= fmask->src_port_mask; + flow->tcp4_flow.dst_port &= fmask->dst_port_mask; + return; + } + + /* ipv6 flow */ + if (af == AF_INET6) { + 
flow->ipv6_flow.src_ip[0] &= fmask->ipv6_mask.src_ip[0]; + flow->ipv6_flow.src_ip[1] &= fmask->ipv6_mask.src_ip[1]; + flow->ipv6_flow.src_ip[2] &= fmask->ipv6_mask.src_ip[2]; + flow->ipv6_flow.src_ip[3] &= fmask->ipv6_mask.src_ip[3]; + flow->ipv6_flow.dst_ip[0] &= fmask->ipv6_mask.dst_ip[0]; + flow->ipv6_flow.dst_ip[1] &= fmask->ipv6_mask.dst_ip[1]; + flow->ipv6_flow.dst_ip[2] &= fmask->ipv6_mask.dst_ip[2]; + flow->ipv6_flow.dst_ip[3] &= fmask->ipv6_mask.dst_ip[3]; + flow->ipv6_flow.tc &= fmask->ipv6_mask.tc; + flow->ipv6_flow.proto &= fmask->ipv6_mask.proto; + flow->ipv6_flow.hop_limits &= fmask->ipv6_mask.hop_limits; + flow->tcp6_flow.src_port &= fmask->src_port_mask; + flow->tcp6_flow.dst_port &= fmask->dst_port_mask; + return; + } +} + static int dpdk_set_fdir_filt(struct netif_port *dev, enum rte_filter_op op, const struct rte_eth_fdir_filter *filt) { @@ -2901,8 +3056,8 @@ static struct netif_port* netif_rte_port_alloc(portid_t id, int nrxq, return NULL; } port->in_ptr->dev = port; - port->in_ptr->af = AF_INET; INIT_LIST_HEAD(&port->in_ptr->ifa_list); + INIT_LIST_HEAD(&port->in_ptr->ifm_list); if (tc_init_dev(port) != EDPVS_OK) { RTE_LOG(ERR, NETIF, "%s: fail to init TC\n", __func__); @@ -3131,6 +3286,10 @@ static void fill_port_config(struct netif_port *port, char *promisc_on) else if (!strcmp(cfg_stream->rss, "tunnel")) port->dev_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_TUNNEL; + port->dev_conf.fdir_conf.mode = cfg_stream->fdir_mode; + port->dev_conf.fdir_conf.pballoc = cfg_stream->fdir_pballoc; + port->dev_conf.fdir_conf.status = cfg_stream->fdir_status; + if (cfg_stream->rx_queue_nb > 0 && port->nrxq > cfg_stream->rx_queue_nb) { RTE_LOG(WARNING, NETIF, "%s: rx-queues(%d) configured in workers != " "rx-queues(%d) configured in device, setup %d rx-queues for %s\n", @@ -3231,6 +3390,17 @@ static int add_bond_slaves(struct netif_port *port) return EDPVS_OK; } +/* flush FDIR filters for all physical dpdk ports */ +static int fdir_filter_flush(const struct 
netif_port *port) +{ + if (!port || port->type != PORT_TYPE_GENERAL) + return EDPVS_OK; + if (rte_eth_dev_filter_ctrl(port->id, RTE_ETH_FILTER_FDIR, + RTE_ETH_FILTER_FLUSH, NULL) < 0) + return EDPVS_DPDKAPIFAIL; + return EDPVS_OK; +} + /* * Note: Invoke the function after port is allocated and lcores are configured. */ @@ -3354,6 +3524,20 @@ int netif_port_start(struct netif_port *port) if (port->type == PORT_TYPE_BOND_MASTER) update_bond_macaddr(port); + /* add in6_addr multicast address */ + ret = idev_add_mcast_init(port); + if (ret != EDPVS_OK) { + RTE_LOG(WARNING, NETIF, "multicast address add failed for device %s\n", port->name); + return ret; + } + + /* flush FDIR filters */ + ret = fdir_filter_flush(port); + if (ret != EDPVS_OK) { + RTE_LOG(WARNING, NETIF, "fail to flush FDIR filters for device %s\n", port->name); + return ret; + } + return EDPVS_OK; } @@ -3536,16 +3720,15 @@ static struct rte_eth_conf default_port_conf = { .src_ip = 0x00000000, .dst_ip = 0xFFFFFFFF, }, + .ipv6_mask = { + .src_ip = { 0, 0, 0, 0 }, + .dst_ip = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + }, .src_port_mask = 0x0000, /* to be changed according to slave lcore number in use */ -#if RTE_VERSION >= 0x10040010 //VERSION_NUM(16, 4, 0, 16), dpdk-16.04 .dst_port_mask = 0x00F8, -#else - /* alert!!! 
port mask is host byte order while - * filter is network byte order */ - .dst_port_mask = 0xF800, // previous dpdk version -#endif + .mac_addr_byte_mask = 0x00, .tunnel_type_mask = 0, .tunnel_id_mask = 0, @@ -3598,10 +3781,23 @@ int netif_print_port_conf(const struct rte_eth_conf *port_conf, char *buf, int * } memset(tbuf1, 0, sizeof(tbuf1)); - snprintf(tbuf1, sizeof(tbuf1), "ipv4_src_ip: %#8x\nipv4_dst_ip: %#8x\n" - "src_port: %#4x\ndst_port: %#4x", port_conf->fdir_conf.mask.ipv4_mask.src_ip, - port_conf->fdir_conf.mask.ipv4_mask.dst_ip, port_conf->fdir_conf.mask.src_port_mask, - port_conf->fdir_conf.mask.dst_port_mask); + snprintf(tbuf1, sizeof(tbuf1), + "fdir ipv4 mask: src 0x%08x dst 0x%08x\n" + "fdir ipv6 mask: src 0x%08x:%08x:%08x:%08x dst 0x%08x:%08x:%08x:%08x\n" + "fdir port mask: src 0x%04x dst 0x%04x\n", + port_conf->fdir_conf.mask.ipv4_mask.src_ip, + port_conf->fdir_conf.mask.ipv4_mask.dst_ip, + port_conf->fdir_conf.mask.ipv6_mask.src_ip[0], + port_conf->fdir_conf.mask.ipv6_mask.src_ip[1], + port_conf->fdir_conf.mask.ipv6_mask.src_ip[2], + port_conf->fdir_conf.mask.ipv6_mask.src_ip[3], + port_conf->fdir_conf.mask.ipv6_mask.dst_ip[0], + port_conf->fdir_conf.mask.ipv6_mask.dst_ip[1], + port_conf->fdir_conf.mask.ipv6_mask.dst_ip[2], + port_conf->fdir_conf.mask.ipv6_mask.dst_ip[3], + port_conf->fdir_conf.mask.src_port_mask, + port_conf->fdir_conf.mask.dst_port_mask + ); if (*len - strlen(buf) - 1 < strlen(tbuf1)) { RTE_LOG(WARNING, NETIF, "[%s] no enough buf\n", __func__); return EDPVS_INVAL; @@ -3823,7 +4019,7 @@ int netif_lcore_start(void) */ static int obtain_dpdk_bond_name(char *dst, const char *ori, size_t size) { - char str[DEVICE_NAME_MAX_LEN]; + char str[IFNAMSIZ]; unsigned num; if (!ori || sscanf(ori, "%[_a-zA-Z]%u", str, &num) != 2) @@ -3890,8 +4086,8 @@ int netif_virtual_devices_add(void) return EDPVS_INVAL; } - char dummy_name[DEVICE_NAME_MAX_LEN] = {'\0'}; - int rc = obtain_dpdk_bond_name(dummy_name, bond_cfg->name, DEVICE_NAME_MAX_LEN); + char 
dummy_name[IFNAMSIZ] = {'\0'}; + int rc = obtain_dpdk_bond_name(dummy_name, bond_cfg->name, IFNAMSIZ); if (rc != EDPVS_OK) { RTE_LOG(ERR, NETIF, "%s: wrong bond device name in config file %s\n", __func__, bond_cfg->name); diff --git a/src/route.c b/src/route.c index 89231590e..e247292b7 100644 --- a/src/route.c +++ b/src/route.c @@ -419,12 +419,12 @@ struct route_entry *route4_output(const struct flow4 *fl4) struct route_entry *route; - route = route_out_local_lookup(fl4->daddr.s_addr); + route = route_out_local_lookup(fl4->fl4_daddr.s_addr); if(route){ return route; } - - route = route_out_net_lookup(&fl4->daddr); + + route = route_out_net_lookup(&fl4->fl4_daddr); if(route){ return route; } diff --git a/src/sa_pool.c b/src/sa_pool.c index 8633c894b..b2b614ebd 100644 --- a/src/sa_pool.c +++ b/src/sa_pool.c @@ -48,8 +48,10 @@ #include "inet.h" #include "netif.h" #include "route.h" +#include "route6.h" #include "ctrl.h" #include "sa_pool.h" +#include "linux_ipv6.h" #include "parser/parser.h" #include "parser/vector.h" @@ -113,7 +115,7 @@ struct sa_pool { /* hashed pools by dest's . if no dest provided, * just use first pool. it's not need create/destroy pool - * for each dest, that'll be to complicated. */ + * for each dest, that'll be too complicated. 
*/ struct sa_entry_pool *pool_hash; uint8_t pool_hash_sz; @@ -138,30 +140,41 @@ static uint64_t sa_lcore_mask; static uint8_t sa_pool_hash_size = SAPOOL_DEF_HASH_SZ; -static int __add_del_filter(struct netif_port *dev, lcoreid_t cid, - __be32 dip, __be16 dport, +static int __add_del_filter(int af, struct netif_port *dev, lcoreid_t cid, + const union inet_addr *dip, __be16 dport, uint32_t filter_id[MAX_FDIR_PROTO], bool add) { struct rte_eth_fdir_filter filt[MAX_FDIR_PROTO] = { { - .input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_TCP, - .input.flow.tcp4_flow.ip.dst_ip = dip, - .input.flow.tcp4_flow.dst_port = dport, - .action.behavior = RTE_ETH_FDIR_ACCEPT, .action.report_status = RTE_ETH_FDIR_REPORT_ID, .soft_id = filter_id[0], }, { - .input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP, - .input.flow.udp4_flow.ip.dst_ip = dip, - .input.flow.udp4_flow.dst_port = dport, - .action.behavior = RTE_ETH_FDIR_ACCEPT, .action.report_status = RTE_ETH_FDIR_REPORT_ID, .soft_id = filter_id[1], }, }; + + if (af == AF_INET) { + filt[0].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_TCP; + filt[0].input.flow.tcp4_flow.ip.dst_ip = dip->in.s_addr; + filt[0].input.flow.tcp4_flow.dst_port = dport; + filt[1].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP; + filt[1].input.flow.udp4_flow.ip.dst_ip = dip->in.s_addr; + filt[1].input.flow.udp4_flow.dst_port = dport; + } else if (af == AF_INET6) { + filt[0].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV6_TCP; + memcpy(filt[0].input.flow.ipv6_flow.dst_ip, &dip->in6, sizeof(struct in6_addr)); + filt[0].input.flow.tcp6_flow.dst_port = dport; + filt[1].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV6_UDP; + memcpy(filt[1].input.flow.ipv6_flow.dst_ip, &dip->in6, sizeof(struct in6_addr)); + filt[1].input.flow.udp6_flow.dst_port = dport; + } else { + return EDPVS_NOTSUPP; + } + queueid_t queue; int err; enum rte_filter_op op, rop; @@ -190,6 +203,9 @@ static int __add_del_filter(struct netif_port *dev, lcoreid_t cid, filt[0].action.rx_queue = filt[1].action.rx_queue 
= queue; op = add ? RTE_ETH_FILTER_ADD : RTE_ETH_FILTER_DELETE; + netif_mask_fdir_filter(af, dev, &filt[0]); + netif_mask_fdir_filter(af, dev, &filt[1]); + err = netif_fdir_filter_set(dev, op, &filt[0]); if (err != EDPVS_OK) return err; @@ -202,28 +218,30 @@ static int __add_del_filter(struct netif_port *dev, lcoreid_t cid, } #ifdef CONFIG_DPVS_SAPOOL_DEBUG - RTE_LOG(DEBUG, SAPOOL, "FDIR: %s %s TCP/UDP " - "ip %s port %d (0x%04x) mask 0x%04X queue %d lcore %2d\n", + RTE_LOG(DEBUG, SAPOOL, "FDIR: %s %s %s TCP/UDP " + "ip %s port %d (0x%04x) mask 0x%04X queue %d lcore %2d filterID %d/%d\n", add ? "add" : "del", dev->name, - inet_ntop(AF_INET, &dip, ipaddr, sizeof(ipaddr)) ? : "::", - ntohs(dport), ntohs(dport), sa_fdirs[cid].mask, queue, cid); + af == AF_INET ? "IPv4" : "IPv6", + inet_ntop(af, dip, ipaddr, sizeof(ipaddr)) ? : "::", + ntohs(dport), ntohs(dport), sa_fdirs[cid].mask, queue, cid, + filter_id[0], filter_id[1]); #endif return err; } -static inline int sa_add_filter(struct netif_port *dev, lcoreid_t cid, - __be32 dip, __be16 dport, +static inline int sa_add_filter(int af, struct netif_port *dev, lcoreid_t cid, + const union inet_addr *dip, __be16 dport, uint32_t filter_id[MAX_FDIR_PROTO]) { - return __add_del_filter(dev, cid, dip, dport, filter_id, true); + return __add_del_filter(af, dev, cid, dip, dport, filter_id, true); } -static inline int sa_del_filter(struct netif_port *dev, lcoreid_t cid, - __be32 dip, __be16 dport, +static inline int sa_del_filter(int af, struct netif_port *dev, lcoreid_t cid, + const union inet_addr *dip, __be16 dport, uint32_t filter_id[MAX_FDIR_PROTO]) { - return __add_del_filter(dev, cid, dip, dport, filter_id, false); + return __add_del_filter(af, dev, cid, dip, dport, filter_id, false); } static int sa_pool_alloc_hash(struct sa_pool *ap, uint8_t hash_sz, @@ -317,7 +335,8 @@ int sa_pool_create(struct inet_ifaddr *ifa, uint16_t low, uint16_t high) /* if add filter failed, waste some soft-id is acceptable. 
*/ filtids[0] = fdir->soft_id++; filtids[1] = fdir->soft_id++; - err = sa_add_filter(ifa->idev->dev, cid, ifa->addr.in.s_addr, + + err = sa_add_filter(ifa->af, ifa->idev->dev, cid, &ifa->addr, fdir->port_base, filtids); if (err != EDPVS_OK) { sa_pool_free_hash(ap); @@ -363,7 +382,7 @@ int sa_pool_destroy(struct inet_ifaddr *ifa) return EDPVS_BUSY; } - sa_del_filter(ifa->idev->dev, cid, ifa->addr.in.s_addr, + sa_del_filter(ifa->af, ifa->idev->dev, cid, &ifa->addr, fdir->port_base, ap->filter_id); sa_pool_free_hash(ap); rte_free(ap); @@ -376,35 +395,49 @@ int sa_pool_destroy(struct inet_ifaddr *ifa) return EDPVS_OK; } - /* hash dest's . if no dest provided, just use first pool. */ static inline struct sa_entry_pool * -sa_pool_hash(const struct sa_pool *ap, const struct sockaddr_in *sin) +sa_pool_hash(const struct sa_pool *ap, const struct sockaddr_storage *ss) { - uint16_t vect[2]; + uint32_t hashkey; assert(ap && ap->pool_hash && ap->pool_hash_sz >= 1); - - if (!sin) + if (!ss) return &ap->pool_hash[0]; - vect[0] = ntohl(sin->sin_addr.s_addr) & 0xffff; - vect[1] = ntohs(sin->sin_port); + if (ss->ss_family == AF_INET) { + uint16_t vect[2]; + const struct sockaddr_in *sin = (const struct sockaddr_in *)ss; - return &ap->pool_hash[(vect[0] + vect[1]) % ap->pool_hash_sz]; + vect[0] = ntohl(sin->sin_addr.s_addr) & 0xffff; + vect[1] = ntohs(sin->sin_port); + hashkey = (vect[0] + vect[1]) % ap->pool_hash_sz; + + return &ap->pool_hash[hashkey]; + } else if (ss->ss_family == AF_INET6) { + uint32_t vect[5] = { 0 }; + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)ss; + + vect[0] = sin6->sin6_port; + memcpy(&vect[1], &sin6->sin6_addr, 16); + hashkey = rte_jhash_32b(vect, 5, sin6->sin6_family) % ap->pool_hash_sz; + + return &ap->pool_hash[hashkey]; + } else { + return NULL; + } } -/* - * this API support IPv4 only. - * sockaddr is not safe use sockaddr_storage if need proto-independent. 
- */ static inline int sa_pool_fetch(struct sa_entry_pool *pool, - struct sockaddr_in *sin) + struct sockaddr_storage *ss) { + assert(pool && ss); + struct sa_entry *ent; + struct sockaddr_in *sin = (struct sockaddr_in *)ss; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss; #ifdef CONFIG_DPVS_SAPOOL_DEBUG char addr[64]; #endif - assert(pool && sin); ent = list_first_entry_or_null(&pool->free_enties, struct sa_entry, list); if (!ent) { @@ -417,9 +450,17 @@ static inline int sa_pool_fetch(struct sa_entry_pool *pool, return EDPVS_RESOURCE; } - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = ent->addr.in.s_addr; - sin->sin_port = ent->port; + if (ss->ss_family == AF_INET) { + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = ent->addr.in.s_addr; + sin->sin_port = ent->port; + } else if (ss->ss_family == AF_INET6) { + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = ent->addr.in6; + sin6->sin6_port = ent->port; + } else { + return EDPVS_NOTSUPP; + } ent->flags |= SA_F_USED; list_move_tail(&ent->list, &pool->used_enties); @@ -428,7 +469,7 @@ static inline int sa_pool_fetch(struct sa_entry_pool *pool, #ifdef CONFIG_DPVS_SAPOOL_DEBUG RTE_LOG(DEBUG, SAPOOL, "%s: %s:%d fetched!\n", __func__, - inet_ntop(AF_INET, &ent->addr.in, addr, sizeof(addr)) ? : NULL, + inet_ntop(ss->ss_family, &ent->addr, addr, sizeof(addr)) ? 
: NULL, ntohs(ent->port)); #endif @@ -436,14 +477,24 @@ static inline int sa_pool_fetch(struct sa_entry_pool *pool, } static inline int sa_pool_release(struct sa_entry_pool *pool, - const struct sockaddr_in *sin) + const struct sockaddr_storage *ss) { - assert(pool && sin); + assert(pool && ss); + struct sa_entry *ent; - __be16 port = ntohs(sin->sin_port); + const struct sockaddr_in *sin = (const struct sockaddr_in *)ss; + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)ss; + __be16 port; #ifdef CONFIG_DPVS_SAPOOL_DEBUG char addr[64]; #endif + + if (ss->ss_family == AF_INET) + port = ntohs(sin->sin_port); + else if (ss->ss_family == AF_INET6) + port = ntohs(sin6->sin6_port); + else + return EDPVS_NOTSUPP; assert(port > 0 && port < MAX_PORT); /* it's too slow to traverse the used_enties list @@ -455,8 +506,12 @@ static inline int sa_pool_release(struct sa_entry_pool *pool, return EDPVS_INVAL; } - assert(ent->addr.in.s_addr == sin->sin_addr.s_addr && - ent->port == sin->sin_port); + if (ss->ss_family == AF_INET) + assert(ent->addr.in.s_addr == sin->sin_addr.s_addr && + ent->port == sin->sin_port); + else + assert(ipv6_addr_equal(&ent->addr.in6, &sin6->sin6_addr) && + ent->port == sin6->sin6_port); ent->flags &= (~SA_F_USED); list_move_tail(&ent->list, &pool->free_enties); @@ -465,7 +520,7 @@ static inline int sa_pool_release(struct sa_entry_pool *pool, #ifdef CONFIG_DPVS_SAPOOL_DEBUG RTE_LOG(DEBUG, SAPOOL, "%s: %s:%d released!\n", __func__, - inet_ntop(AF_INET, &ent->addr.in, addr, sizeof(addr)) ? : NULL, + inet_ntop(ss->ss_family, &ent->addr, addr, sizeof(addr)) ? : NULL, ntohs(ent->port)); #endif @@ -496,9 +551,11 @@ static inline int sa_pool_release(struct sa_entry_pool *pool, * * daddr is a hint to found dev/saddr (by route/netif module). * dev is also a hint, the saddr(ifa) is the key. + * af is needed when both saddr and daddr are NULL. 
*/ -int sa_fetch(struct netif_port *dev, const struct sockaddr_in *daddr, - struct sockaddr_in *saddr) +static int sa4_fetch(struct netif_port *dev, + const struct sockaddr_in *daddr, + struct sockaddr_in *saddr) { struct inet_ifaddr *ifa; struct flow4 fl; @@ -507,24 +564,24 @@ int sa_fetch(struct netif_port *dev, const struct sockaddr_in *daddr, assert(saddr); if (saddr && saddr->sin_addr.s_addr != INADDR_ANY && saddr->sin_port != 0) - return 0; /* everything is known, why call this function ? */ + return EDPVS_OK; /* everything is known, why call this function ? */ /* if source IP is assiged, we can find ifa->this_sa_pool * without @daddr and @dev. */ if (saddr->sin_addr.s_addr) { - ifa = inet_addr_ifa_get(AF_INET, dev, - (union inet_addr*)&saddr->sin_addr); + ifa = inet_addr_ifa_get(AF_INET, dev, (union inet_addr*)&saddr->sin_addr); if (!ifa) return EDPVS_NOTEXIST; if (!ifa->this_sa_pool) { - RTE_LOG(WARNING, SAPOOL, "%s: fetch addr on IP without pool.", - __func__); + RTE_LOG(WARNING, SAPOOL, "%s: fetch addr on IP without pool.", __func__); inet_addr_ifa_put(ifa); return EDPVS_INVAL; } - err = sa_pool_fetch(sa_pool_hash(ifa->this_sa_pool, daddr), saddr); + err = sa_pool_fetch(sa_pool_hash(ifa->this_sa_pool, + (struct sockaddr_storage *)daddr), + (struct sockaddr_storage *)saddr); if (err == EDPVS_OK) rte_atomic32_inc(&ifa->this_sa_pool->refcnt); inet_addr_ifa_put(ifa); @@ -533,9 +590,9 @@ int sa_fetch(struct netif_port *dev, const struct sockaddr_in *daddr, /* try to find source ifa by @dev and @daddr */ memset(&fl, 0, sizeof(struct flow4)); - fl.oif = dev; - fl.daddr.s_addr = daddr ? daddr->sin_addr.s_addr : htonl(INADDR_ANY); - fl.saddr.s_addr = saddr ? saddr->sin_addr.s_addr : htonl(INADDR_ANY); + fl.fl4_oif = dev; + fl.fl4_daddr.s_addr = daddr ? daddr->sin_addr.s_addr : htonl(INADDR_ANY); + fl.fl4_saddr.s_addr = saddr ? 
saddr->sin_addr.s_addr : htonl(INADDR_ANY); rt = route4_output(&fl); if (!rt) return EDPVS_NOROUTE;; @@ -560,7 +617,88 @@ int sa_fetch(struct netif_port *dev, const struct sockaddr_in *daddr, } /* do fetch socket address */ - err = sa_pool_fetch(sa_pool_hash(ifa->this_sa_pool, daddr), saddr); + err = sa_pool_fetch(sa_pool_hash(ifa->this_sa_pool, + (struct sockaddr_storage *)daddr), + (struct sockaddr_storage *)saddr); + if (err == EDPVS_OK) + rte_atomic32_inc(&ifa->this_sa_pool->refcnt); + + inet_addr_ifa_put(ifa); + return err; +} + +static int sa6_fetch(struct netif_port *dev, + const struct sockaddr_in6 *daddr, + struct sockaddr_in6 *saddr) +{ + struct inet_ifaddr *ifa; + struct flow6 fl6; + struct route6 *rt6; + int err; + assert(saddr); + + if (saddr && !ipv6_addr_any(&saddr->sin6_addr) && saddr->sin6_port != 0) + return EDPVS_OK; /* everything is known, why call this function ? */ + + /* if source IP is assiged, we can find ifa->this_sa_pool + * without @daddr and @dev. */ + if (!ipv6_addr_any(&saddr->sin6_addr)) { + ifa = inet_addr_ifa_get(AF_INET6, dev, (union inet_addr*)&saddr->sin6_addr); + if (!ifa) + return EDPVS_NOTEXIST; + + if (!ifa->this_sa_pool) { + RTE_LOG(WARNING, SAPOOL, "%s: fetch addr on IP without pool.", __func__); + inet_addr_ifa_put(ifa); + return EDPVS_INVAL; + } + + err = sa_pool_fetch(sa_pool_hash(ifa->this_sa_pool, + (struct sockaddr_storage *)daddr), + (struct sockaddr_storage *)saddr); + if (err == EDPVS_OK) + rte_atomic32_inc(&ifa->this_sa_pool->refcnt); + inet_addr_ifa_put(ifa); + return err; + } + + /* try to find source ifa by @dev and @daddr */ + memset(&fl6, 0, sizeof(struct flow6)); + fl6.fl6_oif = dev; + if (daddr) + fl6.fl6_daddr= daddr->sin6_addr; + if (saddr) + fl6.fl6_saddr= saddr->sin6_addr; + rt6 = route6_output(NULL, &fl6); + if (!rt6) + return EDPVS_NOROUTE;; + + /* select source address. 
*/ + if (ipv6_addr_any(&rt6->rt6_src.addr)) { + inet_addr_select(AF_INET6, rt6->rt6_dev, + (union inet_addr *)&rt6->rt6_dst.addr, + RT_SCOPE_UNIVERSE, + (union inet_addr *)&rt6->rt6_src.addr); + } + ifa = inet_addr_ifa_get(AF_INET6, rt6->rt6_dev, + (union inet_addr *)&rt6->rt6_src.addr); + if (!ifa) { + route6_put(rt6); + return EDPVS_NOTEXIST; + } + route6_put(rt6); + + if (!ifa->this_sa_pool) { + RTE_LOG(WARNING, SAPOOL, "%s: fetch addr on IP without pool.", + __func__); + inet_addr_ifa_put(ifa); + return EDPVS_INVAL; + } + + /* do fetch socket address */ + err = sa_pool_fetch(sa_pool_hash(ifa->this_sa_pool, + (struct sockaddr_storage *)daddr), + (struct sockaddr_storage *)saddr); if (err == EDPVS_OK) rte_atomic32_inc(&ifa->this_sa_pool->refcnt); @@ -568,8 +706,28 @@ int sa_fetch(struct netif_port *dev, const struct sockaddr_in *daddr, return err; } -int sa_release(const struct netif_port *dev, const struct sockaddr_in *daddr, - const struct sockaddr_in *saddr) +int sa_fetch(int af, struct netif_port *dev, + const struct sockaddr_storage *daddr, + struct sockaddr_storage *saddr) +{ + if (unlikely(daddr && daddr->ss_family != af)) + return EDPVS_INVAL; + if (unlikely(saddr && saddr->ss_family != af)) + return EDPVS_INVAL; + if (AF_INET == af) + return sa4_fetch(dev, (const struct sockaddr_in *)daddr, + (struct sockaddr_in *)saddr); + else if (AF_INET6 == af) + return sa6_fetch(dev, (const struct sockaddr_in6 *)daddr, + (struct sockaddr_in6 *)saddr); + else + return EDPVS_NOTSUPP; +} + +/* call me with `saddr` must not NULL */ +int sa_release(const struct netif_port *dev, + const struct sockaddr_storage *daddr, + const struct sockaddr_storage *saddr) { struct inet_ifaddr *ifa; int err; @@ -577,8 +735,21 @@ int sa_release(const struct netif_port *dev, const struct sockaddr_in *daddr, if (!saddr) return EDPVS_INVAL; - ifa = inet_addr_ifa_get(AF_INET, dev, - (union inet_addr*)&saddr->sin_addr); + if (daddr && saddr->ss_family != daddr->ss_family) + return EDPVS_INVAL; 
+ + if (AF_INET == saddr->ss_family) { + const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr; + ifa = inet_addr_ifa_get(AF_INET, dev, + (union inet_addr*)&saddr4->sin_addr); + } else if (AF_INET6 == saddr->ss_family) { + const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr; + ifa = inet_addr_ifa_get(AF_INET6, dev, + (union inet_addr*)&saddr6->sin6_addr); + } else { + return EDPVS_NOTSUPP; + } + if (!ifa) return EDPVS_NOTEXIST; diff --git a/tools/dpip/Makefile b/tools/dpip/Makefile index b36369e3c..469c8f46b 100644 --- a/tools/dpip/Makefile +++ b/tools/dpip/Makefile @@ -33,7 +33,7 @@ DEFS = -D DPVS_MAX_LCORE=64 CFLAGS += $(DEFS) OBJS = dpip.o utils.o route.o addr.o neigh.o link.o vlan.o \ - qsch.o cls.o tunnel.o ../../src/common.o \ + qsch.o cls.o tunnel.o ipv6.o ../../src/common.o \ ../keepalived/keepalived/libipvs-2.6/sockopt.o all: $(TARGET) diff --git a/tools/dpip/addr.c b/tools/dpip/addr.c index 4ed15d45c..361bc966d 100644 --- a/tools/dpip/addr.c +++ b/tools/dpip/addr.c @@ -29,8 +29,11 @@ static void addr_help(void) fprintf(stderr, "Usage:\n" " dpip addr show [ dev STRING ]\n" + " dpip -6 addr show [ dev STRING ]\n" " dpip addr { add | set } IFADDR dev STRING [LIFETIME] [ SCOPE ] [FLAGS]\n" + " dpip -6 addr { add | set } IFADDR dev STRING [LIFETIME] [ SCOPE ] [FLAGS]\n" " dpip addr del IFADDR dev STRING\n" + " dpip -6 addr del IFADDR dev STRING\n" " dpip addr flush dev STRING\n" " dpip addr help\n" "Parameters:\n" diff --git a/tools/dpip/dpip.c b/tools/dpip/dpip.c index cfa71aba5..537f97930 100644 --- a/tools/dpip/dpip.c +++ b/tools/dpip/dpip.c @@ -34,7 +34,7 @@ static void usage(void) " "DPIP_NAME" [OPTIONS] OBJECT { COMMAND | help }\n" "Parameters:\n" " OBJECT := { link | addr | route | neigh | vlan | tunnel |\n" - " qsch | cls }\n" + " qsch | cls | ipv6 }\n" " COMMAND := { add | del | change | replace | show | flush }\n" "Options:\n" " -v, --verbose\n" diff --git a/tools/dpip/ipv6.c b/tools/dpip/ipv6.c new file mode 100644 
index 000000000..c63a8e05a --- /dev/null +++ b/tools/dpip/ipv6.c @@ -0,0 +1,154 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2017 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +/** + * Tool for IPv6 protocol. + * + * Lei Chen , initial, Jul 2018. + */ +#include +#include +#include +#include +#include "common.h" +#include "inet.h" +#include "dpip.h" +#include "sockopt.h" +#include "conf/ipv6.h" + +enum { + IPV6_STATS_CPU_ALL = 0xFFFFFFFF, + IPV6_STATS_CPU_TOTAL = 0xFFFFFFFE, +}; + +struct ipv6_conf { + int stats_cpu; +}; + +static struct ipv6_conf ipv6_conf; + +static void ipv6_help(void) +{ + fprintf(stderr, "Usage:\n"); + fprintf(stderr, " dpip ipv6 show [ cpu CPU | all | total ]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Example:\n"); + fprintf(stderr, " dpip ipv6 show\n"); + fprintf(stderr, " dpip ipv6 show total\n"); + fprintf(stderr, " dpip ipv6 show all\n"); + fprintf(stderr, " dpip ipv6 show cpu 6\n"); +} + +static int ipv6_parse(struct dpip_obj *obj, struct dpip_conf *cf) +{ + struct ipv6_conf *conf = obj->param; + + memset(conf, 0, sizeof(*conf)); + conf->stats_cpu = IPV6_STATS_CPU_TOTAL; + + while (cf->argc > 0) { + if (strcmp(CURRARG(cf), "cpu") == 0) { + NEXTARG_CHECK(cf, CURRARG(cf)); + + conf->stats_cpu = atoi(CURRARG(cf)); + if (conf->stats_cpu < 0 || conf->stats_cpu >= DPVS_MAX_LCORE) { + fprintf(stderr, "bad cpu id `%s'\n", CURRARG(cf)); + return EDPVS_INVAL; + } + } else if 
(strcmp(CURRARG(cf), "all") == 0) { + conf->stats_cpu = IPV6_STATS_CPU_ALL; + } else if (strcmp(CURRARG(cf), "total") == 0) { + conf->stats_cpu = IPV6_STATS_CPU_TOTAL; + } else { + fprintf(stderr, "unknow argument `%s'\n", CURRARG(cf)); + return EDPVS_INVAL; + } + + NEXTARG(cf); + } + + return EDPVS_OK; +} + +static int ipv6_do_cmd(struct dpip_obj *obj, dpip_cmd_t cmd, + struct dpip_conf *conf) +{ + struct ip6_stats_param *stats; + struct ipv6_conf *cf = obj->param; + char cpu[16]; + size_t size; + int err, i; + + if (cmd != DPIP_CMD_SHOW) + return EDPVS_NOTSUPP; + + err = dpvs_getsockopt(SOCKOPT_IP6_STATS, NULL, 0, (void **)&stats, &size); + if (err != EDPVS_OK) + return EDPVS_INVAL; + + if (size != sizeof(*stats)) { + fprintf(stderr, "corrupted response.\n"); + dpvs_sockopt_msg_free(stats); + return EDPVS_INVAL; + } + + switch (cf->stats_cpu) { + case IPV6_STATS_CPU_TOTAL: + inet_stats_dump(NULL, NULL, &stats->stats); + break; + case IPV6_STATS_CPU_ALL: + inet_stats_dump("All", " ", &stats->stats); + + for (i = 0; i < NELEMS(stats->stats_cpus); i++) { + snprintf(cpu, sizeof(cpu), "cpu %d", i); + inet_stats_dump(cpu, " ", &stats->stats_cpus[i]); + } + break; + default: + if (cf->stats_cpu < 0 || + cf->stats_cpu >= NELEMS(stats->stats_cpus)) { + fprintf(stderr, "bad cpu id %d.\n", cf->stats_cpu); + break; + } + + snprintf(cpu, sizeof(cpu), "cpu %d", cf->stats_cpu); + inet_stats_dump(cpu, " ", &stats->stats_cpus[cf->stats_cpu]); + break; + } + + dpvs_sockopt_msg_free(stats); + + return EDPVS_OK; +} + +struct dpip_obj dpip_ipv6 = { + .name = "ipv6", + .param = &ipv6_conf, + .help = ipv6_help, + .parse = ipv6_parse, + .do_cmd = ipv6_do_cmd, +}; + +static void __init ipv6_init(void) +{ + dpip_register_obj(&dpip_ipv6); +} + +static void __exit ipv6_exit(void) +{ + dpip_unregister_obj(&dpip_ipv6); +} diff --git a/tools/dpip/link.c b/tools/dpip/link.c index 7af7cf59d..aa2593d0c 100644 --- a/tools/dpip/link.c +++ b/tools/dpip/link.c @@ -328,12 +328,12 @@ static int 
dump_nic_verbose(char *name, int namelen) ext_get->dev_info.max_vfs); printf(" %-16s%-16s%-16s%-16s\n", "max_vmdq_pools", "rx_ol_capa", "tx_ol_capa", "reta_size"); - printf(" %-16u%-16u%-16u%-16u\n", ext_get->dev_info.max_vmdq_pools, + printf(" %-16u0x%-14X0x%-14X%-16u\n", ext_get->dev_info.max_vmdq_pools, ext_get->dev_info.rx_offload_capa, ext_get->dev_info.tx_offload_capa, ext_get->dev_info.reta_size); printf(" %-16s%-16s%-16s%-16s\n", "hash_key_size", "flowtype_rss_ol", "vmdq_que_base", "vmdq_que_num"); - printf(" %-16u%-16lu%-16u%-16u\n", ext_get->dev_info.hash_key_size, + printf(" %-16u0x%-14lX%-16u%-16u\n", ext_get->dev_info.hash_key_size, ext_get->dev_info.flow_type_rss_offloads, ext_get->dev_info.vmdq_queue_base, ext_get->dev_info.vmdq_queue_num); printf(" %-16s%-16s%-16s%-16s\n", "rx_desc_max", "rx_desc_min", diff --git a/tools/dpip/neigh.c b/tools/dpip/neigh.c index 94176c033..5532fb588 100644 --- a/tools/dpip/neigh.c +++ b/tools/dpip/neigh.c @@ -99,7 +99,7 @@ static void neigh_dump(struct dp_vs_neigh_conf *neigh) char ipaddr[64]; if (neigh->state >= DPVS_NUD_S_REACHABLE) - printf("ip: %s mac: %02x:%02x:%02x:%02x:%02x:%02x mbuf: %d %s\n", + printf("ip: %-48s mac: %02x:%02x:%02x:%02x:%02x:%02x state: %-12s dev: %s core: %d %s\n", inet_ntop(neigh->af, &neigh->ip_addr, ipaddr, sizeof(ipaddr)) ? ipaddr : "::", neigh->eth_addr.ether_addr_octet[0], neigh->eth_addr.ether_addr_octet[1], @@ -107,11 +107,13 @@ static void neigh_dump(struct dp_vs_neigh_conf *neigh) neigh->eth_addr.ether_addr_octet[3], neigh->eth_addr.ether_addr_octet[4], neigh->eth_addr.ether_addr_octet[5], - neigh->que_num, (neigh->flag & NEIGHBOUR_STATIC) ? "static" : ""); + nud_state_names[neigh->state], neigh->ifname, neigh->cid, + (neigh->flag & NEIGHBOUR_STATIC) ? "static" : ""); else - printf("ip: %s mac:incomplate mbuf: %d %s\n", + printf("ip: %-48s mac:incomplate state: %-12s dev: %s core: %d %s\n", inet_ntop(neigh->af, &neigh->ip_addr, ipaddr, sizeof(ipaddr)) ? 
ipaddr : "::", - neigh->que_num, (neigh->flag & NEIGHBOUR_STATIC) ? "static" : ""); + nud_state_names[neigh->state], neigh->ifname, neigh->cid, + (neigh->flag & NEIGHBOUR_STATIC) ? "static" : ""); return; } @@ -142,12 +144,12 @@ static int neigh_do_cmd(struct dpip_obj *obj, dpip_cmd_t cmd, return err; if (size < sizeof(*array) || size != sizeof(*array) + \ - array->n_neigh * sizeof(struct dp_vs_neigh_conf)) { + array->neigh_nums * sizeof(struct dp_vs_neigh_conf)) { fprintf(stderr, "corrupted response.\n"); dpvs_sockopt_msg_free(array); return EDPVS_INVAL; } - for (i = 0; i < array->n_neigh; i++) + for (i = 0; i < array->neigh_nums; i++) neigh_dump(&array->addrs[i]); dpvs_sockopt_msg_free(array); return EDPVS_OK; diff --git a/tools/dpip/route.c b/tools/dpip/route.c index f8dc17ff1..8a8bd3ffd 100644 --- a/tools/dpip/route.c +++ b/tools/dpip/route.c @@ -22,6 +22,8 @@ #include "common.h" #include "dpip.h" #include "conf/route.h" +#include "conf/route6.h" +#include "linux_ipv6.h" #include "sockopt.h" static void route_help(void) @@ -43,6 +45,8 @@ static void route_help(void) " dpip route add default via 10.0.0.1\n" " dpip route add 172.0.0.0/16 via 172.0.0.3 dev dpdk0\n" " dpip route add 192.168.0.0/24 dev dpdk0\n" + " dpip -6 route add ffe1::/128 dev dpdk0" + " dpip -6 route add 2001:db8:1::/64 via 2001:db8:1::1 dev dpdk0\n" " dpip route del 172.0.0.0/16\n" " dpip route set 172.0.0.0/16 via 172.0.0.1\n" " dpip route flush\n" @@ -111,7 +115,7 @@ static const char *flags_itoa(uint32_t flags) return flags_buf; } -static void route_dump(const struct dp_vs_route_conf *route) +static void route4_dump(const struct dp_vs_route_conf *route) { char dst[64], via[64], src[64]; @@ -128,7 +132,47 @@ static void route_dump(const struct dp_vs_route_conf *route) return; } -static int route_parse_args(struct dpip_conf *conf, +static void route6_dump(const struct dp_vs_route6_conf *rt6_cfg) +{ + char dst[64], gateway[64], src[64], scope[32]; + + if (rt6_cfg->flags & RTF_KNI) + 
snprintf(scope, sizeof(scope), "%s", "kni_host"); + else if (rt6_cfg->flags & RTF_LOCALIN) + snprintf(scope, sizeof(scope), "%s", "host"); + else if (rt6_cfg->flags & RTF_FORWARD) { + if (ipv6_addr_any(&rt6_cfg->gateway)) + snprintf(scope, sizeof(scope), "%s", "link"); + else + snprintf(scope, sizeof(scope), "%s", "global"); + } else + snprintf(scope, sizeof(scope), "%s", "::"); + + if (ipv6_addr_any(&rt6_cfg->dst.addr) && rt6_cfg->dst.plen == 0) { + snprintf(dst, sizeof(dst), "%s", "default"); + printf("%s %s", af_itoa(AF_INET6), dst); + } else { + inet_ntop(AF_INET6, (union inet_addr*)&rt6_cfg->dst.addr, dst, sizeof(dst)); + printf("%s %s/%d", af_itoa(AF_INET6), dst, rt6_cfg->dst.plen); + } + + if (!ipv6_addr_any(&rt6_cfg->gateway)) + printf(" via %s", inet_ntop(AF_INET6, (union inet_addr*)&rt6_cfg->gateway, + gateway, sizeof(gateway)) ? gateway : "::"); + if (!ipv6_addr_any(&rt6_cfg->src.addr)) + printf(" src %s", inet_ntop(AF_INET6, (union inet_addr*)&rt6_cfg->src.addr, + src, sizeof(src)) ? 
src : "::"); + printf(" dev %s", rt6_cfg->ifname); + + if (rt6_cfg->mtu > 0) + printf(" mtu %d", rt6_cfg->mtu); + + printf(" scope %s", scope); + + printf("\n"); +} + +static int route4_parse_args(struct dpip_conf *conf, struct dp_vs_route_conf *route) { char *prefix = NULL; @@ -258,12 +302,108 @@ static int route_parse_args(struct dpip_conf *conf, } if (conf->verbose) - route_dump(route); + route4_dump(route); return 0; } -static int route_do_cmd(struct dpip_obj *obj, dpip_cmd_t cmd, +static int route6_parse_args(struct dpip_conf *conf, + struct dp_vs_route6_conf *rt6_cfg) +{ + int af; + char *prefix = NULL; + + memset(rt6_cfg, 0, sizeof(*rt6_cfg)); + + while (conf->argc > 0) { + if (strcmp(conf->argv[0], "via") == 0) { + NEXTARG_CHECK(conf, "via"); + if (inet_pton_try(&af, conf->argv[0], + (union inet_addr *)&rt6_cfg->gateway) <= 0) + return -1; + } else if (strcmp(conf->argv[0], "dev") == 0) { + NEXTARG_CHECK(conf, "dev"); + snprintf(rt6_cfg->ifname, sizeof(rt6_cfg->ifname), "%s", conf->argv[0]); + } else if (strcmp(conf->argv[0], "tos") == 0) { + NEXTARG_CHECK(conf, "tos"); + } else if (strcmp(conf->argv[0], "mtu") == 0) { + NEXTARG_CHECK(conf, "mtu"); + rt6_cfg->mtu = atoi(conf->argv[0]); + } else if (strcmp(conf->argv[0], "scope") == 0) { + NEXTARG_CHECK(conf, "scope"); + if (strcmp(conf->argv[0], "host") == 0) + rt6_cfg->flags |= RTF_LOCALIN; + else if (strcmp(conf->argv[0], "kni_host") == 0) + rt6_cfg->flags |= RTF_KNI; + else if (strcmp(conf->argv[0], "link") == 0) + rt6_cfg->flags |= RTF_FORWARD; + else if (strcmp(conf->argv[0], "global") == 0) + rt6_cfg->flags |= RTF_FORWARD; + } else if (strcmp(conf->argv[0], "src") == 0) { + NEXTARG_CHECK(conf, "src"); + if (inet_pton_try(&af, conf->argv[0], + (union inet_addr *)&rt6_cfg->src.addr) <= 0) + return -1; + } else if (strcmp(conf->argv[0], "metric") == 0) { + NEXTARG_CHECK(conf, "metric"); + } else if (strcmp(conf->argv[0], "proto") == 0) { + NEXTARG_CHECK(conf, "proto"); + } else if (strcmp(conf->argv[0], 
"onlink") == 0) { + ;/* on-link is output only */ + } else if (strcmp(conf->argv[0], "local") == 0) { + rt6_cfg->flags |= RTF_LOCALIN; + } else { + prefix = conf->argv[0]; + } + + NEXTARG(conf); + } + + if ((rt6_cfg->flags & RTF_FORWARD) && (ipv6_addr_any(&rt6_cfg->dst.addr) == 0)) + rt6_cfg->flags |= RTF_DEFAULT; + if (!(rt6_cfg->flags & (RTF_LOCALIN|RTF_KNI|RTF_FORWARD|RTF_DEFAULT))) + rt6_cfg->flags |= RTF_FORWARD; + + if (conf->argc > 0) { + fprintf(stderr, "too many arguments\n"); + return -1; + } + + if (conf->cmd == DPIP_CMD_SHOW) + return 0; + + if (!prefix) { + fprintf(stderr, "missing prefix\n"); + return -1; + } + + /* PREFIX */ + if (strcmp(prefix, "default") == 0) { + memset(&rt6_cfg->dst.addr, 0, sizeof(rt6_cfg->dst.addr)); + } else { + char *addr, *plen; + + addr = prefix; + if ((plen = strchr(addr, '/')) != NULL) + *plen++ = '\0'; + + if (inet_pton_try(&af, prefix, + (union inet_addr*)&rt6_cfg->dst.addr) <= 0) + return -1; + + rt6_cfg->dst.plen = plen ? atoi(plen) : 0; + } + + if (!rt6_cfg->dst.plen && (strcmp(prefix, "default") != 0)) + rt6_cfg->dst.plen = 128; + + if (conf->verbose) + route6_dump(rt6_cfg); + + return 0; +} + +static int route4_do_cmd(struct dpip_obj *obj, dpip_cmd_t cmd, struct dpip_conf *conf) { struct dp_vs_route_conf route; @@ -271,7 +411,7 @@ static int route_do_cmd(struct dpip_obj *obj, dpip_cmd_t cmd, size_t size, i; int err; - if (route_parse_args(conf, &route) != 0) + if (route4_parse_args(conf, &route) != 0) return EDPVS_INVAL; switch (conf->cmd) { @@ -302,7 +442,7 @@ static int route_do_cmd(struct dpip_obj *obj, dpip_cmd_t cmd, } for (i = 0; i < array->nroute; i++) - route_dump(&array->routes[i]); + route4_dump(&array->routes[i]); dpvs_sockopt_msg_free(array); return EDPVS_OK; @@ -311,6 +451,68 @@ static int route_do_cmd(struct dpip_obj *obj, dpip_cmd_t cmd, } } +static int route6_do_cmd(struct dpip_obj *obj, dpip_cmd_t cmd, + struct dpip_conf *conf) +{ + struct dp_vs_route6_conf rt6_cfg; + struct dp_vs_route6_conf_array 
*rt6_arr; + size_t size, i; + int err; + + if (route6_parse_args(conf, &rt6_cfg) != 0) + return EDPVS_INVAL; + + switch (conf->cmd) { + case DPIP_CMD_ADD: + rt6_cfg.ops = RT6_OPS_ADD; + return dpvs_setsockopt(SOCKOPT_SET_ROUTE6_ADD_DEL, &rt6_cfg, sizeof(rt6_cfg)); + case DPIP_CMD_DEL: + rt6_cfg.ops = RT6_OPS_DEL; + return dpvs_setsockopt(SOCKOPT_SET_ROUTE6_ADD_DEL, &rt6_cfg, sizeof(rt6_cfg)); + case DPIP_CMD_SET: + return EDPVS_NOTSUPP; + case DPIP_CMD_FLUSH: + rt6_cfg.ops = RT6_OPS_FLUSH; + return dpvs_setsockopt(SOCKOPT_SET_ROUTE6_FLUSH, &rt6_cfg, sizeof(rt6_cfg)); + case DPIP_CMD_SHOW: + err = dpvs_getsockopt(SOCKOPT_GET_ROUTE6_SHOW, &rt6_cfg, sizeof(rt6_cfg), + (void **)&rt6_arr, &size); + if (err != 0) + return err; + if (size < sizeof(*rt6_arr) || + size != sizeof(*rt6_arr) + + rt6_arr->nroute * sizeof(struct dp_vs_route6_conf)) { + fprintf(stderr, "corrupted response.\n"); + dpvs_sockopt_msg_free(rt6_arr); + return EDPVS_INVAL; + } + for (i = 0; i < rt6_arr->nroute; i++) + route6_dump(&rt6_arr->routes[i]); + + dpvs_sockopt_msg_free(rt6_arr); + return EDPVS_OK; + + default: + return EDPVS_NOTSUPP; + } + + return EDPVS_OK; +} + +static int route_do_cmd(struct dpip_obj *obj, dpip_cmd_t cmd, + struct dpip_conf *conf) +{ + switch (conf->af) { + case AF_UNSPEC: + case AF_INET: + return route4_do_cmd(obj, cmd, conf); + case AF_INET6: + return route6_do_cmd(obj, cmd, conf); + default: + return EDPVS_NOTSUPP; + } +} + struct dpip_obj dpip_route = { .name = "route", .help = route_help, diff --git a/tools/ipvsadm/ipvsadm.c b/tools/ipvsadm/ipvsadm.c index 30fc975c2..cbff2d55c 100644 --- a/tools/ipvsadm/ipvsadm.c +++ b/tools/ipvsadm/ipvsadm.c @@ -303,7 +303,7 @@ static int parse_netmask(char *buf, u_int32_t *addr); static int parse_timeout(char *buf, int min, int max); static unsigned int parse_fwmark(char *buf); static int parse_sockpair(char *buf, ipvs_sockpair_t *sockpair); -static int parse_match(const char *buf, ipvs_service_t *svc); +static int 
parse_match_snat(const char *buf, ipvs_service_t *svc); /* check the options based on the commands_v_options table */ static void generic_opt_check(int command, int options); @@ -404,6 +404,8 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce, NULL, NULL }, { "icmp-service", 'q', POPT_ARG_STRING, &optarg, 'q', NULL, NULL }, + { "icmpv6-service", '1', POPT_ARG_STRING, &optarg, 'q', + NULL, NULL }, { "fwmark-service", 'f', POPT_ARG_STRING, &optarg, 'f', NULL, NULL }, { "scheduler", 's', POPT_ARG_STRING, &optarg, 's', NULL, NULL }, @@ -549,30 +551,32 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce, case 'u': case 'q': set_option(options, OPT_SERVICE); - if (c == 't') - ce->svc.protocol = IPPROTO_TCP; - else if (c == 'u') - ce->svc.protocol = IPPROTO_UDP; - else - ce->svc.protocol = IPPROTO_ICMP; + if (c == 't') + ce->svc.protocol = IPPROTO_TCP; + else if (c == 'u') + ce->svc.protocol = IPPROTO_UDP; + else if (c == 'q') + ce->svc.protocol = IPPROTO_ICMP; + else if (c == '1') // a~Z is out. 
ipvsadm is really not friendly here + ce->svc.protocol = IPPROTO_ICMPV6; parse = parse_service(optarg, &ce->svc); if (!(parse & SERVICE_ADDR)) fail(2, "illegal virtual server " "address[:port] specified"); break; - case 'H': + case 'H': set_option(options, OPT_SERVICE); - if (parse_match(optarg, &ce->svc) != 0) + if (parse_match_snat(optarg, &ce->svc) != 0) fail(2, "illegal match specified"); - break; + break; case 'f': set_option(options, OPT_SERVICE); /* * Set protocol to a sane values, even * though it is not used */ - ce->svc.af = AF_INET; + ce->svc.af = AF_INET;/*FIXME:DPVS not support fwmark?*/ ce->svc.protocol = IPPROTO_TCP; ce->svc.fwmark = parse_fwmark(optarg); break; @@ -1224,9 +1228,9 @@ parse_sockpair(char *buf, ipvs_sockpair_t *sockpair) sockpair->af = af; sockpair->proto = proto; - sockpair->sip = sip.s_addr; + memcpy(&sockpair->sip, &sip, sizeof(sockpair->sip)); sockpair->sport = ntohs(sport); - sockpair->tip = tip.s_addr; + memcpy(&sockpair->tip, &tip, sizeof(sockpair->tip)); sockpair->tport = ntohs(tport); return 1; @@ -1234,7 +1238,7 @@ parse_sockpair(char *buf, ipvs_sockpair_t *sockpair) /* * comma separated parameters list, all fields is used to match packets. 
* - * proto := tcp | udp | icmp + * proto := tcp | udp | icmp |icmpv6 * src-range := RANGE * dst-range := RANGE * iif := IFNAME @@ -1245,7 +1249,7 @@ parse_sockpair(char *buf, ipvs_sockpair_t *sockpair) * * proto=tcp,src-range=192.168.0.1-10:80-100,dst-range=10.0.0.1:1024,iif=eth0 */ -static int parse_match(const char *buf, ipvs_service_t *svc) +static int parse_match_snat(const char *buf, ipvs_service_t *svc) { char params[256]; char *arg, *start, *sp, key[32], val[128]; @@ -1271,6 +1275,8 @@ static int parse_match(const char *buf, ipvs_service_t *svc) svc->protocol = IPPROTO_UDP; else if (strcmp(val, "icmp") == 0) svc->protocol = IPPROTO_ICMP; + else if (strcmp(val, "icmpv6") == 0) + svc->protocol = IPPROTO_ICMPV6; else return -1; } else if (strcmp(key, "src-range") == 0) { @@ -1421,6 +1427,7 @@ static void usage_exit(const char *program, const int exit_status) " --tcp-service -t service-address service-address is host[:port]\n" " --udp-service -u service-address service-address is host[:port]\n" " --icmp-service -q service-address service-address is host[:port]\n" + " --icmpv6-service -1 service-address service-address is host[:port]\n" " --fwmark-service -f fwmark fwmark is an integer greater than zero\n" " --ipv6 -6 fwmark entry uses IPv6\n" " --scheduler -s scheduler one of " SCHEDULERS ",\n" @@ -1504,7 +1511,6 @@ static void print_conn_entry(const ipvs_conn_entry_t *conn_entry, { char *cname, *vname, *lname, *dname; char proto_str[8], time_str[8]; - union inet_addr addr; if (conn_entry->proto == IPPROTO_TCP) snprintf(proto_str, sizeof(proto_str), "%s", "tcp"); @@ -1512,26 +1518,24 @@ static void print_conn_entry(const ipvs_conn_entry_t *conn_entry, snprintf(proto_str, sizeof(proto_str), "%s", "udp"); else if (conn_entry->proto == IPPROTO_ICMP) snprintf(proto_str, sizeof(proto_str), "%s", "icmp"); + else if (conn_entry->proto == IPPROTO_ICMPV6) + snprintf(proto_str, sizeof(proto_str), "%s", "icmpv6"); else snprintf(proto_str, sizeof(proto_str), "%s", "--"); 
snprintf(time_str, sizeof(time_str), "%ds", conn_entry->timeout); - addr.in.s_addr = conn_entry->caddr; - if (!(cname = addrport_to_anyname(conn_entry->af, &addr, ntohs(conn_entry->cport), - conn_entry->proto, format))) + if (!(cname = addrport_to_anyname(conn_entry->af, &conn_entry->caddr, + ntohs(conn_entry->cport), conn_entry->proto, format))) goto exit; - addr.in.s_addr = conn_entry->vaddr; - if (!(vname = addrport_to_anyname(conn_entry->af, &addr, ntohs(conn_entry->vport), - conn_entry->proto, format))) + if (!(vname = addrport_to_anyname(conn_entry->af, &conn_entry->vaddr, + ntohs(conn_entry->vport), conn_entry->proto, format))) goto exit; - addr.in.s_addr = conn_entry->laddr; - if (!(lname = addrport_to_anyname(conn_entry->af, &addr, ntohs(conn_entry->lport), - conn_entry->proto, format))) + if (!(lname = addrport_to_anyname(conn_entry->af, &conn_entry->laddr, + ntohs(conn_entry->lport), conn_entry->proto, format))) goto exit; - addr.in.s_addr = conn_entry->daddr; - if (!(dname = addrport_to_anyname(conn_entry->af, &addr, ntohs(conn_entry->dport), - conn_entry->proto, format))) + if (!(dname = addrport_to_anyname(conn_entry->af, &conn_entry->daddr, + ntohs(conn_entry->dport), conn_entry->proto, format))) goto exit; printf("[%d]%-3s %-6s %-11s %-18s %-18s %-18s %s\n", @@ -1735,21 +1739,23 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format) se->protocol, format))) fail(2, "addrport_to_anyname: %s", strerror(errno)); if (format & FMT_RULE) { - if (se->protocol == IPPROTO_TCP) - proto = "-t"; - else if (se->protocol == IPPROTO_UDP) - proto = "-u"; - else - proto = "-q"; + if (se->protocol == IPPROTO_TCP) + proto = "-t"; + else if (se->protocol == IPPROTO_UDP) + proto = "-u"; + else + proto = "-q"; sprintf(svc_name, "%s %s", proto, vname); - } else { - if (se->protocol == IPPROTO_TCP) - proto = "TCP"; - else if (se->protocol == IPPROTO_UDP) - proto = "UDP"; - else - proto = "ICMP"; + } else { + if (se->protocol == IPPROTO_TCP) + proto = "TCP"; + 
else if (se->protocol == IPPROTO_UDP) + proto = "UDP"; + else if (se->protocol == IPPROTO_ICMP) + proto = "ICMP"; + else + proto = "ICMPv6"; sprintf(svc_name, "%s %s", proto, vname); if (se->af != AF_INET6) @@ -1757,44 +1763,48 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format) } free(vname); } else { /* match */ - char *proto; - - if (se->protocol == IPPROTO_TCP) - proto = "tcp"; - else if (se->protocol == IPPROTO_UDP) - proto = "udp"; - else - proto = "icmp"; - - if (format & FMT_RULE) { - snprintf(svc_name, sizeof(svc_name), - "-H proto=%s,src-range=%s,dst-range=%s,iif=%s,oif=%s", - proto, se->srange, se->drange, se->iifname, se->oifname); + char *proto; + + if (se->protocol == IPPROTO_TCP) + proto = "tcp"; + else if (se->protocol == IPPROTO_UDP) + proto = "udp"; + else if (se->protocol == IPPROTO_ICMP) + proto = "icmp"; + else + proto = "icmpv6"; + + if (format & FMT_RULE) { + snprintf(svc_name, sizeof(svc_name), + "-H proto=%s,src-range=%s,dst-range=%s,iif=%s,oif=%s", + proto, se->srange, se->drange, se->iifname, se->oifname); - } else { - int left = sizeof(svc_name); - svc_name[0] = '\0'; + } else { + int left = sizeof(svc_name); + svc_name[0] = '\0'; - left -= snprintf(svc_name + strlen(svc_name), left, - "MATCH %s", proto); + left -= snprintf(svc_name + strlen(svc_name), left, + "MATCH %s", proto); - if (strcmp(se->srange, "0.0.0.0-0.0.0.0:0-0") != 0) - left -= snprintf(svc_name + strlen(svc_name), left, - ",from=%s", se->srange); - - if (strcmp(se->drange, "0.0.0.0-0.0.0.0:0-0") != 0) - left -= snprintf(svc_name + strlen(svc_name), left, - ",to=%s", se->drange); - - if (strlen(se->iifname)) - left -= snprintf(svc_name + strlen(svc_name), left, - ",iif=%s", se->iifname); - - if (strlen(se->oifname)) - left -= snprintf(svc_name + strlen(svc_name), left, - ",oif=%s", se->oifname); - } - } + if (strcmp(se->srange, "[::-::]:0-0") != 0 && + strcmp(se->srange, "0.0.0.0-0.0.0.0:0-0") != 0) + left -= snprintf(svc_name + strlen(svc_name), left, + 
",from=%s", se->srange); + + if (strcmp(se->drange, "[::-::]:0-0") != 0 && + strcmp(se->drange, "0.0.0.0-0.0.0.0:0-0") != 0) + left -= snprintf(svc_name + strlen(svc_name), left, + ",to=%s", se->drange); + + if (strlen(se->iifname)) + left -= snprintf(svc_name + strlen(svc_name), left, + ",iif=%s", se->iifname); + + if (strlen(se->oifname)) + left -= snprintf(svc_name + strlen(svc_name), left, + ",oif=%s", se->oifname); + } + } /* copy svc's stats from dest */ copy_stats_from_dest(se, d); @@ -1949,11 +1959,11 @@ static void list_laddrs_print_laddr(struct ip_vs_laddr_entry * entry) { char pbuf[32]; - sprintf(pbuf , "%u.%u.%u.%u" , PRINT_NIP(entry->addr.ip)); + inet_ntop(entry->af, (char *)&entry->addr, pbuf, sizeof(pbuf)); - printf("%-20s %-8s %-20s %-10lu %-10u\n" , - "" , - "" , + printf("%-20s %-8s %-20s %-10lu %-10u\n", + "", + "", pbuf, entry->port_conflict, entry->conn_counts); @@ -2216,6 +2226,9 @@ int service_to_port(const char *name, unsigned short proto) else if (proto == IPPROTO_ICMP && (service = getservbyname(name, "icmp")) != NULL) return ntohs((unsigned short) service->s_port); + else if (proto == IPPROTO_ICMPV6 + && (service = getservbyname(name, "icmpv6")) != NULL) + return ntohs((unsigned short) service->s_port); else return -1; } @@ -2234,6 +2247,9 @@ static char * port_to_service(unsigned short port, unsigned short proto) else if (proto == IPPROTO_ICMP && (service = getservbyport(htons(port), "icmp")) != NULL) return service->s_name; + else if (proto == IPPROTO_ICMPV6 && + (service = getservbyport(htons(port), "icmpv6")) != NULL) + return service->s_name; else return (char *) NULL; } diff --git a/tools/keepalived/keepalived/check/check_parser.c b/tools/keepalived/keepalived/check/check_parser.c index 6fc44f5bd..15e697223 100644 --- a/tools/keepalived/keepalived/check/check_parser.c +++ b/tools/keepalived/keepalived/check/check_parser.c @@ -160,6 +160,8 @@ proto_handler(vector_t *strvec) vs->service_type = IPPROTO_UDP; else if(!strcmp(str, 
"ICMP")) vs->service_type = IPPROTO_ICMP; + else if (!strcmp(str, "ICMPV6")) + vs->service_type = IPPROTO_ICMPV6; else vs->service_type = IPPROTO_TCP; /*default*/ } diff --git a/tools/keepalived/keepalived/libipvs-2.6/dp_vs.h b/tools/keepalived/keepalived/libipvs-2.6/dp_vs.h index e603835f1..a222ca3f4 100644 --- a/tools/keepalived/keepalived/libipvs-2.6/dp_vs.h +++ b/tools/keepalived/keepalived/libipvs-2.6/dp_vs.h @@ -3,11 +3,14 @@ #include #include "conf/route.h" +#include "conf/route6.h" #include "conf/inetaddr.h" #include "conf/laddr.h" #include "conf/blklst.h" #include "conf/conn.h" #include "ip_tunnel.h" +#include "ipvs/service.h" +#include "ipvs/dest.h" enum{ DPVS_SO_SET_FLUSH = 200, diff --git a/tools/keepalived/keepalived/libipvs-2.6/libipvs.c b/tools/keepalived/keepalived/libipvs-2.6/libipvs.c index 0b8911dbc..fce9a716e 100644 --- a/tools/keepalived/keepalived/libipvs-2.6/libipvs.c +++ b/tools/keepalived/keepalived/libipvs-2.6/libipvs.c @@ -54,6 +54,61 @@ struct ip_vs_getinfo ipvs_info; #define CHECK_COMPAT_LADDR(s, ret) CHECK_IPV4(s, ret) +/* ipv6 support */ +typedef struct dpvs_servicedest_s { + struct dp_vs_service_user svc; + struct dp_vs_dest_user dest; +} dpvs_servicedest_t; + +#define SVC_CONVERT(X, Y) { \ + X->af = Y->af; \ + memcpy(&X->addr, &Y->addr, sizeof(X->addr)); \ + X->port = Y->port; \ + X->fwmark = Y->fwmark; \ + snprintf(X->sched_name, IP_VS_SCHEDNAME_MAXLEN, "%s", Y->sched_name); \ + X->flags = Y->flags; \ + X->timeout = Y->timeout; \ + X->conn_timeout = Y->conn_timeout; \ + X->netmask = Y->netmask; \ + X->bps = Y->bps; \ + X->limit_proportion = Y->limit_proportion; \ + snprintf(X->srange, sizeof(X->srange), "%s", Y->srange); \ + snprintf(X->drange, sizeof(X->drange), "%s", Y->drange); \ + snprintf(X->iifname, sizeof(X->iifname), "%s", Y->iifname); \ + snprintf(X->oifname, sizeof(X->oifname), "%s", Y->oifname);} + +#define IPVS_2_DPVS(X, Y) { \ + SVC_CONVERT(X, Y) \ + X->proto = Y->protocol;} + +#define DPVS_2_IPVS(X, Y) { \ + 
SVC_CONVERT(X, Y) \ + X->num_dests = Y->num_dests; \ + X->num_laddrs = Y->num_laddrs; \ + memcpy(&X->stats, &Y->stats, sizeof(X->stats)); \ + X->protocol = Y->proto;} + +#define DST_CONVERT(X, Y) { \ + X->af = Y->af; \ + memcpy(&X->addr, &Y->addr, sizeof(X->addr)); \ + X->port = Y->port; \ + X->conn_flags = Y->conn_flags; \ + X->weight = Y->weight;} + +#define IPRS_2_DPRS(X, Y) { \ + DST_CONVERT(X, Y) \ + X->max_conn = Y->u_threshold; \ + X->min_conn = Y->l_threshold;} + +#define DPRS_2_IPRS(X, Y) { \ + DST_CONVERT(X, Y) \ + X->u_threshold = Y->max_conn; \ + X->l_threshold = Y->min_conn; \ + X->activeconns = Y->actconns; \ + X->inactconns = Y->inactconns; \ + X->persistconns = Y->persistconns; \ + memcpy(&X->stats, &Y->stats, sizeof(X->stats));} + void ipvs_service_entry_2_user(const ipvs_service_entry_t *entry, ipvs_service_t *user); int ipvs_init(void) @@ -107,21 +162,23 @@ int ipvs_flush(void) int ipvs_add_service(ipvs_service_t *svc) { - ipvs_func = ipvs_add_service; - CHECK_COMPAT_SVC(svc, -1); - return dpvs_setsockopt(DPVS_SO_SET_ADD, (const void *)svc, sizeof(struct ip_vs_service_kern)); -out_err: - return -1; + struct dp_vs_service_user dpvs_svc; + struct dp_vs_service_user *dpvs_svc_ptr = &dpvs_svc; + + IPVS_2_DPVS(dpvs_svc_ptr, svc); + + return dpvs_setsockopt(DPVS_SO_SET_ADD, dpvs_svc_ptr, sizeof(dpvs_svc)); } int ipvs_update_service(ipvs_service_t *svc) { - ipvs_func = ipvs_update_service; - CHECK_COMPAT_SVC(svc, -1); - return dpvs_setsockopt(DPVS_SO_SET_EDIT, (const void *)svc, sizeof(struct ip_vs_service_kern)); -out_err: - return -1; + struct dp_vs_service_user dpvs_svc; + struct dp_vs_service_user *dpvs_svc_ptr = &dpvs_svc; + + IPVS_2_DPVS(dpvs_svc_ptr, svc); + + return dpvs_setsockopt(DPVS_SO_SET_EDIT, dpvs_svc_ptr, sizeof(dpvs_svc)); } int ipvs_update_service_by_options(ipvs_service_t *svc, unsigned int options) @@ -206,86 +263,82 @@ int ipvs_update_service_synproxy(ipvs_service_t *svc , int enable) int ipvs_del_service(ipvs_service_t *svc) { - 
ipvs_func = ipvs_del_service; - CHECK_COMPAT_SVC(svc, -1); - return dpvs_setsockopt(DPVS_SO_SET_DEL, (const void *)svc, sizeof(struct ip_vs_service_kern)); -out_err: - return -1; -} + struct dp_vs_service_user dpvs_svc; + struct dp_vs_service_user *dpvs_svc_ptr = &dpvs_svc; + + IPVS_2_DPVS(dpvs_svc_ptr, svc); + return dpvs_setsockopt(DPVS_SO_SET_DEL, dpvs_svc_ptr, sizeof(dpvs_svc)); +} int ipvs_zero_service(ipvs_service_t *svc) { - ipvs_func = ipvs_zero_service; - CHECK_COMPAT_SVC(svc, -1); - return dpvs_setsockopt(DPVS_SO_SET_ZERO, (const void *)svc, sizeof(struct ip_vs_service_kern)); -out_err: - return -1; -} + struct dp_vs_service_user dpvs_svc; + struct dp_vs_service_user *dpvs_svc_ptr = &dpvs_svc; + + IPVS_2_DPVS(dpvs_svc_ptr, svc); + return dpvs_setsockopt(DPVS_SO_SET_ZERO, dpvs_svc_ptr, sizeof(dpvs_svc)); +} int ipvs_add_dest(ipvs_service_t *svc, ipvs_dest_t *dest) { - ipvs_servicedest_t svcdest; - ipvs_func = ipvs_add_dest; + dpvs_servicedest_t svcdest; + struct dp_vs_service_user *dpvs_svc_ptr = &svcdest.svc; + struct dp_vs_dest_user *dpvs_dest_ptr = &svcdest.dest; + + IPVS_2_DPVS(dpvs_svc_ptr, svc); + IPRS_2_DPRS(dpvs_dest_ptr, dest); - CHECK_COMPAT_SVC(svc, -1); - CHECK_COMPAT_DEST(dest, -1); - memcpy(&svcdest.svc, svc, sizeof(svcdest.svc)); - memcpy(&svcdest.dest, dest, sizeof(svcdest.dest)); return dpvs_setsockopt(DPVS_SO_SET_ADDDEST, &svcdest, sizeof(svcdest)); -out_err: - return -1; } - int ipvs_update_dest(ipvs_service_t *svc, ipvs_dest_t *dest) { - ipvs_servicedest_t svcdest; + dpvs_servicedest_t svcdest; + struct dp_vs_service_user *dpvs_svc_ptr = &svcdest.svc; + struct dp_vs_dest_user *dpvs_dest_ptr = &svcdest.dest; + + IPVS_2_DPVS(dpvs_svc_ptr, svc); + IPRS_2_DPRS(dpvs_dest_ptr, dest); - ipvs_func = ipvs_update_dest; - CHECK_COMPAT_SVC(svc, -1); - CHECK_COMPAT_DEST(dest, -1); - memcpy(&svcdest.svc, svc, sizeof(svcdest.svc)); - memcpy(&svcdest.dest, dest, sizeof(svcdest.dest)); return dpvs_setsockopt(DPVS_SO_SET_EDITDEST, &svcdest, 
sizeof(svcdest)); -out_err: - return -1; } - int ipvs_del_dest(ipvs_service_t *svc, ipvs_dest_t *dest) { - ipvs_servicedest_t svcdest; + dpvs_servicedest_t svcdest; + struct dp_vs_service_user *dpvs_svc_ptr = &svcdest.svc; + struct dp_vs_dest_user *dpvs_dest_ptr = &svcdest.dest; - ipvs_func = ipvs_del_dest; + IPVS_2_DPVS(dpvs_svc_ptr, svc); + IPRS_2_DPRS(dpvs_dest_ptr, dest); - CHECK_COMPAT_SVC(svc, -1); - CHECK_COMPAT_DEST(dest, -1); - memcpy(&svcdest.svc, svc, sizeof(svcdest.svc)); - memcpy(&svcdest.dest, dest, sizeof(svcdest.dest)); return dpvs_setsockopt(DPVS_SO_SET_DELDEST, &svcdest, sizeof(svcdest)); -out_err: - return -1; } static void ipvs_fill_laddr_conf(ipvs_service_t *svc, ipvs_laddr_t *laddr, struct dp_vs_laddr_conf *conf) { memset(conf, 0, sizeof(*conf)); - conf->af = svc->af; + conf->af = laddr->af; conf->proto = svc->protocol; conf->vport = svc->port; conf->fwmark = svc->fwmark; if (strlen(laddr->ifname)) snprintf(conf->ifname, sizeof(conf->ifname), "%s", laddr->ifname); + if (svc->af == AF_INET) { conf->vaddr.in = svc->addr.in; - conf->laddr.in = laddr->addr.in; } else { conf->vaddr.in6 = svc->addr.in6; + } + + if (laddr->af == AF_INET) { + conf->laddr.in = laddr->addr.in; + } else { conf->laddr.in6 = laddr->addr.in6; - } + } return; } @@ -293,16 +346,21 @@ static void ipvs_fill_laddr_conf(ipvs_service_t *svc, ipvs_laddr_t *laddr, static void ipvs_fill_ipaddr_conf(ipvs_laddr_t *laddr, struct inet_addr_param *param) { memset(param, 0, sizeof(*param)); - param->af = AF_INET; + param->af = laddr->af; if (strlen(laddr->ifname)) snprintf(param->ifname, sizeof(param->ifname), "%s", laddr->ifname); - param->addr.in = laddr->addr.in; - param->plen = 32; + if (laddr->af == AF_INET) { + param->addr.in = laddr->addr.in; + param->plen = 32; + } else { + param->plen = 128; + param->addr.in6 = laddr->addr.in6; + } param->flags |= IFA_F_SAPOOL; return; } -int ipvs_add_laddr(ipvs_service_t *svc, ipvs_laddr_t * laddr) +int ipvs_add_laddr(ipvs_service_t *svc, 
ipvs_laddr_t *laddr) { struct dp_vs_laddr_conf conf; struct inet_addr_param param; @@ -314,7 +372,7 @@ int ipvs_add_laddr(ipvs_service_t *svc, ipvs_laddr_t * laddr) return dpvs_setsockopt(SOCKOPT_SET_LADDR_ADD, &conf, sizeof(conf)); } -int ipvs_del_laddr(ipvs_service_t *svc, ipvs_laddr_t * laddr) +int ipvs_del_laddr(ipvs_service_t *svc, ipvs_laddr_t *laddr) { struct dp_vs_laddr_conf conf; struct inet_addr_param param; @@ -423,39 +481,43 @@ int ipvs_stop_daemon(ipvs_daemon_t *dm) struct ip_vs_get_services *ipvs_get_services(void) { struct ip_vs_get_services *get; - struct ip_vs_get_services_kern *getk,*getk_rcv; - size_t len, len_rcv; + struct dp_vs_get_services *dpvs_get, *dpvs_get_rcv; + struct dp_vs_service_entry *dpvs_entry; + struct ip_vs_service_entry *ipvs_entry; + size_t len = 0, len_rcv = 0; int i; - - len = sizeof(*get) + + len = sizeof(struct ip_vs_get_services) + sizeof(ipvs_service_entry_t) * ipvs_info.num_services; if (!(get = calloc(len, 1))) return NULL; - len = sizeof(*getk) + - sizeof(struct ip_vs_service_entry_kern) * ipvs_info.num_services; - if (!(getk = malloc(len))) { + + len = sizeof(struct dp_vs_get_services); + if (!(dpvs_get = calloc(len, 1))) { free(get); return NULL; } - - ipvs_func = ipvs_get_services; - getk->num_services = ipvs_info.num_services; - len_rcv = len; - if (dpvs_getsockopt(DPVS_SO_GET_SERVICES, getk, len, (void **)&getk_rcv, &len_rcv)) { + dpvs_get->num_services = ipvs_info.num_services; + + if (dpvs_getsockopt(DPVS_SO_GET_SERVICES, dpvs_get, len, (void **)&dpvs_get_rcv, &len_rcv)) { free(get); - free(getk); + free(dpvs_get); return NULL; } - memcpy(get, getk_rcv, sizeof(struct ip_vs_get_services)); - for (i = 0; i < getk_rcv->num_services; i++) { - memcpy(&get->entrytable[i], &getk_rcv->entrytable[i], - sizeof(struct ip_vs_service_entry_kern)); - get->entrytable[i].af = AF_INET; - get->entrytable[i].addr.ip = get->entrytable[i].__addr_v4; + + get->num_services = dpvs_get_rcv->num_services; + for (i = 0; i < 
dpvs_get_rcv->num_services; i++) { + ipvs_entry = &get->entrytable[i]; + dpvs_entry = &dpvs_get_rcv->entrytable[i]; + DPVS_2_IPVS(ipvs_entry, dpvs_entry); + if (dpvs_get_rcv->entrytable[i].af == AF_INET) { + get->entrytable[i].__addr_v4 = get->entrytable[i].addr.ip; + get->entrytable[i].pe_name[0] = '\0'; + } } - free(getk); - dpvs_sockopt_msg_free(getk_rcv); + + free(dpvs_get); + dpvs_sockopt_msg_free(dpvs_get_rcv); return get; } @@ -640,54 +702,62 @@ struct dp_vs_blklst_conf_array *ipvs_get_blklsts(void) struct ip_vs_get_dests *ipvs_get_dests(ipvs_service_entry_t *svc) { struct ip_vs_get_dests *d; - struct ip_vs_get_dests_kern *dk, *dk_rcv; - size_t len, len_rcv; + struct dp_vs_get_dests *dpvs_dests, *dpvs_dests_rcv; + struct ip_vs_dest_entry *ipvs_entry; + struct dp_vs_dest_entry *dpvs_entry; + size_t len = 0, len_rcv = 0; int i; - len = sizeof(*d) + sizeof(ipvs_dest_entry_t) * svc->num_dests; - if (!(d = malloc(len))) + len = sizeof(struct ip_vs_get_dests) + + sizeof(ipvs_dest_entry_t) * svc->num_dests; + if (!(d = calloc(len, 1))) return NULL; - ipvs_func = ipvs_get_dests; - - if (svc->af != AF_INET) { - errno = EAFNOSUPPORT; - free(d); - return NULL; - } - - len = sizeof(*dk) + sizeof(struct ip_vs_dest_entry_kern) * svc->num_dests; - if (!(dk = malloc(len))) { + len = sizeof(struct dp_vs_get_dests); + if (!(dpvs_dests = calloc(len, 1))) { free(d); return NULL; } - dk->fwmark = svc->fwmark; - dk->protocol = svc->protocol; - dk->addr = svc->addr.ip; - dk->port = svc->port; - dk->num_dests = svc->num_dests; - snprintf(dk->srange, sizeof(dk->srange), "%s", svc->srange); - snprintf(dk->drange, sizeof(dk->drange), "%s", svc->drange); - snprintf(dk->iifname, sizeof(dk->iifname), "%s", svc->iifname); - snprintf(dk->oifname, sizeof(dk->oifname), "%s", svc->oifname); - - if (dpvs_getsockopt(DPVS_SO_GET_DESTS, dk, len, (void **)&dk_rcv, &len_rcv)) { + dpvs_dests->af = svc->af; + dpvs_dests->fwmark = svc->fwmark; + dpvs_dests->proto = svc->protocol; + 
memcpy(&dpvs_dests->addr, &svc->addr, sizeof(svc->addr)); + dpvs_dests->port = svc->port; + dpvs_dests->num_dests = svc->num_dests; + snprintf(dpvs_dests->srange, sizeof(dpvs_dests->srange), "%s", svc->srange); + snprintf(dpvs_dests->drange, sizeof(dpvs_dests->drange), "%s", svc->drange); + snprintf(dpvs_dests->iifname, sizeof(dpvs_dests->iifname), "%s", svc->iifname); + snprintf(dpvs_dests->oifname, sizeof(dpvs_dests->oifname), "%s", svc->oifname); + + if (dpvs_getsockopt(DPVS_SO_GET_DESTS, dpvs_dests, len, (void **)&dpvs_dests_rcv, &len_rcv)) { free(d); - free(dk); + free(dpvs_dests); return NULL; } - memcpy(d, dk_rcv, sizeof(struct ip_vs_get_dests_kern)); - d->af = AF_INET; - d->addr.ip = d->__addr_v4; - for (i = 0; i < dk_rcv->num_dests; i++) { - memcpy(&d->entrytable[i], &dk_rcv->entrytable[i], - sizeof(struct ip_vs_dest_entry_kern)); - d->entrytable[i].af = AF_INET; - d->entrytable[i].addr.ip = d->entrytable[i].__addr_v4; + + d->af = dpvs_dests_rcv->af; + memcpy(&d->addr, &dpvs_dests_rcv->addr, sizeof(d->addr)); + d->protocol = dpvs_dests_rcv->proto; + d->port = dpvs_dests_rcv->port; + d->fwmark = dpvs_dests_rcv->fwmark; + d->num_dests = dpvs_dests_rcv->num_dests; + snprintf(d->srange, sizeof(dpvs_dests_rcv->srange), "%s", dpvs_dests_rcv->srange); + snprintf(d->drange, sizeof(dpvs_dests_rcv->drange), "%s", dpvs_dests_rcv->drange); + snprintf(d->iifname, sizeof(dpvs_dests_rcv->iifname), "%s", dpvs_dests_rcv->iifname); + snprintf(d->oifname, sizeof(dpvs_dests_rcv->oifname), "%s", dpvs_dests_rcv->oifname); + if (d->af == AF_INET) { + d->__addr_v4 = d->addr.ip; } - free(dk); - dpvs_sockopt_msg_free(dk_rcv); + for (i = 0; i < dpvs_dests_rcv->num_dests; i++) { + ipvs_entry = &d->entrytable[i]; + dpvs_entry = &dpvs_dests_rcv->entrytable[i]; + DPRS_2_IPRS(ipvs_entry, dpvs_entry); + if (d->entrytable[i].af == AF_INET) + d->entrytable[i].__addr_v4= d->entrytable[i].addr.ip; + } + free(dpvs_dests); + dpvs_sockopt_msg_free(dpvs_dests_rcv); return d; } @@ -719,44 +789,37 
@@ void ipvs_sort_dests(struct ip_vs_get_dests *d, ipvs_dest_cmp_t f) ipvs_service_entry_t * ipvs_get_service(struct ip_vs_service_user *hint) { - ipvs_service_entry_t *svc,*svc_rcv; + ipvs_service_entry_t *svc; + struct dp_vs_service_entry dpvs_svc, *dpvs_svc_ptr, *dpvs_svc_rcv; socklen_t len; size_t len_rcv; - ipvs_func = ipvs_get_service; - len = sizeof(*svc); svc = calloc(1, len); if (!svc) return NULL; - len_rcv = len; memset((void *)svc, 0x00, len); - svc->fwmark = hint->fwmark; - svc->af = hint->af; - svc->protocol = hint->protocol; - svc->addr = hint->addr; - svc->port = hint->port; - snprintf(svc->srange, sizeof(svc->srange), "%s", hint->srange); - snprintf(svc->drange, sizeof(svc->drange), "%s", hint->drange); - snprintf(svc->iifname, sizeof(svc->iifname), "%s", hint->iifname); - snprintf(svc->oifname, sizeof(svc->oifname), "%s", hint->oifname); - - CHECK_COMPAT_SVC(svc, NULL); + len = sizeof(dpvs_svc); + len_rcv = sizeof(*dpvs_svc_rcv); + memset(&dpvs_svc, 0, len); + dpvs_svc_ptr = &dpvs_svc; + IPVS_2_DPVS(dpvs_svc_ptr, hint); + if (dpvs_getsockopt(DPVS_SO_GET_SERVICE, - svc, len, (void **)&svc_rcv, &len_rcv)) { - free(svc); - return NULL; + &dpvs_svc, len, (void **)&dpvs_svc_rcv, &len_rcv)) { + goto out_err; + } + + DPVS_2_IPVS(svc, dpvs_svc_rcv) + if (svc->af == AF_INET) { + svc->pe_name[0] = '\0'; + svc->__addr_v4 = svc->addr.ip; } - memcpy(svc, svc_rcv, len_rcv); - svc->af = AF_INET; - svc->addr.ip = svc->__addr_v4; - svc->pe_name[0] = '\0'; - dpvs_sockopt_msg_free(svc_rcv); + dpvs_sockopt_msg_free(dpvs_svc_rcv); return svc; out_err: free(svc); - dpvs_sockopt_msg_free(svc_rcv); return NULL; } @@ -766,20 +829,31 @@ void ipvs_free_service(ipvs_service_entry_t* p) free(p); } -int ipvs_set_route(struct dp_vs_route_conf* rt, int cmd) +int ipvs_set_route(struct dp_vs_route_conf *rt, int cmd) { int err = -1; if (cmd == IPROUTE_DEL){ err = dpvs_setsockopt(SOCKOPT_SET_ROUTE_DEL, rt, sizeof(struct dp_vs_route_conf)); - free(rt); - } - else if (cmd == 
IPROUTE_ADD){ + } else if (cmd == IPROUTE_ADD){ err = dpvs_setsockopt(SOCKOPT_SET_ROUTE_ADD, rt, sizeof(struct dp_vs_route_conf)); - free(rt); } return err; } +int ipvs_set_route6(struct dp_vs_route6_conf *rt6_cfg, int cmd) +{ + int err = -1; + if (cmd == IPROUTE_DEL) { + rt6_cfg->ops = RT6_OPS_DEL; + err = dpvs_setsockopt(SOCKOPT_SET_ROUTE6_ADD_DEL, rt6_cfg, + sizeof(struct dp_vs_route6_conf)); + } else if (cmd == IPROUTE_ADD) { + rt6_cfg->ops = RT6_OPS_ADD; + err = dpvs_setsockopt(SOCKOPT_SET_ROUTE6_ADD_DEL, rt6_cfg, + sizeof(struct dp_vs_route6_conf)); + } + return err; +} int ipvs_set_ipaddr(struct inet_addr_param *param, int cmd) { diff --git a/tools/keepalived/keepalived/libipvs-2.6/libipvs.h b/tools/keepalived/keepalived/libipvs-2.6/libipvs.h index 1455910ee..d3b2cdd66 100644 --- a/tools/keepalived/keepalived/libipvs-2.6/libipvs.h +++ b/tools/keepalived/keepalived/libipvs-2.6/libipvs.h @@ -185,6 +185,8 @@ extern int ipvs_send_gratuitous_arp(struct in_addr *in); extern int ipvs_set_route(struct dp_vs_route_conf*, int cmd); +extern int ipvs_set_route6(struct dp_vs_route6_conf*, int cmd); + extern int ipvs_set_ipaddr(struct inet_addr_param *param, int cmd); extern struct dp_vs_blklst_conf_array *ipvs_get_blklsts(void); diff --git a/tools/keepalived/keepalived/vrrp/vrrp_ipaddress.c b/tools/keepalived/keepalived/vrrp/vrrp_ipaddress.c index a4a1e3a25..f7f554b23 100644 --- a/tools/keepalived/keepalived/vrrp/vrrp_ipaddress.c +++ b/tools/keepalived/keepalived/vrrp/vrrp_ipaddress.c @@ -34,12 +34,16 @@ static void dpvs_fill_addrconf(ip_address_t *ipaddress, char *dpdk_port, struct inet_addr_param *param) { param->af = ipaddress->ifa.ifa_family; - param->addr.in = ipaddress->u.sin.sin_addr; if (dpdk_port) { strcpy(param->ifname, dpdk_port); } else { strcpy(param->ifname, ipaddress->ifp->ifname); } + if (param->af == AF_INET) + param->addr.in = ipaddress->u.sin.sin_addr; + else + param->addr.in6 = ipaddress->u.sin6_addr; + strcpy(param->ifname, dpdk_port); param->plen = 
ipaddress->ifa.ifa_prefixlen; param->flags &= ~IFA_F_SAPOOL; } diff --git a/tools/keepalived/keepalived/vrrp/vrrp_iproute.c b/tools/keepalived/keepalived/vrrp/vrrp_iproute.c index d99640f4a..42b3c2ff6 100644 --- a/tools/keepalived/keepalived/vrrp/vrrp_iproute.c +++ b/tools/keepalived/keepalived/vrrp/vrrp_iproute.c @@ -31,42 +31,78 @@ #include "memory.h" #include "utils.h" -void dpvs_fill_rtconf(ip_route_t *iproute, struct dp_vs_route_conf *route_conf) +/* + * refer to function netlink_scope_a2n + * */ +static int scope_n2dpvs(int scope) +{ + if (scope == 254) + return ROUTE_CF_SCOPE_HOST; + if (scope == 253) + return ROUTE_CF_SCOPE_LINK; + if (scope == 0) + return ROUTE_CF_SCOPE_GLOBAL; + return ROUTE_CF_SCOPE_GLOBAL; +} + +static int flag_n2dpvs(int scope) +{ + if (scope == 254) + return RTF_LOCALIN; + if (scope == 253) + return RTF_FORWARD; + return RTF_FORWARD; +} + +void dpvs_fill_rt4conf(ip_route_t *iproute, struct dp_vs_route_conf *route_conf) { route_conf->af = AF_INET; - (route_conf->dst).in = ((iproute->dst)->u).sin.sin_addr; - route_conf->plen = iproute->dmask; - if(iproute->gw){ - (route_conf->via).in = ((iproute->gw)->u).sin.sin_addr; - } - else{ + (route_conf->dst).in = (iproute->dst->u).sin.sin_addr; + route_conf->plen = iproute->dmask; + + if (iproute->gw){ + (route_conf->via).in = (iproute->gw->u).sin.sin_addr; + } else { (route_conf->via).in.s_addr = 0; } - if(iproute->src){ - (route_conf->src).in = ((iproute->src) -> u).sin.sin_addr; - } - else - (route_conf->src).in.s_addr = 0; - if(iproute->scope == 254) { - route_conf->scope = ROUTE_CF_SCOPE_HOST; - } - else if(iproute->scope == 253) { - route_conf->scope = ROUTE_CF_SCOPE_LINK; - } - else if(iproute->scope == 0) { - route_conf->scope = ROUTE_CF_SCOPE_GLOBAL; + if (iproute->src){ + (route_conf->src).in = (iproute->src->u).sin.sin_addr; + } else { + (route_conf->src).in.s_addr = 0; } - + + route_conf->scope = scope_n2dpvs(iproute->scope); strcpy(route_conf->ifname, iproute->ifname); 
route_conf->mtu = 0; route_conf->metric = 0; } +void dpvs_fill_rt6conf(ip_route_t *iproute, struct dp_vs_route6_conf *rt6_cfg) +{ + rt6_cfg->dst.addr = ((iproute->dst)->u).sin6_addr; + rt6_cfg->dst.plen = iproute->dmask; + rt6_cfg->src.plen = 128; + if (iproute->gw) { + rt6_cfg->gateway = (iproute->gw->u).sin6_addr; + } else { + memset(&rt6_cfg->gateway, 0, sizeof(rt6_cfg->gateway)); + } + + if (iproute->src) { + rt6_cfg->src.addr = (iproute->src->u).sin6_addr; + } else { + memset(&rt6_cfg->src, 0, sizeof(rt6_cfg->src)); + } + + rt6_cfg->flags |= flag_n2dpvs(iproute->scope); + strcpy(rt6_cfg->ifname, iproute->ifname); + rt6_cfg->mtu = 0; +} + int netlink_route(ip_route_t *iproute, int cmd) { - struct dp_vs_route_conf *route_conf; char *tmp_dst,*tmp_src; tmp_dst = ipaddresstos(iproute->dst); @@ -76,9 +112,22 @@ netlink_route(ip_route_t *iproute, int cmd) cmd, tmp_dst, iproute->dmask, tmp_src, iproute->ifname, iproute->scope); FREE(tmp_dst); FREE(tmp_src); - route_conf = (struct dp_vs_route_conf *)malloc(sizeof(struct dp_vs_route_conf)); - dpvs_fill_rtconf(iproute, route_conf); - ipvs_set_route(route_conf, cmd); + + if (iproute->dst->ifa.ifa_family == AF_INET) { + struct dp_vs_route_conf *route_conf; + route_conf = (struct dp_vs_route_conf *)malloc(sizeof(struct dp_vs_route_conf)); + memset(route_conf, 0, sizeof(*route_conf)); + dpvs_fill_rt4conf(iproute, route_conf); + ipvs_set_route(route_conf, cmd); + free(route_conf); + } else { + struct dp_vs_route6_conf *rt6_cfg; + rt6_cfg = (struct dp_vs_route6_conf *)malloc(sizeof(struct dp_vs_route6_conf)); + memset(rt6_cfg, 0, sizeof(*rt6_cfg)); + dpvs_fill_rt6conf(iproute, rt6_cfg); + ipvs_set_route6(rt6_cfg, cmd); + free(rt6_cfg); + } return 1; } diff --git a/uoa/example/udp_serv.c b/uoa/example/udp_serv.c deleted file mode 100644 index 1179829d9..000000000 --- a/uoa/example/udp_serv.c +++ /dev/null @@ -1,106 +0,0 @@ -/* - * DPVS is a software load balancer (Virtual Server) based on DPDK. 
- * - * Copyright (C) 2018 iQIYI (www.iqiyi.com). - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ -/* - * Example UDP server to get real client IP/port by UOA. - * - * raychen@qiyi.com, Mar 2018, initial. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "common.h" /* for __u8, __be16, __be32, __u64 only, - just define them if not want common.h */ -#include "uoa.h" - -#define SA struct sockaddr -#define SERV_PORT 6000 - -int main(int argc, char *argv[]) -{ - int sockfd, n, enable = 1; - char buff[4096], from[64]; - struct sockaddr_in local, peer; - struct uoa_param_map map; - socklen_t len, mlen; - - if ((sockfd = socket(AF_INET, SOCK_DGRAM, 0)) < 0) { - perror("fail to create socket"); - exit(1); - } - - setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable)); - setsockopt(sockfd, SOL_SOCKET, SO_REUSEPORT, &enable, sizeof(enable)); - - memset(&local, 0, sizeof(struct sockaddr_in)); - local.sin_family = AF_INET; - local.sin_port = htons(SERV_PORT); - local.sin_addr.s_addr = htonl(INADDR_ANY); - - if (bind(sockfd, (struct sockaddr *)&local, sizeof(local)) != 0) { - perror("bind"); - exit(1); - } - - while (1) { - len = sizeof(peer); - n = recvfrom(sockfd, buff, sizeof(buff), 0, (SA *)&peer, &len); - if (n < 0) { - perror("recvfrom"); - break; - } - - inet_ntop(AF_INET, &peer.sin_addr, from, sizeof(from)); -#if 0 - printf("Receive %d bytes from %s:%d\n", - n, from, ntohs(peer.sin_port)); -#endif - - /* - * get 
real client address: - * - * note: src/dst is for original pkt, so peer is - * "orginal" source, instead of local. wildcard - * lookup for daddr (or local IP) is supported. - */ - memset(&map, 0, sizeof(map)); - map.saddr = peer.sin_addr.s_addr; - map.sport = peer.sin_port; - map.daddr = htonl(INADDR_ANY); - map.dport = htons(SERV_PORT); - mlen = sizeof(map); - - if (getsockopt(sockfd, IPPROTO_IP, UOA_SO_GET_LOOKUP, - &map, &mlen) == 0) { - inet_ntop(AF_INET, &map.real_saddr, from, sizeof(from)); - printf(" real client %s:%d\n", - from, ntohs(map.real_sport)); - } - - len = sizeof(peer); - sendto(sockfd, buff, n, 0, (SA *)&peer, len); - } - - close(sockfd); - exit(0); -} diff --git a/uoa/example/uperf.c b/uoa/example/uperf.c deleted file mode 100644 index 2fcae2dd1..000000000 --- a/uoa/example/uperf.c +++ /dev/null @@ -1,540 +0,0 @@ -/* - * DPVS is a software load balancer (Virtual Server) based on DPDK. - * - * Copyright (C) 2018 iQIYI (www.iqiyi.com). - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ -/* - * UDP client for performance (high concurrency) test. - * - * raychen@qiyi.com, Mar 2018, initial. 
- */ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define SA struct sockaddr - -#define DEF_SERV_PORT 6000 -#define DEF_MAX_CONN 2000 /* per worker */ -#define DEF_DURATION 10 /* seconds */ -#define DEF_PKT_SIZE 1000 /* bytes */ -#define DEF_DUMP_INTV 1 /* seconds */ - -struct config { - int max_conn; /* max conn per worker */ - int duration; /* test duration in seconds */ - int pkt_size; /* packet size in bytes */ - int interval; /* dump interval seconds */ - struct sockaddr_in servaddr; /* server address */ -}; - -struct stats { - uint64_t tot_conns; - uint64_t conns; - uint64_t pkts_sent; - uint64_t pkts_recv; - uint64_t bytes_sent; - uint64_t bytes_recv; - uint64_t errors; -}; - -struct worker { - int cpu; - pid_t pid; - struct config conf; - struct stats stats; - char *sndbuf; - char *rcvbuf; -}; - -static cpu_set_t cpuset; /* cpu for workers */ -static sig_atomic_t quit_test = 0; /* for master */ -static sig_atomic_t quit_client = 0; /* for worker */ - -static struct worker workers[CPU_SETSIZE] = {}; - -static void usage(const char *prog) -{ - fprintf(stderr, "Usage:\n"); - fprintf(stderr, " %s [OPTIONS] host[:port]\n", prog); - fprintf(stderr, "Options:\n"); - fprintf(stderr, " -c CPUMASK workers' CPU mask in hex format.\n"); - fprintf(stderr, " -m MAXCONN connection per worker (CPU).\n"); - fprintf(stderr, " -t DRUATION test duration in second.\n"); - fprintf(stderr, " -s SIZE packet size (payload) in byte.\n"); - fprintf(stderr, " -i INTERVAL print interval in second.\n"); - fprintf(stderr, " -h show this help info.\n"); - fprintf(stderr, "Examples:\n"); - fprintf(stderr, " %s 127.0.0.1\n", prog); - fprintf(stderr, " %s -c 1f 1.1.1.1:1234\n", prog); - fprintf(stderr, " %s -c f -m 1000 -t 10 -s 10 2.2.2.2:5000\n", prog); -} - -static void sig_quit(int signo) -{ - 
quit_test = 1; -} - -static void sig_quit_client(int signo) -{ - quit_client = 1; -} - -static int hexstr_to_cpuset(const char *hex, cpu_set_t *set) -{ - const char *c; - unsigned long long mask; - int cpu; - - if (!hex || !set) - return -1; - - for (c = hex; *c != '\0'; c++) { - if (!isxdigit(*c)) - return -1; - } - - CPU_ZERO(set); - mask = strtoull(hex, NULL, 16); - - for (cpu = 0; cpu < sizeof(mask) * 8; cpu++) { - if (mask & (0x1LL<max_conn = DEF_MAX_CONN; - conf->duration = DEF_DURATION; - conf->pkt_size = DEF_PKT_SIZE; - conf->interval = DEF_DUMP_INTV; - conf->servaddr.sin_family = AF_INET; - conf->servaddr.sin_port = htons(DEF_SERV_PORT); - - if (argc <= 1) { - usage(prog); - exit(0); - } - - while ((opt = getopt_long(argc, argv, "hc:m:t:s:i:", - opts, NULL)) != -1) { - switch (opt) { - case 'h': - usage(prog); - exit(0); - case 'c': - if (hexstr_to_cpuset(optarg, &cpuset) != 0) { - fprintf(stderr, "Bad CPU mask: %s\n", optarg); - exit(1); - } - break; - case 'm': - conf->max_conn = atoi(optarg); - break; - case 't': - conf->duration = atoi(optarg); - if (conf->duration <= 0) { - fprintf(stderr, "Invalid duration.\n"); - exit(1); - } - break; - case 's': - conf->pkt_size = atoi(optarg); - break; - case 'i': - conf->interval = atoi(optarg); - if (conf->interval <= 0) { - fprintf(stderr, "Invalid interval.\n"); - exit(1); - } - break; - case '?': - default: - fprintf(stderr, "Invalid option: %s\n", argv[optind]); - return -1; - } - } - - argc -= optind; - argv += optind; - - if (argc < 1) { - fprintf(stderr, "Missing server IP address.\n"); - exit(1); - } - - host = argv[0]; - port = strrchr(host, ':'); - if (port) - *port++ = '\0'; - - if (inet_pton(AF_INET, host, &conf->servaddr.sin_addr) <= 0) { - fprintf(stderr, "Invalid host IP: %s\n", host); - exit(1); - } - - if (port) { - if (atoi(port) <= 0 || atoi(port) >= 65535) { - fprintf(stderr, "Invalid port: %s\n", port); - exit(1); - } - - conf->servaddr.sin_port = htons(atoi(port)); - } - - return 0; -} - 
-static inline void dump_stats(int cpu, const struct stats *st) -{ - printf("[% 2d] %5"PRIu64" %8"PRIu64" %8"PRIu64" %12"PRIu64" %12"PRIu64" %8"PRIu64" %8"PRIu64"\n", - cpu, st->conns, st->pkts_recv, st->pkts_sent, - st->bytes_recv, st->bytes_sent, st->errors, st->tot_conns); -} - -static int udp_new_conn(int epfd, struct worker *wk) -{ - int sockfd; - struct epoll_event ev; - socklen_t salen = sizeof(struct sockaddr_in); - - sockfd = socket(AF_INET, SOCK_DGRAM, 0); - if (sockfd < 0) { - perror("socket"); - return -1; - } - - /* use connect to receive ICMP port unreachable. */ - if (connect(sockfd, (SA *)&wk->conf.servaddr, salen) != 0) { - perror("connect"); - close(sockfd); - return -1; - } - - if (send(sockfd, wk->sndbuf, wk->conf.pkt_size, 0) != wk->conf.pkt_size) { - perror("send"); - close(sockfd); - return -1; - } - - wk->stats.pkts_sent++; - wk->stats.bytes_sent += wk->conf.pkt_size; - - fcntl(sockfd, F_SETFL, fcntl(sockfd, F_GETFL, 0) | O_NONBLOCK); - - memset(&ev, 0, sizeof(ev)); - ev.events = EPOLLIN | EPOLLERR; - ev.data.fd = sockfd; - if (epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev) != 0) { - perror("epoll_ctl"); - close(sockfd); - return -1; - } - - wk->stats.conns++; - wk->stats.tot_conns++; - - return 0; -} - -static void udp_handle_reply(int epfd, int fd, struct worker *wk) -{ - int n; - - n = recv(fd, wk->rcvbuf, wk->conf.pkt_size, 0); - - if (n < 0) { - /* we're nonblock recv */ - if (errno == EINTR && errno == EAGAIN) - return; - - wk->stats.errors++; - } - - epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL); - close(fd); - - wk->stats.conns--; - if (n >= 0) { - wk->stats.pkts_recv++; - wk->stats.bytes_recv += n; - } -} - -static void udp_client(struct worker *wk) -{ - int epfd, nfds, timeo, i; - struct epoll_event *events; - struct config *conf = &wk->conf; - struct stats *stats = &wk->stats; - struct timespec ts_start, ts_now, ts_elapse, ts_dump; - - events = malloc(conf->max_conn * sizeof(struct epoll_event)); - if (!events) { - fprintf(stderr, "%s: no 
memory\n", __func__); - exit(1); - } - - wk->sndbuf = malloc(conf->pkt_size); - wk->rcvbuf = malloc(conf->pkt_size); - if (!wk->sndbuf || !wk->rcvbuf) { - fprintf(stderr, "%s: no memory\n", __func__); - exit(1); - } - - /* generate random alpha string for UDP payload. */ - for (i = 0; i < conf->pkt_size; i++) - wk->sndbuf[i] = 'A' + (random() % 26); - - /* - * each socket send one packet and receive a reply, - * try to create "connections" until max_conn reached. - * - * use epoll to avoid block on recv reply. - */ - epfd = epoll_create1(0); - if (epfd < 0) { - perror("epoll_create1"); - exit(1); - } - - signal(SIGQUIT, sig_quit_client); - - memset(stats, 0, sizeof(*stats)); - clock_gettime(CLOCK_MONOTONIC_COARSE, &ts_start); - clock_gettime(CLOCK_MONOTONIC_COARSE, &ts_dump); - - /* 0123 01234 01234567 01234567 012345678901 012345678901 01234567 01234567 */ - printf("CPU%d conns ipackets opackets ibytes obytes errors tot-conn\n", wk->cpu); - - /* main loop */ - while (1) { - if (quit_test || quit_client) - break; - - /* try create conn as much as possible */ - while (stats->conns < conf->max_conn) - udp_new_conn(epfd, wk); - - clock_gettime(CLOCK_MONOTONIC_COARSE, &ts_now); - timespec_sub(&ts_now, &ts_start, &ts_elapse); - - /* stop test if duration reached. */ - if (ts_elapse.tv_sec >= conf->duration) - break; - - /* decide wait timeout for MIN(interval, duration_remain). - * calculate in ms */ - timeo = (conf->duration - ts_elapse.tv_sec) * 1000 \ - - ts_elapse.tv_nsec / 1000000; - timeo = (timeo <= conf->interval * 1000) ? 
timeo : - conf->interval * 1000; - - /* dump stats with interval */ - timespec_sub(&ts_now, &ts_dump, &ts_elapse); - if (ts_elapse.tv_sec >= conf->interval) { - dump_stats(wk->cpu, stats); - ts_dump = ts_now; - } - - nfds = epoll_wait(epfd, events, conf->max_conn, timeo); - if (nfds == -1) { - perror("epoll_wait"); - exit(1); - } - - for (i = 0; i < nfds; i++) { - udp_handle_reply(epfd, events[i].data.fd, wk); - } - } - - clock_gettime(CLOCK_MONOTONIC_COARSE, &ts_now); - timespec_sub(&ts_now, &ts_start, &ts_elapse); - - dump_stats(wk->cpu, stats); - - printf("[%2d] --------\n", wk->cpu); - printf("[%2d] Summary: total connection %"PRIu64", errors %"PRIu64" duration %lu.%03lu\n", - wk->cpu, stats->tot_conns, stats->errors, ts_elapse.tv_sec, ts_elapse.tv_nsec / 1000000); - printf("[%2d] RX %lu pps %lu B/s, TX %lu pps %lu B/s\n", wk->cpu, - stats->pkts_recv * 1000 / (ts_elapse.tv_sec * 1000 + ts_elapse.tv_nsec / 1000000), - stats->bytes_recv * 1000 / (ts_elapse.tv_sec * 1000 + ts_elapse.tv_nsec / 1000000), - stats->pkts_sent * 1000 / (ts_elapse.tv_sec * 1000 + ts_elapse.tv_nsec / 1000000), - stats->bytes_sent * 1000 / (ts_elapse.tv_sec * 1000 + ts_elapse.tv_nsec / 1000000)); - - /* exiting, nothing need to release. 
*/ - return; -} - -static int new_worker(const int cpu, const struct config *conf) -{ - pid_t pid; - - workers[cpu].cpu = cpu; - workers[cpu].conf = *conf; - - pid = fork(); - - if (pid > 0) { /* master */ - workers[cpu].pid = pid; - } else if (pid == 0) { /* worker */ - cpu_set_t set; - - CPU_ZERO(&set); - CPU_SET(cpu, &set); - if (sched_setaffinity(getpid(), CPU_SETSIZE, &set) != 0) - perror("sched_setaffinity"); - - udp_client(&workers[cpu]); - - exit(1); /* never return */ - } else { - fprintf(stderr, "%s: fail to fork worker\n", __func__); - return -1; - } - - return 0; -} - -int main(int argc, char *argv[]) -{ - int cpu; - int num_workers = 0; - struct config conf; - struct rlimit limit; - - if (parse_args(argc, argv, &conf) != 0) - exit(1); - - /* example only, pls use sigaction */ - signal(SIGINT, sig_quit); - - /* extend open-file limit as needed. */ - if (getrlimit(RLIMIT_OFILE, &limit) == 0) { - limit.rlim_cur = limit.rlim_max; - if (setrlimit(RLIMIT_OFILE, &limit) != 0) - perror("setrlimit(OFILE)"); - } - - /* standalone mode ? */ - if (CPU_COUNT(&cpuset) == 0) { - struct worker *wk = &workers[0]; - - /* master itself is worker (client) */ - memset(wk, 0, sizeof(*wk)); - wk->cpu = 0; - wk->pid = getpid(); - wk->conf = conf; - - udp_client(wk); - exit(0); - } - - /* - * master/worker mode. - * let worker to performe test. - */ - for (cpu = 0; cpu < CPU_SETSIZE; cpu++) { - if (!CPU_ISSET(cpu, &cpuset)) - continue; - - if (new_worker(cpu, &conf) == 0) - num_workers++; - } - - /* abort test if no worker created ! */ - if (!num_workers) - exit(1); - - /* wait all workers exit or user stop the test */ - while (num_workers) { - while (waitpid(-1, NULL, WNOHANG) > 0) - num_workers--; - - /* kill all workers if user stop test */ - if (quit_test) { - for (cpu = 0; cpu < CPU_SETSIZE; cpu++) { - if (workers[cpu].pid == 0) - continue; - - kill(workers[cpu].pid, SIGQUIT); - } - - quit_test = 0; - } - - sleep(1); - } - - printf("Test stopped!\n"); - exit(0); -}