net/i40e: optimize Tx by using AVX512
Optimize Tx path by using AVX512 instructions and vectorize the
tx free bufs process.

Signed-off-by: Leyi Rong <leyi.rong@intel.com>
Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
Ninja-Mobius authored and Ferruh Yigit committed Jan 19, 2021
1 parent e6a6a13 commit 5171b4e
Showing 3 changed files with 155 additions and 20 deletions.
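The heart of the patch is the new Tx completion path: when the fast-free offload applies, freed mbuf pointers are copied into the per-lcore mempool cache 32 at a time with four 512-bit loads and stores (see i40e_tx_free_bufs_avx512() below). The following standalone sketch shows just that batched pointer copy on plain void * arrays; the function and variable names are invented for illustration, and unaligned intrinsics are used because ordinary stack arrays carry no 64-byte alignment guarantee (the driver can use an aligned load for its software ring):

/* Illustrative sketch, not driver code: move 32 pointers per loop
 * iteration with four 64-byte AVX512 copies, mirroring the cache
 * refill in i40e_tx_free_bufs_avx512(). Build with -mavx512f.
 */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

static void
copy_ptrs_avx512(void **dst, void * const *src, uint32_t n)
{
	uint32_t copied = 0;

	/* caller guarantees n is a multiple of 32, as the driver's
	 * (n & 31) == 0 check does for tx_rs_thresh
	 */
	while (copied < n) {
		const __m512i a = _mm512_loadu_si512(&src[copied]);
		const __m512i b = _mm512_loadu_si512(&src[copied + 8]);
		const __m512i c = _mm512_loadu_si512(&src[copied + 16]);
		const __m512i d = _mm512_loadu_si512(&src[copied + 24]);

		_mm512_storeu_si512(&dst[copied], a);
		_mm512_storeu_si512(&dst[copied + 8], b);
		_mm512_storeu_si512(&dst[copied + 16], c);
		_mm512_storeu_si512(&dst[copied + 24], d);
		copied += 32;
	}
}

int
main(void)
{
	void *src[32], *dst[32];
	uint32_t i;

	for (i = 0; i < 32; i++)
		src[i] = (void *)(uintptr_t)(i + 1);
	copy_ptrs_avx512(dst, src, 32);
	printf("last pointer: %p\n", dst[31]);	/* prints 0x20 */
	return 0;
}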
19 changes: 19 additions & 0 deletions drivers/net/i40e/i40e_rxtx.c
@@ -2533,6 +2533,25 @@ i40e_tx_queue_release_mbufs(struct i40e_tx_queue *txq)
* vPMD tx will not set sw_ring's mbuf to NULL after free,
* so need to free remains more carefully.
*/
#ifdef CC_AVX512_SUPPORT
if (dev->tx_pkt_burst == i40e_xmit_pkts_vec_avx512) {
struct i40e_vec_tx_entry *swr = (void *)txq->sw_ring;

i = txq->tx_next_dd - txq->tx_rs_thresh + 1;
if (txq->tx_tail < i) {
for (; i < txq->nb_tx_desc; i++) {
rte_pktmbuf_free_seg(swr[i].mbuf);
swr[i].mbuf = NULL;
}
i = 0;
}
for (; i < txq->tx_tail; i++) {
rte_pktmbuf_free_seg(swr[i].mbuf);
swr[i].mbuf = NULL;
}
return;
}
#endif
if (dev->tx_pkt_burst == i40e_xmit_pkts_vec_avx2 ||
dev->tx_pkt_burst == i40e_xmit_pkts_vec) {
i = txq->tx_next_dd - txq->tx_rs_thresh + 1;
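As a side note, the cleanup added above has to cope with the software ring wrapping: it starts at the oldest in-flight entry (tx_next_dd - tx_rs_thresh + 1), runs to the end of the ring if that index is past tx_tail, then continues from index 0 up to tx_tail. The same walk on a plain pointer array, purely as an illustration (free() stands in for rte_pktmbuf_free_seg(), and all names are invented):

#include <stdlib.h>

/* Illustrative stand-in for the wrap-around cleanup above. */
static void
free_in_flight(void **ring, unsigned int ring_size,
	       unsigned int start, unsigned int tail)
{
	unsigned int i = start;

	if (tail < i) {
		/* from the oldest entry up to the end of the ring */
		for (; i < ring_size; i++) {
			free(ring[i]);
			ring[i] = NULL;
		}
		i = 0;
	}
	/* then from the ring start (or the oldest entry) up to tail */
	for (; i < tail; i++) {
		free(ring[i]);
		ring[i] = NULL;
	}
}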
4 changes: 4 additions & 0 deletions drivers/net/i40e/i40e_rxtx.h
@@ -129,6 +129,10 @@ struct i40e_tx_entry {
uint16_t last_id;
};

struct i40e_vec_tx_entry {
struct rte_mbuf *mbuf;
};

/*
* Structure associated with each TX queue.
*/
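One point worth noting about the new entry type above: it holds nothing but the mbuf pointer (the scalar i40e_tx_entry also carries next_id/last_id, which the vector path never reads), so on the 64-bit targets this code is built for, eight entries occupy exactly one 64-byte register. That is what lets i40e_tx_free_bufs_avx512() below step through the software ring eight entries per 512-bit load. A small compile-time check, illustrative only and not part of the patch:

/* Illustrative compile-time check: a vector Tx entry is a single
 * 8-byte pointer, so eight of them fill one 64-byte (512-bit) register.
 */
struct rte_mbuf;	/* opaque here; the real type is in rte_mbuf.h */

struct demo_vec_tx_entry {	/* stand-in for struct i40e_vec_tx_entry */
	struct rte_mbuf *mbuf;
};

_Static_assert(8 * sizeof(struct demo_vec_tx_entry) == 64,
	       "eight vector tx entries must fill one zmm register");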
152 changes: 132 additions & 20 deletions drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -873,6 +873,115 @@ i40e_recv_scattered_pkts_vec_avx512(void *rx_queue,
rx_pkts + retval, nb_pkts);
}

static __rte_always_inline int
i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
{
struct i40e_vec_tx_entry *txep;
uint32_t n;
uint32_t i;
int nb_free = 0;
struct rte_mbuf *m, *free[RTE_I40E_TX_MAX_FREE_BUF_SZ];

/* check DD bits on threshold descriptor */
if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE))
return 0;

n = txq->tx_rs_thresh;

/* first buffer to free from S/W ring is at index
* tx_next_dd - (tx_rs_thresh-1)
*/
txep = (void *)txq->sw_ring;
txep += txq->tx_next_dd - (n - 1);

if (txq->offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
struct rte_mempool *mp = txep[0].mbuf->pool;
void **cache_objs;
struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
rte_lcore_id());

if (!cache || cache->len == 0)
goto normal;

cache_objs = &cache->objs[cache->len];

if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
goto done;
}

/* The cache is refilled in two steps:
* 1. Add the freed mbuf pointers to the cache.
* 2. If the cache length then crosses the flush threshold, everything
* above the nominal cache size is flushed back to the mempool ring.
*/
/* Add elements back into the cache */
uint32_t copied = 0;
/* n is multiple of 32 */
while (copied < n) {
const __m512i a = _mm512_load_si512(&txep[copied]);
const __m512i b = _mm512_load_si512(&txep[copied + 8]);
const __m512i c = _mm512_load_si512(&txep[copied + 16]);
const __m512i d = _mm512_load_si512(&txep[copied + 24]);

_mm512_storeu_si512(&cache_objs[copied], a);
_mm512_storeu_si512(&cache_objs[copied + 8], b);
_mm512_storeu_si512(&cache_objs[copied + 16], c);
_mm512_storeu_si512(&cache_objs[copied + 24], d);
copied += 32;
}
cache->len += n;

if (cache->len >= cache->flushthresh) {
rte_mempool_ops_enqueue_bulk
(mp, &cache->objs[cache->size],
cache->len - cache->size);
cache->len = cache->size;
}
goto done;
}

normal:
m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
if (likely(m)) {
free[0] = m;
nb_free = 1;
for (i = 1; i < n; i++) {
rte_prefetch0(&txep[i + 3].mbuf->cacheline1);
m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
if (likely(m)) {
if (likely(m->pool == free[0]->pool)) {
free[nb_free++] = m;
} else {
rte_mempool_put_bulk(free[0]->pool,
(void *)free,
nb_free);
free[0] = m;
nb_free = 1;
}
}
}
rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
} else {
for (i = 1; i < n; i++) {
m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
if (m)
rte_mempool_put(m->pool, m);
}
}

done:
/* buffers were freed, update counters */
txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
if (txq->tx_next_dd >= txq->nb_tx_desc)
txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);

return txq->tx_rs_thresh;
}

static inline void
vtx1(volatile struct i40e_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags)
{
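The fast branch of i40e_tx_free_bufs_avx512() above only runs when the queue was configured with DEV_TX_OFFLOAD_MBUF_FAST_FREE and tx_rs_thresh is a multiple of 32; with that offload the application promises that all mbufs on the queue come from one mempool and are not referenced elsewhere, which is what makes the raw pointer copy into the cache safe. Otherwise the code falls back to the rte_pktmbuf_prefree_seg() path. A minimal sketch of requesting that offload at configure time, with a hypothetical helper name and a single Rx/Tx queue assumed:

#include <rte_ethdev.h>

/* Illustrative helper (not from the patch): request MBUF_FAST_FREE
 * when the port supports it, so the vector Tx free path can push
 * mbufs straight into the per-lcore mempool cache.
 */
static int
configure_port_with_fast_free(uint16_t port_id)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf conf = { .txmode = { .offloads = 0 } };
	int ret;

	ret = rte_eth_dev_info_get(port_id, &dev_info);
	if (ret != 0)
		return ret;

	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
		conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;

	/* one Rx queue and one Tx queue, just for the example */
	return rte_eth_dev_configure(port_id, 1, 1, &conf);
}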
@@ -892,13 +1001,6 @@ vtx(volatile struct i40e_tx_desc *txdp,
const uint64_t hi_qw_tmpl = (I40E_TX_DESC_DTYPE_DATA |
((uint64_t)flags << I40E_TXD_QW1_CMD_SHIFT));

- /* if unaligned on 32-bit boundary, do one to align */
- if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
- vtx1(txdp, *pkt, flags);
- nb_pkts--, txdp++, pkt++;
- }
-
- /* do two at a time while possible, in bursts */
for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
uint64_t hi_qw3 =
hi_qw_tmpl |
Expand All @@ -917,14 +1019,13 @@ vtx(volatile struct i40e_tx_desc *txdp,
((uint64_t)pkt[0]->data_len <<
I40E_TXD_QW1_TX_BUF_SZ_SHIFT);

- __m256i desc2_3 = _mm256_set_epi64x
- (hi_qw3, pkt[3]->buf_iova + pkt[3]->data_off,
- hi_qw2, pkt[2]->buf_iova + pkt[2]->data_off);
- __m256i desc0_1 = _mm256_set_epi64x
- (hi_qw1, pkt[1]->buf_iova + pkt[1]->data_off,
- hi_qw0, pkt[0]->buf_iova + pkt[0]->data_off);
- _mm256_store_si256((void *)(txdp + 2), desc2_3);
- _mm256_store_si256((void *)txdp, desc0_1);
+ __m512i desc0_3 =
+ _mm512_set_epi64
+ (hi_qw3, pkt[3]->buf_iova + pkt[3]->data_off,
+ hi_qw2, pkt[2]->buf_iova + pkt[2]->data_off,
+ hi_qw1, pkt[1]->buf_iova + pkt[1]->data_off,
+ hi_qw0, pkt[0]->buf_iova + pkt[0]->data_off);
+ _mm512_storeu_si512((void *)txdp, desc0_3);
}

/* do any last ones */
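A note on the 512-bit descriptor build above: _mm512_set_epi64() takes its arguments from the most-significant qword down to the least, so the trailing pair (hi_qw0, buffer address 0) lands in the lowest 16 bytes of the register and therefore at txdp[0] after the store, preserving the descriptor order the two 256-bit stores produced before. A small standalone check of that ordering, with arbitrary stand-in values:

/* Illustrative only: verify _mm512_set_epi64 argument ordering.
 * Build with -mavx512f.
 */
#include <immintrin.h>
#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	uint64_t out[8];
	/* stand-ins for (hi_qw3, addr3, ..., hi_qw0, addr0) */
	const __m512i desc0_3 = _mm512_set_epi64(73, 63, 72, 62,
						 71, 61, 70, 60);

	_mm512_storeu_si512((void *)out, desc0_3);
	/* prints "60 70": the last argument pair lands at the lowest
	 * address, i.e. descriptor 0's buffer address, then its qw1
	 */
	printf("%" PRIu64 " %" PRIu64 "\n", out[0], out[1]);
	return 0;
}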
@@ -934,13 +1035,23 @@ vtx(volatile struct i40e_tx_desc *txdp,
}
}

static __rte_always_inline void
tx_backlog_entry_avx512(struct i40e_vec_tx_entry *txep,
struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
int i;

for (i = 0; i < (int)nb_pkts; ++i)
txep[i].mbuf = tx_pkts[i];
}

static inline uint16_t
i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts)
{
struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;
volatile struct i40e_tx_desc *txdp;
- struct i40e_tx_entry *txep;
+ struct i40e_vec_tx_entry *txep;
uint16_t n, nb_commit, tx_id;
uint64_t flags = I40E_TD_CMD;
uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD;
@@ -949,21 +1060,22 @@ i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);

if (txq->nb_tx_free < txq->tx_free_thresh)
- i40e_tx_free_bufs(txq);
+ i40e_tx_free_bufs_avx512(txq);

nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
if (unlikely(nb_pkts == 0))
return 0;

tx_id = txq->tx_tail;
txdp = &txq->tx_ring[tx_id];
- txep = &txq->sw_ring[tx_id];
+ txep = (void *)txq->sw_ring;
+ txep += tx_id;

txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);

n = (uint16_t)(txq->nb_tx_desc - tx_id);
if (nb_commit >= n) {
- tx_backlog_entry(txep, tx_pkts, n);
+ tx_backlog_entry_avx512(txep, tx_pkts, n);

vtx(txdp, tx_pkts, n - 1, flags);
tx_pkts += (n - 1);
@@ -977,11 +1089,11 @@ i40e_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);

/* avoid reach the end of ring */
- txdp = &txq->tx_ring[tx_id];
- txep = &txq->sw_ring[tx_id];
+ txdp = txq->tx_ring;
+ txep = (void *)txq->sw_ring;
}

- tx_backlog_entry(txep, tx_pkts, nb_commit);
+ tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);

vtx(txdp, tx_pkts, nb_commit, flags);

