Skip to content

Commit

Permalink
ice: replace custom AIM algorithm with kernel's DIM library
Browse files Browse the repository at this point in the history
The ice driver has support for adaptive interrupt moderation, an
algorithm for tuning the interrupt rate dynamically. This algorithm
is based on various assumptions about ring size, socket buffer size,
link speed, SKB overhead, ethernet frame overhead and more.

The Linux kernel has support for a dynamic interrupt moderation
algorithm known as "dimlib". Replace the custom driver-specific
implementation of dynamic interrupt moderation with the kernel's
algorithm.

The Intel hardware has a different hardware implementation than the
originators of the dimlib code had to work with, which requires the
driver to use a slightly different set of inputs for the actual
moderation values, while getting all the advice from dimlib of
better/worse, shift left or right.

The change made for this implementation is to use a pair of values
for each of the 5 "slots" that the dimlib moderation expects, and
the driver will program those pairs when dimlib recommends a slot to
use. The currently implementation uses two tables, one for receive
and one for transmit, and the pairs of values in each slot set the
maximum delay of an interrupt and a maximum number of interrupts per
second (both expressed in microseconds).

There are two separate kinds of bugs fixed by using DIMLIB, one is
UDP single stream send was too slow, and the other is that 8K
ping-pong was going to the most aggressive moderation and has much
too high latency.

The overall result of using DIMLIB is that we meet or exceed our
performance expectations set based on the old algorithm.

Co-developed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
  • Loading branch information
jacob-keller authored and anguy11 committed Apr 15, 2021
1 parent b8b4772 commit cdf1f1f
Show file tree
Hide file tree
Showing 7 changed files with 175 additions and 267 deletions.
1 change: 1 addition & 0 deletions drivers/net/ethernet/intel/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ config ICE
tristate "Intel(R) Ethernet Connection E800 Series Support"
default n
depends on PCI_MSI
select DIMLIB
select NET_DEVLINK
select PLDMFW
help
Expand Down
5 changes: 4 additions & 1 deletion drivers/net/ethernet/intel/ice/ice.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <linux/bpf.h>
#include <linux/avf/virtchnl.h>
#include <linux/cpu_rmap.h>
#include <linux/dim.h>
#include <net/devlink.h>
#include <net/ipv6.h>
#include <net/xdp_sock.h>
Expand Down Expand Up @@ -351,7 +352,7 @@ struct ice_q_vector {
u16 reg_idx;
u8 num_ring_rx; /* total number of Rx rings in vector */
u8 num_ring_tx; /* total number of Tx rings in vector */
u8 itr_countdown; /* when 0 should adjust adaptive ITR */
u8 wb_on_itr:1; /* if true, WB on ITR is enabled */
/* in usecs, need to use ice_intrl_to_usecs_reg() before writing this
* value to the device
*/
Expand All @@ -366,6 +367,8 @@ struct ice_q_vector {
struct irq_affinity_notify affinity_notify;

char name[ICE_INT_NAME_STR_LEN];

u16 total_events; /* net_dim(): number of interrupts processed */
} ____cacheline_internodealigned_in_smp;

enum ice_pf_flags {
Expand Down
6 changes: 6 additions & 0 deletions drivers/net/ethernet/intel/ice/ice_ethtool.c
Original file line number Diff line number Diff line change
Expand Up @@ -3634,6 +3634,12 @@ ice_set_rc_coalesce(enum ice_container_type c_type, struct ethtool_coalesce *ec,
ICE_MAX_INTRL);
return -EINVAL;
}
if (ec->rx_coalesce_usecs_high != rc->ring->q_vector->intrl &&
(ec->use_adaptive_rx_coalesce || ec->use_adaptive_tx_coalesce)) {
netdev_info(vsi->netdev, "Invalid value, %s-usecs-high cannot be changed if adaptive-tx or adaptive-rx is enabled\n",
c_type_str);
return -EINVAL;
}
if (ec->rx_coalesce_usecs_high != rc->ring->q_vector->intrl) {
rc->ring->q_vector->intrl = ec->rx_coalesce_usecs_high;
ice_write_intrl(rc->ring->q_vector,
Expand Down
13 changes: 5 additions & 8 deletions drivers/net/ethernet/intel/ice/ice_lib.c
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,8 @@ static irqreturn_t ice_msix_clean_rings(int __always_unused irq, void *data)
if (!q_vector->tx.ring && !q_vector->rx.ring)
return IRQ_HANDLED;

q_vector->total_events++;

napi_schedule(&q_vector->napi);

return IRQ_HANDLED;
Expand Down Expand Up @@ -3270,20 +3272,15 @@ int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc)
/**
* ice_update_ring_stats - Update ring statistics
* @ring: ring to update
* @cont: used to increment per-vector counters
* @pkts: number of processed packets
* @bytes: number of processed bytes
*
* This function assumes that caller has acquired a u64_stats_sync lock.
*/
static void
ice_update_ring_stats(struct ice_ring *ring, struct ice_ring_container *cont,
u64 pkts, u64 bytes)
static void ice_update_ring_stats(struct ice_ring *ring, u64 pkts, u64 bytes)
{
ring->stats.bytes += bytes;
ring->stats.pkts += pkts;
cont->total_bytes += bytes;
cont->total_pkts += pkts;
}

/**
Expand All @@ -3295,7 +3292,7 @@ ice_update_ring_stats(struct ice_ring *ring, struct ice_ring_container *cont,
void ice_update_tx_ring_stats(struct ice_ring *tx_ring, u64 pkts, u64 bytes)
{
u64_stats_update_begin(&tx_ring->syncp);
ice_update_ring_stats(tx_ring, &tx_ring->q_vector->tx, pkts, bytes);
ice_update_ring_stats(tx_ring, pkts, bytes);
u64_stats_update_end(&tx_ring->syncp);
}

Expand All @@ -3308,7 +3305,7 @@ void ice_update_tx_ring_stats(struct ice_ring *tx_ring, u64 pkts, u64 bytes)
void ice_update_rx_ring_stats(struct ice_ring *rx_ring, u64 pkts, u64 bytes)
{
u64_stats_update_begin(&rx_ring->syncp);
ice_update_ring_stats(rx_ring, &rx_ring->q_vector->rx, pkts, bytes);
ice_update_ring_stats(rx_ring, pkts, bytes);
u64_stats_update_end(&rx_ring->syncp);
}

Expand Down
108 changes: 108 additions & 0 deletions drivers/net/ethernet/intel/ice/ice_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -5196,6 +5196,105 @@ int ice_vsi_cfg(struct ice_vsi *vsi)
return err;
}

/* THEORY OF MODERATION:
* The below code creates custom DIM profiles for use by this driver, because
* the ice driver hardware works differently than the hardware that DIMLIB was
* originally made for. ice hardware doesn't have packet count limits that
* can trigger an interrupt, but it *does* have interrupt rate limit support,
* and this code adds that capability to be used by the driver when it's using
* DIMLIB. The DIMLIB code was always designed to be a suggestion to the driver
* for how to "respond" to traffic and interrupts, so this driver uses a
* slightly different set of moderation parameters to get best performance.
*/
struct ice_dim {
/* the throttle rate for interrupts, basically worst case delay before
* an initial interrupt fires, value is stored in microseconds.
*/
u16 itr;
/* the rate limit for interrupts, which can cap a delay from a small
* ITR at a certain amount of interrupts per second. f.e. a 2us ITR
* could yield as much as 500,000 interrupts per second, but with a
* 10us rate limit, it limits to 100,000 interrupts per second. Value
* is stored in microseconds.
*/
u16 intrl;
};

/* Make a different profile for Rx that doesn't allow quite so aggressive
* moderation at the high end (it maxes out at 128us or about 8k interrupts a
* second. The INTRL/rate parameters here are only useful to cap small ITR
* values, which is why for larger ITR's - like 128, which can only generate
* 8k interrupts per second, there is no point to rate limit and the values
* are set to zero. The rate limit values do affect latency, and so must
* be reasonably small so to not impact latency sensitive tests.
*/
static const struct ice_dim rx_profile[] = {
{2, 10},
{8, 16},
{32, 0},
{96, 0},
{128, 0}
};

/* The transmit profile, which has the same sorts of values
* as the previous struct
*/
static const struct ice_dim tx_profile[] = {
{2, 10},
{8, 16},
{64, 0},
{128, 0},
{256, 0}
};

static void ice_tx_dim_work(struct work_struct *work)
{
struct ice_ring_container *rc;
struct ice_q_vector *q_vector;
struct dim *dim;
u16 itr, intrl;

dim = container_of(work, struct dim, work);
rc = container_of(dim, struct ice_ring_container, dim);
q_vector = container_of(rc, struct ice_q_vector, tx);

if (dim->profile_ix >= ARRAY_SIZE(tx_profile))
dim->profile_ix = ARRAY_SIZE(tx_profile) - 1;

/* look up the values in our local table */
itr = tx_profile[dim->profile_ix].itr;
intrl = tx_profile[dim->profile_ix].intrl;

ice_write_itr(rc, itr);
ice_write_intrl(q_vector, intrl);

dim->state = DIM_START_MEASURE;
}

static void ice_rx_dim_work(struct work_struct *work)
{
struct ice_ring_container *rc;
struct ice_q_vector *q_vector;
struct dim *dim;
u16 itr, intrl;

dim = container_of(work, struct dim, work);
rc = container_of(dim, struct ice_ring_container, dim);
q_vector = container_of(rc, struct ice_q_vector, rx);

if (dim->profile_ix >= ARRAY_SIZE(rx_profile))
dim->profile_ix = ARRAY_SIZE(rx_profile) - 1;

/* look up the values in our local table */
itr = rx_profile[dim->profile_ix].itr;
intrl = rx_profile[dim->profile_ix].intrl;

ice_write_itr(rc, itr);
ice_write_intrl(q_vector, intrl);

dim->state = DIM_START_MEASURE;
}

/**
* ice_napi_enable_all - Enable NAPI for all q_vectors in the VSI
* @vsi: the VSI being configured
Expand All @@ -5210,6 +5309,12 @@ static void ice_napi_enable_all(struct ice_vsi *vsi)
ice_for_each_q_vector(vsi, q_idx) {
struct ice_q_vector *q_vector = vsi->q_vectors[q_idx];

INIT_WORK(&q_vector->tx.dim.work, ice_tx_dim_work);
q_vector->tx.dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;

INIT_WORK(&q_vector->rx.dim.work, ice_rx_dim_work);
q_vector->rx.dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;

if (q_vector->rx.ring || q_vector->tx.ring)
napi_enable(&q_vector->napi);
}
Expand Down Expand Up @@ -5618,6 +5723,9 @@ static void ice_napi_disable_all(struct ice_vsi *vsi)

if (q_vector->rx.ring || q_vector->tx.ring)
napi_disable(&q_vector->napi);

cancel_work_sync(&q_vector->tx.dim.work);
cancel_work_sync(&q_vector->rx.dim.work);
}
}

Expand Down
Loading

0 comments on commit cdf1f1f

Please sign in to comment.