Barriers #95

Merged (3 commits, Mar 14, 2017)
providers/mlx4/qp.c: 19 changes (7 additions, 12 deletions)
@@ -203,7 +203,7 @@ static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
 	 * chunk and get a valid (!= * 0xffffffff) byte count but
 	 * stale data, and end up sending the wrong data.
 	 */
-	udma_ordering_write_barrier();
+	udma_to_device_barrier();
 
 	if (likely(sg->length))
 		dseg->byte_count = htobe32(sg->length);
@@ -227,9 +227,6 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 
 	pthread_spin_lock(&qp->sq.lock);
 
-	/* Get all user DMA buffers ready to go */
-	udma_to_device_barrier();
-
 	/* XXX check that state is OK to post send */
 
 	ind = qp->sq.head;
@@ -402,7 +399,7 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 					wqe += to_copy;
 					addr += to_copy;
 					seg_len += to_copy;
-					udma_ordering_write_barrier(); /* see comment below */
+					udma_to_device_barrier(); /* see comment below */
 					seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
 					seg_len = 0;
 					seg = wqe;
@@ -430,7 +427,7 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 				 * data, and end up sending the wrong
 				 * data.
 				 */
-				udma_ordering_write_barrier();
+				udma_to_device_barrier();
 				seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
 			}
 
@@ -452,7 +449,7 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 		 * setting ownership bit (because HW can start
 		 * executing as soon as we do).
 		 */
-		udma_ordering_write_barrier();
+		udma_to_device_barrier();
 
 		ctrl->owner_opcode = htobe32(mlx4_ib_opcode[wr->opcode]) |
 			(ind & qp->sq.wqe_cnt ? htobe32(1 << 31) : 0);
@@ -476,18 +473,16 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 		ctrl->owner_opcode |= htobe32((qp->sq.head & 0xffff) << 8);
 
 		ctrl->bf_qpn |= qp->doorbell_qpn;
+		++qp->sq.head;
 		/*
 		 * Make sure that descriptor is written to memory
 		 * before writing to BlueFlame page.
 		 */
-		mmio_wc_start();
-
-		++qp->sq.head;
-
-		pthread_spin_lock(&ctx->bf_lock);
+		mmio_wc_spinlock(&ctx->bf_lock);
 
 		mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
 			     align(size * 16, 64));
+		/* Flush before toggling bf_offset to be latency oriented */
 		mmio_flush_writes();
 
 		ctx->bf_offset ^= ctx->bf_buf_size;
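Taken together, the mlx4 hunks above settle on one ordering pattern: the WQE is made visible with udma_to_device_barrier() before the ownership word is written, and the copy to the BlueFlame write-combining page sits between mmio_wc_spinlock() and an explicit mmio_flush_writes() that runs before the buffer half is toggled. The sketch below restates that pattern outside the provider. It is illustrative only: struct bf_doorbell, ring_blueflame(), and the memcpy stand-in for mlx4_bf_copy() are invented names, and it assumes the util/udma_barrier.h added in this series can be included on its own.

```c
/* Illustrative sketch, not provider code: the barrier pairing used above. */
#include <pthread.h>
#include <stdint.h>
#include <string.h>

#include "util/udma_barrier.h"	/* udma_to_device_barrier(), mmio_* helpers */

struct bf_doorbell {			/* hypothetical, loosely mirrors mlx4_context */
	pthread_spinlock_t lock;	/* serializes writers of the WC page */
	uint8_t *wc_page;		/* mapped BlueFlame (write-combining) page */
	unsigned int offset;		/* which half of the BlueFlame buffer to use */
	unsigned int buf_size;		/* size of one half */
};

static void ring_blueflame(struct bf_doorbell *bf, uint32_t *owner_word,
			   uint32_t owner_val, const void *desc, size_t len)
{
	/* All prior stores to the WQE must reach memory before the HCA can
	 * see a valid ownership word, hence the DMA barrier first. */
	udma_to_device_barrier();
	*owner_word = owner_val;

	/* Lock the WC region; on non-x86 this also opens a WC section. */
	mmio_wc_spinlock(&bf->lock);

	memcpy(bf->wc_page + bf->offset, desc, len);	/* stands in for mlx4_bf_copy() */

	/* Flush before toggling the offset, as in the hunk above, so the next
	 * writer's copy cannot combine with this one. */
	mmio_flush_writes();
	bf->offset ^= bf->buf_size;

	pthread_spin_unlock(&bf->lock);
}
```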
providers/mlx5/mlx5.c: 2 changes (1 addition, 1 deletion)
@@ -524,7 +524,7 @@ static int get_num_low_lat_uuars(int tot_uuars)
  */
 static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
 {
-	if (uuarn == 0)
+	if (uuarn == 0 || mlx5_single_threaded)
 		return 0;
 
 	if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
providers/mlx5/qp.c: 7 changes (4 additions, 3 deletions)
@@ -930,11 +930,11 @@ static inline int _mlx5_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 
 		/* Make sure that the doorbell write happens before the memcpy
 		 * to WC memory below */
-		mmio_wc_start();
-
 		ctx = to_mctx(ibqp->context);
 		if (bf->need_lock)
-			mlx5_spin_lock(&bf->lock);
+			mmio_wc_spinlock(&bf->lock.lock);
+		else
+			mmio_wc_start();
 
 		if (!ctx->shut_up_bf && nreq == 1 && bf->uuarn &&
 		    (inl || ctx->prefer_bf) && size > 1 &&
@@ -953,6 +953,7 @@ static inline int _mlx5_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 		 * writes doorbell 2, and it's write is flushed earlier. Since
 		 * the mmio_flush_writes is CPU local, this will result in the HCA seeing
 		 * doorbell 2, followed by doorbell 1.
+		 * Flush before toggling bf_offset to be latency oriented.
 		 */
 		mmio_flush_writes();
 		bf->offset ^= bf->buf_size;
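The shape of this change: when the uar needs a lock, the lock acquisition and the opening of the write-combining section are combined in mmio_wc_spinlock(); when it does not (for instance when mlx5_single_threaded is set, per the mlx5.c change above), a plain mmio_wc_start() is enough. Below is a hedged sketch of that shape only, not the provider's _mlx5_post_send(); write_doorbell, need_lock, and wc_reg are invented names, and the mmio_* helpers are the ones from util/udma_barrier.h.

```c
/* Illustrative sketch only; not the provider's post-send path. */
#include <pthread.h>
#include <string.h>

#include "util/udma_barrier.h"

static void write_doorbell(pthread_spinlock_t *lock, int need_lock,
			   void *wc_reg, const void *db, size_t len)
{
	if (need_lock)
		/* Lock and open the WC critical section together; on x86 the
		 * locked atomic already orders WC against other stores. */
		mmio_wc_spinlock(lock);
	else
		/* Single writer: no lock, just open the WC section. */
		mmio_wc_start();

	memcpy(wc_reg, db, len);	/* stands in for the BlueFlame copy */

	/* Flush inside the critical section so doorbells reach the HCA in
	 * lock-acquisition order, then drop the lock if we took it. */
	mmio_flush_writes();
	if (need_lock)
		pthread_spin_unlock(lock);
}
```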
util/udma_barrier.h: 35 changes (35 additions, 0 deletions)
@@ -33,6 +33,8 @@
 #ifndef __UTIL_UDMA_BARRIER_H
 #define __UTIL_UDMA_BARRIER_H
 
+#include <pthread.h>
+
 /* Barriers for DMA.
 
    These barriers are expliclty only for use with user DMA operations. If you
@@ -222,4 +224,37 @@
 */
 #define mmio_ordered_writes_hack() mmio_flush_writes()
 
+/* Write Combining Spinlock primitive
+
+   Any access to a multi-value WC region must ensure that multiple cpus do not
+   write to the same values concurrently, these macros make that
+   straightforward and efficient if the choosen exclusion is a spinlock.
+
+   The spinlock guarantees that the WC writes issued within the critical
+   section are made visible as TLP to the device. The TLP must be seen by the
+   device strictly in the order that the spinlocks are acquired, and combining
+   WC writes between different sections is not permitted.
+
+   Use of these macros allow the fencing inside the spinlock to be combined
+   with the fencing required for DMA.
+*/
+static inline void mmio_wc_spinlock(pthread_spinlock_t *lock)
+{
+	pthread_spin_lock(lock);
+#if !defined(__i386__) && !defined(__x86_64__)
+	/* For x86 the serialization within the spin lock is enough to
+	 * strongly order WC and other memory types. */
+	mmio_wc_start();
+#endif
+}
+
+static inline void mmio_wc_spinunlock(pthread_spinlock_t *lock)
+{
+	/* It is possible that on x86 the atomic in the lock is strong enough
+	 * to force-flush the WC buffers quickly, and this SFENCE can be
+	 * omitted too. */
+	mmio_flush_writes();
+	pthread_spin_unlock(lock);
+}
+
 #endif
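To make the intended pairing concrete, here is a minimal usage sketch. It assumes wc_page points at a write-combining BAR mapping and db_lock has been initialised with pthread_spin_init(); post_doorbell is an invented name, not part of this header.

```c
/* Minimal usage sketch for the primitives above; names are illustrative. */
#include <pthread.h>
#include <string.h>

#include "util/udma_barrier.h"

static pthread_spinlock_t db_lock;	/* pthread_spin_init(&db_lock, ...) at setup */

static void post_doorbell(void *wc_page, const void *desc, size_t len)
{
	mmio_wc_spinlock(&db_lock);

	/* Stores inside the critical section target the WC page; the lock
	 * acquisition order is the order in which the device sees the TLPs. */
	memcpy(wc_page, desc, len);

	/* Flushes the WC buffers, then releases the lock. */
	mmio_wc_spinunlock(&db_lock);
}
```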