Barriers #95

Merged (3 commits, Mar 14, 2017)
providers/mlx4/qp.c: 19 changes (7 additions, 12 deletions)
@@ -203,7 +203,7 @@ static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
 	 * chunk and get a valid (!= * 0xffffffff) byte count but
 	 * stale data, and end up sending the wrong data.
 	 */
-	udma_ordering_write_barrier();
+	udma_to_device_barrier();
 
 	if (likely(sg->length))
 		dseg->byte_count = htobe32(sg->length);
@@ -227,9 +227,6 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 
 	pthread_spin_lock(&qp->sq.lock);
 
-	/* Get all user DMA buffers ready to go */
-	udma_to_device_barrier();
-
 	/* XXX check that state is OK to post send */
 
 	ind = qp->sq.head;
@@ -402,7 +399,7 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 					wqe += to_copy;
 					addr += to_copy;
 					seg_len += to_copy;
-					udma_ordering_write_barrier(); /* see comment below */
+					udma_to_device_barrier(); /* see comment below */
 					seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
 					seg_len = 0;
 					seg = wqe;
@@ -430,7 +427,7 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 				 * data, and end up sending the wrong
 				 * data.
 				 */
-				udma_ordering_write_barrier();
+				udma_to_device_barrier();
 				seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
 			}
 
@@ -452,7 +449,7 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 		 * setting ownership bit (because HW can start
 		 * executing as soon as we do).
 		 */
-		udma_ordering_write_barrier();
+		udma_to_device_barrier();
 
 		ctrl->owner_opcode = htobe32(mlx4_ib_opcode[wr->opcode]) |
 			(ind & qp->sq.wqe_cnt ? htobe32(1 << 31) : 0);
@@ -476,18 +473,16 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 		ctrl->owner_opcode |= htobe32((qp->sq.head & 0xffff) << 8);
 
 		ctrl->bf_qpn |= qp->doorbell_qpn;
+		++qp->sq.head;
 		/*
 		 * Make sure that descriptor is written to memory
 		 * before writing to BlueFlame page.
 		 */
-		mmio_wc_start();
-
-		++qp->sq.head;
-
-		pthread_spin_lock(&ctx->bf_lock);
+		mmio_wc_spinlock(&ctx->bf_lock);
 
 		mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
 			     align(size * 16, 64));
+		/* Flush before toggling bf_offset to be latency oriented */
 		mmio_flush_writes();
 
 		ctx->bf_offset ^= ctx->bf_buf_size;
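Taken together, the mlx4 hunks above settle on one ordering pattern: the WQE is made visible with udma_to_device_barrier() before the ownership word is written, and the copy to the BlueFlame write-combining page sits between mmio_wc_spinlock() and an explicit mmio_flush_writes() that runs before the buffer half is toggled. The sketch below restates that pattern outside the provider. It is illustrative only: struct bf_doorbell, ring_blueflame(), and the memcpy stand-in for mlx4_bf_copy() are invented names, and it assumes the util/udma_barrier.h added in this series can be included on its own.

```c
/* Illustrative sketch, not provider code: the barrier pairing used above. */
#include <pthread.h>
#include <stdint.h>
#include <string.h>

#include "util/udma_barrier.h"	/* udma_to_device_barrier(), mmio_* helpers */

struct bf_doorbell {			/* hypothetical, loosely mirrors mlx4_context */
	pthread_spinlock_t lock;	/* serializes writers of the WC page */
	uint8_t *wc_page;		/* mapped BlueFlame (write-combining) page */
	unsigned int offset;		/* which half of the BlueFlame buffer to use */
	unsigned int buf_size;		/* size of one half */
};

static void ring_blueflame(struct bf_doorbell *bf, uint32_t *owner_word,
			   uint32_t owner_val, const void *desc, size_t len)
{
	/* All prior stores to the WQE must reach memory before the HCA can
	 * see a valid ownership word, hence the DMA barrier first. */
	udma_to_device_barrier();
	*owner_word = owner_val;

	/* Lock the WC region; on non-x86 this also opens a WC section. */
	mmio_wc_spinlock(&bf->lock);

	memcpy(bf->wc_page + bf->offset, desc, len);	/* stands in for mlx4_bf_copy() */

	/* Flush before toggling the offset, as in the hunk above, so the next
	 * writer's copy cannot combine with this one. */
	mmio_flush_writes();
	bf->offset ^= bf->buf_size;

	pthread_spin_unlock(&bf->lock);
}
```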
providers/mlx5/mlx5.c: 2 changes (1 addition, 1 deletion)
@@ -524,7 +524,7 @@ static int get_num_low_lat_uuars(int tot_uuars)
  */
 static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
 {
-	if (uuarn == 0)
+	if (uuarn == 0 || mlx5_single_threaded)
 		return 0;
 
 	if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
providers/mlx5/qp.c: 7 changes (4 additions, 3 deletions)
@@ -930,11 +930,11 @@ static inline int _mlx5_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 
 		/* Make sure that the doorbell write happens before the memcpy
 		 * to WC memory below */
-		mmio_wc_start();
-
 		ctx = to_mctx(ibqp->context);
 		if (bf->need_lock)
-			mlx5_spin_lock(&bf->lock);
+			mmio_wc_spinlock(&bf->lock.lock);
+		else
+			mmio_wc_start();
 
 		if (!ctx->shut_up_bf && nreq == 1 && bf->uuarn &&
 		    (inl || ctx->prefer_bf) && size > 1 &&
@@ -953,6 +953,7 @@ static inline int _mlx5_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 		 * writes doorbell 2, and it's write is flushed earlier. Since
 		 * the mmio_flush_writes is CPU local, this will result in the HCA seeing
 		 * doorbell 2, followed by doorbell 1.
+		 * Flush before toggling bf_offset to be latency oriented.
 		 */
 		mmio_flush_writes();
 		bf->offset ^= bf->buf_size;
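The shape of this change: when the uar needs a lock, the lock acquisition and the opening of the write-combining section are combined in mmio_wc_spinlock(); when it does not (for instance when mlx5_single_threaded is set, per the mlx5.c change above), a plain mmio_wc_start() is enough. Below is a hedged sketch of that shape only, not the provider's _mlx5_post_send(); write_doorbell, need_lock, and wc_reg are invented names, and the mmio_* helpers are the ones from util/udma_barrier.h.

```c
/* Illustrative sketch only; not the provider's post-send path. */
#include <pthread.h>
#include <string.h>

#include "util/udma_barrier.h"

static void write_doorbell(pthread_spinlock_t *lock, int need_lock,
			   void *wc_reg, const void *db, size_t len)
{
	if (need_lock)
		/* Lock and open the WC critical section together; on x86 the
		 * locked atomic already orders WC against other stores. */
		mmio_wc_spinlock(lock);
	else
		/* Single writer: no lock, just open the WC section. */
		mmio_wc_start();

	memcpy(wc_reg, db, len);	/* stands in for the BlueFlame copy */

	/* Flush inside the critical section so doorbells reach the HCA in
	 * lock-acquisition order, then drop the lock if we took it. */
	mmio_flush_writes();
	if (need_lock)
		pthread_spin_unlock(lock);
}
```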
util/udma_barrier.h: 35 changes (35 additions, 0 deletions)
@@ -33,6 +33,8 @@
 #ifndef __UTIL_UDMA_BARRIER_H
 #define __UTIL_UDMA_BARRIER_H
 
+#include <pthread.h>
+
 /* Barriers for DMA.
 
    These barriers are expliclty only for use with user DMA operations. If you
@@ -222,4 +224,37 @@
 */
 #define mmio_ordered_writes_hack() mmio_flush_writes()
 
+/* Write Combining Spinlock primitive
+
+   Any access to a multi-value WC region must ensure that multiple cpus do not
+   write to the same values concurrently, these macros make that
+   straightforward and efficient if the choosen exclusion is a spinlock.
+
+   The spinlock guarantees that the WC writes issued within the critical
+   section are made visible as TLP to the device. The TLP must be seen by the
+   device strictly in the order that the spinlocks are acquired, and combining
+   WC writes between different sections is not permitted.
+
+   Use of these macros allow the fencing inside the spinlock to be combined
+   with the fencing required for DMA.
+*/
+static inline void mmio_wc_spinlock(pthread_spinlock_t *lock)
+{
+	pthread_spin_lock(lock);
+#if !defined(__i386__) && !defined(__x86_64__)
+	/* For x86 the serialization within the spin lock is enough to
+	 * strongly order WC and other memory types. */
+	mmio_wc_start();
+#endif
+}
+
+static inline void mmio_wc_spinunlock(pthread_spinlock_t *lock)
+{
+	/* It is possible that on x86 the atomic in the lock is strong enough
+	 * to force-flush the WC buffers quickly, and this SFENCE can be
+	 * omitted too. */
+	mmio_flush_writes();
+	pthread_spin_unlock(lock);
+}
+
 #endif
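To make the intended pairing concrete, here is a minimal usage sketch. It assumes wc_page points at a write-combining BAR mapping and db_lock has been initialised with pthread_spin_init(); post_doorbell is an invented name, not part of this header.

```c
/* Minimal usage sketch for the primitives above; names are illustrative. */
#include <pthread.h>
#include <string.h>

#include "util/udma_barrier.h"

static pthread_spinlock_t db_lock;	/* pthread_spin_init(&db_lock, ...) at setup */

static void post_doorbell(void *wc_page, const void *desc, size_t len)
{
	mmio_wc_spinlock(&db_lock);

	/* Stores inside the critical section target the WC page; the lock
	 * acquisition order is the order in which the device sees the TLPs. */
	memcpy(wc_page, desc, len);

	/* Flushes the WC buffers, then releases the lock. */
	mmio_wc_spinunlock(&db_lock);
}
```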