From 357283edddf7a74fc0c4264ba7296f297ea0f133 Mon Sep 17 00:00:00 2001
From: JackH <jack@realtimeradio.co.uk>
Date: Tue, 16 Jun 2020 08:23:01 -0700
Subject: [PATCH 01/91] Add LWA352 packet format

---
 python/bifrost/libbifrost.py     |   1 +
 python/bifrost/packet_capture.py |   4 +
 src/bifrost/packet_capture.h     |   4 +
 src/formats/base.hpp             |   4 +
 src/formats/formats.hpp          |   1 +
 src/formats/snap2.hpp            | 190 +++++++++++++++++++++++++++++++
 src/packet_capture.cpp           |  10 ++
 src/packet_capture.hpp           |  90 ++++++++++++++-
 8 files changed, 303 insertions(+), 1 deletion(-)
 create mode 100644 src/formats/snap2.hpp

diff --git a/python/bifrost/libbifrost.py b/python/bifrost/libbifrost.py
index 897d9fd65..68e64654c 100644
--- a/python/bifrost/libbifrost.py
+++ b/python/bifrost/libbifrost.py
@@ -47,6 +47,7 @@ class BifrostObject(object):
     """Base class for simple objects with create/destroy functions"""
     def __init__(self, constructor, destructor, *args):
         self.obj = destructor.argtypes[0]()
+        print(self.obj)
         _check(constructor(ctypes.byref(self.obj), *args))
         self._destructor = destructor
     def _destroy(self):
diff --git a/python/bifrost/packet_capture.py b/python/bifrost/packet_capture.py
index f5fc446ca..fc02d4ffb 100644
--- a/python/bifrost/packet_capture.py
+++ b/python/bifrost/packet_capture.py
@@ -40,6 +40,10 @@ def set_chips(self, fnc):
         self._ref_cache['chips'] = _bf.BFpacketcapture_chips_sequence_callback(fnc)
         _check(_bf.bfPacketCaptureCallbackSetCHIPS(
                self.obj, self._ref_cache['chips']))
+    def set_snap2(self, fnc):
+        self._ref_cache['snap2'] = _bf.BFpacketcapture_snap2_sequence_callback(fnc)
+        _check(_bf.bfPacketCaptureCallbackSetSNAP2(
+               self.obj, self._ref_cache['snap2']))
     def set_ibeam(self, fnc):
         self._ref_cache['ibeam'] = _bf.BFpacketcapture_ibeam_sequence_callback(fnc)
         _check(_bf.bfPacketCaptureCallbackSetIBeam(
diff --git a/src/bifrost/packet_capture.h b/src/bifrost/packet_capture.h
index b32af9bdb..0bf38c64e 100644
--- a/src/bifrost/packet_capture.h
+++ b/src/bifrost/packet_capture.h
@@ -39,6 +39,8 @@ extern "C" {
 
 typedef int (*BFpacketcapture_chips_sequence_callback)(BFoffset, int, int, int,
                                                        BFoffset*, void const**, size_t*);
+typedef int (*BFpacketcapture_snap2_sequence_callback)(BFoffset, int, int, int,
+                                                       BFoffset*, void const**, size_t*);
 typedef int (*BFpacketcapture_ibeam_sequence_callback)(BFoffset, int, int, int,
                                                        BFoffset*, void const**, size_t*);
 typedef int (*BFpacketcapture_cor_sequence_callback)(BFoffset, BFoffset, int, int,
@@ -56,6 +58,8 @@ BFstatus bfPacketCaptureCallbackCreate(BFpacketcapture_callback* obj);
 BFstatus bfPacketCaptureCallbackDestroy(BFpacketcapture_callback obj);
 BFstatus bfPacketCaptureCallbackSetCHIPS(BFpacketcapture_callback obj,
                                          BFpacketcapture_chips_sequence_callback callback);
+BFstatus bfPacketCaptureCallbackSetSNAP2(BFpacketcapture_callback obj,
+                                         BFpacketcapture_snap2_sequence_callback callback);
 BFstatus bfPacketCaptureCallbackSetIBeam(BFpacketcapture_callback obj,
                                          BFpacketcapture_ibeam_sequence_callback callback);
 BFstatus bfPacketCaptureCallbackSetCOR(BFpacketcapture_callback obj,
diff --git a/src/formats/base.hpp b/src/formats/base.hpp
index 8bc05fedc..8e545a611 100644
--- a/src/formats/base.hpp
+++ b/src/formats/base.hpp
@@ -64,6 +64,10 @@ struct PacketDesc {
 	int            src;
 	int            nchan;
 	int            chan0;
+	int            nchan_tot;
+	int            npol;
+	int            pol0;
+	int            npol_tot;
 	uint32_t       sync;
 	uint64_t       time_tag;
 	int            tuning;
diff --git a/src/formats/formats.hpp b/src/formats/formats.hpp
index 311319169..6e2f4c527 100644
--- a/src/formats/formats.hpp
+++ b/src/formats/formats.hpp
@@ -35,3 +35,4 @@
 #include "tbn.hpp"
 #include "tbf.hpp"
 #include "ibeam.hpp"
+#include "snap2.hpp"
diff --git a/src/formats/snap2.hpp b/src/formats/snap2.hpp
new file mode 100644
index 000000000..a63108c86
--- /dev/null
+++ b/src/formats/snap2.hpp
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2019, The Bifrost Authors. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * * Neither the name of The Bifrost Authors nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include "base.hpp"
+
+//#include <immintrin.h> // SSE
+
+#define SNAP2_HEADER_MAGIC 0xaabbccdd
+
+// TODO: parameterize somewhere. This isn't
+// related to the packet formatting
+#define PIPELINE_NPOL 704
+#define PIPELINE_NCHAN 32
+
+#pragma pack(1)
+// All entries are network (i.e. big) endian
+struct snap2_hdr_type {
+        uint64_t  seq;       // Spectra counter == packet counter
+        uint32_t  magic;     // = 0xaabbccdd
+        uint16_t  npol;      // Number of pols in this packet
+        uint16_t  npol_tot;      // Number of pols total
+        uint16_t  nchan;     // Number of channels in this packet
+        uint16_t  nchan_tot;     // Number of channels total (for this pipeline)
+        uint32_t  chan_block_id; // ID of this block of chans
+        uint32_t  chan0;     // First channel in this packet 
+        uint32_t  pol0;      // First pol in this packet 
+};
+
+/*
+ * The PacketDecoder's job is to unpack
+ * a packet into a standard PacketDesc
+ * format, and verify that a packet
+ * is valid.
+ */
+
+#define BF_SNAP2_DEBUG 0
+
+class SNAP2Decoder : virtual public PacketDecoder {
+protected:
+    inline bool valid_packet(const PacketDesc* pkt) const {
+#if BFSNAP2_DEBUG
+        cout << "seq: "<< pkt->seq << endl;
+        cout << "src: "<< pkt->src << endl;
+        cout << "nsrc: "<< pkt->nsrc << endl;
+        cout << "nchan: "<< pkt->nchan << endl;
+        cout << "chan0: "<< pkt->chan0 << endl;
+#endif
+        return ( 
+                 pkt->seq >= 0
+                 && pkt->src >= 0
+                 && pkt->src < _nsrc
+                 && pkt->nsrc == _nsrc
+                 && pkt->chan0 >= 0
+               );
+    }
+public:
+    SNAP2Decoder(int nsrc, int src0) : PacketDecoder(nsrc, src0) {}
+    inline bool operator()(const uint8_t* pkt_ptr,
+                           int            pkt_size,
+                           PacketDesc*    pkt) const {
+        if( pkt_size < (int)sizeof(snap2_hdr_type) ) {
+            return false;
+        }
+        const snap2_hdr_type* pkt_hdr  = (snap2_hdr_type*)pkt_ptr;
+        const uint8_t*        pkt_pld  = pkt_ptr  + sizeof(snap2_hdr_type);
+        int                   pld_size = pkt_size - sizeof(snap2_hdr_type);
+        if( be32toh(pkt_hdr->magic) != SNAP2_HEADER_MAGIC ) {
+            return false;
+        }
+        pkt->seq   = be64toh(pkt_hdr->seq);
+        int npol_blocks  = (be16toh(pkt_hdr->npol_tot) / be16toh(pkt_hdr->npol));
+        int nchan_blocks = (be16toh(pkt_hdr->nchan_tot) / be16toh(pkt_hdr->nchan));
+
+        pkt->nsrc = npol_blocks * nchan_blocks;// _nsrc;
+        pkt->src = (npol_blocks * be16toh(pkt_hdr->chan_block_id)) +  (be16toh(pkt_hdr->npol_tot) / be16toh(pkt_hdr->npol));
+        pkt->nchan  = be16toh(pkt_hdr->nchan);
+        pkt->chan0  = be32toh(pkt_hdr->chan_block_id) * be16toh(pkt_hdr->nchan);
+        pkt->nchan_tot  = be16toh(pkt_hdr->nchan_tot);
+        pkt->npol  = be16toh(pkt_hdr->npol);
+        pkt->npol_tot  = be16toh(pkt_hdr->npol_tot);
+        pkt->pol0  = be32toh(pkt_hdr->pol0);
+        pkt->payload_size = pld_size;
+        pkt->payload_ptr  = pkt_pld;
+        return this->valid_packet(pkt);
+    }
+};
+
+class SNAP2Processor : virtual public PacketProcessor {
+protected:
+    int _pipeline_nchan = PIPELINE_NCHAN;
+public:
+    inline void operator()(const PacketDesc* pkt,
+                           uint64_t          seq0,
+                           uint64_t          nseq_per_obuf,
+                           int               nbuf,
+                           uint8_t*          obufs[],
+                           size_t            ngood_bytes[],
+                           size_t*           src_ngood_bytes[]) {
+            int    obuf_idx = ((pkt->seq - seq0 >= 1*nseq_per_obuf) +
+                               (pkt->seq - seq0 >= 2*nseq_per_obuf));
+            size_t obuf_seq0 = seq0 + obuf_idx*nseq_per_obuf;
+            size_t nbyte = pkt->payload_size * BF_UNPACK_FACTOR;
+            ngood_bytes[obuf_idx]               += nbyte;
+            src_ngood_bytes[obuf_idx][pkt->src] += nbyte;
+            int payload_size = pkt->payload_size;
+        
+            size_t obuf_offset = (pkt->seq-obuf_seq0)*pkt->nsrc*payload_size;
+            typedef unaligned256_type itype; //256 bits = 32 pols / word
+            typedef aligned256_type otype;
+        
+            obuf_offset *= BF_UNPACK_FACTOR;
+        
+            // Note: Using these SSE types allows the compiler to use SSE instructions
+            //         However, they require aligned memory (otherwise segfault)
+            itype const* __restrict__ in  = (itype const*)pkt->payload_ptr;
+            otype*       __restrict__ out = (otype*      )&obufs[obuf_idx][obuf_offset];
+        
+            // Copy packet payload one channel at a time.
+            // Packets have payload format nchans x npols x complexity
+            // spacing with which channel chunks are copied depends
+            // on the total number of channels/pols in the system
+            for(int chan=0; chan<pkt->nchan; chan++) {
+                 //   // TODO: AVX stores here will probably be much faster
+                 //   ::memcpy(&out[(((pkt->npol_tot) * (pkt->chan0 + chan)) + (pkt->pol0)) / 32],
+                 //            &in[(pkt->npol / 32) * chan], pkt->npol / 32);
+            }
+    }
+
+    inline void blank_out_source(uint8_t* data,
+                                     int      src,
+                                     int      nsrc,
+                                     int      nchan,
+                                     int      nseq) {
+            typedef aligned256_type otype;
+            otype* __restrict__ aligned_data = (otype*)data;
+            for( int t=0; t<nseq; ++t ) {
+                    for( int c=0; c<nchan; ++c ) {
+                            ::memset(&aligned_data[src + nsrc*(c + nchan*t)],
+                                     0, sizeof(otype));
+                    }
+            }
+    }
+};
+
+class SNAP2HeaderFiller : virtual public PacketHeaderFiller {
+public:
+    inline int get_size() { return sizeof(chips_hdr_type); }
+    inline void operator()(const PacketDesc* hdr_base,
+                           BFoffset          framecount,
+                           char*             hdr) {
+        chips_hdr_type* header = reinterpret_cast<chips_hdr_type*>(hdr);
+        memset(header, 0, sizeof(chips_hdr_type));
+        
+        header->roach    = hdr_base->src + 1;
+        header->gbe      = hdr_base->tuning;
+        header->nchan    = hdr_base->nchan;
+        header->nsubband = 1;       // Should be changable?
+        header->subband  = 0;       // Should be changable?
+        header->nroach   = hdr_base->nsrc;
+        header->chan0    = htons(hdr_base->chan0);
+        header->seq      = htobe64(hdr_base->seq);
+    }
+};
diff --git a/src/packet_capture.cpp b/src/packet_capture.cpp
index ca594764f..05fadc816 100644
--- a/src/packet_capture.cpp
+++ b/src/packet_capture.cpp
@@ -132,6 +132,13 @@ BFstatus bfPacketCaptureCallbackSetCHIPS(BFpacketcapture_callback obj,
     return BF_STATUS_SUCCESS;
 }
 
+BFstatus bfPacketCaptureCallbackSetSNAP2(BFpacketcapture_callback obj,
+                                         BFpacketcapture_snap2_sequence_callback callback) {
+    BF_ASSERT(obj, BF_STATUS_INVALID_HANDLE);
+    obj->set_snap2(callback);
+    return BF_STATUS_SUCCESS;
+}
+
 BFstatus bfPacketCaptureCallbackSetIBeam(BFpacketcapture_callback obj,
                                          BFpacketcapture_ibeam_sequence_callback callback) {
     BF_ASSERT(obj, BF_STATUS_INVALID_HANDLE);
@@ -245,9 +252,12 @@ BFpacketcapture_status BFpacketcapture_impl::recv() {
 				ret = BF_CAPTURE_CONTINUED;
 			}
 		}
+	    BF_PRINTD("_bufs.size(): " << _bufs.size());
 		if( _bufs.size() == 2 ) {
+	        BF_PRINTD("Committing buffer");
 			this->commit_buf();
 		}
+	    BF_PRINTD("Rseerving buffer");
 		this->reserve_buf();
 	} else {
 		
diff --git a/src/packet_capture.hpp b/src/packet_capture.hpp
index bce215344..e4eb55528 100644
--- a/src/packet_capture.hpp
+++ b/src/packet_capture.hpp
@@ -350,6 +350,7 @@ inline uint64_t round_nearest(uint64_t val, uint64_t mult) {
 
 class BFpacketcapture_callback_impl {
     BFpacketcapture_chips_sequence_callback _chips_callback;
+    BFpacketcapture_snap2_sequence_callback _snap2_callback;
     BFpacketcapture_ibeam_sequence_callback _ibeam_callback;
     BFpacketcapture_cor_sequence_callback   _cor_callback;
     BFpacketcapture_vdif_sequence_callback  _vdif_callback;
@@ -358,13 +359,20 @@ class BFpacketcapture_callback_impl {
 public:
     BFpacketcapture_callback_impl()
      : _chips_callback(NULL), _ibeam_callback(NULL), _cor_callback(NULL), 
-        _vdif_callback(NULL), _tbn_callback(NULL), _drx_callback(NULL) {}
+        _vdif_callback(NULL), _tbn_callback(NULL), _drx_callback(NULL),
+        _snap2_callback(NULL) {}
     inline void set_chips(BFpacketcapture_chips_sequence_callback callback) {
         _chips_callback = callback;
     }
     inline BFpacketcapture_chips_sequence_callback get_chips() {
         return _chips_callback;
     }
+    inline void set_snap2(BFpacketcapture_snap2_sequence_callback callback) {
+        _snap2_callback = callback;
+    }
+    inline BFpacketcapture_snap2_sequence_callback get_snap2() {
+        return _snap2_callback;
+    }
     inline void set_ibeam(BFpacketcapture_ibeam_sequence_callback callback) {
         _ibeam_callback = callback;
     }
@@ -625,6 +633,81 @@ class BFpacketcapture_chips_impl : public BFpacketcapture_impl {
 	}
 };
 
+class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
+	ProcLog            _type_log;
+	ProcLog            _chan_log;
+	
+	BFpacketcapture_snap2_sequence_callback _sequence_callback;
+	
+	void on_sequence_start(const PacketDesc* pkt, BFoffset* seq0, BFoffset* time_tag, const void** hdr, size_t* hdr_size ) {
+        // TODO: Might be safer to round to nearest here, but the current firmware
+		//         always starts things ~3 seq's before the 1sec boundary anyway.
+		//seq = round_up(pkt->seq, _slot_ntime);
+		//*_seq          = round_nearest(pkt->seq, _slot_ntime);
+		_seq          = round_up(pkt->seq, _slot_ntime);
+		this->on_sequence_changed(pkt, seq0, time_tag, hdr, hdr_size);
+    }
+    void on_sequence_active(const PacketDesc* pkt) {
+        if( pkt ) {
+		    //cout << "Latest nchan, chan0 = " << pkt->nchan << ", " << pkt->chan0 << endl;
+		}
+		else {
+			//cout << "No latest packet" << endl;
+		}
+	}
+    // Has the configuration changed? I.e., different channels being sent.
+	inline bool has_sequence_changed(const PacketDesc* pkt) {
+        // TODO: sequence never changes?
+        //return false;
+	    //return (pkt->seq % 128 == 0);
+	}
+	void on_sequence_changed(const PacketDesc* pkt, BFoffset* seq0, BFoffset* time_tag, const void** hdr, size_t* hdr_size) {
+	    *seq0 = _seq;// + _nseq_per_buf*_bufs.size();
+        _chan0 = pkt->chan0;
+        _nchan = pkt->nchan;
+        _payload_size = pkt->payload_size;
+        
+	    if( _sequence_callback ) {
+	        int status = (*_sequence_callback)(*seq0,
+			                                   _chan0,
+			                                   _nchan,
+			                                   _nsrc,
+			                                   time_tag,
+			                                   hdr,
+			                                   hdr_size);
+			if( status != 0 ) {
+			    // TODO: What to do here? Needed?
+				throw std::runtime_error("BAD HEADER CALLBACK STATUS");
+			}
+		} else {
+			// Simple default for easy testing
+			*time_tag = *seq0;
+			*hdr      = NULL;
+			*hdr_size = 0;
+		}
+        
+		_chan_log.update() << "chan0        : " << _chan0 << "\n"
+		                   << "nchan        : " << _nchan << "\n"
+		                   << "payload_size : " << _payload_size << "\n";
+    }
+public:
+	inline BFpacketcapture_snap2_impl(PacketCaptureThread* capture,
+	                                  BFring               ring,
+	                                  int                  nsrc,
+	                                  int                  src0,
+	                                  int                  buffer_ntime,
+	                                  int                  slot_ntime,
+	                                  BFpacketcapture_callback sequence_callback)
+		: BFpacketcapture_impl(capture, nullptr, nullptr, ring, nsrc, buffer_ntime, slot_ntime), 
+		  _type_log((std::string(capture->get_name())+"/type").c_str()),
+		  _chan_log((std::string(capture->get_name())+"/chans").c_str()),
+		  _sequence_callback(sequence_callback->get_snap2()) {
+		_decoder = new SNAP2Decoder(nsrc, src0);
+		_processor = new SNAP2Processor();
+		_type_log.update("type : %s\n", "snap2");
+	}
+};
+
 template<uint8_t B>
 class BFpacketcapture_ibeam_impl : public BFpacketcapture_impl {
     uint8_t            _nbeam = B;
@@ -1092,6 +1175,11 @@ BFstatus BFpacketcapture_create(BFpacketcapture* obj,
                                                                  buffer_ntime, slot_ntime,
                                                                  sequence_callback),
                            *obj = 0);
+    } else if( std::string(format).substr(0, 5) == std::string("snap2") ) {
+        BF_TRY_RETURN_ELSE(*obj = new BFpacketcapture_snap2_impl(capture, ring, nsrc, src0,
+                                                                 buffer_ntime, slot_ntime,
+                                                                 sequence_callback),
+                           *obj = 0);
     } else if( std::string(format).substr(0, 6) == std::string("ibeam2") ) {
         BF_TRY_RETURN_ELSE(*obj = new BFpacketcapture_ibeam_impl<2>(capture, ring, nsrc, src0,
                                                                     buffer_ntime, slot_ntime,

From 9fb5a0940289e0b794a3468574223178894fdc39 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Thu, 18 Jun 2020 17:09:57 +0000
Subject: [PATCH 02/91] Add xGPU bindings

This allows the DP4A library to be used, which is way faster
---
 src/Makefile          |  7 +++++
 src/bf_xgpu.cpp       | 60 +++++++++++++++++++++++++++++++++++++++++++
 src/bifrost/bf_xgpu.h |  9 +++++++
 3 files changed, 76 insertions(+)
 create mode 100644 src/bf_xgpu.cpp
 create mode 100644 src/bifrost/bf_xgpu.h

diff --git a/src/Makefile b/src/Makefile
index 72619d2a4..a79c574de 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -37,6 +37,10 @@ ifndef NOCUDA
   fir.o \
   guantize.o \
   gunpack.o
+ifdef XGPU
+  LIBBIFROST_OBJS += \
+  bf_xgpu.o
+endif
 endif
 
 JIT_SOURCES ?= \
@@ -110,6 +114,9 @@ endif
 ifndef NOCUDA
   CPPFLAGS  += -DBF_CUDA_ENABLED=1
   LIB       += -L$(CUDA_LIBDIR64) -L$(CUDA_LIBDIR) -lcuda -lcudart -lnvrtc -lcublas -lcudadevrt -L. -lcufft_static_pruned -lculibos -lnvToolsExt
+ifdef XGPU
+  LIB       += -lxgpu
+endif
 endif
 
 ifndef ANY_ARCH
diff --git a/src/bf_xgpu.cpp b/src/bf_xgpu.cpp
new file mode 100644
index 000000000..66ea4d5b2
--- /dev/null
+++ b/src/bf_xgpu.cpp
@@ -0,0 +1,60 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <math.h>
+#include <limits.h>
+#include <unistd.h>
+
+#include <bifrost/array.h>
+#include <bifrost/common.h>
+#include <utils.hpp>
+#include <stdlib.h>
+#include <stdio.h>
+#include <iostream>
+
+#include <xgpu.h>
+
+extern "C" {
+
+static XGPUContext context;
+static XGPUInfo info;
+
+BFstatus xgpuInitialize(BFarray *in, BFarray *out, int gpu_dev) {
+  int xgpu_error;
+  xgpuInfo(&info);
+  if (num_contiguous_elements(in) != info.vecLength) {
+    return BF_STATUS_INVALID_SHAPE;
+  }
+  if (num_contiguous_elements(out) != info.matLength) {
+    return BF_STATUS_INVALID_SHAPE;
+  }
+  context.array_h = (ComplexInput *)in->data;
+  context.array_len = info.vecLength;
+  context.matrix_h = (Complex *)out->data;
+  context.matrix_len = info.matLength;
+  xgpu_error = xgpuInit(&context, gpu_dev);
+  if (xgpu_error != XGPU_OK) {
+    return BF_STATUS_INTERNAL_ERROR;
+  } else {
+    return BF_STATUS_SUCCESS;
+  }
+}
+
+BFstatus xgpuCorrelate(BFarray *in, BFarray *out, int doDump) {
+  int xgpu_error;
+  context.array_h = (ComplexInput *)in->data;
+  context.array_len = info.vecLength;
+  context.matrix_h = (Complex *)out->data;
+  context.matrix_len = info.matLength;
+  xgpu_error = xgpuCudaXengineSwizzle(&context, doDump ? SYNCOP_DUMP : SYNCOP_SYNC_TRANSFER);
+  if (doDump) {
+    xgpuClearDeviceIntegrationBuffer(&context);
+  }
+  if (xgpu_error != XGPU_OK) {
+    return BF_STATUS_INTERNAL_ERROR;
+  } else {
+    return BF_STATUS_SUCCESS;
+  }
+}
+
+} // C
diff --git a/src/bifrost/bf_xgpu.h b/src/bifrost/bf_xgpu.h
new file mode 100644
index 000000000..120fc8232
--- /dev/null
+++ b/src/bifrost/bf_xgpu.h
@@ -0,0 +1,9 @@
+#include <bifrost/common.h>
+#include <bifrost/array.h>
+
+//extern "C" {
+
+BFstatus xgpuInitialize(BFarray *in, BFarray *out, int gpu_dev);
+BFstatus xgpuCorrelate(BFarray *in, BFarray *out, int doDump);
+
+//}

From 1cf8cdbbee181de1bade6cd5855ca7e445ee7976 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Fri, 19 Jun 2020 13:39:50 +0000
Subject: [PATCH 03/91] remove print

---
 python/bifrost/libbifrost.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/bifrost/libbifrost.py b/python/bifrost/libbifrost.py
index 68e64654c..897d9fd65 100644
--- a/python/bifrost/libbifrost.py
+++ b/python/bifrost/libbifrost.py
@@ -47,7 +47,6 @@ class BifrostObject(object):
     """Base class for simple objects with create/destroy functions"""
     def __init__(self, constructor, destructor, *args):
         self.obj = destructor.argtypes[0]()
-        print(self.obj)
         _check(constructor(ctypes.byref(self.obj), *args))
         self._destructor = destructor
     def _destroy(self):

From 6d088a2aba4a9280fe52de0f7fac7571c45ac5a0 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Sun, 21 Jun 2020 16:36:44 +0000
Subject: [PATCH 04/91] Add Kernel-only xgpu functions

I.e., call xGPU passing pointers to already transferred
data on the device. This gives up xGPU's pipelining abilities,
but makes it easier to use the xGPU kernel alongside other consumers
also using the same GPU input buffer.
---
 src/Makefile          |  2 +-
 src/bf_xgpu.cpp       | 43 +++++++++++++++++++++++++++++++++++++++++--
 src/bifrost/bf_xgpu.h |  1 +
 user.mk               |  5 +++--
 4 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index a79c574de..e3d83f592 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -113,7 +113,7 @@ endif
 
 ifndef NOCUDA
   CPPFLAGS  += -DBF_CUDA_ENABLED=1
-  LIB       += -L$(CUDA_LIBDIR64) -L$(CUDA_LIBDIR) -lcuda -lcudart -lnvrtc -lcublas -lcudadevrt -L. -lcufft_static_pruned -lculibos -lnvToolsExt
+  LIB       += -L$(CUDA_LIBDIR64) -L$(CUDA_LIBDIR) -lcuda -lcudart -lnvrtc -lcublas -lcudadevrt -L. -lcufft_static_pruned -lculibos -lnvToolsExt -lxgpu
 ifdef XGPU
   LIB       += -lxgpu
 endif
diff --git a/src/bf_xgpu.cpp b/src/bf_xgpu.cpp
index 66ea4d5b2..54e11b1f8 100644
--- a/src/bf_xgpu.cpp
+++ b/src/bf_xgpu.cpp
@@ -19,6 +19,11 @@ extern "C" {
 static XGPUContext context;
 static XGPUInfo info;
 
+/* 
+ * Initialize the xGPU library by providing
+ * a pointer to the input and output data (on the host),
+ * and a GPU device ID
+ */
 BFstatus xgpuInitialize(BFarray *in, BFarray *out, int gpu_dev) {
   int xgpu_error;
   xgpuInfo(&info);
@@ -28,7 +33,7 @@ BFstatus xgpuInitialize(BFarray *in, BFarray *out, int gpu_dev) {
   if (num_contiguous_elements(out) != info.matLength) {
     return BF_STATUS_INVALID_SHAPE;
   }
-  context.array_h = (ComplexInput *)in->data;
+  context.array_h = (SwizzleInput *)in->data;
   context.array_len = info.vecLength;
   context.matrix_h = (Complex *)out->data;
   context.matrix_len = info.matLength;
@@ -40,9 +45,16 @@ BFstatus xgpuInitialize(BFarray *in, BFarray *out, int gpu_dev) {
   }
 }
 
+/*
+ * Call the xGPU kernel.
+ * in : pointer to input data array on host
+ * out: pointer to output data array on host
+ * doDump : if 1, this is the last call in an integration, and results
+ *          will be copied to the host.
+ */
 BFstatus xgpuCorrelate(BFarray *in, BFarray *out, int doDump) {
   int xgpu_error;
-  context.array_h = (ComplexInput *)in->data;
+  context.array_h = (SwizzleInput *)in->data;
   context.array_len = info.vecLength;
   context.matrix_h = (Complex *)out->data;
   context.matrix_len = info.matLength;
@@ -57,4 +69,31 @@ BFstatus xgpuCorrelate(BFarray *in, BFarray *out, int doDump) {
   }
 }
 
+/*
+ * Call the xGPU kernel having pre-copied data to device memory.
+ * Note that this means xGPU can't take advantage of its inbuild
+ * copy/compute pipelining.
+ * in : pointer to input data array on device
+ * out: pointer to output data array on device
+ * doDump : if 1, this is the last call in an integration, and results
+ *          will be copied to the host.
+ */
+BFstatus xgpuKernel(BFarray *in, BFarray *out, int doDump) {
+  int xgpu_error;
+  context.array_h = (ComplexInput *)in->data;
+  context.array_len = info.vecLength;
+  context.matrix_h = (Complex *)out->data;
+  context.matrix_len = info.matLength;
+  xgpu_error = xgpuCudaXengineSwizzleKernel(&context, doDump ? SYNCOP_DUMP : SYNCOP_SYNC_TRANSFER,
+		                           (SwizzleInput *)in->data, (Complex *)out->data);
+  if (doDump) {
+    xgpuClearDeviceIntegrationBuffer(&context);
+  }
+  if (xgpu_error != XGPU_OK) {
+    return BF_STATUS_INTERNAL_ERROR;
+  } else {
+    return BF_STATUS_SUCCESS;
+  }
+}
+
 } // C
diff --git a/src/bifrost/bf_xgpu.h b/src/bifrost/bf_xgpu.h
index 120fc8232..447c090e7 100644
--- a/src/bifrost/bf_xgpu.h
+++ b/src/bifrost/bf_xgpu.h
@@ -5,5 +5,6 @@
 
 BFstatus xgpuInitialize(BFarray *in, BFarray *out, int gpu_dev);
 BFstatus xgpuCorrelate(BFarray *in, BFarray *out, int doDump);
+BFstatus xgpuKernel(BFarray *in, BFarray *out, int doDump);
 
 //}
diff --git a/user.mk b/user.mk
index 49cc2a199..256c674f2 100644
--- a/user.mk
+++ b/user.mk
@@ -28,5 +28,6 @@ ALIGNMENT ?= 4096 # Memory allocation alignment
 #ANY_ARCH   = 1 # Disable native architecture compilation
 #CUDA_DEBUG = 1 # Enable CUDA debugging (nvcc -G)
 #NUMA       = 1 # Enable use of numa library for setting affinity of ring memory
-#HWLOC      = 1 # Enable use of hwloc library for memory binding in udp_capture
-#VMA        = 1 # Enable use of Mellanox libvma in udp_capture
+HWLOC      = 1 # Enable use of hwloc library for memory binding in udp_capture
+VMA        = 1 # Enable use of Mellanox libvma in udp_capture
+XGPU       = 1 # build xGPU integrations (requires the xGPU library)

From 93104062e745b571aa1c71d39329814defc321ee Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Mon, 22 Jun 2020 19:58:25 +0000
Subject: [PATCH 05/91] add xgpuSubSelect; fix xgpuKernel

Add some checking for proper pointer spaces. More checking required
---
 src/bf_xgpu.cpp       | 78 ++++++++++++++++++++++++++++++++++++++-----
 src/bifrost/bf_xgpu.h |  5 +--
 2 files changed, 71 insertions(+), 12 deletions(-)

diff --git a/src/bf_xgpu.cpp b/src/bf_xgpu.cpp
index 54e11b1f8..82c923758 100644
--- a/src/bf_xgpu.cpp
+++ b/src/bf_xgpu.cpp
@@ -27,18 +27,33 @@ static XGPUInfo info;
 BFstatus xgpuInitialize(BFarray *in, BFarray *out, int gpu_dev) {
   int xgpu_error;
   xgpuInfo(&info);
-  if (num_contiguous_elements(in) != info.vecLength) {
-    return BF_STATUS_INVALID_SHAPE;
-  }
-  if (num_contiguous_elements(out) != info.matLength) {
-    return BF_STATUS_INVALID_SHAPE;
+  // Don't bother checking sizes if the input space is CUDA.
+  // We're not going to use these arrays anyway
+  if (in->space != BF_SPACE_CUDA) {
+      if (num_contiguous_elements(in) != info.vecLength) {
+        fprintf(stderr, "ERROR: xgpuInitialize: number of elements in != vecLength\n");
+        fprintf(stderr, "number of elements in: %lu\n", num_contiguous_elements(in));
+        fprintf(stderr, "vecLength: %llu\n", info.vecLength);
+        return BF_STATUS_INVALID_SHAPE;
+      }
+      if (num_contiguous_elements(out) != info.matLength) {
+        fprintf(stderr, "ERROR: xgpuInitialize: number of elements out != matLength\n");
+        fprintf(stderr, "number of elements out: %lu\n", num_contiguous_elements(out));
+        fprintf(stderr, "matLength: %llu\n", info.matLength);
+        return BF_STATUS_INVALID_SHAPE;
+      }
   }
   context.array_h = (SwizzleInput *)in->data;
   context.array_len = info.vecLength;
   context.matrix_h = (Complex *)out->data;
   context.matrix_len = info.matLength;
-  xgpu_error = xgpuInit(&context, gpu_dev);
+  if (in->space == BF_SPACE_CUDA) {
+      xgpu_error = xgpuInit(&context, gpu_dev | XGPU_DONT_REGISTER | XGPU_DONT_MALLOC_GPU);
+  } else {
+      xgpu_error = xgpuInit(&context, gpu_dev);
+  }
   if (xgpu_error != XGPU_OK) {
+    fprintf(stderr, "ERROR: xgpuInitialize: call returned %d\n", xgpu_error);
     return BF_STATUS_INTERNAL_ERROR;
   } else {
     return BF_STATUS_SUCCESS;
@@ -53,6 +68,12 @@ BFstatus xgpuInitialize(BFarray *in, BFarray *out, int gpu_dev) {
  *          will be copied to the host.
  */
 BFstatus xgpuCorrelate(BFarray *in, BFarray *out, int doDump) {
+  if (in->space == BF_SPACE_CUDA) {
+    return BF_STATUS_UNSUPPORTED_SPACE;
+  }
+  if (out->space == BF_SPACE_CUDA) {
+    return BF_STATUS_UNSUPPORTED_SPACE;
+  }
   int xgpu_error;
   context.array_h = (SwizzleInput *)in->data;
   context.array_len = info.vecLength;
@@ -78,18 +99,59 @@ BFstatus xgpuCorrelate(BFarray *in, BFarray *out, int doDump) {
  * doDump : if 1, this is the last call in an integration, and results
  *          will be copied to the host.
  */
+static int newAcc = 1; // flush vacc on the first call
 BFstatus xgpuKernel(BFarray *in, BFarray *out, int doDump) {
+  if (in->space != BF_SPACE_CUDA) {
+    return BF_STATUS_UNSUPPORTED_SPACE;
+  }
+  if (out->space != BF_SPACE_CUDA) {
+    return BF_STATUS_UNSUPPORTED_SPACE;
+  }
   int xgpu_error;
   context.array_h = (ComplexInput *)in->data;
   context.array_len = info.vecLength;
   context.matrix_h = (Complex *)out->data;
   context.matrix_len = info.matLength;
-  xgpu_error = xgpuCudaXengineSwizzleKernel(&context, doDump ? SYNCOP_DUMP : SYNCOP_SYNC_TRANSFER,
+  xgpu_error = xgpuCudaXengineSwizzleKernel(&context, doDump ? SYNCOP_DUMP : 0, newAcc,
 		                           (SwizzleInput *)in->data, (Complex *)out->data);
+
+  if (newAcc) {
+    newAcc = 0;
+  }
   if (doDump) {
-    xgpuClearDeviceIntegrationBuffer(&context);
+    newAcc = 1;
+  }
+  if (xgpu_error != XGPU_OK) {
+    fprintf(stderr, "ERROR: xgpuKernel: kernel call returned %d\n", xgpu_error);
+    return BF_STATUS_INTERNAL_ERROR;
+  } else {
+    return BF_STATUS_SUCCESS;
+  }
+}
+
+/*
+ * Given an xGPU accumulation buffer, grab a subset of visibilities from
+ * and gather them in a new buffer, in order chan x visibility x complexity [int32]
+ * BFarray *in : Pointer to a BFarray with storage in device memory, where xGPU results reside
+ * BFarray *in : Pointer to a BFarray with storage in device memory where collated visibilities should be written.
+ * int **vismap : array of visibilities in [[polA, polB], [polC, polD], ... ] form.
+ * int nvis : The number of visibilities to colate (length of the vismap array)
+ */
+BFstatus xgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap) {
+  long long unsigned nvis = num_contiguous_elements(vismap);
+  int xgpu_error;
+  if (in->space != BF_SPACE_CUDA) {
+    return BF_STATUS_UNSUPPORTED_SPACE;
+  }
+  if (out->space != BF_SPACE_CUDA) {
+    return BF_STATUS_UNSUPPORTED_SPACE;
+  }
+  if (vismap->space != BF_SPACE_CUDA) {
+    return BF_STATUS_UNSUPPORTED_SPACE;
   }
+  xgpu_error = xgpuCudaSubSelect(&context, (Complex *)in->data, (Complex *)out->data, (int *)vismap->data, nvis);
   if (xgpu_error != XGPU_OK) {
+    fprintf(stderr, "ERROR: xgpuKernel: kernel call returned %d\n", xgpu_error);
     return BF_STATUS_INTERNAL_ERROR;
   } else {
     return BF_STATUS_SUCCESS;
diff --git a/src/bifrost/bf_xgpu.h b/src/bifrost/bf_xgpu.h
index 447c090e7..2de082067 100644
--- a/src/bifrost/bf_xgpu.h
+++ b/src/bifrost/bf_xgpu.h
@@ -1,10 +1,7 @@
 #include <bifrost/common.h>
 #include <bifrost/array.h>
 
-//extern "C" {
-
 BFstatus xgpuInitialize(BFarray *in, BFarray *out, int gpu_dev);
 BFstatus xgpuCorrelate(BFarray *in, BFarray *out, int doDump);
 BFstatus xgpuKernel(BFarray *in, BFarray *out, int doDump);
-
-//}
+BFstatus xgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap);

From d18fbe9ffb4143c80bd48f204247c2a88ba3a63c Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Tue, 30 Jun 2020 11:17:54 -0400
Subject: [PATCH 06/91] Add beamforming code to bifrost

Inspired by the CUBLAS usage in https://github.com/devincody/DSAbeamformer

Operates in 3 steps --
1. Tranpose data and promote to float
2. Compute beams
3. Compute beam dynamic spectra, and sum to (in LWA352's case) 1ms

Assumes no polarization ordering of input, but relies on user to
upload beamforming coefficients which create X-pol and Y-pol beams.
This is an easy way to deal with the arbitrary input ordering at runtime,
but isn't very efficient (half the beamforming coeffs are zero).
The kernel assumes the beams are constructed like this and uses the
fact to generate averaged dynamic spectra (XX, YY, XY_r, XY_i).

May well have synchronization bugs which make the benchmarks meaningless,
but currently obtains ~50 Gbps throughput (~9MHz bandwidth for 4-bit inputs) with

NANTS = 352
NPOLS = 2
NCHANS = 192 (4.4 MHz for LWA352)
NBEAMS = 32 (16 x 2-pols)
NTIMES = 480
NTIMES_SUM = 24 (1ms)
---
 src/Makefile            |   4 +-
 src/beamform.cpp        |  62 +++++++++++
 src/bifrost/beamform.h  |  17 +++
 src/cublas_beamform.cu  | 237 ++++++++++++++++++++++++++++++++++++++++
 src/cublas_beamform.cuh |  35 ++++++
 5 files changed, 354 insertions(+), 1 deletion(-)
 create mode 100644 src/beamform.cpp
 create mode 100644 src/bifrost/beamform.h
 create mode 100644 src/cublas_beamform.cu
 create mode 100644 src/cublas_beamform.cuh

diff --git a/src/Makefile b/src/Makefile
index e3d83f592..a03efd2a3 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -36,7 +36,9 @@ ifndef NOCUDA
   reduce.o \
   fir.o \
   guantize.o \
-  gunpack.o
+  gunpack.o \
+  beamform.o \
+  cublas_beamform.o
 ifdef XGPU
   LIBBIFROST_OBJS += \
   bf_xgpu.o
diff --git a/src/beamform.cpp b/src/beamform.cpp
new file mode 100644
index 000000000..ba40d9e87
--- /dev/null
+++ b/src/beamform.cpp
@@ -0,0 +1,62 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <math.h>
+#include <limits.h>
+#include <unistd.h>
+
+#include <bifrost/array.h>
+#include <bifrost/common.h>
+#include <utils.hpp>
+#include <stdlib.h>
+#include <stdio.h>
+#include <iostream>
+
+#include "cublas_beamform.cuh"
+
+extern "C" {
+
+
+/* 
+ * Initialize the beamformer library
+ */
+
+BFstatus beamformInitialize(
+  int gpudev,
+  int ninputs,
+  int nchans,
+  int ntimes,
+  int nbeams,
+  int ntime_blocks
+) {
+  // TODO: array size checking
+  // TODO: use complex data types
+  cublas_beamform_init(
+    gpudev,
+    ninputs,
+    nchans,
+    ntimes,
+    nbeams,
+    ntime_blocks
+  );
+  return BF_STATUS_SUCCESS;
+}
+
+BFstatus beamformRun(BFarray *in, BFarray *out, BFarray *weights) {
+  if (in->space != BF_SPACE_CUDA) {
+    fprintf(stderr, "Beamformer input buffer must be in CUDA space\n");
+    return BF_STATUS_INVALID_SPACE;
+  }
+  if (out->space != BF_SPACE_CUDA) {
+    fprintf(stderr, "Beamformer output buffer must be in CUDA space\n");
+    return BF_STATUS_INVALID_SPACE;
+  }
+  if (weights->space != BF_SPACE_CUDA) {
+    fprintf(stderr, "Beamformer weights buffer must be in CUDA space\n");
+    return BF_STATUS_INVALID_SPACE;
+  }
+  cublas_beamform((unsigned char *)in->data, (float *)out->data, (float *)weights->data);
+  return BF_STATUS_SUCCESS;
+}
+  
+} // C
diff --git a/src/bifrost/beamform.h b/src/bifrost/beamform.h
new file mode 100644
index 000000000..5b2e486b4
--- /dev/null
+++ b/src/bifrost/beamform.h
@@ -0,0 +1,17 @@
+#include <bifrost/common.h>
+#include <bifrost/array.h>
+
+BFstatus beamformInitialize(
+  int gpudev,
+  int ninputs,
+  int nchans,
+  int ntimes,
+  int nbeams,
+  int ntime_blocks
+);
+
+BFstatus beamformRun(
+  BFarray *in,
+  BFarray *out,
+  BFarray *weights
+);
diff --git a/src/cublas_beamform.cu b/src/cublas_beamform.cu
new file mode 100644
index 000000000..a98ae32ff
--- /dev/null
+++ b/src/cublas_beamform.cu
@@ -0,0 +1,237 @@
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+#include "cublas_beamform.cuh"
+
+__constant__ float lut[16] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, -8.0, -7.0, -6.0, -5.0, -4.0, -3.0, -2.0, -1.0};
+
+// Transpose time x chan x pol x 4+4 bit to
+// chan x pol x time x 32+32 bit float
+__global__ void trans_4bit_to_float(unsigned char *in,
+                                    float *out,
+                                    int n_pol,
+                                    int n_chan,
+                                    int n_time
+                                   ) {
+  //long long int tid = blockDim.y*blockDim.x*blockIdx.y + blockDim.x*blockIdx.x + threadIdx.x;
+  //int pol  = tid % n_pol;
+  //int chan = (tid / n_pol) % n_chan;
+  //int time = (tid / (n_pol * n_chan));
+  int time = blockIdx.x;
+  int chan = blockIdx.y;
+  int pol = TRANSPOSE_POL_BLOCK_SIZE*threadIdx.x;
+  unsigned char *in_off = in + time*n_chan*n_pol + chan*n_pol + pol;
+  float *out_off = out + 2*( chan*n_pol*n_time + pol*n_time + time);
+  //long long int old_index = time*n_chan*n_pol + chan*n_pol + pol;
+  //long long int new_index = chan*n_pol*n_time + pol*n_time + time;
+  float real, imag;
+  unsigned char temp;
+  #pragma unroll
+  for (int i=0; i<TRANSPOSE_POL_BLOCK_SIZE; i++) {
+    temp = *in_off++;
+    //real = lut[in[old_index+i] >> 4];
+    //imag = lut[in[old_index+i] & 0b1111];
+    //out[2*(new_index+i)] = real;
+    //out[2*(new_index+i)+1] = imag;
+    real = lut[temp >> 4];
+    imag = lut[temp & 255];
+    *out_off++ = real;
+    *out_off++ = imag;
+  }
+}
+
+// Transpose chan x beam x pol x time x 32+32 float to
+// beam x time[part-summed] x chan x [XX,YY,XY*_r,XY*_i] x 32 float
+// Each thread deals with two pols of a beam, and sums over n_time_sum time samples
+__global__ void trans_output_and_sum(float *in,
+                                    float *out,
+                                    int n_chan,
+                                    int n_beam,
+                                    int n_time,
+                                    int n_time_sum
+                                   ) {
+  int chan = blockIdx.x;
+  int beam = blockIdx.y;
+  int time = threadIdx.x;
+  long long int old_index = chan*n_beam*n_time*2 + beam*n_time*2 + time*n_time_sum; // start index for n_time/n_time_sum samples
+  long long int new_index = beam*(n_time / n_time_sum)*n_chan + time*n_chan + chan;
+  float xx=0., yy=0., xy_r=0., xy_i=0.;
+  float x_r, x_i, y_r, y_i;
+  int t;
+  for (t=0; t<n_time_sum; t++) {
+    x_r = in[2*old_index + 2*t];
+    x_i = in[2*old_index + 2*t + 1];
+    y_r = in[2*old_index + n_time + 2*t];
+    y_i = in[2*old_index + n_time + 2*t + 1];
+    xx = xx + x_r*x_r + x_i*x_i;
+    yy = yy + y_r*y_r + y_i*y_i;
+    xy_r = xy_r + x_r*y_r + x_i*y_i;
+    xy_i = xy_i + x_i*y_r - x_r*y_i;
+  }
+  out[4*new_index] = xx;
+  out[4*new_index+1] = yy;
+  out[4*new_index+2] = xy_r;
+  out[4*new_index+3] = xy_i;
+}
+
+__global__ void complex2pow(float *in, float *out, int N) {
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
+  if (tid < N) {
+    out[tid] = (in[2*tid]*in[2*tid] + in[2*tid + 1]*in[2*tid + 1]);
+  }
+}
+
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
+{
+   if (code != cudaSuccess) 
+   {
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);
+   }
+}
+
+/* Error checking for cuBLAS */
+void gpuBLASchk(int errval) {
+	if (errval != CUBLAS_STATUS_SUCCESS) {
+		fprintf(stderr, "Failed BLAS call, error code %d\n", errval);
+	}
+}
+
+/*
+Transa	CUDA_OP_N	Matrix A (Fourier coefficient matrix) is not transposed
+Transb	CUDA_OP_N	Matrix B (input data) is not transposed
+M	N_BEAMS	Number of rows of A/C
+N	N_TIMESTEPS_PER_CALL	Number of columns of B/C
+K	N_ANTENNAS	Number of columns of A, number of rows in B
+Alpha	1.0/127	Fourier coefficients are 8-bit ints so must be normalized to 1
+Atype	CUDA_C_8I	Data type of Fourier coefficient matrix (i.e. 8-bit + 8-bit integers)
+Lda	N_BEAMS	Leading dimension of Fourier coefficient matrix
+strideA	N_BEAMS*N_ANTENNAS	Stride between different frequencies in A
+Btype	CUDA_C_8I	Data type of input data matrix
+Ldb	N_ANTENNAS	Leading dimension of input matrix
+StrideB	N_ANTENNAS*N_TIMESTEPS_PER_CALL	Stride between different frequencies in input matrix
+Beta	0	Zero out the output data tensor
+Ctype	CUDA_C_32F	Data type of output matrix
+Ldc	N_BEAMS	Leading Dimension of output matrix
+strideC	N_BEAMS*N_TIMESTEPS_PER_CALL	Stride between different frequencies in output matrix
+batchCount	NCHANS	How many frequencies
+computeType	CUDA_C_32F	Internal datatype
+Algo	CUBLAS_GEMM_DEFAULT_TENSOR_OP	Use tensor operations
+*/
+
+struct beamform_context {
+  int gpu_device;
+  cublasHandle_t handle;
+  cudaStream_t stream;
+  // GPU buffers for intermediate data products
+  float *in32_d; // Reordered, floatified 4-bit input data
+  float *out_d;  // CUBLAS beamformer output
+  float *weights_d; // Beamformer coefficients
+  unsigned char *in4_d;     // 4-bit input data
+  float *sum_out_d; // Time-summed output data
+  int ninputs;   // Number of inputs (ants * pols)
+  int npols;     // Number of polarizations per antenna
+  int nchans;    // Number of channels input
+  int ntimes;    // Number of time samples input
+  int nbeams;    // Number of beams output
+  int ntimeblocks; // Number of time samples to keep after summation
+};
+
+static struct beamform_context context;
+
+void cublas_beamform_destroy(){
+  cudaFree(context.in32_d);
+  cudaFree(context.out_d);
+}
+
+void cublas_beamform_init(int device, int ninputs, int nchans, int ntimes, int nbeams, int ntimeblocks) {
+  context.gpu_device = device;
+  gpuErrchk( cudaSetDevice(context.gpu_device) );
+  gpuErrchk(cudaStreamCreate(&(context.stream)));
+  gpuBLASchk(cublasCreate(&(context.handle)));
+  gpuBLASchk(cublasSetStream(context.handle, context.stream));
+  gpuBLASchk(cublasSetPointerMode(context.handle, CUBLAS_POINTER_MODE_HOST));
+  //gpuBLASchk(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE));
+  gpuBLASchk(cublasSetMathMode(context.handle, CUBLAS_TENSOR_OP_MATH));
+
+  context.ninputs = ninputs;
+  context.nchans = nchans;
+  context.ntimes = ntimes;
+  context.nbeams = nbeams;
+  context.ntimeblocks = ntimeblocks;
+
+  // Internally allocate intermediate buffers
+  gpuErrchk( cudaMalloc(&context.in32_d,  ninputs * nchans * ntimes * 2 * sizeof(float)) );
+  gpuErrchk( cudaMalloc(&context.out_d,   ntimes * nchans * nbeams * 2 * sizeof(float)) );
+}
+
+void cublas_beamform(unsigned char *in4_d, float *sum_out_d, float *weights_d) {
+  // Transpose input data and promote to float.
+  // CUBLAS doesn't support float coeffs with int8 data
+  dim3 transBlockGrid(context.ntimes, context.nchans);
+  dim3 transThreadGrid(context.ninputs / TRANSPOSE_POL_BLOCK_SIZE);
+  trans_4bit_to_float<<<transBlockGrid, transThreadGrid, 0, context.stream>>>(
+      in4_d,
+      context.in32_d,
+      context.ninputs,
+      context.nchans,
+      context.ntimes
+  );
+  cudaStreamSynchronize(context.stream);
+
+  // Beamform using GEMM
+  float alpha = 1.0;
+  float beta = 0.0;
+  // GEMM:
+  // C <= alpha*AB + beta*C
+  // alpha = 1.0
+  // beta = 0.0
+  // A matrix: beamforming coeffs (NBEAMS * NANTS)
+  // B matrix: data matrix (NANTS * NTIMES)
+  gpuBLASchk(cublasGemmStridedBatchedEx(
+    context.handle,
+    CUBLAS_OP_N, // transpose A?
+    CUBLAS_OP_N, // transpose B?
+    context.nbeams,      // m
+    context.ntimes,      // n
+    context.ninputs,     // k
+    // Coeffs
+    &alpha,              // alpha
+    weights_d,           // A
+    CUDA_C_32F,          // A type
+    context.nbeams,      // Lda
+    context.nbeams*context.ninputs,// strideA : stride size
+    // Data
+    context.in32_d,      // B
+    CUDA_C_32F,          // B type
+    context.ninputs,     // Ldb
+    context.ninputs*context.ntimes,// strideB : stride size
+    &beta,       // beta
+    // Results
+    context.out_d,       // C
+    CUDA_C_32F,          // Ctype 
+    context.nbeams,      // Ldc
+    context.nbeams*context.ntimes,// Stride C
+    context.nchans,      // batchCount
+    CUDA_C_32F,          // compute type
+    CUBLAS_GEMM_DEFAULT_TENSOR_OP // algo
+    ));
+  cudaStreamSynchronize(context.stream);
+
+  // Create XX, YY, XY beam powers.
+  // Sum over `ntimes_sum` samples
+  int ntimes_sum = context.ntimes / context.ntimeblocks;
+  dim3 sumBlockGrid(context.nchans, context.nbeams/2);
+  dim3 sumThreadGrid(context.ntimes / ntimes_sum);
+  trans_output_and_sum<<<sumBlockGrid, sumThreadGrid, 0, context.stream>>>(
+    context.out_d,
+    sum_out_d,
+    context.nchans,
+    context.nbeams/2,
+    context.ntimes,
+    ntimes_sum
+  );
+  cudaStreamSynchronize(context.stream);
+}
diff --git a/src/cublas_beamform.cuh b/src/cublas_beamform.cuh
new file mode 100644
index 000000000..eca2d63f2
--- /dev/null
+++ b/src/cublas_beamform.cuh
@@ -0,0 +1,35 @@
+#ifndef _CUBLAS_BEAMFORM_H
+#define _CUBLAS_BEAMFORM_H
+
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+// Transpose time x chan x pol x 4+4 bit to
+#define TRANSPOSE_POL_BLOCK_SIZE 8
+// chan x pol x time x 32+32 bit float
+__global__ void trans_4bit_to_float(unsigned char *in,
+                                    float *out,
+                                    int n_pol,
+                                    int n_chan,
+                                    int n_time
+                                   );
+
+// Transpose chan x beam x pol x time x 32+32 float to
+// beam x time[part-summed] x chan x [XX,YY,XY*_r,XY*_i] x 32 float
+// Each thread deals with two pols of a beam, and sums over n_time_sum time samples
+__global__ void trans_output_and_sum(float *in,
+                                    float *out,
+                                    int n_chan,
+                                    int n_beam,
+                                    int n_time,
+                                    int n_time_sum
+                                   );
+
+__global__ void complex2pow(float *in, float *out, int N);
+
+void cublas_beamform_destroy();
+void cublas_beamform(unsigned char *in4_d, float *sum_out_d, float *weights_d);
+void cublas_beamform_init(int device, int ninputs, int nchans, int ntimes, int nbeams, int ntimeblocks);
+
+#endif

From 056d98c74fc18da8a667c2ea66790cf7621a1b92 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Tue, 30 Jun 2020 16:24:01 +0000
Subject: [PATCH 07/91] gitignore ctags files

---
 .gitignore | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.gitignore b/.gitignore
index 74a87e23a..46cd88933 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,3 +97,9 @@ target/
 # Benchmarking files
 test/benchmarks/development_vs_gpuspec/with_bifrost/
 test/benchmarks/development_vs_gpuspec/without_bifrost/
+
+# ctags files
+python/bifrost/tags
+python/tags
+src/tags
+tags

From 55b2d6f4a3f0151264b555512e5f2b6afd605891 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Tue, 30 Jun 2020 16:25:02 +0000
Subject: [PATCH 08/91] Update GPU arch

---
 user.mk | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/user.mk b/user.mk
index 256c674f2..bb2908a26 100644
--- a/user.mk
+++ b/user.mk
@@ -11,7 +11,8 @@ PYINSTALLFLAGS ?=
 
 #GPU_ARCHS     ?= 30 32 35 37 50 52 53 # Nap time!
 #GPU_ARCHS     ?= 35 52
-GPU_ARCHS     ?= 35 61
+#GPU_ARCHS     ?= 35 61
+GPU_ARCHS     ?= 75
 
 GPU_SHAREDMEM ?= 16384 # GPU shared memory size
 

From 08129378fae6ef959626e404904feef079bf4294 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Tue, 30 Jun 2020 19:25:16 +0000
Subject: [PATCH 09/91] Add embryonic ibverbs packet RX support

---
 src/Makefile           |  6 ++++
 src/packet_capture.hpp | 68 ++++++++++++++++++++++++++++++++++++++++++
 user.mk                |  1 +
 3 files changed, 75 insertions(+)

diff --git a/src/Makefile b/src/Makefile
index a03efd2a3..cf8b69db7 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -105,6 +105,12 @@ ifdef VMA
   CPPFLAGS   += -DBF_VMA_ENABLED=1
 endif
 
+ifdef IBV
+  # Requires ibverbs/hashpipe to be installed
+  LIB        += -libverbs -lhashpipe
+  CPPFLAGS   += -DBF_HPIBV_ENABLED=1
+endif
+
 ifdef ALIGNMENT
   CPPFLAGS   += -DBF_ALIGNMENT=$(ALIGNMENT)
 endif
diff --git a/src/packet_capture.hpp b/src/packet_capture.hpp
index e4eb55528..5bdc30221 100644
--- a/src/packet_capture.hpp
+++ b/src/packet_capture.hpp
@@ -223,6 +223,74 @@ class VMAReceiver {
 };
 #endif // BF_VMA_ENABLED
 
+#ifndef BF_HPIBV_ENABLED
+#define BF_HPIBV_ENABLED 0
+#endif
+
+#if BF_HPIBV_ENABLED
+#include "hashpipe_ibverbs.h"
+#define IBV_UDP_PAYLOAD_OFFSET 42
+class IBVReceiver {
+    struct hashpipe_ibv_context _hibv_ctx;
+    struct hashpipe_ibv_recv_pkt* _hibv_rpkt; // Current packet chain
+    struct hashpipe_ibv_recv_pkt* _pkt; // Current packet
+public:
+    IBVReceiver(int port, int pkt_size_max, char *ifname) 
+        : _pkt(NULL), _hibv_rpkt(NULL) {
+        strncpy(_hibv_ctx.interface_name, ifname, IFNAMSIZ);
+	    _hibv_ctx.interface_name[IFNAMSIZ-1] = '\0'; // Ensure NUL termination
+	    _hibv_ctx.send_pkt_num = 1;
+	    _hibv_ctx.recv_pkt_num = 8192;
+	    _hibv_ctx.pkt_size_max = pkt_size_max;
+	    _hibv_ctx.max_flows = 1;
+	    int ret = hashpipe_ibv_init(&_hibv_ctx);
+	    if( ret ) {
+	        //PANIC
+	    }
+	    // Subscribe to RX flow
+	    ret = hashpipe_ibv_flow(
+            &_hibv_ctx,
+            0, IBV_FLOW_SPEC_UDP,
+            _hibv_ctx.mac, NULL, NULL, NULL, NULL, NULL,
+            NULL, port);
+        if( ret ) {
+	        //PANIC
+        }
+    }
+    ~IBVReceiver() { }
+    inline int recv_packet(uint8_t* buf, size_t bufsize, uint8_t** pkt_ptr, int flags=0) {
+        // If we don't have a work-request queue on the go,
+        // get some new packets.
+        // If we do, walk through the packets in this queue.
+        // Once at the end of the queue, release the current queue and wait
+        // for another
+        if ( !_hibv_rpkt ) {
+            _hibv_rpkt = hashpipe_ibv_recv_pkts(&_hibv_ctx, -1); //infinite timeout
+            _pkt = _hibv_rpkt;
+        } else {
+            _pkt = (struct hashpipe_ibv_recv_pkt *)_pkt->wr.next;
+        }
+        if ( !_pkt ) {
+            hashpipe_ibv_release_pkts(&_hibv_ctx, _hibv_rpkt);
+            _hibv_rpkt = hashpipe_ibv_recv_pkts(&_hibv_ctx, -1); //infinite timeout
+            _pkt = _hibv_rpkt;
+        }
+        if ( _pkt ) {
+            // IBV returns Eth/UDP/IP headers. Strip them off here.
+            *pkt_ptr = (uint8_t *)_pkt->wr.sg_list->addr + IBV_UDP_PAYLOAD_OFFSET;
+            return _pkt->length - IBV_UDP_PAYLOAD_OFFSET;
+        } else {
+            //TODO: can we ever get here? And is returning 0 if no packets
+            // are available allowed?
+            hashpipe_ibv_release_pkts(&_hibv_ctx, _hibv_rpkt);
+            _hibv_rpkt = 0;
+            return 0;
+        }
+    }
+};
+#endif // BF_HPIBV_ENABLED
+
+
 class UDPPacketReceiver : public PacketCaptureMethod {
 #if BF_VMA_ENABLED
     VMAReceiver            _vma;
diff --git a/user.mk b/user.mk
index bb2908a26..7e76cc48f 100644
--- a/user.mk
+++ b/user.mk
@@ -32,3 +32,4 @@ ALIGNMENT ?= 4096 # Memory allocation alignment
 HWLOC      = 1 # Enable use of hwloc library for memory binding in udp_capture
 VMA        = 1 # Enable use of Mellanox libvma in udp_capture
 XGPU       = 1 # build xGPU integrations (requires the xGPU library)
+IBV        = 1 # Build IB Verbs support using the hashpipe library

From f5d60e488ad7a8fbb5d90d574b3105e3bd052584 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Thu, 2 Jul 2020 14:50:05 +0000
Subject: [PATCH 10/91] IBVerbs integration

Move IB verbs receiving class to a dedicated C file. When
using the hashpipe_ibverbs library within packet_capture.hpp
directly something in that file messes up the compatibility
of the ibverbs structs (their sizes are different) to those
interpretted by hashpipe. Odd, but working around for now.
---
 src/Makefile                 |  6 +++-
 src/bf_ibverbs.cpp           | 58 ++++++++++++++++++++++++++++++
 src/bifrost/bf_ibverbs.h     | 17 +++++++++
 src/bifrost/io.h             |  3 +-
 src/bifrost/packet_capture.h | 13 +++++++
 src/packet_capture.cpp       | 24 +++++++++++++
 src/packet_capture.hpp       | 70 +++++++-----------------------------
 user.mk                      |  5 ++-
 8 files changed, 136 insertions(+), 60 deletions(-)
 create mode 100644 src/bf_ibverbs.cpp
 create mode 100644 src/bifrost/bf_ibverbs.h

diff --git a/src/Makefile b/src/Makefile
index cf8b69db7..27819cb0b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -44,6 +44,10 @@ ifdef XGPU
   bf_xgpu.o
 endif
 endif
+ifdef IBV
+  LIBBIFROST_OBJS += \
+  bf_ibverbs.o
+endif
 
 JIT_SOURCES ?= \
   Complex.hpp.jit \
@@ -107,7 +111,7 @@ endif
 
 ifdef IBV
   # Requires ibverbs/hashpipe to be installed
-  LIB        += -libverbs -lhashpipe
+  LIB        += -lhashpipe_ibverbs
   CPPFLAGS   += -DBF_HPIBV_ENABLED=1
 endif
 
diff --git a/src/bf_ibverbs.cpp b/src/bf_ibverbs.cpp
new file mode 100644
index 000000000..7d859f474
--- /dev/null
+++ b/src/bf_ibverbs.cpp
@@ -0,0 +1,58 @@
+#include <stdio.h>
+#include "hashpipe_ibverbs.h"
+#include <bifrost/common.h>
+
+#define IBV_UDP_PAYLOAD_OFFSET 42
+
+extern "C" {
+struct hashpipe_ibv_context _hibv_ctx = {0};
+struct hashpipe_ibv_recv_pkt* _hibv_rpkt; // Current packet chain
+struct hashpipe_ibv_recv_pkt* _pkt; // Current packet
+
+BFstatus ibv_init(size_t pkt_size_max) {
+    fprintf(stderr, "Configuring IBV socket\n");
+    int port = 10000;
+    char ifname[IFNAMSIZ] = "ens1f1";
+    strncpy(_hibv_ctx.interface_name, ifname, IFNAMSIZ);
+    _hibv_ctx.interface_name[IFNAMSIZ-1] = '\0'; // Ensure NUL termination
+    _hibv_ctx.send_pkt_num = 1;
+    _hibv_ctx.recv_pkt_num = 8192;
+    _hibv_ctx.pkt_size_max = pkt_size_max;
+    _hibv_ctx.max_flows = 1;
+    int ret = hashpipe_ibv_init(&_hibv_ctx);
+	if( ret ) {
+	    fprintf(stderr, "ERROR: haspipe_ibv_init returned %d\n", ret);
+	}
+
+    // Subscribe to RX flow
+    ret = hashpipe_ibv_flow(
+            &_hibv_ctx,
+            0, IBV_FLOW_SPEC_UDP,
+            _hibv_ctx.mac, NULL, 0, 0, 0, 0, 0, port);
+    if( ret ) {
+	    fprintf(stderr, "ERROR: haspipe_ibv_flow returned %d\n", ret);
+    }
+
+    return BF_STATUS_SUCCESS;
+}
+
+int ibv_recv_packet(uint8_t** pkt_ptr, int flags) {
+    // If we don't have a work-request queue on the go,
+    // get some new packets.
+    if ( _pkt ) {
+        _pkt = (struct hashpipe_ibv_recv_pkt *)_pkt->wr.next;
+        if ( !_pkt ) {
+            hashpipe_ibv_release_pkts(&_hibv_ctx, _hibv_rpkt);
+            _hibv_rpkt = NULL;
+        }
+    }
+    while (!_hibv_rpkt) {
+        _hibv_rpkt = hashpipe_ibv_recv_pkts(&_hibv_ctx, 1);
+        _pkt = _hibv_rpkt;
+    }
+    // IBV returns Eth/UDP/IP headers. Strip them off here.
+    *pkt_ptr = (uint8_t *)_pkt->wr.sg_list->addr + IBV_UDP_PAYLOAD_OFFSET;
+    return _pkt->length - IBV_UDP_PAYLOAD_OFFSET;
+}
+
+}
diff --git a/src/bifrost/bf_ibverbs.h b/src/bifrost/bf_ibverbs.h
new file mode 100644
index 000000000..ce1206c74
--- /dev/null
+++ b/src/bifrost/bf_ibverbs.h
@@ -0,0 +1,17 @@
+#ifndef BF_IBVERBS_H_INCLUDE_GUARD_
+#define BF_IBVERBS_H_INCLUDE_GUARD_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+BFstatus ibv_init(size_t pkt_size_max);
+int ibv_recv_packet(uint8_t** pkt_ptr, int flags);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // BF_IBVERBS_INCLUDE_GUARD_
diff --git a/src/bifrost/io.h b/src/bifrost/io.h
index 086dc8de7..53746a365 100644
--- a/src/bifrost/io.h
+++ b/src/bifrost/io.h
@@ -39,7 +39,8 @@ typedef enum BFiomethod_ {
     BF_IO_GENERIC = 0,
     BF_IO_DISK    = 1,
     BF_IO_UDP     = 2,
-    BF_IO_SNIFFER = 3
+    BF_IO_SNIFFER = 3,
+    BF_IO_IBV_UDP = 4
 } BFiomethod;
 
 typedef enum BFiowhence_ {
diff --git a/src/bifrost/packet_capture.h b/src/bifrost/packet_capture.h
index 0bf38c64e..9f05a2be0 100644
--- a/src/bifrost/packet_capture.h
+++ b/src/bifrost/packet_capture.h
@@ -106,6 +106,19 @@ BFstatus bfUdpCaptureCreate(BFpacketcapture* obj,
                             BFsize           slot_ntime,
                             BFpacketcapture_callback sequence_callback,
                             int              core);
+
+BFstatus bfIbvUdpCaptureCreate(BFpacketcapture* obj,
+                            const char*      format,
+                            int              fd,
+                            BFring           ring,
+                            BFsize           nsrc,
+                            BFsize           src0,
+                            BFsize           max_payload_size,
+                            BFsize           buffer_ntime,
+                            BFsize           slot_ntime,
+                            BFpacketcapture_callback sequence_callback,
+                            int              core);
+
 BFstatus bfUdpSnifferCreate(BFpacketcapture* obj,
                             const char*      format,
                             int              fd,
diff --git a/src/packet_capture.cpp b/src/packet_capture.cpp
index 05fadc816..317ed901f 100644
--- a/src/packet_capture.cpp
+++ b/src/packet_capture.cpp
@@ -326,6 +326,30 @@ BFstatus bfUdpCaptureCreate(BFpacketcapture* obj,
                                   BF_IO_UDP);
 }
 
+BFstatus bfIbvUdpCaptureCreate(BFpacketcapture* obj,
+                            const char*      format,
+                            int              fd,
+                            BFring           ring,
+                            BFsize           nsrc,
+                            BFsize           src0,
+                            BFsize           max_payload_size,
+                            BFsize           buffer_ntime,
+                            BFsize           slot_ntime,
+                            BFpacketcapture_callback sequence_callback,
+                            int              core) {
+    return BFpacketcapture_create(obj,
+                                  format,
+                                  fd,
+                                  ring,
+                                  nsrc,
+                                  src0,
+                                  buffer_ntime,
+                                  slot_ntime,
+                                  sequence_callback,
+                                  core,
+                                  BF_IO_IBV_UDP);
+}
+
 BFstatus bfUdpSnifferCreate(BFpacketcapture* obj,
                             const char*      format,
                             int              fd,
diff --git a/src/packet_capture.hpp b/src/packet_capture.hpp
index 5bdc30221..c028d36c0 100644
--- a/src/packet_capture.hpp
+++ b/src/packet_capture.hpp
@@ -228,69 +228,22 @@ class VMAReceiver {
 #endif
 
 #if BF_HPIBV_ENABLED
-#include "hashpipe_ibverbs.h"
-#define IBV_UDP_PAYLOAD_OFFSET 42
-class IBVReceiver {
-    struct hashpipe_ibv_context _hibv_ctx;
-    struct hashpipe_ibv_recv_pkt* _hibv_rpkt; // Current packet chain
-    struct hashpipe_ibv_recv_pkt* _pkt; // Current packet
+#include <bifrost/bf_ibverbs.h>
+
+class IBVUDPPacketReceiver : public PacketCaptureMethod {
 public:
-    IBVReceiver(int port, int pkt_size_max, char *ifname) 
-        : _pkt(NULL), _hibv_rpkt(NULL) {
-        strncpy(_hibv_ctx.interface_name, ifname, IFNAMSIZ);
-	    _hibv_ctx.interface_name[IFNAMSIZ-1] = '\0'; // Ensure NUL termination
-	    _hibv_ctx.send_pkt_num = 1;
-	    _hibv_ctx.recv_pkt_num = 8192;
-	    _hibv_ctx.pkt_size_max = pkt_size_max;
-	    _hibv_ctx.max_flows = 1;
-	    int ret = hashpipe_ibv_init(&_hibv_ctx);
-	    if( ret ) {
-	        //PANIC
-	    }
-	    // Subscribe to RX flow
-	    ret = hashpipe_ibv_flow(
-            &_hibv_ctx,
-            0, IBV_FLOW_SPEC_UDP,
-            _hibv_ctx.mac, NULL, NULL, NULL, NULL, NULL,
-            NULL, port);
-        if( ret ) {
-	        //PANIC
-        }
+    IBVUDPPacketReceiver(int fd, size_t pkt_size_max=JUMBO_FRAME_SIZE)
+        : PacketCaptureMethod(fd, pkt_size_max, BF_IO_IBV_UDP)
+    {
+        ibv_init(pkt_size_max);
     }
-    ~IBVReceiver() { }
-    inline int recv_packet(uint8_t* buf, size_t bufsize, uint8_t** pkt_ptr, int flags=0) {
-        // If we don't have a work-request queue on the go,
-        // get some new packets.
-        // If we do, walk through the packets in this queue.
-        // Once at the end of the queue, release the current queue and wait
-        // for another
-        if ( !_hibv_rpkt ) {
-            _hibv_rpkt = hashpipe_ibv_recv_pkts(&_hibv_ctx, -1); //infinite timeout
-            _pkt = _hibv_rpkt;
-        } else {
-            _pkt = (struct hashpipe_ibv_recv_pkt *)_pkt->wr.next;
-        }
-        if ( !_pkt ) {
-            hashpipe_ibv_release_pkts(&_hibv_ctx, _hibv_rpkt);
-            _hibv_rpkt = hashpipe_ibv_recv_pkts(&_hibv_ctx, -1); //infinite timeout
-            _pkt = _hibv_rpkt;
-        }
-        if ( _pkt ) {
-            // IBV returns Eth/UDP/IP headers. Strip them off here.
-            *pkt_ptr = (uint8_t *)_pkt->wr.sg_list->addr + IBV_UDP_PAYLOAD_OFFSET;
-            return _pkt->length - IBV_UDP_PAYLOAD_OFFSET;
-        } else {
-            //TODO: can we ever get here? And is returning 0 if no packets
-            // are available allowed?
-            hashpipe_ibv_release_pkts(&_hibv_ctx, _hibv_rpkt);
-            _hibv_rpkt = 0;
-            return 0;
-        }
+    inline int recv_packet(uint8_t** pkt_ptr, int flags=0) {
+        return ibv_recv_packet(pkt_ptr, flags);
     }
+    inline const char* get_name() { return "ibv_udp_capture"; }
 };
 #endif // BF_HPIBV_ENABLED
 
-
 class UDPPacketReceiver : public PacketCaptureMethod {
 #if BF_VMA_ENABLED
     VMAReceiver            _vma;
@@ -728,6 +681,7 @@ class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
         // TODO: sequence never changes?
         //return false;
 	    //return (pkt->seq % 128 == 0);
+        return false;
 	}
 	void on_sequence_changed(const PacketDesc* pkt, BFoffset* seq0, BFoffset* time_tag, const void** hdr, size_t* hdr_size) {
 	    *seq0 = _seq;// + _nseq_per_buf*_bufs.size();
@@ -1231,6 +1185,8 @@ BFstatus BFpacketcapture_create(BFpacketcapture* obj,
         method = new DiskPacketReader(fd, max_payload_size);
     } else if( backend == BF_IO_UDP ) {
         method = new UDPPacketReceiver(fd, max_payload_size);
+    } else if( backend == BF_IO_IBV_UDP ) {
+        method = new IBVUDPPacketReceiver(fd, max_payload_size);
     } else if( backend == BF_IO_SNIFFER ) {
         method = new UDPPacketSniffer(fd, max_payload_size);
     } else {
diff --git a/user.mk b/user.mk
index 7e76cc48f..43fc3d96a 100644
--- a/user.mk
+++ b/user.mk
@@ -23,6 +23,9 @@ CUDA_INCDIR   ?= $(CUDA_HOME)/include
 
 ALIGNMENT ?= 4096 # Memory allocation alignment
 
+# Defining the below will turn on various compilation
+# ifdef clauses.
+# ***defining as 0 is the same as defining as 1!***
 #NODEBUG    = 1 # Disable debugging mode (use this for production releases)
 #TRACE      = 1 # Enable tracing mode (generates annotations for use with nvprof/nvvp)
 #NOCUDA     = 1 # Disable CUDA support
@@ -30,6 +33,6 @@ ALIGNMENT ?= 4096 # Memory allocation alignment
 #CUDA_DEBUG = 1 # Enable CUDA debugging (nvcc -G)
 #NUMA       = 1 # Enable use of numa library for setting affinity of ring memory
 HWLOC      = 1 # Enable use of hwloc library for memory binding in udp_capture
-VMA        = 1 # Enable use of Mellanox libvma in udp_capture
+#VMA        = 1 # Enable use of Mellanox libvma in udp_capture
 XGPU       = 1 # build xGPU integrations (requires the xGPU library)
 IBV        = 1 # Build IB Verbs support using the hashpipe library

From 324bb2ca317a0ddd848507a026b090623e398ab1 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Thu, 2 Jul 2020 14:51:47 +0000
Subject: [PATCH 11/91] Add flag to use ibverbs

---
 python/bifrost/packet_capture.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/python/bifrost/packet_capture.py b/python/bifrost/packet_capture.py
index fc02d4ffb..a375e6126 100644
--- a/python/bifrost/packet_capture.py
+++ b/python/bifrost/packet_capture.py
@@ -81,7 +81,8 @@ def end(self):
 
 class UDPCapture(_CaptureBase):
     def __init__(self, fmt, sock, ring, nsrc, src0, max_payload_size,
-                 buffer_ntime, slot_ntime, sequence_callback, core=None):
+                 buffer_ntime, slot_ntime, sequence_callback, core=None,
+                 ibverbs=False, interface='', port=-1):
         try:
             fmt = fmt.encode()
         except AttributeError:
@@ -89,11 +90,19 @@ def __init__(self, fmt, sock, ring, nsrc, src0, max_payload_size,
             pass
         if core is None:
             core = -1
-        BifrostObject.__init__(
-            self, _bf.bfUdpCaptureCreate, _bf.bfPacketCaptureDestroy,
-            fmt, sock.fileno(), ring.obj, nsrc, src0,
-            max_payload_size, buffer_ntime, slot_ntime,
-            sequence_callback.obj, core)
+        if not ibverbs:
+            BifrostObject.__init__(
+                self, _bf.bfUdpCaptureCreate, _bf.bfPacketCaptureDestroy,
+                fmt, sock.fileno(), ring.obj, nsrc, src0,
+                max_payload_size, buffer_ntime, slot_ntime,
+                sequence_callback.obj, core)
+        else:
+            print("Using IBVerbs")
+            BifrostObject.__init__(
+                self, _bf.bfIbvUdpCaptureCreate, _bf.bfPacketCaptureDestroy,
+                fmt, sock.fileno(), ring.obj, nsrc, src0,
+                max_payload_size, buffer_ntime, slot_ntime,
+                sequence_callback.obj, core)
 
 class UDPSniffer(_CaptureBase):
     def __init__(self, fmt, sock, ring, nsrc, src0, max_payload_size,

From 30217af5a5d92d494eb11cbdaef01019763d0009 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Thu, 2 Jul 2020 14:52:29 +0000
Subject: [PATCH 12/91] Change function names to match `bfFunctionName`
 convention

---
 src/beamform.cpp       | 4 ++--
 src/bf_xgpu.cpp        | 8 ++++----
 src/bifrost/beamform.h | 4 ++--
 src/bifrost/bf_xgpu.h  | 8 ++++----
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/beamform.cpp b/src/beamform.cpp
index ba40d9e87..190a6232e 100644
--- a/src/beamform.cpp
+++ b/src/beamform.cpp
@@ -21,7 +21,7 @@ extern "C" {
  * Initialize the beamformer library
  */
 
-BFstatus beamformInitialize(
+BFstatus bfBeamformInitialize(
   int gpudev,
   int ninputs,
   int nchans,
@@ -42,7 +42,7 @@ BFstatus beamformInitialize(
   return BF_STATUS_SUCCESS;
 }
 
-BFstatus beamformRun(BFarray *in, BFarray *out, BFarray *weights) {
+BFstatus bfBeamformRun(BFarray *in, BFarray *out, BFarray *weights) {
   if (in->space != BF_SPACE_CUDA) {
     fprintf(stderr, "Beamformer input buffer must be in CUDA space\n");
     return BF_STATUS_INVALID_SPACE;
diff --git a/src/bf_xgpu.cpp b/src/bf_xgpu.cpp
index 82c923758..25c3774eb 100644
--- a/src/bf_xgpu.cpp
+++ b/src/bf_xgpu.cpp
@@ -24,7 +24,7 @@ static XGPUInfo info;
  * a pointer to the input and output data (on the host),
  * and a GPU device ID
  */
-BFstatus xgpuInitialize(BFarray *in, BFarray *out, int gpu_dev) {
+BFstatus bfXgpuInitialize(BFarray *in, BFarray *out, int gpu_dev) {
   int xgpu_error;
   xgpuInfo(&info);
   // Don't bother checking sizes if the input space is CUDA.
@@ -67,7 +67,7 @@ BFstatus xgpuInitialize(BFarray *in, BFarray *out, int gpu_dev) {
  * doDump : if 1, this is the last call in an integration, and results
  *          will be copied to the host.
  */
-BFstatus xgpuCorrelate(BFarray *in, BFarray *out, int doDump) {
+BFstatus bfXgpuCorrelate(BFarray *in, BFarray *out, int doDump) {
   if (in->space == BF_SPACE_CUDA) {
     return BF_STATUS_UNSUPPORTED_SPACE;
   }
@@ -100,7 +100,7 @@ BFstatus xgpuCorrelate(BFarray *in, BFarray *out, int doDump) {
  *          will be copied to the host.
  */
 static int newAcc = 1; // flush vacc on the first call
-BFstatus xgpuKernel(BFarray *in, BFarray *out, int doDump) {
+BFstatus bfXgpuKernel(BFarray *in, BFarray *out, int doDump) {
   if (in->space != BF_SPACE_CUDA) {
     return BF_STATUS_UNSUPPORTED_SPACE;
   }
@@ -137,7 +137,7 @@ BFstatus xgpuKernel(BFarray *in, BFarray *out, int doDump) {
  * int **vismap : array of visibilities in [[polA, polB], [polC, polD], ... ] form.
  * int nvis : The number of visibilities to colate (length of the vismap array)
  */
-BFstatus xgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap) {
+BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap) {
   long long unsigned nvis = num_contiguous_elements(vismap);
   int xgpu_error;
   if (in->space != BF_SPACE_CUDA) {
diff --git a/src/bifrost/beamform.h b/src/bifrost/beamform.h
index 5b2e486b4..8fa43a66f 100644
--- a/src/bifrost/beamform.h
+++ b/src/bifrost/beamform.h
@@ -1,7 +1,7 @@
 #include <bifrost/common.h>
 #include <bifrost/array.h>
 
-BFstatus beamformInitialize(
+BFstatus bfBeamformInitialize(
   int gpudev,
   int ninputs,
   int nchans,
@@ -10,7 +10,7 @@ BFstatus beamformInitialize(
   int ntime_blocks
 );
 
-BFstatus beamformRun(
+BFstatus bfBeamformRun(
   BFarray *in,
   BFarray *out,
   BFarray *weights
diff --git a/src/bifrost/bf_xgpu.h b/src/bifrost/bf_xgpu.h
index 2de082067..ede805053 100644
--- a/src/bifrost/bf_xgpu.h
+++ b/src/bifrost/bf_xgpu.h
@@ -1,7 +1,7 @@
 #include <bifrost/common.h>
 #include <bifrost/array.h>
 
-BFstatus xgpuInitialize(BFarray *in, BFarray *out, int gpu_dev);
-BFstatus xgpuCorrelate(BFarray *in, BFarray *out, int doDump);
-BFstatus xgpuKernel(BFarray *in, BFarray *out, int doDump);
-BFstatus xgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap);
+BFstatus bfXgpuInitialize(BFarray *in, BFarray *out, int gpu_dev);
+BFstatus bfXgpuCorrelate(BFarray *in, BFarray *out, int doDump);
+BFstatus bfXgpuKernel(BFarray *in, BFarray *out, int doDump);
+BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap);

From faca6c02d781d81f2a2cd6f13f493c802b172c57 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Thu, 2 Jul 2020 15:29:19 +0000
Subject: [PATCH 13/91] Change IB verbs packet capture method name to
 "udp_capture"

Evidently, this is the trigger to make `like_bmon` work
its magic
---
 src/packet_capture.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/packet_capture.hpp b/src/packet_capture.hpp
index c028d36c0..cd9f41d57 100644
--- a/src/packet_capture.hpp
+++ b/src/packet_capture.hpp
@@ -240,7 +240,7 @@ class IBVUDPPacketReceiver : public PacketCaptureMethod {
     inline int recv_packet(uint8_t** pkt_ptr, int flags=0) {
         return ibv_recv_packet(pkt_ptr, flags);
     }
-    inline const char* get_name() { return "ibv_udp_capture"; }
+    inline const char* get_name() { return "udp_capture"; }
 };
 #endif // BF_HPIBV_ENABLED
 

From c411072dc8c34b23495d6b5af45b7192c6ca9ff1 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Fri, 3 Jul 2020 16:36:51 +0000
Subject: [PATCH 14/91] Packet receiver OK @ ~28Gb/s

Increase RX packet depth of IB verbs interface to 32k (this
seems to be the maximum).

Make packet handler use AVX stream store instructions.

1. The receiver is currently hard coded for 64 pols per packet.
   It would be trivial to parameterize this, but it may have some
   small performance implication.
2. Code loads 64-bit values into a 256-bit AVX register before writing
   to memory. If the IBV interface can be tweaked to enforce alignment
   (talk to DM about this) the first stage won't be necessary.
3. 64 pols per packet = 512 bits per memory write (1 freq channel of data).
   Newer machines supporting AVX512 could probably run faster than the current
   code by using _mm512_stream_si512 in place of _mm256_stream_si256
---
 src/bf_ibverbs.cpp    |  4 +++-
 src/formats/snap2.hpp | 29 ++++++++++++++++++++++-------
 user.mk               |  2 +-
 3 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/src/bf_ibverbs.cpp b/src/bf_ibverbs.cpp
index 7d859f474..38fa2e6a0 100644
--- a/src/bf_ibverbs.cpp
+++ b/src/bf_ibverbs.cpp
@@ -16,8 +16,10 @@ BFstatus ibv_init(size_t pkt_size_max) {
     strncpy(_hibv_ctx.interface_name, ifname, IFNAMSIZ);
     _hibv_ctx.interface_name[IFNAMSIZ-1] = '\0'; // Ensure NUL termination
     _hibv_ctx.send_pkt_num = 1;
-    _hibv_ctx.recv_pkt_num = 8192;
+    _hibv_ctx.recv_pkt_num = 32768;
     _hibv_ctx.pkt_size_max = pkt_size_max;
+    fprintf(stderr, "IBV: pkt_size_max: %d\n", _hibv_ctx.pkt_size_max);
+    fprintf(stderr, "IBV: recv_pkt_num: %d\n", _hibv_ctx.recv_pkt_num);
     _hibv_ctx.max_flows = 1;
     int ret = hashpipe_ibv_init(&_hibv_ctx);
 	if( ret ) {
diff --git a/src/formats/snap2.hpp b/src/formats/snap2.hpp
index a63108c86..ff401123b 100644
--- a/src/formats/snap2.hpp
+++ b/src/formats/snap2.hpp
@@ -30,7 +30,8 @@
 
 #include "base.hpp"
 
-//#include <immintrin.h> // SSE
+#include <immintrin.h> // SSE
+#include <emmintrin.h>
 
 #define SNAP2_HEADER_MAGIC 0xaabbccdd
 
@@ -141,15 +142,29 @@ class SNAP2Processor : virtual public PacketProcessor {
             //         However, they require aligned memory (otherwise segfault)
             itype const* __restrict__ in  = (itype const*)pkt->payload_ptr;
             otype*       __restrict__ out = (otype*      )&obufs[obuf_idx][obuf_offset];
+
+            int words_per_chan_in = pkt->npol >> 5; // 32 pols per 256-bit word
+            int words_per_chan_out = pkt->npol_tot >> 5;
+            int pol_offset_out = pkt->pol0 >> 5;
+            int pkt_chan = pkt->chan0;           // The first channel in this packet
         
             // Copy packet payload one channel at a time.
-            // Packets have payload format nchans x npols x complexity
-            // spacing with which channel chunks are copied depends
+            // Packets have payload format nchans x npols x complexity.
+            // Output buffer order is chans * npol_total * complexity
+            // Spacing with which channel chunks are copied depends
             // on the total number of channels/pols in the system
-            for(int chan=0; chan<pkt->nchan; chan++) {
-                 //   // TODO: AVX stores here will probably be much faster
-                 //   ::memcpy(&out[(((pkt->npol_tot) * (pkt->chan0 + chan)) + (pkt->pol0)) / 32],
-                 //            &in[(pkt->npol / 32) * chan], pkt->npol / 32);
+            __m256i *dest_p;
+            __m256i vecbuf[2];
+            uint64_t *in64 = (uint64_t *)in;
+            int c, i;
+            dest_p = (__m256i *)(out + (words_per_chan_out * (pkt_chan)) + pol_offset_out);
+            for(c=0; c<pkt->nchan; c++) {
+               vecbuf[0] = _mm256_set_epi64x(in64[3], in64[2], in64[1], in64[0]);
+               vecbuf[1] = _mm256_set_epi64x(in64[7], in64[6], in64[5], in64[4]);
+               _mm256_stream_si256(dest_p, vecbuf[0]);
+               _mm256_stream_si256(dest_p+1,   vecbuf[1]);
+               in64 += 8;
+               dest_p += words_per_chan_out;
             }
     }
 
diff --git a/user.mk b/user.mk
index 43fc3d96a..e8067c8e5 100644
--- a/user.mk
+++ b/user.mk
@@ -33,6 +33,6 @@ ALIGNMENT ?= 4096 # Memory allocation alignment
 #CUDA_DEBUG = 1 # Enable CUDA debugging (nvcc -G)
 #NUMA       = 1 # Enable use of numa library for setting affinity of ring memory
 HWLOC      = 1 # Enable use of hwloc library for memory binding in udp_capture
-#VMA        = 1 # Enable use of Mellanox libvma in udp_capture
+VMA        = 1 # Enable use of Mellanox libvma in udp_capture
 XGPU       = 1 # build xGPU integrations (requires the xGPU library)
 IBV        = 1 # Build IB Verbs support using the hashpipe library

From 42c696acb597b34f99738a8188f98dac44cdd226 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Mon, 6 Jul 2020 13:00:03 +0000
Subject: [PATCH 15/91] Py2->3 decode; tweak exception raising

The behaviour of the traceback library has changed in py3, so remove
the now nonexistent call.

Tweak error handling to properly pass an exception to the cleanup print

Fix missing decode()
---
 tools/like_top.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/like_top.py b/tools/like_top.py
index 9af42e2b6..c4750a7f4 100755
--- a/tools/like_top.py
+++ b/tools/like_top.py
@@ -189,7 +189,7 @@ def get_gpu_memory_usage():
         pass
     else:
         # Parse the ouptut and turn everything into something useful, if possible
-        lines = output.split('\n')[:-1]
+        lines = output.decode().split('\n')[:-1]
         for line in lines:
             used, total, free, draw, limit, load = line.split(',')
             data['devCount'] += 1
@@ -402,7 +402,8 @@ def main(args):
     except KeyboardInterrupt:
         pass
 
-    except Exception as error:
+    except Exception as err:
+        error = err
         exc_type, exc_value, exc_traceback = sys.exc_info()
         fileObject = StringIO()
         traceback.print_tb(exc_traceback, file=fileObject)
@@ -428,7 +429,7 @@ def main(args):
     # Final reporting
     try:
         ## Error
-        print("%s: failed with %s at line %i" % (os.path.basename(__file__), str(error), traceback.tb_lineno(exc_traceback)))
+        print("%s: failed with %s" % (os.path.basename(__file__), str(error)))
         for line in tbString.split('\n'):
             print(line)
     except NameError:

From 2ec319c3dfb4a51c0b0bb0f61ff29d48eafb518c Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Mon, 6 Jul 2020 15:03:53 +0000
Subject: [PATCH 16/91] Add gbps throughput to like_top

Time metrics for processing / waiting for input/output
data are helpful for figuringout the bottlenecks in the
pipeline, but aren't particularly intuitive (IMO)
measures of whether things are "fast enough"
---
 tools/like_top.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tools/like_top.py b/tools/like_top.py
index c4750a7f4..d2b1fd18d 100755
--- a/tools/like_top.py
+++ b/tools/like_top.py
@@ -333,10 +333,11 @@ def main(args):
                             ac = max([0.0, log['acquire_time']])
                             pr = max([0.0, log['process_time']])
                             re = max([0.0, log['reserve_time']])
+                            gb = max([0.0, log.get('gbps', 0.0)])
                         except KeyError:
-                            ac, pr, re = 0.0, 0.0, 0.0
+                            ac, pr, re, gb = 0.0, 0.0, 0.0, 0.0
 
-                        blockList['%i-%s' % (pid, block)] = {'pid': pid, 'name':block, 'cmd': cmd, 'core': cr, 'acquire': ac, 'process': pr, 'reserve': re, 'total':ac+pr+re}
+                        blockList['%i-%s' % (pid, block)] = {'pid': pid, 'name':block, 'cmd': cmd, 'core': cr, 'acquire': ac, 'process': pr, 'reserve': re, 'total':ac+pr+re, 'gbps':gb}
 
                 ## Sort
                 order = sorted(blockList, key=lambda x: blockList[x][sort_key], reverse=sort_rev)
@@ -374,7 +375,7 @@ def main(args):
                 k = _add_line(scr, k, 0, output, std)
             ### Header
             k = _add_line(scr, k, 0, ' ', std)
-            output = '%6s  %15s  %4s  %5s  %7s  %7s  %7s  %7s  Cmd' % ('PID', 'Block', 'Core', '%CPU', 'Total', 'Acquire', 'Process', 'Reserve')
+            output = '%6s  %15s  %4s  %5s  %7s  %7s  %7s  %7s  %7s  Cmd' % ('PID', 'Block', 'Core', '%CPU', 'Total', 'Acquire', 'Process', 'Reserve', 'Gbits/s')
             csize = size[1]-len(output)
             output += ' '*csize
             output += '\n'
@@ -387,7 +388,7 @@ def main(args):
                     c = '%5.1f' % c
                 except KeyError:
                     c = '%5s' % ' '
-                output = '%6i  %15s  %4i  %5s  %7.3f  %7.3f  %7.3f  %7.3f  %s' % (d['pid'], d['name'][:15], d['core'], c, d['total'], d['acquire'], d['process'], d['reserve'], d['cmd'][:csize+3])
+                output = '%6i  %15s  %4i  %5s  %7.3f  %7.3f  %7.3f  %7.3f  %7.3f %s' % (d['pid'], d['name'][:15], d['core'], c, d['total'], d['acquire'], d['process'], d['reserve'], d['gbps'], d['cmd'][:csize+3])
                 k = _add_line(scr, k, 0, output, std)
                 if k >= size[0] - 1:
                     break

From 498034ea3623e08a5ba23450ee05c51096575e94 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Mon, 6 Jul 2020 15:04:58 +0000
Subject: [PATCH 17/91] Create new sequences every 480 time samples for SNAP2
 packets

This is a gross thing to hardcode, so FIXME.
But, having new sequences periodically means that the header
timestamps can be used as actual timestamps, rather
than just counting bytes in some infinite data stream (which
doesn't seem like a good idea when the input stream is from a
network, and could conceivably behave strangely).

Having timestamps derived from actual packet headers periodically
seems sensible(?)
---
 src/packet_capture.hpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/packet_capture.hpp b/src/packet_capture.hpp
index cd9f41d57..c61454098 100644
--- a/src/packet_capture.hpp
+++ b/src/packet_capture.hpp
@@ -678,10 +678,9 @@ class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
 	}
     // Has the configuration changed? I.e., different channels being sent.
 	inline bool has_sequence_changed(const PacketDesc* pkt) {
-        // TODO: sequence never changes?
+        // TODO: Decide what a sequence actually is!
+	    return (pkt->seq % 480 == 0);
         //return false;
-	    //return (pkt->seq % 128 == 0);
-        return false;
 	}
 	void on_sequence_changed(const PacketDesc* pkt, BFoffset* seq0, BFoffset* time_tag, const void** hdr, size_t* hdr_size) {
 	    *seq0 = _seq;// + _nseq_per_buf*_bufs.size();

From 42619e15eb459dbf2340fb0c31819f03c2bcb19c Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Thu, 23 Jul 2020 13:30:23 +0000
Subject: [PATCH 18/91] Split up beamforming GEM and dynamic spectra /
 transpose / integrate

Allow an option to beamform and integrate in one hit by
passing ntime_blocks>0 when initializing the library. Otherwise
don't transpose or integrate the data.

This change allows multiple downstream processes to use raw beamformer
data for their own, different purposes -- (eg) one generating integrated
dynamic spectra, and one generating VLBI voltage beams
---
 src/beamform.cpp        | 12 ++++++++++
 src/bifrost/beamform.h  |  5 +++++
 src/cublas_beamform.cu  | 50 ++++++++++++++++++++++++++++++++++++-----
 src/cublas_beamform.cuh |  1 +
 4 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/src/beamform.cpp b/src/beamform.cpp
index 190a6232e..eb6c563e4 100644
--- a/src/beamform.cpp
+++ b/src/beamform.cpp
@@ -59,4 +59,16 @@ BFstatus bfBeamformRun(BFarray *in, BFarray *out, BFarray *weights) {
   return BF_STATUS_SUCCESS;
 }
   
+BFstatus bfBeamformIntegrate(BFarray *in, BFarray *out) {
+  if (in->space != BF_SPACE_CUDA) {
+    fprintf(stderr, "Beamformer input buffer must be in CUDA space\n");
+    return BF_STATUS_INVALID_SPACE;
+  }
+  if (out->space != BF_SPACE_CUDA) {
+    fprintf(stderr, "Beamformer output buffer must be in CUDA space\n");
+    return BF_STATUS_INVALID_SPACE;
+  }
+  cublas_beamform_integrate((float *)in->data, (float *)out->data);
+  return BF_STATUS_SUCCESS;
+}
 } // C
diff --git a/src/bifrost/beamform.h b/src/bifrost/beamform.h
index 8fa43a66f..1da1937a9 100644
--- a/src/bifrost/beamform.h
+++ b/src/bifrost/beamform.h
@@ -15,3 +15,8 @@ BFstatus bfBeamformRun(
   BFarray *out,
   BFarray *weights
 );
+
+BFstatus bfBeamformIntegrate(
+  BFarray *in,
+  BFarray *out
+);
diff --git a/src/cublas_beamform.cu b/src/cublas_beamform.cu
index a98ae32ff..94f095850 100644
--- a/src/cublas_beamform.cu
+++ b/src/cublas_beamform.cu
@@ -143,7 +143,9 @@ static struct beamform_context context;
 
 void cublas_beamform_destroy(){
   cudaFree(context.in32_d);
-  cudaFree(context.out_d);
+  if (context.ntimeblocks > 0) {
+    cudaFree(context.out_d);
+  }
 }
 
 void cublas_beamform_init(int device, int ninputs, int nchans, int ntimes, int nbeams, int ntimeblocks) {
@@ -164,10 +166,14 @@ void cublas_beamform_init(int device, int ninputs, int nchans, int ntimes, int n
 
   // Internally allocate intermediate buffers
   gpuErrchk( cudaMalloc(&context.in32_d,  ninputs * nchans * ntimes * 2 * sizeof(float)) );
-  gpuErrchk( cudaMalloc(&context.out_d,   ntimes * nchans * nbeams * 2 * sizeof(float)) );
+  // If the context is initialized with ntimeblocks=0, then we do no summing so don't
+  // need the intermediate buffer
+  if (ntimeblocks > 0) {
+    gpuErrchk( cudaMalloc(&context.out_d,   ntimes * nchans * nbeams * 2 * sizeof(float)) );
+  }
 }
 
-void cublas_beamform(unsigned char *in4_d, float *sum_out_d, float *weights_d) {
+void cublas_beamform(unsigned char *in4_d, float *out_d, float *weights_d) {
   // Transpose input data and promote to float.
   // CUBLAS doesn't support float coeffs with int8 data
   dim3 transBlockGrid(context.ntimes, context.nchans);
@@ -181,6 +187,17 @@ void cublas_beamform(unsigned char *in4_d, float *sum_out_d, float *weights_d) {
   );
   cudaStreamSynchronize(context.stream);
 
+  // If we are integrating beam powers, put the
+  // GEM output in the context-defined intermediate
+  // buffer. If not, then write beamformer output
+  // to the address given by the user.
+  float *gem_out_d;
+  if (context.ntimeblocks > 0) {
+    gem_out_d = context.out_d;
+  } else {
+    gem_out_d = out_d;
+  }
+
   // Beamform using GEMM
   float alpha = 1.0;
   float beta = 0.0;
@@ -190,6 +207,7 @@ void cublas_beamform(unsigned char *in4_d, float *sum_out_d, float *weights_d) {
   // beta = 0.0
   // A matrix: beamforming coeffs (NBEAMS * NANTS)
   // B matrix: data matrix (NANTS * NTIMES)
+    
   gpuBLASchk(cublasGemmStridedBatchedEx(
     context.handle,
     CUBLAS_OP_N, // transpose A?
@@ -210,7 +228,7 @@ void cublas_beamform(unsigned char *in4_d, float *sum_out_d, float *weights_d) {
     context.ninputs*context.ntimes,// strideB : stride size
     &beta,       // beta
     // Results
-    context.out_d,       // C
+    gem_out_d,       // C
     CUDA_C_32F,          // Ctype 
     context.nbeams,      // Ldc
     context.nbeams*context.ntimes,// Stride C
@@ -220,14 +238,34 @@ void cublas_beamform(unsigned char *in4_d, float *sum_out_d, float *weights_d) {
     ));
   cudaStreamSynchronize(context.stream);
 
+  // Optionally:
+  if (context.ntimeblocks > 0) {
+    // Create XX, YY, XY beam powers.
+    // Sum over `ntimes_sum` samples
+    int ntimes_sum = context.ntimes / context.ntimeblocks;
+    dim3 sumBlockGrid(context.nchans, context.nbeams/2);
+    dim3 sumThreadGrid(context.ntimes / ntimes_sum);
+    trans_output_and_sum<<<sumBlockGrid, sumThreadGrid, 0, context.stream>>>(
+      gem_out_d,
+      out_d,
+      context.nchans,
+      context.nbeams/2,
+      context.ntimes,
+      ntimes_sum
+    );
+    cudaStreamSynchronize(context.stream);
+  }
+}
+
+void cublas_beamform_integrate(float *in_d, float *out_d) {
   // Create XX, YY, XY beam powers.
   // Sum over `ntimes_sum` samples
   int ntimes_sum = context.ntimes / context.ntimeblocks;
   dim3 sumBlockGrid(context.nchans, context.nbeams/2);
   dim3 sumThreadGrid(context.ntimes / ntimes_sum);
   trans_output_and_sum<<<sumBlockGrid, sumThreadGrid, 0, context.stream>>>(
-    context.out_d,
-    sum_out_d,
+    in_d,
+    out_d,
     context.nchans,
     context.nbeams/2,
     context.ntimes,
diff --git a/src/cublas_beamform.cuh b/src/cublas_beamform.cuh
index eca2d63f2..6a0d2b22a 100644
--- a/src/cublas_beamform.cuh
+++ b/src/cublas_beamform.cuh
@@ -30,6 +30,7 @@ __global__ void complex2pow(float *in, float *out, int N);
 
 void cublas_beamform_destroy();
 void cublas_beamform(unsigned char *in4_d, float *sum_out_d, float *weights_d);
+void cublas_beamform_integrate(float *in_d, float *sum_out_d);
 void cublas_beamform_init(int device, int ninputs, int nchans, int ntimes, int nbeams, int ntimeblocks);
 
 #endif

From 9f077f2238adff7b56c7d8c0e5e3f72dd0ab607b Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Fri, 24 Jul 2020 18:51:22 +0000
Subject: [PATCH 19/91] Add missing ifdefs for IBV code

---
 src/packet_capture.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/packet_capture.hpp b/src/packet_capture.hpp
index c61454098..69258e231 100644
--- a/src/packet_capture.hpp
+++ b/src/packet_capture.hpp
@@ -1184,8 +1184,10 @@ BFstatus BFpacketcapture_create(BFpacketcapture* obj,
         method = new DiskPacketReader(fd, max_payload_size);
     } else if( backend == BF_IO_UDP ) {
         method = new UDPPacketReceiver(fd, max_payload_size);
+#if BF_HPIBV_ENABLED
     } else if( backend == BF_IO_IBV_UDP ) {
         method = new IBVUDPPacketReceiver(fd, max_payload_size);
+#endif
     } else if( backend == BF_IO_SNIFFER ) {
         method = new UDPPacketSniffer(fd, max_payload_size);
     } else {

From d0a907aac0efe0ca506646294af52214d5977a4a Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Wed, 29 Jul 2020 13:37:56 +0000
Subject: [PATCH 20/91] Remove remnants of JH's hashpipe IBV code

---
 src/Makefile             |  6 +---
 src/bf_ibverbs.cpp       | 60 ----------------------------------------
 src/bifrost/bf_ibverbs.h | 17 ------------
 3 files changed, 1 insertion(+), 82 deletions(-)
 delete mode 100644 src/bf_ibverbs.cpp
 delete mode 100644 src/bifrost/bf_ibverbs.h

diff --git a/src/Makefile b/src/Makefile
index 9dc5101b8..b5e6057b8 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -20,7 +20,7 @@ LIBBIFROST_OBJS = \
   unpack.o \
   quantize.o \
   proclog.o
-ifdef VERBS:
+ifdef VERBS
   # These files require IB verbs to compile
   LIBBIFROST_OBJS += \
   ib_verbs.o
@@ -49,10 +49,6 @@ ifdef XGPU
   bf_xgpu.o
 endif
 endif
-ifdef IBV
-  LIBBIFROST_OBJS += \
-  bf_ibverbs.o
-endif
 
 JIT_SOURCES ?= \
   Complex.hpp.jit \
diff --git a/src/bf_ibverbs.cpp b/src/bf_ibverbs.cpp
deleted file mode 100644
index 38fa2e6a0..000000000
--- a/src/bf_ibverbs.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-#include <stdio.h>
-#include "hashpipe_ibverbs.h"
-#include <bifrost/common.h>
-
-#define IBV_UDP_PAYLOAD_OFFSET 42
-
-extern "C" {
-struct hashpipe_ibv_context _hibv_ctx = {0};
-struct hashpipe_ibv_recv_pkt* _hibv_rpkt; // Current packet chain
-struct hashpipe_ibv_recv_pkt* _pkt; // Current packet
-
-BFstatus ibv_init(size_t pkt_size_max) {
-    fprintf(stderr, "Configuring IBV socket\n");
-    int port = 10000;
-    char ifname[IFNAMSIZ] = "ens1f1";
-    strncpy(_hibv_ctx.interface_name, ifname, IFNAMSIZ);
-    _hibv_ctx.interface_name[IFNAMSIZ-1] = '\0'; // Ensure NUL termination
-    _hibv_ctx.send_pkt_num = 1;
-    _hibv_ctx.recv_pkt_num = 32768;
-    _hibv_ctx.pkt_size_max = pkt_size_max;
-    fprintf(stderr, "IBV: pkt_size_max: %d\n", _hibv_ctx.pkt_size_max);
-    fprintf(stderr, "IBV: recv_pkt_num: %d\n", _hibv_ctx.recv_pkt_num);
-    _hibv_ctx.max_flows = 1;
-    int ret = hashpipe_ibv_init(&_hibv_ctx);
-	if( ret ) {
-	    fprintf(stderr, "ERROR: haspipe_ibv_init returned %d\n", ret);
-	}
-
-    // Subscribe to RX flow
-    ret = hashpipe_ibv_flow(
-            &_hibv_ctx,
-            0, IBV_FLOW_SPEC_UDP,
-            _hibv_ctx.mac, NULL, 0, 0, 0, 0, 0, port);
-    if( ret ) {
-	    fprintf(stderr, "ERROR: haspipe_ibv_flow returned %d\n", ret);
-    }
-
-    return BF_STATUS_SUCCESS;
-}
-
-int ibv_recv_packet(uint8_t** pkt_ptr, int flags) {
-    // If we don't have a work-request queue on the go,
-    // get some new packets.
-    if ( _pkt ) {
-        _pkt = (struct hashpipe_ibv_recv_pkt *)_pkt->wr.next;
-        if ( !_pkt ) {
-            hashpipe_ibv_release_pkts(&_hibv_ctx, _hibv_rpkt);
-            _hibv_rpkt = NULL;
-        }
-    }
-    while (!_hibv_rpkt) {
-        _hibv_rpkt = hashpipe_ibv_recv_pkts(&_hibv_ctx, 1);
-        _pkt = _hibv_rpkt;
-    }
-    // IBV returns Eth/UDP/IP headers. Strip them off here.
-    *pkt_ptr = (uint8_t *)_pkt->wr.sg_list->addr + IBV_UDP_PAYLOAD_OFFSET;
-    return _pkt->length - IBV_UDP_PAYLOAD_OFFSET;
-}
-
-}
diff --git a/src/bifrost/bf_ibverbs.h b/src/bifrost/bf_ibverbs.h
deleted file mode 100644
index ce1206c74..000000000
--- a/src/bifrost/bf_ibverbs.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef BF_IBVERBS_H_INCLUDE_GUARD_
-#define BF_IBVERBS_H_INCLUDE_GUARD_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <stdint.h>
-
-BFstatus ibv_init(size_t pkt_size_max);
-int ibv_recv_packet(uint8_t** pkt_ptr, int flags);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // BF_IBVERBS_INCLUDE_GUARD_

From 76da1ebc61c6524b87dc979751f7d9be857dd176 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Wed, 29 Jul 2020 13:39:40 +0000
Subject: [PATCH 21/91] Remove remnants of JH's hashpipe IBV code

---
 src/packet_capture.cpp | 24 ------------------------
 src/packet_capture.hpp | 23 -----------------------
 2 files changed, 47 deletions(-)

diff --git a/src/packet_capture.cpp b/src/packet_capture.cpp
index 9c24f8d87..f6d56b43a 100644
--- a/src/packet_capture.cpp
+++ b/src/packet_capture.cpp
@@ -328,30 +328,6 @@ BFstatus bfUdpCaptureCreate(BFpacketcapture* obj,
                                   BF_IO_UDP);
 }
 
-BFstatus bfIbvUdpCaptureCreate(BFpacketcapture* obj,
-                            const char*      format,
-                            int              fd,
-                            BFring           ring,
-                            BFsize           nsrc,
-                            BFsize           src0,
-                            BFsize           max_payload_size,
-                            BFsize           buffer_ntime,
-                            BFsize           slot_ntime,
-                            BFpacketcapture_callback sequence_callback,
-                            int              core) {
-    return BFpacketcapture_create(obj,
-                                  format,
-                                  fd,
-                                  ring,
-                                  nsrc,
-                                  src0,
-                                  buffer_ntime,
-                                  slot_ntime,
-                                  sequence_callback,
-                                  core,
-                                  BF_IO_IBV_UDP);
-}
-
 BFstatus bfUdpSnifferCreate(BFpacketcapture* obj,
                             const char*      format,
                             int              fd,
diff --git a/src/packet_capture.hpp b/src/packet_capture.hpp
index 509b9f6c8..c8fc3f561 100644
--- a/src/packet_capture.hpp
+++ b/src/packet_capture.hpp
@@ -222,27 +222,6 @@ class VMAReceiver {
 };
 #endif // BF_VMA_ENABLED
 
-#ifndef BF_HPIBV_ENABLED
-#define BF_HPIBV_ENABLED 0
-#endif
-
-#if BF_HPIBV_ENABLED
-#include <bifrost/bf_ibverbs.h>
-
-class IBVUDPPacketReceiver : public PacketCaptureMethod {
-public:
-    IBVUDPPacketReceiver(int fd, size_t pkt_size_max=JUMBO_FRAME_SIZE)
-        : PacketCaptureMethod(fd, pkt_size_max, BF_IO_IBV_UDP)
-    {
-        ibv_init(pkt_size_max);
-    }
-    inline int recv_packet(uint8_t** pkt_ptr, int flags=0) {
-        return ibv_recv_packet(pkt_ptr, flags);
-    }
-    inline const char* get_name() { return "udp_capture"; }
-};
-#endif // BF_HPIBV_ENABLED
-
 class UDPPacketReceiver : public PacketCaptureMethod {
 #if BF_VMA_ENABLED
     VMAReceiver            _vma;
@@ -1204,8 +1183,6 @@ BFstatus BFpacketcapture_create(BFpacketcapture* obj,
         method = new DiskPacketReader(fd, max_payload_size);
     } else if( backend == BF_IO_UDP ) {
         method = new UDPPacketReceiver(fd, max_payload_size);
-    } else if( backend == BF_IO_IBV_UDP ) {
-        method = new IBVUDPPacketReceiver(fd, max_payload_size);
     } else if( backend == BF_IO_SNIFFER ) {
         method = new UDPPacketSniffer(fd, max_payload_size);
 #if BF_VERBS_ENABLED

From 951b7412c90dd758446bdd211814742a1fc6043b Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Wed, 29 Jul 2020 13:39:59 +0000
Subject: [PATCH 22/91] Default to buffer 32k packets

Reaches 27Gbps on LWA352 pipeline
---
 src/ib_verbs.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ib_verbs.hpp b/src/ib_verbs.hpp
index b2d29a6ad..d00cbca08 100644
--- a/src/ib_verbs.hpp
+++ b/src/ib_verbs.hpp
@@ -53,7 +53,7 @@ extern "C" {
 #endif
 
 #ifndef BF_VERBS_NPKTBUF
-#define BF_VERBS_NPKTBUF 8192
+#define BF_VERBS_NPKTBUF 32768
 #endif
 
 #ifndef BF_VERBS_WCBATCH

From d8632960e2342a62d83960c8c5856ab23ebc91ff Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Wed, 29 Jul 2020 09:59:18 -0400
Subject: [PATCH 23/91] Remove some unused code

---
 src/formats/snap2.hpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/formats/snap2.hpp b/src/formats/snap2.hpp
index ff401123b..fe0f2d7cd 100644
--- a/src/formats/snap2.hpp
+++ b/src/formats/snap2.hpp
@@ -143,7 +143,6 @@ class SNAP2Processor : virtual public PacketProcessor {
             itype const* __restrict__ in  = (itype const*)pkt->payload_ptr;
             otype*       __restrict__ out = (otype*      )&obufs[obuf_idx][obuf_offset];
 
-            int words_per_chan_in = pkt->npol >> 5; // 32 pols per 256-bit word
             int words_per_chan_out = pkt->npol_tot >> 5;
             int pol_offset_out = pkt->pol0 >> 5;
             int pkt_chan = pkt->chan0;           // The first channel in this packet
@@ -156,7 +155,7 @@ class SNAP2Processor : virtual public PacketProcessor {
             __m256i *dest_p;
             __m256i vecbuf[2];
             uint64_t *in64 = (uint64_t *)in;
-            int c, i;
+            int c;
             dest_p = (__m256i *)(out + (words_per_chan_out * (pkt_chan)) + pol_offset_out);
             for(c=0; c<pkt->nchan; c++) {
                vecbuf[0] = _mm256_set_epi64x(in64[3], in64[2], in64[1], in64[0]);

From 35e819d39d32688f212a73109373448adf372001 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Wed, 29 Jul 2020 09:59:47 -0400
Subject: [PATCH 24/91] Change how bifrost sequences are defined

Sequence only changes if out-of-order packets indicate
the upstream transmitters have reset
---
 src/packet_capture.hpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/packet_capture.hpp b/src/packet_capture.hpp
index c61454098..a0e3e7957 100644
--- a/src/packet_capture.hpp
+++ b/src/packet_capture.hpp
@@ -657,6 +657,7 @@ class BFpacketcapture_chips_impl : public BFpacketcapture_impl {
 class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
 	ProcLog            _type_log;
 	ProcLog            _chan_log;
+    BFoffset           _last_pkt_seq;
 	
 	BFpacketcapture_snap2_sequence_callback _sequence_callback;
 	
@@ -665,7 +666,8 @@ class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
 		//         always starts things ~3 seq's before the 1sec boundary anyway.
 		//seq = round_up(pkt->seq, _slot_ntime);
 		//*_seq          = round_nearest(pkt->seq, _slot_ntime);
-		_seq          = round_up(pkt->seq, _slot_ntime);
+		//_seq          = round_up(pkt->seq, _slot_ntime);
+		_seq          = pkt->seq;
 		this->on_sequence_changed(pkt, seq0, time_tag, hdr, hdr_size);
     }
     void on_sequence_active(const PacketDesc* pkt) {
@@ -679,8 +681,12 @@ class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
     // Has the configuration changed? I.e., different channels being sent.
 	inline bool has_sequence_changed(const PacketDesc* pkt) {
         // TODO: Decide what a sequence actually is!
-	    return (pkt->seq % 480 == 0);
-        //return false;
+        // Currently a new sequence starts whenever packets come out of order.
+        // This isn't great, but the packet RX code assumes packets are in order too.
+        bool is_new_seq;
+        is_new_seq = ((pkt->seq != _last_pkt_seq) && (pkt->seq != _last_pkt_seq));
+        _last_pkt_seq = pkt->seq;
+	    return is_new_seq;
 	}
 	void on_sequence_changed(const PacketDesc* pkt, BFoffset* seq0, BFoffset* time_tag, const void** hdr, size_t* hdr_size) {
 	    *seq0 = _seq;// + _nseq_per_buf*_bufs.size();

From f889f156092084b5c557510a3cb953ce3ffd95f6 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Wed, 29 Jul 2020 10:05:05 -0400
Subject: [PATCH 25/91] Don't use VMA or HWLOC

---
 user.mk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/user.mk b/user.mk
index df51b226f..edea22fbd 100644
--- a/user.mk
+++ b/user.mk
@@ -32,7 +32,7 @@ ALIGNMENT ?= 4096 # Memory allocation alignment
 #ANY_ARCH   = 1 # Disable native architecture compilation
 #CUDA_DEBUG = 1 # Enable CUDA debugging (nvcc -G)
 #NUMA       = 1 # Enable use of numa library for setting affinity of ring memory
-HWLOC      = 1 # Enable use of hwloc library for memory binding in udp_capture
-VMA        = 1 # Enable use of Mellanox libvma in udp_capture
+#HWLOC      = 1 # Enable use of hwloc library for memory binding in udp_capture
+#VMA        = 1 # Enable use of Mellanox libvma in udp_capture
 XGPU       = 1 # build xGPU integrations (requires the xGPU library)
 VERBS      = 1 # Enable use of IB verbs with udp_verbs_capture

From d52ebb28477081587a1980a40ef1d0bab6ad6505 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Thu, 30 Jul 2020 17:23:53 +0000
Subject: [PATCH 26/91] VERMS doesn't need -lvma

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index b5e6057b8..ee2e9708b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -112,7 +112,7 @@ endif
 
 ifdef VERBS
   # Requires Mellanox libvma to be installed
-  LIB        += -lvma -libverbs
+  LIB        += -libverbs
   CPPFLAGS   += -DBF_VERBS_ENABLED=1
 endif
 

From 6ee33a1256a7cb19ea2342d40df2f42b3e726f31 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Thu, 30 Jul 2020 17:24:22 +0000
Subject: [PATCH 27/91] Tweaks to sequence boundaries

Round start of seq to a gulp size.
Decide if a sequence has ended by checking if the first packet of the next sequence
is the current time step + gulp size

This enforces gulp_size = slot_size. But, it's not obvious slot size actually
does anything other than set the time granularity at which a buffer may start
---
 src/packet_capture.hpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/packet_capture.hpp b/src/packet_capture.hpp
index 22f74ba51..9bc978e9b 100644
--- a/src/packet_capture.hpp
+++ b/src/packet_capture.hpp
@@ -656,7 +656,7 @@ class BFpacketcapture_chips_impl : public BFpacketcapture_impl {
 class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
 	ProcLog            _type_log;
 	ProcLog            _chan_log;
-    BFoffset           _last_pkt_seq;
+    BFoffset           _last_seq;
 	
 	BFpacketcapture_snap2_sequence_callback _sequence_callback;
 	
@@ -665,8 +665,7 @@ class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
 		//         always starts things ~3 seq's before the 1sec boundary anyway.
 		//seq = round_up(pkt->seq, _slot_ntime);
 		//*_seq          = round_nearest(pkt->seq, _slot_ntime);
-		//_seq          = round_up(pkt->seq, _slot_ntime);
-		_seq          = pkt->seq;
+		_seq          = round_up(pkt->seq, _slot_ntime);
 		this->on_sequence_changed(pkt, seq0, time_tag, hdr, hdr_size);
     }
     void on_sequence_active(const PacketDesc* pkt) {
@@ -680,11 +679,12 @@ class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
     // Has the configuration changed? I.e., different channels being sent.
 	inline bool has_sequence_changed(const PacketDesc* pkt) {
         // TODO: Decide what a sequence actually is!
-        // Currently a new sequence starts whenever packets come out of order.
-        // This isn't great, but the packet RX code assumes packets are in order too.
+        // Currently a new sequence starts whenever a block finishes and the next
+        // packet isn't from the next block
+        // TODO. Is this actually reasonable? Does it recover from upstream resyncs?
         bool is_new_seq;
-        is_new_seq = ((pkt->seq != _last_pkt_seq) && (pkt->seq != _last_pkt_seq));
-        _last_pkt_seq = pkt->seq;
+        is_new_seq = (pkt->seq != _last_seq + _slot_ntime);
+        _last_seq = pkt->seq;
 	    return is_new_seq;
 	}
 	void on_sequence_changed(const PacketDesc* pkt, BFoffset* seq0, BFoffset* time_tag, const void** hdr, size_t* hdr_size) {
@@ -692,6 +692,7 @@ class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
         _chan0 = pkt->chan0;
         _nchan = pkt->nchan;
         _payload_size = pkt->payload_size;
+        _last_seq = _seq;
         
 	    if( _sequence_callback ) {
 	        int status = (*_sequence_callback)(*seq0,

From 9206d8f682bb725a3cd6173be03259ffc5c720c9 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Thu, 30 Jul 2020 20:28:51 +0000
Subject: [PATCH 28/91] Make new sequence start when packet seq number resets

---
 src/packet_capture.cpp |  8 ++++++++
 src/packet_capture.hpp | 16 ++++++++--------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/packet_capture.cpp b/src/packet_capture.cpp
index f6d56b43a..7827fbc0c 100644
--- a/src/packet_capture.cpp
+++ b/src/packet_capture.cpp
@@ -91,6 +91,14 @@ int PacketCaptureThread::run(uint64_t seq_beg,
 		BF_PRINTD("HERE" << " " << _pkt.seq << " < " << seq_beg);
 		_have_pkt = false;
 		if( less_than(_pkt.seq, seq_beg) ) {
+            // If lots [TODO: what is lots] of packets are late
+            // return. Otherwise a seq reset can lead to being stuck
+            // here endlessly counting late packets.
+            if( less_than(_pkt.seq + nseq_per_obuf, seq_beg) ) {
+		        _have_pkt = true;
+                ret = CAPTURE_SUCCESS;
+                break;
+            }
 			++_stats.nlate;
 			_stats.nlate_bytes += _pkt.payload_size;
 			++_src_stats[_pkt.src].nlate;
diff --git a/src/packet_capture.hpp b/src/packet_capture.hpp
index 9bc978e9b..7114482ee 100644
--- a/src/packet_capture.hpp
+++ b/src/packet_capture.hpp
@@ -661,11 +661,6 @@ class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
 	BFpacketcapture_snap2_sequence_callback _sequence_callback;
 	
 	void on_sequence_start(const PacketDesc* pkt, BFoffset* seq0, BFoffset* time_tag, const void** hdr, size_t* hdr_size ) {
-        // TODO: Might be safer to round to nearest here, but the current firmware
-		//         always starts things ~3 seq's before the 1sec boundary anyway.
-		//seq = round_up(pkt->seq, _slot_ntime);
-		//*_seq          = round_nearest(pkt->seq, _slot_ntime);
-		_seq          = round_up(pkt->seq, _slot_ntime);
 		this->on_sequence_changed(pkt, seq0, time_tag, hdr, hdr_size);
     }
     void on_sequence_active(const PacketDesc* pkt) {
@@ -682,17 +677,22 @@ class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
         // Currently a new sequence starts whenever a block finishes and the next
         // packet isn't from the next block
         // TODO. Is this actually reasonable? Does it recover from upstream resyncs?
-        bool is_new_seq;
-        is_new_seq = (pkt->seq != _last_seq + _slot_ntime);
+        bool is_new_seq = false;
+        if ( pkt->seq != _last_seq + _slot_ntime ) {
+            is_new_seq = true;
+            this->flush();
+        }
         _last_seq = pkt->seq;
 	    return is_new_seq;
 	}
 	void on_sequence_changed(const PacketDesc* pkt, BFoffset* seq0, BFoffset* time_tag, const void** hdr, size_t* hdr_size) {
+		_seq          = round_up(pkt->seq, _slot_ntime);
+        fprintf(stderr, "New seq start is %d based on pkt->seq %d\n", _seq, (int)pkt->seq);
+        _last_seq     = _seq;
 	    *seq0 = _seq;// + _nseq_per_buf*_bufs.size();
         _chan0 = pkt->chan0;
         _nchan = pkt->nchan;
         _payload_size = pkt->payload_size;
-        _last_seq = _seq;
         
 	    if( _sequence_callback ) {
 	        int status = (*_sequence_callback)(*seq0,

From e6fdff18b3bdf725e590b94a1a770f17b9048527 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Sat, 1 Aug 2020 11:55:57 +0000
Subject: [PATCH 29/91] handle StopIteration exceptions

Python 3.7 enables https://www.python.org/dev/peps/pep-0479/

StopIteration errors are, from Python 3.7, promoited to RuntimeErrors
and must be explicitly handled by user code.
See https://stackoverflow.com/questions/51700960/runtimeerror-generator-raised-stopiteration-every-time-i-try-to-run-app
---
 python/bifrost/ring.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/python/bifrost/ring.py b/python/bifrost/ring.py
index ddf2c71e0..83ba84ec0 100644
--- a/python/bifrost/ring.py
+++ b/python/bifrost/ring.py
@@ -112,8 +112,11 @@ def open_earliest_sequence(self, guarantee=True):
     def read(self, whence='earliest', guarantee=True):
         with ReadSequence(self, which=whence, guarantee=guarantee) as cur_seq:
             while True:
-                yield cur_seq
-                cur_seq.increment()
+                try:
+                    yield cur_seq
+                    cur_seq.increment()
+                except StopIteration:
+                    return
     #def _data(self):
     #    data_ptr = _get(self.lib.bfRingLockedGetData, self.obj)
     #    #data_ptr = c_void_p()
@@ -273,9 +276,12 @@ def read(self, span_size, stride=None, begin=0):
             stride = span_size
         offset = begin
         while True:
-            with self.acquire(offset, span_size) as ispan:
-                yield ispan
-            offset += stride
+            try:
+                with self.acquire(offset, span_size) as ispan:
+                    yield ispan
+                offset += stride
+            except StopIteration:
+                return
 
 class SpanBase(object):
     def __init__(self, ring, writeable):

From f9530728b4b15ab24e6b135c072156ac962808d4 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Sat, 1 Aug 2020 12:29:38 +0000
Subject: [PATCH 30/91] Remove debug print

---
 src/packet_capture.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/packet_capture.hpp b/src/packet_capture.hpp
index 7114482ee..e8b397d40 100644
--- a/src/packet_capture.hpp
+++ b/src/packet_capture.hpp
@@ -687,7 +687,6 @@ class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
 	}
 	void on_sequence_changed(const PacketDesc* pkt, BFoffset* seq0, BFoffset* time_tag, const void** hdr, size_t* hdr_size) {
 		_seq          = round_up(pkt->seq, _slot_ntime);
-        fprintf(stderr, "New seq start is %d based on pkt->seq %d\n", _seq, (int)pkt->seq);
         _last_seq     = _seq;
 	    *seq0 = _seq;// + _nseq_per_buf*_bufs.size();
         _chan0 = pkt->chan0;

From fa671f43984ec660330ec6078a0460218a5a0a65 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Mon, 10 Aug 2020 19:14:26 +0000
Subject: [PATCH 31/91] Tweak SNAP2 packet format to contain sync time

And feed sync time into the pipeline time_tag
---
 src/formats/snap2.hpp  | 10 +++-------
 src/packet_capture.hpp |  7 +++++--
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/formats/snap2.hpp b/src/formats/snap2.hpp
index fe0f2d7cd..9855b8c2e 100644
--- a/src/formats/snap2.hpp
+++ b/src/formats/snap2.hpp
@@ -33,8 +33,6 @@
 #include <immintrin.h> // SSE
 #include <emmintrin.h>
 
-#define SNAP2_HEADER_MAGIC 0xaabbccdd
-
 // TODO: parameterize somewhere. This isn't
 // related to the packet formatting
 #define PIPELINE_NPOL 704
@@ -44,9 +42,9 @@
 // All entries are network (i.e. big) endian
 struct snap2_hdr_type {
         uint64_t  seq;       // Spectra counter == packet counter
-        uint32_t  magic;     // = 0xaabbccdd
+        uint32_t  sync_time; // UNIX sync time
         uint16_t  npol;      // Number of pols in this packet
-        uint16_t  npol_tot;      // Number of pols total
+        uint16_t  npol_tot;  // Number of pols total
         uint16_t  nchan;     // Number of channels in this packet
         uint16_t  nchan_tot;     // Number of channels total (for this pipeline)
         uint32_t  chan_block_id; // ID of this block of chans
@@ -92,10 +90,8 @@ class SNAP2Decoder : virtual public PacketDecoder {
         const snap2_hdr_type* pkt_hdr  = (snap2_hdr_type*)pkt_ptr;
         const uint8_t*        pkt_pld  = pkt_ptr  + sizeof(snap2_hdr_type);
         int                   pld_size = pkt_size - sizeof(snap2_hdr_type);
-        if( be32toh(pkt_hdr->magic) != SNAP2_HEADER_MAGIC ) {
-            return false;
-        }
         pkt->seq   = be64toh(pkt_hdr->seq);
+        pkt->time_tag = be32toh(pkt_hdr->sync_time);
         int npol_blocks  = (be16toh(pkt_hdr->npol_tot) / be16toh(pkt_hdr->npol));
         int nchan_blocks = (be16toh(pkt_hdr->nchan_tot) / be16toh(pkt_hdr->nchan));
 
diff --git a/src/packet_capture.hpp b/src/packet_capture.hpp
index e8b397d40..ef5e5d9b4 100644
--- a/src/packet_capture.hpp
+++ b/src/packet_capture.hpp
@@ -657,6 +657,7 @@ class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
 	ProcLog            _type_log;
 	ProcLog            _chan_log;
     BFoffset           _last_seq;
+    BFoffset           _last_time_tag;
 	
 	BFpacketcapture_snap2_sequence_callback _sequence_callback;
 	
@@ -678,7 +679,7 @@ class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
         // packet isn't from the next block
         // TODO. Is this actually reasonable? Does it recover from upstream resyncs?
         bool is_new_seq = false;
-        if ( pkt->seq != _last_seq + _slot_ntime ) {
+        if ( (_last_time_tag != pkt->time_tag) || (pkt->seq != _last_seq + _slot_ntime) ) {
             is_new_seq = true;
             this->flush();
         }
@@ -687,7 +688,9 @@ class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
 	}
 	void on_sequence_changed(const PacketDesc* pkt, BFoffset* seq0, BFoffset* time_tag, const void** hdr, size_t* hdr_size) {
 		_seq          = round_up(pkt->seq, _slot_ntime);
-        _last_seq     = _seq;
+        *time_tag      = (BFoffset) pkt->time_tag;
+        _last_time_tag = pkt->time_tag;
+        _last_seq      = _seq;
 	    *seq0 = _seq;// + _nseq_per_buf*_bufs.size();
         _chan0 = pkt->chan0;
         _nchan = pkt->nchan;

From 8e92235fbe7d39bbe17095c175fc4634a9106292 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Sun, 16 Aug 2020 12:40:19 +0000
Subject: [PATCH 32/91] Add option to sum over freq chans while subselecting
 baselines

---
 src/bf_xgpu.cpp       | 8 ++++----
 src/bifrost/bf_xgpu.h | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/bf_xgpu.cpp b/src/bf_xgpu.cpp
index 25c3774eb..3fcbb8768 100644
--- a/src/bf_xgpu.cpp
+++ b/src/bf_xgpu.cpp
@@ -134,10 +134,10 @@ BFstatus bfXgpuKernel(BFarray *in, BFarray *out, int doDump) {
  * and gather them in a new buffer, in order chan x visibility x complexity [int32]
  * BFarray *in : Pointer to a BFarray with storage in device memory, where xGPU results reside
  * BFarray *in : Pointer to a BFarray with storage in device memory where collated visibilities should be written.
- * int **vismap : array of visibilities in [[polA, polB], [polC, polD], ... ] form.
- * int nvis : The number of visibilities to colate (length of the vismap array)
+ * BFarray *vismap : array of visibilities in [[polA, polB], [polC, polD], ... ] form.
+ * int nchan_sum: The number of frequency channels to sum over
  */
-BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap) {
+BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, int nchan_sum) {
   long long unsigned nvis = num_contiguous_elements(vismap);
   int xgpu_error;
   if (in->space != BF_SPACE_CUDA) {
@@ -149,7 +149,7 @@ BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap) {
   if (vismap->space != BF_SPACE_CUDA) {
     return BF_STATUS_UNSUPPORTED_SPACE;
   }
-  xgpu_error = xgpuCudaSubSelect(&context, (Complex *)in->data, (Complex *)out->data, (int *)vismap->data, nvis);
+  xgpu_error = xgpuCudaSubSelect(&context, (Complex *)in->data, (Complex *)out->data, (int *)vismap->data, nvis, nchan_sum);
   if (xgpu_error != XGPU_OK) {
     fprintf(stderr, "ERROR: xgpuKernel: kernel call returned %d\n", xgpu_error);
     return BF_STATUS_INTERNAL_ERROR;
diff --git a/src/bifrost/bf_xgpu.h b/src/bifrost/bf_xgpu.h
index ede805053..7b4f45b1e 100644
--- a/src/bifrost/bf_xgpu.h
+++ b/src/bifrost/bf_xgpu.h
@@ -4,4 +4,4 @@
 BFstatus bfXgpuInitialize(BFarray *in, BFarray *out, int gpu_dev);
 BFstatus bfXgpuCorrelate(BFarray *in, BFarray *out, int doDump);
 BFstatus bfXgpuKernel(BFarray *in, BFarray *out, int doDump);
-BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap);
+BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, int nchan_sum);

From 99c0b5b2be38c937ebd980d470a9f0ac95be32a8 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Sun, 16 Aug 2020 17:09:04 +0000
Subject: [PATCH 33/91] Add methods for converting an input order to a
 visibility order

---
 src/bf_xgpu.cpp       | 73 +++++++++++++++++++++++++++++++++++++++++++
 src/bifrost/bf_xgpu.h |  1 +
 2 files changed, 74 insertions(+)

diff --git a/src/bf_xgpu.cpp b/src/bf_xgpu.cpp
index 3fcbb8768..35e68d429 100644
--- a/src/bf_xgpu.cpp
+++ b/src/bf_xgpu.cpp
@@ -158,4 +158,77 @@ BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, int nchan_s
   }
 }
 
+/* Computes the triangular index of an (i,j) pair as shown here...
+ * NB: Output is valid only if i >= j.
+ *
+ *      i=0  1  2  3  4..
+ *     +---------------
+ * j=0 | 00 01 03 06 10
+ *   1 |    02 04 07 11
+ *   2 |       05 08 12
+ *   3 |          09 13
+ *   4 |             14
+ *   :
+ */
+int tri_index(int i, int j){
+  return (i * (i+1))/2 + j;
+  }
+
+/* Returns index into the GPU's register tile ordered output buffer for the
+ * real component of the cross product of inputs in0 and in1.  Note that in0
+ * and in1 are input indexes (i.e. 0 based) and often represent antenna and
+ * polarization by passing (2*ant_idx+pol_idx) as the input number (NB: ant_idx
+ * and pol_idx are also 0 based).  Return value is valid if in1 >= in0.  The
+ * corresponding imaginary component is located xgpu_info.matLength words after
+ * the real component.
+ */
+int regtile_index(int in0, int in1, int nstand) {
+  int a0, a1, p0, p1;
+  int num_words_per_cell=4;
+  int quadrant, quadrant_index, quadrant_size, cell_index, pol_offset, index;
+  a0 = in0 >> 1;
+  a1 = in1 >> 1;
+  p0 = in0 & 1;
+  p1 = in1 & 1;
+  
+  // Index within a quadrant
+  quadrant_index = tri_index(a1/2, a0/2);
+  // Quadrant for this input pair
+  quadrant = 2*(a0&1) + (a1&1);
+  // Size of quadrant
+  quadrant_size = (nstand/2 + 1) * nstand/4;
+  // Index of cell (in units of cells)
+  cell_index = quadrant*quadrant_size + quadrant_index;
+  // Pol offset
+  pol_offset = 2*p1 + p0;
+  // Word index (in units of words (i.e. floats) of real component
+  index = (cell_index * num_words_per_cell) + pol_offset;
+  return index;
+  }
+
+BFstatus bfXgpuGetOrder(BFarray *antpol_to_input, BFarray *antpol_to_bl, BFarray *is_conj, int nstand, int npol) {
+  int *ip_map = (int *)antpol_to_input->data; // indexed by stand, pol
+  int *bl_map = (int *)antpol_to_bl->data;    // indexed by stand0, stand1, pol0, pol1
+  int *conj_map = (int *)is_conj->data;       // indexed by stand0, stand1, pol0, pol1
+  int s0, s1, p0, p1, i0, i1;
+  for (s0=0; s0<nstand; s0++) {
+    for (s1=0; s1<nstand; s1++) {
+      for (p0=0; p0<npol; p0++) {
+        for (p1=0; p1<npol; p1++) {
+          i0 = ip_map[npol*s0 + p0];
+          i1 = ip_map[npol*s1 + p1];
+          if (i1 >= i0) {
+            bl_map[s0*nstand*npol*npol + s1*npol*npol + p0*npol + p1] = regtile_index(i0, i1, nstand);
+            conj_map[s0*nstand*npol*npol + s1*npol*npol + p0*npol + p1] = 0;
+          } else {
+            bl_map[s0*nstand*npol*npol + s1*npol*npol + p0*npol + p1] = regtile_index(i1, i0, nstand);
+            conj_map[s0*nstand*npol*npol + s1*npol*npol + p0*npol + p1] = 1;
+          }
+        }
+      }
+    }
+  }
+  return BF_STATUS_SUCCESS;
+}
+
 } // C
diff --git a/src/bifrost/bf_xgpu.h b/src/bifrost/bf_xgpu.h
index 7b4f45b1e..6e806c0df 100644
--- a/src/bifrost/bf_xgpu.h
+++ b/src/bifrost/bf_xgpu.h
@@ -5,3 +5,4 @@ BFstatus bfXgpuInitialize(BFarray *in, BFarray *out, int gpu_dev);
 BFstatus bfXgpuCorrelate(BFarray *in, BFarray *out, int doDump);
 BFstatus bfXgpuKernel(BFarray *in, BFarray *out, int doDump);
 BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, int nchan_sum);
+BFstatus bfXgpuGetOrder(BFarray *antpol_to_input, BFarray *antpol_to_bl, BFarray *is_conj, int nstand, int npol);

From 63f84d9a1ca3c8dbeb37beda5eb0894c950a8deb Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Mon, 17 Aug 2020 11:26:01 +0000
Subject: [PATCH 34/91] Remove IBV option from UDPCapture constructor

This is no longer necessary now there is a dedicated UDPVerbsCapture class
---
 python/bifrost/packet_capture.py | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/python/bifrost/packet_capture.py b/python/bifrost/packet_capture.py
index 33c61abab..c72f000ae 100644
--- a/python/bifrost/packet_capture.py
+++ b/python/bifrost/packet_capture.py
@@ -82,7 +82,7 @@ def end(self):
 class UDPCapture(_CaptureBase):
     def __init__(self, fmt, sock, ring, nsrc, src0, max_payload_size,
                  buffer_ntime, slot_ntime, sequence_callback, core=None,
-                 ibverbs=False, interface='', port=-1):
+                 interface='', port=-1):
         try:
             fmt = fmt.encode()
         except AttributeError:
@@ -90,19 +90,11 @@ def __init__(self, fmt, sock, ring, nsrc, src0, max_payload_size,
             pass
         if core is None:
             core = -1
-        if not ibverbs:
-            BifrostObject.__init__(
-                self, _bf.bfUdpCaptureCreate, _bf.bfPacketCaptureDestroy,
-                fmt, sock.fileno(), ring.obj, nsrc, src0,
-                max_payload_size, buffer_ntime, slot_ntime,
-                sequence_callback.obj, core)
-        else:
-            print("Using IBVerbs")
-            BifrostObject.__init__(
-                self, _bf.bfIbvUdpCaptureCreate, _bf.bfPacketCaptureDestroy,
-                fmt, sock.fileno(), ring.obj, nsrc, src0,
-                max_payload_size, buffer_ntime, slot_ntime,
-                sequence_callback.obj, core)
+        BifrostObject.__init__(
+            self, _bf.bfUdpCaptureCreate, _bf.bfPacketCaptureDestroy,
+            fmt, sock.fileno(), ring.obj, nsrc, src0,
+            max_payload_size, buffer_ntime, slot_ntime,
+            sequence_callback.obj, core)
 
 class UDPSniffer(_CaptureBase):
     def __init__(self, fmt, sock, ring, nsrc, src0, max_payload_size,

From 3cb5105ec70baae091db52e6765d58bf057369e9 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Mon, 17 Aug 2020 18:01:19 +0000
Subject: [PATCH 35/91] Add function to reorder DP4A xGPU output into
 visibility matrix

---
 src/bf_xgpu.cpp       | 41 ++++++++++++++++++++++++++++++++++++++++-
 src/bifrost/bf_xgpu.h |  3 ++-
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/src/bf_xgpu.cpp b/src/bf_xgpu.cpp
index 35e68d429..34809ec64 100644
--- a/src/bf_xgpu.cpp
+++ b/src/bf_xgpu.cpp
@@ -206,11 +206,16 @@ int regtile_index(int in0, int in1, int nstand) {
   return index;
   }
 
-BFstatus bfXgpuGetOrder(BFarray *antpol_to_input, BFarray *antpol_to_bl, BFarray *is_conj, int nstand, int npol) {
+BFstatus bfXgpuGetOrder(BFarray *antpol_to_input, BFarray *antpol_to_bl, BFarray *is_conj) {
   int *ip_map = (int *)antpol_to_input->data; // indexed by stand, pol
   int *bl_map = (int *)antpol_to_bl->data;    // indexed by stand0, stand1, pol0, pol1
   int *conj_map = (int *)is_conj->data;       // indexed by stand0, stand1, pol0, pol1
   int s0, s1, p0, p1, i0, i1;
+  int nstand, npol;
+  XGPUInfo xgpu_info;
+  xgpuInfo(&xgpu_info);
+  nstand = xgpu_info.nstation;
+  npol = xgpu_info.npol;
   for (s0=0; s0<nstand; s0++) {
     for (s1=0; s1<nstand; s1++) {
       for (p0=0; p0<npol; p0++) {
@@ -231,4 +236,38 @@ BFstatus bfXgpuGetOrder(BFarray *antpol_to_input, BFarray *antpol_to_bl, BFarray
   return BF_STATUS_SUCCESS;
 }
 
+/*
+ * Reorder a DP4A xGPU spec output into something more sane, throwing
+ * away unwanted baselines and re-concatenating real and imag parts in
+ * a reasonable way.
+ */
+BFstatus bfXgpuReorder(BFarray *xgpu_output, BFarray *reordered, BFarray *baselines, BFarray *is_conjugated) {
+  XGPUInfo xgpu_info;
+  xgpuInfo(&xgpu_info);
+
+  int *output = (int *)reordered->data;
+  int *input_r = (int *)xgpu_output->data;
+  int *input_i = input_r + xgpu_info.matLength;
+  int *bl = (int *)baselines->data;
+  int *conj = (int *)is_conjugated->data;
+  int n_bl = num_contiguous_elements(baselines);
+  int xgpu_n_input = xgpu_info.nstation * xgpu_info.npol;
+  int n_chan = xgpu_info.nfrequency;
+  int i, c;
+  // number of entries per channel
+  size_t regtile_chan_len = 4 * 4 * xgpu_n_input/4 * (xgpu_n_input/4+1) / 2;
+  fprintf(stderr, "nbaselines: %d; nchans:%d\n", n_bl, n_chan);
+  for (i=0; i<n_bl; i++) {
+    for (c=0; c<n_chan; c++) {
+      output[2*i*n_chan + 2*c]     = input_r[c*regtile_chan_len + bl[i]];
+      if ( conj[i] ) {
+        output[2*i*n_chan + 2*c + 1] = input_i[c*regtile_chan_len + bl[i]];
+      } else {
+        output[2*i*n_chan + 2*c + 1] = -input_i[c*regtile_chan_len + bl[i]];
+      }
+    }
+  }
+  return BF_STATUS_SUCCESS;
+}
+
 } // C
diff --git a/src/bifrost/bf_xgpu.h b/src/bifrost/bf_xgpu.h
index 6e806c0df..872777d41 100644
--- a/src/bifrost/bf_xgpu.h
+++ b/src/bifrost/bf_xgpu.h
@@ -5,4 +5,5 @@ BFstatus bfXgpuInitialize(BFarray *in, BFarray *out, int gpu_dev);
 BFstatus bfXgpuCorrelate(BFarray *in, BFarray *out, int doDump);
 BFstatus bfXgpuKernel(BFarray *in, BFarray *out, int doDump);
 BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, int nchan_sum);
-BFstatus bfXgpuGetOrder(BFarray *antpol_to_input, BFarray *antpol_to_bl, BFarray *is_conj, int nstand, int npol);
+BFstatus bfXgpuGetOrder(BFarray *antpol_to_input, BFarray *antpol_to_bl, BFarray *is_conj);
+BFstatus bfXgpuReorder(BFarray *xgpu_output, BFarray *reordered, BFarray *baselines, BFarray *is_conjugated);

From 2eb5ff272f99eb5cfe253dd33d62129cd94f923f Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Mon, 17 Aug 2020 18:48:38 +0000
Subject: [PATCH 36/91] Remove debugging prints

---
 src/bf_xgpu.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/bf_xgpu.cpp b/src/bf_xgpu.cpp
index 34809ec64..0c55a0b9a 100644
--- a/src/bf_xgpu.cpp
+++ b/src/bf_xgpu.cpp
@@ -256,7 +256,6 @@ BFstatus bfXgpuReorder(BFarray *xgpu_output, BFarray *reordered, BFarray *baseli
   int i, c;
   // number of entries per channel
   size_t regtile_chan_len = 4 * 4 * xgpu_n_input/4 * (xgpu_n_input/4+1) / 2;
-  fprintf(stderr, "nbaselines: %d; nchans:%d\n", n_bl, n_chan);
   for (i=0; i<n_bl; i++) {
     for (c=0; c<n_chan; c++) {
       output[2*i*n_chan + 2*c]     = input_r[c*regtile_chan_len + bl[i]];

From d9f42e9cd874400ae190edcedec902ff3a64411f Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Fri, 4 Sep 2020 11:23:48 +0000
Subject: [PATCH 37/91] Fix conjugations

Fix bugs and set conjugation convention such that
given a visibility matrix in order:

[stand0, stand1, pol0, pol1]

The conjugation of the data is (stand0, pol0) * conj(stand1, pol1)
---
 src/bf_xgpu.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/bf_xgpu.cpp b/src/bf_xgpu.cpp
index 0c55a0b9a..13a014c5f 100644
--- a/src/bf_xgpu.cpp
+++ b/src/bf_xgpu.cpp
@@ -222,12 +222,14 @@ BFstatus bfXgpuGetOrder(BFarray *antpol_to_input, BFarray *antpol_to_bl, BFarray
         for (p1=0; p1<npol; p1++) {
           i0 = ip_map[npol*s0 + p0];
           i1 = ip_map[npol*s1 + p1];
-          if (i1 >= i0) {
+          // Set the conj map such that bl_map[stand0, stand1, pol0, pol1] has conjugation convention
+          // stand0,pol0 * conj(stand1,pol1)
+          if (i1 > i0) {
             bl_map[s0*nstand*npol*npol + s1*npol*npol + p0*npol + p1] = regtile_index(i0, i1, nstand);
-            conj_map[s0*nstand*npol*npol + s1*npol*npol + p0*npol + p1] = 0;
+            conj_map[s0*nstand*npol*npol + s1*npol*npol + p0*npol + p1] = 1;
           } else {
             bl_map[s0*nstand*npol*npol + s1*npol*npol + p0*npol + p1] = regtile_index(i1, i0, nstand);
-            conj_map[s0*nstand*npol*npol + s1*npol*npol + p0*npol + p1] = 1;
+            conj_map[s0*nstand*npol*npol + s1*npol*npol + p0*npol + p1] = 0;
           }
         }
       }
@@ -240,6 +242,7 @@ BFstatus bfXgpuGetOrder(BFarray *antpol_to_input, BFarray *antpol_to_bl, BFarray
  * Reorder a DP4A xGPU spec output into something more sane, throwing
  * away unwanted baselines and re-concatenating real and imag parts in
  * a reasonable way.
+ * Also remove conjugation weirdness so baselines a,b has conjugation a*conj(b)
  */
 BFstatus bfXgpuReorder(BFarray *xgpu_output, BFarray *reordered, BFarray *baselines, BFarray *is_conjugated) {
   XGPUInfo xgpu_info;
@@ -259,10 +262,10 @@ BFstatus bfXgpuReorder(BFarray *xgpu_output, BFarray *reordered, BFarray *baseli
   for (i=0; i<n_bl; i++) {
     for (c=0; c<n_chan; c++) {
       output[2*i*n_chan + 2*c]     = input_r[c*regtile_chan_len + bl[i]];
-      if ( conj[i] ) {
-        output[2*i*n_chan + 2*c + 1] = input_i[c*regtile_chan_len + bl[i]];
-      } else {
+      if ( conj[i] == 1 ) {
         output[2*i*n_chan + 2*c + 1] = -input_i[c*regtile_chan_len + bl[i]];
+      } else {
+        output[2*i*n_chan + 2*c + 1] = input_i[c*regtile_chan_len + bl[i]];
       }
     }
   }

From 42511597dee05ece560201f52f8c4ea52491d86a Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Thu, 10 Sep 2020 17:30:33 +0000
Subject: [PATCH 38/91] Fix source ID computation

---
 src/formats/snap2.hpp | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/formats/snap2.hpp b/src/formats/snap2.hpp
index 9855b8c2e..23de0c387 100644
--- a/src/formats/snap2.hpp
+++ b/src/formats/snap2.hpp
@@ -96,13 +96,13 @@ class SNAP2Decoder : virtual public PacketDecoder {
         int nchan_blocks = (be16toh(pkt_hdr->nchan_tot) / be16toh(pkt_hdr->nchan));
 
         pkt->nsrc = npol_blocks * nchan_blocks;// _nsrc;
-        pkt->src = (npol_blocks * be16toh(pkt_hdr->chan_block_id)) +  (be16toh(pkt_hdr->npol_tot) / be16toh(pkt_hdr->npol));
         pkt->nchan  = be16toh(pkt_hdr->nchan);
         pkt->chan0  = be32toh(pkt_hdr->chan_block_id) * be16toh(pkt_hdr->nchan);
         pkt->nchan_tot  = be16toh(pkt_hdr->nchan_tot);
         pkt->npol  = be16toh(pkt_hdr->npol);
         pkt->npol_tot  = be16toh(pkt_hdr->npol_tot);
         pkt->pol0  = be32toh(pkt_hdr->pol0);
+        pkt->src = (pkt->pol0 / pkt->npol) + be32toh(pkt_hdr->chan_block_id) * npol_blocks;
         pkt->payload_size = pld_size;
         pkt->payload_ptr  = pkt_pld;
         return this->valid_packet(pkt);
@@ -153,6 +153,9 @@ class SNAP2Processor : virtual public PacketProcessor {
             uint64_t *in64 = (uint64_t *)in;
             int c;
             dest_p = (__m256i *)(out + (words_per_chan_out * (pkt_chan)) + pol_offset_out);
+            //if((pol_offset_out == 0) && (pkt_chan==0) && ((pkt->seq % 120)==0) ){
+            //   fprintf(stderr, "nsrc: %d seq: %d, dest_p: %p obuf idx %d, obuf offset %lu, nseq_per_obuf %d, seq0 %d, nbuf: %d\n", pkt->nsrc, pkt->seq, dest_p, obuf_idx, obuf_offset, nseq_per_obuf, seq0, nbuf);
+            //}
             for(c=0; c<pkt->nchan; c++) {
                vecbuf[0] = _mm256_set_epi64x(in64[3], in64[2], in64[1], in64[0]);
                vecbuf[1] = _mm256_set_epi64x(in64[7], in64[6], in64[5], in64[4]);
@@ -168,14 +171,16 @@ class SNAP2Processor : virtual public PacketProcessor {
                                      int      nsrc,
                                      int      nchan,
                                      int      nseq) {
-            typedef aligned256_type otype;
-            otype* __restrict__ aligned_data = (otype*)data;
-            for( int t=0; t<nseq; ++t ) {
-                    for( int c=0; c<nchan; ++c ) {
-                            ::memset(&aligned_data[src + nsrc*(c + nchan*t)],
-                                     0, sizeof(otype));
-                    }
-            }
+            fprintf(stderr, "TRYING TO BLANK OUT A SOURCE WITH MISSING PACKETS. BUT BLANKING NOT IMPLEMENTED\n");
+            //typedef aligned256_type otype;
+            //fprintf(stderr, "You really better not be here\n");
+            //otype* __restrict__ aligned_data = (otype*)data;
+            //for( int t=0; t<nseq; ++t ) {
+            //        for( int c=0; c<nchan; ++c ) {
+            //                ::memset(&aligned_data[src + nsrc*(c + nchan*t)],
+            //                         0, sizeof(otype));
+            //        }
+            //}
     }
 };
 

From 5f2272702d6eeab9a757f127b93bd4da5d0a13f4 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Wed, 7 Oct 2020 12:14:36 +0000
Subject: [PATCH 39/91] Py3-ize pipeline2dot

---
 tools/pipeline2dot.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/pipeline2dot.py b/tools/pipeline2dot.py
index dd21a592d..c899af23c 100755
--- a/tools/pipeline2dot.py
+++ b/tools/pipeline2dot.py
@@ -63,7 +63,7 @@ def get_process_details(pid):
 
     data = {'user':'', 'cpu':0.0, 'mem':0.0, 'etime':'00:00', 'threads':0}
     try:
-        output = subprocess.check_output('ps o user,pcpu,pmem,etime,nlwp %i' % pid, shell=True)
+        output = subprocess.check_output('ps o user,pcpu,pmem,etime,nlwp %i' % pid, shell=True).decode()
         output = output.split('\n')[1]
         fields = output.split(None, 4)
         data['user'] = fields[0]
@@ -348,4 +348,3 @@ def main(args):
                         help='exclude associated blocks')
     args = parser.parse_args()
     main(args)
-    
\ No newline at end of file

From 5cc85c55cd5d1ee389cbd33040fcde428f4a3615 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Wed, 7 Oct 2020 14:24:42 +0000
Subject: [PATCH 40/91] Add option to conjugate baselines while subselecting

---
 src/bf_xgpu.cpp       | 10 ++++++++--
 src/bifrost/bf_xgpu.h |  2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/bf_xgpu.cpp b/src/bf_xgpu.cpp
index 13a014c5f..91d55c843 100644
--- a/src/bf_xgpu.cpp
+++ b/src/bf_xgpu.cpp
@@ -137,7 +137,7 @@ BFstatus bfXgpuKernel(BFarray *in, BFarray *out, int doDump) {
  * BFarray *vismap : array of visibilities in [[polA, polB], [polC, polD], ... ] form.
  * int nchan_sum: The number of frequency channels to sum over
  */
-BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, int nchan_sum) {
+BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, BFarray *conj, int nchan_sum) {
   long long unsigned nvis = num_contiguous_elements(vismap);
   int xgpu_error;
   if (in->space != BF_SPACE_CUDA) {
@@ -149,7 +149,13 @@ BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, int nchan_s
   if (vismap->space != BF_SPACE_CUDA) {
     return BF_STATUS_UNSUPPORTED_SPACE;
   }
-  xgpu_error = xgpuCudaSubSelect(&context, (Complex *)in->data, (Complex *)out->data, (int *)vismap->data, nvis, nchan_sum);
+  if (conj->space != BF_SPACE_CUDA) {
+    return BF_STATUS_UNSUPPORTED_SPACE;
+  }
+  if (num_contiguous_elements(conj) != nvis) {
+    return BF_STATUS_INVALID_SHAPE;
+  }
+  xgpu_error = xgpuCudaSubSelect(&context, (Complex *)in->data, (Complex *)out->data, (int *)vismap->data, (int *)conj->data, nvis, nchan_sum);
   if (xgpu_error != XGPU_OK) {
     fprintf(stderr, "ERROR: xgpuKernel: kernel call returned %d\n", xgpu_error);
     return BF_STATUS_INTERNAL_ERROR;
diff --git a/src/bifrost/bf_xgpu.h b/src/bifrost/bf_xgpu.h
index 872777d41..e06b5be61 100644
--- a/src/bifrost/bf_xgpu.h
+++ b/src/bifrost/bf_xgpu.h
@@ -4,6 +4,6 @@
 BFstatus bfXgpuInitialize(BFarray *in, BFarray *out, int gpu_dev);
 BFstatus bfXgpuCorrelate(BFarray *in, BFarray *out, int doDump);
 BFstatus bfXgpuKernel(BFarray *in, BFarray *out, int doDump);
-BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, int nchan_sum);
+BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, BFarray *conj, int nchan_sum);
 BFstatus bfXgpuGetOrder(BFarray *antpol_to_input, BFarray *antpol_to_bl, BFarray *is_conj);
 BFstatus bfXgpuReorder(BFarray *xgpu_output, BFarray *reordered, BFarray *baselines, BFarray *is_conjugated);

From db72993f3abf62ad6facf6df1db4c2fa58e73631 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Tue, 20 Oct 2020 15:56:52 +0000
Subject: [PATCH 41/91] Beamformer fixes and comment updates

- Add function to accumulate a single beam from a buffer of many
- Make accumulation functions take acc_len as an arg rather than reading from context
---
 src/beamform.cpp        | 17 +++++++--
 src/bifrost/beamform.h  | 52 ++++++++++++++++++++++++++-
 src/cublas_beamform.cu  | 79 +++++++++++++++++++++++++++++++++--------
 src/cublas_beamform.cuh |  3 +-
 4 files changed, 133 insertions(+), 18 deletions(-)

diff --git a/src/beamform.cpp b/src/beamform.cpp
index eb6c563e4..16d874222 100644
--- a/src/beamform.cpp
+++ b/src/beamform.cpp
@@ -59,7 +59,7 @@ BFstatus bfBeamformRun(BFarray *in, BFarray *out, BFarray *weights) {
   return BF_STATUS_SUCCESS;
 }
   
-BFstatus bfBeamformIntegrate(BFarray *in, BFarray *out) {
+BFstatus bfBeamformIntegrate(BFarray *in, BFarray *out, int ntimes_sum) {
   if (in->space != BF_SPACE_CUDA) {
     fprintf(stderr, "Beamformer input buffer must be in CUDA space\n");
     return BF_STATUS_INVALID_SPACE;
@@ -68,7 +68,20 @@ BFstatus bfBeamformIntegrate(BFarray *in, BFarray *out) {
     fprintf(stderr, "Beamformer output buffer must be in CUDA space\n");
     return BF_STATUS_INVALID_SPACE;
   }
-  cublas_beamform_integrate((float *)in->data, (float *)out->data);
+  cublas_beamform_integrate((float *)in->data, (float *)out->data, ntimes_sum);
+  return BF_STATUS_SUCCESS;
+}
+
+BFstatus bfBeamformIntegrateSingleBeam(BFarray *in, BFarray *out, int ntimes_sum, int beam_index) {
+  if (in->space != BF_SPACE_CUDA) {
+    fprintf(stderr, "Beamformer input buffer must be in CUDA space\n");
+    return BF_STATUS_INVALID_SPACE;
+  }
+  if (out->space != BF_SPACE_CUDA) {
+    fprintf(stderr, "Beamformer output buffer must be in CUDA space\n");
+    return BF_STATUS_INVALID_SPACE;
+  }
+  cublas_beamform_integrate_single_beam((float *)in->data, (float *)out->data, ntimes_sum, beam_index);
   return BF_STATUS_SUCCESS;
 }
 } // C
diff --git a/src/bifrost/beamform.h b/src/bifrost/beamform.h
index 1da1937a9..890b727da 100644
--- a/src/bifrost/beamform.h
+++ b/src/bifrost/beamform.h
@@ -1,6 +1,16 @@
 #include <bifrost/common.h>
 #include <bifrost/array.h>
 
+/*
+ * gpudev: GPU device ID to use
+ * ninputs: Number of inputs (single-polarization) to the beamformer
+ * nchans: Number of frequency channels
+ * ntimes: Number of time samples per beamforming call
+ * nbeams: Number of beams to generate. If using ntime_blocks > 0, beams=N will deliver
+ * ntime_blocks: Number of time blocks to output. Eg. if ntimes=1000 and ntime_blocks=10, the beamformer
+                 will integrate over 100 samples per call. Set to 0 for no accumulation, in which case
+                 raw beam voltages are output.
+ */
 BFstatus bfBeamformInitialize(
   int gpudev,
   int ninputs,
@@ -10,13 +20,53 @@ BFstatus bfBeamformInitialize(
   int ntime_blocks
 );
 
+/*
+ * in:  Pointer to ntime x nchan x ninputs x 4+4 bit data block
+ * out: Pointer to output data.
+ *        If ntime_blocks > 0:
+ *          For the purposes of generating dynamic spectra, beam 2n and 2n+1 are considered
+ *          to be two pols of the same pointing, and are cross-multipled and summed over
+ *          ntimes/ntime_blocks to form the output array:
+ *            nbeam/2 x ntime_blocks x nchan x 4 x float32 (powers, XX, YY, re(XY, im(XY))
+ *          Note that this means for N dual-pol beam pointings, the beamformer should be
+ *          constructed with nbeams=2N. This isn't very efficient, but makes it easy to deal
+ *          with arbitrary polarization orderings in the input buffer (suitable beamforming
+ *          coefficients can make appropriate single-pol beam pairs).
+ *        If ntime_blocks = 0:
+ *          Data are returned as voltages, in order:
+ *            ntimes x nchan x nbeam x complex64 beamformer block
+ *                            
+ * weights -- pointer to nbeams x nchans x ninputs x complex64 weights
+ */
 BFstatus bfBeamformRun(
   BFarray *in,
   BFarray *out,
   BFarray *weights
 );
 
+/*
+ * Take the output of bfBeamformRun with ntime_blocks = 0, and perform transposing and integration
+ * of data, to deliver a time integrated dual-pol dynamic spectra of the form:
+ *   nbeam/2 x ntime/ntimes_sum x nchan x 4 x float32 (powers, XX, YY, re(XY, im(XY))
+ * I.e., the format which would be returned by bfBeamformRun is ntime_blocks > 0
+ */
 BFstatus bfBeamformIntegrate(
   BFarray *in,
-  BFarray *out
+  BFarray *out,
+  int ntimes_sum,
+);
+
+/*
+ * Take the output of bfBeamformRun with ntime_blocks = 0, and 
+ * deliver a time integrated dual-pol dynamic spectra for a single beam of the form:
+ *   ntime/ntimes_sum x nchan x 4 x float32 (powers, XX, YY, re(XY, im(XY))
+ * 
+ * ntime_sum: the number of times to integrate
+ * beam_index: The beam to select (if beam_index=N, beams N and N+1 will be used as a polarization pair)
+ */
+BFstatus bfBeamformIntegrateSingleBeam(
+  BFarray *in,
+  BFarray *out,
+  int ntimes_sum,
+  int beam_index
 );
diff --git a/src/cublas_beamform.cu b/src/cublas_beamform.cu
index 94f095850..4be7aedb6 100644
--- a/src/cublas_beamform.cu
+++ b/src/cublas_beamform.cu
@@ -54,16 +54,16 @@ __global__ void trans_output_and_sum(float *in,
   int chan = blockIdx.x;
   int beam = blockIdx.y;
   int time = threadIdx.x;
-  long long int old_index = chan*n_beam*n_time*2 + beam*n_time*2 + time*n_time_sum; // start index for n_time/n_time_sum samples
+  long long int old_index = chan*n_beam*n_time*2 + beam*n_time*2 + time*n_time_sum*2; // start index for n_time/n_time_sum samples
   long long int new_index = beam*(n_time / n_time_sum)*n_chan + time*n_chan + chan;
   float xx=0., yy=0., xy_r=0., xy_i=0.;
   float x_r, x_i, y_r, y_i;
   int t;
   for (t=0; t<n_time_sum; t++) {
-    x_r = in[2*old_index + 2*t];
-    x_i = in[2*old_index + 2*t + 1];
-    y_r = in[2*old_index + n_time + 2*t];
-    y_i = in[2*old_index + n_time + 2*t + 1];
+    x_r = in[old_index + 2*t];
+    x_i = in[old_index + 2*t + 1];
+    y_r = in[old_index + 2*n_time + 2*t];
+    y_i = in[old_index + 2*n_time + 2*t + 1];
     xx = xx + x_r*x_r + x_i*x_i;
     yy = yy + y_r*y_r + y_i*y_i;
     xy_r = xy_r + x_r*y_r + x_i*y_i;
@@ -75,6 +75,42 @@ __global__ void trans_output_and_sum(float *in,
   out[4*new_index+3] = xy_i;
 }
 
+// Take an input of order chan x beam x pol x time x 32+32 float and generate
+// a single beam of time[part-summed] x chan x [XX,YY,XY*_r,XY*_i] x 32 float
+// Each thread deals with two pols of a beam (beam_index and beam_index+1)
+// and sums over n_time_sum time samples
+__global__ void trans_output_and_sum_single_beam(float *in,
+                                                 float *out,
+                                                 int n_chan,
+                                                 int n_beam,
+                                                 int n_time,
+                                                 int n_time_sum,
+                                                 int beam_index
+                                                ) {
+  int chan = blockIdx.x;
+  int beam = beam_index;
+  int time = threadIdx.x;
+  long long int old_index = chan*n_beam*n_time*2 + beam*n_time*2 + time*n_time_sum*2; // start index for n_time/n_time_sum samples
+  long long int new_index = time*n_chan*4 + chan*4;
+  float xx=0., yy=0., xy_r=0., xy_i=0.;
+  float x_r, x_i, y_r, y_i;
+  int t;
+  for (t=0; t<n_time_sum; t++) {
+    x_r = in[old_index + 2*t];
+    x_i = in[old_index + 2*t + 1];
+    y_r = in[old_index + 2*n_time + 2*t];
+    y_i = in[old_index + 2*n_time + 2*t + 1];
+    xx = xx + x_r*x_r + x_i*x_i;
+    yy = yy + y_r*y_r + y_i*y_i;
+    xy_r = xy_r + x_r*y_r + x_i*y_i;
+    xy_i = xy_i + x_i*y_r - x_r*y_i;
+  }
+  out[new_index] = xx;
+  out[new_index+1] = yy;
+  out[new_index+2] = xy_r;
+  out[new_index+3] = xy_i;
+}
+
 __global__ void complex2pow(float *in, float *out, int N) {
   int tid = blockDim.x * blockIdx.x + threadIdx.x;
   if (tid < N) {
@@ -126,11 +162,11 @@ struct beamform_context {
   cublasHandle_t handle;
   cudaStream_t stream;
   // GPU buffers for intermediate data products
-  float *in32_d; // Reordered, floatified 4-bit input data
-  float *out_d;  // CUBLAS beamformer output
+  float *in32_d; // Reordered [to inputs x chans x times x complexity], floatified 4-bit input data
+  float *out_d;  // CUBLAS beamformer output [ntime x nchan x nbeam x complexity]
   float *weights_d; // Beamformer coefficients
   unsigned char *in4_d;     // 4-bit input data
-  float *sum_out_d; // Time-summed output data
+  float *sum_out_d; // Time-summed output data, used only if ntimeblocks > 0
   int ninputs;   // Number of inputs (ants * pols)
   int npols;     // Number of polarizations per antenna
   int nchans;    // Number of channels input
@@ -167,7 +203,7 @@ void cublas_beamform_init(int device, int ninputs, int nchans, int ntimes, int n
   // Internally allocate intermediate buffers
   gpuErrchk( cudaMalloc(&context.in32_d,  ninputs * nchans * ntimes * 2 * sizeof(float)) );
   // If the context is initialized with ntimeblocks=0, then we do no summing so don't
-  // need the intermediate buffer
+  // need the intermediate buffer allocated internally.
   if (ntimeblocks > 0) {
     gpuErrchk( cudaMalloc(&context.out_d,   ntimes * nchans * nbeams * 2 * sizeof(float)) );
   }
@@ -188,7 +224,7 @@ void cublas_beamform(unsigned char *in4_d, float *out_d, float *weights_d) {
   cudaStreamSynchronize(context.stream);
 
   // If we are integrating beam powers, put the
-  // GEM output in the context-defined intermediate
+  // GEM output in the internal intermediate
   // buffer. If not, then write beamformer output
   // to the address given by the user.
   float *gem_out_d;
@@ -242,6 +278,7 @@ void cublas_beamform(unsigned char *in4_d, float *out_d, float *weights_d) {
   if (context.ntimeblocks > 0) {
     // Create XX, YY, XY beam powers.
     // Sum over `ntimes_sum` samples
+    // Write to the user-provided output buffer
     int ntimes_sum = context.ntimes / context.ntimeblocks;
     dim3 sumBlockGrid(context.nchans, context.nbeams/2);
     dim3 sumThreadGrid(context.ntimes / ntimes_sum);
@@ -257,13 +294,12 @@ void cublas_beamform(unsigned char *in4_d, float *out_d, float *weights_d) {
   }
 }
 
-void cublas_beamform_integrate(float *in_d, float *out_d) {
+void cublas_beamform_integrate(float *in_d, float *out_d, int ntimes_sum) {
   // Create XX, YY, XY beam powers.
   // Sum over `ntimes_sum` samples
-  int ntimes_sum = context.ntimes / context.ntimeblocks;
   dim3 sumBlockGrid(context.nchans, context.nbeams/2);
   dim3 sumThreadGrid(context.ntimes / ntimes_sum);
-  trans_output_and_sum<<<sumBlockGrid, sumThreadGrid, 0, context.stream>>>(
+  trans_output_and_sum<<<sumBlockGrid, sumThreadGrid>>>(
     in_d,
     out_d,
     context.nchans,
@@ -271,5 +307,20 @@ void cublas_beamform_integrate(float *in_d, float *out_d) {
     context.ntimes,
     ntimes_sum
   );
-  cudaStreamSynchronize(context.stream);
+}
+
+void cublas_beamform_integrate_single_beam(float *in_d, float *out_d, int ntimes_sum, int beam_index) {
+  // Create XX, YY, XY beam powers.
+  // Sum over `ntimes_sum` samples
+  dim3 sumBlockGrid(context.nchans);
+  dim3 sumThreadGrid(context.ntimes / ntimes_sum);
+  trans_output_and_sum_single_beam<<<sumBlockGrid, sumThreadGrid>>>(
+    in_d,
+    out_d,
+    context.nchans,
+    context.nbeams/2,
+    context.ntimes,
+    ntimes_sum,
+    beam_index
+  );
 }
diff --git a/src/cublas_beamform.cuh b/src/cublas_beamform.cuh
index 6a0d2b22a..1e9df948f 100644
--- a/src/cublas_beamform.cuh
+++ b/src/cublas_beamform.cuh
@@ -30,7 +30,8 @@ __global__ void complex2pow(float *in, float *out, int N);
 
 void cublas_beamform_destroy();
 void cublas_beamform(unsigned char *in4_d, float *sum_out_d, float *weights_d);
-void cublas_beamform_integrate(float *in_d, float *sum_out_d);
+void cublas_beamform_integrate(float *in_d, float *sum_out_d, int ntimes_sum);
+void cublas_beamform_integrate_single_beam(float *in_d, float *sum_out_d, int ntimes_sum, int beam_index);
 void cublas_beamform_init(int device, int ninputs, int nchans, int ntimes, int nbeams, int ntimeblocks);
 
 #endif

From 246af1c0c9a44701678e9227eda881e6f0769111 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Wed, 28 Oct 2020 12:09:18 +0000
Subject: [PATCH 42/91] Add LWA352 voltage packet output

---
 src/formats/formats.hpp      |  1 +
 src/formats/lwa352_vbeam.hpp | 57 ++++++++++++++++++++++++++++++++++++
 src/packet_writer.hpp        | 19 ++++++++++++
 3 files changed, 77 insertions(+)
 create mode 100644 src/formats/lwa352_vbeam.hpp

diff --git a/src/formats/formats.hpp b/src/formats/formats.hpp
index 6e2f4c527..79dd5fd4e 100644
--- a/src/formats/formats.hpp
+++ b/src/formats/formats.hpp
@@ -36,3 +36,4 @@
 #include "tbf.hpp"
 #include "ibeam.hpp"
 #include "snap2.hpp"
+#include "lwa352_vbeam.hpp"
diff --git a/src/formats/lwa352_vbeam.hpp b/src/formats/lwa352_vbeam.hpp
new file mode 100644
index 000000000..8cccef753
--- /dev/null
+++ b/src/formats/lwa352_vbeam.hpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2019, The Bifrost Authors. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * * Neither the name of The Bifrost Authors nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include "base.hpp"
+
+#pragma pack(1)
+struct lwa352_vbeam_hdr_type {
+    uint64_t sync_word;
+    uint64_t sync_time;
+    uint64_t time_tag;
+    double   bw_hz;
+    double   sfreq;
+    uint32_t nchan;
+    uint32_t chan0;
+    uint32_t npol;
+};
+
+class LWA352VBeamHeaderFiller : virtual public PacketHeaderFiller {
+public:
+    inline int get_size() { return sizeof(lwa352_vbeam_hdr_type); }
+    inline void operator()(const PacketDesc* hdr_base,
+                           BFoffset          framecount,
+                           char*             hdr) {
+        lwa352_vbeam_hdr_type* header = reinterpret_cast<lwa352_vbeam_hdr_type*>(hdr);
+        memset(header, 0, sizeof(lwa352_vbeam_hdr_type));
+        
+        header->sync_word        = 0xAABBCCDD00000000L;
+        header->time_tag         = htobe64(hdr_base->seq);
+    }
+};
diff --git a/src/packet_writer.hpp b/src/packet_writer.hpp
index dbdc70e84..527505680 100644
--- a/src/packet_writer.hpp
+++ b/src/packet_writer.hpp
@@ -320,6 +320,18 @@ class BFpacketwriter_tbf_impl : public BFpacketwriter_impl {
     }
 };
 
+class BFpacketwriter_lwa352_vbeam_impl : public BFpacketwriter_impl {
+    ProcLog            _type_log;
+public:
+    inline BFpacketwriter_lwa352_vbeam_impl(PacketWriterThread* writer,
+                                   int                 nsamples)
+     : BFpacketwriter_impl(writer, nullptr, nsamples, BF_DTYPE_CF32),
+       _type_log((std::string(writer->get_name())+"/type").c_str()) {
+        _filler = new LWA352VBeamHeaderFiller();
+        _type_log.update("type : %s\n", "tbf");
+    }
+};
+
 BFstatus BFpacketwriter_create(BFpacketwriter* obj,
                                const char*     format,
                                int             fd,
@@ -351,6 +363,10 @@ BFstatus BFpacketwriter_create(BFpacketwriter* obj,
         nsamples = 4096;
     } else if( format == std::string("tbf") ) {
         nsamples = 6144;
+    } else if( std::string(format).substr(0, 13) == std::string("lwa352_vbeam_") ) {
+        // e.g. "lwa352_vbeam_184" is a 184-channel voltage beam"
+        int nchan = std::atoi((std::string(format).substr(13, std::string(format).length())).c_str());
+        nsamples = 2*nchan; // 2 polarizations. Natively 32-bit floating complex (see implementation class)
     }
     
     PacketWriterMethod* method;
@@ -390,6 +406,9 @@ BFstatus BFpacketwriter_create(BFpacketwriter* obj,
     } else if( format == std::string("tbf") ) {
         BF_TRY_RETURN_ELSE(*obj = new BFpacketwriter_tbf_impl(writer, nsamples),
                            *obj = 0);
+    } else if( std::string(format).substr(0, 13) == std::string("lwa352_vbeam_") ) {
+        BF_TRY_RETURN_ELSE(*obj = new BFpacketwriter_lwa352_vbeam_impl(writer, nsamples),
+                           *obj = 0);
     } else {
         return BF_STATUS_UNSUPPORTED;
     }

From 9c693fd2d60f0f119bd0249602ead5c0bbaa20f3 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Tue, 17 Nov 2020 18:55:00 +0000
Subject: [PATCH 43/91] Syntax error in BfBeamformIntegrate declaration

---
 src/bifrost/beamform.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/bifrost/beamform.h b/src/bifrost/beamform.h
index 890b727da..296336008 100644
--- a/src/bifrost/beamform.h
+++ b/src/bifrost/beamform.h
@@ -53,7 +53,7 @@ BFstatus bfBeamformRun(
 BFstatus bfBeamformIntegrate(
   BFarray *in,
   BFarray *out,
-  int ntimes_sum,
+  int ntimes_sum
 );
 
 /*

From d10f38aac7a65f3260c102868e060c69e36f3151 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Tue, 15 Dec 2020 20:20:50 +0000
Subject: [PATCH 44/91] Header docstring

---
 src/bifrost/beamform.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/bifrost/beamform.h b/src/bifrost/beamform.h
index 296336008..7a5e4d297 100644
--- a/src/bifrost/beamform.h
+++ b/src/bifrost/beamform.h
@@ -6,7 +6,8 @@
  * ninputs: Number of inputs (single-polarization) to the beamformer
  * nchans: Number of frequency channels
  * ntimes: Number of time samples per beamforming call
- * nbeams: Number of beams to generate. If using ntime_blocks > 0, beams=N will deliver
+ * nbeams: Number of beams to generate. If using ntime_blocks > 0, beams=N will deliver N/2 beams.
+ *         (See bfBeamformRun)
  * ntime_blocks: Number of time blocks to output. Eg. if ntimes=1000 and ntime_blocks=10, the beamformer
                  will integrate over 100 samples per call. Set to 0 for no accumulation, in which case
                  raw beam voltages are output.

From fb7db55a119ba4e5e446977ff10b848412b04c69 Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Thu, 11 Feb 2021 13:11:54 +0000
Subject: [PATCH 45/91] Fix mismerge

Put UDPVerbsCapture back in after mismerging it out
---
 python/bifrost/packet_capture.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/python/bifrost/packet_capture.py b/python/bifrost/packet_capture.py
index b25e8b1e4..7a4ad2594 100644
--- a/python/bifrost/packet_capture.py
+++ b/python/bifrost/packet_capture.py
@@ -107,6 +107,17 @@ def __init__(self, fmt, sock, ring, nsrc, src0, max_payload_size,
             sequence_callback.obj, core)
 
 class UDPSniffer(_CaptureBase):
+    def __init__(self, fmt, sock, ring, nsrc, src0, max_payload_size,
+                 buffer_ntime, slot_ntime, sequence_callback, core=None):
+        if core is None:
+            core = -1
+        BifrostObject.__init__(
+            self, _bf.bfUdpSnifferCreate, _bf.bfPacketCaptureDestroy,
+            fmt, sock.fileno(), ring.obj, nsrc, src0,
+            max_payload_size, buffer_ntime, slot_ntime,
+            sequence_callback.obj, core)
+
+class UDPVerbsCapture(_CaptureBase):
     def __init__(self, fmt, sock, ring, nsrc, src0, max_payload_size,
                  buffer_ntime, slot_ntime, sequence_callback, core=None):
         try:

From 72791eeed0d50671b692c4126f02e9d8d8ba2ca0 Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Tue, 11 May 2021 14:00:59 +0000
Subject: [PATCH 46/91] Re-enable NUMA memory

---
 user.mk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/user.mk b/user.mk
index edea22fbd..27d469d71 100644
--- a/user.mk
+++ b/user.mk
@@ -31,7 +31,7 @@ ALIGNMENT ?= 4096 # Memory allocation alignment
 #NOCUDA     = 1 # Disable CUDA support
 #ANY_ARCH   = 1 # Disable native architecture compilation
 #CUDA_DEBUG = 1 # Enable CUDA debugging (nvcc -G)
-#NUMA       = 1 # Enable use of numa library for setting affinity of ring memory
+NUMA       = 1 # Enable use of numa library for setting affinity of ring memory
 #HWLOC      = 1 # Enable use of hwloc library for memory binding in udp_capture
 #VMA        = 1 # Enable use of Mellanox libvma in udp_capture
 XGPU       = 1 # build xGPU integrations (requires the xGPU library)

From aa70eed212195d7f436e9484d5c2626c40cc04fe Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Tue, 11 May 2021 14:22:33 +0000
Subject: [PATCH 47/91] Add back in snap2 packet callback

Got lost in the wash somewhere
---
 python/bifrost/packet_capture.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/bifrost/packet_capture.py b/python/bifrost/packet_capture.py
index 7a4ad2594..df7024ce9 100644
--- a/python/bifrost/packet_capture.py
+++ b/python/bifrost/packet_capture.py
@@ -42,6 +42,10 @@ def set_chips(self, fnc):
         self._ref_cache['chips'] = _bf.BFpacketcapture_chips_sequence_callback(fnc)
         _check(_bf.bfPacketCaptureCallbackSetCHIPS(
                self.obj, self._ref_cache['chips']))
+    def set_snap2(self, fnc):
+        self._ref_cache['snap2'] = _bf.BFpacketcapture_snap2_sequence_callback(fnc)
+        _check(_bf.bfPacketCaptureCallbackSetSNAP2(
+               self.obj, self._ref_cache['snap2']))
     def set_ibeam(self, fnc):
         self._ref_cache['ibeam'] = _bf.BFpacketcapture_ibeam_sequence_callback(fnc)
         _check(_bf.bfPacketCaptureCallbackSetIBeam(

From 08f8ed869a9f3785e33f5c6d2bb038cebfc4a7c3 Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Tue, 11 May 2021 17:42:31 +0000
Subject: [PATCH 48/91] Merge fix again -- helps if ibverbs actually uses
 ibverbs(!)

---
 python/bifrost/packet_capture.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/bifrost/packet_capture.py b/python/bifrost/packet_capture.py
index df7024ce9..77b22fcc1 100644
--- a/python/bifrost/packet_capture.py
+++ b/python/bifrost/packet_capture.py
@@ -133,7 +133,7 @@ def __init__(self, fmt, sock, ring, nsrc, src0, max_payload_size,
         if core is None:
             core = -1
         BifrostObject.__init__(
-            self, _bf.bfUdpSnifferCreate, _bf.bfPacketCaptureDestroy,
+            self, _bf.bfUdpVerbsCaptureCreate, _bf.bfPacketCaptureDestroy,
             fmt, sock.fileno(), ring.obj, nsrc, src0,
             max_payload_size, buffer_ntime, slot_ntime,
             sequence_callback.obj, core)

From 91e57fe4caa94266092a3cc2ed946f281e8c9a0e Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Fri, 14 May 2021 15:47:06 +0000
Subject: [PATCH 49/91] Add some debug prints

---
 src/formats/snap2.hpp | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/src/formats/snap2.hpp b/src/formats/snap2.hpp
index 23de0c387..ea3d04e55 100644
--- a/src/formats/snap2.hpp
+++ b/src/formats/snap2.hpp
@@ -64,13 +64,13 @@ struct snap2_hdr_type {
 class SNAP2Decoder : virtual public PacketDecoder {
 protected:
     inline bool valid_packet(const PacketDesc* pkt) const {
-#if BFSNAP2_DEBUG
-        cout << "seq: "<< pkt->seq << endl;
-        cout << "src: "<< pkt->src << endl;
-        cout << "nsrc: "<< pkt->nsrc << endl;
-        cout << "nchan: "<< pkt->nchan << endl;
-        cout << "chan0: "<< pkt->chan0 << endl;
-#endif
+//#if BF_SNAP2_DEBUG
+//        cout << "seq: "<< pkt->seq << endl;
+//        cout << "src: "<< pkt->src << endl;
+//        cout << "nsrc: "<< pkt->nsrc << endl;
+//        cout << "nchan: "<< pkt->nchan << endl;
+//        cout << "chan0: "<< pkt->chan0 << endl;
+//#endif
         return ( 
                  pkt->seq >= 0
                  && pkt->src >= 0
@@ -92,6 +92,12 @@ class SNAP2Decoder : virtual public PacketDecoder {
         int                   pld_size = pkt_size - sizeof(snap2_hdr_type);
         pkt->seq   = be64toh(pkt_hdr->seq);
         pkt->time_tag = be32toh(pkt_hdr->sync_time);
+#if BF_SNAP2_DEBUG
+	fprintf(stderr, "seq: %lu\t", pkt->seq);
+	fprintf(stderr, "sync_time: %lu\t", pkt->time_tag);
+	fprintf(stderr, "nchan: %lu\t", be16toh(pkt_hdr->nchan));
+	fprintf(stderr, "npol: %lu\t", be16toh(pkt_hdr->npol));
+#endif
         int npol_blocks  = (be16toh(pkt_hdr->npol_tot) / be16toh(pkt_hdr->npol));
         int nchan_blocks = (be16toh(pkt_hdr->nchan_tot) / be16toh(pkt_hdr->nchan));
 
@@ -105,6 +111,15 @@ class SNAP2Decoder : virtual public PacketDecoder {
         pkt->src = (pkt->pol0 / pkt->npol) + be32toh(pkt_hdr->chan_block_id) * npol_blocks;
         pkt->payload_size = pld_size;
         pkt->payload_ptr  = pkt_pld;
+#if BF_SNAP2_DEBUG
+	fprintf(stderr, "nsrc: %lu\t", pkt->nsrc);
+	fprintf(stderr, "src: %lu\t", pkt->src);
+	fprintf(stderr, "chan0: %lu\t", pkt->chan0);
+	fprintf(stderr, "chan_block_id: %lu\t", be32toh(pkt_hdr->chan_block_id));
+	fprintf(stderr, "nchan_tot: %lu\t", pkt->nchan_tot);
+	fprintf(stderr, "npol_tot: %lu\t", pkt->npol_tot);
+	fprintf(stderr, "pol0: %lu\n", pkt->pol0);
+#endif
         return this->valid_packet(pkt);
     }
 };

From 4fb0d0b7ba1d3ae5bdd744215e752007d97486ed Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Fri, 14 May 2021 16:02:49 +0000
Subject: [PATCH 50/91] Enable HWLOC

This makes a pretty big difference at high packet RX rates
---
 user.mk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/user.mk b/user.mk
index 27d469d71..b2d4dbdca 100644
--- a/user.mk
+++ b/user.mk
@@ -32,7 +32,7 @@ ALIGNMENT ?= 4096 # Memory allocation alignment
 #ANY_ARCH   = 1 # Disable native architecture compilation
 #CUDA_DEBUG = 1 # Enable CUDA debugging (nvcc -G)
 NUMA       = 1 # Enable use of numa library for setting affinity of ring memory
-#HWLOC      = 1 # Enable use of hwloc library for memory binding in udp_capture
+HWLOC      = 1 # Enable use of hwloc library for memory binding in udp_capture
 #VMA        = 1 # Enable use of Mellanox libvma in udp_capture
 XGPU       = 1 # build xGPU integrations (requires the xGPU library)
 VERBS      = 1 # Enable use of IB verbs with udp_verbs_capture

From 7aa790e17bfcd74206330ae98d12152943662dfd Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Wed, 19 May 2021 15:22:08 +0000
Subject: [PATCH 51/91] Allow (hopefully) buffer size and slot size to be
 different for SNAP2 packets

---
 src/packet_capture.cpp | 3 ++-
 src/packet_capture.hpp | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/packet_capture.cpp b/src/packet_capture.cpp
index 0e18dde4d..115d72505 100644
--- a/src/packet_capture.cpp
+++ b/src/packet_capture.cpp
@@ -94,7 +94,8 @@ int PacketCaptureThread::run(uint64_t seq_beg,
             // If lots [TODO: what is lots] of packets are late
             // return. Otherwise a seq reset can lead to being stuck
             // here endlessly counting late packets.
-            if( less_than(_pkt.seq + nseq_per_obuf, seq_beg) ) {
+            if( less_than(_pkt.seq + 1000*nseq_per_obuf, seq_beg) ) {
+		   		fprintf(stderr, "Breaking from packet receive because of so many late packets\n"); 
 		        _have_pkt = true;
                 ret = CAPTURE_SUCCESS;
                 break;
diff --git a/src/packet_capture.hpp b/src/packet_capture.hpp
index ace8d2522..0314b900d 100644
--- a/src/packet_capture.hpp
+++ b/src/packet_capture.hpp
@@ -686,7 +686,8 @@ class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
         // packet isn't from the next block
         // TODO. Is this actually reasonable? Does it recover from upstream resyncs?
         bool is_new_seq = false;
-        if ( (_last_time_tag != pkt->time_tag) || (pkt->seq != _last_seq + _slot_ntime) ) {
+        if ( (_last_time_tag != pkt->time_tag) || (pkt->seq != _last_seq + _nseq_per_buf) ) {
+	    //fprintf(stderr, "packet seq was %lu. Expected %lu + %lu\r\n", pkt->seq, _last_seq, _nseq_per_buf);
             is_new_seq = true;
             this->flush();
         }

From aebba8211717d9c20f176e3addfad4fbaae17541 Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Thu, 20 May 2021 14:05:19 +0000
Subject: [PATCH 52/91] Tweak sequence-has-changed condition

Allow two slots of loss before considering the sequence new.
---
 src/packet_capture.hpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/packet_capture.hpp b/src/packet_capture.hpp
index 0314b900d..678fa9397 100644
--- a/src/packet_capture.hpp
+++ b/src/packet_capture.hpp
@@ -687,9 +687,13 @@ class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
         // TODO. Is this actually reasonable? Does it recover from upstream resyncs?
         bool is_new_seq = false;
         if ( (_last_time_tag != pkt->time_tag) || (pkt->seq != _last_seq + _nseq_per_buf) ) {
-	    //fprintf(stderr, "packet seq was %lu. Expected %lu + %lu\r\n", pkt->seq, _last_seq, _nseq_per_buf);
-            is_new_seq = true;
-            this->flush();
+	    	// We could have a packet sequence number which isn't what we expect
+	    	// but is only wrong because of missing packets. Set an upper bound on
+	    	// two slots of loss
+	    	if (pkt->seq > _last_seq + 2*_slot_ntime) {
+			    is_new_seq = true;
+				this->flush();
+	    	}
         }
         _last_seq = pkt->seq;
 	    return is_new_seq;

From 39d133bdb22e054e2d1750835f124616e67b3480 Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Tue, 8 Jun 2021 15:42:10 +0000
Subject: [PATCH 53/91] Remove pring when trying to blank packets

---
 src/formats/snap2.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/formats/snap2.hpp b/src/formats/snap2.hpp
index ea3d04e55..8d5420830 100644
--- a/src/formats/snap2.hpp
+++ b/src/formats/snap2.hpp
@@ -186,7 +186,7 @@ class SNAP2Processor : virtual public PacketProcessor {
                                      int      nsrc,
                                      int      nchan,
                                      int      nseq) {
-            fprintf(stderr, "TRYING TO BLANK OUT A SOURCE WITH MISSING PACKETS. BUT BLANKING NOT IMPLEMENTED\n");
+            //fprintf(stderr, "TRYING TO BLANK OUT A SOURCE WITH MISSING PACKETS. BUT BLANKING NOT IMPLEMENTED\n");
             //typedef aligned256_type otype;
             //fprintf(stderr, "You really better not be here\n");
             //otype* __restrict__ aligned_data = (otype*)data;

From bcbd25169bd6d9ce9b78b746530cd29dc5a49a2e Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Tue, 22 Jun 2021 19:42:50 +0000
Subject: [PATCH 54/91] Remove _timeout use (which causes segfaults?)

---
 src/ib_verbs.cpp | 3 ++-
 src/ib_verbs.hpp | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/ib_verbs.cpp b/src/ib_verbs.cpp
index 19e2bc67b..7742e706a 100644
--- a/src/ib_verbs.cpp
+++ b/src/ib_verbs.cpp
@@ -545,7 +545,8 @@ int Verbs::recv_packet(uint8_t** pkt_ptr, int flags) {
         }
     }
     while( _verbs.pkt_batch == NULL ) {
-        _verbs.pkt_batch = this->receive(_timeout);
+        //_verbs.pkt_batch = this->receive(_timeout);
+        _verbs.pkt_batch = this->receive(1);
         _verbs.pkt = _verbs.pkt_batch;
     }
     // IBV returns Eth/UDP/IP headers. Strip them off here.
diff --git a/src/ib_verbs.hpp b/src/ib_verbs.hpp
index 3e52a343d..7468b834b 100644
--- a/src/ib_verbs.hpp
+++ b/src/ib_verbs.hpp
@@ -102,7 +102,7 @@ struct bf_ibv {
 class Verbs : public BoundThread {
     int    _fd;
     size_t _pkt_size_max;
-    int    _timeout;
+    //int    _timeout;
     bf_ibv _verbs;
     
     void get_interface_name(char* name) {
@@ -238,7 +238,7 @@ class Verbs : public BoundThread {
     Verbs(int fd, size_t pkt_size_max, int core)
         : BoundThread(core), _fd(fd), _pkt_size_max(pkt_size_max) {
             ::memset(&_verbs, 0, sizeof(_verbs));
-            _timeout = get_timeout_ms();
+            //_timeout = get_timeout_ms();
             
             create_context();
             create_buffers(pkt_size_max);

From 8c5c52d433c255829d8de69ed5378f41d9e0c727 Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Tue, 29 Jun 2021 13:15:33 +0000
Subject: [PATCH 55/91] Hack to pass through chan0 to bifrost pipeline

chan0 is used internally for the channel number within a
single pipeline -- i.e., it is always 0 for the first channel
in a given packet stream. Rather than messing with this, use
the "tuning" packet header field to pass the actual channel
number to the bifrost capture callback
---
 src/formats/snap2.hpp  | 1 +
 src/packet_capture.hpp | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/formats/snap2.hpp b/src/formats/snap2.hpp
index 8d5420830..a8fc7350f 100644
--- a/src/formats/snap2.hpp
+++ b/src/formats/snap2.hpp
@@ -101,6 +101,7 @@ class SNAP2Decoder : virtual public PacketDecoder {
         int npol_blocks  = (be16toh(pkt_hdr->npol_tot) / be16toh(pkt_hdr->npol));
         int nchan_blocks = (be16toh(pkt_hdr->nchan_tot) / be16toh(pkt_hdr->nchan));
 
+	pkt->tuning = be32toh(pkt_hdr->chan0); // Abuse this so we can use chan0 to reference channel within pipeline
         pkt->nsrc = npol_blocks * nchan_blocks;// _nsrc;
         pkt->nchan  = be16toh(pkt_hdr->nchan);
         pkt->chan0  = be32toh(pkt_hdr->chan_block_id) * be16toh(pkt_hdr->nchan);
diff --git a/src/packet_capture.hpp b/src/packet_capture.hpp
index be06c7f4b..a554217cd 100644
--- a/src/packet_capture.hpp
+++ b/src/packet_capture.hpp
@@ -710,7 +710,7 @@ class BFpacketcapture_snap2_impl : public BFpacketcapture_impl {
         
 	    if( _sequence_callback ) {
 	        int status = (*_sequence_callback)(*seq0,
-			                                   _chan0,
+			                                   pkt->tuning, // Hacked to contain chan0
 			                                   _nchan,
 			                                   _nsrc,
 			                                   time_tag,

From d463a2537bae8b1c7f388fdd74cccbedc2d68def Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Tue, 13 Jul 2021 19:04:34 +0000
Subject: [PATCH 56/91] Py3 re-fixes

---
 tools/like_top.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/like_top.py b/tools/like_top.py
index 082047424..37dc0bde0 100755
--- a/tools/like_top.py
+++ b/tools/like_top.py
@@ -192,7 +192,7 @@ def get_gpu_memory_usage():
         pass
     else:
         # Parse the ouptut and turn everything into something useful, if possible
-        lines = output.decode().split('\n')[:-1]
+        lines = output.split('\n')[:-1]
         for line in lines:
             used, total, free, draw, limit, load = line.split(',')
             data['devCount'] += 1

From 55784eeeef8b0e9315b0b8b72d569cb3846d7940 Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Thu, 15 Jul 2021 16:25:24 +0000
Subject: [PATCH 57/91] Remove some divergences from JD's ibverb-support branch

---
 python/bifrost/packet_capture.py | 10 +++++++---
 src/bifrost/packet_capture.h     | 13 -------------
 src/ib_verbs.hpp                 |  2 +-
 3 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/python/bifrost/packet_capture.py b/python/bifrost/packet_capture.py
index 77b22fcc1..8454b0085 100644
--- a/python/bifrost/packet_capture.py
+++ b/python/bifrost/packet_capture.py
@@ -94,8 +94,7 @@ def end(self):
 
 class UDPCapture(_CaptureBase):
     def __init__(self, fmt, sock, ring, nsrc, src0, max_payload_size,
-                 buffer_ntime, slot_ntime, sequence_callback, core=None,
-                 interface='', port=-1):
+                 buffer_ntime, slot_ntime, sequence_callback, core=None):
         try:
             fmt = fmt.encode()
         except AttributeError:
@@ -113,6 +112,12 @@ def __init__(self, fmt, sock, ring, nsrc, src0, max_payload_size,
 class UDPSniffer(_CaptureBase):
     def __init__(self, fmt, sock, ring, nsrc, src0, max_payload_size,
                  buffer_ntime, slot_ntime, sequence_callback, core=None):
+        try:
+            fmt = fmt.encode()
+        except AttributeError:
+            # Python2 catch
+            pass
+        nsrc = self._flatten_value(nsrc)
         if core is None:
             core = -1
         BifrostObject.__init__(
@@ -129,7 +134,6 @@ def __init__(self, fmt, sock, ring, nsrc, src0, max_payload_size,
         except AttributeError:
             # Python2 catch
             pass
-        nsrc = self._flatten_value(nsrc)
         if core is None:
             core = -1
         BifrostObject.__init__(
diff --git a/src/bifrost/packet_capture.h b/src/bifrost/packet_capture.h
index 0e2bf0bd7..c3e5cc34d 100644
--- a/src/bifrost/packet_capture.h
+++ b/src/bifrost/packet_capture.h
@@ -111,19 +111,6 @@ BFstatus bfUdpCaptureCreate(BFpacketcapture* obj,
                             BFsize           slot_ntime,
                             BFpacketcapture_callback sequence_callback,
                             int              core);
-
-BFstatus bfIbvUdpCaptureCreate(BFpacketcapture* obj,
-                            const char*      format,
-                            int              fd,
-                            BFring           ring,
-                            BFsize           nsrc,
-                            BFsize           src0,
-                            BFsize           max_payload_size,
-                            BFsize           buffer_ntime,
-                            BFsize           slot_ntime,
-                            BFpacketcapture_callback sequence_callback,
-                            int              core);
-
 BFstatus bfUdpSnifferCreate(BFpacketcapture* obj,
                             const char*      format,
                             int              fd,
diff --git a/src/ib_verbs.hpp b/src/ib_verbs.hpp
index da76df4cb..51da59c8c 100644
--- a/src/ib_verbs.hpp
+++ b/src/ib_verbs.hpp
@@ -50,7 +50,7 @@
 #endif
 
 #ifndef BF_VERBS_NPKTBUF
-#define BF_VERBS_NPKTBUF 32768
+#define BF_VERBS_NPKTBUF 8192
 #endif
 
 #ifndef BF_VERBS_WCBATCH

From e39cf99756a38c5e1b577736628c892cedd08b7d Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Thu, 15 Jul 2021 17:10:24 +0000
Subject: [PATCH 58/91] Make compiling actually work when XGPU isn't used

---
 src/Makefile          | 3 ++-
 src/bifrost/bf_xgpu.h | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index ec1b137c1..00b3bf893 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -121,8 +121,9 @@ endif
 
 ifndef NOCUDA
   CPPFLAGS  += -DBF_CUDA_ENABLED=1
-  LIB       += -L$(CUDA_LIBDIR64) -L$(CUDA_LIBDIR) -lcuda -lcudart -lnvrtc -lcublas -lcudadevrt -L. -lcufft_static_pruned -lculibos -lnvToolsExt -lxgpu
+  LIB       += -L$(CUDA_LIBDIR64) -L$(CUDA_LIBDIR) -lcuda -lcudart -lnvrtc -lcublas -lcudadevrt -L. -lcufft_static_pruned -lculibos -lnvToolsExt
 ifdef XGPU
+  CPPFLAGS  += -DBF_XGPU_ENABLED=1
   LIB       += -lxgpu
 endif
 endif
diff --git a/src/bifrost/bf_xgpu.h b/src/bifrost/bf_xgpu.h
index e06b5be61..39dbbfdb5 100644
--- a/src/bifrost/bf_xgpu.h
+++ b/src/bifrost/bf_xgpu.h
@@ -1,9 +1,11 @@
 #include <bifrost/common.h>
 #include <bifrost/array.h>
 
+#if BF_XGPU_ENABLED
 BFstatus bfXgpuInitialize(BFarray *in, BFarray *out, int gpu_dev);
 BFstatus bfXgpuCorrelate(BFarray *in, BFarray *out, int doDump);
 BFstatus bfXgpuKernel(BFarray *in, BFarray *out, int doDump);
 BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, BFarray *conj, int nchan_sum);
 BFstatus bfXgpuGetOrder(BFarray *antpol_to_input, BFarray *antpol_to_bl, BFarray *is_conj);
 BFstatus bfXgpuReorder(BFarray *xgpu_output, BFarray *reordered, BFarray *baselines, BFarray *is_conjugated);
+#endif

From 2e7fb1c831aed2c9d4b9e5fd869bd59c105715a5 Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Thu, 15 Jul 2021 18:51:10 +0000
Subject: [PATCH 59/91] Don't use "pragma pack"

Parroting JD:

The pragma pack compiler directive affects all structures following
the pragma. So, an innocent struct in a header file which wants
to be packed can mess up other code in fun ways, depending on orders of
header includes.
For some reason, IB Verbs configuration seems to be easily broken by
this kind of thing.

Use struct __attribute__((packed)) {...} instead
---
 src/formats/lwa352_vbeam.hpp | 3 +--
 src/formats/snap2.hpp        | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/formats/lwa352_vbeam.hpp b/src/formats/lwa352_vbeam.hpp
index 8cccef753..e7c7fe402 100644
--- a/src/formats/lwa352_vbeam.hpp
+++ b/src/formats/lwa352_vbeam.hpp
@@ -30,8 +30,7 @@
 
 #include "base.hpp"
 
-#pragma pack(1)
-struct lwa352_vbeam_hdr_type {
+struct __attribute__((packed)) lwa352_vbeam_hdr_type {
     uint64_t sync_word;
     uint64_t sync_time;
     uint64_t time_tag;
diff --git a/src/formats/snap2.hpp b/src/formats/snap2.hpp
index a8fc7350f..ea93244b3 100644
--- a/src/formats/snap2.hpp
+++ b/src/formats/snap2.hpp
@@ -38,9 +38,8 @@
 #define PIPELINE_NPOL 704
 #define PIPELINE_NCHAN 32
 
-#pragma pack(1)
 // All entries are network (i.e. big) endian
-struct snap2_hdr_type {
+struct __attribute__((packed)) snap2_hdr_type {
         uint64_t  seq;       // Spectra counter == packet counter
         uint32_t  sync_time; // UNIX sync time
         uint16_t  npol;      // Number of pols in this packet

From f46f3649fc2051e79aea0b39849d1f08c40e04d5 Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Thu, 15 Jul 2021 19:28:04 +0000
Subject: [PATCH 60/91] Always include xgpu header file

I'm not smart enough to make the python bindings get generated when
the header entries are wrapped in #if's
---
 src/bifrost/bf_xgpu.h | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/bifrost/bf_xgpu.h b/src/bifrost/bf_xgpu.h
index 39dbbfdb5..fa9504a4b 100644
--- a/src/bifrost/bf_xgpu.h
+++ b/src/bifrost/bf_xgpu.h
@@ -1,11 +1,25 @@
+#ifndef BF_XGPU_H_INCLUDE_GUARD_
+#define BF_XGPU_H_INCLUDE_GUARD_
+
 #include <bifrost/common.h>
 #include <bifrost/array.h>
 
-#if BF_XGPU_ENABLED
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//TODO: figure out how to make ctypesgen to the right thing with python generation
+//#if(BF_XGPU_ENABLED)
 BFstatus bfXgpuInitialize(BFarray *in, BFarray *out, int gpu_dev);
 BFstatus bfXgpuCorrelate(BFarray *in, BFarray *out, int doDump);
 BFstatus bfXgpuKernel(BFarray *in, BFarray *out, int doDump);
 BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, BFarray *conj, int nchan_sum);
 BFstatus bfXgpuGetOrder(BFarray *antpol_to_input, BFarray *antpol_to_bl, BFarray *is_conj);
 BFstatus bfXgpuReorder(BFarray *xgpu_output, BFarray *reordered, BFarray *baselines, BFarray *is_conjugated);
+//#endif // BF_XGPU_ENABLED
+
+#ifdef __cplusplus
+} // extern "C"
 #endif
+
+#endif // BF_XGPU_H_INCLUDE_GUARD

From 5767a0ed99bbbe94390fef8b2ca0f8dfb0d97b23 Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Thu, 15 Jul 2021 19:38:22 +0000
Subject: [PATCH 61/91] Bigger packet buffers

---
 src/ib_verbs.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ib_verbs.hpp b/src/ib_verbs.hpp
index 51da59c8c..da76df4cb 100644
--- a/src/ib_verbs.hpp
+++ b/src/ib_verbs.hpp
@@ -50,7 +50,7 @@
 #endif
 
 #ifndef BF_VERBS_NPKTBUF
-#define BF_VERBS_NPKTBUF 8192
+#define BF_VERBS_NPKTBUF 32768
 #endif
 
 #ifndef BF_VERBS_WCBATCH

From fc4704b21f13e2112d64824456a6c2171e669ffa Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Tue, 20 Jul 2021 10:24:45 +0000
Subject: [PATCH 62/91] re-speedup IBV (and presumably fix a bug)

Blindly copied from the now-deprecated implementation in ib_verbs.cpp
---
 src/ib_verbs.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ib_verbs.hpp b/src/ib_verbs.hpp
index da76df4cb..8db58fe25 100644
--- a/src/ib_verbs.hpp
+++ b/src/ib_verbs.hpp
@@ -647,7 +647,7 @@ class Verbs {
                     recv_head = &(_verbs.pkt_buf[wr_id]);
                     recv_tail = &recv_head->wr;
                 } else {
-                    recv_tail = &(_verbs.pkt_buf[wr_id].wr);
+                    recv_tail->next = &(_verbs.pkt_buf[wr_id].wr);
                     recv_tail = recv_tail->next;
                 }
             } // for each work completion

From 35a7b46ec8e3132d643293e68470fad35db69f7c Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Tue, 17 Aug 2021 15:47:15 +0000
Subject: [PATCH 63/91] Make beamformer GEMM do the right thing

TODO: Is there a more sensible (less-transposey)
data ordering. [Probably]
---
 src/cublas_beamform.cu | 62 +++++++++++++++++++++++++++++++++---------
 1 file changed, 49 insertions(+), 13 deletions(-)

diff --git a/src/cublas_beamform.cu b/src/cublas_beamform.cu
index 4be7aedb6..ff8fe85dd 100644
--- a/src/cublas_beamform.cu
+++ b/src/cublas_beamform.cu
@@ -21,8 +21,8 @@ __global__ void trans_4bit_to_float(unsigned char *in,
   int time = blockIdx.x;
   int chan = blockIdx.y;
   int pol = TRANSPOSE_POL_BLOCK_SIZE*threadIdx.x;
-  unsigned char *in_off = in + time*n_chan*n_pol + chan*n_pol + pol;
-  float *out_off = out + 2*( chan*n_pol*n_time + pol*n_time + time);
+  unsigned char *in_off = in + time*n_chan*n_pol + chan*n_pol + pol; // 4+4 bit
+  float *out_off = out + 2*( chan*n_pol*n_time + pol*n_time + time); // 32+32 bit
   //long long int old_index = time*n_chan*n_pol + chan*n_pol + pol;
   //long long int new_index = chan*n_pol*n_time + pol*n_time + time;
   float real, imag;
@@ -34,10 +34,11 @@ __global__ void trans_4bit_to_float(unsigned char *in,
     //imag = lut[in[old_index+i] & 0b1111];
     //out[2*(new_index+i)] = real;
     //out[2*(new_index+i)+1] = imag;
-    real = lut[temp >> 4];
-    imag = lut[temp & 255];
-    *out_off++ = real;
-    *out_off++ = imag;
+    real = lut[(temp >> 4) & 0b1111];
+    imag = lut[temp & 0b1111];
+    out_off[0] = real;
+    out_off[1] = imag;
+    out_off += 2*n_time;
   }
 }
 
@@ -191,7 +192,7 @@ void cublas_beamform_init(int device, int ninputs, int nchans, int ntimes, int n
   gpuBLASchk(cublasCreate(&(context.handle)));
   gpuBLASchk(cublasSetStream(context.handle, context.stream));
   gpuBLASchk(cublasSetPointerMode(context.handle, CUBLAS_POINTER_MODE_HOST));
-  //gpuBLASchk(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE));
+  //gpuBLASchk(cublasSetPointerMode(context.handle, CUBLAS_POINTER_MODE_DEVICE));
   gpuBLASchk(cublasSetMathMode(context.handle, CUBLAS_TENSOR_OP_MATH));
 
   context.ninputs = ninputs;
@@ -202,6 +203,7 @@ void cublas_beamform_init(int device, int ninputs, int nchans, int ntimes, int n
 
   // Internally allocate intermediate buffers
   gpuErrchk( cudaMalloc(&context.in32_d,  ninputs * nchans * ntimes * 2 * sizeof(float)) );
+  //gpuErrchk( cudaMemcpy(context.in32_d, in32_h, ninputs * nchans * ntimes * 2 * sizeof(float), cudaMemcpyHostToDevice) );
   // If the context is initialized with ntimeblocks=0, then we do no summing so don't
   // need the intermediate buffer allocated internally.
   if (ntimeblocks > 0) {
@@ -244,23 +246,24 @@ void cublas_beamform(unsigned char *in4_d, float *out_d, float *weights_d) {
   // A matrix: beamforming coeffs (NBEAMS * NANTS)
   // B matrix: data matrix (NANTS * NTIMES)
     
+  /*
   gpuBLASchk(cublasGemmStridedBatchedEx(
     context.handle,
-    CUBLAS_OP_N, // transpose A?
-    CUBLAS_OP_N, // transpose B?
+    CUBLAS_OP_T, // transpose A?
+    CUBLAS_OP_T, // transpose B?
     context.nbeams,      // m
     context.ntimes,      // n
     context.ninputs,     // k
-    // Coeffs
+    // Coeffs: [nchans x] nbeams x ninputs (m x k)
     &alpha,              // alpha
     weights_d,           // A
     CUDA_C_32F,          // A type
-    context.nbeams,      // Lda
+    context.ninputs,      // Lda
     context.nbeams*context.ninputs,// strideA : stride size
-    // Data
+    // Data: [nchans x] ninputs x ntimes (k x n)
     context.in32_d,      // B
     CUDA_C_32F,          // B type
-    context.ninputs,     // Ldb
+    context.ntimes,     // Ldb
     context.ninputs*context.ntimes,// strideB : stride size
     &beta,       // beta
     // Results
@@ -272,6 +275,39 @@ void cublas_beamform(unsigned char *in4_d, float *out_d, float *weights_d) {
     CUDA_C_32F,          // compute type
     CUBLAS_GEMM_DEFAULT_TENSOR_OP // algo
     ));
+  */
+  
+  gpuBLASchk(cublasGemmStridedBatchedEx(
+    context.handle,
+    CUBLAS_OP_N, // transpose A?
+    CUBLAS_OP_T, // transpose B?
+    context.ntimes,      // n
+    context.nbeams,      // m
+    context.ninputs,     // k
+    &alpha,              // alpha
+    //
+    // Data: [nchans x] ninputs x ntimes (k x n)
+    context.in32_d,      // B
+    CUDA_C_32F,          // B type
+    context.ntimes,     // Ldb
+    context.ninputs*context.ntimes,// strideB : stride size
+    //
+    // Coeffs: [nchans x] nbeams x ninputs (m x k)
+    weights_d,           // A
+    CUDA_C_32F,          // A type
+    context.nbeams,      // Lda
+    context.nbeams*context.ninputs,// strideA : stride size
+    //
+    &beta,       // beta
+    // Results
+    gem_out_d,       // C
+    CUDA_C_32F,          // Ctype 
+    context.ntimes,      // Ldc
+    context.nbeams*context.ntimes,// Stride C
+    context.nchans,      // batchCount
+    CUDA_C_32F,          // compute type
+    CUBLAS_GEMM_DEFAULT_TENSOR_OP // algo
+    ));
   cudaStreamSynchronize(context.stream);
 
   // Optionally:

From e2b3b9701b38cf9a68a27e8fcfb1e81c67b4e0ab Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Tue, 17 Aug 2021 16:57:30 +0000
Subject: [PATCH 64/91] Update beamformer calls to match actual data ordering

---
 src/bifrost/beamform.h  |  6 +++---
 src/cublas_beamform.cu  | 29 ++++++++++++++++++++---------
 src/cublas_beamform.cuh |  2 +-
 3 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/src/bifrost/beamform.h b/src/bifrost/beamform.h
index 7a5e4d297..8002a691a 100644
--- a/src/bifrost/beamform.h
+++ b/src/bifrost/beamform.h
@@ -24,7 +24,7 @@ BFstatus bfBeamformInitialize(
 /*
  * in:  Pointer to ntime x nchan x ninputs x 4+4 bit data block
  * out: Pointer to output data.
- *        If ntime_blocks > 0:
+ *        If ntime_blocks > 0: !!!!UNTESTED, probably broken!!!!
  *          For the purposes of generating dynamic spectra, beam 2n and 2n+1 are considered
  *          to be two pols of the same pointing, and are cross-multipled and summed over
  *          ntimes/ntime_blocks to form the output array:
@@ -35,7 +35,7 @@ BFstatus bfBeamformInitialize(
  *          coefficients can make appropriate single-pol beam pairs).
  *        If ntime_blocks = 0:
  *          Data are returned as voltages, in order:
- *            ntimes x nchan x nbeam x complex64 beamformer block
+ *            nchan x nbeam x ntime x complex64 beamformer block
  *                            
  * weights -- pointer to nbeams x nchans x ninputs x complex64 weights
  */
@@ -49,7 +49,7 @@ BFstatus bfBeamformRun(
  * Take the output of bfBeamformRun with ntime_blocks = 0, and perform transposing and integration
  * of data, to deliver a time integrated dual-pol dynamic spectra of the form:
  *   nbeam/2 x ntime/ntimes_sum x nchan x 4 x float32 (powers, XX, YY, re(XY, im(XY))
- * I.e., the format which would be returned by bfBeamformRun is ntime_blocks > 0
+ * I.e., the format which would be returned by bfBeamformRun if ntime_blocks > 0
  */
 BFstatus bfBeamformIntegrate(
   BFarray *in,
diff --git a/src/cublas_beamform.cu b/src/cublas_beamform.cu
index ff8fe85dd..3f635eda9 100644
--- a/src/cublas_beamform.cu
+++ b/src/cublas_beamform.cu
@@ -45,6 +45,7 @@ __global__ void trans_4bit_to_float(unsigned char *in,
 // Transpose chan x beam x pol x time x 32+32 float to
 // beam x time[part-summed] x chan x [XX,YY,XY*_r,XY*_i] x 32 float
 // Each thread deals with two pols of a beam, and sums over n_time_sum time samples
+// n_beam is the _output_ number of beams. I.e., the number of dual-pol beams
 __global__ void trans_output_and_sum(float *in,
                                     float *out,
                                     int n_chan,
@@ -55,25 +56,28 @@ __global__ void trans_output_and_sum(float *in,
   int chan = blockIdx.x;
   int beam = blockIdx.y;
   int time = threadIdx.x;
-  long long int old_index = chan*n_beam*n_time*2 + beam*n_time*2 + time*n_time_sum*2; // start index for n_time/n_time_sum samples
-  long long int new_index = beam*(n_time / n_time_sum)*n_chan + time*n_chan + chan;
-  float xx=0., yy=0., xy_r=0., xy_i=0.;
+  // n_beam here is a dual pol beam
+  // input is: chan x beam x pol [2] x time x complexity
+  long long int old_index = 2*(chan*n_beam*2*n_time + beam*2*n_time + time*n_time_sum); // start index for n_time/n_time_sum samples
+  // output is: beam x time x chan x pol-products [4]
+  long long int new_index = 4*(beam*(n_time / n_time_sum)*n_chan + time*n_chan + chan);
+  float xx=0., yy=0., xy_r=0., xy_i=0.; // accumulator registers
   float x_r, x_i, y_r, y_i;
   int t;
   for (t=0; t<n_time_sum; t++) {
     x_r = in[old_index + 2*t];
     x_i = in[old_index + 2*t + 1];
-    y_r = in[old_index + 2*n_time + 2*t];
-    y_i = in[old_index + 2*n_time + 2*t + 1];
+    y_r = in[old_index + 2*(n_time + t)];
+    y_i = in[old_index + 2*(n_time + t) + 1];
     xx = xx + x_r*x_r + x_i*x_i;
     yy = yy + y_r*y_r + y_i*y_i;
     xy_r = xy_r + x_r*y_r + x_i*y_i;
     xy_i = xy_i + x_i*y_r - x_r*y_i;
   }
-  out[4*new_index] = xx;
-  out[4*new_index+1] = yy;
-  out[4*new_index+2] = xy_r;
-  out[4*new_index+3] = xy_i;
+  out[new_index] = xx;
+  out[new_index+1] = yy;
+  out[new_index+2] = xy_r;
+  out[new_index+3] = xy_i;
 }
 
 // Take an input of order chan x beam x pol x time x 32+32 float and generate
@@ -330,6 +334,13 @@ void cublas_beamform(unsigned char *in4_d, float *out_d, float *weights_d) {
   }
 }
 
+/* Take input data of form
+   nchan x nbeam x time x complex64 [i.e. the output of cublas_beamform],
+   sum over ``ntimes_sum``, interpret beams `2n` and `2n+1` as a dual pol pair,
+   and generate an output array of form
+   nbeam x ntime / ntime_sum x nchan x nbeam / 2 x 4 x complex64.
+   The last 4-element axis holds XX, YY, Re(XY), Im(XY
+*/
 void cublas_beamform_integrate(float *in_d, float *out_d, int ntimes_sum) {
   // Create XX, YY, XY beam powers.
   // Sum over `ntimes_sum` samples
diff --git a/src/cublas_beamform.cuh b/src/cublas_beamform.cuh
index 1e9df948f..a33919e3b 100644
--- a/src/cublas_beamform.cuh
+++ b/src/cublas_beamform.cuh
@@ -15,7 +15,7 @@ __global__ void trans_4bit_to_float(unsigned char *in,
                                     int n_time
                                    );
 
-// Transpose chan x beam x pol x time x 32+32 float to
+// Transpose chan x beam x pol x pol x 32+32 float to
 // beam x time[part-summed] x chan x [XX,YY,XY*_r,XY*_i] x 32 float
 // Each thread deals with two pols of a beam, and sums over n_time_sum time samples
 __global__ void trans_output_and_sum(float *in,

From 787464e9e34831350a4903e05f86b1e977e45094 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Tue, 17 Aug 2021 15:46:55 -0600
Subject: [PATCH 65/91] The COR packets 'navg' field has the same units as
 'time_tag'.

---
 src/formats/cor.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/formats/cor.hpp b/src/formats/cor.hpp
index bf1660ded..c7c7b0787 100644
--- a/src/formats/cor.hpp
+++ b/src/formats/cor.hpp
@@ -72,7 +72,7 @@ class CORDecoder : virtual public PacketDecoder {
 	    pkt->sync         = pkt_hdr->sync_word;
 	    pkt->time_tag     = be64toh(pkt_hdr->time_tag);
 	    pkt->decimation   = be32toh(pkt_hdr->navg);
-	    pkt->seq          = pkt->time_tag / 196000000 / (pkt->decimation / 100);
+	    pkt->seq          = pkt->time_tag / pkt->decimation;
 	    pkt->nsrc         = _nsrc;
 	    pkt->src          = (stand0*(2*(nstand-1)+1-stand0)/2 + stand1 + 1 - _src0)*nserver \
 	                        + (server - 1);

From 9a6e56a7bfe3df15b82433795dbebb46a11a19f8 Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Wed, 18 Aug 2021 14:55:33 +0000
Subject: [PATCH 66/91] Supply beam coeffs in chan x beam x input [C-ordered]
 order

---
 src/cublas_beamform.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cublas_beamform.cu b/src/cublas_beamform.cu
index 3f635eda9..7e2b7bdbb 100644
--- a/src/cublas_beamform.cu
+++ b/src/cublas_beamform.cu
@@ -284,7 +284,7 @@ void cublas_beamform(unsigned char *in4_d, float *out_d, float *weights_d) {
   gpuBLASchk(cublasGemmStridedBatchedEx(
     context.handle,
     CUBLAS_OP_N, // transpose A?
-    CUBLAS_OP_T, // transpose B?
+    CUBLAS_OP_N, // transpose B?
     context.ntimes,      // n
     context.nbeams,      // m
     context.ninputs,     // k
@@ -299,7 +299,7 @@ void cublas_beamform(unsigned char *in4_d, float *out_d, float *weights_d) {
     // Coeffs: [nchans x] nbeams x ninputs (m x k)
     weights_d,           // A
     CUDA_C_32F,          // A type
-    context.nbeams,      // Lda
+    context.ninputs,      // Lda
     context.nbeams*context.ninputs,// strideA : stride size
     //
     &beta,       // beta

From b9bda798896c3b779b5731f578cea38e6e608ae8 Mon Sep 17 00:00:00 2001
From: ubuntu <ubuntu@pm.me>
Date: Fri, 27 Aug 2021 12:14:55 +0000
Subject: [PATCH 67/91] Catch mmap error

---
 src/ib_verbs.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ib_verbs.hpp b/src/ib_verbs.hpp
index 8db58fe25..eb6f61152 100644
--- a/src/ib_verbs.hpp
+++ b/src/ib_verbs.hpp
@@ -277,7 +277,7 @@ class Verbs {
         _verbs.mr_size = (size_t) BF_VERBS_NPKTBUF*BF_VERBS_NQP * _pkt_size_max;
         _verbs.mr_buf = (uint8_t *) ::mmap(NULL, _verbs.mr_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_LOCKED, -1, 0);
 
-        check_error(_verbs.mr_buf == MAP_FAILED,
+        check_error(_verbs.mr_buf == MAP_FAILED ? -1 : 0,
                     "allocate memory region buffer");
         check_error(::mlock(_verbs.mr_buf, _verbs.mr_size),
                     "lock memory region buffer");

From b107bfc5f6a35e6aaf8e6cbca5d380d0f358ecdf Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Thu, 18 Nov 2021 21:11:07 +0000
Subject: [PATCH 68/91] Implement source blanking.

This is actually quite annoying. The call to blank a source
doesn't know about the details of the packet structure, and one can't
necessarily blank the right parts of the receive buffer when only given
nsrc, src, nchan, chan, since these values have been abused. E.g.
the output buffer is in Time x Chan x Antpol order, but the Chan
dimension can be a multiple of nchan, and nsrc != the number of Antpols in the system.

Hack the information out of the packet stream and then write a clurgy blanking
function.

Barely tested.
---
 src/formats/snap2.hpp | 60 ++++++++++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 24 deletions(-)

diff --git a/src/formats/snap2.hpp b/src/formats/snap2.hpp
index ea93244b3..16140be77 100644
--- a/src/formats/snap2.hpp
+++ b/src/formats/snap2.hpp
@@ -33,11 +33,6 @@
 #include <immintrin.h> // SSE
 #include <emmintrin.h>
 
-// TODO: parameterize somewhere. This isn't
-// related to the packet formatting
-#define PIPELINE_NPOL 704
-#define PIPELINE_NCHAN 32
-
 // All entries are network (i.e. big) endian
 struct __attribute__((packed)) snap2_hdr_type {
         uint64_t  seq;       // Spectra counter == packet counter
@@ -63,13 +58,13 @@ struct __attribute__((packed)) snap2_hdr_type {
 class SNAP2Decoder : virtual public PacketDecoder {
 protected:
     inline bool valid_packet(const PacketDesc* pkt) const {
-//#if BF_SNAP2_DEBUG
-//        cout << "seq: "<< pkt->seq << endl;
-//        cout << "src: "<< pkt->src << endl;
-//        cout << "nsrc: "<< pkt->nsrc << endl;
-//        cout << "nchan: "<< pkt->nchan << endl;
-//        cout << "chan0: "<< pkt->chan0 << endl;
-//#endif
+#if BF_SNAP2_DEBUG
+        cout << "seq: "<< pkt->seq << endl;
+        cout << "src: "<< pkt->src << endl;
+        cout << "nsrc: "<< pkt->nsrc << endl;
+        cout << "nchan: "<< pkt->nchan << endl;
+        cout << "chan0: "<< pkt->chan0 << endl;
+#endif
         return ( 
                  pkt->seq >= 0
                  && pkt->src >= 0
@@ -125,8 +120,10 @@ class SNAP2Decoder : virtual public PacketDecoder {
 };
 
 class SNAP2Processor : virtual public PacketProcessor {
-protected:
-    int _pipeline_nchan = PIPELINE_NCHAN;
+private:
+    bool _initialized = false;
+    int _npol_tot = 0;
+    int _npol_pkt = 0;
 public:
     inline void operator()(const PacketDesc* pkt,
                            uint64_t          seq0,
@@ -157,6 +154,12 @@ class SNAP2Processor : virtual public PacketProcessor {
             int words_per_chan_out = pkt->npol_tot >> 5;
             int pol_offset_out = pkt->pol0 >> 5;
             int pkt_chan = pkt->chan0;           // The first channel in this packet
+
+            if ( !_initialized ) {
+                _npol_tot = pkt->npol_tot;
+                _npol_pkt = pkt->npol;
+                _initialized = true;
+            }
         
             // Copy packet payload one channel at a time.
             // Packets have payload format nchans x npols x complexity.
@@ -186,16 +189,25 @@ class SNAP2Processor : virtual public PacketProcessor {
                                      int      nsrc,
                                      int      nchan,
                                      int      nseq) {
-            //fprintf(stderr, "TRYING TO BLANK OUT A SOURCE WITH MISSING PACKETS. BUT BLANKING NOT IMPLEMENTED\n");
-            //typedef aligned256_type otype;
-            //fprintf(stderr, "You really better not be here\n");
-            //otype* __restrict__ aligned_data = (otype*)data;
-            //for( int t=0; t<nseq; ++t ) {
-            //        for( int c=0; c<nchan; ++c ) {
-            //                ::memset(&aligned_data[src + nsrc*(c + nchan*t)],
-            //                         0, sizeof(otype));
-            //        }
-            //}
+            //fprintf(stderr, "Zeroing out source %d of %d (%d chans) (%d nseq)\n", src, nsrc, nchan, nseq);
+            int npol_blocks = _npol_tot / _npol_pkt;
+            int nchan_blocks = nsrc / npol_blocks;
+            int chan_block = src / npol_blocks;
+            int pol_block =  src % npol_blocks;
+            //fprintf(stderr, "Channel block: %d. Zeroing %d bytes\n", chan_block, _npol_pkt);
+            // Output buffer is Time x Chan x Pol
+            // -> Time x chan_block x chan x pol_block x pol_words
+            // -> Time x 
+            int chan_block_offset_bytes = chan_block * nchan * _npol_tot;
+            int time_offset_bytes = nchan_blocks * nchan * _npol_tot;
+            int pol_offset_bytes = pol_block*_npol_pkt;
+            //fprintf(stderr, "Offset bytes: T*%d + [chan block] %d + c*%d + [pol offset] %d\n", time_offset_bytes, chan_block_offset_bytes, _npol_tot, pol_offset_bytes);
+            for( int t=0; t<nseq; t++ ) {
+                    for( int c=0; c<nchan; c++ ) {
+                            ::memset(&data[t*time_offset_bytes + chan_block_offset_bytes
+                                           + c*_npol_tot + pol_offset_bytes], 0, _npol_pkt);
+                    }
+            }
     }
 };
 

From 316871d9602c782a709812026e9026158da9c5db Mon Sep 17 00:00:00 2001
From: Jack <jack@realtimeradio.co.uk>
Date: Sun, 9 Jan 2022 20:02:18 +0000
Subject: [PATCH 69/91] Revert "Implement source blanking."

This reverts commit b107bfc5f6a35e6aaf8e6cbca5d380d0f358ecdf.

Don't do this for now to make things faster
---
 src/formats/snap2.hpp | 60 +++++++++++++++++--------------------------
 1 file changed, 24 insertions(+), 36 deletions(-)

diff --git a/src/formats/snap2.hpp b/src/formats/snap2.hpp
index 16140be77..ea93244b3 100644
--- a/src/formats/snap2.hpp
+++ b/src/formats/snap2.hpp
@@ -33,6 +33,11 @@
 #include <immintrin.h> // SSE
 #include <emmintrin.h>
 
+// TODO: parameterize somewhere. This isn't
+// related to the packet formatting
+#define PIPELINE_NPOL 704
+#define PIPELINE_NCHAN 32
+
 // All entries are network (i.e. big) endian
 struct __attribute__((packed)) snap2_hdr_type {
         uint64_t  seq;       // Spectra counter == packet counter
@@ -58,13 +63,13 @@ struct __attribute__((packed)) snap2_hdr_type {
 class SNAP2Decoder : virtual public PacketDecoder {
 protected:
     inline bool valid_packet(const PacketDesc* pkt) const {
-#if BF_SNAP2_DEBUG
-        cout << "seq: "<< pkt->seq << endl;
-        cout << "src: "<< pkt->src << endl;
-        cout << "nsrc: "<< pkt->nsrc << endl;
-        cout << "nchan: "<< pkt->nchan << endl;
-        cout << "chan0: "<< pkt->chan0 << endl;
-#endif
+//#if BF_SNAP2_DEBUG
+//        cout << "seq: "<< pkt->seq << endl;
+//        cout << "src: "<< pkt->src << endl;
+//        cout << "nsrc: "<< pkt->nsrc << endl;
+//        cout << "nchan: "<< pkt->nchan << endl;
+//        cout << "chan0: "<< pkt->chan0 << endl;
+//#endif
         return ( 
                  pkt->seq >= 0
                  && pkt->src >= 0
@@ -120,10 +125,8 @@ class SNAP2Decoder : virtual public PacketDecoder {
 };
 
 class SNAP2Processor : virtual public PacketProcessor {
-private:
-    bool _initialized = false;
-    int _npol_tot = 0;
-    int _npol_pkt = 0;
+protected:
+    int _pipeline_nchan = PIPELINE_NCHAN;
 public:
     inline void operator()(const PacketDesc* pkt,
                            uint64_t          seq0,
@@ -154,12 +157,6 @@ class SNAP2Processor : virtual public PacketProcessor {
             int words_per_chan_out = pkt->npol_tot >> 5;
             int pol_offset_out = pkt->pol0 >> 5;
             int pkt_chan = pkt->chan0;           // The first channel in this packet
-
-            if ( !_initialized ) {
-                _npol_tot = pkt->npol_tot;
-                _npol_pkt = pkt->npol;
-                _initialized = true;
-            }
         
             // Copy packet payload one channel at a time.
             // Packets have payload format nchans x npols x complexity.
@@ -189,25 +186,16 @@ class SNAP2Processor : virtual public PacketProcessor {
                                      int      nsrc,
                                      int      nchan,
                                      int      nseq) {
-            //fprintf(stderr, "Zeroing out source %d of %d (%d chans) (%d nseq)\n", src, nsrc, nchan, nseq);
-            int npol_blocks = _npol_tot / _npol_pkt;
-            int nchan_blocks = nsrc / npol_blocks;
-            int chan_block = src / npol_blocks;
-            int pol_block =  src % npol_blocks;
-            //fprintf(stderr, "Channel block: %d. Zeroing %d bytes\n", chan_block, _npol_pkt);
-            // Output buffer is Time x Chan x Pol
-            // -> Time x chan_block x chan x pol_block x pol_words
-            // -> Time x 
-            int chan_block_offset_bytes = chan_block * nchan * _npol_tot;
-            int time_offset_bytes = nchan_blocks * nchan * _npol_tot;
-            int pol_offset_bytes = pol_block*_npol_pkt;
-            //fprintf(stderr, "Offset bytes: T*%d + [chan block] %d + c*%d + [pol offset] %d\n", time_offset_bytes, chan_block_offset_bytes, _npol_tot, pol_offset_bytes);
-            for( int t=0; t<nseq; t++ ) {
-                    for( int c=0; c<nchan; c++ ) {
-                            ::memset(&data[t*time_offset_bytes + chan_block_offset_bytes
-                                           + c*_npol_tot + pol_offset_bytes], 0, _npol_pkt);
-                    }
-            }
+            //fprintf(stderr, "TRYING TO BLANK OUT A SOURCE WITH MISSING PACKETS. BUT BLANKING NOT IMPLEMENTED\n");
+            //typedef aligned256_type otype;
+            //fprintf(stderr, "You really better not be here\n");
+            //otype* __restrict__ aligned_data = (otype*)data;
+            //for( int t=0; t<nseq; ++t ) {
+            //        for( int c=0; c<nchan; ++c ) {
+            //                ::memset(&aligned_data[src + nsrc*(c + nchan*t)],
+            //                         0, sizeof(otype));
+            //        }
+            //}
     }
 };
 

From 8c0699f3f679d03931b1eae2d4e160ec382f5edc Mon Sep 17 00:00:00 2001
From: JackH <jack@rtr>
Date: Fri, 21 Jan 2022 16:47:58 +0000
Subject: [PATCH 70/91] Add option to transpose xgpu output

---
 src/bf_xgpu.cpp       | 9 ++++++---
 src/bifrost/bf_xgpu.h | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/bf_xgpu.cpp b/src/bf_xgpu.cpp
index 91d55c843..5a340247d 100644
--- a/src/bf_xgpu.cpp
+++ b/src/bf_xgpu.cpp
@@ -131,13 +131,16 @@ BFstatus bfXgpuKernel(BFarray *in, BFarray *out, int doDump) {
 
 /*
  * Given an xGPU accumulation buffer, grab a subset of visibilities from
- * and gather them in a new buffer, in order chan x visibility x complexity [int32]
+ * and gather them in a new buffer, in order
+ * chan x visibility x complexity [int32] (if transpose=0)
+ * or
+ * visibility x chan * complexity [int32] (if transpose!=0)
  * BFarray *in : Pointer to a BFarray with storage in device memory, where xGPU results reside
  * BFarray *in : Pointer to a BFarray with storage in device memory where collated visibilities should be written.
  * BFarray *vismap : array of visibilities in [[polA, polB], [polC, polD], ... ] form.
  * int nchan_sum: The number of frequency channels to sum over
  */
-BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, BFarray *conj, int nchan_sum) {
+BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, BFarray *conj, int nchan_sum, int transpose) {
   long long unsigned nvis = num_contiguous_elements(vismap);
   int xgpu_error;
   if (in->space != BF_SPACE_CUDA) {
@@ -155,7 +158,7 @@ BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, BFarray *co
   if (num_contiguous_elements(conj) != nvis) {
     return BF_STATUS_INVALID_SHAPE;
   }
-  xgpu_error = xgpuCudaSubSelect(&context, (Complex *)in->data, (Complex *)out->data, (int *)vismap->data, (int *)conj->data, nvis, nchan_sum);
+  xgpu_error = xgpuCudaSubSelect(&context, (Complex *)in->data, (Complex *)out->data, (int *)vismap->data, (int *)conj->data, nvis, nchan_sum, transpose);
   if (xgpu_error != XGPU_OK) {
     fprintf(stderr, "ERROR: xgpuKernel: kernel call returned %d\n", xgpu_error);
     return BF_STATUS_INTERNAL_ERROR;
diff --git a/src/bifrost/bf_xgpu.h b/src/bifrost/bf_xgpu.h
index fa9504a4b..b8f21b783 100644
--- a/src/bifrost/bf_xgpu.h
+++ b/src/bifrost/bf_xgpu.h
@@ -13,7 +13,7 @@ extern "C" {
 BFstatus bfXgpuInitialize(BFarray *in, BFarray *out, int gpu_dev);
 BFstatus bfXgpuCorrelate(BFarray *in, BFarray *out, int doDump);
 BFstatus bfXgpuKernel(BFarray *in, BFarray *out, int doDump);
-BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, BFarray *conj, int nchan_sum);
+BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, BFarray *conj, int nchan_sum, int transpose);
 BFstatus bfXgpuGetOrder(BFarray *antpol_to_input, BFarray *antpol_to_bl, BFarray *is_conj);
 BFstatus bfXgpuReorder(BFarray *xgpu_output, BFarray *reordered, BFarray *baselines, BFarray *is_conjugated);
 //#endif // BF_XGPU_ENABLED

From 8a953cc54e5c3c4182bd60f719e18e5079262d3d Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Thu, 1 Jun 2023 09:22:42 -0600
Subject: [PATCH 71/91] Need to actually save files...

---
 python/bifrost/ring.py |  8 ------
 src/Makefile.in        | 61 ------------------------------------------
 2 files changed, 69 deletions(-)

diff --git a/python/bifrost/ring.py b/python/bifrost/ring.py
index e1d7342e1..e4335e1f6 100644
--- a/python/bifrost/ring.py
+++ b/python/bifrost/ring.py
@@ -118,11 +118,7 @@ def read(self, whence='earliest', guarantee=True):
                 try:
                     yield cur_seq
                     cur_seq.increment()
-<<<<<<< HEAD
-                except StopIteration:
-=======
                 except EndOfDataStop:
->>>>>>> 9a628e76e865e895c9a54df63d1793d9d10d57c6
                     return
     #def _data(self):
     #    data_ptr = _get(self.lib.bfRingLockedGetData, self.obj)
@@ -287,11 +283,7 @@ def read(self, span_size, stride=None, begin=0):
                 with self.acquire(offset, span_size) as ispan:
                     yield ispan
                 offset += stride
-<<<<<<< HEAD
-            except StopIteration:
-=======
             except EndOfDataStop:
->>>>>>> 9a628e76e865e895c9a54df63d1793d9d10d57c6
                 return
 
 class SpanBase(object):
diff --git a/src/Makefile.in b/src/Makefile.in
index d89cd783c..8c4e09740 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -107,67 +107,6 @@ NVCCFLAGS += $(NVCC_GENCODE)
 #NVCCFLAGS += -Xcudafe "--diag_suppress=unrecognized_gcc_pragma"
 #NVCCFLAGS += --expt-relaxed-constexpr
 
-<<<<<<< HEAD:src/Makefile
-ifndef NODEBUG
-  CPPFLAGS  += -DBF_DEBUG=1
-  CXXFLAGS  += -g
-  NVCCFLAGS += -g
-endif
-
-LIB += -lgomp
-
-ifdef TRACE
-  CPPFLAGS   += -DBF_TRACE_ENABLED=1
-endif
-
-ifdef NUMA
-  # Requires libnuma-dev to be installed
-  LIB        += -lnuma
-  CPPFLAGS   += -DBF_NUMA_ENABLED=1
-endif
-
-ifdef HWLOC
-  # Requires libhwloc-dev to be installed
-  LIB        += -lhwloc
-  CPPFLAGS   += -DBF_HWLOC_ENABLED=1
-endif
-
-ifdef VMA
-  # Requires Mellanox libvma to be installed
-  LIB        += -lvma
-  CPPFLAGS   += -DBF_VMA_ENABLED=1
-endif
-
-ifdef VERBS
-  # Requires Mellanox libvma to be installed
-  LIB        += -libverbs
-  CPPFLAGS   += -DBF_VERBS_ENABLED=1
-endif
-
-ifdef ALIGNMENT
-  CPPFLAGS   += -DBF_ALIGNMENT=$(ALIGNMENT)
-endif
-
-ifdef CUDA_DEBUG
-  NVCCFLAGS += -G
-endif
-
-ifndef NOCUDA
-  CPPFLAGS  += -DBF_CUDA_ENABLED=1
-  LIB       += -L$(CUDA_LIBDIR64) -L$(CUDA_LIBDIR) -lcuda -lcudart -lnvrtc -lcublas -lcudadevrt -L. -lcufft_static_pruned -lculibos -lnvToolsExt
-ifdef XGPU
-  CPPFLAGS  += -DBF_XGPU_ENABLED=1
-  LIB       += -lxgpu
-endif
-endif
-
-ifndef ANY_ARCH
-  CXXFLAGS  += -march=native
-  NVCCFLAGS += -Xcompiler "-march=native"
-endif
-
-=======
->>>>>>> 9a628e76e865e895c9a54df63d1793d9d10d57c6:src/Makefile.in
 LIB_DIR = ../lib
 INC_DIR = .
 CPPFLAGS += -I. -I$(INC_DIR) -I$(CUDA_INCDIR)

From 90886715fbb4dc0a1c97202909be158a776dd171 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Thu, 1 Jun 2023 09:23:47 -0600
Subject: [PATCH 72/91] Revert to the ibverb-support version.

---
 tools/like_top.py     | 12 +++++-------
 tools/pipeline2dot.py |  1 +
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/tools/like_top.py b/tools/like_top.py
index c4ad90ed9..89ca700dd 100755
--- a/tools/like_top.py
+++ b/tools/like_top.py
@@ -341,11 +341,10 @@ def main(args):
                             ac = max([0.0, log['acquire_time']])
                             pr = max([0.0, log['process_time']])
                             re = max([0.0, log['reserve_time']])
-                            gb = max([0.0, log.get('gbps', 0.0)])
                         except KeyError:
-                            ac, pr, re, gb = 0.0, 0.0, 0.0, 0.0
+                            ac, pr, re = 0.0, 0.0, 0.0
 
-                        blockList['%i-%s' % (pid, block)] = {'pid': pid, 'name':block, 'cmd': cmd, 'core': cr, 'acquire': ac, 'process': pr, 'reserve': re, 'total':ac+pr+re, 'gbps':gb}
+                        blockList['%i-%s' % (pid, block)] = {'pid': pid, 'name':block, 'cmd': cmd, 'core': cr, 'acquire': ac, 'process': pr, 'reserve': re, 'total':ac+pr+re}
 
                 ## Sort
                 order = sorted(blockList, key=lambda x: blockList[x][sort_key], reverse=sort_rev)
@@ -383,7 +382,7 @@ def main(args):
                 k = _add_line(scr, k, 0, output, std)
             ### Header
             k = _add_line(scr, k, 0, ' ', std)
-            output = '%6s  %15s  %4s  %5s  %7s  %7s  %7s  %7s  %7s  Cmd' % ('PID', 'Block', 'Core', '%CPU', 'Total', 'Acquire', 'Process', 'Reserve', 'Gbits/s')
+            output = '%6s  %15s  %4s  %5s  %7s  %7s  %7s  %7s  Cmd' % ('PID', 'Block', 'Core', '%CPU', 'Total', 'Acquire', 'Process', 'Reserve')
             csize = size[1]-len(output)
             if csize < 0:
                 csize = 0
@@ -398,7 +397,7 @@ def main(args):
                     c = '%5.1f' % c
                 except KeyError:
                     c = '%5s' % ' '
-                output = '%6i  %15s  %4i  %5s  %7.3f  %7.3f  %7.3f  %7.3f  %7.3f %s' % (d['pid'], d['name'][:15], d['core'], c, d['total'], d['acquire'], d['process'], d['reserve'], d['gbps'], d['cmd'][:csize+3])
+                output = '%6i  %15s  %4i  %5s  %7.3f  %7.3f  %7.3f  %7.3f  %s' % (d['pid'], d['name'][:15], d['core'], c, d['total'], d['acquire'], d['process'], d['reserve'], d['cmd'][:csize+3])
                 k = _add_line(scr, k, 0, output, std)
                 if k >= size[0] - 1:
                     break
@@ -413,8 +412,7 @@ def main(args):
     except KeyboardInterrupt:
         pass
 
-    except Exception as err:
-        error = err
+    except Exception as error:
         exc_type, exc_value, exc_traceback = sys.exc_info()
         fileObject = StringIO()
         traceback.print_tb(exc_traceback, file=fileObject)
diff --git a/tools/pipeline2dot.py b/tools/pipeline2dot.py
index d2c772bd8..b6f20f640 100755
--- a/tools/pipeline2dot.py
+++ b/tools/pipeline2dot.py
@@ -353,3 +353,4 @@ def main(args):
                         help='exclude associated blocks')
     args = parser.parse_args()
     main(args)
+    

From aa064b1524a3f0b9407646bec4d69edf3922fec9 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Thu, 1 Jun 2023 09:26:00 -0600
Subject: [PATCH 73/91] Focus on packet formats for now.

---
 src/Makefile.in         |   8 +-
 src/beamform.cpp        |  87 ----------
 src/bf_xgpu.cpp         | 284 ------------------------------
 src/bifrost/beamform.h  |  73 --------
 src/bifrost/bf_xgpu.h   |  25 ---
 src/cublas_beamform.cu  | 373 ----------------------------------------
 src/cublas_beamform.cuh |  37 ----
 7 files changed, 1 insertion(+), 886 deletions(-)
 delete mode 100644 src/beamform.cpp
 delete mode 100644 src/bf_xgpu.cpp
 delete mode 100644 src/bifrost/beamform.h
 delete mode 100644 src/bifrost/bf_xgpu.h
 delete mode 100644 src/cublas_beamform.cu
 delete mode 100644 src/cublas_beamform.cuh

diff --git a/src/Makefile.in b/src/Makefile.in
index 8c4e09740..360859f60 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -69,13 +69,7 @@ ifeq ($(HAVE_CUDA),1)
   reduce.o \
   fir.o \
   guantize.o \
-  gunpack.o \
-  beamform.o \
-  cublas_beamform.o
-ifdef XGPU
-  LIBBIFROST_OBJS += \
-  bf_xgpu.o
-endif
+  gunpack.o
 endif
 
 JIT_SOURCES ?= \
diff --git a/src/beamform.cpp b/src/beamform.cpp
deleted file mode 100644
index 16d874222..000000000
--- a/src/beamform.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <math.h>
-#include <limits.h>
-#include <unistd.h>
-
-#include <bifrost/array.h>
-#include <bifrost/common.h>
-#include <utils.hpp>
-#include <stdlib.h>
-#include <stdio.h>
-#include <iostream>
-
-#include "cublas_beamform.cuh"
-
-extern "C" {
-
-
-/* 
- * Initialize the beamformer library
- */
-
-BFstatus bfBeamformInitialize(
-  int gpudev,
-  int ninputs,
-  int nchans,
-  int ntimes,
-  int nbeams,
-  int ntime_blocks
-) {
-  // TODO: array size checking
-  // TODO: use complex data types
-  cublas_beamform_init(
-    gpudev,
-    ninputs,
-    nchans,
-    ntimes,
-    nbeams,
-    ntime_blocks
-  );
-  return BF_STATUS_SUCCESS;
-}
-
-BFstatus bfBeamformRun(BFarray *in, BFarray *out, BFarray *weights) {
-  if (in->space != BF_SPACE_CUDA) {
-    fprintf(stderr, "Beamformer input buffer must be in CUDA space\n");
-    return BF_STATUS_INVALID_SPACE;
-  }
-  if (out->space != BF_SPACE_CUDA) {
-    fprintf(stderr, "Beamformer output buffer must be in CUDA space\n");
-    return BF_STATUS_INVALID_SPACE;
-  }
-  if (weights->space != BF_SPACE_CUDA) {
-    fprintf(stderr, "Beamformer weights buffer must be in CUDA space\n");
-    return BF_STATUS_INVALID_SPACE;
-  }
-  cublas_beamform((unsigned char *)in->data, (float *)out->data, (float *)weights->data);
-  return BF_STATUS_SUCCESS;
-}
-  
-BFstatus bfBeamformIntegrate(BFarray *in, BFarray *out, int ntimes_sum) {
-  if (in->space != BF_SPACE_CUDA) {
-    fprintf(stderr, "Beamformer input buffer must be in CUDA space\n");
-    return BF_STATUS_INVALID_SPACE;
-  }
-  if (out->space != BF_SPACE_CUDA) {
-    fprintf(stderr, "Beamformer output buffer must be in CUDA space\n");
-    return BF_STATUS_INVALID_SPACE;
-  }
-  cublas_beamform_integrate((float *)in->data, (float *)out->data, ntimes_sum);
-  return BF_STATUS_SUCCESS;
-}
-
-BFstatus bfBeamformIntegrateSingleBeam(BFarray *in, BFarray *out, int ntimes_sum, int beam_index) {
-  if (in->space != BF_SPACE_CUDA) {
-    fprintf(stderr, "Beamformer input buffer must be in CUDA space\n");
-    return BF_STATUS_INVALID_SPACE;
-  }
-  if (out->space != BF_SPACE_CUDA) {
-    fprintf(stderr, "Beamformer output buffer must be in CUDA space\n");
-    return BF_STATUS_INVALID_SPACE;
-  }
-  cublas_beamform_integrate_single_beam((float *)in->data, (float *)out->data, ntimes_sum, beam_index);
-  return BF_STATUS_SUCCESS;
-}
-} // C
diff --git a/src/bf_xgpu.cpp b/src/bf_xgpu.cpp
deleted file mode 100644
index 5a340247d..000000000
--- a/src/bf_xgpu.cpp
+++ /dev/null
@@ -1,284 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <math.h>
-#include <limits.h>
-#include <unistd.h>
-
-#include <bifrost/array.h>
-#include <bifrost/common.h>
-#include <utils.hpp>
-#include <stdlib.h>
-#include <stdio.h>
-#include <iostream>
-
-#include <xgpu.h>
-
-extern "C" {
-
-static XGPUContext context;
-static XGPUInfo info;
-
-/* 
- * Initialize the xGPU library by providing
- * a pointer to the input and output data (on the host),
- * and a GPU device ID
- */
-BFstatus bfXgpuInitialize(BFarray *in, BFarray *out, int gpu_dev) {
-  int xgpu_error;
-  xgpuInfo(&info);
-  // Don't bother checking sizes if the input space is CUDA.
-  // We're not going to use these arrays anyway
-  if (in->space != BF_SPACE_CUDA) {
-      if (num_contiguous_elements(in) != info.vecLength) {
-        fprintf(stderr, "ERROR: xgpuInitialize: number of elements in != vecLength\n");
-        fprintf(stderr, "number of elements in: %lu\n", num_contiguous_elements(in));
-        fprintf(stderr, "vecLength: %llu\n", info.vecLength);
-        return BF_STATUS_INVALID_SHAPE;
-      }
-      if (num_contiguous_elements(out) != info.matLength) {
-        fprintf(stderr, "ERROR: xgpuInitialize: number of elements out != matLength\n");
-        fprintf(stderr, "number of elements out: %lu\n", num_contiguous_elements(out));
-        fprintf(stderr, "matLength: %llu\n", info.matLength);
-        return BF_STATUS_INVALID_SHAPE;
-      }
-  }
-  context.array_h = (SwizzleInput *)in->data;
-  context.array_len = info.vecLength;
-  context.matrix_h = (Complex *)out->data;
-  context.matrix_len = info.matLength;
-  if (in->space == BF_SPACE_CUDA) {
-      xgpu_error = xgpuInit(&context, gpu_dev | XGPU_DONT_REGISTER | XGPU_DONT_MALLOC_GPU);
-  } else {
-      xgpu_error = xgpuInit(&context, gpu_dev);
-  }
-  if (xgpu_error != XGPU_OK) {
-    fprintf(stderr, "ERROR: xgpuInitialize: call returned %d\n", xgpu_error);
-    return BF_STATUS_INTERNAL_ERROR;
-  } else {
-    return BF_STATUS_SUCCESS;
-  }
-}
-
-/*
- * Call the xGPU kernel.
- * in : pointer to input data array on host
- * out: pointer to output data array on host
- * doDump : if 1, this is the last call in an integration, and results
- *          will be copied to the host.
- */
-BFstatus bfXgpuCorrelate(BFarray *in, BFarray *out, int doDump) {
-  if (in->space == BF_SPACE_CUDA) {
-    return BF_STATUS_UNSUPPORTED_SPACE;
-  }
-  if (out->space == BF_SPACE_CUDA) {
-    return BF_STATUS_UNSUPPORTED_SPACE;
-  }
-  int xgpu_error;
-  context.array_h = (SwizzleInput *)in->data;
-  context.array_len = info.vecLength;
-  context.matrix_h = (Complex *)out->data;
-  context.matrix_len = info.matLength;
-  xgpu_error = xgpuCudaXengineSwizzle(&context, doDump ? SYNCOP_DUMP : SYNCOP_SYNC_TRANSFER);
-  if (doDump) {
-    xgpuClearDeviceIntegrationBuffer(&context);
-  }
-  if (xgpu_error != XGPU_OK) {
-    return BF_STATUS_INTERNAL_ERROR;
-  } else {
-    return BF_STATUS_SUCCESS;
-  }
-}
-
-/*
- * Call the xGPU kernel having pre-copied data to device memory.
- * Note that this means xGPU can't take advantage of its inbuild
- * copy/compute pipelining.
- * in : pointer to input data array on device
- * out: pointer to output data array on device
- * doDump : if 1, this is the last call in an integration, and results
- *          will be copied to the host.
- */
-static int newAcc = 1; // flush vacc on the first call
-BFstatus bfXgpuKernel(BFarray *in, BFarray *out, int doDump) {
-  if (in->space != BF_SPACE_CUDA) {
-    return BF_STATUS_UNSUPPORTED_SPACE;
-  }
-  if (out->space != BF_SPACE_CUDA) {
-    return BF_STATUS_UNSUPPORTED_SPACE;
-  }
-  int xgpu_error;
-  context.array_h = (ComplexInput *)in->data;
-  context.array_len = info.vecLength;
-  context.matrix_h = (Complex *)out->data;
-  context.matrix_len = info.matLength;
-  xgpu_error = xgpuCudaXengineSwizzleKernel(&context, doDump ? SYNCOP_DUMP : 0, newAcc,
-		                           (SwizzleInput *)in->data, (Complex *)out->data);
-
-  if (newAcc) {
-    newAcc = 0;
-  }
-  if (doDump) {
-    newAcc = 1;
-  }
-  if (xgpu_error != XGPU_OK) {
-    fprintf(stderr, "ERROR: xgpuKernel: kernel call returned %d\n", xgpu_error);
-    return BF_STATUS_INTERNAL_ERROR;
-  } else {
-    return BF_STATUS_SUCCESS;
-  }
-}
-
-/*
- * Given an xGPU accumulation buffer, grab a subset of visibilities from
- * and gather them in a new buffer, in order
- * chan x visibility x complexity [int32] (if transpose=0)
- * or
- * visibility x chan * complexity [int32] (if transpose!=0)
- * BFarray *in : Pointer to a BFarray with storage in device memory, where xGPU results reside
- * BFarray *in : Pointer to a BFarray with storage in device memory where collated visibilities should be written.
- * BFarray *vismap : array of visibilities in [[polA, polB], [polC, polD], ... ] form.
- * int nchan_sum: The number of frequency channels to sum over
- */
-BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, BFarray *conj, int nchan_sum, int transpose) {
-  long long unsigned nvis = num_contiguous_elements(vismap);
-  int xgpu_error;
-  if (in->space != BF_SPACE_CUDA) {
-    return BF_STATUS_UNSUPPORTED_SPACE;
-  }
-  if (out->space != BF_SPACE_CUDA) {
-    return BF_STATUS_UNSUPPORTED_SPACE;
-  }
-  if (vismap->space != BF_SPACE_CUDA) {
-    return BF_STATUS_UNSUPPORTED_SPACE;
-  }
-  if (conj->space != BF_SPACE_CUDA) {
-    return BF_STATUS_UNSUPPORTED_SPACE;
-  }
-  if (num_contiguous_elements(conj) != nvis) {
-    return BF_STATUS_INVALID_SHAPE;
-  }
-  xgpu_error = xgpuCudaSubSelect(&context, (Complex *)in->data, (Complex *)out->data, (int *)vismap->data, (int *)conj->data, nvis, nchan_sum, transpose);
-  if (xgpu_error != XGPU_OK) {
-    fprintf(stderr, "ERROR: xgpuKernel: kernel call returned %d\n", xgpu_error);
-    return BF_STATUS_INTERNAL_ERROR;
-  } else {
-    return BF_STATUS_SUCCESS;
-  }
-}
-
-/* Computes the triangular index of an (i,j) pair as shown here...
- * NB: Output is valid only if i >= j.
- *
- *      i=0  1  2  3  4..
- *     +---------------
- * j=0 | 00 01 03 06 10
- *   1 |    02 04 07 11
- *   2 |       05 08 12
- *   3 |          09 13
- *   4 |             14
- *   :
- */
-int tri_index(int i, int j){
-  return (i * (i+1))/2 + j;
-  }
-
-/* Returns index into the GPU's register tile ordered output buffer for the
- * real component of the cross product of inputs in0 and in1.  Note that in0
- * and in1 are input indexes (i.e. 0 based) and often represent antenna and
- * polarization by passing (2*ant_idx+pol_idx) as the input number (NB: ant_idx
- * and pol_idx are also 0 based).  Return value is valid if in1 >= in0.  The
- * corresponding imaginary component is located xgpu_info.matLength words after
- * the real component.
- */
-int regtile_index(int in0, int in1, int nstand) {
-  int a0, a1, p0, p1;
-  int num_words_per_cell=4;
-  int quadrant, quadrant_index, quadrant_size, cell_index, pol_offset, index;
-  a0 = in0 >> 1;
-  a1 = in1 >> 1;
-  p0 = in0 & 1;
-  p1 = in1 & 1;
-  
-  // Index within a quadrant
-  quadrant_index = tri_index(a1/2, a0/2);
-  // Quadrant for this input pair
-  quadrant = 2*(a0&1) + (a1&1);
-  // Size of quadrant
-  quadrant_size = (nstand/2 + 1) * nstand/4;
-  // Index of cell (in units of cells)
-  cell_index = quadrant*quadrant_size + quadrant_index;
-  // Pol offset
-  pol_offset = 2*p1 + p0;
-  // Word index (in units of words (i.e. floats) of real component
-  index = (cell_index * num_words_per_cell) + pol_offset;
-  return index;
-  }
-
-BFstatus bfXgpuGetOrder(BFarray *antpol_to_input, BFarray *antpol_to_bl, BFarray *is_conj) {
-  int *ip_map = (int *)antpol_to_input->data; // indexed by stand, pol
-  int *bl_map = (int *)antpol_to_bl->data;    // indexed by stand0, stand1, pol0, pol1
-  int *conj_map = (int *)is_conj->data;       // indexed by stand0, stand1, pol0, pol1
-  int s0, s1, p0, p1, i0, i1;
-  int nstand, npol;
-  XGPUInfo xgpu_info;
-  xgpuInfo(&xgpu_info);
-  nstand = xgpu_info.nstation;
-  npol = xgpu_info.npol;
-  for (s0=0; s0<nstand; s0++) {
-    for (s1=0; s1<nstand; s1++) {
-      for (p0=0; p0<npol; p0++) {
-        for (p1=0; p1<npol; p1++) {
-          i0 = ip_map[npol*s0 + p0];
-          i1 = ip_map[npol*s1 + p1];
-          // Set the conj map such that bl_map[stand0, stand1, pol0, pol1] has conjugation convention
-          // stand0,pol0 * conj(stand1,pol1)
-          if (i1 > i0) {
-            bl_map[s0*nstand*npol*npol + s1*npol*npol + p0*npol + p1] = regtile_index(i0, i1, nstand);
-            conj_map[s0*nstand*npol*npol + s1*npol*npol + p0*npol + p1] = 1;
-          } else {
-            bl_map[s0*nstand*npol*npol + s1*npol*npol + p0*npol + p1] = regtile_index(i1, i0, nstand);
-            conj_map[s0*nstand*npol*npol + s1*npol*npol + p0*npol + p1] = 0;
-          }
-        }
-      }
-    }
-  }
-  return BF_STATUS_SUCCESS;
-}
-
-/*
- * Reorder a DP4A xGPU spec output into something more sane, throwing
- * away unwanted baselines and re-concatenating real and imag parts in
- * a reasonable way.
- * Also remove conjugation weirdness so baselines a,b has conjugation a*conj(b)
- */
-BFstatus bfXgpuReorder(BFarray *xgpu_output, BFarray *reordered, BFarray *baselines, BFarray *is_conjugated) {
-  XGPUInfo xgpu_info;
-  xgpuInfo(&xgpu_info);
-
-  int *output = (int *)reordered->data;
-  int *input_r = (int *)xgpu_output->data;
-  int *input_i = input_r + xgpu_info.matLength;
-  int *bl = (int *)baselines->data;
-  int *conj = (int *)is_conjugated->data;
-  int n_bl = num_contiguous_elements(baselines);
-  int xgpu_n_input = xgpu_info.nstation * xgpu_info.npol;
-  int n_chan = xgpu_info.nfrequency;
-  int i, c;
-  // number of entries per channel
-  size_t regtile_chan_len = 4 * 4 * xgpu_n_input/4 * (xgpu_n_input/4+1) / 2;
-  for (i=0; i<n_bl; i++) {
-    for (c=0; c<n_chan; c++) {
-      output[2*i*n_chan + 2*c]     = input_r[c*regtile_chan_len + bl[i]];
-      if ( conj[i] == 1 ) {
-        output[2*i*n_chan + 2*c + 1] = -input_i[c*regtile_chan_len + bl[i]];
-      } else {
-        output[2*i*n_chan + 2*c + 1] = input_i[c*regtile_chan_len + bl[i]];
-      }
-    }
-  }
-  return BF_STATUS_SUCCESS;
-}
-
-} // C
diff --git a/src/bifrost/beamform.h b/src/bifrost/beamform.h
deleted file mode 100644
index 8002a691a..000000000
--- a/src/bifrost/beamform.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#include <bifrost/common.h>
-#include <bifrost/array.h>
-
-/*
- * gpudev: GPU device ID to use
- * ninputs: Number of inputs (single-polarization) to the beamformer
- * nchans: Number of frequency channels
- * ntimes: Number of time samples per beamforming call
- * nbeams: Number of beams to generate. If using ntime_blocks > 0, beams=N will deliver N/2 beams.
- *         (See bfBeamformRun)
- * ntime_blocks: Number of time blocks to output. Eg. if ntimes=1000 and ntime_blocks=10, the beamformer
-                 will integrate over 100 samples per call. Set to 0 for no accumulation, in which case
-                 raw beam voltages are output.
- */
-BFstatus bfBeamformInitialize(
-  int gpudev,
-  int ninputs,
-  int nchans,
-  int ntimes,
-  int nbeams,
-  int ntime_blocks
-);
-
-/*
- * in:  Pointer to ntime x nchan x ninputs x 4+4 bit data block
- * out: Pointer to output data.
- *        If ntime_blocks > 0: !!!!UNTESTED, probably broken!!!!
- *          For the purposes of generating dynamic spectra, beam 2n and 2n+1 are considered
- *          to be two pols of the same pointing, and are cross-multipled and summed over
- *          ntimes/ntime_blocks to form the output array:
- *            nbeam/2 x ntime_blocks x nchan x 4 x float32 (powers, XX, YY, re(XY, im(XY))
- *          Note that this means for N dual-pol beam pointings, the beamformer should be
- *          constructed with nbeams=2N. This isn't very efficient, but makes it easy to deal
- *          with arbitrary polarization orderings in the input buffer (suitable beamforming
- *          coefficients can make appropriate single-pol beam pairs).
- *        If ntime_blocks = 0:
- *          Data are returned as voltages, in order:
- *            nchan x nbeam x ntime x complex64 beamformer block
- *                            
- * weights -- pointer to nbeams x nchans x ninputs x complex64 weights
- */
-BFstatus bfBeamformRun(
-  BFarray *in,
-  BFarray *out,
-  BFarray *weights
-);
-
-/*
- * Take the output of bfBeamformRun with ntime_blocks = 0, and perform transposing and integration
- * of data, to deliver a time integrated dual-pol dynamic spectra of the form:
- *   nbeam/2 x ntime/ntimes_sum x nchan x 4 x float32 (powers, XX, YY, re(XY, im(XY))
- * I.e., the format which would be returned by bfBeamformRun if ntime_blocks > 0
- */
-BFstatus bfBeamformIntegrate(
-  BFarray *in,
-  BFarray *out,
-  int ntimes_sum
-);
-
-/*
- * Take the output of bfBeamformRun with ntime_blocks = 0, and 
- * deliver a time integrated dual-pol dynamic spectra for a single beam of the form:
- *   ntime/ntimes_sum x nchan x 4 x float32 (powers, XX, YY, re(XY, im(XY))
- * 
- * ntime_sum: the number of times to integrate
- * beam_index: The beam to select (if beam_index=N, beams N and N+1 will be used as a polarization pair)
- */
-BFstatus bfBeamformIntegrateSingleBeam(
-  BFarray *in,
-  BFarray *out,
-  int ntimes_sum,
-  int beam_index
-);
diff --git a/src/bifrost/bf_xgpu.h b/src/bifrost/bf_xgpu.h
deleted file mode 100644
index b8f21b783..000000000
--- a/src/bifrost/bf_xgpu.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef BF_XGPU_H_INCLUDE_GUARD_
-#define BF_XGPU_H_INCLUDE_GUARD_
-
-#include <bifrost/common.h>
-#include <bifrost/array.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//TODO: figure out how to make ctypesgen to the right thing with python generation
-//#if(BF_XGPU_ENABLED)
-BFstatus bfXgpuInitialize(BFarray *in, BFarray *out, int gpu_dev);
-BFstatus bfXgpuCorrelate(BFarray *in, BFarray *out, int doDump);
-BFstatus bfXgpuKernel(BFarray *in, BFarray *out, int doDump);
-BFstatus bfXgpuSubSelect(BFarray *in, BFarray *out, BFarray *vismap, BFarray *conj, int nchan_sum, int transpose);
-BFstatus bfXgpuGetOrder(BFarray *antpol_to_input, BFarray *antpol_to_bl, BFarray *is_conj);
-BFstatus bfXgpuReorder(BFarray *xgpu_output, BFarray *reordered, BFarray *baselines, BFarray *is_conjugated);
-//#endif // BF_XGPU_ENABLED
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // BF_XGPU_H_INCLUDE_GUARD
diff --git a/src/cublas_beamform.cu b/src/cublas_beamform.cu
deleted file mode 100644
index 7e2b7bdbb..000000000
--- a/src/cublas_beamform.cu
+++ /dev/null
@@ -1,373 +0,0 @@
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
-#include <stdio.h>
-
-#include "cublas_beamform.cuh"
-
-__constant__ float lut[16] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, -8.0, -7.0, -6.0, -5.0, -4.0, -3.0, -2.0, -1.0};
-
-// Transpose time x chan x pol x 4+4 bit to
-// chan x pol x time x 32+32 bit float
-__global__ void trans_4bit_to_float(unsigned char *in,
-                                    float *out,
-                                    int n_pol,
-                                    int n_chan,
-                                    int n_time
-                                   ) {
-  //long long int tid = blockDim.y*blockDim.x*blockIdx.y + blockDim.x*blockIdx.x + threadIdx.x;
-  //int pol  = tid % n_pol;
-  //int chan = (tid / n_pol) % n_chan;
-  //int time = (tid / (n_pol * n_chan));
-  int time = blockIdx.x;
-  int chan = blockIdx.y;
-  int pol = TRANSPOSE_POL_BLOCK_SIZE*threadIdx.x;
-  unsigned char *in_off = in + time*n_chan*n_pol + chan*n_pol + pol; // 4+4 bit
-  float *out_off = out + 2*( chan*n_pol*n_time + pol*n_time + time); // 32+32 bit
-  //long long int old_index = time*n_chan*n_pol + chan*n_pol + pol;
-  //long long int new_index = chan*n_pol*n_time + pol*n_time + time;
-  float real, imag;
-  unsigned char temp;
-  #pragma unroll
-  for (int i=0; i<TRANSPOSE_POL_BLOCK_SIZE; i++) {
-    temp = *in_off++;
-    //real = lut[in[old_index+i] >> 4];
-    //imag = lut[in[old_index+i] & 0b1111];
-    //out[2*(new_index+i)] = real;
-    //out[2*(new_index+i)+1] = imag;
-    real = lut[(temp >> 4) & 0b1111];
-    imag = lut[temp & 0b1111];
-    out_off[0] = real;
-    out_off[1] = imag;
-    out_off += 2*n_time;
-  }
-}
-
-// Transpose chan x beam x pol x time x 32+32 float to
-// beam x time[part-summed] x chan x [XX,YY,XY*_r,XY*_i] x 32 float
-// Each thread deals with two pols of a beam, and sums over n_time_sum time samples
-// n_beam is the _output_ number of beams. I.e., the number of dual-pol beams
-__global__ void trans_output_and_sum(float *in,
-                                    float *out,
-                                    int n_chan,
-                                    int n_beam,
-                                    int n_time,
-                                    int n_time_sum
-                                   ) {
-  int chan = blockIdx.x;
-  int beam = blockIdx.y;
-  int time = threadIdx.x;
-  // n_beam here is a dual pol beam
-  // input is: chan x beam x pol [2] x time x complexity
-  long long int old_index = 2*(chan*n_beam*2*n_time + beam*2*n_time + time*n_time_sum); // start index for n_time/n_time_sum samples
-  // output is: beam x time x chan x pol-products [4]
-  long long int new_index = 4*(beam*(n_time / n_time_sum)*n_chan + time*n_chan + chan);
-  float xx=0., yy=0., xy_r=0., xy_i=0.; // accumulator registers
-  float x_r, x_i, y_r, y_i;
-  int t;
-  for (t=0; t<n_time_sum; t++) {
-    x_r = in[old_index + 2*t];
-    x_i = in[old_index + 2*t + 1];
-    y_r = in[old_index + 2*(n_time + t)];
-    y_i = in[old_index + 2*(n_time + t) + 1];
-    xx = xx + x_r*x_r + x_i*x_i;
-    yy = yy + y_r*y_r + y_i*y_i;
-    xy_r = xy_r + x_r*y_r + x_i*y_i;
-    xy_i = xy_i + x_i*y_r - x_r*y_i;
-  }
-  out[new_index] = xx;
-  out[new_index+1] = yy;
-  out[new_index+2] = xy_r;
-  out[new_index+3] = xy_i;
-}
-
-// Take an input of order chan x beam x pol x time x 32+32 float and generate
-// a single beam of time[part-summed] x chan x [XX,YY,XY*_r,XY*_i] x 32 float
-// Each thread deals with two pols of a beam (beam_index and beam_index+1)
-// and sums over n_time_sum time samples
-__global__ void trans_output_and_sum_single_beam(float *in,
-                                                 float *out,
-                                                 int n_chan,
-                                                 int n_beam,
-                                                 int n_time,
-                                                 int n_time_sum,
-                                                 int beam_index
-                                                ) {
-  int chan = blockIdx.x;
-  int beam = beam_index;
-  int time = threadIdx.x;
-  long long int old_index = chan*n_beam*n_time*2 + beam*n_time*2 + time*n_time_sum*2; // start index for n_time/n_time_sum samples
-  long long int new_index = time*n_chan*4 + chan*4;
-  float xx=0., yy=0., xy_r=0., xy_i=0.;
-  float x_r, x_i, y_r, y_i;
-  int t;
-  for (t=0; t<n_time_sum; t++) {
-    x_r = in[old_index + 2*t];
-    x_i = in[old_index + 2*t + 1];
-    y_r = in[old_index + 2*n_time + 2*t];
-    y_i = in[old_index + 2*n_time + 2*t + 1];
-    xx = xx + x_r*x_r + x_i*x_i;
-    yy = yy + y_r*y_r + y_i*y_i;
-    xy_r = xy_r + x_r*y_r + x_i*y_i;
-    xy_i = xy_i + x_i*y_r - x_r*y_i;
-  }
-  out[new_index] = xx;
-  out[new_index+1] = yy;
-  out[new_index+2] = xy_r;
-  out[new_index+3] = xy_i;
-}
-
-__global__ void complex2pow(float *in, float *out, int N) {
-  int tid = blockDim.x * blockIdx.x + threadIdx.x;
-  if (tid < N) {
-    out[tid] = (in[2*tid]*in[2*tid] + in[2*tid + 1]*in[2*tid + 1]);
-  }
-}
-
-#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
-inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
-{
-   if (code != cudaSuccess) 
-   {
-      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
-      if (abort) exit(code);
-   }
-}
-
-/* Error checking for cuBLAS */
-void gpuBLASchk(int errval) {
-	if (errval != CUBLAS_STATUS_SUCCESS) {
-		fprintf(stderr, "Failed BLAS call, error code %d\n", errval);
-	}
-}
-
-/*
-Transa	CUDA_OP_N	Matrix A (Fourier coefficient matrix) is not transposed
-Transb	CUDA_OP_N	Matrix B (input data) is not transposed
-M	N_BEAMS	Number of rows of A/C
-N	N_TIMESTEPS_PER_CALL	Number of columns of B/C
-K	N_ANTENNAS	Number of columns of A, number of rows in B
-Alpha	1.0/127	Fourier coefficients are 8-bit ints so must be normalized to 1
-Atype	CUDA_C_8I	Data type of Fourier coefficient matrix (i.e. 8-bit + 8-bit integers)
-Lda	N_BEAMS	Leading dimension of Fourier coefficient matrix
-strideA	N_BEAMS*N_ANTENNAS	Stride between different frequencies in A
-Btype	CUDA_C_8I	Data type of input data matrix
-Ldb	N_ANTENNAS	Leading dimension of input matrix
-StrideB	N_ANTENNAS*N_TIMESTEPS_PER_CALL	Stride between different frequencies in input matrix
-Beta	0	Zero out the output data tensor
-Ctype	CUDA_C_32F	Data type of output matrix
-Ldc	N_BEAMS	Leading Dimension of output matrix
-strideC	N_BEAMS*N_TIMESTEPS_PER_CALL	Stride between different frequencies in output matrix
-batchCount	NCHANS	How many frequencies
-computeType	CUDA_C_32F	Internal datatype
-Algo	CUBLAS_GEMM_DEFAULT_TENSOR_OP	Use tensor operations
-*/
-
-struct beamform_context {
-  int gpu_device;
-  cublasHandle_t handle;
-  cudaStream_t stream;
-  // GPU buffers for intermediate data products
-  float *in32_d; // Reordered [to inputs x chans x times x complexity], floatified 4-bit input data
-  float *out_d;  // CUBLAS beamformer output [ntime x nchan x nbeam x complexity]
-  float *weights_d; // Beamformer coefficients
-  unsigned char *in4_d;     // 4-bit input data
-  float *sum_out_d; // Time-summed output data, used only if ntimeblocks > 0
-  int ninputs;   // Number of inputs (ants * pols)
-  int npols;     // Number of polarizations per antenna
-  int nchans;    // Number of channels input
-  int ntimes;    // Number of time samples input
-  int nbeams;    // Number of beams output
-  int ntimeblocks; // Number of time samples to keep after summation
-};
-
-static struct beamform_context context;
-
-void cublas_beamform_destroy(){
-  cudaFree(context.in32_d);
-  if (context.ntimeblocks > 0) {
-    cudaFree(context.out_d);
-  }
-}
-
-void cublas_beamform_init(int device, int ninputs, int nchans, int ntimes, int nbeams, int ntimeblocks) {
-  context.gpu_device = device;
-  gpuErrchk( cudaSetDevice(context.gpu_device) );
-  gpuErrchk(cudaStreamCreate(&(context.stream)));
-  gpuBLASchk(cublasCreate(&(context.handle)));
-  gpuBLASchk(cublasSetStream(context.handle, context.stream));
-  gpuBLASchk(cublasSetPointerMode(context.handle, CUBLAS_POINTER_MODE_HOST));
-  //gpuBLASchk(cublasSetPointerMode(context.handle, CUBLAS_POINTER_MODE_DEVICE));
-  gpuBLASchk(cublasSetMathMode(context.handle, CUBLAS_TENSOR_OP_MATH));
-
-  context.ninputs = ninputs;
-  context.nchans = nchans;
-  context.ntimes = ntimes;
-  context.nbeams = nbeams;
-  context.ntimeblocks = ntimeblocks;
-
-  // Internally allocate intermediate buffers
-  gpuErrchk( cudaMalloc(&context.in32_d,  ninputs * nchans * ntimes * 2 * sizeof(float)) );
-  //gpuErrchk( cudaMemcpy(context.in32_d, in32_h, ninputs * nchans * ntimes * 2 * sizeof(float), cudaMemcpyHostToDevice) );
-  // If the context is initialized with ntimeblocks=0, then we do no summing so don't
-  // need the intermediate buffer allocated internally.
-  if (ntimeblocks > 0) {
-    gpuErrchk( cudaMalloc(&context.out_d,   ntimes * nchans * nbeams * 2 * sizeof(float)) );
-  }
-}
-
-void cublas_beamform(unsigned char *in4_d, float *out_d, float *weights_d) {
-  // Transpose input data and promote to float.
-  // CUBLAS doesn't support float coeffs with int8 data
-  dim3 transBlockGrid(context.ntimes, context.nchans);
-  dim3 transThreadGrid(context.ninputs / TRANSPOSE_POL_BLOCK_SIZE);
-  trans_4bit_to_float<<<transBlockGrid, transThreadGrid, 0, context.stream>>>(
-      in4_d,
-      context.in32_d,
-      context.ninputs,
-      context.nchans,
-      context.ntimes
-  );
-  cudaStreamSynchronize(context.stream);
-
-  // If we are integrating beam powers, put the
-  // GEM output in the internal intermediate
-  // buffer. If not, then write beamformer output
-  // to the address given by the user.
-  float *gem_out_d;
-  if (context.ntimeblocks > 0) {
-    gem_out_d = context.out_d;
-  } else {
-    gem_out_d = out_d;
-  }
-
-  // Beamform using GEMM
-  float alpha = 1.0;
-  float beta = 0.0;
-  // GEMM:
-  // C <= alpha*AB + beta*C
-  // alpha = 1.0
-  // beta = 0.0
-  // A matrix: beamforming coeffs (NBEAMS * NANTS)
-  // B matrix: data matrix (NANTS * NTIMES)
-    
-  /*
-  gpuBLASchk(cublasGemmStridedBatchedEx(
-    context.handle,
-    CUBLAS_OP_T, // transpose A?
-    CUBLAS_OP_T, // transpose B?
-    context.nbeams,      // m
-    context.ntimes,      // n
-    context.ninputs,     // k
-    // Coeffs: [nchans x] nbeams x ninputs (m x k)
-    &alpha,              // alpha
-    weights_d,           // A
-    CUDA_C_32F,          // A type
-    context.ninputs,      // Lda
-    context.nbeams*context.ninputs,// strideA : stride size
-    // Data: [nchans x] ninputs x ntimes (k x n)
-    context.in32_d,      // B
-    CUDA_C_32F,          // B type
-    context.ntimes,     // Ldb
-    context.ninputs*context.ntimes,// strideB : stride size
-    &beta,       // beta
-    // Results
-    gem_out_d,       // C
-    CUDA_C_32F,          // Ctype 
-    context.nbeams,      // Ldc
-    context.nbeams*context.ntimes,// Stride C
-    context.nchans,      // batchCount
-    CUDA_C_32F,          // compute type
-    CUBLAS_GEMM_DEFAULT_TENSOR_OP // algo
-    ));
-  */
-  
-  gpuBLASchk(cublasGemmStridedBatchedEx(
-    context.handle,
-    CUBLAS_OP_N, // transpose A?
-    CUBLAS_OP_N, // transpose B?
-    context.ntimes,      // n
-    context.nbeams,      // m
-    context.ninputs,     // k
-    &alpha,              // alpha
-    //
-    // Data: [nchans x] ninputs x ntimes (k x n)
-    context.in32_d,      // B
-    CUDA_C_32F,          // B type
-    context.ntimes,     // Ldb
-    context.ninputs*context.ntimes,// strideB : stride size
-    //
-    // Coeffs: [nchans x] nbeams x ninputs (m x k)
-    weights_d,           // A
-    CUDA_C_32F,          // A type
-    context.ninputs,      // Lda
-    context.nbeams*context.ninputs,// strideA : stride size
-    //
-    &beta,       // beta
-    // Results
-    gem_out_d,       // C
-    CUDA_C_32F,          // Ctype 
-    context.ntimes,      // Ldc
-    context.nbeams*context.ntimes,// Stride C
-    context.nchans,      // batchCount
-    CUDA_C_32F,          // compute type
-    CUBLAS_GEMM_DEFAULT_TENSOR_OP // algo
-    ));
-  cudaStreamSynchronize(context.stream);
-
-  // Optionally:
-  if (context.ntimeblocks > 0) {
-    // Create XX, YY, XY beam powers.
-    // Sum over `ntimes_sum` samples
-    // Write to the user-provided output buffer
-    int ntimes_sum = context.ntimes / context.ntimeblocks;
-    dim3 sumBlockGrid(context.nchans, context.nbeams/2);
-    dim3 sumThreadGrid(context.ntimes / ntimes_sum);
-    trans_output_and_sum<<<sumBlockGrid, sumThreadGrid, 0, context.stream>>>(
-      gem_out_d,
-      out_d,
-      context.nchans,
-      context.nbeams/2,
-      context.ntimes,
-      ntimes_sum
-    );
-    cudaStreamSynchronize(context.stream);
-  }
-}
-
-/* Take input data of form
-   nchan x nbeam x time x complex64 [i.e. the output of cublas_beamform],
-   sum over ``ntimes_sum``, interpret beams `2n` and `2n+1` as a dual pol pair,
-   and generate an output array of form
-   nbeam x ntime / ntime_sum x nchan x nbeam / 2 x 4 x complex64.
-   The last 4-element axis holds XX, YY, Re(XY), Im(XY
-*/
-void cublas_beamform_integrate(float *in_d, float *out_d, int ntimes_sum) {
-  // Create XX, YY, XY beam powers.
-  // Sum over `ntimes_sum` samples
-  dim3 sumBlockGrid(context.nchans, context.nbeams/2);
-  dim3 sumThreadGrid(context.ntimes / ntimes_sum);
-  trans_output_and_sum<<<sumBlockGrid, sumThreadGrid>>>(
-    in_d,
-    out_d,
-    context.nchans,
-    context.nbeams/2,
-    context.ntimes,
-    ntimes_sum
-  );
-}
-
-void cublas_beamform_integrate_single_beam(float *in_d, float *out_d, int ntimes_sum, int beam_index) {
-  // Create XX, YY, XY beam powers.
-  // Sum over `ntimes_sum` samples
-  dim3 sumBlockGrid(context.nchans);
-  dim3 sumThreadGrid(context.ntimes / ntimes_sum);
-  trans_output_and_sum_single_beam<<<sumBlockGrid, sumThreadGrid>>>(
-    in_d,
-    out_d,
-    context.nchans,
-    context.nbeams/2,
-    context.ntimes,
-    ntimes_sum,
-    beam_index
-  );
-}
diff --git a/src/cublas_beamform.cuh b/src/cublas_beamform.cuh
deleted file mode 100644
index a33919e3b..000000000
--- a/src/cublas_beamform.cuh
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef _CUBLAS_BEAMFORM_H
-#define _CUBLAS_BEAMFORM_H
-
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
-#include <stdio.h>
-
-// Transpose time x chan x pol x 4+4 bit to
-#define TRANSPOSE_POL_BLOCK_SIZE 8
-// chan x pol x time x 32+32 bit float
-__global__ void trans_4bit_to_float(unsigned char *in,
-                                    float *out,
-                                    int n_pol,
-                                    int n_chan,
-                                    int n_time
-                                   );
-
-// Transpose chan x beam x pol x pol x 32+32 float to
-// beam x time[part-summed] x chan x [XX,YY,XY*_r,XY*_i] x 32 float
-// Each thread deals with two pols of a beam, and sums over n_time_sum time samples
-__global__ void trans_output_and_sum(float *in,
-                                    float *out,
-                                    int n_chan,
-                                    int n_beam,
-                                    int n_time,
-                                    int n_time_sum
-                                   );
-
-__global__ void complex2pow(float *in, float *out, int N);
-
-void cublas_beamform_destroy();
-void cublas_beamform(unsigned char *in4_d, float *sum_out_d, float *weights_d);
-void cublas_beamform_integrate(float *in_d, float *sum_out_d, int ntimes_sum);
-void cublas_beamform_integrate_single_beam(float *in_d, float *sum_out_d, int ntimes_sum, int beam_index);
-void cublas_beamform_init(int device, int ninputs, int nchans, int ntimes, int nbeams, int ntimeblocks);
-
-#endif

From 4e1d3180175f4f13fd86d17259ac03c478a33dc5 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Thu, 1 Jun 2023 09:31:56 -0600
Subject: [PATCH 74/91] Focus on packet formats for now.

---
 .gitignore | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/.gitignore b/.gitignore
index 31f0cd390..65ddc68fe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -109,9 +109,3 @@ target/
 # Benchmarking files
 test/benchmarks/development_vs_gpuspec/with_bifrost/
 test/benchmarks/development_vs_gpuspec/without_bifrost/
-
-# ctags files
-python/bifrost/tags
-python/tags
-src/tags
-tags

From 49722302b0b39858d6899de4892cee06f1bcbec1 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Thu, 1 Jun 2023 09:39:38 -0600
Subject: [PATCH 75/91] Move verbs buffer control into configure.

---
 configure.ac            | 7 +++++++
 src/bifrost/config.h.in | 1 +
 src/ib_verbs.hpp        | 6 ++----
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/configure.ac b/configure.ac
index c727be91c..588ae1baa 100644
--- a/configure.ac
+++ b/configure.ac
@@ -220,6 +220,13 @@ AS_IF([test x$enable_verbs != xno],
                     [AC_SUBST([HAVE_VERBS], [1])
                      LIBS="$LIBS -libverbs"])])
 
+AC_ARG_WITH([verbs_npktbuf],
+           [AS_HELP_STRING([--with-verbs-npktbuf=N],
+                           [default Infiniband verbs buffer size in packets (default=8192)])],
+           [],
+           [with_verbs_npktbuf=8192])
+AC_SUBST([VERBS_NPKTBUF], [$with_verbs_npktbuf])
+
 #
 # RDMA
 #
diff --git a/src/bifrost/config.h.in b/src/bifrost/config.h.in
index c20559a15..1e3e52ed4 100644
--- a/src/bifrost/config.h.in
+++ b/src/bifrost/config.h.in
@@ -63,6 +63,7 @@ extern "C" {
 #define BF_HWLOC_ENABLED @HAVE_HWLOC@
 #define BF_VMA_ENABLED @HAVE_VMA@
 #define BF_VERBS_ENABLED @HAVE_VERBS@
+#define BF_VERBS_NPKTBUF @VERBS_NPKTBUF@
 #define BF_RDMA_ENABLED @HAVE_RDMA@
 #define BF_RDMA_MAXMEM @RDMA_MAXMEM@
 
diff --git a/src/ib_verbs.hpp b/src/ib_verbs.hpp
index 2d4176e30..2c3fab598 100644
--- a/src/ib_verbs.hpp
+++ b/src/ib_verbs.hpp
@@ -28,6 +28,8 @@
 
 #pragma once
 
+#include <bifrost/config.h>
+
 #include <stdexcept>
 #include <string>
 #include <sstream>
@@ -58,10 +60,6 @@
 #define BF_VERBS_NQP 1
 #endif
 
-#ifndef BF_VERBS_NPKTBUF
-#define BF_VERBS_NPKTBUF 32768
-#endif
-
 #ifndef BF_VERBS_WCBATCH
 #define BF_VERBS_WCBATCH 16
 #endif

From d07b153f019068568ff14c74f2e2685ddc8179dc Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Thu, 1 Jun 2023 13:24:30 -0600
Subject: [PATCH 76/91] lwa352_vbeam_* -> vbeam_*

---
 src/formats/formats.hpp                     |  2 +-
 src/formats/{lwa352_vbeam.hpp => vbeam.hpp} | 10 +++++-----
 src/packet_writer.hpp                       | 14 +++++++-------
 3 files changed, 13 insertions(+), 13 deletions(-)
 rename src/formats/{lwa352_vbeam.hpp => vbeam.hpp} (86%)

diff --git a/src/formats/formats.hpp b/src/formats/formats.hpp
index 225811895..e52aefd39 100644
--- a/src/formats/formats.hpp
+++ b/src/formats/formats.hpp
@@ -37,4 +37,4 @@
 #include "ibeam.hpp"
 #include "snap2.hpp"
 #include "pbeam.hpp"
-#include "lwa352_vbeam.hpp"
+#include "vbeam.hpp"
diff --git a/src/formats/lwa352_vbeam.hpp b/src/formats/vbeam.hpp
similarity index 86%
rename from src/formats/lwa352_vbeam.hpp
rename to src/formats/vbeam.hpp
index e7c7fe402..29f98d749 100644
--- a/src/formats/lwa352_vbeam.hpp
+++ b/src/formats/vbeam.hpp
@@ -30,7 +30,7 @@
 
 #include "base.hpp"
 
-struct __attribute__((packed)) lwa352_vbeam_hdr_type {
+struct __attribute__((packed)) vbeam_hdr_type {
     uint64_t sync_word;
     uint64_t sync_time;
     uint64_t time_tag;
@@ -41,14 +41,14 @@ struct __attribute__((packed)) lwa352_vbeam_hdr_type {
     uint32_t npol;
 };
 
-class LWA352VBeamHeaderFiller : virtual public PacketHeaderFiller {
+class VBeamHeaderFiller : virtual public PacketHeaderFiller {
 public:
-    inline int get_size() { return sizeof(lwa352_vbeam_hdr_type); }
+    inline int get_size() { return sizeof(vbeam_hdr_type); }
     inline void operator()(const PacketDesc* hdr_base,
                            BFoffset          framecount,
                            char*             hdr) {
-        lwa352_vbeam_hdr_type* header = reinterpret_cast<lwa352_vbeam_hdr_type*>(hdr);
-        memset(header, 0, sizeof(lwa352_vbeam_hdr_type));
+        vbeam_hdr_type* header = reinterpret_cast<vbeam_hdr_type*>(hdr);
+        memset(header, 0, sizeof(vbeam_hdr_type));
         
         header->sync_word        = 0xAABBCCDD00000000L;
         header->time_tag         = htobe64(hdr_base->seq);
diff --git a/src/packet_writer.hpp b/src/packet_writer.hpp
index b1272f5fb..acab4d31c 100644
--- a/src/packet_writer.hpp
+++ b/src/packet_writer.hpp
@@ -485,14 +485,14 @@ class BFpacketwriter_tbf_impl : public BFpacketwriter_impl {
     }
 };
 
-class BFpacketwriter_lwa352_vbeam_impl : public BFpacketwriter_impl {
+class BFpacketwriter_vbeam_impl : public BFpacketwriter_impl {
     ProcLog            _type_log;
 public:
-    inline BFpacketwriter_lwa352_vbeam_impl(PacketWriterThread* writer,
+    inline BFpacketwriter_vbeam_impl(PacketWriterThread* writer,
                                    int                 nsamples)
      : BFpacketwriter_impl(writer, nullptr, nsamples, BF_DTYPE_CF32),
        _type_log((std::string(writer->get_name())+"/type").c_str()) {
-        _filler = new LWA352VBeamHeaderFiller();
+        _filler = new VBeamHeaderFiller();
         _type_log.update("type : %s\n", "tbf");
     }
 };
@@ -526,8 +526,8 @@ BFstatus BFpacketwriter_create(BFpacketwriter* obj,
         nsamples = 4096;
     } else if( format == std::string("tbf") ) {
         nsamples = 6144;
-    } else if( std::string(format).substr(0, 13) == std::string("lwa352_vbeam_") ) {
-        // e.g. "lwa352_vbeam_184" is a 184-channel voltage beam"
+    } else if( std::string(format).substr(0, 6) == std::string("vbeam_") ) {
+        // e.g. "vbeam_184" is a 184-channel voltage beam"
         int nchan = std::atoi((std::string(format).substr(13, std::string(format).length())).c_str());
         nsamples = 2*nchan; // 2 polarizations. Natively 32-bit floating complex (see implementation class)
     }
@@ -579,8 +579,8 @@ BFstatus BFpacketwriter_create(BFpacketwriter* obj,
     } else if( format == std::string("tbf") ) {
         BF_TRY_RETURN_ELSE(*obj = new BFpacketwriter_tbf_impl(writer, nsamples),
                            *obj = 0);
-    } else if( std::string(format).substr(0, 13) == std::string("lwa352_vbeam_") ) {
-        BF_TRY_RETURN_ELSE(*obj = new BFpacketwriter_lwa352_vbeam_impl(writer, nsamples),
+    } else if( std::string(format).substr(0, 6) == std::string("vbeam_") ) {
+        BF_TRY_RETURN_ELSE(*obj = new BFpacketwriter_vbeam_impl(writer, nsamples),
                            *obj = 0);
     } else {
         return BF_STATUS_UNSUPPORTED;

From b37bf9bdc44225dba45eea6f74397fef6cfb1bc5 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Thu, 1 Jun 2023 13:33:31 -0600
Subject: [PATCH 77/91] Remove some debugging.

---
 src/packet_capture.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/packet_capture.cpp b/src/packet_capture.cpp
index 115d72505..454e610a4 100644
--- a/src/packet_capture.cpp
+++ b/src/packet_capture.cpp
@@ -268,12 +268,9 @@ BFpacketcapture_status BFpacketcapture_impl::recv() {
 				ret = BF_CAPTURE_CONTINUED;
 			}
 		}
-	    BF_PRINTD("_bufs.size(): " << _bufs.size());
 		if( _bufs.size() == 2 ) {
-	        BF_PRINTD("Committing buffer");
 			this->commit_buf();
 		}
-	    BF_PRINTD("Rseerving buffer");
 		this->reserve_buf();
 	} else {
 		

From 9d67edb1f8feeaed91799cb60181a852b42bfbb8 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Thu, 1 Jun 2023 13:34:27 -0600
Subject: [PATCH 78/91] Catch here as well.

---
 src/ib_verbs.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ib_verbs.hpp b/src/ib_verbs.hpp
index 2c3fab598..fb5b0b5ea 100644
--- a/src/ib_verbs.hpp
+++ b/src/ib_verbs.hpp
@@ -430,7 +430,7 @@ class Verbs {
         _verbs.send_mr_size = (size_t) BF_VERBS_NPKTBUF*BF_VERBS_NQP * _pkt_size_max;
         _verbs.send_mr_buf = (uint8_t *) ::mmap(NULL, _verbs.send_mr_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_LOCKED, -1, 0);
 
-        check_error(_verbs.send_mr_buf == MAP_FAILED,
+        check_error(_verbs.send_mr_buf == MAP_FAILED ? -1 : 0,
                     "allocate memory region buffer");
         check_error(::mlock(_verbs.send_mr_buf, _verbs.send_mr_size),
                     "lock memory region buffer");

From bc6add7cb1e72432b37c4e548dcccf5e22e53d65 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Thu, 1 Jun 2023 13:40:42 -0600
Subject: [PATCH 79/91] Attempt to add a non-AVX version of the snap2 packet
 processor.

---
 src/formats/snap2.hpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/formats/snap2.hpp b/src/formats/snap2.hpp
index ea93244b3..228405b93 100644
--- a/src/formats/snap2.hpp
+++ b/src/formats/snap2.hpp
@@ -163,21 +163,28 @@ class SNAP2Processor : virtual public PacketProcessor {
             // Output buffer order is chans * npol_total * complexity
             // Spacing with which channel chunks are copied depends
             // on the total number of channels/pols in the system
+            int c=0;
+#if defined BF_AVX_ENABLED && BF_AVX_ENABLED
             __m256i *dest_p;
             __m256i vecbuf[2];
             uint64_t *in64 = (uint64_t *)in;
-            int c;
             dest_p = (__m256i *)(out + (words_per_chan_out * (pkt_chan)) + pol_offset_out);
+#endif
             //if((pol_offset_out == 0) && (pkt_chan==0) && ((pkt->seq % 120)==0) ){
             //   fprintf(stderr, "nsrc: %d seq: %d, dest_p: %p obuf idx %d, obuf offset %lu, nseq_per_obuf %d, seq0 %d, nbuf: %d\n", pkt->nsrc, pkt->seq, dest_p, obuf_idx, obuf_offset, nseq_per_obuf, seq0, nbuf);
             //}
             for(c=0; c<pkt->nchan; c++) {
+#if defined BF_AVX_ENABLED && BF_AVX_ENABLED
                vecbuf[0] = _mm256_set_epi64x(in64[3], in64[2], in64[1], in64[0]);
                vecbuf[1] = _mm256_set_epi64x(in64[7], in64[6], in64[5], in64[4]);
                _mm256_stream_si256(dest_p, vecbuf[0]);
                _mm256_stream_si256(dest_p+1,   vecbuf[1]);
                in64 += 8;
                dest_p += words_per_chan_out;
+#else
+               ::memcpy(&out[pkt->src + pkt->nsrc*chan],
+                        &in[c], sizeof(otype));
+#endif
             }
     }
 

From 74320677fb1e07fc2e0b6d77d941e463b8218c82 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Thu, 1 Jun 2023 13:42:47 -0600
Subject: [PATCH 80/91] Now in formats/base.hpp.

---
 src/formats/snap2.hpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/formats/snap2.hpp b/src/formats/snap2.hpp
index 228405b93..f34a82c4f 100644
--- a/src/formats/snap2.hpp
+++ b/src/formats/snap2.hpp
@@ -30,9 +30,6 @@
 
 #include "base.hpp"
 
-#include <immintrin.h> // SSE
-#include <emmintrin.h>
-
 // TODO: parameterize somewhere. This isn't
 // related to the packet formatting
 #define PIPELINE_NPOL 704

From d9736b400336ab5560ea6141eee0c233da5be129 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Thu, 1 Jun 2023 13:43:47 -0600
Subject: [PATCH 81/91] Revert to the ibverb-support version.

---
 src/formats/cor.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/formats/cor.hpp b/src/formats/cor.hpp
index c7c7b0787..bf1660ded 100644
--- a/src/formats/cor.hpp
+++ b/src/formats/cor.hpp
@@ -72,7 +72,7 @@ class CORDecoder : virtual public PacketDecoder {
 	    pkt->sync         = pkt_hdr->sync_word;
 	    pkt->time_tag     = be64toh(pkt_hdr->time_tag);
 	    pkt->decimation   = be32toh(pkt_hdr->navg);
-	    pkt->seq          = pkt->time_tag / pkt->decimation;
+	    pkt->seq          = pkt->time_tag / 196000000 / (pkt->decimation / 100);
 	    pkt->nsrc         = _nsrc;
 	    pkt->src          = (stand0*(2*(nstand-1)+1-stand0)/2 + stand1 + 1 - _src0)*nserver \
 	                        + (server - 1);

From 70bf2918ddfb2c5ae0989bf699f3b1ec08c7152f Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Thu, 1 Jun 2023 13:45:37 -0600
Subject: [PATCH 82/91] Ugh.

---
 src/formats/snap2.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/formats/snap2.hpp b/src/formats/snap2.hpp
index f34a82c4f..2e5672853 100644
--- a/src/formats/snap2.hpp
+++ b/src/formats/snap2.hpp
@@ -179,7 +179,7 @@ class SNAP2Processor : virtual public PacketProcessor {
                in64 += 8;
                dest_p += words_per_chan_out;
 #else
-               ::memcpy(&out[pkt->src + pkt->nsrc*chan],
+               ::memcpy(&out[pkt->src + pkt->nsrc*c],
                         &in[c], sizeof(otype));
 #endif
             }

From 21210ba5d4ba44ddb03533939c1a3e3a8bea7875 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Thu, 1 Jun 2023 13:48:47 -0600
Subject: [PATCH 83/91] This block seems to be causing problems in CI.

---
 src/packet_capture.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/packet_capture.cpp b/src/packet_capture.cpp
index 454e610a4..60d3c3bf7 100644
--- a/src/packet_capture.cpp
+++ b/src/packet_capture.cpp
@@ -91,15 +91,6 @@ int PacketCaptureThread::run(uint64_t seq_beg,
 		BF_PRINTD("HERE" << " " << _pkt.seq << " < " << seq_beg);
 		_have_pkt = false;
 		if( less_than(_pkt.seq, seq_beg) ) {
-            // If lots [TODO: what is lots] of packets are late
-            // return. Otherwise a seq reset can lead to being stuck
-            // here endlessly counting late packets.
-            if( less_than(_pkt.seq + 1000*nseq_per_obuf, seq_beg) ) {
-		   		fprintf(stderr, "Breaking from packet receive because of so many late packets\n"); 
-		        _have_pkt = true;
-                ret = CAPTURE_SUCCESS;
-                break;
-            }
 			++_stats.nlate;
 			_stats.nlate_bytes += _pkt.payload_size;
 			++_src_stats[_pkt.src].nlate;

From 02e2e941d4337973e192e7d28f0eca645f78e717 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Thu, 20 Jul 2023 15:19:30 -0600
Subject: [PATCH 84/91] Clean up header filler.

---
 src/formats/snap2.hpp | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/src/formats/snap2.hpp b/src/formats/snap2.hpp
index 2e5672853..c1a17b600 100644
--- a/src/formats/snap2.hpp
+++ b/src/formats/snap2.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, The Bifrost Authors. All rights reserved.
+ * Copyright (c) 2019-2023, The Bifrost Authors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -30,11 +30,6 @@
 
 #include "base.hpp"
 
-// TODO: parameterize somewhere. This isn't
-// related to the packet formatting
-#define PIPELINE_NPOL 704
-#define PIPELINE_NCHAN 32
-
 // All entries are network (i.e. big) endian
 struct __attribute__((packed)) snap2_hdr_type {
         uint64_t  seq;       // Spectra counter == packet counter
@@ -122,8 +117,6 @@ class SNAP2Decoder : virtual public PacketDecoder {
 };
 
 class SNAP2Processor : virtual public PacketProcessor {
-protected:
-    int _pipeline_nchan = PIPELINE_NCHAN;
 public:
     inline void operator()(const PacketDesc* pkt,
                            uint64_t          seq0,
@@ -205,20 +198,21 @@ class SNAP2Processor : virtual public PacketProcessor {
 
 class SNAP2HeaderFiller : virtual public PacketHeaderFiller {
 public:
-    inline int get_size() { return sizeof(chips_hdr_type); }
+    inline int get_size() { return sizeof(snap2_hdr_type); }
     inline void operator()(const PacketDesc* hdr_base,
                            BFoffset          framecount,
                            char*             hdr) {
-        chips_hdr_type* header = reinterpret_cast<chips_hdr_type*>(hdr);
-        memset(header, 0, sizeof(chips_hdr_type));
+        snap2_hdr_type* header = reinterpret_cast<snap2_hdr_type*>(hdr);
+        memset(header, 0, sizeof(snap2_hdr_type));
+        
+        header->seq           = htobe64(hdr_base->seq);
+        header->npol          = 2;
+        header->npol_tot      = 2;
+        header->nchan         = hdr_base->nchan;
+        header->nchan_tot     = hdr_base->nchan * hdr_base->nsrc;
+        header->chan_block_id = hdr-base->src;
+        header->chan0         = htons(hdr_base->chan0);
+        header->pol0          = 0;
         
-        header->roach    = hdr_base->src + 1;
-        header->gbe      = hdr_base->tuning;
-        header->nchan    = hdr_base->nchan;
-        header->nsubband = 1;       // Should be changable?
-        header->subband  = 0;       // Should be changable?
-        header->nroach   = hdr_base->nsrc;
-        header->chan0    = htons(hdr_base->chan0);
-        header->seq      = htobe64(hdr_base->seq);
     }
 };

From e783a2463b8be3f0b698beecefee0d765a45e723 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Thu, 20 Jul 2023 15:21:43 -0600
Subject: [PATCH 85/91] Re-enable missing source blanking.

---
 src/formats/snap2.hpp | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/formats/snap2.hpp b/src/formats/snap2.hpp
index c1a17b600..10aee3788 100644
--- a/src/formats/snap2.hpp
+++ b/src/formats/snap2.hpp
@@ -179,20 +179,18 @@ class SNAP2Processor : virtual public PacketProcessor {
     }
 
     inline void blank_out_source(uint8_t* data,
-                                     int      src,
-                                     int      nsrc,
-                                     int      nchan,
-                                     int      nseq) {
-            //fprintf(stderr, "TRYING TO BLANK OUT A SOURCE WITH MISSING PACKETS. BUT BLANKING NOT IMPLEMENTED\n");
-            //typedef aligned256_type otype;
-            //fprintf(stderr, "You really better not be here\n");
-            //otype* __restrict__ aligned_data = (otype*)data;
-            //for( int t=0; t<nseq; ++t ) {
-            //        for( int c=0; c<nchan; ++c ) {
-            //                ::memset(&aligned_data[src + nsrc*(c + nchan*t)],
-            //                         0, sizeof(otype));
-            //        }
-            //}
+                                 int      src,
+                                 int      nsrc,
+                                 int      nchan,
+                                 int      nseq) {
+            typedef aligned256_type otype;
+            otype* __restrict__ aligned_data = (otype*)data;
+            for( int t=0; t<nseq; ++t ) {
+                   for( int c=0; c<nchan; ++c ) {
+                           ::memset(&aligned_data[src + nsrc*(c + nchan*t)],
+                                    0, sizeof(otype));
+                   }
+            }
     }
 };
 

From ae61dd8c2967f3004a5be935973d880d0704e449 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Thu, 20 Jul 2023 15:33:34 -0600
Subject: [PATCH 86/91] Ugh.

---
 src/formats/snap2.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/formats/snap2.hpp b/src/formats/snap2.hpp
index 10aee3788..2b9a85322 100644
--- a/src/formats/snap2.hpp
+++ b/src/formats/snap2.hpp
@@ -208,7 +208,7 @@ class SNAP2HeaderFiller : virtual public PacketHeaderFiller {
         header->npol_tot      = 2;
         header->nchan         = hdr_base->nchan;
         header->nchan_tot     = hdr_base->nchan * hdr_base->nsrc;
-        header->chan_block_id = hdr-base->src;
+        header->chan_block_id = hdr_base->src;
         header->chan0         = htons(hdr_base->chan0);
         header->pol0          = 0;
         

From 75c74f6a89937a4df8fa9e3dd9cdd2603c1cf3c8 Mon Sep 17 00:00:00 2001
From: jaycedowell <jaycedowell@gmail.com>
Date: Mon, 4 Dec 2023 16:00:01 -0700
Subject: [PATCH 87/91] Fix an off-by-one error from a header mis-match in how
 packets are generated vs received.

---
 src/formats/pbeam.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/formats/pbeam.hpp b/src/formats/pbeam.hpp
index a7dc966a7..bb910fbec 100644
--- a/src/formats/pbeam.hpp
+++ b/src/formats/pbeam.hpp
@@ -64,7 +64,7 @@ class PBeamDecoder: virtual public PacketDecoder {
 	    int                   pld_size = pkt_size - sizeof(pbeam_hdr_type);
 	    pkt->decimation =    be16toh(pkt_hdr->navg);
 	    pkt->time_tag   =    be64toh(pkt_hdr->seq);
-	    pkt->seq   =         (pkt->time_tag - 1) / pkt->decimation;
+	    pkt->seq   =         pkt->time_tag / pkt->decimation;
 	    //pkt->nsrc  =         pkt_hdr->nserver;
 	    pkt->nsrc  =         _nsrc;
 	    pkt->src   =         (pkt_hdr->beam - _src0) * pkt_hdr->nserver + (pkt_hdr->server - 1);

From b0254e8d522ee39e4ad5f103ca61a45b16572238 Mon Sep 17 00:00:00 2001
From: jaycedowell <jaycedowell@gmail.com>
Date: Fri, 8 Dec 2023 11:14:55 -0700
Subject: [PATCH 88/91] I thought this was in configure.ac already.

---
 src/bifrost/config.h.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/bifrost/config.h.in b/src/bifrost/config.h.in
index 1e3e52ed4..752a5c067 100644
--- a/src/bifrost/config.h.in
+++ b/src/bifrost/config.h.in
@@ -63,7 +63,7 @@ extern "C" {
 #define BF_HWLOC_ENABLED @HAVE_HWLOC@
 #define BF_VMA_ENABLED @HAVE_VMA@
 #define BF_VERBS_ENABLED @HAVE_VERBS@
-#define BF_VERBS_NPKTBUF @VERBS_NPKTBUF@
+#define BF_VERBS_NPKTBUF 32768
 #define BF_RDMA_ENABLED @HAVE_RDMA@
 #define BF_RDMA_MAXMEM @RDMA_MAXMEM@
 

From b16dc15ada2da3de73bbacccb59b786567cfd77e Mon Sep 17 00:00:00 2001
From: jaycedowell <jaycedowell@gmail.com>
Date: Fri, 8 Dec 2023 13:06:47 -0700
Subject: [PATCH 89/91] Fix a bad merge.

---
 src/ib_verbs.hpp | 31 +++++++------------------------
 1 file changed, 7 insertions(+), 24 deletions(-)

diff --git a/src/ib_verbs.hpp b/src/ib_verbs.hpp
index fcf557660..37b559068 100644
--- a/src/ib_verbs.hpp
+++ b/src/ib_verbs.hpp
@@ -28,8 +28,6 @@
 
 #pragma once
 
-#include <bifrost/config.h>
-
 #include <stdexcept>
 #include <string>
 #include <sstream>
@@ -52,6 +50,10 @@
 #define BF_VERBS_NQP 1
 #endif
 
+#ifndef BF_VERBS_NPKTBUF
+#define BF_VERBS_NPKTBUF 8192
+#endif
+
 #ifndef BF_VERBS_WCBATCH
 #define BF_VERBS_WCBATCH 16
 #endif
@@ -276,32 +278,13 @@ class Verbs {
         _verbs.mr_size = (size_t) BF_VERBS_NPKTBUF*BF_VERBS_NQP * _pkt_size_max;
         _verbs.mr_buf = (uint8_t *) ::mmap(NULL, _verbs.mr_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_LOCKED, -1, 0);
 
-        check_error(_verbs.mr_buf == MAP_FAILED ? -1 : 0,
-                    "allocate memory region buffer");
+        check_error(_verbs.mr_buf == MAP_FAILED,
+                    "allocate receive memory region buffer");
         check_error(::mlock(_verbs.mr_buf, _verbs.mr_size),
                     "lock receive memory region buffer");
         _verbs.mr = ibv_reg_mr(_verbs.pd, _verbs.mr_buf, _verbs.mr_size, IBV_ACCESS_LOCAL_WRITE);
         check_null(_verbs.mr,
-                   "register memory region");
-                   
-        // Start Send
-        
-        _verbs.send_pkt_buf = (bf_ibv_send_pkt*) ::malloc(BF_VERBS_NPKTBUF*BF_VERBS_NQP * sizeof(struct bf_ibv_send_pkt));
-        check_null(_verbs.send_pkt_buf, 
-                   "allocate send packet buffer");
-        ::memset(_verbs.send_pkt_buf, 0, BF_VERBS_NPKTBUF*BF_VERBS_NQP * sizeof(struct bf_ibv_send_pkt));
-        _verbs.send_mr_size = (size_t) BF_VERBS_NPKTBUF*BF_VERBS_NQP * _pkt_size_max;
-        _verbs.send_mr_buf = (uint8_t *) ::mmap(NULL, _verbs.send_mr_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_LOCKED, -1, 0);
-
-        check_error(_verbs.send_mr_buf == MAP_FAILED ? -1 : 0,
-                    "allocate memory region buffer");
-        check_error(::mlock(_verbs.send_mr_buf, _verbs.send_mr_size),
-                    "lock memory region buffer");
-        _verbs.send_mr = ibv_reg_mr(_verbs.pd, _verbs.send_mr_buf, _verbs.send_mr_size, IBV_ACCESS_LOCAL_WRITE);
-        check_null(_verbs.send_mr,
-                   "register memory region");
-        
-        // End Send
+                   "register receive memory region");
     }
     void destroy_buffers() {
         int failures = 0;

From 749f0ba87d1d60b694ce28ddfc3eaf666c08ff7c Mon Sep 17 00:00:00 2001
From: jaycedowell <jaycedowell@gmail.com>
Date: Fri, 8 Dec 2023 13:17:22 -0700
Subject: [PATCH 90/91] Give up on packet pacing for now.

---
 src/ib_verbs_send.hpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/ib_verbs_send.hpp b/src/ib_verbs_send.hpp
index 387696b23..498dd9873 100644
--- a/src/ib_verbs_send.hpp
+++ b/src/ib_verbs_send.hpp
@@ -334,9 +334,11 @@ class VerbsSend {
                         std::cout << "_verbs.offload_csum: " << (int) _verbs.offload_csum << std::endl;
                         
                         _verbs.hardware_pacing = 0;
+                        /*
                         if( ibv_is_qpt_supported(ibv_dev_attr.packet_pacing_caps.supported_qpts, IBV_QPT_RAW_PACKET) ) {
                           _verbs.hardware_pacing = ibv_dev_attr.packet_pacing_caps.qp_rate_limit_max;  
                         }
+                        */
                         std::cout << "_verbs.hardware_pacing: " << (int) _verbs.hardware_pacing << std::endl;
                         
                         break;
@@ -731,6 +733,7 @@ class VerbsSend {
       }
       
       // Apply the rate limit
+      /*
       ibv_qp_rate_limit_attr rl_attr;
       ::memset(&rl_attr, 0, sizeof(ibv_qp_rate_limit_attr));
       rl_attr.rate_limit = rate_limit;
@@ -740,7 +743,7 @@ class VerbsSend {
           check_error(ibv_modify_qp_rate_limit(_verbs.qp[i], &rl_attr),
                       "set queue pair rate limit");
       }
-      
+      */
       _rate_limit = rate_limit;
     }
     inline void get_ethernet_header(bf_ethernet_hdr* hdr) {

From e32a32b1ed8b01c440c67f35a2a7fb35ac17b2b6 Mon Sep 17 00:00:00 2001
From: jaycedowell <jayce.dowell@gmail.com>
Date: Tue, 19 Mar 2024 15:13:50 -0600
Subject: [PATCH 91/91] Nice.

---
 configure.ac | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/configure.ac b/configure.ac
index 6344dc53c..2033cc92f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -139,14 +139,6 @@ AS_IF([test x$enable_verbs != xno],
                     [AC_SUBST([HAVE_VERBS], [1])
                      LIBS="$LIBS -libverbs"])])
 
-<<<<<<< HEAD
-AC_ARG_WITH([verbs_npktbuf],
-           [AS_HELP_STRING([--with-verbs-npktbuf=N],
-                           [default Infiniband verbs buffer size in packets (default=8192)])],
-           [],
-           [with_verbs_npktbuf=8192])
-AC_SUBST([VERBS_NPKTBUF], [$with_verbs_npktbuf])
-=======
  AC_ARG_WITH([rx-buffer-size],
             [AS_HELP_STRING([--with-rx-buffer-size=N],
                             [default Infiniband verbs receive buffer size in packets (default=8192)])],
@@ -188,7 +180,6 @@ AC_SUBST([VERBS_NPKTBUF], [$with_verbs_npktbuf])
           [AC_SUBST([VERBS_SEND_PACING], [0])
            AC_MSG_RESULT([no])])
        ])
->>>>>>> upstream/ibverb-support
 
 #
 # RDMA