diff --git a/JAERO/DSP.cpp b/JAERO/DSP.cpp
index 6e62705..cd4d7b1 100644
--- a/JAERO/DSP.cpp
+++ b/JAERO/DSP.cpp
@@ -525,27 +525,30 @@ double DiffDecode::UpdateSoft(double soft)
     double retval = 0;
 
     // if the previous value is a zero and the current also zero just return zero
-    if(soft < 0 && lastsoftstate < 0){
+    if(soft < 0 && lastsoftstate < 0)
+    {
 
         // last value is negative so just return to indicate a zero
         retval = lastsoftstate;
         lastsoftstate = soft;
     }
 
-    // if the previous value is one and the current also one
-    else if(soft > 0 && lastsoftstate > 0){
+     // if the previous value is one and the current also one
+     else if(soft > 0 && lastsoftstate > 0)
+     {
 
         // last value is postive so flip sign to indicate zero
         retval =- lastsoftstate;
         lastsoftstate = soft;
-    }
-    else{
+     }
+      else
+      {
 
 
         // retval and soft have different signs, so always return positive
-        retval = std::abs(lastsoftstate);
+        retval = std::fabs(lastsoftstate);
         lastsoftstate = soft;
-    }
+      }
 
     return retval;
 
diff --git a/JAERO/aerol.cpp b/JAERO/aerol.cpp
index 8e9664a..8b90393 100644
--- a/JAERO/aerol.cpp
+++ b/JAERO/aerol.cpp
@@ -1008,17 +1008,18 @@ QByteArray &AeroL::Decode(QVector<short> &bits, bool soft)//0 bit --> oldest bit
     quint16 bit=0;
     quint16 soft_bit=0;
 
-      for(int i=0;i<bits.size();i++)
+    for(int i=0;i<bits.size();i++)
     {
 
-
-        if(soft){
+        if(soft)
+        {
             if(((uchar)bits[i])>=128)bit=1;
             else bit=0;
-
-        }else{
-            bit=bits[i];
         }
+         else
+         {
+            bit=bits[i];
+         }
         soft_bit=bits[i];
 
         //for burst mode to allow tolerance of UW
@@ -1044,15 +1045,15 @@ QByteArray &AeroL::Decode(QVector<short> &bits, bool soft)//0 bit --> oldest bit
                     gotsync=0;
                 } else gotsync_last=0;
             }
-            else
-            {
+             else
+             {
                 gotsync=preambledetectorphaseinvariantreal.Update(bit);
                 if(!gotsync_last)
                 {
                     gotsync_last=gotsync;
                     gotsync=0;
                 } else gotsync_last=0;
-            }
+             }
 
             //for 10500 UW should be about 80 samples after start of packet signal from demodulator if not we have a false positive
             if(gotsync&&burstmode)
@@ -1086,47 +1087,55 @@ QByteArray &AeroL::Decode(QVector<short> &bits, bool soft)//0 bit --> oldest bit
                 {
                     bit=1-bit;
 
-                    if(soft_bit > 128){
+                    if(soft_bit > 128)
+                    {
                          soft_bit = 255-soft_bit;
-                    } else if (soft_bit < 128){
-                        soft_bit = 255-soft_bit;
                     }
+                     else if (soft_bit < 128)
+                     {
+                        soft_bit = 255-soft_bit;
+                     }
                 }
             }
 
         } //non 10500 burst mode use a phase invariant preamble detector
-        else if(burstmode){
-
-
+         else if(burstmode)
+         {
 
-          bool inverted = mskBurstDetector.inverted;
+            bool inverted = mskBurstDetector.inverted;
 
-          gotsync=mskBurstDetector.Update(bit);
+            gotsync=mskBurstDetector.Update(bit);
 
-          if( muw > 250 && gotsync){
+            if( muw > 250 && gotsync)
+            {
 
-              if(inverted != mskBurstDetector.inverted){
-                  mskBurstDetector.inverted = inverted;
-              }
-               gotsync = false;
+                if(inverted != mskBurstDetector.inverted)
+                {
+                    mskBurstDetector.inverted = inverted;
+                }
+                gotsync = false;
 
-           }
+            }
 
-           if(mskBurstDetector.inverted){
+            if(mskBurstDetector.inverted)
+            {
 
                 bit=1-bit;
 
-                if(soft_bit > 128){
+                if(soft_bit > 128)
+                {
                      soft_bit = 255-soft_bit;
-                } else if (soft_bit < 128){
-                    soft_bit = 255-soft_bit;
                 }
+                 else if (soft_bit < 128)
+                 {
+                    soft_bit = 255-soft_bit;
+                 }
             }
-        }
-        else{
-
+         }
+          else
+          {
             gotsync=preambledetector.Update(bit);
-        }
+          }
 
         if(cntr<1000000000)cntr++;
         if(cntr<16)
@@ -1186,28 +1195,16 @@ QByteArray &AeroL::Decode(QVector<short> &bits, bool soft)//0 bit --> oldest bit
 
                 RTChannelDeleaveFECScram::ReturnResult result;
 
-                if(useingOQPSK){
-
-                    if(soft){
-                    result = rtchanneldeleavefecscram.update(soft_bit);
-                    }else
-                    {
-                        result = rtchanneldeleavefecscram.update(bit);
-
-                    }
-
-                }else{
-
-                    if(soft){
-
-
-                    result = rtchanneldeleavefecscram.updateMSK(soft_bit);
-                    }
-                    else{
-                        result = rtchanneldeleavefecscram.updateMSK(bit);
-                    }
-
+                if(useingOQPSK)
+                {
+                    if(soft)result = rtchanneldeleavefecscram.update(soft_bit);
+                     else result = rtchanneldeleavefecscram.update(bit);
                 }
+                 else
+                 {
+                    if(soft)result = rtchanneldeleavefecscram.updateMSK(soft_bit);
+                     else result = rtchanneldeleavefecscram.updateMSK(bit);
+                 }
 
                 switch(result)
                 {
@@ -1410,7 +1407,8 @@ QByteArray &AeroL::Decode(QVector<short> &bits, bool soft)//0 bit --> oldest bit
 
             }
 
-            else{
+             else
+             {
 
                 // its a p channel
 
@@ -1773,11 +1771,11 @@ QByteArray &AeroL::Decode(QVector<short> &bits, bool soft)//0 bit --> oldest bit
 
 
                             }
-                            else
-                            {
+                             else
+                             {
                                 decline+=" Bad CRC\n";
                                 //allgood=false;
-                            }
+                             }
 
                             /*if(crc_calc==crc_rec)qDebug()<<k<<((QString)"").sprintf("rec = %02X", crc_rec)<<((QString)"").sprintf("calc = %02X", crc_calc)<<"OK"<<unencoded_BER_estimate*100.0;
                          else
@@ -1803,7 +1801,7 @@ QByteArray &AeroL::Decode(QVector<short> &bits, bool soft)//0 bit --> oldest bit
                 }
 
 
-            }
+             }
 
 
         }
@@ -1852,8 +1850,8 @@ QByteArray &AeroL::Decode(QVector<short> &bits, bool soft)//0 bit --> oldest bit
 
     }
 
-    if((!datacd)&&(!burstmode)){
-
+    if((!datacd)&&(!burstmode))
+    {
         decodedbytes.clear();
     }
 
@@ -1896,8 +1894,8 @@ qint64 AeroL::writeData(const char *data, qint64 len)
     return len;
 }
 
-void AeroL::processDemodulatedSoftBits(const QVector<short> &soft_bits){
-
+void AeroL::processDemodulatedSoftBits(const QVector<short> &soft_bits)
+{
 
     sbits.clear();
 
diff --git a/JAERO/aerol.h b/JAERO/aerol.h
index 93cbffb..439bcdd 100644
--- a/JAERO/aerol.h
+++ b/JAERO/aerol.h
@@ -542,8 +542,8 @@ class RTChannelDeleaveFECScram
 
     ReturnResult updateMSK(int bit)
     {
-        if(blockptr>=block.size()){
-
+        if(blockptr>=block.size())
+        {
             return FULL;
         }
         block[blockptr]=bit;
@@ -558,13 +558,11 @@ class RTChannelDeleaveFECScram
         bool cont = false;
 
 
-        if((((blockptr-(64*5))%(64*3))==0) && (blockptr /64 == 5 || blockptr /64 == targetBlocks || blockptr/64 == 8 || blockptr/64 == 50)){
-
+        if((((blockptr-(64*5))%(64*3))==0) && (blockptr /64 == 5 || blockptr /64 == targetBlocks || blockptr/64 == 8 || blockptr/64 == 50))
+        {
             cont = true;
-
         }
 
-
         //test if interleaver length works
         if(cont)//true for R and T packets
         {
diff --git a/JAERO/burstmskdemodulator.cpp b/JAERO/burstmskdemodulator.cpp
index c3ecd26..bf69554 100644
--- a/JAERO/burstmskdemodulator.cpp
+++ b/JAERO/burstmskdemodulator.cpp
@@ -445,7 +445,8 @@ qint64 BurstMskDemodulator::writeData(const char *data, qint64 len)
             int size_base = 126;
             int size_top = 74;
 
-            if(fb < 1200){
+            if(fb < 1200)
+            {
 
                 size_base = 150;
                 size_top = 74;
@@ -494,7 +495,8 @@ qint64 BurstMskDemodulator::writeData(const char *data, qint64 len)
             int maxtopposhigh =0;
             double maxtophigh =0;
 
-            for(int i=0; i < out_top.size(); i++){
+            for(int i=0; i < out_top.size(); i++)
+            {
 
                 if(i > 50)
                 {
@@ -756,7 +758,7 @@ qint64 BurstMskDemodulator::writeData(const char *data, qint64 len)
 
                 double imagin = diffdecode.UpdateSoft(pt_msk.imag());
 
-                int ibit=qRound((imagin)*127.0+128.0);
+                int ibit=qRound(0.75*(imagin)*127.0+128.0);
                 if(ibit>255)ibit=255;
                 if(ibit<0)ibit=0;
 
@@ -766,7 +768,7 @@ qint64 BurstMskDemodulator::writeData(const char *data, qint64 len)
 
                 real =- real;
 
-                ibit=qRound((real)*127.0+128.0);
+                ibit=qRound(0.75*(real)*127.0+128.0);
 
                 if(ibit>255)ibit=255;
                 if(ibit<0)ibit=0;
diff --git a/JAERO/burstmskdemodulator.h b/JAERO/burstmskdemodulator.h
index 697b08e..8653b53 100644
--- a/JAERO/burstmskdemodulator.h
+++ b/JAERO/burstmskdemodulator.h
@@ -41,7 +41,7 @@ class BurstMskDemodulator : public QIODevice
             fb=125;//bps
             Fs=8000;//Hz
             symbolspercycle=16;
-            signalthreshold=0.5;
+            signalthreshold=0.6;
         }
     };
     explicit BurstMskDemodulator(QObject *parent);
diff --git a/JAERO/burstoqpskdemodulator.cpp b/JAERO/burstoqpskdemodulator.cpp
index 7f5e6fb..911cedf 100644
--- a/JAERO/burstoqpskdemodulator.cpp
+++ b/JAERO/burstoqpskdemodulator.cpp
@@ -694,13 +694,13 @@ void BurstOqpskDemodulator::writeDataSlot(const char *data, qint64 len)
 
 
 
-                    int ibit=qRound(pt_qpsk.imag()*127.0+128.0);
+                    int ibit=qRound(0.75*pt_qpsk.imag()*127.0+128.0);
                     if(ibit>255)ibit=255;
                     if(ibit<0)ibit=0;
 
                     RxDataBits.push_back((uchar)ibit);
 
-                    ibit=qRound(pt_qpsk.real()*127.0+128.0);
+                    ibit=qRound(0.75*pt_qpsk.real()*127.0+128.0);
                     if(ibit>255)ibit=255;
                     if(ibit<0)ibit=0;
 
diff --git a/JAERO/burstoqpskdemodulator.h b/JAERO/burstoqpskdemodulator.h
index 59947c9..8a4ac29 100644
--- a/JAERO/burstoqpskdemodulator.h
+++ b/JAERO/burstoqpskdemodulator.h
@@ -37,7 +37,7 @@ class BurstOqpskDemodulator : public QIODevice
             lockingbw=10500;//Hz
             fb=10500;//bps
             Fs=48000;//Hz
-            signalthreshold=0.5;
+            signalthreshold=0.6;
             channel_stereo=false;
         }
     };
diff --git a/JAERO/jconvolutionalcodec.cpp b/JAERO/jconvolutionalcodec.cpp
index bfb1fe5..1909cef 100644
--- a/JAERO/jconvolutionalcodec.cpp
+++ b/JAERO/jconvolutionalcodec.cpp
@@ -135,13 +135,15 @@ QByteArray& JConvolutionalCodec::Hard_To_Soft_Convert(QByteArray& hard_bits_in)
 
     for(int i=0;i<hard_bits_in.size();i++)
     {
-        if(((uchar)(hard_bits_in.at(i)))==0){
+        if(((uchar)(hard_bits_in.at(i)))==0)
+        {
             hard_bits_in[i] = uchar(0);
         }
-        else{
+         else
+         {
             hard_bits_in[i] = uchar(255);
 
-        }
+         }
     }
     return hard_bits_in;
 }
diff --git a/JAERO/mskdemodulator.cpp b/JAERO/mskdemodulator.cpp
index ce144aa..cad6666 100644
--- a/JAERO/mskdemodulator.cpp
+++ b/JAERO/mskdemodulator.cpp
@@ -486,7 +486,7 @@ qint64 MskDemodulator::writeData(const char *data, qint64 len)
 
                 double imagin = diffdecode.UpdateSoft(thisonpt.imag());
 
-                int ibit=qRound((imagin)*127.0+128.0);
+                int ibit=qRound(0.75*(imagin)*127.0+128.0);
                 if(ibit>255)ibit=255;
                 if(ibit<0)ibit=0;
 
@@ -496,7 +496,7 @@ qint64 MskDemodulator::writeData(const char *data, qint64 len)
 
                 real =- real;
 
-                ibit=qRound((real)*127.0+128.0);
+                ibit=qRound(0.75*(real)*127.0+128.0);
 
                 if(ibit>255)ibit=255;
                 if(ibit<0)ibit=0;
diff --git a/JAERO/mskdemodulator.h b/JAERO/mskdemodulator.h
index 8f76044..87d28e2 100644
--- a/JAERO/mskdemodulator.h
+++ b/JAERO/mskdemodulator.h
@@ -37,7 +37,7 @@ class MskDemodulator : public QIODevice
             fb=125;//bps
             Fs=8000;//Hz
             symbolspercycle=16;
-            signalthreshold=0.5;
+            signalthreshold=0.6;
         }
     };
     explicit MskDemodulator(QObject *parent);
diff --git a/JAERO/oqpskdemodulator.cpp b/JAERO/oqpskdemodulator.cpp
index ffd9d0d..cce0d79 100644
--- a/JAERO/oqpskdemodulator.cpp
+++ b/JAERO/oqpskdemodulator.cpp
@@ -409,13 +409,13 @@ qint64 OqpskDemodulator::writeData(const char *data, qint64 len)
                 }*/
 
                 // soft bits
-                int ibit=qRound(pt_qpsk.imag()*127.0+128.0);
+                int ibit=qRound(0.75*pt_qpsk.imag()*127.0+128.0);
                 if(ibit>255)ibit=255;
                 if(ibit<0)ibit=0;
 
                 RxDataBits.push_back((uchar)ibit);
 
-                ibit=qRound(pt_qpsk.real()*127.0+128.0);
+                ibit=qRound(0.75*pt_qpsk.real()*127.0+128.0);
                 if(ibit>255)ibit=255;
                 if(ibit<0)ibit=0;
 
diff --git a/JAERO/oqpskdemodulator.h b/JAERO/oqpskdemodulator.h
index a32a520..a812f0d 100644
--- a/JAERO/oqpskdemodulator.h
+++ b/JAERO/oqpskdemodulator.h
@@ -32,7 +32,7 @@ class OqpskDemodulator : public QIODevice
             lockingbw=10500;//Hz
             fb=10500;//bps
             Fs=48000;//Hz
-            signalthreshold=0.5;
+            signalthreshold=0.6;
         }
     };
     explicit OqpskDemodulator(QObject *parent);
diff --git a/libcorrect/.appveyor-install-tools.cmd b/libcorrect/.appveyor-install-tools.cmd
new file mode 100644
index 0000000..6a3a3de
--- /dev/null
+++ b/libcorrect/.appveyor-install-tools.cmd
@@ -0,0 +1,47 @@
+@echo on
+
+if NOT EXIST C:\projects\tools (
+  mkdir C:\projects\tools
+)
+cd C:\projects\tools
+
+::###########################################################################
+:: Setup Compiler
+::###########################################################################
+if NOT EXIST llvm-installer.exe (
+    appveyor DownloadFile http://prereleases.llvm.org/win-snapshots/LLVM-5.0.0-r306282-win32.exe -FileName llvm-installer.exe
+)
+
+START /WAIT llvm-installer.exe /S /D=C:\"projects\tools\LLVM-install"
+@set PATH="C:\projects\tools\LLVM-install\bin";%PATH%
+clang-cl -v
+
+if DEFINED MINGW_PATH rename "C:\Program Files\Git\usr\bin\sh.exe" "sh-ignored.exe"
+if DEFINED MINGW_PATH @set "PATH=%PATH:C:\Program Files (x86)\Git\bin=%"
+if DEFINED MINGW_PATH @set "PATH=%PATH%;%MINGW_PATH%"
+if DEFINED MINGW_PATH g++ -v
+
+::###########################################################################
+:: Install a recent CMake
+::###########################################################################
+if NOT EXIST cmake (
+  appveyor DownloadFile https://cmake.org/files/v3.7/cmake-3.7.2-win64-x64.zip -FileName cmake.zip
+  7z x cmake.zip -oC:\projects\tools > nul
+  move C:\projects\tools\cmake-* C:\projects\tools\cmake
+  rm cmake.zip
+)
+@set PATH=C:\projects\tools\cmake\bin;%PATH%
+cmake --version
+
+::###########################################################################
+:: Install Ninja
+::###########################################################################
+if NOT EXIST ninja (
+  appveyor DownloadFile https://github.com/ninja-build/ninja/releases/download/v1.6.0/ninja-win.zip -FileName ninja.zip
+  7z x ninja.zip -oC:\projects\tools\ninja > nul
+  rm ninja.zip
+)
+@set PATH=C:\projects\tools\ninja;%PATH%
+ninja --version
+
+@echo off
diff --git a/libcorrect/.travis.yml b/libcorrect/.travis.yml
new file mode 100644
index 0000000..a536008
--- /dev/null
+++ b/libcorrect/.travis.yml
@@ -0,0 +1,12 @@
+language: c
+matrix:
+    include:
+        - os: linux
+          dist: trusty
+        - os: osx
+script:
+    - mkdir build
+    - cd build
+    - cmake ..
+    - make shim
+    - make check CTEST_OUTPUT_ON_FAILURE=TRUE
diff --git a/libcorrect/CMakeLists.txt b/libcorrect/CMakeLists.txt
new file mode 100644
index 0000000..fe8bbb7
--- /dev/null
+++ b/libcorrect/CMakeLists.txt
@@ -0,0 +1,84 @@
+cmake_minimum_required(VERSION 2.8)
+project(Correct)
+include(CheckLibraryExists)
+include(CheckIncludeFiles)
+include(CheckCXXSourceCompiles)
+
+if(MSVC)
+set(LIBM "")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W4")
+else(MSVC)
+set(LIBM "m")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -std=c99 -Wpedantic -Wall")
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g3 -O0 -march=native -fsanitize=address")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-no_pie,")
+else()
+  if("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2")
+    else()
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2")
+  endif()
+  if(CMAKE_BUILD_TYPE STREQUAL "Profiling")
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -g3")
+  endif()
+endif()
+endif(MSVC)
+
+find_library(FEC fec)
+CHECK_LIBRARY_EXISTS(FEC dotprod "" HAVE_LIBFEC)
+check_cxx_source_compiles("
+    #include <x86intrin.h>
+    int main() {
+      __m128i vec;
+      return 0;
+    }" HAVE_SSE)
+
+if(HAVE_SSE)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.2")
+endif()
+
+set(CMAKE_CXX_VISIBILITY_PRESET hidden)
+set(CMAKE_VISIBILITY_INLINES_HIDDEN 1)
+set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
+set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
+
+include_directories(${CMAKE_SOURCE_DIR}/include)
+add_subdirectory(src)
+
+set(INSTALL_HEADERS "${CMAKE_BINARY_DIR}/include/correct.h")
+
+add_custom_target(correct-h ALL COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/include/correct.h ${CMAKE_BINARY_DIR}/include/correct.h)
+
+if(HAVE_SSE)
+  set(correct_obj_files $<TARGET_OBJECTS:correct-reed-solomon> $<TARGET_OBJECTS:correct-convolutional> $<TARGET_OBJECTS:correct-convolutional-sse>)
+  set(INSTALL_HEADERS ${INSTALL_HEADERS} ${CMAKE_BINARY_DIR}/include/correct-sse.h)
+  add_custom_target(correct-sse-h ALL COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/include/correct-sse.h ${CMAKE_BINARY_DIR}/include/correct-sse.h)
+  add_definitions(-DHAVE_SSE=1)
+else()
+  set(correct_obj_files $<TARGET_OBJECTS:correct-reed-solomon> $<TARGET_OBJECTS:correct-convolutional>)
+endif()
+add_library(correct SHARED ${correct_obj_files})
+add_library(correct_static ${correct_obj_files})
+set_target_properties(correct_static PROPERTIES OUTPUT_NAME "correct")
+
+add_subdirectory(util)
+add_subdirectory(tests)
+add_subdirectory(tools)
+# add_subdirectory(benchmarks)
+
+install(TARGETS correct correct_static
+        DESTINATION lib)
+install(FILES ${INSTALL_HEADERS} DESTINATION "${CMAKE_INSTALL_PREFIX}/include")
+
+add_library(fec_shim_static EXCLUDE_FROM_ALL src/fec_shim.c ${correct_obj_files})
+set_target_properties(fec_shim_static PROPERTIES OUTPUT_NAME "fec")
+add_library(fec_shim_shared SHARED EXCLUDE_FROM_ALL src/fec_shim.c ${correct_obj_files})
+set_target_properties(fec_shim_shared PROPERTIES OUTPUT_NAME "fec")
+add_custom_target(fec-shim-h COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/include/fec_shim.h ${CMAKE_BINARY_DIR}/include/fec.h)
+add_custom_target(shim DEPENDS fec_shim_static fec_shim_shared fec-shim-h)
+
+install(TARGETS fec_shim_static fec_shim_shared
+        DESTINATION lib
+        OPTIONAL)
+install(FILES ${CMAKE_BINARY_DIR}/include/fec.h DESTINATION "${CMAKE_INSTALL_PREFIX}/include" OPTIONAL)
diff --git a/libcorrect/LICENSE b/libcorrect/LICENSE
new file mode 100644
index 0000000..8bb8fad
--- /dev/null
+++ b/libcorrect/LICENSE
@@ -0,0 +1,12 @@
+Copyright (c) 2016, Brian Armstrong
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/libcorrect/README.md b/libcorrect/README.md
new file mode 100644
index 0000000..c0ebbe8
--- /dev/null
+++ b/libcorrect/README.md
@@ -0,0 +1,38 @@
+[libcorrect](https://github.com/quiet/libcorrect)
+===========
+[![OSX/Linux Build Status](https://travis-ci.org/quiet/libcorrect.svg?branch=master)](https://travis-ci.org/quiet/libcorrect)
+[![Windows Build status](https://ci.appveyor.com/api/projects/status/i3e84jmj00fa5my8/branch/master?svg=true)](https://ci.appveyor.com/project/brian-armstrong/libcorrect/branch/master)
+
+libcorrect is a library for Forward Error Correction. By using libcorrect, you can encode extra redundancy into a packet of data and then send it across a lossy channel. When the packet is received, it can be decoded to recover the original, pre-encoded data.
+
+libcorrect accomplishes this task with two algorithms, [Convolutional codes](https://en.wikipedia.org/wiki/Convolutional_code) and [Reed-Solomon](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction). Convolutional codes are robust to a constant background noise, while Reed-Solomon error correction is effective at dealing with noise that occurs in bursts. These algorithms have played an important role in [telecommunications](https://en.wikipedia.org/wiki/Error_detection_and_correction#Deep-space_telecommunications). libcorrect uses a [Viterbi algorithm](https://en.wikipedia.org/wiki/Viterbi_algorithm) decoder to decode convolutional codes.
+
+libcorrect is a performant, BSD-licensed library. It is also the author's hope that this library's contents could help others learn how its algorithms work.
+
+Design goals
+-----------
+
+1. libcorrect should be a drop-in, BSD-licensed substitute for [libfec](http://www.ka9q.net/code/fec/), which offers similar functionality under the LGPL license. Although libfec is a fantastic library, the state of LGPL-licensed libraries on mobile devices is somewhat uncertain. For this reason, libcorrect is a completely new approach under the BSD license which supports the same algorithms as libfec. Additionally, libcorrect can be built with a compatibility layer so that libcorrect can be linked in place of libfec.
+
+    Achieving this goal gives [libquiet](https://github.com/quiet/quiet) a fully BSD-/MIT-licensed set of dependencies, which gives libquiet more flexibility in mobile applications.
+
+2. libcorrect should make it easier to investigate how forward error correction works. To accomplish this, libcorrect provides tools to test the fitness of convolutional codes and their polynomials. Additionally, libcorrect should be written in a way that leads to easy understanding of these powerful algorithms. This library's roadmap includes more documentation on how these algorithms work and how to increase their computational performance.
+
+3. libcorrect should explore further into error correction. This goal would help libquiet operate in noisier situations. One approach might be to use parts of libcorrect's Viterbi Algorithm decoder to create a [Turbo code](https://en.wikipedia.org/wiki/Turbo_code) decoder, although this is just an idea and may turn out to be prohibitively difficult.
+
+Build
+-----------
+libcorrect uses CMake, which allows for out-of-source builds. To get started, make sure that you have CMake installed, and then, from libcorrect's source directory, run `mkdir build && cd build && cmake .. && make && make install`. Additionally, if you would like the libfec compatibility layer, you can run `make shim && make install`, though do be cautioned that this can overwrite an existing installation of libfec.
+
+If you are on a host which has `<x86intrin.h>` available, then libcorrect will automatically build its SSE version as well. The SSE headers are provided under `<correct-sse.h>`. For now, it is the caller's responsibility to ensure that SSE is available and can be used. libcorrect requires SSE functions up to and including SSE4.
+
+If you have any questions or problems with libcorrect, do not hesitate to open an issue.
+
+-----------
+I'd like to thank Ryan Hitchman and Josh Gao for all of their help and rubber ducking.
+
+A huge thank you goes to [Lucas Teske](https://github.com/racerxdl) for finding all the ways that libcorrect was broken on Windows and to [Denis Golovan](https://github.com/MageSlayer) for finding an error in the returned length of the convolutional code decoder.
+
+
+
+
diff --git a/libcorrect/appveyor.yml b/libcorrect/appveyor.yml
new file mode 100644
index 0000000..e741a3a
--- /dev/null
+++ b/libcorrect/appveyor.yml
@@ -0,0 +1,51 @@
+version: '{build}'
+
+build:
+    verbosity: detailed
+
+branches:
+    only:
+        - master
+
+environment:
+    matrix:
+        - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
+          COMPILER: cl.exe
+          MSVC_BAT: C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat
+          MSVC_BAT_ARCH: x86
+          GENERATOR: "Visual Studio 14 2015 Win64"
+          APPVEYOR_SAVE_CACHE_ON_ERROR: true
+          DLL_PATH: lib\Release\fec.dll
+        - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
+          COMPILER: "C:/projects/tools/LLVM-install/bin/clang-cl.exe"
+          MSVC_BAT: C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat
+          MSVC_BAT_ARCH: x86
+          GENERATOR: "Ninja"
+          APPVEYOR_SAVE_CACHE_ON_ERROR: true
+          DLL_PATH: lib\fec.dll
+
+
+install:
+    - call "%APPVEYOR_BUILD_FOLDER%\\.appveyor-install-tools.cmd"
+
+before_build:
+    - if DEFINED MSVC_BAT call "%MSVC_BAT%" %MSVC_BAT_ARCH%
+    - cd %APPVEYOR_BUILD_FOLDER%
+
+build_script:
+    - mkdir build
+    - cd build
+    - cmake -G "%GENERATOR%" -DCMAKE_C_COMPILER=%COMPILER% -DCMAKE_CXX_COMPILER=%COMPILER% -DCMAKE_BUILD_TYPE=Release ..
+    - cmake --build . --config Release --target shim
+    - cmake --build . --config Release --target test_runners
+    - dumpbin /EXPORTS %DLL_PATH%
+
+test_script:
+    - cd tests
+    - set CTEST_OUTPUT_ON_FAILURE=1
+    - ctest -C Release
+
+cache:
+    - C:\projects\tools\ninja
+    - C:\projects\tools\cmake
+    - C:\projects\tools\llvm-installer.exe
diff --git a/libcorrect/bin/32/libcorrect.a b/libcorrect/bin/32/libcorrect.a
new file mode 100644
index 0000000..b8fde09
Binary files /dev/null and b/libcorrect/bin/32/libcorrect.a differ
diff --git a/libcorrect/bin/32/libcorrect.dll b/libcorrect/bin/32/libcorrect.dll
index 42de8ed..db53d54 100644
Binary files a/libcorrect/bin/32/libcorrect.dll and b/libcorrect/bin/32/libcorrect.dll differ
diff --git a/libcorrect/bin/32/libcorrect.dll.a b/libcorrect/bin/32/libcorrect.dll.a
new file mode 100644
index 0000000..cfffabf
Binary files /dev/null and b/libcorrect/bin/32/libcorrect.dll.a differ
diff --git a/libcorrect/bin/64/libcorrect.a b/libcorrect/bin/64/libcorrect.a
new file mode 100644
index 0000000..f128820
Binary files /dev/null and b/libcorrect/bin/64/libcorrect.a differ
diff --git a/libcorrect/bin/64/libcorrect.dll b/libcorrect/bin/64/libcorrect.dll
index 8c52c55..bd799d5 100644
Binary files a/libcorrect/bin/64/libcorrect.dll and b/libcorrect/bin/64/libcorrect.dll differ
diff --git a/libcorrect/bin/64/libcorrect.dll.a b/libcorrect/bin/64/libcorrect.dll.a
new file mode 100644
index 0000000..ad29c30
Binary files /dev/null and b/libcorrect/bin/64/libcorrect.dll.a differ
diff --git a/libcorrect/include/correct-sse.h b/libcorrect/include/correct-sse.h
index 8052219..9372f19 100644
--- a/libcorrect/include/correct-sse.h
+++ b/libcorrect/include/correct-sse.h
@@ -20,11 +20,11 @@ size_t correct_convolutional_sse_encode_len(correct_convolutional_sse *conv, siz
 size_t correct_convolutional_sse_encode(correct_convolutional_sse *conv, const uint8_t *msg,
                                         size_t msg_len, uint8_t *encoded);
 
-size_t correct_convolutional_sse_decode(correct_convolutional_sse *conv, const uint8_t *encoded,
-                                        size_t num_encoded_bits, uint8_t *msg);
+ssize_t correct_convolutional_sse_decode(correct_convolutional_sse *conv, const uint8_t *encoded,
+                                         size_t num_encoded_bits, uint8_t *msg);
 
-size_t correct_convolutional_sse_decode_soft(correct_convolutional_sse *conv,
-                                             const correct_convolutional_soft_t *encoded,
-                                             size_t num_encoded_bits, uint8_t *msg);
+ssize_t correct_convolutional_sse_decode_soft(correct_convolutional_sse *conv,
+                                              const correct_convolutional_soft_t *encoded,
+                                              size_t num_encoded_bits, uint8_t *msg);
 
 #endif
diff --git a/libcorrect/include/correct.h b/libcorrect/include/correct.h
index 01f2653..12d817d 100644
--- a/libcorrect/include/correct.h
+++ b/libcorrect/include/correct.h
@@ -1,15 +1,16 @@
 #ifndef CORRECT_H
 #define CORRECT_H
 #include <stdint.h>
+
 #ifndef _MSC_VER
 #include <unistd.h>
-#ifdef __MINGW32__
-#define ssize_t int
-#endif
 #else
-#define ssize_t int
+#include <stddef.h>
+typedef ptrdiff_t ssize_t;
 #endif
 
+
+
 // Convolutional Codes
 
 // Convolutional polynomials are 16 bits wide
@@ -92,10 +93,11 @@ size_t correct_convolutional_encode(correct_convolutional *conv, const uint8_t *
  * value should then be converted to bytes to find the correct
  * length for msg.
  *
- * This function returns the number of bytes written to msg.
+ * This function returns the number of bytes written to msg. If
+ * it fails, it returns -1.
  */
-size_t correct_convolutional_decode(correct_convolutional *conv, const uint8_t *encoded,
-                                    size_t num_encoded_bits, uint8_t *msg);
+ssize_t correct_convolutional_decode(correct_convolutional *conv, const uint8_t *encoded,
+                                     size_t num_encoded_bits, uint8_t *msg);
 
 /* correct_convolutional_decode_soft uses the given conv instance
  * to decode a block encoded by correct_convolutional_encode and
@@ -115,11 +117,12 @@ size_t correct_convolutional_decode(correct_convolutional *conv, const uint8_t *
  * value should then be converted to bytes to find the correct
  * length for msg.
  *
- * This function returns the number of bytes written to msg.
+ * This function returns the number of bytes written to msg. If
+ * it fails, it returns -1.
  */
-size_t correct_convolutional_decode_soft(correct_convolutional *conv,
-                                         const correct_convolutional_soft_t *encoded,
-                                         size_t num_encoded_bits, uint8_t *msg);
+ssize_t correct_convolutional_decode_soft(correct_convolutional *conv,
+                                          const correct_convolutional_soft_t *encoded,
+                                          size_t num_encoded_bits, uint8_t *msg);
 
 // Reed-Solomon
 
diff --git a/libcorrect/include/correct/convolutional.h b/libcorrect/include/correct/convolutional.h
index 520319a..06b1710 100644
--- a/libcorrect/include/correct/convolutional.h
+++ b/libcorrect/include/correct/convolutional.h
@@ -10,6 +10,7 @@
 #include <assert.h>
 
 #include "correct.h"
+#include "correct/portable.h"
 
 typedef unsigned int shift_register_t;
 typedef uint16_t polynomial_t;
diff --git a/libcorrect/include/correct/convolutional/bit.h b/libcorrect/include/correct/convolutional/bit.h
index a01e1c6..6483898 100644
--- a/libcorrect/include/correct/convolutional/bit.h
+++ b/libcorrect/include/correct/convolutional/bit.h
@@ -24,6 +24,8 @@ void bit_writer_write_bitlist_reversed(bit_writer_t *w, uint8_t *l, size_t len);
 
 void bit_writer_flush_byte(bit_writer_t *w);
 
+size_t bit_writer_length(bit_writer_t *w);
+
 typedef struct {
     uint8_t current_byte;
     size_t byte_index;
diff --git a/libcorrect/include/correct/convolutional/metric.h b/libcorrect/include/correct/convolutional/metric.h
index 91bac45..ba64226 100644
--- a/libcorrect/include/correct/convolutional/metric.h
+++ b/libcorrect/include/correct/convolutional/metric.h
@@ -3,7 +3,7 @@
 // measure the hamming distance of two bit strings
 // implemented as population count of x XOR y
 inline distance_t metric_distance(unsigned int x, unsigned int y) {
-    return __builtin_popcount(x ^ y);
+    return popcount(x ^ y);
 }
 
 inline distance_t metric_soft_distance_linear(unsigned int hard_x, const uint8_t *soft_y, size_t len) {
diff --git a/libcorrect/include/correct/portable.h b/libcorrect/include/correct/portable.h
new file mode 100644
index 0000000..b5d4407
--- /dev/null
+++ b/libcorrect/include/correct/portable.h
@@ -0,0 +1,29 @@
+#ifndef CORRECT_PORTABLE_H
+#define CORRECT_PORTABLE_H
+
+/* Portable stand-ins for the compiler builtins libcorrect relies on.
+ * Define DONT_USE_BUILTINS to force the plain C fallbacks even under GCC-compatible compilers. */
+#ifdef __GNUC__
+#define HAVE_BUILTINS
+#endif
+
+#if defined(HAVE_BUILTINS) && !defined(DONT_USE_BUILTINS)
+#define popcount __builtin_popcount
+#define prefetch __builtin_prefetch
+#else
+
+/* Number of set bits in x (masks assume a 32-bit unsigned int). The parameter is
+ * unsigned, matching __builtin_popcount, so the shifts cannot sign-extend and the
+ * final multiply wraps instead of overflowing a signed int. */
+static inline int popcount(unsigned int x) {
+    /* taken from the helpful http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel */
+    x = x - ((x >> 1) & 0x55555555u);
+    x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);
+    return (int)((((x + (x >> 4)) & 0x0f0f0f0fu) * 0x01010101u) >> 24);
+}
+/* No-op replacement for __builtin_prefetch. */
+static inline void prefetch(const void *x) { (void)x; }
+
+#endif
+
+#endif /* CORRECT_PORTABLE_H */
diff --git a/libcorrect/include/correct/reed-solomon.h b/libcorrect/include/correct/reed-solomon.h
index 3aa56ec..94d3efb 100644
--- a/libcorrect/include/correct/reed-solomon.h
+++ b/libcorrect/include/correct/reed-solomon.h
@@ -6,14 +6,9 @@
 #include <stdbool.h>
 #include <time.h>
 #include <stdint.h>
-#ifndef _MSC_VER
-#include <unistd.h>
-#ifdef __MINGW32__
-#define ssize_t int
-#endif
-#else
-	#define ssize_t int
-#endif
+
+#include "correct.h"
+#include "correct/portable.h"
 
 // an element in GF(2^8)
 typedef uint8_t field_element_t;
@@ -36,7 +31,7 @@ typedef struct {
     unsigned int order;
 } polynomial_t;
 
-typedef struct {
+struct correct_reed_solomon {
     size_t block_length;
     size_t message_length;
     size_t min_distance;
@@ -77,5 +72,5 @@ typedef struct {
     polynomial_t init_from_roots_scratch[2];
     bool has_init_decode;
 
-} correct_reed_solomon;
+};
 #endif
diff --git a/libcorrect/include/correct/util/error-sim-fec.h b/libcorrect/include/correct/util/error-sim-fec.h
new file mode 100644
index 0000000..b3deeb1
--- /dev/null
+++ b/libcorrect/include/correct/util/error-sim-fec.h
@@ -0,0 +1,8 @@
+#include "correct/util/error-sim.h"
+
+#include <fec.h>
+
+void conv_fec27_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+void conv_fec29_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+void conv_fec39_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+void conv_fec615_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
diff --git a/libcorrect/include/correct/util/error-sim-shim.h b/libcorrect/include/correct/util/error-sim-shim.h
new file mode 100644
index 0000000..aed1fc1
--- /dev/null
+++ b/libcorrect/include/correct/util/error-sim-shim.h
@@ -0,0 +1,7 @@
+#include "correct/util/error-sim.h"
+#include "fec_shim.h"
+
+ssize_t conv_shim27_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+ssize_t conv_shim29_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+ssize_t conv_shim39_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+ssize_t conv_shim615_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
diff --git a/libcorrect/include/correct/util/error-sim-sse.h b/libcorrect/include/correct/util/error-sim-sse.h
new file mode 100644
index 0000000..2447854
--- /dev/null
+++ b/libcorrect/include/correct/util/error-sim-sse.h
@@ -0,0 +1,7 @@
+#include "correct/util/error-sim.h"
+
+#include "correct-sse.h"
+
+size_t conv_correct_sse_enclen(void *conv_v, size_t msg_len);
+void conv_correct_sse_encode(void *conv_v, uint8_t *msg, size_t msg_len, uint8_t *encoded);
+ssize_t conv_correct_sse_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
diff --git a/libcorrect/include/correct/util/error-sim.h b/libcorrect/include/correct/util/error-sim.h
new file mode 100644
index 0000000..e75c5a1
--- /dev/null
+++ b/libcorrect/include/correct/util/error-sim.h
@@ -0,0 +1,47 @@
+#include <stdbool.h>
+#include <math.h>
+#include <string.h>
+#include <stdlib.h>
+#include <float.h>
+#include <stdio.h>
+
+#include "correct.h"
+#include "correct/portable.h"
+
+size_t distance(uint8_t *a, uint8_t *b, size_t len);
+void gaussian(double *res, size_t n_res, double sigma);
+
+void encode_bpsk(uint8_t *msg, double *voltages, size_t n_syms, double bpsk_voltage);
+void byte2bit(uint8_t *bytes, uint8_t *bits, size_t n_bits);
+void decode_bpsk(uint8_t *soft, uint8_t *msg, size_t n_syms);
+void decode_bpsk_soft(double *voltages, uint8_t *soft, size_t n_syms, double bpsk_voltage);
+double log2amp(double l);
+double amp2log(double a);
+double sigma_for_eb_n0(double eb_n0, double bpsk_bit_energy);
+void build_white_noise(double *noise, size_t n_syms, double eb_n0, double bpsk_bit_energy);
+void add_white_noise(double *signal, double *noise, size_t n_syms);
+
+typedef struct {  // per-run scratch state handed to resize_conv_testbench / test_conv_noise
+    uint8_t *msg_out;  // presumably the decoder's output buffer -- confirm in error-sim.c
+    size_t msg_len;
+    uint8_t *encoded;
+    double *v;  // NOTE(review): looks like the BPSK voltages for the encoded bits -- confirm
+    double *corrupted;
+    uint8_t *soft;
+    double *noise;
+    size_t enclen;  // NOTE(review): enclen vs enclen_bytes looks like bits vs bytes -- confirm
+    size_t enclen_bytes;
+    void (*encode)(void *, uint8_t *msg, size_t msg_len, uint8_t *encoded);  // presumably invoked with `encoder` as first argument
+    void *encoder;
+    ssize_t (*decode)(void *, uint8_t *soft, size_t soft_len, uint8_t *msg);  // presumably invoked with `decoder` as first argument
+    void *decoder;
+} conv_testbench;
+
+conv_testbench *resize_conv_testbench(conv_testbench *scratch, size_t (*enclen)(void *, size_t), void *enc, size_t msg_len);
+void free_scratch(conv_testbench *scratch);
+int test_conv_noise(conv_testbench *scratch, uint8_t *msg, size_t n_bytes,
+                    double bpsk_voltage);
+
+size_t conv_correct_enclen(void *conv_v, size_t msg_len);
+void conv_correct_encode(void *conv_v, uint8_t *msg, size_t msg_len, uint8_t *encoded);
+ssize_t conv_correct_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
diff --git a/libcorrect/include/fec_shim.h b/libcorrect/include/fec_shim.h
index ecf6e11..b06e463 100644
--- a/libcorrect/include/fec_shim.h
+++ b/libcorrect/include/fec_shim.h
@@ -62,6 +62,13 @@ int chainback_viterbi615(void *vit, unsigned char *decoded, unsigned int n_decod
 void delete_viterbi615(void *vit);
 
 // Misc other
-static inline int parity(unsigned int x) { return __builtin_parity(x); }
+static inline int parity(unsigned int x) {
+    /* XOR-fold to a nibble, then index the 0x6996 parity table: http://graphics.stanford.edu/~seander/bithacks.html#ParityParallel */
+    unsigned int folded = x ^ (x >> 16);
+    folded ^= folded >> 8;
+    folded ^= folded >> 4;
+    const unsigned int nibble = folded & 0xfu;
+    return (int)((0x6996u >> nibble) & 1u);
+}
 
 #endif
diff --git a/libcorrect/src/CMakeLists.txt b/libcorrect/src/CMakeLists.txt
new file mode 100644
index 0000000..07eb7dc
--- /dev/null
+++ b/libcorrect/src/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(convolutional)
+add_subdirectory(reed-solomon)
diff --git a/libcorrect/src/convolutional/CMakeLists.txt b/libcorrect/src/convolutional/CMakeLists.txt
new file mode 100644
index 0000000..43c4bd9
--- /dev/null
+++ b/libcorrect/src/convolutional/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(SRCFILES bit.c metric.c history_buffer.c error_buffer.c lookup.c convolutional.c encode.c decode.c)
+add_library(correct-convolutional OBJECT ${SRCFILES})
+if(HAVE_SSE)
+    add_subdirectory(sse)
+endif()
diff --git a/libcorrect/src/convolutional/bit.c b/libcorrect/src/convolutional/bit.c
new file mode 100644
index 0000000..9ac4e1e
--- /dev/null
+++ b/libcorrect/src/convolutional/bit.c
@@ -0,0 +1,232 @@
+#include "correct/convolutional/bit.h"
+
+bit_writer_t *bit_writer_create(uint8_t *bytes, size_t len) {
+    bit_writer_t *w = calloc(1, sizeof(bit_writer_t));
+    // calloc may fail: hand NULL back to the caller instead of configuring a null writer
+    if (w && bytes) {
+        bit_writer_reconfigure(w, bytes, len);
+    }
+    // with bytes == NULL the writer stays zeroed until bit_writer_reconfigure is called
+    return w;
+}
+
+void bit_writer_reconfigure(bit_writer_t *w, uint8_t *bytes, size_t len) {
+    // rewind to byte 0 with an empty bit accumulator, then retarget the output buffer
+    w->byte_index = 0;
+    w->current_byte_len = 0;
+    w->current_byte = 0;
+    w->len = len;
+    w->bytes = bytes;
+}
+
+void bit_writer_destroy(bit_writer_t *w) {
+    free(w);  // releases only the writer struct; w->bytes is not freed here
+}
+
+void bit_writer_write(bit_writer_t *w, uint8_t val, unsigned int n) {
+    // emit the n low-order bits of val, least significant bit first
+    for (unsigned int remaining = n; remaining != 0; remaining--, val >>= 1) {
+        bit_writer_write_1(w, val);
+    }
+}
+
+void bit_writer_write_1(bit_writer_t *w, uint8_t val) {
+    // append the low bit of val to the accumulator
+    w->current_byte_len++;
+    w->current_byte |= val & 1;
+    if (w->current_byte_len != 8) {
+        // byte still incomplete -- open the next bit slot
+        w->current_byte <<= 1;
+        return;
+    }
+    // 8 bits in a byte -- store it and restart with an empty accumulator
+    w->bytes[w->byte_index++] = w->current_byte;
+    w->current_byte_len = 0;
+    w->current_byte = 0;
+}
+
+void bit_writer_write_bitlist(bit_writer_t *w, uint8_t *l, size_t len) {
+    // appends len bits, one per element of l (OR'd in unmasked, so each element must be 0 or 1)
+    // first close the current byte; we might have been given too few elements to do that
+    size_t close_len = 8 - w->current_byte_len;
+    close_len = (close_len < len) ? close_len : len;
+
+    uint16_t b = w->current_byte;
+
+    for (size_t i = 0; i < close_len; i++) {
+        b |= l[i];
+        b <<= 1;
+    }
+
+    // b now carries one shift too many; that is undone below if the byte is complete
+    l += close_len;
+    len -= close_len;
+
+    uint8_t *bytes = w->bytes;
+    size_t byte_index = w->byte_index;
+
+    if (w->current_byte_len + close_len == 8) {
+        b >>= 1;
+        bytes[byte_index] = b;
+        byte_index++;
+    } else {
+        w->current_byte = b;  // still a partial byte: keep it (pre-shifted) for the next write
+        w->current_byte_len += close_len;
+        return;
+    }
+
+    size_t full_bytes = len/8;
+
+    for (size_t i = 0; i < full_bytes; i++) {  // pack whole bytes, first element in the top bit
+        bytes[byte_index] = l[0] << 7 | l[1] << 6 | l[2] << 5 |
+                            l[3] << 4 | l[4] << 3 | l[5] << 2 |
+                            l[6] << 1 | l[7];
+        byte_index += 1;
+        l += 8;
+    }
+
+    len -= 8*full_bytes;
+
+    b = 0;
+    for (size_t i = 0; i < len; i++) {  // fewer than 8 bits remain; leave them in the accumulator
+        b |= l[i];
+        b <<= 1;
+    }
+
+    w->current_byte = b;
+    w->byte_index = byte_index;
+    w->current_byte_len = len;
+}
+
+void bit_writer_write_bitlist_reversed(bit_writer_t *w, uint8_t *l, size_t len) {
+    l = l + len - 1;  // start at the last element and walk backwards
+    // appends the len elements of l in reverse order, one bit per element (each must be 0 or 1)
+    uint8_t *bytes = w->bytes;
+    size_t byte_index = w->byte_index;
+    uint16_t b;
+
+    if (w->current_byte_len != 0) {  // finish the partially filled byte first
+        size_t close_len = 8 - w->current_byte_len;
+        close_len = (close_len < len) ? close_len : len;
+
+        b = w->current_byte;
+
+        for (size_t i = 0; i < close_len; i++) {
+            b |= *l;
+            b <<= 1;
+            l--;
+        }
+
+        len -= close_len;
+
+        if (w->current_byte_len + close_len == 8) {
+            b >>= 1;  // undo the extra shift left by the loop above
+            bytes[byte_index] = b;
+            byte_index++;
+        } else {
+            w->current_byte = b;  // still a partial byte: keep it (pre-shifted) for the next write
+            w->current_byte_len += close_len;
+            return;
+        }
+    }
+
+    size_t full_bytes = len/8;
+
+    for (size_t i = 0; i < full_bytes; i++) {  // pack whole bytes, current element in the top bit
+        bytes[byte_index] = l[0] << 7 | l[-1] << 6 | l[-2] << 5 |
+                            l[-3] << 4 | l[-4] << 3 | l[-5] << 2 |
+                            l[-6] << 1 | l[-7];
+        byte_index += 1;
+        l -= 8;
+    }
+
+    len -= 8*full_bytes;
+
+    b = 0;
+    for (size_t i = 0; i < len; i++) {  // fewer than 8 bits remain; leave them in the accumulator
+        b |= *l;
+        b <<= 1;
+        l--;
+    }
+
+    w->current_byte = (uint8_t)b;
+    w->byte_index = byte_index;
+    w->current_byte_len = len;
+}
+
+void bit_writer_flush_byte(bit_writer_t *w) {
+    if (w->current_byte_len != 0) {  // zero-pad a partial byte and emit it; no-op on a byte boundary
+        w->current_byte <<= (7 - w->current_byte_len);  // the accumulator is already shifted once after each write, so 7 - len (not 8 - len) puts the first bit at the MSB
+        w->bytes[w->byte_index] = w->current_byte;
+        w->byte_index++;
+        w->current_byte_len = 0;
+    }
+}
+
+size_t bit_writer_length(bit_writer_t *w) {
+    return w->byte_index;  // whole bytes stored so far; bits still in the accumulator are not counted
+}
+
+uint8_t reverse_byte(uint8_t b) {
+    uint8_t mirrored = 0;  // bit k of b ends up at bit (7 - k)
+    for (int k = 0; k < 8; k++, b >>= 1) mirrored = (uint8_t)((mirrored << 1) | (b & 1));
+    return mirrored;
+}
+
+static uint8_t reverse_table[256];
+
+void create_reverse_table() {
+    for (unsigned int value = 0; value <= 0xff; value++) {  // one entry per possible byte value
+        reverse_table[value] = reverse_byte((uint8_t)value);
+    }
+}
+
+bit_reader_t *bit_reader_create(const uint8_t *bytes, size_t len) {
+    bit_reader_t *r = calloc(1, sizeof(bit_reader_t));
+    // NOTE(review): the lazy one-time table init below is not synchronized
+    static bool reverse_table_created = false;
+
+    if (!reverse_table_created) {
+        create_reverse_table();
+        reverse_table_created = true;
+    }
+    // calloc may fail: hand NULL back to the caller instead of configuring a null reader
+    if (r && bytes) {
+        bit_reader_reconfigure(r, bytes, len);
+    }
+
+    return r;
+}
+
+void bit_reader_reconfigure(bit_reader_t *r, const uint8_t *bytes, size_t len) {
+    r->bytes = bytes;
+    r->len = len;
+    // position the reader on the first byte with all 8 of its bits unread
+    r->current_byte_len = 8;
+    r->current_byte = (bytes && len) ? bytes[0] : 0;  // never read from an empty or absent buffer
+    r->byte_index = 0;
+}
+
+void bit_reader_destroy(bit_reader_t *r) {
+    free(r);  // releases only the reader struct; r->bytes is not freed here
+}
+
+uint8_t bit_reader_read(bit_reader_t *r, unsigned int n) {  // reads the next n bits; n is presumably 1..8 (uint8_t mask, 256-entry table)
+    unsigned int read = 0;
+    unsigned int n_copy = n;  // remember the requested width for the final alignment
+
+    if (r->current_byte_len < n) {  // the request straddles a byte boundary
+        read = r->current_byte & ((1 << r->current_byte_len) - 1);  // take every unread (low) bit
+        r->byte_index++;
+        r->current_byte = r->bytes[r->byte_index];  // NOTE(review): no check against r->len here
+        n -= r->current_byte_len;
+        r->current_byte_len = 8;
+        read <<= n;  // make room for the n bits still to come from the new byte
+    }
+
+    uint8_t copy_mask = (1 << n) - 1;
+    copy_mask <<= (r->current_byte_len - n);  // select the n highest unread bits
+    read |= (r->current_byte & copy_mask) >> (r->current_byte_len - n);
+    r->current_byte_len -= n;
+    return reverse_table[read] >> (8 - n_copy);  // bit-reverse so the first bit read lands in bit 0
+}
diff --git a/libcorrect/src/convolutional/convolutional.c b/libcorrect/src/convolutional/convolutional.c
new file mode 100644
index 0000000..910ed15
--- /dev/null
+++ b/libcorrect/src/convolutional/convolutional.c
@@ -0,0 +1,59 @@
+#include "correct/convolutional/convolutional.h"
+
+// https://www.youtube.com/watch?v=b3_lVSrPB6w
+
+correct_convolutional *_correct_convolutional_init(correct_convolutional *conv,
+                                                   size_t rate, size_t order,
+                                                   const polynomial_t *poly) {
+    if (order >= 8 * sizeof(shift_register_t)) {  // >= so that 1 << order below always fits
+        // XXX turn this into an error code
+        // printf("order must be smaller than 8 * sizeof(shift_register_t)\n");
+        return NULL;
+    }
+    if (rate < 2) {
+        // XXX turn this into an error code
+        // printf("rate must be 2 or greater\n");
+        return NULL;
+    }
+
+    conv->order = order;
+    conv->rate = rate;
+    conv->numstates = 1u << order;  // unsigned shift: no signed overflow for large orders
+
+    unsigned int *table = malloc(sizeof(unsigned int) * ((size_t)1 << order));
+    if (!table) return NULL;  // out of memory: report failure like the checks above
+    fill_table(conv->rate, conv->order, poly, table);
+    *(unsigned int **)&conv->table = table;  // cast is needed to store into conv->table
+    conv->bit_writer = bit_writer_create(NULL, 0);
+    conv->bit_reader = bit_reader_create(NULL, 0);
+
+    conv->has_init_decode = false;  // decoder buffers are created later, on the first decode
+    return conv;
+}
+
+correct_convolutional *correct_convolutional_create(size_t rate, size_t order,
+                                                    const polynomial_t *poly) {
+    correct_convolutional *conv = malloc(sizeof(correct_convolutional));
+    // returns NULL when allocation fails or when init rejects rate/order
+    correct_convolutional *init_conv =
+        conv ? _correct_convolutional_init(conv, rate, order, poly) : NULL;
+    if (!init_conv) free(conv);  // free(NULL) is a no-op, so this covers both failure paths
+    return init_conv;
+}
+
+void _correct_convolutional_teardown(correct_convolutional *conv) {  // frees what conv owns, not conv itself
+    free(*(unsigned int **)&conv->table);  // same cast that init uses to store the table pointer
+    bit_writer_destroy(conv->bit_writer);
+    bit_reader_destroy(conv->bit_reader);
+    if (conv->has_init_decode) {  // decoder state exists only once _convolutional_decode_init has run
+        pair_lookup_destroy(conv->pair_lookup);
+        history_buffer_destroy(conv->history_buffer);
+        error_buffer_destroy(conv->errors);
+        free(conv->distances);
+    }
+}
+
+void correct_convolutional_destroy(correct_convolutional *conv) {
+    if (conv) _correct_convolutional_teardown(conv);  // tolerate NULL, e.g. the result of a failed create
+    free(conv);
+}
diff --git a/libcorrect/src/convolutional/decode.c b/libcorrect/src/convolutional/decode.c
new file mode 100644
index 0000000..e8891e0
--- /dev/null
+++ b/libcorrect/src/convolutional/decode.c
@@ -0,0 +1,321 @@
+#include "correct/convolutional/convolutional.h"
+
+void conv_decode_print_iter(correct_convolutional *conv, unsigned int iter,
+                            unsigned int winner_index) {  // debug dump of error metrics and history
+    if (iter < 2220) {  // hard-coded threshold: earlier iterations are not printed
+        return;
+    }
+    printf("iteration: %u\n", iter);
+    distance_t *errors = conv->errors->write_errors;
+    printf("errors:\n");
+    for (shift_register_t i = 0; i < conv->numstates / 2; i++) {
+        printf("%2u: %d\n", i, errors[i]);  // %u: shift_register_t is unsigned int
+    }
+    printf("\n");
+    printf("history:\n");
+    for (shift_register_t i = 0; i < conv->numstates / 2; i++) {
+        printf("%2u: ", i);
+        for (unsigned int j = 0; j <= winner_index; j++) {
+            printf("%d", conv->history_buffer->history[j][i] ? 1 : 0);
+        }
+        printf("\n");
+    }
+    printf("\n");
+}
+
+void convolutional_decode_warmup(correct_convolutional *conv, unsigned int sets,
+                                 const uint8_t *soft) {  // soft == NULL: hard input is read from conv->bit_reader
+    // first phase: load shiftregister up from 0 (order goes from 1 to conv->order)
+    // we are building up error metrics for the first order bits
+    for (unsigned int i = 0; i < conv->order - 1 && i < sets; i++) {
+        // peel off rate bits from encoded to recover the same `out` as in the encoding process
+        // the difference being that this `out` will have the channel noise/errors applied
+        unsigned int out;  // only assigned and used on the hard-decision path
+        if (!soft) {
+            out = bit_reader_read(conv->bit_reader, conv->rate);
+        }
+        const distance_t *read_errors = conv->errors->read_errors;
+        distance_t *write_errors = conv->errors->write_errors;
+        // walk all of the state we have so far
+        for (size_t j = 0; j < (1 << (i + 1)); j += 1) {  // states 0 .. 2^(i+1) - 1
+            unsigned int last = j >> 1;  // j with its newest (lowest) bit dropped
+            distance_t dist;
+            if (soft) {
+                if (conv->soft_measurement == CORRECT_SOFT_LINEAR) {
+                    dist = metric_soft_distance_linear(conv->table[j], soft + i * conv->rate,
+                                                       conv->rate);
+                } else {
+                    dist = metric_soft_distance_quadratic(conv->table[j], soft + i * conv->rate,
+                                                          conv->rate);
+                }
+            } else {
+                dist = metric_distance(conv->table[j], out);
+            }
+            write_errors[j] = dist + read_errors[last];  // accumulate on top of the predecessor's error
+        }
+        error_buffer_swap(conv->errors);
+    }
+}
+
+void convolutional_decode_inner(correct_convolutional *conv, unsigned int sets,
+                                const uint8_t *soft) {  // soft == NULL: hard input is read from conv->bit_reader
+    shift_register_t highbit = 1 << (conv->order - 1);
+    for (unsigned int i = conv->order - 1; i + conv->order - 1 < sets; i++) {  // same bound as i < sets - order + 1, but cannot wrap when sets is small
+        distance_t *distances = conv->distances;
+        // lasterrors are the aggregate bit errors for the states of shiftregister for the previous
+        // time slice
+        if (soft) {
+            if (conv->soft_measurement == CORRECT_SOFT_LINEAR) {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_linear(j, soft + i * conv->rate, conv->rate);
+                }
+            } else {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_quadratic(j, soft + i * conv->rate, conv->rate);
+                }
+            }
+        } else {
+            unsigned int out = bit_reader_read(conv->bit_reader, conv->rate);
+            for (unsigned int i = 0; i < 1 << (conv->rate); i++) {
+                distances[i] = metric_distance(i, out);
+            }
+        }
+        pair_lookup_t pair_lookup = conv->pair_lookup;
+        pair_lookup_fill_distance(pair_lookup, distances);
+
+        // num_iter is the total number of shift register states (2 * highbit)
+        unsigned int num_iter = highbit << 1;
+        const distance_t *read_errors = conv->errors->read_errors;
+        // aggregate bit errors for this time slice
+        distance_t *write_errors = conv->errors->write_errors;
+
+        uint8_t *history = history_buffer_get_slice(conv->history_buffer);
+        // walk through all states, ignoring oldest bit
+        // we will track a best register state (path) and the number of bit errors at that path at
+        // this time slice
+        // this loop considers two paths per iteration (high order bit set, clear)
+        // so, it only runs numstates/2 iterations
+        // we'll update the history for every state and find the path with the least aggregated bit
+        // errors
+
+        // now run the main loop
+        // we calculate 2 sets of 2 register states here (4 states per iter)
+        // this creates 2 sets which share a predecessor, and 2 sets which share a successor
+        //
+        // the first set definition is the two states that are the same except for the least order
+        // bit
+        // these two share a predecessor because their high n - 1 bits are the same (differ only by
+        // newest bit)
+        //
+        // the second set definition is the two states that are the same except for the high order
+        // bit
+        // these two share a successor because the oldest high order bit will be shifted out, and
+        // the other bits will be present in the successor
+        //
+        shift_register_t highbase = highbit >> 1;
+        for (shift_register_t low = 0, high = highbit, base = 0; high < num_iter;
+             low += 8, high += 8, base += 4) {
+            // shifted-right ancestors
+            // low and low_plus_one share low_past_error
+            //   note that they are the same when shifted right by 1
+            // same goes for high and high_plus_one
+            for (shift_register_t offset = 0, base_offset = 0; base_offset < 4;
+                 offset += 2, base_offset += 1) {
+                distance_pair_key_t low_key = pair_lookup.keys[base + base_offset];
+                distance_pair_key_t high_key = pair_lookup.keys[highbase + base + base_offset];
+                distance_pair_t low_concat_dist = pair_lookup.distances[low_key];
+                distance_pair_t high_concat_dist = pair_lookup.distances[high_key];
+
+                distance_t low_past_error = read_errors[base + base_offset];
+                distance_t high_past_error = read_errors[highbase + base + base_offset];
+
+                distance_t low_error = (low_concat_dist & 0xffff) + low_past_error;  // low 16 bits: distance for the even successor
+                distance_t high_error = (high_concat_dist & 0xffff) + high_past_error;
+
+                shift_register_t successor = low + offset;
+                distance_t error;
+                uint8_t history_mask;
+                if (low_error <= high_error) {
+                    error = low_error;
+                    history_mask = 0;
+                } else {
+                    error = high_error;
+                    history_mask = 1;
+                }
+                write_errors[successor] = error;
+                history[successor] = history_mask;
+
+                shift_register_t low_plus_one = low + offset + 1;
+
+                distance_t low_plus_one_error = (low_concat_dist >> 16) + low_past_error;  // high 16 bits: distance for the odd successor
+                distance_t high_plus_one_error = (high_concat_dist >> 16) + high_past_error;
+
+                shift_register_t plus_one_successor = low_plus_one;
+                distance_t plus_one_error;
+                uint8_t plus_one_history_mask;
+                if (low_plus_one_error <= high_plus_one_error) {
+                    plus_one_error = low_plus_one_error;
+                    plus_one_history_mask = 0;
+                } else {
+                    plus_one_error = high_plus_one_error;
+                    plus_one_history_mask = 1;
+                }
+                write_errors[plus_one_successor] = plus_one_error;
+                history[plus_one_successor] = plus_one_history_mask;
+            }
+        }
+
+        history_buffer_process(conv->history_buffer, write_errors, conv->bit_writer);
+        error_buffer_swap(conv->errors);
+    }
+}
+
+void convolutional_decode_tail(correct_convolutional *conv, unsigned int sets,
+                               const uint8_t *soft) {  // soft == NULL: hard input is read from conv->bit_reader
+    // flush state registers
+    // now we only shift in 0s, skipping 1-successors
+    shift_register_t highbit = 1 << (conv->order - 1);
+    for (unsigned int i = sets - conv->order + 1; i < sets; i++) {  // NOTE(review): the start value wraps when sets + 1 < conv->order -- confirm callers never pass such short input
+        // lasterrors are the aggregate bit errors for the states of shiftregister for the previous
+        // time slice
+        const distance_t *read_errors = conv->errors->read_errors;
+        // aggregate bit errors for this time slice
+        distance_t *write_errors = conv->errors->write_errors;
+
+        uint8_t *history = history_buffer_get_slice(conv->history_buffer);
+
+        // calculate the distance from all output states to our sliced bits
+        distance_t *distances = conv->distances;
+        if (soft) {
+            if (conv->soft_measurement == CORRECT_SOFT_LINEAR) {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_linear(j, soft + i * conv->rate, conv->rate);
+                }
+            } else {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_quadratic(j, soft + i * conv->rate, conv->rate);
+                }
+            }
+        } else {
+            unsigned int out = bit_reader_read(conv->bit_reader, conv->rate);
+            for (unsigned int i = 0; i < 1 << (conv->rate); i++) {
+                distances[i] = metric_distance(i, out);
+            }
+        }
+        const unsigned int *table = conv->table;
+
+        // num_iter is the total number of shift register states (2 * highbit)
+        unsigned int num_iter = highbit << 1;
+        unsigned int skip = 1 << (conv->order - (sets - i));  // stride between visited states; doubles on every tail step
+        unsigned int base_skip = skip >> 1;
+
+        shift_register_t highbase = highbit >> 1;
+        for (shift_register_t low = 0, high = highbit, base = 0; high < num_iter;
+             low += skip, high += skip, base += base_skip) {
+            unsigned int low_output = table[low];
+            unsigned int high_output = table[high];
+            distance_t low_dist = distances[low_output];
+            distance_t high_dist = distances[high_output];
+
+            distance_t low_past_error = read_errors[base];
+            distance_t high_past_error = read_errors[highbase + base];
+
+            distance_t low_error = low_dist + low_past_error;
+            distance_t high_error = high_dist + high_past_error;
+
+            shift_register_t successor = low;
+            distance_t error;
+            uint8_t history_mask;
+            if (low_error < high_error) {  // note: strict <, whereas the inner decode loop uses <=
+                error = low_error;
+                history_mask = 0;
+            } else {
+                error = high_error;
+                history_mask = 1;
+            }
+            write_errors[successor] = error;
+            history[successor] = history_mask;
+        }
+
+        history_buffer_process_skip(conv->history_buffer, write_errors, conv->bit_writer, skip);
+        error_buffer_swap(conv->errors);
+    }
+}
+
+void _convolutional_decode_init(correct_convolutional *conv, unsigned int min_traceback,
+                                unsigned int traceback_length, unsigned int renormalize_interval) {
+    conv->has_init_decode = true;  // tells teardown that the decoder buffers below must be freed
+
+    conv->distances = calloc(1 << (conv->rate), sizeof(distance_t));  // one slot per possible rate-bit output
+    conv->pair_lookup = pair_lookup_create(conv->rate, conv->order, conv->table);
+
+    conv->soft_measurement = CORRECT_SOFT_LINEAR;
+
+    // traceback bounds come from the caller (_convolutional_decode passes 5 * order and 15 * order)
+    conv->history_buffer = history_buffer_create(min_traceback, traceback_length, renormalize_interval,
+                                                 conv->numstates / 2, 1 << (conv->order - 1));
+
+    conv->errors = error_buffer_create(conv->numstates);
+}
+
+static ssize_t _convolutional_decode(correct_convolutional *conv, size_t num_encoded_bits,
+                                     size_t num_encoded_bytes, uint8_t *msg,
+                                     const soft_t *soft_encoded) {  // soft_encoded == NULL means hard-decision input
+    if (!conv->has_init_decode) {  // decoder buffers are built lazily on the first decode
+        uint64_t max_error_per_input = conv->rate * soft_max;
+        unsigned int renormalize_interval = distance_max / max_error_per_input;
+        _convolutional_decode_init(conv, 5 * conv->order, 15 * conv->order, renormalize_interval);
+    }
+
+    size_t sets = num_encoded_bits / conv->rate;  // number of rate-bit groups in the input
+    // XXX fix this vvvvvv (the writer is told the encoded byte count, not the decoded length)
+    size_t decoded_len_bytes = num_encoded_bytes;
+    bit_writer_reconfigure(conv->bit_writer, msg, decoded_len_bytes);
+
+    error_buffer_reset(conv->errors);
+    history_buffer_reset(conv->history_buffer);
+
+    // no outputs are generated during warmup
+    convolutional_decode_warmup(conv, sets, soft_encoded);
+    convolutional_decode_inner(conv, sets, soft_encoded);
+    convolutional_decode_tail(conv, sets, soft_encoded);
+
+    history_buffer_flush(conv->history_buffer, conv->bit_writer);
+
+    return bit_writer_length(conv->bit_writer);  // whole bytes written to msg
+}
+
+// perform viterbi decoding
+// hard decoder
+ssize_t correct_convolutional_decode(correct_convolutional *conv, const uint8_t *encoded,
+                                     size_t num_encoded_bits, uint8_t *msg) {
+    if ((num_encoded_bits % conv->rate) != 0) {
+        // XXX turn this into an error code
+        // printf("encoded length of message must be a multiple of rate\n");
+        return -1;
+    }
+    // bytes holding the encoded bits, counting a trailing partial byte as a whole one
+    const size_t partial_byte = (num_encoded_bits % 8) ? 1 : 0;
+    size_t num_encoded_bytes = num_encoded_bits / 8 + partial_byte;
+
+    bit_reader_reconfigure(conv->bit_reader, encoded, num_encoded_bytes);
+    return _convolutional_decode(conv, num_encoded_bits, num_encoded_bytes, msg, NULL);
+}
+
+ssize_t correct_convolutional_decode_soft(correct_convolutional *conv, const soft_t *encoded,
+                                          size_t num_encoded_bits, uint8_t *msg) {
+    if ((num_encoded_bits % conv->rate) != 0) {
+        // XXX turn this into an error code
+        // printf("encoded length of message must be a multiple of rate\n");
+        return -1;
+    }
+    // byte count of the equivalent hard-bit stream, counting a trailing partial byte as a whole one
+    const size_t partial_byte = (num_encoded_bits % 8) ? 1 : 0;
+    size_t num_encoded_bytes = num_encoded_bits / 8 + partial_byte;
+
+    return _convolutional_decode(conv, num_encoded_bits, num_encoded_bytes, msg, encoded);
+}
diff --git a/libcorrect/src/convolutional/encode.c b/libcorrect/src/convolutional/encode.c
new file mode 100644
index 0000000..5041262
--- /dev/null
+++ b/libcorrect/src/convolutional/encode.c
@@ -0,0 +1,61 @@
+#include "correct/convolutional/convolutional.h"
+
+size_t correct_convolutional_encode_len(correct_convolutional *conv, size_t msg_len) {
+    // encoded size in bits: every message bit plus (order + 1) flush steps emits `rate` bits
+    const size_t steps = 8 * msg_len + conv->order + 1;
+    return conv->rate * steps;
+}
+
+// shift in most significant bit every time, one byte at a time
+// shift register takes most recent bit on right, shifts left
+// poly is written in same order, just & mask message w/ poly
+
+// assume that encoded length is long enough?
+size_t correct_convolutional_encode(correct_convolutional *conv,
+                                    const uint8_t *msg,
+                                    size_t msg_len,
+                                    uint8_t *encoded) {
+    // Encodes msg_len bytes of msg into `encoded` and returns the encoded
+    //     length in bits. The caller's buffer is not bounds-checked here.
+    // the output buffer is sized in whole bytes, so round the bit count up
+    const size_t total_bits = correct_convolutional_encode_len(conv, msg_len);
+    const size_t total_bytes = total_bits / 8 + ((total_bits % 8) ? 1 : 0);
+    bit_writer_reconfigure(conv->bit_writer, encoded, total_bytes);
+    bit_reader_reconfigure(conv->bit_reader, msg, msg_len);
+
+    // keep only the low `order` bits of the register after every shift
+    // e.g. if order is 7, then the 8th bit and beyond are dropped
+    const unsigned int keep_bits = (1 << conv->order) - 1;
+
+    // the register holds the recent message history, one bit per position,
+    //     with the oldest bits on the left and the newest on the right
+    shift_register_t reg = 0;
+
+    // message phase: pull one input bit per step into the low end of the register
+    size_t bits_left = 8 * msg_len;
+    while (bits_left > 0) {
+        reg <<= 1;
+        reg |= bit_reader_read(conv->bit_reader, 1);
+        reg &= keep_bits;
+
+        // the table row for this register value already holds all `rate`
+        //     output bits of the convolution, so emit it directly
+        unsigned int symbol = conv->table[reg];
+        bit_writer_write(conv->bit_writer, symbol, conv->rate);
+        bits_left--;
+    }
+
+    // flush phase: same step as above with an all-0s input, repeated
+    //     order + 1 times
+    for (size_t step = conv->order + 1; step > 0; step--) {
+        reg <<= 1;
+        reg &= keep_bits;
+        unsigned int symbol = conv->table[reg];
+        bit_writer_write(conv->bit_writer, symbol, conv->rate);
+    }
+
+    // 0-fill any remaining bits on our final byte
+    bit_writer_flush_byte(conv->bit_writer);
+
+    return total_bits;
+}
diff --git a/libcorrect/src/convolutional/error_buffer.c b/libcorrect/src/convolutional/error_buffer.c
new file mode 100644
index 0000000..a5fc0ab
--- /dev/null
+++ b/libcorrect/src/convolutional/error_buffer.c
@@ -0,0 +1,43 @@
+#include "correct/convolutional/error_buffer.h"
+
+error_buffer_t *error_buffer_create(unsigned int num_states) {
+    // returns NULL (with nothing left allocated) if any allocation fails
+    error_buffer_t *buf = calloc(1, sizeof(error_buffer_t));
+    if (!buf) return NULL;
+    buf->num_states = num_states;  // how large are the error buffers?
+
+    // two metric arrays, one for last round and one for this (double buffer); each
+    //   metric is the aggregated bit errors of the path ending at that register state
+    buf->errors[0] = calloc(num_states, sizeof(distance_t));
+    buf->errors[1] = calloc(num_states, sizeof(distance_t));
+    if (!buf->errors[0] || !buf->errors[1]) {
+        free(buf->errors[0]);  // free(NULL) is a no-op
+        free(buf->errors[1]);
+        free(buf);
+        return NULL;
+    }
+    buf->index = 0;  // which buffer are we using, 0 or 1?
+    buf->read_errors = buf->errors[0];
+    buf->write_errors = buf->errors[1];
+    return buf;
+}
+
+void error_buffer_destroy(error_buffer_t *buf) {
+    // release both metric arrays before the struct that points at them
+    for (int side = 0; side < 2; side++) free(buf->errors[side]);
+    free(buf);
+}
+
+void error_buffer_reset(error_buffer_t *buf) {
+    // clear the accumulated metrics, then restore the initial read/write sides
+    for (int s = 0; s < 2; s++) memset(buf->errors[s], 0, buf->num_states * sizeof(distance_t));
+    buf->index = 0;
+    buf->read_errors = buf->errors[0];
+    buf->write_errors = buf->errors[1];
+}
+
+void error_buffer_swap(error_buffer_t *buf) {  // exchange the read and write sides
+    buf->read_errors = buf->write_errors;  // metrics just written feed the next step
+    buf->index = (buf->read_errors == buf->errors[0]) ? 1 : 0;  // the other array
+    buf->write_errors = buf->errors[buf->index];  // pointer-derived: first call swaps too
+}
diff --git a/libcorrect/src/convolutional/history_buffer.c b/libcorrect/src/convolutional/history_buffer.c
new file mode 100644
index 0000000..f54ffdd
--- /dev/null
+++ b/libcorrect/src/convolutional/history_buffer.c
@@ -0,0 +1,158 @@
+#include "correct/convolutional/history_buffer.h"
+
+history_buffer *history_buffer_create(unsigned int min_traceback_length,
+                                      unsigned int traceback_group_length,
+                                      unsigned int renormalize_interval, unsigned int num_states,
+                                      shift_register_t highbit) {
+    // NOTE(review): none of the allocations in this function are checked for failure
+    history_buffer *buf = calloc(1, sizeof(history_buffer));
+
+    // these fields are assigned through casts (presumably they are const-qualified)
+    *(shift_register_t *)&buf->highbit = highbit;
+    *(unsigned int *)&buf->num_states = num_states;
+    *(unsigned int *)&buf->min_traceback_length = min_traceback_length;
+    *(unsigned int *)&buf->traceback_group_length = traceback_group_length;
+    *(unsigned int *)&buf->cap = min_traceback_length + traceback_group_length;
+
+    // cap rows of num_states zeroed bytes, plus cap bytes of traceback scratch
+    buf->history = malloc(buf->cap * sizeof(uint8_t *));
+    for (unsigned int row = 0; row < buf->cap; row++) {
+        buf->history[row] = calloc(num_states, sizeof(uint8_t));
+    }
+    buf->fetched = malloc(buf->cap * sizeof(uint8_t));
+    buf->renormalize_interval = renormalize_interval;
+    buf->renormalize_counter = 0;
+    buf->len = 0;
+    buf->index = 0;
+    return buf;
+}
+
+void history_buffer_destroy(history_buffer *buf) {
+    // release every per-slice row before the arrays that track them
+    unsigned int slice = buf->cap;
+    while (slice > 0) free(buf->history[--slice]);
+    free(buf->fetched);
+    free(buf->history);
+    free(buf);
+}
+
+void history_buffer_reset(history_buffer *buf) {
+    buf->index = 0;  // back to the first history row
+    buf->len = 0;    // no slices pending traceback; renormalize_counter is left as-is
+}
+
+uint8_t *history_buffer_get_slice(history_buffer *buf) { return buf->history[buf->index]; }  // row at the current index
+
+shift_register_t history_buffer_search(history_buffer *buf, const distance_t *distances,
+                                       unsigned int search_every) {
+    shift_register_t bestpath = 0;  // defined result even if nothing beats USHRT_MAX
+    distance_t leasterror = USHRT_MAX;
+    // scan every search_every-th state for the least error; ties keep the earliest state
+    for (shift_register_t state = 0; state < buf->num_states; state += search_every) {
+        if (distances[state] < leasterror) {
+            leasterror = distances[state];
+            bestpath = state;
+        }
+    }
+    return bestpath;
+}
+
+void history_buffer_renormalize(history_buffer *buf, distance_t *distances,
+                                shift_register_t min_register) {
+    // shift every metric down by the metric at min_register (read once, up front)
+    const distance_t floor_error = distances[min_register];
+    distance_t *const end = distances + buf->num_states;
+    for (distance_t *d = distances; d != end; d++) *d -= floor_error;
+}
+
+void history_buffer_traceback(history_buffer *buf, shift_register_t bestpath,
+                              unsigned int min_traceback_length, bit_writer_t *output) {
+    unsigned int fetched_index = 0;  // number of decision bits collected so far
+    shift_register_t highbit = buf->highbit;
+    unsigned int index = buf->index;
+    unsigned int cap = buf->cap;
+    for (unsigned int j = 0; j < min_traceback_length; j++) {
+        if (index == 0) {
+            index = cap - 1;  // step backwards through the ring of history rows
+        } else {
+            index--;
+        }
+        // phase 1: walk back min_traceback_length slices without emitting anything
+        // each step looks up the stored history byte for the current state and, if it
+        //   is set, ORs in highbit before shifting right, which gives the state to
+        //   use for the slice one step earlier
+        uint8_t history = buf->history[index][bestpath];
+        shift_register_t pathbit = history ? highbit : 0;
+        bestpath |= pathbit;
+        bestpath >>= 1;
+    }
+    unsigned int prefetch_index = index;  // kept one row behind `index` in the loop below
+    if (prefetch_index == 0) {
+        prefetch_index = cap - 1;
+    } else {
+        prefetch_index--;
+    }
+    unsigned int len = buf->len;
+    for (unsigned int j = min_traceback_length; j < len; j++) {
+        index = prefetch_index;
+        if (prefetch_index == 0) {
+            prefetch_index = cap - 1;
+        } else {
+            prefetch_index--;
+        }
+        prefetch(buf->history[prefetch_index]);
+        // phase 2: the same backward step over the remaining len - min_traceback_length
+        //   slices, except that each decision is also stored in buf->fetched as 0 or 1
+        // the bits are collected while walking backwards and are handed to the
+        //   writer's "reversed" bitlist call once the walk is finished
+        uint8_t history = buf->history[index][bestpath];
+        shift_register_t pathbit = history ? highbit : 0;
+        bestpath |= pathbit;
+        bestpath >>= 1;
+        buf->fetched[fetched_index] = (pathbit ? 1 : 0);
+        fetched_index++;
+    }
+    bit_writer_write_bitlist_reversed(output, buf->fetched, fetched_index);
+    buf->len -= fetched_index;  // the emitted slices no longer count as pending
+}
+
+void history_buffer_process_skip(history_buffer *buf, distance_t *distances, bit_writer_t *output,
+                                 unsigned int skip) {
+    // advance to the next history row, wrapping around at the end of the ring
+    buf->index = (buf->index + 1 == buf->cap) ? 0 : buf->index + 1;
+    buf->len++;
+    buf->renormalize_counter++;
+
+    // two independent triggers are evaluated after this step:
+    //   - renormalize once renormalize_interval steps have been counted
+    //   - trace back once the history holds cap slices
+    // both need the least-error state, and finding it is the expensive part,
+    //   so it is searched for at most once and shared between them
+    const int renormalize_due = (buf->renormalize_counter == buf->renormalize_interval);
+    const int traceback_due = (buf->len == buf->cap);
+    if (!renormalize_due && !traceback_due) {
+        return;
+    }
+
+    // `skip` is the stride of the search over states
+    shift_register_t bestpath = history_buffer_search(buf, distances, skip);
+
+    if (renormalize_due) {
+        buf->renormalize_counter = 0;
+        history_buffer_renormalize(buf, distances, bestpath);
+    }
+
+    if (traceback_due) {
+        // renormalizing subtracts the same amount everywhere, so the best state
+        //   found above is still the best one
+        history_buffer_traceback(buf, bestpath, buf->min_traceback_length, output);
+    }
+}
+
+void history_buffer_process(history_buffer *buf, distance_t *distances, bit_writer_t *output) {
+    history_buffer_process_skip(buf, distances, output, 1);  // stride 1: every state is searched
+}
+
+void history_buffer_flush(history_buffer *buf, bit_writer_t *output) {
+    history_buffer_traceback(buf, 0, 0, output);  // from state 0, no skipped slices: emit all of len
+}
diff --git a/libcorrect/src/convolutional/lookup.c b/libcorrect/src/convolutional/lookup.c
new file mode 100644
index 0000000..8c96aae
--- /dev/null
+++ b/libcorrect/src/convolutional/lookup.c
@@ -0,0 +1,74 @@
+#include "correct/convolutional/lookup.h"
+
+// table has numstates rows
+// each row contains all of the polynomial output bits concatenated together
+// e.g. for rate 2, we have 2 bits in each row
+// the first poly gets the LEAST significant bit, last poly gets most significant
+void fill_table(unsigned int rate,
+                unsigned int order,
+                const polynomial_t *poly,
+                unsigned int *table) {
+    // bit p of each state's row is the parity of the register bits selected by poly[p]
+    for (shift_register_t state = 0; state < 1 << order; state++) {
+        unsigned int row = 0;
+        unsigned int bit = 1;
+        for (size_t p = 0; p < rate; p++, bit <<= 1) {
+            if (popcount(state & poly[p]) % 2) row |= bit;
+        }
+        table[state] = row;
+    }
+}
+
+pair_lookup_t pair_lookup_create(unsigned int rate,
+                                 unsigned int order,
+                                 const unsigned int *table) {
+    pair_lookup_t pairs;
+    pairs.keys = malloc(sizeof(unsigned int) * (1 << (order - 1)));
+    // keys are handed out from 1 (0 means "no key yet"), so reserve one extra output slot
+    pairs.outputs = calloc((1 << (rate * 2)) + 1, sizeof(unsigned int));
+    unsigned int *inv_outputs = calloc((1 << (rate * 2)), sizeof(unsigned int));
+    unsigned int output_counter = 1;
+    // for every (even-numbered) shift register state, find the concatenated output of the state
+    //   and the subsequent state that follows it (low bit set). then, check to see if this
+    //   concatenated output has a unique key assigned to it already. if not, give it a key.
+    //   if it does, retrieve the key. assign this key to the shift register state.
+    for (unsigned int i = 0; i < (1 << (order - 1)); i++) {
+        // first get the concatenated pair of outputs
+        unsigned int out = table[i * 2 + 1];
+        out <<= rate;
+        out |= table[i * 2];
+
+        // does this concatenated output exist in the outputs table yet?
+        if (!inv_outputs[out]) {
+            // doesn't exist, allocate a new key
+            inv_outputs[out] = output_counter;
+            pairs.outputs[output_counter] = out;
+            output_counter++;
+        }
+        // set the opaque key for the ith shift register state to the concatenated output entry
+        pairs.keys[i] = inv_outputs[out];
+    }
+    pairs.outputs_len = output_counter;
+    pairs.output_mask = (1 << (rate)) - 1;
+    pairs.output_width = rate;
+    pairs.distances = calloc(pairs.outputs_len, sizeof(distance_pair_t));
+    free(inv_outputs);
+    return pairs;
+}
+
+void pair_lookup_destroy(pair_lookup_t pairs) {
+    free(pairs.distances);  // per-key packed metrics
+    free(pairs.outputs);    // key -> concatenated output
+    free(pairs.keys);       // state pair -> key
+}
+
+void pair_lookup_fill_distance(pair_lookup_t pairs, distance_t *distances) {
+    for (unsigned int i = 1; i < pairs.outputs_len; i += 1) {  // key 0 is never handed out
+        output_pair_t concat_out = pairs.outputs[i];
+        unsigned int i_0 = concat_out & pairs.output_mask;  // output stored in the low bits
+        concat_out >>= pairs.output_width;
+        unsigned int i_1 = concat_out;  // output stored above it
+        // widen first: a distance_t promoted to (signed) int could overflow at << 16
+        pairs.distances[i] = ((unsigned int)distances[i_1] << 16) | distances[i_0];
+    }
+}
diff --git a/libcorrect/src/convolutional/metric.c b/libcorrect/src/convolutional/metric.c
new file mode 100644
index 0000000..894db4d
--- /dev/null
+++ b/libcorrect/src/convolutional/metric.c
@@ -0,0 +1,17 @@
+#include "correct/convolutional/metric.h"
+
+// measure the square of the euclidean distance between x and y, scaled down by 8
+// (the square of sqrt(a^2 + b^2 + ... + n^2) is just a^2 + b^2 + ... + n^2)
+distance_t metric_soft_distance_quadratic(unsigned int hard_x, const uint8_t *soft_y, size_t len) {
+    // sum in unsigned int: one term can reach 255 * 255, which a 16-bit distance_t sum would wrap
+    unsigned int dist = 0;
+    for (unsigned int i = 0; i < len; i++) {
+        // first, convert hard_x to a soft measurement (0 -> 0, 1 -> 255)
+        int soft_x = (hard_x & 1) ? 255 : 0;
+        hard_x >>= 1;
+        int d = soft_y[i] - soft_x;
+        dist += d*d;
+    }
+    return dist >> 3;
+}
+
diff --git a/libcorrect/src/convolutional/sse/CMakeLists.txt b/libcorrect/src/convolutional/sse/CMakeLists.txt
new file mode 100644
index 0000000..0d0ade9
--- /dev/null
+++ b/libcorrect/src/convolutional/sse/CMakeLists.txt
@@ -0,0 +1,2 @@
+set(SRCFILES lookup.c convolutional.c encode.c decode.c)  # sources in this sse/ directory
+add_library(correct-convolutional-sse OBJECT ${SRCFILES})  # object library built from them
diff --git a/libcorrect/src/convolutional/sse/convolutional.c b/libcorrect/src/convolutional/sse/convolutional.c
new file mode 100644
index 0000000..484c5c6
--- /dev/null
+++ b/libcorrect/src/convolutional/sse/convolutional.c
@@ -0,0 +1,21 @@
+#include "correct/convolutional/sse/convolutional.h"
+
+correct_convolutional_sse *correct_convolutional_sse_create(size_t rate,
+                                                            size_t order,
+                                                            const polynomial_t *poly) {
+    correct_convolutional_sse *conv = malloc(sizeof(correct_convolutional_sse));
+    if (!conv) return NULL;  // out of memory: nothing to initialise or release
+    if (!_correct_convolutional_init(&conv->base_conv, rate, order, poly)) {
+        free(conv);  // base init reported failure, so drop the wrapper
+        return NULL;
+    }
+    return conv;  // caller releases it with correct_convolutional_sse_destroy
+}
+
+void correct_convolutional_sse_destroy(correct_convolutional_sse *conv) {
+    if (conv->base_conv.has_init_decode) {  // oct_lookup is created in the sse decode init
+        oct_lookup_destroy(conv->oct_lookup);
+    }
+    _correct_convolutional_teardown(&conv->base_conv);  // tear down the embedded base state
+    free(conv);  // finally the wrapper allocated by correct_convolutional_sse_create
+}
diff --git a/libcorrect/src/convolutional/sse/decode.c b/libcorrect/src/convolutional/sse/decode.c
new file mode 100644
index 0000000..0f6bcf5
--- /dev/null
+++ b/libcorrect/src/convolutional/sse/decode.c
@@ -0,0 +1,319 @@
+#include "correct/convolutional/sse/convolutional.h"
+
+static void convolutional_sse_decode_inner(correct_convolutional_sse *sse_conv, unsigned int sets,
+                                           const uint8_t *soft) {
+    // SSE version of the decode loop that runs between warmup and tail: updates the
+    //   path metrics and history rows for sets [order - 1, sets - order + 1)
+    correct_convolutional *conv = &sse_conv->base_conv;
+    shift_register_t highbit = 1 << (conv->order - 1);
+    unsigned int hist_buf_index = conv->history_buffer->index;
+    unsigned int hist_buf_cap = conv->history_buffer->cap;
+    unsigned int hist_buf_len = conv->history_buffer->len;
+    unsigned int hist_buf_rn_int = conv->history_buffer->renormalize_interval;
+    unsigned int hist_buf_rn_cnt = conv->history_buffer->renormalize_counter;
+    // the bound is written as a sum so it cannot wrap around when sets < order - 1
+    for (unsigned int i = conv->order - 1; i + conv->order - 1 < sets; i++) {
+        distance_t *distances = conv->distances;
+        // build the branch metric of every possible rate-bit output against
+        // what was received for this time slice
+        if (soft) {
+            if (conv->soft_measurement == CORRECT_SOFT_LINEAR) {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_linear(j, soft + i * conv->rate, conv->rate);
+                }
+            } else {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_quadratic(j, soft + i * conv->rate, conv->rate);
+                }
+            }
+        } else {
+            unsigned int out = bit_reader_read(conv->bit_reader, conv->rate);
+            for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                distances[j] = metric_distance(j, out);
+            }
+        }
+        oct_lookup_t oct_lookup = sse_conv->oct_lookup;
+        oct_lookup_fill_distance(oct_lookup, distances);
+
+        // a mask to get the high order bit from the shift register
+        unsigned int num_iter = highbit << 1;
+        const distance_t *read_errors = conv->errors->read_errors;
+        // aggregate bit errors for this time slice
+        distance_t *write_errors = conv->errors->write_errors;
+
+        uint8_t *history = conv->history_buffer->history[hist_buf_index];
+        // walk through all states, ignoring oldest bit
+        // we will track a best register state (path) and the number of bit
+        // errors at that path at this time slice
+        // this loop considers two paths per iteration (high order bit set,
+        // clear)
+        // so, it only runs numstates/2 iterations
+        // we'll update the history for every state and find the path with the
+        // least aggregated bit errors
+
+        // now run the main loop
+        // we calculate 2 sets of 2 register states here (4 states per iter)
+        // this creates 2 sets which share a predecessor, and 2 sets which share
+        // a successor
+        //
+        // the first set definition is the two states that are the same except
+        // for the least order bit
+        // these two share a predecessor because their high n - 1 bits are the
+        // same (differ only by newest bit)
+        //
+        // the second set definition is the two states that are the same except
+        // for the high order bit
+        // these two share a successor because the oldest high order bit will be
+        // shifted out, and the other bits will be present in the successor
+        //
+        shift_register_t highbase = highbit >> 1;
+        shift_register_t oct_highbase = highbase >> 2;
+        for (shift_register_t low = 0, high = highbit, base = 0, oct = 0; high < num_iter;
+             low += 32, high += 32, base += 16, oct += 4) {
+            // shifted-right ancestors
+            // low and low_plus_one share low_past_error
+            //   note that they are the same when shifted right by 1
+            // same goes for high and high_plus_one
+            __m128i past_shuffle_mask =
+                _mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100);
+            __m128i hist_mask =
+                _mm_set_epi32(0x80808080, 0x80808080, 0x0e0c0a09, 0x07050301);
+
+            // the loop below calculates 64 register states per loop iteration
+            // it does this by packing the 128-bit xmm registers with 8, 16-bit
+            // distances
+            // 4 of these registers hold distances for convolutional shift
+            // register states with the high bit cleared
+            //      and 4 hold distances for the corresponding shift register
+            //      states with the high bit set
+            // since each xmm register holds 8 distances, this adds up to a
+            // total of 8 * 8 = 64 shift register states
+            // (base_offset only takes the value 0, so this body runs once per outer step)
+            for (shift_register_t offset = 0, base_offset = 0; base_offset < 16;
+                 offset += 32, base_offset += 16) {
+                // load the past error for the register states with the high
+                // order bit cleared
+                __m128i low_past_error =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset));
+                __m128i low_past_error0 =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset + 4));
+                __m128i low_past_error1 =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset + 8));
+                __m128i low_past_error2 =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset + 12));
+
+                // shuffle the low past error
+                // register states that differ only by their low order bit share
+                // a past error
+                low_past_error = _mm_shuffle_epi8(low_past_error, past_shuffle_mask);
+                low_past_error0 = _mm_shuffle_epi8(low_past_error0, past_shuffle_mask);
+                low_past_error1 = _mm_shuffle_epi8(low_past_error1, past_shuffle_mask);
+                low_past_error2 = _mm_shuffle_epi8(low_past_error2, past_shuffle_mask);
+
+                // repeat past error lookup for register states with high order
+                // bit set
+                __m128i high_past_error =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + highbase + base + base_offset));
+                __m128i high_past_error0 = _mm_loadl_epi64(
+                    (const __m128i *)(read_errors + highbase + base + base_offset + 4));
+                __m128i high_past_error1 = _mm_loadl_epi64(
+                    (const __m128i *)(read_errors + highbase + base + base_offset + 8));
+                __m128i high_past_error2 = _mm_loadl_epi64(
+                    (const __m128i *)(read_errors + highbase + base + base_offset + 12));
+
+                high_past_error = _mm_shuffle_epi8(high_past_error, past_shuffle_mask);
+                high_past_error0 = _mm_shuffle_epi8(high_past_error0, past_shuffle_mask);
+                high_past_error1 = _mm_shuffle_epi8(high_past_error1, past_shuffle_mask);
+                high_past_error2 = _mm_shuffle_epi8(high_past_error2, past_shuffle_mask);
+
+                // load the opaque oct distance table keys from our loop index
+                distance_oct_key_t low_key = oct_lookup.keys[oct + (base_offset / 4)];
+                distance_oct_key_t low_key0 = oct_lookup.keys[oct + (base_offset / 4) + 1];
+                distance_oct_key_t low_key1 = oct_lookup.keys[oct + (base_offset / 4) + 2];
+                distance_oct_key_t low_key2 = oct_lookup.keys[oct + (base_offset / 4) + 3];
+
+                // load the distances for the register states with high order
+                // bit cleared
+                __m128i low_this_error =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + low_key));
+                __m128i low_this_error0 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + low_key0));
+                __m128i low_this_error1 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + low_key1));
+                __m128i low_this_error2 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + low_key2));
+
+                // add the distance for this time slice to the past distances
+                __m128i low_error = _mm_add_epi16(low_past_error, low_this_error);
+                __m128i low_error0 = _mm_add_epi16(low_past_error0, low_this_error0);
+                __m128i low_error1 = _mm_add_epi16(low_past_error1, low_this_error1);
+                __m128i low_error2 = _mm_add_epi16(low_past_error2, low_this_error2);
+
+                // repeat oct distance table lookup for registers with high
+                // order bit set
+                distance_oct_key_t high_key =
+                    oct_lookup.keys[oct_highbase + oct + (base_offset / 4)];
+                distance_oct_key_t high_key0 =
+                    oct_lookup.keys[oct_highbase + oct + (base_offset / 4) + 1];
+                distance_oct_key_t high_key1 =
+                    oct_lookup.keys[oct_highbase + oct + (base_offset / 4) + 2];
+                distance_oct_key_t high_key2 =
+                    oct_lookup.keys[oct_highbase + oct + (base_offset / 4) + 3];
+
+                __m128i high_this_error =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + high_key));
+                __m128i high_this_error0 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + high_key0));
+                __m128i high_this_error1 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + high_key1));
+                __m128i high_this_error2 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + high_key2));
+
+                __m128i high_error = _mm_add_epi16(high_past_error, high_this_error);
+                __m128i high_error0 = _mm_add_epi16(high_past_error0, high_this_error0);
+                __m128i high_error1 = _mm_add_epi16(high_past_error1, high_this_error1);
+                __m128i high_error2 = _mm_add_epi16(high_past_error2, high_this_error2);
+
+                // distances for this time slice calculated
+
+                // find the least error between registers who differ only in
+                // their high order bit
+                __m128i min_error = _mm_min_epu16(low_error, high_error);
+                __m128i min_error0 = _mm_min_epu16(low_error0, high_error0);
+                __m128i min_error1 = _mm_min_epu16(low_error1, high_error1);
+                __m128i min_error2 = _mm_min_epu16(low_error2, high_error2);
+
+                _mm_store_si128((__m128i *)(write_errors + low + offset), min_error);
+                _mm_store_si128((__m128i *)(write_errors + low + offset + 8), min_error0);
+                _mm_store_si128((__m128i *)(write_errors + low + offset + 16), min_error1);
+                _mm_store_si128((__m128i *)(write_errors + low + offset + 24), min_error2);
+
+                // generate history bits as (low_error > least_error)
+                // this operation fills each element with all 1s if true and 0s
+                // if false
+                // in other words, we set the history bit to 1 if
+                //      the register state with high order bit set was the least
+                //      error
+                __m128i hist = _mm_cmpgt_epi16(low_error, min_error);
+                // pack the bits down from 16-bit wide to 8-bit wide to
+                // accommodate history table
+                hist = _mm_shuffle_epi8(hist, hist_mask);
+
+                __m128i hist0 = _mm_cmpgt_epi16(low_error0, min_error0);
+                hist0 = _mm_shuffle_epi8(hist0, hist_mask);
+
+                __m128i hist1 = _mm_cmpgt_epi16(low_error1, min_error1);
+                hist1 = _mm_shuffle_epi8(hist1, hist_mask);
+
+                __m128i hist2 = _mm_cmpgt_epi16(low_error2, min_error2);
+                hist2 = _mm_shuffle_epi8(hist2, hist_mask);
+
+                // write the least error so that the next time slice sees it as
+                // the past error
+                // store the history bits set by cmp and shuffle operations
+                _mm_storel_epi64((__m128i *)(history + low + offset), hist);
+                _mm_storel_epi64((__m128i *)(history + low + offset + 8), hist0);
+                _mm_storel_epi64((__m128i *)(history + low + offset + 16), hist1);
+                _mm_storel_epi64((__m128i *)(history + low + offset + 24), hist2);
+            }
+        }
+
+        // bypass the call to history buffer
+        // we should really make that function inline and remove this below
+        if (hist_buf_len == hist_buf_cap - 1 || hist_buf_rn_cnt == hist_buf_rn_int - 1) {
+            // restore hist buffer state and invoke it
+            conv->history_buffer->len = hist_buf_len;
+            conv->history_buffer->index = hist_buf_index;
+            conv->history_buffer->renormalize_counter = hist_buf_rn_cnt;
+            history_buffer_process(conv->history_buffer, write_errors, conv->bit_writer);
+            // restore our local values
+            hist_buf_len = conv->history_buffer->len;
+            hist_buf_index = conv->history_buffer->index;
+            hist_buf_cap = conv->history_buffer->cap;
+            hist_buf_rn_cnt = conv->history_buffer->renormalize_counter;
+        } else {
+            hist_buf_len++;
+            hist_buf_index++;
+            if (hist_buf_index == hist_buf_cap) {
+                hist_buf_index = 0;
+            }
+            hist_buf_rn_cnt++;
+        }
+        error_buffer_swap(conv->errors);
+    }
+    conv->history_buffer->len = hist_buf_len;
+    conv->history_buffer->index = hist_buf_index;
+    conv->history_buffer->renormalize_counter = hist_buf_rn_cnt;
+}
+
+static void _convolutional_sse_decode_init(correct_convolutional_sse *conv,
+                                           unsigned int min_traceback,
+                                           unsigned int traceback_length,
+                                           unsigned int renormalize_interval) {
+    // set up the shared decoder state first, then the SSE-only lookup built from its table
+    correct_convolutional *base = &conv->base_conv;
+    _convolutional_decode_init(base, min_traceback, traceback_length, renormalize_interval);
+    conv->oct_lookup = oct_lookup_create(base->rate, base->order, base->table);
+}
+
+static ssize_t _convolutional_sse_decode(correct_convolutional_sse *sse_conv,
+                                         size_t num_encoded_bits, size_t num_encoded_bytes,
+                                         uint8_t *msg, const soft_t *soft_encoded) {
+    correct_convolutional *conv = &sse_conv->base_conv;
+    // decoder state is set up on demand, when has_init_decode is still clear
+    if (!conv->has_init_decode) {
+        // sse implementation unfortunately uses signed math on our unsigned values
+        // reduces usable distance by /2
+        uint64_t worst_step_error = conv->rate * soft_max;
+        unsigned int steps_between_renorm = (distance_max / 2) / worst_step_error;
+        _convolutional_sse_decode_init(sse_conv, 5 * conv->order, 100 * conv->order,
+                                       steps_between_renorm);
+    }
+
+    // XXX fix this: the output buffer is sized like the encoded input
+    bit_writer_reconfigure(conv->bit_writer, msg, num_encoded_bytes);
+
+    // every call starts from cleared path metrics and an empty history
+    history_buffer_reset(conv->history_buffer);
+    error_buffer_reset(conv->errors);
+
+    // one set is one group of `rate` encoded bits; only the middle stretch is SSE
+    const size_t sets = num_encoded_bits / conv->rate;
+    convolutional_decode_warmup(conv, sets, soft_encoded);  // no outputs are generated here
+    convolutional_sse_decode_inner(sse_conv, sets, soft_encoded);
+    convolutional_decode_tail(conv, sets, soft_encoded);
+
+    history_buffer_flush(conv->history_buffer, conv->bit_writer);
+    return bit_writer_length(conv->bit_writer);
+}
+
+ssize_t correct_convolutional_sse_decode(correct_convolutional_sse *conv, const uint8_t *encoded,
+                                         size_t num_encoded_bits, uint8_t *msg) {
+    correct_convolutional *base = &conv->base_conv;
+    // XXX turn this into an error code
+    if (num_encoded_bits % base->rate != 0) {
+        return -1;  // input must be a whole number of rate-sized symbols
+    }
+
+    // hard-decision input is read through the bit reader, so point it at `encoded`;
+    //   the byte count rounds a trailing partial byte up
+    size_t num_encoded_bytes = num_encoded_bits / 8 + ((num_encoded_bits % 8) ? 1 : 0);
+    bit_reader_reconfigure(base->bit_reader, encoded, num_encoded_bytes);
+    return _convolutional_sse_decode(conv, num_encoded_bits, num_encoded_bytes, msg, NULL);
+}
+
+ssize_t correct_convolutional_sse_decode_soft(correct_convolutional_sse *conv, const soft_t *encoded,
+                                              size_t num_encoded_bits, uint8_t *msg) {
+    // soft-decision entry point: `encoded` goes to the decoder as soft input and
+    //   the bit reader is not reconfigured
+    // XXX turn this into an error code
+    if (num_encoded_bits % conv->base_conv.rate != 0) {
+        return -1;
+    }
+    // byte count for the same number of bits, rounding a partial byte up
+    size_t num_encoded_bytes = num_encoded_bits / 8;
+    if (num_encoded_bits % 8) num_encoded_bytes++;
+    return _convolutional_sse_decode(conv, num_encoded_bits, num_encoded_bytes, msg, encoded);
+}
diff --git a/libcorrect/src/convolutional/sse/encode.c b/libcorrect/src/convolutional/sse/encode.c
new file mode 100644
index 0000000..92ea10d
--- /dev/null
+++ b/libcorrect/src/convolutional/sse/encode.c
@@ -0,0 +1,9 @@
+#include "correct/convolutional/sse/convolutional.h"
+
+// Encoded-length query for the SSE variant; delegates to the base convolutional coder.
+size_t correct_convolutional_sse_encode_len(correct_convolutional_sse *conv, size_t msg_len) {
+    const size_t encoded_len = correct_convolutional_encode_len(&conv->base_conv, msg_len);
+    return encoded_len;
+}
+
+// Encode for the SSE variant; delegates to the base convolutional coder and
+//   returns its result unchanged.
+size_t correct_convolutional_sse_encode(correct_convolutional_sse *conv, const uint8_t *msg, size_t msg_len, uint8_t *encoded) {
+    const size_t result = correct_convolutional_encode(&conv->base_conv, msg, msg_len, encoded);
+    return result;
+}
diff --git a/libcorrect/src/convolutional/sse/lookup.c b/libcorrect/src/convolutional/sse/lookup.c
new file mode 100644
index 0000000..472dd8f
--- /dev/null
+++ b/libcorrect/src/convolutional/sse/lookup.c
@@ -0,0 +1,183 @@
+#include "correct/convolutional/sse/lookup.h"
+
+// Build a lookup that maps each group of four consecutive entries of `table`
+//   to a small key identifying the group's concatenated output.
+// keys[i] is the key for table[4*i .. 4*i+3]; outputs[key] is the concatenated
+//   output itself. key 0 is never handed out because 0 marks "no key yet" in the
+//   temporary inverse table below.
+// The arrays allocated here are released by quad_lookup_destroy.
+quad_lookup_t quad_lookup_create(unsigned int rate,
+                                 unsigned int order,
+                                 const unsigned int *table) {
+    quad_lookup_t quads;
+
+    quads.keys = malloc(sizeof(unsigned int) * (1 << (order - 2)));
+    quads.outputs = calloc((1 << (rate * 4)), sizeof(unsigned int));
+    // inverse of quads.outputs: concatenated output -> key (0 means unassigned)
+    unsigned int *key_for_output = calloc((1 << (rate * 4)), sizeof(unsigned int));
+    unsigned int next_key = 1;
+
+    for (unsigned int i = 0; i < (1 << (order - 2)); i++) {
+        // concatenate the four outputs: entry 3 lands in the highest bits, entry 0 in the lowest
+        unsigned int out = 0;
+        for (int k = 3; k >= 0; k--) {
+            out = (out << rate) | table[i * 4 + k];
+        }
+
+        unsigned int key = key_for_output[out];
+        if (key == 0) {
+            // first time this concatenated output is seen, so give it a fresh key
+            key = next_key;
+            next_key++;
+            key_for_output[out] = key;
+            quads.outputs[key] = out;
+        }
+        // the ith group is identified by the key of its concatenated output
+        quads.keys[i] = key;
+    }
+
+    quads.outputs_len = next_key;
+    quads.output_mask = (1 << (rate)) - 1;
+    quads.output_width = rate;
+    quads.distances = calloc(quads.outputs_len, sizeof(distance_quad_t));
+    free(key_for_output);
+    return quads;
+}
+
+// Release the three heap arrays owned by a quad lookup.
+void quad_lookup_destroy(quad_lookup_t quads) {
+    free(quads.distances);
+    free(quads.outputs);
+    free(quads.keys);
+}
+
+// Fill quads.distances from a table of per-output distances.
+// For every key (keys start at 1), the four outputs packed into quads.outputs[key]
+//   are unpacked, each is looked up in `distances`, and the four results are packed
+//   into one 64-bit word at bit offsets 0, 16, 32 and 48 (lowest output first).
+void quad_lookup_fill_distance(quad_lookup_t quads, distance_t *distances) {
+    for (unsigned int i = 1; i < quads.outputs_len; i += 1) {
+        output_quad_t concat_out = quads.outputs[i];
+        unsigned int i_0 = concat_out & quads.output_mask;
+        concat_out >>= quads.output_width;
+        unsigned int i_1 = concat_out & quads.output_mask;
+        concat_out >>= quads.output_width;
+        unsigned int i_2 = concat_out & quads.output_mask;
+        concat_out >>= quads.output_width;
+        unsigned int i_3 = concat_out;
+
+        // widen every term before shifting. if distance_t is narrower than int (the
+        //   16-bit lanes suggest it is) an uncast value is promoted to signed int, and
+        //   shifting it left by 16 can set the sign bit, which then sign-extends into
+        //   the upper 32 bits when OR'd with the 64-bit terms
+        quads.distances[i] = ((uint64_t)distances[i_3] << 48) | ((uint64_t)distances[i_2] << 32) |
+                             ((uint64_t)distances[i_1] << 16) | (uint64_t)distances[i_0];
+    }
+}
+
+// Linear search of outputs[1 .. num_keys-1] for `out`.
+// Returns the matching index, or 0 when nothing matches. Slot 0 is never examined,
+//   so 0 doubles as the "not found" value.
+distance_oct_key_t oct_lookup_find_key(output_oct_t *outputs, output_oct_t out, size_t num_keys) {
+    size_t idx = 1;
+    while (idx < num_keys && outputs[idx] != out) {
+        idx++;
+    }
+    if (idx < num_keys) {
+        return idx;
+    }
+    return 0;
+}
+
+// Build the oct lookup: one key per group of eight consecutive entries of `table`.
+// Two forms of each group's concatenated output are built:
+//   - a short form, `rate` bits per entry, kept only in the temporary short_outs
+//     array and used to detect groups that share the same outputs
+//   - an expanded form, 8 bits per entry, stored in octs.outputs
+// octs.keys[i] holds twice the key of the ith group. key 0 is never assigned
+//   because oct_lookup_find_key uses 0 to mean "not found".
+// NOTE(review): none of the malloc/calloc/realloc results are checked, and
+//   octs.distances is allocated here but not initialized.
+oct_lookup_t oct_lookup_create(unsigned int rate,
+                                 unsigned int order,
+                                 const unsigned int *table) {
+    oct_lookup_t octs;
+
+    octs.keys = malloc((1 << (order - 3)) * sizeof(distance_oct_key_t));
+    octs.outputs = malloc(((output_oct_t)2 << rate) * sizeof(uint64_t));
+    output_oct_t *short_outs = calloc(((output_oct_t)2 << rate), sizeof(output_oct_t));
+    // current capacity (in entries) of octs.outputs and short_outs; doubled on demand below
+    size_t outputs_len = 2 << rate;
+    // next key to hand out; starts at 1 so that 0 can mean "not found"
+    unsigned int output_counter = 1;
+    // for every group of eight consecutive table entries, build the concatenated output
+    //   of the group. then, check to see if this concatenated output has a unique key
+    //   assigned to it already. if not, give it a key. if it does, retrieve the key.
+    //   assign this key to the group.
+    for (shift_register_t i = 0; i < (1 << (order - 3)); i++) {
+        // first get the concatenated oct of outputs (short form, `rate` bits per entry,
+        //   entry 7 in the highest bits and entry 0 in the lowest)
+        output_oct_t out = table[i * 8 + 7];
+        out <<= rate;
+        out |= table[i * 8 + 6];
+        out <<= rate;
+        out |= table[i * 8 + 5];
+        out <<= rate;
+        out |= table[i * 8 + 4];
+        out <<= rate;
+        out |= table[i * 8 + 3];
+        out <<= rate;
+        out |= table[i * 8 + 2];
+        out <<= rate;
+        out |= table[i * 8 + 1];
+        out <<= rate;
+        out |= table[i * 8];
+
+        distance_oct_key_t key = oct_lookup_find_key(short_outs, out, output_counter);
+        // does this concatenated output exist in the outputs table yet?
+        if (!key) {
+            // doesn't exist, allocate a new key
+            // now build it in expanded form (8 bits per entry, same entry order)
+            output_oct_t expanded_out = table[i * 8 + 7];
+            expanded_out <<= 8;
+            expanded_out |= table[i * 8 + 6];
+            expanded_out <<= 8;
+            expanded_out |= table[i * 8 + 5];
+            expanded_out <<= 8;
+            expanded_out |= table[i * 8 + 4];
+            expanded_out <<= 8;
+            expanded_out |= table[i * 8 + 3];
+            expanded_out <<= 8;
+            expanded_out |= table[i * 8 + 2];
+            expanded_out <<= 8;
+            expanded_out |= table[i * 8 + 1];
+            expanded_out <<= 8;
+            expanded_out |= table[i * 8];
+
+            // both tables are full: double their capacity before storing the new entry
+            if (output_counter == outputs_len) {
+                octs.outputs = realloc(octs.outputs, outputs_len * 2 * sizeof(output_oct_t));
+                short_outs = realloc(short_outs, outputs_len * 2 * sizeof(output_oct_t));
+                outputs_len *= 2;
+            }
+            short_outs[output_counter] = out;
+            octs.outputs[output_counter] = expanded_out;
+            key = output_counter;
+            output_counter++;
+        }
+        // set the opaque key for the ith group to the concatenated output entry
+        // we multiply the key by 2 since the distances are strided by 2
+        octs.keys[i] = key * 2;
+    }
+    // the short forms were only needed for duplicate detection
+    free(short_outs);
+    octs.outputs_len = output_counter;
+    octs.output_mask = (1 << (rate)) - 1;
+    octs.output_width = rate;
+    // two 64-bit words per key, matching the stride of 2 applied to the keys above
+    octs.distances = malloc(octs.outputs_len * 2 * sizeof(uint64_t));
+    return octs;
+}
+
+// Release the three heap arrays owned by an oct lookup.
+void oct_lookup_destroy(oct_lookup_t octs) {
+    free(octs.distances);
+    free(octs.outputs);
+    free(octs.keys);
+}
+
+// WIP: sse approach to filling the distance table
+/*
+void oct_lookup_fill_distance_sse(oct_lookup_t octs, distance_t *distances) {
+    distance_pair_t *distance_pair = (distance_pair_t*)octs.distances;
+    __v4si index_shuffle_mask = (__v4si){0xffffff00, 0xffffff01, 0xffffff02, 0xffffff03};
+    __m256i dist_shuffle_mask = (__m256i){0x01000504, 0x09080d0c, 0xffffffff, 0xffffffff,
+                                          0x01000504, 0x09080d0c, 0xffffffff, 0xffffffff};
+    const int dist_permute_mask = 0x0c;
+    for (unsigned int i = 1; i < octs.outputs_len; i += 2) {
+        // big heaping todo vvv
+        // a) we want 16 bit distances GATHERed, not 32 bit
+        // b) we need to load 8 of those distances, not 4
+        __v4si short_concat_index = _mm_loadl_epi64(octs.outputs + 2*i);
+        __v4si short_concat_index0 = _mm_loadl_epi64(octs.outputs + 2*i + 1);
+        __m256i concat_index = _mm256_cvtepu8_epi32(short_concat_index);
+        __m256i concat_index0 = _mm256_cvtepu8_epi32(short_concat_index0);
+        __m256i dist = _mm256_i32gather_epi32(distances, concat_index, sizeof(distance_t));
+        __m256i dist0 = _mm256_i32gather_epi32(distances, concat_index0, sizeof(distance_t));
+        dist = _mm256_shuffle_epi8(dist, dist_shuffle_mask);
+        dist0 = _mm256_shuffle_epi8(dist0, dist_shuffle_mask);
+        dist = __builtin_shufflevector(dist, dist, 0, 5, 0, 0);
+        dist0 = __builtin_shufflevector(dist0, dist0, 0, 5, 0, 0);
+        __v4si packed_dist = _mm256_castsi256_si128(dist);
+        _mm_store_si128(distance_pair + 8 * i, packed_dist);
+        __v4si packed_dist0 = _mm256_castsi256_si128(dist0);
+        _mm_store_si128(distance_pair + 8 * i + 4, packed_dist0);
+    }
+}
+*/
diff --git a/libcorrect/src/fec_shim.c b/libcorrect/src/fec_shim.c
new file mode 100644
index 0000000..146c60c
--- /dev/null
+++ b/libcorrect/src/fec_shim.c
@@ -0,0 +1,255 @@
+#include <stdlib.h>
+#include <string.h>
+
+#include "fec_shim.h"
+
+// State behind the opaque handle returned by init_rs_char.
+typedef struct {
+    correct_reed_solomon *rs;   // codec obtained from correct_reed_solomon_create
+    unsigned int msg_length;    // block_length minus the number of roots
+    unsigned int block_length;  // 255 minus pad
+    unsigned int num_roots;     // number of roots requested at init
+    uint8_t *msg_out;           // block_length-byte buffer that encode_rs_char encodes into
+    unsigned int pad;           // pad given at init; subtracted from erasure locations on decode
+    uint8_t *erasures;          // num_roots-byte buffer holding the adjusted erasure locations
+} reed_solomon_shim;
+
+// Allocate a Reed-Solomon shim handle.
+// Only 8-bit symbols are supported. The block length is 255 - pad and the message
+//   length is the block length minus number_roots.
+// Returns NULL when symbol_size is not 8, when pad and number_roots do not fit in a
+//   255-byte block, or when any allocation fails. On success the handle is released
+//   with free_rs_char.
+void *init_rs_char(int symbol_size, int primitive_polynomial,
+                   int first_consecutive_root, int root_gap, int number_roots,
+                   unsigned int pad) {
+    if (symbol_size != 8) {
+        return NULL;
+    }
+
+    // block_length and msg_length are unsigned, so reject inputs that would wrap them
+    if (number_roots < 0 || pad > 255 || (unsigned int)number_roots > 255 - pad) {
+        return NULL;
+    }
+
+    reed_solomon_shim *shim = malloc(sizeof(reed_solomon_shim));
+    if (!shim) {
+        return NULL;
+    }
+
+    shim->pad = pad;
+    shim->block_length = 255 - pad;
+    shim->num_roots = number_roots;
+    shim->msg_length = shim->block_length - number_roots;
+    shim->rs = correct_reed_solomon_create(primitive_polynomial,
+                                           first_consecutive_root, root_gap, number_roots);
+    shim->msg_out = malloc(shim->block_length);
+    shim->erasures = malloc(number_roots);
+
+    // malloc(0) may legitimately return NULL, so NULL only counts as failure for nonzero sizes
+    if (!shim->rs || (!shim->msg_out && shim->block_length) || (!shim->erasures && number_roots)) {
+        if (shim->rs) {
+            correct_reed_solomon_destroy(shim->rs);
+        }
+        free(shim->msg_out);
+        free(shim->erasures);
+        free(shim);
+        return NULL;
+    }
+
+    return shim;
+}
+
+// Release a handle obtained from init_rs_char.
+// A NULL handle is accepted and ignored, since init_rs_char returns NULL for
+//   unsupported symbol sizes.
+void free_rs_char(void *rs) {
+    reed_solomon_shim *shim = (reed_solomon_shim *)rs;
+    if (!shim) {
+        return;
+    }
+    correct_reed_solomon_destroy(shim->rs);
+    free(shim->msg_out);
+    free(shim->erasures);
+    free(shim);
+}
+
+// Encode msg_length bytes of `msg` into the shim's scratch buffer, then copy
+//   num_roots bytes, starting just past the message portion, out to `parity`.
+void encode_rs_char(void *rs, const unsigned char *msg, unsigned char *parity) {
+    reed_solomon_shim *handle = (reed_solomon_shim *)rs;
+    uint8_t *codeword = handle->msg_out;
+
+    correct_reed_solomon_encode(handle->rs, msg, handle->msg_length, codeword);
+
+    const uint8_t *parity_start = codeword + handle->msg_length;
+    memcpy(parity, parity_start, handle->num_roots);
+}
+
+// Decode one block in place, treating the listed positions as erasures.
+// Each erasure location has the shim's pad subtracted before it is handed to the codec.
+// When num_erasures is negative or larger than the number of roots, the call returns
+//   without touching `block`: the erasure scratch buffer only has num_roots entries,
+//   so copying more would write past its end.
+void decode_rs_char(void *rs, unsigned char *block, int *erasure_locations,
+                    int num_erasures) {
+    reed_solomon_shim *shim = (reed_solomon_shim *)rs;
+
+    if (num_erasures < 0 || (unsigned int)num_erasures > shim->num_roots) {
+        return;
+    }
+
+    for (int i = 0; i < num_erasures; i++) {
+        shim->erasures[i] = (uint8_t)(erasure_locations[i]) - shim->pad;
+    }
+    // input and output are the same buffer: the decoded message overwrites the block
+    correct_reed_solomon_decode_with_erasures(shim->rs, block, shim->block_length,
+                                              shim->erasures, num_erasures,
+                                              block);
+}
+
+// State behind the opaque handle returned by the create_viterbi* functions.
+typedef struct {
+    correct_convolutional *conv;  // decoder obtained from correct_convolutional_create
+    unsigned int rate;            // rate passed at creation
+    unsigned int order;           // order passed at creation
+    uint8_t *buf;                 // buffer of decoded bytes, buf_len bytes long
+    size_t buf_len;               // requested decoded bit count rounded up to bytes
+    uint8_t *read_iter;           // next byte chainback_viterbi copies out
+    uint8_t *write_iter;          // next byte update_viterbi_blk decodes into
+} convolutional_shim;
+
+// Polynomial sets handed to create_viterbi by the public wrappers below.
+// Each array has as many entries as the rate passed alongside it (2, 2, 3 and 6).
+static correct_convolutional_polynomial_t r12k7[] = {V27POLYA, V27POLYB};
+
+static correct_convolutional_polynomial_t r12k9[] = {V29POLYA, V29POLYB};
+
+static correct_convolutional_polynomial_t r13k9[] = {V39POLYA, V39POLYB,
+                                                     V39POLYC};
+
+static correct_convolutional_polynomial_t r16k15[] = {
+    V615POLYA, V615POLYB, V615POLYC, V615POLYD, V615POLYE, V615POLYF};
+
+/* Common methods */
+// Allocate a decoder shim with an output buffer large enough for num_decoded_bits
+//   (rounded up to whole bytes) and a convolutional decoder for the given
+//   rate, order and polynomials.
+// Returns NULL, with nothing left allocated, if any allocation or the decoder
+//   creation fails.
+static void *create_viterbi(unsigned int num_decoded_bits, unsigned int rate,
+                            unsigned int order,
+                            correct_convolutional_polynomial_t *poly) {
+    convolutional_shim *shim = malloc(sizeof(convolutional_shim));
+    if (!shim) {
+        return NULL;
+    }
+
+    size_t num_decoded_bytes = (num_decoded_bits % 8)
+                                   ? (num_decoded_bits / 8 + 1)
+                                   : num_decoded_bits / 8;
+
+    shim->rate = rate;
+    shim->order = order;
+    shim->buf = malloc(num_decoded_bytes);
+    shim->buf_len = num_decoded_bytes;
+    shim->conv = correct_convolutional_create(rate, order, poly);
+
+    // malloc(0) may legitimately return NULL, so NULL only counts as failure for nonzero sizes
+    if (!shim->conv || (!shim->buf && num_decoded_bytes)) {
+        if (shim->conv) {
+            correct_convolutional_destroy(shim->conv);
+        }
+        free(shim->buf);
+        free(shim);
+        return NULL;
+    }
+
+    // both iterators start at the beginning of the (empty) buffer
+    shim->read_iter = shim->buf;
+    shim->write_iter = shim->buf;
+
+    return shim;
+}
+
+// Release a shim created by create_viterbi. A NULL handle is accepted and ignored.
+static void delete_viterbi(void *vit) {
+    convolutional_shim *shim = (convolutional_shim *)vit;
+    if (!shim) {
+        return;
+    }
+    free(shim->buf);
+    correct_convolutional_destroy(shim->conv);
+    free(shim);
+}
+
+// Rewind both buffer iterators to the start of the shim's output buffer.
+static void init_viterbi(void *vit) {
+    convolutional_shim *handle = (convolutional_shim *)vit;
+    uint8_t *const start = handle->buf;
+    handle->write_iter = start;
+    handle->read_iter = start;
+}
+
+// Decode a block of soft input into the shim's buffer at write_iter.
+// encoded_soft is decoded as num_encoded_groups * rate soft values. Of the decoded
+//   bits, num_encoded_groups - (order - 1) are counted as written, and the group
+//   count is reduced when that would exceed the space left in the buffer.
+// A block with fewer than order - 1 groups is ignored.
+static void update_viterbi_blk(void *vit, const unsigned char *encoded_soft,
+                               unsigned int num_encoded_groups) {
+    convolutional_shim *shim = (convolutional_shim *)vit;
+
+    // the subtraction below is unsigned: with fewer than order - 1 groups it would
+    //   wrap to a huge value, and the clamp that follows would then raise
+    //   num_encoded_groups beyond what the caller actually supplied
+    if (num_encoded_groups < shim->order - 1) {
+        return;
+    }
+
+    // don't overwrite our buffer
+    size_t rem = (shim->buf + shim->buf_len) - shim->write_iter;
+    size_t rem_bits = 8 * rem;
+    // this math isn't very clear
+    // here we sort of do the opposite of what liquid-dsp does
+    size_t n_write_bits = num_encoded_groups - (shim->order - 1);
+    if (n_write_bits > rem_bits) {
+        size_t reduction = n_write_bits - rem_bits;
+        num_encoded_groups -= reduction;
+        n_write_bits -= reduction;
+    }
+
+    // what if n_write_bits isn't a multiple of 8?
+    // libcorrect can't start and stop at arbitrary indices...
+    correct_convolutional_decode_soft(
+        shim->conv, encoded_soft, num_encoded_groups * shim->rate, shim->write_iter);
+    // only whole bytes advance the write position
+    shim->write_iter += n_write_bits / 8;
+}
+
+// Copy already-decoded bytes from the shim's buffer into `decoded`.
+// The request is capped at the bits available between read_iter and write_iter,
+//   then rounded up to whole bytes; read_iter advances by the bytes copied.
+static void chainback_viterbi(void *vit, unsigned char *decoded,
+                              unsigned int num_decoded_bits) {
+    convolutional_shim *handle = (convolutional_shim *)vit;
+
+    // num_decoded_bits not a multiple of 8?
+    // this is a similar problem to update_viterbi_blk
+    // although here we could actually resolve a non-multiple of 8
+    size_t avail_bytes = handle->write_iter - handle->read_iter;
+    size_t avail_bits = 8 * avail_bytes;
+    if (num_decoded_bits > avail_bits) {
+        num_decoded_bits = avail_bits;
+    }
+
+    size_t num_decoded_bytes = num_decoded_bits / 8;
+    if (num_decoded_bits % 8) {
+        num_decoded_bytes += 1;
+    }
+
+    memcpy(decoded, handle->read_iter, num_decoded_bytes);
+    handle->read_iter += num_decoded_bytes;
+}
+
+/* Rate 1/2, k = 7 */
+// Public wrappers binding the shared shim helpers to rate 2, order 7 (polynomials r12k7).
+void *create_viterbi27(int num_decoded_bits) {
+    void *vit = create_viterbi(num_decoded_bits, 2, 7, r12k7);
+    return vit;
+}
+
+void delete_viterbi27(void *vit) {
+    delete_viterbi(vit);
+}
+
+// the second argument is accepted and ignored; always returns 0
+int init_viterbi27(void *vit, int _) {
+    (void)_;
+    init_viterbi(vit);
+    return 0;
+}
+
+// always returns 0
+int update_viterbi27_blk(void *vit, unsigned char *encoded_soft,
+                         int num_encoded_groups) {
+    update_viterbi_blk(vit, encoded_soft, num_encoded_groups);
+    return 0;
+}
+
+// the trailing argument is accepted and ignored; always returns 0
+int chainback_viterbi27(void *vit, unsigned char *decoded,
+                        unsigned int num_decoded_bits, unsigned int _) {
+    (void)_;
+    chainback_viterbi(vit, decoded, num_decoded_bits);
+    return 0;
+}
+
+/* Rate 1/2, k = 9 */
+// Public wrappers binding the shared shim helpers to rate 2, order 9 (polynomials r12k9).
+void *create_viterbi29(int num_decoded_bits) {
+    void *vit = create_viterbi(num_decoded_bits, 2, 9, r12k9);
+    return vit;
+}
+
+void delete_viterbi29(void *vit) {
+    delete_viterbi(vit);
+}
+
+// the second argument is accepted and ignored; always returns 0
+int init_viterbi29(void *vit, int _) {
+    (void)_;
+    init_viterbi(vit);
+    return 0;
+}
+
+// always returns 0
+int update_viterbi29_blk(void *vit, unsigned char *encoded_soft,
+                         int num_encoded_groups) {
+    update_viterbi_blk(vit, encoded_soft, num_encoded_groups);
+    return 0;
+}
+
+// the trailing argument is accepted and ignored; always returns 0
+int chainback_viterbi29(void *vit, unsigned char *decoded,
+                        unsigned int num_decoded_bits, unsigned int _) {
+    (void)_;
+    chainback_viterbi(vit, decoded, num_decoded_bits);
+    return 0;
+}
+
+/* Rate 1/3, k = 9 */
+// Public wrappers binding the shared shim helpers to rate 3, order 9 (polynomials r13k9).
+void *create_viterbi39(int num_decoded_bits) {
+    void *vit = create_viterbi(num_decoded_bits, 3, 9, r13k9);
+    return vit;
+}
+
+void delete_viterbi39(void *vit) {
+    delete_viterbi(vit);
+}
+
+// the second argument is accepted and ignored; always returns 0
+int init_viterbi39(void *vit, int _) {
+    (void)_;
+    init_viterbi(vit);
+    return 0;
+}
+
+// always returns 0
+int update_viterbi39_blk(void *vit, unsigned char *encoded_soft,
+                         int num_encoded_groups) {
+    update_viterbi_blk(vit, encoded_soft, num_encoded_groups);
+    return 0;
+}
+
+// the trailing argument is accepted and ignored; always returns 0
+int chainback_viterbi39(void *vit, unsigned char *decoded,
+                        unsigned int num_decoded_bits, unsigned int _) {
+    (void)_;
+    chainback_viterbi(vit, decoded, num_decoded_bits);
+    return 0;
+}
+
+/* Rate 1/6, k = 15 */
+// Public wrappers binding the shared shim helpers to rate 6, order 15 (polynomials r16k15).
+void *create_viterbi615(int num_decoded_bits) {
+    void *vit = create_viterbi(num_decoded_bits, 6, 15, r16k15);
+    return vit;
+}
+
+void delete_viterbi615(void *vit) {
+    delete_viterbi(vit);
+}
+
+// the second argument is accepted and ignored; always returns 0
+int init_viterbi615(void *vit, int _) {
+    (void)_;
+    init_viterbi(vit);
+    return 0;
+}
+
+// always returns 0
+int update_viterbi615_blk(void *vit, unsigned char *encoded_soft,
+                          int num_encoded_groups) {
+    update_viterbi_blk(vit, encoded_soft, num_encoded_groups);
+    return 0;
+}
+
+// the trailing argument is accepted and ignored; always returns 0
+int chainback_viterbi615(void *vit, unsigned char *decoded,
+                         unsigned int num_decoded_bits, unsigned int _) {
+    (void)_;
+    chainback_viterbi(vit, decoded, num_decoded_bits);
+    return 0;
+}
diff --git a/libcorrect/src/reed-solomon/CMakeLists.txt b/libcorrect/src/reed-solomon/CMakeLists.txt
new file mode 100644
index 0000000..eabe75e
--- /dev/null
+++ b/libcorrect/src/reed-solomon/CMakeLists.txt
@@ -0,0 +1,2 @@
+# Reed-Solomon sources, compiled into the correct-reed-solomon OBJECT library
+set(SRCFILES polynomial.c reed-solomon.c encode.c decode.c)
+add_library(correct-reed-solomon OBJECT ${SRCFILES})
diff --git a/libcorrect/src/reed-solomon/decode.c b/libcorrect/src/reed-solomon/decode.c
new file mode 100644
index 0000000..e765451
--- /dev/null
+++ b/libcorrect/src/reed-solomon/decode.c
@@ -0,0 +1,505 @@
+#include "correct/reed-solomon/encode.h"
+
+// calculate all syndromes of the received polynomial at the roots of the generator
+// because we're evaluating at the roots of the generator, and because the transmitted
+//   polynomial was made to be a product of the generator, we know that the transmitted
+//   polynomial is 0 at these roots
+// any nonzero syndromes we find here are the values of the error polynomial evaluated
+//   at these roots, so these values give us a window into the error polynomial. if
+//   these syndromes are all zero, then we can conclude the error polynomial is also
+//   zero. if they're nonzero, then we know our message received an error in transit.
+// returns true if syndromes are all zero
+static bool reed_solomon_find_syndromes(field_t field, polynomial_t msgpoly, field_logarithm_t **generator_root_exp,
+                                        field_element_t *syndromes, size_t min_distance) {
+    // evaluates msgpoly once per generator root and stores each result in syndromes[]
+    // the return value is true only when every one of those evaluations was zero
+    bool found_nonzero = false;
+    memset(syndromes, 0, min_distance * sizeof(field_element_t));
+    for (unsigned int root_idx = 0; root_idx < min_distance; root_idx++) {
+        // profiling reveals that this function takes about 50% of the cpu time of
+        // decoding. so, in order to speed it up a little, the successive powers of
+        // each generator root are precomputed and passed in as generator_root_exp
+        const field_element_t value = polynomial_eval_lut(field, msgpoly, generator_root_exp[root_idx]);
+        syndromes[root_idx] = value;
+        if (value != 0) {
+            found_nonzero = true;
+        }
+    }
+    return !found_nonzero;
+}
+
+// Berlekamp-Massey algorithm to find LFSR that describes syndromes
+// returns number of errors and writes the error locator polynomial to rs->error_locator
+// Builds rs->error_locator from rs->syndromes, using rs->last_error_locator as
+//   working storage, and returns the order of the resulting locator.
+// Only the first (min_distance - num_erasures) syndromes are consumed.
+static unsigned int reed_solomon_find_error_locator(correct_reed_solomon *rs, size_t num_erasures) {
+    // current LFSR length; grows whenever a discrepancy forces a longer register
+    unsigned int numerrors = 0;
+
+    memset(rs->error_locator.coeff, 0, (rs->min_distance + 1) * sizeof(field_element_t));
+
+    // initialize to f(x) = 1
+    rs->error_locator.coeff[0] = 1;
+    rs->error_locator.order = 0;
+
+    // last_error_locator starts out as a copy of the initial locator
+    memcpy(rs->last_error_locator.coeff, rs->error_locator.coeff, (rs->min_distance + 1) * sizeof(field_element_t));
+    rs->last_error_locator.order = rs->error_locator.order;
+
+    field_element_t discrepancy;
+    field_element_t last_discrepancy = 1;
+    // how far last_error_locator must be shifted up before it is combined with the locator
+    unsigned int delay_length = 1;
+
+    for (unsigned int i = rs->error_locator.order; i < rs->min_distance - num_erasures; i++) {
+        // discrepancy = syndrome i plus the locator's taps applied to the preceding syndromes
+        discrepancy = rs->syndromes[i];
+        for (unsigned int j = 1; j <= numerrors; j++) {
+            discrepancy = field_add(rs->field, discrepancy,
+                                    field_mul(rs->field, rs->error_locator.coeff[j], rs->syndromes[i - j]));
+        }
+
+        if (!discrepancy) {
+            // our existing LFSR describes the new syndrome as well
+            // leave it as-is but update the number of delay elements
+            //   so that if a discrepancy occurs later we can eliminate it
+            delay_length++;
+            continue;
+        }
+
+        if (2 * numerrors <= i) {
+            // there's a discrepancy, but we still have room for more taps
+            // lengthen LFSR by one tap and set weight to eliminate discrepancy
+
+            // shift the last locator by the delay length, multiply by discrepancy,
+            //   and divide by the last discrepancy
+            // we move down because we're shifting up, and this prevents overwriting
+            for (int j = rs->last_error_locator.order; j >= 0; j--) {
+                // the bounds here will be ok since we have a headroom of numerrors
+                rs->last_error_locator.coeff[j + delay_length] = field_div(
+                    rs->field, field_mul(rs->field, rs->last_error_locator.coeff[j], discrepancy), last_discrepancy);
+            }
+            // zero the low coefficients vacated by the shift
+            for (int j = delay_length - 1; j >= 0; j--) {
+                rs->last_error_locator.coeff[j] = 0;
+            }
+
+            // locator = locator - last_locator
+            // we will also update last_locator to be locator before this loop takes place
+            field_element_t temp;
+            for (int j = 0; j <= (rs->last_error_locator.order + delay_length); j++) {
+                temp = rs->error_locator.coeff[j];
+                rs->error_locator.coeff[j] =
+                    field_add(rs->field, rs->error_locator.coeff[j], rs->last_error_locator.coeff[j]);
+                rs->last_error_locator.coeff[j] = temp;
+            }
+            // swap the orders to match the coefficient swap above
+            unsigned int temp_order = rs->error_locator.order;
+            rs->error_locator.order = rs->last_error_locator.order + delay_length;
+            rs->last_error_locator.order = temp_order;
+
+            // now last_locator is locator before we started,
+            //   and locator is (locator - (discrepancy/last_discrepancy) * x^(delay_length) * last_locator)
+
+            numerrors = i + 1 - numerrors;
+            last_discrepancy = discrepancy;
+            delay_length = 1;
+            continue;
+        }
+
+        // no more taps
+        // unlike the previous case, we are preserving last locator,
+        //    but we'll update locator as before
+        // we're basically flattening the two loops from the previous case because
+        //    we no longer need to update last_locator
+        for (int j = rs->last_error_locator.order; j >= 0; j--) {
+            rs->error_locator.coeff[j + delay_length] =
+                field_add(rs->field, rs->error_locator.coeff[j + delay_length],
+                          field_div(rs->field, field_mul(rs->field, rs->last_error_locator.coeff[j], discrepancy),
+                                    last_discrepancy));
+        }
+        // the locator's order becomes the larger of its old order and the shifted last locator's order
+        rs->error_locator.order = (rs->last_error_locator.order + delay_length > rs->error_locator.order)
+                                      ? rs->last_error_locator.order + delay_length
+                                      : rs->error_locator.order;
+        delay_length++;
+    }
+    return rs->error_locator.order;
+}
+
+// find the roots of the error locator polynomial
+// Chien search
+bool reed_solomon_factorize_error_locator(field_t field, unsigned int num_skip, polynomial_t locator_log, field_element_t *roots,
+                                          field_logarithm_t **element_exp) {
+    // the field only has 256 elements, so every one of them is simply tried in turn
+    // roots that are found are appended to `roots` starting at index num_skip
+    unsigned int found = 0;
+    memset(roots + num_skip, 0, (locator_log.order) * sizeof(field_element_t));
+    for (field_operation_t candidate = 0; candidate < 256; candidate++) {
+        // two things keep this search cheap:
+        // a) element_exp holds precomputed successive powers of every field element
+        // b) the locator is supplied in log form, which saves lookups during evaluation
+        if (polynomial_eval_log_lut(field, locator_log, element_exp[candidate]) == 0) {
+            roots[num_skip + found] = (field_element_t)candidate;
+            found++;
+        }
+    }
+    // this is where we find out if there are too many errors to recover from:
+    // the locator may have 0 discrepancy on the syndromes and still not have
+    // as many roots as its order
+    return found == locator_log.order;
+}
+
+// use error locator and syndromes to find the error evaluator polynomial
+// Multiplies `locator` by `syndromes` with polynomial_mul, writing the product
+//   into `error_evaluator`.
+void reed_solomon_find_error_evaluator(field_t field, polynomial_t locator, polynomial_t syndromes,
+                                       polynomial_t error_evaluator) {
+    // the error evaluator, omega(x), is S(x)*Lambda(x) mod x^(2t)
+    // where S(x) is a polynomial constructed from the syndromes
+    //   S(1) + S(2)*x + ... + S(2t)*x^(2t - 1)
+    // and Lambda(x) is the error locator
+    // the modulo is implicit here -- we have limited the max length of error_evaluator,
+    //   which polynomial_mul will interpret to mean that it should not compute
+    //   powers larger than that, which is the same as performing mod x^(2t)
+    polynomial_mul(field, locator, syndromes, error_evaluator);
+}
+
+// use error locator, error roots and syndromes to find the error values
+// that is, the elements in the finite field which can be added to the received
+//   polynomial at the locations of the error roots in order to produce the
+//   transmitted polynomial
+// forney algorithm
+void reed_solomon_find_error_values(correct_reed_solomon *rs) {
+    // error value e(j) = -(X(j)^(1-c) * omega(X(j)^-1))/(lambda'(X(j)^-1))
+    // where X(j)^-1 is a root of the error locator, omega(X) is the error evaluator,
+    //   lambda'(X) is the first formal derivative of the error locator,
+    //   and c is the first consecutive root of the generator used in encoding
+
+    // S(x) = S(1) + S(2)*x + ... + S(2t)*x^(2t - 1)
+    // this is a view over rs->syndromes, *not* a polynomial built by multiplying out roots
+    polynomial_t syndrome_poly;
+    syndrome_poly.coeff = rs->syndromes;
+    syndrome_poly.order = rs->min_distance - 1;
+
+    // omega(X): clear the evaluator, then fill it from the locator and S(x)
+    memset(rs->error_evaluator.coeff, 0, (rs->error_evaluator.order + 1) * sizeof(field_element_t));
+    reed_solomon_find_error_evaluator(rs->field, rs->error_locator, syndrome_poly, rs->error_evaluator);
+
+    // lambda'(X)
+    rs->error_locator_derivative.order = rs->error_locator.order - 1;
+    polynomial_formal_derivative(rs->field, rs->error_locator, rs->error_locator_derivative);
+
+    // one error value per locator root; zero roots are skipped and their slot is left alone
+    const unsigned int num_roots = rs->error_locator.order;
+    for (unsigned int i = 0; i < num_roots; i++) {
+        const field_element_t root = rs->error_roots[i];
+        if (root == 0) {
+            continue;
+        }
+
+        // precomputed powers of this root, shared by both evaluations
+        field_logarithm_t *root_powers = rs->element_exp[root];
+        const field_element_t omega_at_root = polynomial_eval_lut(rs->field, rs->error_evaluator, root_powers);
+        const field_element_t deriv_at_root =
+            polynomial_eval_lut(rs->field, rs->error_locator_derivative, root_powers);
+
+        rs->error_vals[i] = field_mul(rs->field, field_pow(rs->field, root, rs->first_consecutive_root - 1),
+                                      field_div(rs->field, omega_at_root, deriv_at_root));
+    }
+}
+
+void reed_solomon_find_error_locations(field_t field, field_logarithm_t generator_root_gap,
+                                       field_element_t *error_roots, field_logarithm_t *error_locations,
+                                       unsigned int num_errors, unsigned int num_skip) {
+    // num_skip is not used in this function
+    for (unsigned int i = 0; i < num_errors; i++) {
+        const field_element_t root = error_roots[i];
+        if (root == 0) {
+            // zero roots are skipped; error_locations[i] keeps whatever it held
+            continue;
+        }
+
+        // the error roots are the reciprocals of the error locations, so div 1 by them
+        const field_operation_t target = field_div(field, 1, root);
+
+        // find the first field element whose generator_root_gap-th power equals the
+        // reciprocal, and record that element's log as the location
+        field_operation_t candidate = 0;
+        while (candidate < 256 && field_pow(field, candidate, generator_root_gap) != target) {
+            candidate++;
+        }
+        if (candidate < 256) {
+            error_locations[i] = field.log[candidate];
+        }
+    }
+}
+
+// erasure method -- take given locations and convert to roots
+// this is the inverse of reed_solomon_find_error_locations
+static void reed_solomon_find_error_roots_from_locations(field_t field, field_logarithm_t generator_root_gap,
+                                                         const field_logarithm_t *error_locations,
+                                                         field_element_t *error_roots, unsigned int num_errors) {
+    // each root is 1 / (exp[location] ^ generator_root_gap)
+    unsigned int idx = 0;
+    while (idx < num_errors) {
+        field_element_t powered = field_pow(field, field.exp[error_locations[idx]], generator_root_gap);
+        error_roots[idx] = field_div(field, 1, powered);
+        idx++;
+    }
+}
+
+// erasure method -- given the roots of the error locator, create the polynomial
+// Thin wrapper: forwards every argument to polynomial_init_from_roots and returns
+//   its result unchanged.
+static polynomial_t reed_solomon_find_error_locator_from_roots(field_t field, unsigned int num_errors,
+                                                               field_element_t *error_roots,
+                                                               polynomial_t error_locator,
+                                                               polynomial_t *scratch) {
+    // multiply out roots to build the error locator polynomial
+    return polynomial_init_from_roots(field, num_errors, error_roots, error_locator, scratch);
+}
+
+// erasure method
+static void reed_solomon_find_modified_syndromes(correct_reed_solomon *rs, field_element_t *syndromes, polynomial_t error_locator, field_element_t *modified_syndromes) {
+    // both polynomials are views over caller-owned arrays, each of order min_distance - 1
+    const unsigned int top_order = rs->min_distance - 1;
+
+    // destination: product of the locator and the syndrome polynomial
+    polynomial_t product;
+    product.coeff = modified_syndromes;
+    product.order = top_order;
+
+    // source: the syndromes as polynomial coefficients
+    polynomial_t source;
+    source.coeff = syndromes;
+    source.order = top_order;
+
+    polynomial_mul(rs->field, error_locator, source, product);
+}
+
+// Allocate every buffer and lookup table the decoder uses and attach them to `rs`,
+//   marking the decoder state as initialized.
+// NOTE(review): none of the allocation results are checked here.
+void correct_reed_solomon_decoder_create(correct_reed_solomon *rs) {
+    rs->has_init_decode = true;
+    rs->syndromes = calloc(rs->min_distance, sizeof(field_element_t));
+    rs->modified_syndromes = calloc(2 * rs->min_distance, sizeof(field_element_t));
+    rs->received_polynomial = polynomial_create(rs->block_length - 1);
+    rs->error_locator = polynomial_create(rs->min_distance);
+    rs->error_locator_log = polynomial_create(rs->min_distance);
+    rs->erasure_locator = polynomial_create(rs->min_distance);
+    rs->error_roots = calloc(2 * rs->min_distance, sizeof(field_element_t));
+    // error_vals and error_locations are malloc'd, so their contents start out uninitialized
+    rs->error_vals = malloc(rs->min_distance * sizeof(field_element_t));
+    rs->error_locations = malloc(rs->min_distance * sizeof(field_logarithm_t));
+
+    rs->last_error_locator = polynomial_create(rs->min_distance);
+    rs->error_evaluator = polynomial_create(rs->min_distance - 1);
+    rs->error_locator_derivative = polynomial_create(rs->min_distance - 1);
+
+    // calculate and store the first block_length powers of every generator root
+    // we would have to do this work in order to calculate the syndromes
+    // if we save it, we can prevent the need to recalculate it on subsequent calls
+    // total memory usage is min_distance * block_length bytes e.g. 32 * 255 ~= 8k
+    rs->generator_root_exp = malloc(rs->min_distance * sizeof(field_logarithm_t *));
+    for (unsigned int i = 0; i < rs->min_distance; i++) {
+        rs->generator_root_exp[i] = malloc(rs->block_length * sizeof(field_logarithm_t));
+        polynomial_build_exp_lut(rs->field, rs->generator_roots[i], rs->block_length - 1, rs->generator_root_exp[i]);
+    }
+
+    // calculate and store the first min_distance powers of every element in the field
+    // we would have to do this for chien search anyway, and its size is only 256 * min_distance bytes
+    // for min_distance = 32 this is 8k of memory, a pittance for the speedup we receive in exchange
+    // we also get to reuse this work during error value calculation
+    rs->element_exp = malloc(256 * sizeof(field_logarithm_t *));
+    for (field_operation_t i = 0; i < 256; i++) {
+        rs->element_exp[i] = malloc(rs->min_distance * sizeof(field_logarithm_t));
+        polynomial_build_exp_lut(rs->field, i, rs->min_distance - 1, rs->element_exp[i]);
+    }
+
+    // two scratch polynomials, passed to polynomial_init_from_roots as working storage
+    rs->init_from_roots_scratch[0] = polynomial_create(rs->min_distance);
+    rs->init_from_roots_scratch[1] = polynomial_create(rs->min_distance);
+}
+
+// Decodes one (possibly shortened) block, correcting errors at unknown positions.
+// encoded holds encoded_length bytes: payload first, then rs->min_distance parity bytes.
+// On success the payload is written to msg and its length is returned; returns -1 when
+// encoded_length is out of range or the error locator cannot be factorized (too many errors).
+ssize_t correct_reed_solomon_decode(correct_reed_solomon *rs, const uint8_t *encoded, size_t encoded_length,
+                                    uint8_t *msg) {
+    // a block is never longer than block_length nor shorter than its own parity;
+    // without the lower bound the size_t subtraction for msg_length below would wrap
+    if (encoded_length > rs->block_length || encoded_length < rs->min_distance) {
+        return -1;
+    }
+
+    // the message is the non-remainder part
+    size_t msg_length = encoded_length - rs->min_distance;
+    // if they handed us a nonfull block, we'll write in 0s
+    size_t pad_length = rs->block_length - encoded_length;
+
+    if (!rs->has_init_decode) {
+        // initialize rs for decoding
+        correct_reed_solomon_decoder_create(rs);
+    }
+
+    // copy into our local buffer, flipping the byte order on the way in:
+    // byte 0 of the input becomes the highest order non-pad coefficient, so the buffer looks like
+    // | rem (rs->min_distance) | msg (msg_length) | pad (pad_length) |
+
+    for (unsigned int i = 0; i < encoded_length; i++) {
+        rs->received_polynomial.coeff[i] = encoded[encoded_length - (i + 1)];
+    }
+
+    // fill the pad_length with 0s
+    for (unsigned int i = 0; i < pad_length; i++) {
+        rs->received_polynomial.coeff[i + encoded_length] = 0;
+    }
+
+    bool all_zero = reed_solomon_find_syndromes(rs->field, rs->received_polynomial, rs->generator_root_exp,
+                                                rs->syndromes, rs->min_distance);
+
+    if (all_zero) {
+        // syndromes were all zero, so there was no error in the message
+        // copy to msg and we are done
+        for (unsigned int i = 0; i < msg_length; i++) {
+            msg[i] = rs->received_polynomial.coeff[encoded_length - (i + 1)];
+        }
+        return msg_length;
+    }
+
+    unsigned int order = reed_solomon_find_error_locator(rs, 0);
+    // XXX fix this vvvv
+    rs->error_locator.order = order;
+
+    for (unsigned int i = 0; i <= rs->error_locator.order; i++) {
+        // the coeffs stored here are logs, not elements, and a 0 coeff is stored as log(0) = 0
+        // that is fine because we only use this in chien search, which skips all 0 coeffs
+        // (it does not alias log(1) either: log(1) is represented as 255 in our log table)
+        rs->error_locator_log.coeff[i] = rs->field.log[rs->error_locator.coeff[i]];
+    }
+    rs->error_locator_log.order = rs->error_locator.order;
+
+    if (!reed_solomon_factorize_error_locator(rs->field, 0, rs->error_locator_log, rs->error_roots, rs->element_exp)) {
+        // roots couldn't be found, so there were too many errors to deal with
+        // RS has failed for this message
+        return -1;
+    }
+
+    reed_solomon_find_error_locations(rs->field, rs->generator_root_gap, rs->error_roots, rs->error_locations,
+                                      rs->error_locator.order, 0);
+
+    reed_solomon_find_error_values(rs);
+
+    for (unsigned int i = 0; i < rs->error_locator.order; i++) {
+        rs->received_polynomial.coeff[rs->error_locations[i]] =
+            field_sub(rs->field, rs->received_polynomial.coeff[rs->error_locations[i]], rs->error_vals[i]);
+    }
+
+    for (unsigned int i = 0; i < msg_length; i++) {
+        msg[i] = rs->received_polynomial.coeff[encoded_length - (i + 1)];
+    }
+
+    return msg_length;
+}
+
+// Decodes one (possibly shortened) block when some corrupted byte offsets are known in advance.
+// erasure_locations lists erasure_length offsets into encoded; with none this is plain
+// correct_reed_solomon_decode. Returns the payload length written to msg, or -1 on failure.
+ssize_t correct_reed_solomon_decode_with_erasures(correct_reed_solomon *rs, const uint8_t *encoded,
+                                                  size_t encoded_length, const uint8_t *erasure_locations,
+                                                  size_t erasure_length, uint8_t *msg) {
+    // lengths outside [min_distance, block_length] would wrap the size_t subtractions below
+    if (encoded_length > rs->block_length || encoded_length < rs->min_distance) {
+        return -1;
+    }
+
+    if (!erasure_length) {
+        return correct_reed_solomon_decode(rs, encoded, encoded_length, msg);
+    }
+
+    if (erasure_length > rs->min_distance) {
+        return -1;
+    }
+
+    // the message is the non-remainder part
+    size_t msg_length = encoded_length - rs->min_distance;
+    // if they handed us a nonfull block, we'll write in 0s
+    size_t pad_length = rs->block_length - encoded_length;
+
+    if (!rs->has_init_decode) {
+        // initialize rs for decoding
+        correct_reed_solomon_decoder_create(rs);
+    }
+
+    // copy into our local buffer, flipping the byte order on the way in:
+    // byte 0 of the input becomes the highest order non-pad coefficient, so the buffer looks like
+    // | rem (rs->min_distance) | msg (msg_length) | pad (pad_length) |
+    for (unsigned int i = 0; i < encoded_length; i++) {
+        rs->received_polynomial.coeff[i] = encoded[encoded_length - (i + 1)];
+    }
+
+    // fill the pad_length with 0s
+    for (unsigned int i = 0; i < pad_length; i++) {
+        rs->received_polynomial.coeff[i + encoded_length] = 0;
+    }
+
+    for (unsigned int i = 0; i < erasure_length; i++) {
+        // an erasure has to lie inside the supplied block, otherwise the remap below would wrap
+        if (erasure_locations[i] >= encoded_length) {
+            return -1;
+        }
+        // remap the coordinates of the erasures
+        rs->error_locations[i] = rs->block_length - (erasure_locations[i] + pad_length + 1);
+    }
+
+    reed_solomon_find_error_roots_from_locations(rs->field, rs->generator_root_gap, rs->error_locations,
+                                                 rs->error_roots, erasure_length);
+
+    rs->erasure_locator =
+        reed_solomon_find_error_locator_from_roots(rs->field, erasure_length, rs->error_roots, rs->erasure_locator, rs->init_from_roots_scratch);
+
+    bool all_zero = reed_solomon_find_syndromes(rs->field, rs->received_polynomial, rs->generator_root_exp,
+                                                rs->syndromes, rs->min_distance);
+
+    if (all_zero) {
+        // syndromes were all zero, so there was no error in the message: copy to msg and we are done
+        for (unsigned int i = 0; i < msg_length; i++) {
+            msg[i] = rs->received_polynomial.coeff[encoded_length - (i + 1)];
+        }
+        return msg_length;
+    }
+
+    reed_solomon_find_modified_syndromes(rs, rs->syndromes, rs->erasure_locator, rs->modified_syndromes);
+
+    // rs->syndromes is overwritten next; keep the originals to restore before the error value step
+    field_element_t *syndrome_copy = malloc(rs->min_distance * sizeof(field_element_t));
+    if (!syndrome_copy) {
+        return -1;
+    }
+    memcpy(syndrome_copy, rs->syndromes, rs->min_distance * sizeof(field_element_t));
+
+    for (unsigned int i = erasure_length; i < rs->min_distance; i++) {
+        rs->syndromes[i - erasure_length] = rs->modified_syndromes[i];
+    }
+
+    unsigned int order = reed_solomon_find_error_locator(rs, erasure_length);
+    // XXX fix this vvvv
+    rs->error_locator.order = order;
+
+    for (unsigned int i = 0; i <= rs->error_locator.order; i++) {
+        // the coeffs stored here are logs, not elements, and a 0 coeff is stored as log(0) = 0; that is fine
+        // because chien search skips all 0 coeffs, and log(1) is represented as 255 in our log table
+        rs->error_locator_log.coeff[i] = rs->field.log[rs->error_locator.coeff[i]];
+    }
+    rs->error_locator_log.order = rs->error_locator.order;
+
+    if (!reed_solomon_factorize_error_locator(rs->field, erasure_length, rs->error_locator_log, rs->error_roots, rs->element_exp)) {
+        // roots couldn't be found, so there were too many errors: RS has failed for this message
+        free(syndrome_copy);
+        return -1;
+    }
+
+    // temporarily swap in erasure_locator * error_locator; the original is put back below
+    polynomial_t temp_poly = polynomial_create(rs->error_locator.order + erasure_length);
+    polynomial_mul(rs->field, rs->erasure_locator, rs->error_locator, temp_poly);
+    polynomial_t placeholder_poly = rs->error_locator;
+    rs->error_locator = temp_poly;
+
+    reed_solomon_find_error_locations(rs->field, rs->generator_root_gap, rs->error_roots, rs->error_locations,
+                                      rs->error_locator.order, erasure_length);
+
+    memcpy(rs->syndromes, syndrome_copy, rs->min_distance * sizeof(field_element_t));
+    free(syndrome_copy);
+
+    reed_solomon_find_error_values(rs);
+
+    for (unsigned int i = 0; i < rs->error_locator.order; i++) {
+        rs->received_polynomial.coeff[rs->error_locations[i]] =
+            field_sub(rs->field, rs->received_polynomial.coeff[rs->error_locations[i]], rs->error_vals[i]);
+    }
+
+    rs->error_locator = placeholder_poly;
+    polynomial_destroy(temp_poly);
+
+    for (unsigned int i = 0; i < msg_length; i++) {
+        msg[i] = rs->received_polynomial.coeff[encoded_length - (i + 1)];
+    }
+
+    return msg_length;
+}
diff --git a/libcorrect/src/reed-solomon/encode.c b/libcorrect/src/reed-solomon/encode.c
new file mode 100644
index 0000000..d4eb6f3
--- /dev/null
+++ b/libcorrect/src/reed-solomon/encode.c
@@ -0,0 +1,34 @@
+#include "correct/reed-solomon/encode.h"
+
+// Encodes msg (at most rs->message_length bytes) into encoded as msg followed by rs->min_distance
+// parity bytes. Returns the number of bytes written to encoded, or -1 if msg is too long.
+ssize_t correct_reed_solomon_encode(correct_reed_solomon *rs, const uint8_t *msg, size_t msg_length, uint8_t *encoded) {
+    if (msg_length > rs->message_length) {
+        return -1;
+    }
+
+    size_t pad_length = rs->message_length - msg_length;
+    for (unsigned int i = 0; i < msg_length; i++) {
+        // msg is highest order first but libcorrect polynomials are lowest order first, so reverse while copying
+        rs->encoded_polynomial.coeff[rs->encoded_polynomial.order - (i + pad_length)] = msg[i];
+    }
+
+    // 0-fill everything around the message: the top pad_length coefficients (virtual padding
+    // of a short message) and the low (order + 1 - message_length) coefficients below it
+    memset(rs->encoded_polynomial.coeff + (rs->encoded_polynomial.order + 1 - pad_length), 0, pad_length);
+    memset(rs->encoded_polynomial.coeff, 0, (rs->encoded_polynomial.order + 1 - rs->message_length));
+
+    polynomial_mod(rs->field, rs->encoded_polynomial, rs->generator, rs->encoded_remainder);
+
+    // now return byte order to highest order to lowest order
+    for (unsigned int i = 0; i < msg_length; i++) {
+        encoded[i] = rs->encoded_polynomial.coeff[rs->encoded_polynomial.order - (i + pad_length)];
+    }
+
+    for (unsigned int i = 0; i < rs->min_distance; i++) {
+        encoded[msg_length + i] = rs->encoded_remainder.coeff[rs->min_distance - (i + 1)];
+    }
+
+    // the padding is never emitted, so exactly msg_length + min_distance bytes were written
+    return msg_length + rs->min_distance;
+}
diff --git a/libcorrect/src/reed-solomon/polynomial.c b/libcorrect/src/reed-solomon/polynomial.c
new file mode 100644
index 0000000..32c2792
--- /dev/null
+++ b/libcorrect/src/reed-solomon/polynomial.c
@@ -0,0 +1,255 @@
+#include "correct/reed-solomon/polynomial.h"
+
+polynomial_t polynomial_create(unsigned int order) {  // order + 1 coefficients, left uninitialized
+    polynomial_t created;
+    created.order = order;
+    created.coeff = malloc((order + 1) * sizeof(field_element_t));
+    return created;
+}
+
+void polynomial_destroy(polynomial_t polynomial) {  // frees the coefficient array of polynomial
+    free(polynomial.coeff);  // the struct is passed by value, so the caller's copy keeps the stale pointer
+}
+
+// res = l * r truncated to res.order: make res.order = l.order + r.order for the full product,
+// or pick a lower order to keep only the low terms, e.g. mul mod x^i
+// res.coeff is cleared before l and r are read, so it must not alias either of them
+void polynomial_mul(field_t field, polynomial_t l, polynomial_t r, polynomial_t res) {
+    memset(res.coeff, 0, sizeof(field_element_t) * (res.order + 1));
+    const unsigned int l_top = (l.order < res.order) ? l.order : res.order;
+    for (unsigned int li = 0; li <= l_top; li++) {
+        // only pair l's term with terms of r that keep li + ri within res.order
+        const unsigned int room = res.order - li;
+        const unsigned int r_top = (r.order > room) ? room : r.order;
+        for (unsigned int ri = 0; ri <= r_top; ri++) {
+            // e.g. alpha^5*x * alpha^37*x^2 --> alpha^42*x^3
+            const field_element_t term = field_mul(field, l.coeff[li], r.coeff[ri]);
+            res.coeff[li + ri] = field_add(field, res.coeff[li + ri], term);
+        }
+    }
+}
+
+void polynomial_mod(field_t field, polynomial_t dividend, polynomial_t divisor, polynomial_t mod) {
+    // polynomial long division of dividend by divisor where only the remainder is kept
+    // the remainder is built in place in mod, which doubles as the working copy of the dividend
+
+    if (mod.order < dividend.order) {
+        // mod is too small to hold that working copy
+        // this is an error -- catch it in debug?
+        return;
+    }
+    // the remainder starts out as the whole dividend
+    memcpy(mod.coeff, dividend.coeff, sizeof(field_element_t) * (dividend.order + 1));
+
+    // XXX make sure divisor[divisor_order] is nonzero
+    const field_logarithm_t lead_log = field.log[divisor.coeff[divisor.order]];
+
+    // cancel one leading term per step, walking down from the highest order
+    // once the current order is below the divisor's order, what is left is the remainder
+    // NOTE: order 0 is never visited, which only makes a difference for an order-0 divisor
+    unsigned int top = dividend.order;
+    while (top > 0 && top >= divisor.order) {
+        if (mod.coeff[top] != 0) {
+            // the quotient term is q * x^shift with
+            //   q = leading coeff of the remainder / leading coeff of the divisor
+            const unsigned int shift = top - divisor.order;
+            const field_logarithm_t q_log = field_div_log(field, field.log[mod.coeff[top]], lead_log);
+
+            // subtract q * x^shift * divisor from the remainder
+            // subtracting in GF(2^8) is XOR, just like addition
+            for (unsigned int d = 0; d <= divisor.order; d++) {
+                if (divisor.coeff[d] == 0) {
+                    continue;
+                }
+                // every product lands shift places above the divisor term it came from
+                const field_element_t scaled =
+                    field_mul_log_element(field, field.log[divisor.coeff[d]], q_log);
+                mod.coeff[d + shift] = field_add(field, mod.coeff[d + shift], scaled);
+            }
+        }
+        top--;
+    }
+}
+
+void polynomial_formal_derivative(field_t field, polynomial_t poly, polynomial_t der) {
+    // formal derivative: the term a*x^(k + 1) of poly becomes ((k + 1) * a)*x^k in der
+    // "(k + 1) * a" means a summed (k + 1) times (what field_sum is used for), not a field product
+
+    // assumes der.order = poly.order - 1, because poly.coeff[der.order + 1] gets read
+    memset(der.coeff, 0, sizeof(field_element_t) * (der.order + 1));
+    unsigned int k = 0;
+    while (k <= der.order) {
+        // filling in the kth power of der, so look one power ahead in poly
+        const unsigned int source_power = k + 1;
+        der.coeff[k] = field_sum(field, poly.coeff[source_power], source_power);
+        k = source_power;
+    }
+}
+
+field_element_t polynomial_eval(field_t field, polynomial_t poly, field_element_t val) {
+    // returns poly evaluated at the field element val
+    // at val = 0 every term above the constant one vanishes
+    if (val == 0) {
+        return poly.coeff[0];
+    }
+
+    const field_logarithm_t step_log = field.log[val];
+    // log of val^k for the term being visited, starting at val^0 = 1
+    field_logarithm_t power_log = field.log[1];
+    field_element_t total = 0;
+    for (unsigned int k = 0; k <= poly.order; k++) {
+        const field_element_t coeff = poly.coeff[k];
+        if (coeff != 0) {
+            // accumulate coeff * val^k
+            const field_element_t term = field_mul_log_element(field, field.log[coeff], power_log);
+            total = field_add(field, total, term);
+        }
+        // step to the next power of val
+        power_log = field_mul_log(field, power_log, step_log);
+    }
+    return total;
+}
+
+field_element_t polynomial_eval_lut(field_t field, polynomial_t poly, const field_logarithm_t *val_exp) {
+    // same result as polynomial_eval, except that val_exp[k] already holds the log of val^k,
+    //   so the powers of val do not have to be recomputed on every call for the same val
+    // a table built for val = 0 has val_exp[0] == 0; there only the constant term survives
+    if (val_exp[0] == 0) {
+        return poly.coeff[0];
+    }
+
+    field_element_t total = 0;
+
+    for (unsigned int k = 0; k <= poly.order; k++) {
+        const field_element_t coeff = poly.coeff[k];
+        if (coeff != 0) {
+            // accumulate coeff * val^k using the precomputed log of val^k
+            const field_element_t term = field_mul_log_element(field, field.log[coeff], val_exp[k]);
+            total = field_add(field, total, term);
+        }
+    }
+    return total;
+}
+
+field_element_t polynomial_eval_log_lut(field_t field, polynomial_t poly_log, const field_logarithm_t *val_exp) {
+    // evaluates a polynomial whose coefficients are stored as logs, using the precomputed
+    //   logs of the successive powers of val in val_exp, like polynomial_eval_lut
+    // a stored coefficient of 0 is the sentinel for a zero coefficient (log(0) is really -inf)
+    if (val_exp[0] == 0) {
+        // table built for val = 0: only the constant term survives
+        const field_element_t constant_log = poly_log.coeff[0];
+        return (constant_log == 0) ? 0 : field.exp[constant_log];
+    }
+
+    field_element_t total = 0;
+
+    for (unsigned int k = 0; k <= poly_log.order; k++) {
+        const field_element_t coeff_log = poly_log.coeff[k];
+        if (coeff_log == 0) {
+            // sentinel: this term is zero and contributes nothing
+            continue;
+        }
+        // multiply-accumulate by the next coeff times the next power of val
+        const field_element_t term = field_mul_log_element(field, coeff_log, val_exp[k]);
+        total = field_add(field, total, term);
+    }
+    return total;
+}
+
+void polynomial_build_exp_lut(field_t field, field_element_t val, unsigned int order, field_logarithm_t *val_exp) {
+    // fills val_exp[0 .. order] with the logs of val^0 .. val^order, the table format
+    //   consumed by polynomial_eval_lut; for val = 0 every entry is 0 instead
+    const field_logarithm_t step_log = field.log[val];
+    field_logarithm_t power_log = field.log[1];
+    for (unsigned int k = 0; k <= order; k++) {
+        val_exp[k] = (val == 0) ? 0 : power_log;
+        if (val != 0) {
+            // only a nonzero val has a log to advance by
+            power_log = field_mul_log(field, power_log, step_log);
+        }
+    }
+}
+
+// Expands (x + roots[0]) * ... * (x + roots[nroots - 1]) into poly.coeff and returns poly with order
+// nroots; poly and both scratch polynomials need nroots + 1 coefficients. Zero roots yield the constant 1.
+polynomial_t polynomial_init_from_roots(field_t field, unsigned int nroots, field_element_t *roots, polynomial_t poly, polynomial_t *scratch) {
+    if (nroots == 0) {
+        // empty product; also avoids reading roots[0] when there are no roots at all
+        poly.coeff[0] = 1;
+        poly.order = 0;
+        return poly;
+    }
+
+    // l is the next linear factor (x + roots[i]); only its constant term changes per iteration
+    field_element_t l_coeff[2];
+    polynomial_t l;
+    l.order = 1;
+    l.coeff = l_coeff;
+    l.coeff[1] = 1;
+
+    // two stores for the right-hand side: read the previous result from one, write the product to the other
+    polynomial_t r[2];
+    r[0] = scratch[0];
+    r[1] = scratch[1];
+    unsigned int rcoeffres = 0;
+
+    // initialize the result with x + roots[0]
+    r[rcoeffres].coeff[1] = 1;
+    r[rcoeffres].coeff[0] = roots[0];
+    r[rcoeffres].order = 1;
+
+    for (unsigned int i = 1; i < nroots; i++) {
+        l.coeff[0] = roots[i];
+        unsigned int nextrcoeff = rcoeffres;
+        rcoeffres = (rcoeffres + 1) % 2;
+        r[rcoeffres].order = i + 1;
+        polynomial_mul(field, l, r[nextrcoeff], r[rcoeffres]);
+    }
+    memcpy(poly.coeff, r[rcoeffres].coeff, (nroots + 1) * sizeof(field_element_t));
+    poly.order = nroots;
+    return poly;
+}
+
+// Allocates and returns (x + roots[0]) * ... * (x + roots[nroots - 1]), a polynomial of order nroots.
+// The caller owns the result and releases it with polynomial_destroy. Zero roots yield the constant 1.
+polynomial_t polynomial_create_from_roots(field_t field, unsigned int nroots, field_element_t *roots) {
+    polynomial_t poly = polynomial_create(nroots);
+    if (nroots == 0) {
+        // empty product; the general path below would write coeff[1] of one-element buffers
+        poly.coeff[0] = 1;
+        return poly;
+    }
+
+    // l is the next linear factor (x + roots[i]); only its constant term changes per iteration
+    field_element_t l_coeff[2] = {0, 1};
+    polynomial_t l;
+    l.order = 1;
+    l.coeff = l_coeff;
+
+    // two stores for the right-hand side: read the previous result from one, write the product to the other
+    polynomial_t r[2];
+    r[0].coeff = calloc(nroots + 1, sizeof(field_element_t));
+    r[1].coeff = calloc(nroots + 1, sizeof(field_element_t));
+    unsigned int rcoeffres = 0;
+
+    // initialize the result with x + roots[0]
+    r[rcoeffres].coeff[0] = roots[0];
+    r[rcoeffres].coeff[1] = 1;
+    r[rcoeffres].order = 1;
+
+    for (unsigned int i = 1; i < nroots; i++) {
+        l.coeff[0] = roots[i];
+        unsigned int nextrcoeff = rcoeffres;
+        rcoeffres = (rcoeffres + 1) % 2;
+        r[rcoeffres].order = i + 1;
+        polynomial_mul(field, l, r[nextrcoeff], r[rcoeffres]);
+    }
+
+    memcpy(poly.coeff, r[rcoeffres].coeff, (nroots + 1) * sizeof(field_element_t));
+    poly.order = nroots;
+
+    free(r[0].coeff);
+    free(r[1].coeff);
+
+    return poly;
+}
diff --git a/libcorrect/src/reed-solomon/reed-solomon.c b/libcorrect/src/reed-solomon/reed-solomon.c
new file mode 100644
index 0000000..91a708e
--- /dev/null
+++ b/libcorrect/src/reed-solomon/reed-solomon.c
@@ -0,0 +1,187 @@
+#include "correct/reed-solomon/reed-solomon.h"
+
+// fills roots[0 .. nroots - 1] and returns a newly allocated polynomial with nroots + 1 coefficients
+// e.g. 2 roots (x + alpha)(x + alpha^2) yield 3 terms; the generator argument itself is not used
+static polynomial_t reed_solomon_build_generator(field_t field, unsigned int nroots, field_element_t first_consecutive_root, unsigned int root_gap, polynomial_t generator, field_element_t *roots) {
+    // root k is field.exp[root_gap * (first_consecutive_root + k)], with the exponent reduced mod 255
+    for (unsigned int k = 0; k < nroots; k++) {
+        const unsigned int exponent = (root_gap * (k + first_consecutive_root)) % 255;
+        roots[k] = field.exp[exponent];
+    }
+    return polynomial_create_from_roots(field, nroots, roots);
+}
+
+correct_reed_solomon *correct_reed_solomon_create(field_operation_t primitive_polynomial, field_logarithm_t first_consecutive_root, field_logarithm_t generator_root_gap, size_t num_roots) {
+    // the struct starts zeroed; decoder state is only set up later by correct_reed_solomon_decoder_create
+    correct_reed_solomon *rs = calloc(1, sizeof(correct_reed_solomon));
+    rs->has_init_decode = false;
+
+    // code geometry: a full block is always 255 symbols, num_roots of them parity
+    rs->block_length = 255;
+    rs->min_distance = num_roots;
+    rs->message_length = rs->block_length - rs->min_distance;
+
+    rs->field = field_create(primitive_polynomial);
+    rs->first_consecutive_root = first_consecutive_root;
+    rs->generator_root_gap = generator_root_gap;
+    rs->generator_roots = malloc(rs->min_distance * sizeof(field_element_t));
+    rs->generator = reed_solomon_build_generator(rs->field, rs->min_distance, rs->first_consecutive_root,
+                                                 rs->generator_root_gap, rs->generator, rs->generator_roots);
+
+    // encoder work buffers, both sized for a full block
+    rs->encoded_polynomial = polynomial_create(rs->block_length - 1);
+    rs->encoded_remainder = polynomial_create(rs->block_length - 1);
+    return rs;
+}
+
+void correct_reed_solomon_destroy(correct_reed_solomon *rs) {  // releases everything rs owns, then rs itself
+    polynomial_destroy(rs->encoded_remainder);
+    polynomial_destroy(rs->encoded_polynomial);
+    free(rs->generator_roots);
+    polynomial_destroy(rs->generator);
+    field_destroy(rs->field);
+    if (rs->has_init_decode) {  // decoder state exists only after correct_reed_solomon_decoder_create ran
+        for (field_operation_t element = 0; element < 256; element++) {
+            free(rs->element_exp[element]);
+        }
+        free(rs->element_exp);
+        for (unsigned int root = 0; root < rs->min_distance; root++) {
+            free(rs->generator_root_exp[root]);
+        }
+        free(rs->generator_root_exp);
+        polynomial_destroy(rs->init_from_roots_scratch[1]);
+        polynomial_destroy(rs->init_from_roots_scratch[0]);
+        polynomial_destroy(rs->error_locator_derivative);
+        polynomial_destroy(rs->error_evaluator);
+        polynomial_destroy(rs->last_error_locator);
+        polynomial_destroy(rs->erasure_locator);
+        polynomial_destroy(rs->error_locator_log);
+        polynomial_destroy(rs->error_locator);
+        polynomial_destroy(rs->received_polynomial);
+        free(rs->error_locations);
+        free(rs->error_vals);
+        free(rs->error_roots);
+        free(rs->modified_syndromes);
+        free(rs->syndromes);
+    }
+    free(rs);
+}
+
+// prints the nonzero terms among coeff[0 .. num_terms - 1] as "c*x^i" joined by " + ",
+// the format shared by the sparse polynomial sections of the debug dump below
+static void reed_solomon_debug_print_terms(const field_element_t *coeff, unsigned int num_terms) {
+    bool has_printed = false;
+    for (unsigned int i = 0; i < num_terms; i++) {
+        if (!coeff[i]) {
+            continue;
+        }
+        if (has_printed) {
+            printf(" + ");
+        }
+        has_printed = true;
+        printf("%d*x^%d", coeff[i], i);
+    }
+}
+
+// Dumps the codec state to stdout: the field tables, the generator and the encoder remainder,
+// followed by the decoder's work buffers (syndromes, locator, roots, evaluator, values).
+// The decoder part is skipped while rs->has_init_decode is false, because those buffers
+// have not been allocated yet at that point.
+void correct_reed_solomon_debug_print(correct_reed_solomon *rs) {
+    // exp and log tables of the field, one index per row
+    for (unsigned int i = 0; i < 256; i++) {
+        printf("%3d  %3d    %3d  %3d\n", i, rs->field.exp[i], i, rs->field.log[i]);
+    }
+    printf("\n");
+
+    // generator roots, in the order they are stored in rs->generator_roots
+    printf("roots: ");
+    for (unsigned int i = 0; i < rs->min_distance; i++) {
+        printf("%d", rs->generator_roots[i]);
+        if (i < rs->min_distance - 1) {
+            printf(", ");
+        }
+    }
+    printf("\n\n");
+
+    // generator polynomial, lowest order term first, coefficients as field elements
+    printf("generator: ");
+    for (unsigned int i = 0; i < rs->generator.order + 1; i++) {
+        printf("%d*x^%d", rs->generator.coeff[i], i);
+        if (i < rs->generator.order) {
+            printf(" + ");
+        }
+    }
+    printf("\n\n");
+
+    // the same polynomial, highest order term first, coefficients as logs (powers of alpha)
+    printf("generator (alpha format): ");
+    for (unsigned int i = rs->generator.order + 1; i > 0; i--) {
+        printf("alpha^%d*x^%d", rs->field.log[rs->generator.coeff[i - 1]], i - 1);
+        if (i > 1) {
+            printf(" + ");
+        }
+    }
+    printf("\n\n");
+
+    // remainder buffer that correct_reed_solomon_encode writes (uninitialized before the first encode)
+    printf("remainder: ");
+    reed_solomon_debug_print_terms(rs->encoded_remainder.coeff, rs->encoded_remainder.order + 1);
+    printf("\n\n");
+
+    if (!rs->has_init_decode) {
+        // the decoder buffers are allocated lazily by the first decode call; before that the
+        // pointers read below are still zeroed from the calloc in correct_reed_solomon_create
+        return;
+    }
+
+    // everything from here on is decoder state
+    printf("syndromes: ");
+    for (unsigned int i = 0; i < rs->min_distance; i++) {
+        printf("%d", rs->syndromes[i]);
+        if (i < rs->min_distance - 1) {
+            printf(", ");
+        }
+    }
+    printf("\n\n");
+
+    // the error count is reported as the order of the error locator
+    printf("numerrors: %d\n\n", rs->error_locator.order);
+
+    // error locator polynomial, nonzero terms only
+    printf("error locator: ");
+    reed_solomon_debug_print_terms(rs->error_locator.coeff, rs->error_locator.order + 1);
+    printf("\n\n");
+
+    // each root is printed as value@root, where value is the locator evaluated at that root
+    printf("error roots: ");
+    for (unsigned int i = 0; i < rs->error_locator.order; i++) {
+        printf("%d@%d", polynomial_eval(rs->field, rs->error_locator, rs->error_roots[i]), rs->error_roots[i]);
+        if (i < rs->error_locator.order - 1) {
+            printf(", ");
+        }
+    }
+    printf("\n\n");
+
+    // NOTE(review): this section and the next stop at order rather than order + 1, so the top
+    // coefficient is never shown -- behavior kept as it was; confirm whether that is intended
+    printf("error evaluator: ");
+    reed_solomon_debug_print_terms(rs->error_evaluator.coeff, rs->error_evaluator.order);
+    printf("\n\n");
+
+    // same truncation as the evaluator above
+    printf("error locator derivative: ");
+    reed_solomon_debug_print_terms(rs->error_locator_derivative.coeff, rs->error_locator_derivative.order);
+    printf("\n\n");
+
+    // error values paired with their locations, printed as value@location
+    // labelled "error values" because "error locator" already names the polynomial section above
+    printf("error values: ");
+    for (unsigned int i = 0; i < rs->error_locator.order; i++) {
+        printf("%d@%d", rs->error_vals[i], rs->error_locations[i]);
+        if (i < rs->error_locator.order - 1) {
+            printf(", ");
+        }
+    }
+    printf("\n\n");
+}
diff --git a/libcorrect/tests/CMakeLists.txt b/libcorrect/tests/CMakeLists.txt
new file mode 100644
index 0000000..b35e5f9
--- /dev/null
+++ b/libcorrect/tests/CMakeLists.txt
@@ -0,0 +1,54 @@
+include_directories("include")  # tests/include holds the rs_tester*.h headers used by the reed-solomon runners
+
+
+add_executable(convolutional_test_runner EXCLUDE_FROM_ALL convolutional.c $<TARGET_OBJECTS:error_sim>)  # EXCLUDE_FROM_ALL: only built through test_runners/check below
+target_link_libraries(convolutional_test_runner correct_static "${LIBM}")
+set_target_properties(convolutional_test_runner PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")  # runner binary lands in <build>/tests
+add_test(NAME convolutional_test WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/tests" COMMAND convolutional_test_runner)
+set(all_test_runners ${all_test_runners} convolutional_test_runner)  # accumulate runner targets for the test_runners target
+
+if(HAVE_SSE)  # SSE runner is only defined when HAVE_SSE is set (set outside this file)
+    add_executable(convolutional_sse_test_runner EXCLUDE_FROM_ALL convolutional-sse.c $<TARGET_OBJECTS:error_sim_sse>)
+    target_link_libraries(convolutional_sse_test_runner correct_static "${LIBM}")
+    set_target_properties(convolutional_sse_test_runner PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")
+    add_test(NAME convolutional_sse_test WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/tests" COMMAND convolutional_sse_test_runner)
+    set(all_test_runners ${all_test_runners} convolutional_sse_test_runner)
+endif()
+
+if(HAVE_LIBFEC)  # libfec comparison runner; links the external FEC library
+    add_executable(convolutional_fec_test_runner EXCLUDE_FROM_ALL convolutional-fec.c $<TARGET_OBJECTS:error_sim_fec>)
+    target_link_libraries(convolutional_fec_test_runner correct_static FEC "${LIBM}")
+    set_target_properties(convolutional_fec_test_runner PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")
+    add_test(NAME convolutional_fec_test WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/tests" COMMAND convolutional_fec_test_runner)
+    set(all_test_runners ${all_test_runners} convolutional_fec_test_runner)
+endif()
+
+add_executable(convolutional_shim_test_runner EXCLUDE_FROM_ALL convolutional-shim.c $<TARGET_OBJECTS:error_sim_shim>)  # shim runner is unconditional; links fec_shim_static instead of FEC
+target_link_libraries(convolutional_shim_test_runner correct_static fec_shim_static "${LIBM}")
+set_target_properties(convolutional_shim_test_runner PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")
+add_test(NAME convolutional_shim_test WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/tests" COMMAND convolutional_shim_test_runner)
+set(all_test_runners ${all_test_runners} convolutional_shim_test_runner)
+
+add_executable(reed_solomon_test_runner EXCLUDE_FROM_ALL reed-solomon.c rs_tester.c)  # rs_tester.c is compiled into every reed-solomon runner
+target_link_libraries(reed_solomon_test_runner correct_static "${LIBM}")
+set_target_properties(reed_solomon_test_runner PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")
+add_test(NAME reed_solomon_test WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/tests" COMMAND reed_solomon_test_runner)
+set(all_test_runners ${all_test_runners} reed_solomon_test_runner)
+
+if(HAVE_LIBFEC)  # interop against the real libfec
+    add_executable(reed_solomon_interop_test_runner EXCLUDE_FROM_ALL reed-solomon-fec-interop.c rs_tester.c rs_tester_fec.c)
+    target_link_libraries(reed_solomon_interop_test_runner correct_static FEC "${LIBM}")
+    set_target_properties(reed_solomon_interop_test_runner PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")
+    add_test(NAME reed_solomon_interop_test WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/tests" COMMAND reed_solomon_interop_test_runner)
+    set(all_test_runners ${all_test_runners} reed_solomon_interop_test_runner)
+endif()
+
+add_executable(reed_solomon_shim_interop_test_runner EXCLUDE_FROM_ALL reed-solomon-shim-interop.c rs_tester.c rs_tester_fec_shim.c)  # interop against the bundled fec shim
+target_link_libraries(reed_solomon_shim_interop_test_runner correct_static fec_shim_static "${LIBM}")
+set_target_properties(reed_solomon_shim_interop_test_runner PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")
+add_test(NAME reed_solomon_shim_interop_test WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/tests" COMMAND reed_solomon_shim_interop_test_runner)
+set(all_test_runners ${all_test_runners} reed_solomon_shim_interop_test_runner)
+
+add_custom_target(test_runners DEPENDS ${all_test_runners})  # builds every runner collected above
+add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} DEPENDS test_runners)  # "check" builds the runners, then invokes ctest
+enable_testing()  # NOTE(review): called after the add_test() calls and in a subdirectory - confirm the top-level CMakeLists also enables testing
diff --git a/libcorrect/tests/convolutional-fec.c b/libcorrect/tests/convolutional-fec.c
new file mode 100644
index 0000000..f8b40db
--- /dev/null
+++ b/libcorrect/tests/convolutional-fec.c
@@ -0,0 +1,123 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include <fec.h>
+
+#include "correct.h"
+#include "correct/util/error-sim-fec.h"
+
+size_t max_block_len = 4096;
+
+size_t test_conv(correct_convolutional *conv, void *fec,  // feeds msg_len random bytes through the testbench in blocks; returns the sum of test_conv_noise() results
+                 void (*decode)(void *, uint8_t *, size_t, uint8_t *),  // decoder callback installed as testbench->decode (fec is its context)
+                 conv_testbench **testbench_ptr, size_t msg_len, double eb_n0,  // *testbench_ptr is overwritten by resize_conv_testbench() on every block
+                 double bpsk_bit_energy, double bpsk_voltage) {
+    uint8_t *msg = malloc(max_block_len);  // one block of random message bytes; NOTE(review): malloc result is not checked
+
+    size_t num_errors = 0;
+
+    while (msg_len) {
+        size_t block_len = (max_block_len < msg_len) ? max_block_len : msg_len;  // min(max_block_len, bytes remaining)
+        msg_len -= block_len;
+
+        for (unsigned int j = 0; j < block_len; j++) {
+            msg[j] = rand() % 256;  // random byte value 0..255
+        }
+
+        *testbench_ptr =
+            resize_conv_testbench(*testbench_ptr, conv_correct_enclen, conv, block_len);  // presumably (re)sizes buffers for block_len - confirm in error-sim
+        conv_testbench *testbench = *testbench_ptr;
+        testbench->encoder = conv;  // encoding is always done by libcorrect
+        testbench->encode = conv_correct_encode;
+        testbench->decoder = fec;  // decoding is done by the caller-supplied decoder
+        testbench->decode = decode;
+        build_white_noise(testbench->noise, testbench->enclen, eb_n0, bpsk_bit_energy);
+        num_errors += test_conv_noise(testbench, msg, block_len, bpsk_voltage);  // presumably a bit-error count (caller divides by bytes * 8)
+    }
+    free(msg);
+    return num_errors;
+}
+
+void assert_test_result(correct_convolutional *conv, void *fec,  // runs test_conv() and calls exit(1) when the observed error rate exceeds error_rate
+                        void (*decode)(void *, uint8_t *, size_t, uint8_t *),
+                        conv_testbench **testbench, size_t test_length, size_t rate, size_t order,  // order is only used in the printed report
+                        double eb_n0, double error_rate) {
+    double bpsk_voltage = 1.0 / sqrt(2.0);
+    double bpsk_sym_energy = 2 * pow(bpsk_voltage, 2.0);  // ~1.0 for the voltage chosen above
+    double bpsk_bit_energy = bpsk_sym_energy * rate;  // symbol energy scaled by rate (callers pass 2, 3 or 6)
+
+    size_t error_count =
+        test_conv(conv, fec, decode, testbench, test_length, eb_n0, bpsk_bit_energy, bpsk_voltage);
+    double observed_error_rate = error_count / ((double)test_length * 8);  // normalised per bit: test_length bytes * 8
+    if (observed_error_rate > error_rate) {  // error_rate is an upper bound; equality still passes
+        printf(
+            "test failed, expected error rate=%.2e, observed error rate=%.2e @%.1fdB for rate %zu "
+            "order %zu\n",
+            error_rate, observed_error_rate, eb_n0, rate, order);
+        exit(1);  // abort the whole runner on the first failing case
+    } else {
+        printf(
+            "test passed, expected error rate=%.2e, observed error rate=%.2e @%.1fdB for rate %zu "
+            "order %zu\n",
+            error_rate, observed_error_rate, eb_n0, rate, order);
+    }
+}
+
+int main() {
+    srand(time(NULL));
+
+    conv_testbench *testbench = NULL;
+
+    correct_convolutional *conv;
+    void *fec;
+    uint16_t *poly;
+
+    poly = (uint16_t[]){V27POLYA, V27POLYB};
+    conv = correct_convolutional_create(2, 7, poly);
+    fec = create_viterbi27(8 * max_block_len);
+    assert_test_result(conv, fec, conv_fec27_decode, &testbench, 1000000, 2, 6, INFINITY, 0);
+    assert_test_result(conv, fec, conv_fec27_decode, &testbench, 1000000, 2, 6, 4.5, 8e-06);
+    assert_test_result(conv, fec, conv_fec27_decode, &testbench, 1000000, 2, 6, 4.0, 5e-05);
+    delete_viterbi27(fec);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    poly = (uint16_t[]){V29POLYA, V29POLYB};
+    conv = correct_convolutional_create(2, 9, poly);
+    fec = create_viterbi29(8 * max_block_len);
+    assert_test_result(conv, fec, conv_fec29_decode, &testbench, 1000000, 2, 9, INFINITY, 0);
+    assert_test_result(conv, fec, conv_fec29_decode, &testbench, 1000000, 2, 9, 4.5, 3e-06);
+    assert_test_result(conv, fec, conv_fec29_decode, &testbench, 1000000, 2, 9, 4.0, 8e-06);
+    delete_viterbi29(fec);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    poly = (uint16_t[]){V39POLYA, V39POLYB, V39POLYC};
+    conv = correct_convolutional_create(3, 9, poly);
+    fec = create_viterbi39(8 * max_block_len);
+    assert_test_result(conv, fec, conv_fec39_decode, &testbench, 1000000, 3, 9, INFINITY, 0);
+    assert_test_result(conv, fec, conv_fec39_decode, &testbench, 1000000, 3, 9, 4.5, 3e-06);
+    assert_test_result(conv, fec, conv_fec39_decode, &testbench, 1000000, 3, 9, 4.0, 5e-06);
+    delete_viterbi39(fec);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    poly = (uint16_t[]){V615POLYA, V615POLYB, V615POLYC, V615POLYD, V615POLYE, V615POLYF};
+    conv = correct_convolutional_create(6, 15, poly);
+    fec = create_viterbi615(8 * max_block_len);
+    assert_test_result(conv, fec, conv_fec615_decode, &testbench, 100000, 6, 15, INFINITY, 0);
+    assert_test_result(conv, fec, conv_fec615_decode, &testbench, 100000, 6, 15, 3.0, 3e-06);
+    assert_test_result(conv, fec, conv_fec615_decode, &testbench, 100000, 6, 15, 2.5, 1e-05);
+    delete_viterbi615(fec);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    free_scratch(testbench);
+    return 0;
+}
diff --git a/libcorrect/tests/convolutional-shim.c b/libcorrect/tests/convolutional-shim.c
new file mode 100644
index 0000000..7a19227
--- /dev/null
+++ b/libcorrect/tests/convolutional-shim.c
@@ -0,0 +1,122 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "correct.h"
+#include "fec_shim.h"
+#include "correct/util/error-sim-shim.h"
+
+size_t max_block_len = 4096;
+
+size_t test_conv(correct_convolutional *conv, void *fec,  // feeds msg_len random bytes through the testbench in blocks; returns the sum of test_conv_noise() results
+                 ssize_t (*decode)(void *, uint8_t *, size_t, uint8_t *),  // shim decoder callback (returns ssize_t); installed as testbench->decode
+                 conv_testbench **testbench_ptr, size_t msg_len, double eb_n0,  // *testbench_ptr is overwritten by resize_conv_testbench() on every block
+                 double bpsk_bit_energy, double bpsk_voltage) {
+    uint8_t *msg = malloc(max_block_len);  // one block of random message bytes; NOTE(review): malloc result is not checked
+
+    size_t num_errors = 0;
+
+    while (msg_len) {
+        size_t block_len = (max_block_len < msg_len) ? max_block_len : msg_len;  // min(max_block_len, bytes remaining)
+        msg_len -= block_len;
+
+        for (unsigned int j = 0; j < block_len; j++) {
+            msg[j] = rand() % 256;  // random byte value 0..255
+        }
+
+        *testbench_ptr =
+            resize_conv_testbench(*testbench_ptr, conv_correct_enclen, conv, block_len);  // presumably (re)sizes buffers for block_len - confirm in error-sim
+        conv_testbench *testbench = *testbench_ptr;
+        testbench->encoder = conv;  // encoding is always done by libcorrect
+        testbench->encode = conv_correct_encode;
+        testbench->decoder = fec;  // decoding goes through the fec shim object
+        testbench->decode = decode;
+        build_white_noise(testbench->noise, testbench->enclen, eb_n0, bpsk_bit_energy);
+        num_errors += test_conv_noise(testbench, msg, block_len, bpsk_voltage);  // presumably a bit-error count (caller divides by bytes * 8)
+    }
+    free(msg);
+    return num_errors;
+}
+
+void assert_test_result(correct_convolutional *conv, void *fec,  // runs test_conv() and calls exit(1) when the observed error rate exceeds error_rate
+                        ssize_t (*decode)(void *, uint8_t *, size_t, uint8_t *),
+                        conv_testbench **testbench, size_t test_length, size_t rate, size_t order,  // order is only used in the printed report
+                        double eb_n0, double error_rate) {
+    double bpsk_voltage = 1.0 / sqrt(2.0);
+    double bpsk_sym_energy = 2 * pow(bpsk_voltage, 2.0);  // ~1.0 for the voltage chosen above
+    double bpsk_bit_energy = bpsk_sym_energy * rate;  // symbol energy scaled by rate (callers pass 2, 3 or 6)
+
+    size_t error_count =
+        test_conv(conv, fec, decode, testbench, test_length, eb_n0, bpsk_bit_energy, bpsk_voltage);
+    double observed_error_rate = error_count / ((double)test_length * 8);  // normalised per bit: test_length bytes * 8
+    if (observed_error_rate > error_rate) {  // error_rate is an upper bound; equality still passes
+        printf(
+            "test failed, expected error rate=%.2e, observed error rate=%.2e @%.1fdB for rate %zu "
+            "order %zu\n",
+            error_rate, observed_error_rate, eb_n0, rate, order);
+        exit(1);  // abort the whole runner on the first failing case
+    } else {
+        printf(
+            "test passed, expected error rate=%.2e, observed error rate=%.2e @%.1fdB for rate %zu "
+            "order %zu\n",
+            error_rate, observed_error_rate, eb_n0, rate, order);
+    }
+}
+
+int main() {
+    srand(time(NULL));
+
+    conv_testbench *testbench = NULL;
+
+    correct_convolutional *conv;
+    void *fec;
+    uint16_t *poly;
+
+    poly = (uint16_t[]){V27POLYA, V27POLYB};
+    conv = correct_convolutional_create(2, 7, poly);
+    fec = create_viterbi27(8 * max_block_len);
+    assert_test_result(conv, fec, conv_shim27_decode, &testbench, 1000000, 2, 6, INFINITY, 0);
+    assert_test_result(conv, fec, conv_shim27_decode, &testbench, 1000000, 2, 6, 4.5, 8e-06);
+    assert_test_result(conv, fec, conv_shim27_decode, &testbench, 1000000, 2, 6, 4.0, 5e-05);
+    delete_viterbi27(fec);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    poly = (uint16_t[]){V29POLYA, V29POLYB};
+    conv = correct_convolutional_create(2, 9, poly);
+    fec = create_viterbi29(8 * max_block_len);
+    assert_test_result(conv, fec, conv_shim29_decode, &testbench, 1000000, 2, 9, INFINITY, 0);
+    assert_test_result(conv, fec, conv_shim29_decode, &testbench, 1000000, 2, 9, 4.5, 3e-06);
+    assert_test_result(conv, fec, conv_shim29_decode, &testbench, 1000000, 2, 9, 4.0, 8e-06);
+    delete_viterbi29(fec);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    poly = (uint16_t[]){V39POLYA, V39POLYB, V39POLYC};
+    conv = correct_convolutional_create(3, 9, poly);
+    fec = create_viterbi39(8 * max_block_len);
+    assert_test_result(conv, fec, conv_shim39_decode, &testbench, 1000000, 3, 9, INFINITY, 0);
+    assert_test_result(conv, fec, conv_shim39_decode, &testbench, 1000000, 3, 9, 4.5, 3e-06);
+    assert_test_result(conv, fec, conv_shim39_decode, &testbench, 1000000, 3, 9, 4.0, 9e-06);
+    delete_viterbi39(fec);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    poly = (uint16_t[]){V615POLYA, V615POLYB, V615POLYC, V615POLYD, V615POLYE, V615POLYF};
+    conv = correct_convolutional_create(6, 15, poly);
+    fec = create_viterbi615(8 * max_block_len);
+    assert_test_result(conv, fec, conv_shim615_decode, &testbench, 100000, 6, 15, INFINITY, 0);
+    assert_test_result(conv, fec, conv_shim615_decode, &testbench, 100000, 6, 15, 3.0, 2e-05);
+    assert_test_result(conv, fec, conv_shim615_decode, &testbench, 100000, 6, 15, 2.5, 4e-05);
+    delete_viterbi615(fec);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    free_scratch(testbench);
+    return 0;
+}
diff --git a/libcorrect/tests/convolutional-sse.c b/libcorrect/tests/convolutional-sse.c
new file mode 100644
index 0000000..fb2185e
--- /dev/null
+++ b/libcorrect/tests/convolutional-sse.c
@@ -0,0 +1,132 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "correct/util/error-sim-sse.h"
+
+size_t max_block_len = 4096;
+
+size_t test_conv(correct_convolutional_sse *conv, conv_testbench **testbench_ptr,  // feeds msg_len random bytes through the SSE codec in blocks; returns the sum of test_conv_noise() results
+               size_t msg_len, double eb_n0, double bpsk_bit_energy,
+               double bpsk_voltage) {
+    uint8_t *msg = malloc(max_block_len);  // one block of random message bytes; NOTE(review): malloc result is not checked
+
+    size_t num_errors = 0;
+
+    while (msg_len) {
+        size_t block_len = (max_block_len < msg_len) ? max_block_len : msg_len;  // min(max_block_len, bytes remaining)
+        msg_len -= block_len;
+
+        for (unsigned int j = 0; j < block_len; j++) {
+            msg[j] = rand() % 256;  // random byte value 0..255
+        }
+
+        *testbench_ptr = resize_conv_testbench(*testbench_ptr, conv_correct_sse_enclen, conv, block_len);  // presumably (re)sizes buffers for block_len - confirm in error-sim-sse
+        conv_testbench *testbench = *testbench_ptr;
+        testbench->encoder = conv;  // the same conv object is used for both encode and decode
+        testbench->encode = conv_correct_sse_encode;
+        testbench->decoder = conv;
+        testbench->decode = conv_correct_sse_decode;
+        build_white_noise(testbench->noise, testbench->enclen, eb_n0, bpsk_bit_energy);
+        num_errors += test_conv_noise(testbench, msg, block_len, bpsk_voltage);  // presumably a bit-error count (caller divides by bytes * 8)
+    }
+    free(msg);
+    return num_errors;
+}
+
+void assert_test_result(correct_convolutional_sse *conv, conv_testbench **testbench,  // runs test_conv() and calls exit(1) when the observed error rate exceeds error_rate
+                        size_t test_length, size_t rate, size_t order, double eb_n0, double error_rate) {  // order is only used in the printed report
+    double bpsk_voltage = 1.0/sqrt(2.0);
+    double bpsk_sym_energy = 2*pow(bpsk_voltage, 2.0);  // ~1.0 for the voltage chosen above
+    double bpsk_bit_energy = bpsk_sym_energy * rate;  // symbol energy scaled by rate (callers pass 2 or 3)
+
+    size_t error_count = test_conv(conv, testbench, test_length, eb_n0, bpsk_bit_energy, bpsk_voltage);
+    double observed_error_rate = error_count/((double)test_length * 8);  // normalised per bit: test_length bytes * 8
+    if (observed_error_rate > error_rate) {  // error_rate is an upper bound; equality still passes
+        printf("test failed, expected error rate=%.2e, observed error rate=%.2e @%.1fdB for rate %zu order %zu\n",
+                error_rate, observed_error_rate, eb_n0, rate, order);
+        exit(1);  // abort the whole runner on the first failing case
+    } else {
+        printf("test passed, expected error rate=%.2e, observed error rate=%.2e @%.1fdB for rate %zu order %zu\n",
+                error_rate, observed_error_rate, eb_n0, rate, order);
+    }
+}
+
+int main() {  // checks error-rate bounds for the SSE codec across rates 2 and 3, orders 6 through 9
+    srand(time(NULL));  // time-seeded: message bytes differ from run to run
+
+    conv_testbench *testbench = NULL;  // allocated/resized inside test_conv(); shared by every case below
+
+    correct_convolutional_sse *conv;
+
+    // n.b. the error rates below are at 5.0dB/4.5dB for order 6 polys
+    //  and 4.5dB/4.0dB for order 7-9 polys. this can be easy to miss.
+
+    conv = correct_convolutional_sse_create(2, 6, correct_conv_r12_6_polynomial);
+    assert_test_result(conv, &testbench, 1000000, 2, 6, INFINITY, 0);  // INFINITY Eb/N0 with bound 0: any error fails
+    assert_test_result(conv, &testbench, 1000000, 2, 6, 5.0, 8e-06);
+    assert_test_result(conv, &testbench, 1000000, 2, 6, 4.5, 3e-05);
+    correct_convolutional_sse_destroy(conv);
+
+    printf("\n");
+
+    conv = correct_convolutional_sse_create(2, 7, correct_conv_r12_7_polynomial);
+    assert_test_result(conv, &testbench, 1000000, 2, 7, INFINITY, 0);
+    assert_test_result(conv, &testbench, 1000000, 2, 7, 4.5, 1e-05);
+    assert_test_result(conv, &testbench, 1000000, 2, 7, 4.0, 5e-05);
+    correct_convolutional_sse_destroy(conv);
+
+    printf("\n");
+
+    conv = correct_convolutional_sse_create(2, 8, correct_conv_r12_8_polynomial);
+    assert_test_result(conv, &testbench, 1000000, 2, 8, INFINITY, 0);
+    assert_test_result(conv, &testbench, 1000000, 2, 8, 4.5, 5e-06);
+    assert_test_result(conv, &testbench, 1000000, 2, 8, 4.0, 3e-05);
+    correct_convolutional_sse_destroy(conv);
+
+    printf("\n");
+
+    conv = correct_convolutional_sse_create(2, 9, correct_conv_r12_9_polynomial);
+    assert_test_result(conv, &testbench, 1000000, 2, 9, INFINITY, 0);
+    assert_test_result(conv, &testbench, 1000000, 2, 9, 4.5, 3e-06);
+    assert_test_result(conv, &testbench, 1000000, 2, 9, 4.0, 8e-06);
+    correct_convolutional_sse_destroy(conv);
+
+    printf("\n");
+
+    conv = correct_convolutional_sse_create(3, 6, correct_conv_r13_6_polynomial);
+    assert_test_result(conv, &testbench, 1000000, 3, 6, INFINITY, 0);
+    assert_test_result(conv, &testbench, 1000000, 3, 6, 5.0, 5e-06);
+    assert_test_result(conv, &testbench, 1000000, 3, 6, 4.5, 2e-05);
+    correct_convolutional_sse_destroy(conv);
+
+    printf("\n");
+
+    conv = correct_convolutional_sse_create(3, 7, correct_conv_r13_7_polynomial);
+    assert_test_result(conv, &testbench, 1000000, 3, 7, INFINITY, 0);
+    assert_test_result(conv, &testbench, 1000000, 3, 7, 4.5, 5e-06);
+    assert_test_result(conv, &testbench, 1000000, 3, 7, 4.0, 3e-05);
+    correct_convolutional_sse_destroy(conv);
+
+    printf("\n");
+
+    conv = correct_convolutional_sse_create(3, 8, correct_conv_r13_8_polynomial);
+    assert_test_result(conv, &testbench, 1000000, 3, 8, INFINITY, 0);
+    assert_test_result(conv, &testbench, 1000000, 3, 8, 4.5, 4e-06);
+    assert_test_result(conv, &testbench, 1000000, 3, 8, 4.0, 1e-05);
+    correct_convolutional_sse_destroy(conv);
+
+    printf("\n");
+
+    conv = correct_convolutional_sse_create(3, 9, correct_conv_r13_9_polynomial);
+    assert_test_result(conv, &testbench, 1000000, 3, 9, INFINITY, 0);
+    assert_test_result(conv, &testbench, 1000000, 3, 9, 4.5, 3e-06);
+    assert_test_result(conv, &testbench, 1000000, 3, 9, 4.0, 5e-06);
+    correct_convolutional_sse_destroy(conv);
+
+    printf("\n");
+
+    free_scratch(testbench);  // releases the testbench built up by test_conv()
+    return 0;
+}
diff --git a/libcorrect/tests/convolutional.c b/libcorrect/tests/convolutional.c
new file mode 100644
index 0000000..4e58dd5
--- /dev/null
+++ b/libcorrect/tests/convolutional.c
@@ -0,0 +1,133 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "correct.h"
+#include "correct/util/error-sim.h"
+
+size_t max_block_len = 4096;
+
+size_t test_conv(correct_convolutional *conv, conv_testbench **testbench_ptr,  // feeds msg_len random bytes through the codec in blocks; returns the sum of test_conv_noise() results
+               size_t msg_len, double eb_n0, double bpsk_bit_energy,
+               double bpsk_voltage) {
+    uint8_t *msg = malloc(max_block_len);  // one block of random message bytes; NOTE(review): malloc result is not checked
+
+    size_t num_errors = 0;
+
+    while (msg_len) {
+        size_t block_len = (max_block_len < msg_len) ? max_block_len : msg_len;  // min(max_block_len, bytes remaining)
+        msg_len -= block_len;
+
+        for (unsigned int j = 0; j < block_len; j++) {
+            msg[j] = rand() % 256;  // random byte value 0..255
+        }
+
+        *testbench_ptr = resize_conv_testbench(*testbench_ptr, conv_correct_enclen, conv, block_len);  // presumably (re)sizes buffers for block_len - confirm in error-sim
+        conv_testbench *testbench = *testbench_ptr;
+        testbench->encoder = conv;  // the same conv object is used for both encode and decode
+        testbench->encode = conv_correct_encode;
+        testbench->decoder = conv;
+        testbench->decode = conv_correct_decode;
+        build_white_noise(testbench->noise, testbench->enclen, eb_n0, bpsk_bit_energy);
+        num_errors += test_conv_noise(testbench, msg, block_len, bpsk_voltage);  // presumably a bit-error count (caller divides by bytes * 8)
+    }
+    free(msg);
+    return num_errors;
+}
+
+void assert_test_result(correct_convolutional *conv, conv_testbench **testbench,  // runs test_conv() and calls exit(1) when the observed error rate exceeds error_rate
+                        size_t test_length, size_t rate, size_t order, double eb_n0, double error_rate) {  // order is only used in the printed report
+    double bpsk_voltage = 1.0/sqrt(2.0);
+    double bpsk_sym_energy = 2*pow(bpsk_voltage, 2.0);  // ~1.0 for the voltage chosen above
+    double bpsk_bit_energy = bpsk_sym_energy * rate;  // symbol energy scaled by rate (callers pass 2 or 3)
+
+    size_t error_count = test_conv(conv, testbench, test_length, eb_n0, bpsk_bit_energy, bpsk_voltage);
+    double observed_error_rate = error_count/((double)test_length * 8);  // normalised per bit: test_length bytes * 8
+    if (observed_error_rate > error_rate) {  // error_rate is an upper bound; equality still passes
+        printf("test failed, expected error rate=%.2e, observed error rate=%.2e @%.1fdB for rate %zu order %zu\n",
+                error_rate, observed_error_rate, eb_n0, rate, order);
+        exit(1);  // abort the whole runner on the first failing case
+    } else {
+        printf("test passed, expected error rate=%.2e, observed error rate=%.2e @%.1fdB for rate %zu order %zu\n",
+                error_rate, observed_error_rate, eb_n0, rate, order);
+    }
+}
+
+int main() {  // checks error-rate bounds for the portable codec across rates 2 and 3, orders 6 through 9
+    srand(time(NULL));  // time-seeded: message bytes differ from run to run
+
+    conv_testbench *testbench = NULL;  // allocated/resized inside test_conv(); shared by every case below
+
+    correct_convolutional *conv;
+
+    // n.b. the error rates below are at 5.0dB/4.5dB for order 6 polys
+    //  and 4.5dB/4.0dB for order 7-9 polys. this can be easy to miss.
+
+    conv = correct_convolutional_create(2, 6, correct_conv_r12_6_polynomial);
+    assert_test_result(conv, &testbench, 1000000, 2, 6, INFINITY, 0);  // INFINITY Eb/N0 with bound 0: any error fails
+    assert_test_result(conv, &testbench, 1000000, 2, 6, 5.0, 5e-06);
+    assert_test_result(conv, &testbench, 1000000, 2, 6, 4.5, 3e-05);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    conv = correct_convolutional_create(2, 7, correct_conv_r12_7_polynomial);
+    assert_test_result(conv, &testbench, 1000000, 2, 7, INFINITY, 0);
+    assert_test_result(conv, &testbench, 1000000, 2, 7, 4.5, 1e-05);
+    assert_test_result(conv, &testbench, 1000000, 2, 7, 4.0, 5e-05);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    conv = correct_convolutional_create(2, 8, correct_conv_r12_8_polynomial);
+    assert_test_result(conv, &testbench, 1000000, 2, 8, INFINITY, 0);
+    assert_test_result(conv, &testbench, 1000000, 2, 8, 4.5, 5e-06);
+    assert_test_result(conv, &testbench, 1000000, 2, 8, 4.0, 3e-05);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    conv = correct_convolutional_create(2, 9, correct_conv_r12_9_polynomial);
+    assert_test_result(conv, &testbench, 1000000, 2, 9, INFINITY, 0);
+    assert_test_result(conv, &testbench, 1000000, 2, 9, 4.5, 3e-06);
+    assert_test_result(conv, &testbench, 1000000, 2, 9, 4.0, 1e-05);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    conv = correct_convolutional_create(3, 6, correct_conv_r13_6_polynomial);
+    assert_test_result(conv, &testbench, 1000000, 3, 6, INFINITY, 0);
+    assert_test_result(conv, &testbench, 1000000, 3, 6, 5.0, 5e-06);
+    assert_test_result(conv, &testbench, 1000000, 3, 6, 4.5, 2e-05);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    conv = correct_convolutional_create(3, 7, correct_conv_r13_7_polynomial);
+    assert_test_result(conv, &testbench, 1000000, 3, 7, INFINITY, 0);
+    assert_test_result(conv, &testbench, 1000000, 3, 7, 4.5, 5e-06);
+    assert_test_result(conv, &testbench, 1000000, 3, 7, 4.0, 3e-05);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    conv = correct_convolutional_create(3, 8, correct_conv_r13_8_polynomial);
+    assert_test_result(conv, &testbench, 1000000, 3, 8, INFINITY, 0);
+    assert_test_result(conv, &testbench, 1000000, 3, 8, 4.5, 4e-06);
+    assert_test_result(conv, &testbench, 1000000, 3, 8, 4.0, 1e-05);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    conv = correct_convolutional_create(3, 9, correct_conv_r13_9_polynomial);
+    assert_test_result(conv, &testbench, 1000000, 3, 9, INFINITY, 0);
+    assert_test_result(conv, &testbench, 1000000, 3, 9, 4.5, 3e-06);
+    assert_test_result(conv, &testbench, 1000000, 3, 9, 4.0, 5e-06);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    free_scratch(testbench);  // releases the testbench built up by test_conv()
+    return 0;
+}
diff --git a/libcorrect/tests/include/rs_tester.h b/libcorrect/tests/include/rs_tester.h
new file mode 100644
index 0000000..95143ad
--- /dev/null
+++ b/libcorrect/tests/include/rs_tester.h
@@ -0,0 +1,41 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <time.h>
+
+#include "correct.h"
+
+void rs_correct_encode(void *encoder, uint8_t *msg, size_t msg_length,  // encode callback; signature matches the rs_test.encode function pointer
+                       uint8_t *msg_out);
+void rs_correct_decode(void *decoder, uint8_t *encoded, size_t encoded_length,  // decode callback; signature matches the rs_test.decode function pointer
+                       uint8_t *erasure_locations, size_t erasure_length,
+                       uint8_t *msg, size_t pad_length, size_t num_roots);
+
+typedef struct {  // sizes and working buffers for one reed-solomon test configuration
+    size_t block_length;
+    size_t message_length;  // presumably block_length - min_distance, as the test mains compute it - confirm in rs_tester.c
+    size_t min_distance;
+    unsigned char *msg;  // buffer roles below are inferred from field names - confirm against rs_tester.c
+    uint8_t *encoded;
+    int *indices;
+    uint8_t *corrupted_encoded;
+    uint8_t *erasure_locations;
+    unsigned char *recvmsg;
+} rs_testbench;
+
+typedef struct {  // pairs an encode and a decode callback, each with its own opaque context pointer
+    void (*encode)(void *, uint8_t *, size_t, uint8_t *);  // same parameter types as rs_correct_encode
+    void *encoder;  // context set alongside encode (the interop tests store the matching codec object here)
+    void (*decode)(void *, uint8_t *, size_t, uint8_t *, size_t, uint8_t *, size_t, size_t);  // same parameter types as rs_correct_decode
+    void *decoder;  // context set alongside decode
+} rs_test;
+
+rs_testbench *rs_testbench_create(size_t block_length, size_t min_distance);  // returns a testbench for the given sizes; the test mains pair it with rs_testbench_destroy
+void rs_testbench_destroy(rs_testbench *testbench);  // counterpart to rs_testbench_create
+
+typedef struct {  // result of a single test_rs_errors() run
+    bool output_matches;  // callers treat false as a test failure; presumably decoded output == original message - confirm in rs_tester.c
+} rs_test_run;
+
+rs_test_run test_rs_errors(rs_test *test, rs_testbench *testbench, size_t msg_length,  // runs one encode/corrupt/decode round (implemented in rs_tester.c) and reports whether output matched
+                    size_t num_errors, size_t num_erasures);
diff --git a/libcorrect/tests/include/rs_tester_fec.h b/libcorrect/tests/include/rs_tester_fec.h
new file mode 100644
index 0000000..99264a0
--- /dev/null
+++ b/libcorrect/tests/include/rs_tester_fec.h
@@ -0,0 +1,10 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fec.h>
+void rs_fec_encode(void *encoder, uint8_t *msg, size_t msg_length,  // libfec-backed encode callback; same parameter list as rs_correct_encode in rs_tester.h
+                   uint8_t *msg_out);
+void rs_fec_decode(void *decoder, uint8_t *encoded, size_t encoded_length,  // libfec-backed decode callback; same parameter list as rs_correct_decode in rs_tester.h
+                   uint8_t *erasure_locations, size_t erasure_length,
+                   uint8_t *msg, size_t pad_length, size_t num_roots);
diff --git a/libcorrect/tests/include/rs_tester_fec_shim.h b/libcorrect/tests/include/rs_tester_fec_shim.h
new file mode 100644
index 0000000..d81c4cd
--- /dev/null
+++ b/libcorrect/tests/include/rs_tester_fec_shim.h
@@ -0,0 +1,10 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include "fec_shim.h"
+void rs_fec_encode(void *encoder, uint8_t *msg, size_t msg_length,  // fec-shim-backed encode callback; same name and parameters as the declaration in rs_tester_fec.h
+                   uint8_t *msg_out);
+void rs_fec_decode(void *decoder, uint8_t *encoded, size_t encoded_length,  // fec-shim-backed decode callback; same name and parameters as the declaration in rs_tester_fec.h
+                   uint8_t *erasure_locations, size_t erasure_length,
+                   uint8_t *msg, size_t pad_length, size_t num_roots);
diff --git a/libcorrect/tests/reed-solomon-fec-interop.c b/libcorrect/tests/reed-solomon-fec-interop.c
new file mode 100644
index 0000000..c9f217b
--- /dev/null
+++ b/libcorrect/tests/reed-solomon-fec-interop.c
@@ -0,0 +1,138 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include <time.h>
+
+#include "rs_tester.h"
+#include "rs_tester_fec.h"
+
+void print_test_type(size_t block_length, size_t message_length,  // prints the test parameters with no trailing newline, so PASSED/FAILED lands on the same line
+                     size_t num_errors, size_t num_erasures) {
+    printf(
+        "testing reed solomon block length=%zu, message length=%zu, "
+        "errors=%zu, erasures=%zu...",
+        block_length, message_length, num_errors, num_erasures);
+}
+
+void fail_test() {  // prints FAILED and terminates the process with exit status 1
+    printf("FAILED\n");
+    exit(1);  // does not return; remaining iterations and tests are skipped
+}
+
+void pass_test() { printf("PASSED\n"); }  // completes the line started by print_test_type()
+
+void run_tests(correct_reed_solomon *rs, void *fec_rs, rs_testbench *testbench,  // runs num_iterations rounds in each direction between libcorrect (rs) and libfec (fec_rs)
+               size_t block_length, size_t test_msg_length, size_t num_errors,  // block_length is only used for the printed header
+               size_t num_erasures, size_t num_iterations) {
+    // run both ways, correct->fec and fec->correct
+    rs_test test;
+    test.encode = rs_correct_encode;  // first pass: libcorrect encodes, libfec decodes
+    test.encoder = rs;
+    test.decode = rs_fec_decode;
+    test.decoder = fec_rs;
+
+    print_test_type(block_length, test_msg_length, num_errors, num_erasures);
+    for (size_t i = 0; i < num_iterations; i++) {
+        rs_test_run run = test_rs_errors(&test, testbench, test_msg_length,
+                                         num_errors, num_erasures);
+        if (!run.output_matches) {
+            fail_test();  // exits the process on the first mismatch
+        }
+    }
+
+    test.encode = rs_fec_encode;  // second pass: libfec encodes, libcorrect decodes
+    test.encoder = fec_rs;
+    test.decode = rs_correct_decode;
+    test.decoder = rs;
+    for (size_t i = 0; i < num_iterations; i++) {
+        rs_test_run run = test_rs_errors(&test, testbench, test_msg_length,
+                                         num_errors, num_erasures);
+        if (!run.output_matches) {
+            fail_test();
+        }
+    }
+    pass_test();  // reached only if every iteration in both directions matched
+}
+
+int main() {  // interop test between libcorrect and libfec for min_distance 32 and 16, with and without padding
+    srand(time(NULL));  // time-seeded: test data differs from run to run
+
+    size_t block_length = 255;
+    size_t min_distance = 32;
+    size_t message_length = block_length - min_distance;
+
+    size_t pad_length;
+    void *fec_rs;
+
+    correct_reed_solomon *rs = correct_reed_solomon_create(
+        correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance);
+    rs_testbench *testbench = rs_testbench_create(block_length, min_distance);
+
+    pad_length = message_length / 2;  // shortened-code case: half the message is padding
+    fec_rs = init_rs_char(8, correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance,  // fec object is rebuilt per pad_length; same polynomial and 1, 1 args as rs
+                          pad_length);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,  // clean channel
+              0, 0, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,  // errors only: 2*errors == min_distance
+              min_distance / 2, 0, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,  // erasures only: erasures == min_distance
+              0, min_distance, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,  // mixed: 2*errors + erasures == min_distance
+              min_distance / 4, min_distance / 2, 20000);
+    free_rs_char(fec_rs);
+
+    pad_length = 0;  // full-length case: no padding
+    fec_rs = init_rs_char(8, correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance,
+                          pad_length);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              0, 0, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              min_distance / 2, 0, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              0, min_distance, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              min_distance / 4, min_distance / 2, 20000);
+    free_rs_char(fec_rs);
+
+    rs_testbench_destroy(testbench);
+    correct_reed_solomon_destroy(rs);
+
+    min_distance = 16;  // repeat the whole matrix with a smaller min_distance
+    message_length = block_length - min_distance;
+    rs = correct_reed_solomon_create(
+        correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance);
+    testbench = rs_testbench_create(block_length, min_distance);
+
+    pad_length = message_length / 2;
+    fec_rs = init_rs_char(8, correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance,
+                          pad_length);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              0, 0, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              min_distance / 2, 0, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              0, min_distance, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              min_distance / 4, min_distance / 2, 20000);
+    free_rs_char(fec_rs);
+
+    pad_length = 0;
+    fec_rs = init_rs_char(8, correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance,
+                          pad_length);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              0, 0, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              min_distance / 2, 0, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              0, min_distance, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              min_distance / 4, min_distance / 2, 20000);
+    free_rs_char(fec_rs);
+
+    rs_testbench_destroy(testbench);
+    correct_reed_solomon_destroy(rs);
+
+    printf("test passed\n");  // reached only if no run_tests() call hit fail_test()
+    return 0;
+}
diff --git a/libcorrect/tests/reed-solomon-shim-interop.c b/libcorrect/tests/reed-solomon-shim-interop.c
new file mode 100644
index 0000000..9ee52fc
--- /dev/null
+++ b/libcorrect/tests/reed-solomon-shim-interop.c
@@ -0,0 +1,138 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include <time.h>
+
+#include "rs_tester.h"
+#include "rs_tester_fec_shim.h"
+
+void print_test_type(size_t block_length, size_t message_length,
+                     size_t num_errors, size_t num_erasures) {
+    // Scenario banner; no newline, so the PASSED/FAILED verdict lands on the same line.
+    fprintf(stdout,
+            "testing reed solomon block length=%zu, message length=%zu, errors=%zu, erasures=%zu...",
+            block_length, message_length, num_errors, num_erasures);
+}
+
+void fail_test() {
+    puts("FAILED");  // same bytes as printf("FAILED\n"): puts appends the newline
+    exit(1);         // stop the whole test program with a failing status
+}
+
+void pass_test() { puts("PASSED"); }  // verdict line for a scenario that passed
+
+void run_tests(correct_reed_solomon *rs, void *fec_rs, rs_testbench *testbench,
+               size_t block_length, size_t test_msg_length, size_t num_errors,
+               size_t num_erasures, size_t num_iterations) {
+    // Checks interoperability both ways (correct->fec, then fec->correct).
+    // Any round trip whose output does not match exits through fail_test().
+    rs_test direction;
+    size_t remaining;
+
+    print_test_type(block_length, test_msg_length, num_errors, num_erasures);
+
+    // direction 1: libcorrect encodes, the fec shim decodes
+    direction.encode = rs_correct_encode;
+    direction.encoder = rs;
+    direction.decode = rs_fec_decode;
+    direction.decoder = fec_rs;
+    for (remaining = num_iterations; remaining > 0; remaining--) {
+        rs_test_run outcome = test_rs_errors(&direction, testbench, test_msg_length,
+                                             num_errors, num_erasures);
+        if (!outcome.output_matches) fail_test();
+    }
+
+    // direction 2: the fec shim encodes, libcorrect decodes
+    direction.encode = rs_fec_encode;
+    direction.encoder = fec_rs;
+    direction.decode = rs_correct_decode;
+    direction.decoder = rs;
+    for (remaining = num_iterations; remaining > 0; remaining--) {
+        rs_test_run outcome = test_rs_errors(&direction, testbench, test_msg_length,
+                                             num_errors, num_erasures);
+        if (!outcome.output_matches) fail_test();
+    }
+    pass_test();
+}
+
+int main() {
+    srand(time(NULL));
+    // Interop driver: each run_tests call below checks one scenario in both directions.
+    size_t block_length = 255;
+    size_t min_distance = 32;
+    size_t message_length = block_length - min_distance;
+
+    size_t pad_length;
+    void *fec_rs;
+
+    correct_reed_solomon *rs = correct_reed_solomon_create(
+        correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance);
+    rs_testbench *testbench = rs_testbench_create(block_length, min_distance);
+    // first pass, padded codec: half of the message is padding, so messages are half length
+    pad_length = message_length / 2;
+    fec_rs = init_rs_char(8, correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance,
+                          pad_length);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              0, 0, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              min_distance / 2, 0, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              0, min_distance, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              min_distance / 4, min_distance / 2, 20000);
+    free_rs_char(fec_rs);
+    // first pass, unpadded codec: the same four scenarios with full-length messages
+    pad_length = 0;
+    fec_rs = init_rs_char(8, correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance,
+                          pad_length);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              0, 0, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              min_distance / 2, 0, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              0, min_distance, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              min_distance / 4, min_distance / 2, 20000);
+    free_rs_char(fec_rs);
+
+    rs_testbench_destroy(testbench);
+    correct_reed_solomon_destroy(rs);
+    // second pass: rebuild the codec and testbench for min_distance = 16
+    min_distance = 16;
+    message_length = block_length - min_distance;
+    rs = correct_reed_solomon_create(
+        correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance);
+    testbench = rs_testbench_create(block_length, min_distance);
+    // second pass, padded codec
+    pad_length = message_length / 2;
+    fec_rs = init_rs_char(8, correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance,
+                          pad_length);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              0, 0, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              min_distance / 2, 0, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              0, min_distance, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              min_distance / 4, min_distance / 2, 20000);
+    free_rs_char(fec_rs);
+    // second pass, unpadded codec
+    pad_length = 0;
+    fec_rs = init_rs_char(8, correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance,
+                          pad_length);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              0, 0, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              min_distance / 2, 0, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              0, min_distance, 20000);
+    run_tests(rs, fec_rs, testbench, block_length, message_length - pad_length,
+              min_distance / 4, min_distance / 2, 20000);
+    free_rs_char(fec_rs);
+
+    rs_testbench_destroy(testbench);
+    correct_reed_solomon_destroy(rs);
+
+    printf("test passed\n");
+    return 0;
+}
diff --git a/libcorrect/tests/reed-solomon.c b/libcorrect/tests/reed-solomon.c
new file mode 100644
index 0000000..1615c50
--- /dev/null
+++ b/libcorrect/tests/reed-solomon.c
@@ -0,0 +1,146 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include <time.h>
+
+#include "rs_tester.h"
+
+void print_test_type(size_t block_length, size_t message_length,
+                     size_t num_errors, size_t num_erasures) {
+    // Scenario banner; no newline, so the PASSED/FAILED verdict lands on the same line.
+    fprintf(stdout,
+            "testing reed solomon block length=%zu, message length=%zu, errors=%zu, erasures=%zu...",
+            block_length, message_length, num_errors, num_erasures);
+}
+
+void fail_test() {
+    puts("FAILED");  // same bytes as printf("FAILED\n"): puts appends the newline
+    exit(1);         // stop the whole test program with a failing status
+}
+
+void pass_test() { puts("PASSED"); }  // verdict line for a scenario that passed
+
+void run_tests(correct_reed_solomon *rs, rs_testbench *testbench,
+               size_t block_length, size_t test_msg_length, size_t num_errors,
+               size_t num_erasures, size_t num_iterations) {
+    // Round-trips num_iterations random messages through the libcorrect
+    // encoder and decoder; the first mismatch exits through fail_test().
+    rs_test harness;
+    harness.encoder = rs;
+    harness.decoder = rs;
+    harness.encode = rs_correct_encode;
+    harness.decode = rs_correct_decode;
+    print_test_type(block_length, test_msg_length, num_errors, num_erasures);
+    for (size_t left = num_iterations; left > 0; left--) {
+        rs_test_run outcome = test_rs_errors(&harness, testbench, test_msg_length,
+                                             num_errors, num_erasures);
+        if (!outcome.output_matches) fail_test();
+    }
+    pass_test();
+}
+
+int main() {
+    srand(time(NULL));
+    // Self-test driver: the same eight scenarios are run for min_distance 32, 16, 8 and 4.
+    size_t block_length = 255;
+    size_t min_distance = 32;
+    size_t message_length = block_length - min_distance;
+
+    correct_reed_solomon *rs = correct_reed_solomon_create(
+        correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance);
+    rs_testbench *testbench = rs_testbench_create(block_length, min_distance);
+    // scenarios alternate half- and full-length messages: clean, errors only, erasures only, mixed
+    run_tests(rs, testbench, block_length, message_length / 2, 0, 0, 20000);
+    run_tests(rs, testbench, block_length, message_length, 0, 0, 20000);
+    run_tests(rs, testbench, block_length, message_length / 2, min_distance / 2,
+              0, 20000);
+    run_tests(rs, testbench, block_length, message_length, min_distance / 2, 0,
+              20000);
+    run_tests(rs, testbench, block_length, message_length / 2, 0, min_distance,
+              20000);
+    run_tests(rs, testbench, block_length, message_length, 0, min_distance,
+              20000);
+    run_tests(rs, testbench, block_length, message_length / 2, min_distance / 4,
+              min_distance / 2, 20000);
+    run_tests(rs, testbench, block_length, message_length, min_distance / 4,
+              min_distance / 2, 20000);
+
+    rs_testbench_destroy(testbench);
+    correct_reed_solomon_destroy(rs);
+    // rebuild the codec and testbench for min_distance = 16
+    min_distance = 16;
+    message_length = block_length - min_distance;
+    rs = correct_reed_solomon_create(
+        correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance);
+    testbench = rs_testbench_create(block_length, min_distance);
+
+    run_tests(rs, testbench, block_length, message_length / 2, 0, 0, 20000);
+    run_tests(rs, testbench, block_length, message_length, 0, 0, 20000);
+    run_tests(rs, testbench, block_length, message_length / 2, min_distance / 2,
+              0, 20000);
+    run_tests(rs, testbench, block_length, message_length, min_distance / 2, 0,
+              20000);
+    run_tests(rs, testbench, block_length, message_length / 2, 0, min_distance,
+              20000);
+    run_tests(rs, testbench, block_length, message_length, 0, min_distance,
+              20000);
+    run_tests(rs, testbench, block_length, message_length / 2, min_distance / 4,
+              min_distance / 2, 20000);
+    run_tests(rs, testbench, block_length, message_length, min_distance / 4,
+              min_distance / 2, 20000);
+
+    rs_testbench_destroy(testbench);
+    correct_reed_solomon_destroy(rs);
+    // rebuild the codec and testbench for min_distance = 8
+    min_distance = 8;
+    message_length = block_length - min_distance;
+    rs = correct_reed_solomon_create(
+        correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance);
+    testbench = rs_testbench_create(block_length, min_distance);
+
+    run_tests(rs, testbench, block_length, message_length / 2, 0, 0, 20000);
+    run_tests(rs, testbench, block_length, message_length, 0, 0, 20000);
+    run_tests(rs, testbench, block_length, message_length / 2, min_distance / 2,
+              0, 20000);
+    run_tests(rs, testbench, block_length, message_length, min_distance / 2, 0,
+              20000);
+    run_tests(rs, testbench, block_length, message_length / 2, 0, min_distance,
+              20000);
+    run_tests(rs, testbench, block_length, message_length, 0, min_distance,
+              20000);
+    run_tests(rs, testbench, block_length, message_length / 2, min_distance / 4,
+              min_distance / 2, 20000);
+    run_tests(rs, testbench, block_length, message_length, min_distance / 4,
+              min_distance / 2, 20000);
+
+    rs_testbench_destroy(testbench);
+    correct_reed_solomon_destroy(rs);
+    // rebuild the codec and testbench for min_distance = 4
+    min_distance = 4;
+    message_length = block_length - min_distance;
+    rs = correct_reed_solomon_create(
+        correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance);
+    testbench = rs_testbench_create(block_length, min_distance);
+
+    run_tests(rs, testbench, block_length, message_length / 2, 0, 0, 20000);
+    run_tests(rs, testbench, block_length, message_length, 0, 0, 20000);
+    run_tests(rs, testbench, block_length, message_length / 2, min_distance / 2,
+              0, 20000);
+    run_tests(rs, testbench, block_length, message_length, min_distance / 2, 0,
+              20000);
+    run_tests(rs, testbench, block_length, message_length / 2, 0, min_distance,
+              20000);
+    run_tests(rs, testbench, block_length, message_length, 0, min_distance,
+              20000);
+    run_tests(rs, testbench, block_length, message_length / 2, min_distance / 4,
+              min_distance / 2, 20000);
+    run_tests(rs, testbench, block_length, message_length, min_distance / 4,
+              min_distance / 2, 20000);
+
+    rs_testbench_destroy(testbench);
+    correct_reed_solomon_destroy(rs);
+
+    printf("test passed\n");
+    return 0;
+}
diff --git a/libcorrect/tests/rs_tester.c b/libcorrect/tests/rs_tester.c
new file mode 100644
index 0000000..217bd42
--- /dev/null
+++ b/libcorrect/tests/rs_tester.c
@@ -0,0 +1,102 @@
+#include "rs_tester.h"
+
+void shuffle(int *a, size_t len) {  // in-place Fisher-Yates shuffle of a[0..len) driven by rand()
+    for (size_t i = 0; i + 1 < len; i++) {  // "i + 1 < len" cannot wrap, so len 0 and 1 are no-ops
+        size_t j = (size_t)rand() % (len - i) + i;  // partner drawn from the unshuffled tail [i, len)
+        int temp = a[i];
+        a[i] = a[j];
+        a[j] = temp;
+    }
+}
+
+void rs_correct_encode(void *encoder, uint8_t *msg, size_t msg_length,
+                       uint8_t *msg_out) {
+    correct_reed_solomon *rs = (correct_reed_solomon *)encoder;  // callback receives the codec as void *
+    correct_reed_solomon_encode(rs, msg, msg_length, msg_out);
+}
+
+void rs_correct_decode(void *decoder, uint8_t *encoded, size_t encoded_length,
+                       uint8_t *erasure_locations, size_t erasure_length,
+                       uint8_t *msg, size_t pad_length, size_t num_roots) {
+    correct_reed_solomon *rs = (correct_reed_solomon *)decoder;  // pad_length and num_roots are unused here
+    correct_reed_solomon_decode_with_erasures(rs, encoded, encoded_length,
+                                              erasure_locations, erasure_length, msg);
+}
+
+rs_testbench *rs_testbench_create(size_t block_length, size_t min_distance) {
+    // Allocates the bench plus every scratch buffer a test run uses. The bench
+    // and msg are zeroed by calloc; the other buffers start uninitialised.
+    // Allocation results are not checked.
+    rs_testbench *tb = calloc(1, sizeof(rs_testbench));
+    const size_t payload_len = block_length - min_distance;
+
+    tb->block_length = block_length;
+    tb->min_distance = min_distance;
+    tb->message_length = payload_len;
+    // message-sized buffers
+    tb->msg = calloc(payload_len, sizeof(unsigned char));
+    tb->recvmsg = malloc(sizeof(unsigned char) * payload_len);
+    // block-sized buffers
+    tb->encoded = malloc(block_length * sizeof(uint8_t));
+    tb->corrupted_encoded = malloc(block_length * sizeof(uint8_t));
+    tb->indices = malloc(block_length * sizeof(int));
+    tb->erasure_locations = malloc(min_distance * sizeof(uint8_t));  // min_distance slots
+    return tb;
+}
+
+void rs_testbench_destroy(rs_testbench *testbench) {  // testbench is dereferenced: must not be NULL
+    free(testbench->recvmsg);
+    free(testbench->erasure_locations);
+    free(testbench->corrupted_encoded);
+    free(testbench->indices);
+    free(testbench->encoded);
+    free(testbench->msg);
+    free(testbench);  // the bench itself goes last, after every buffer it owns
+}
+
+rs_test_run test_rs_errors(rs_test *test, rs_testbench *testbench, size_t msg_length,
+                    size_t num_errors, size_t num_erasures) {
+    // Runs one randomized round trip: encode a random msg_length-byte message,
+    // corrupt num_erasures reported and num_errors unreported positions, decode,
+    // and set output_matches when the decoded bytes equal the original message.
+    rs_test_run run;
+    run.output_matches = false;
+    size_t block_length = msg_length + testbench->min_distance;
+    // Refuse requests the scratch buffers cannot hold: erasure_locations has
+    // min_distance slots and only block_length distinct positions exist.
+    if (msg_length > testbench->message_length ||
+        num_erasures > testbench->min_distance ||
+        num_errors > block_length - num_erasures) {
+        return run;
+    }
+    size_t pad_length = testbench->message_length - msg_length;
+
+    for (size_t i = 0; i < msg_length; i++) {
+        testbench->msg[i] = rand() % 256;
+    }
+    test->encode(test->encoder, testbench->msg, msg_length, testbench->encoded);
+    memcpy(testbench->corrupted_encoded, testbench->encoded, block_length);
+
+    // pick the positions to corrupt by shuffling the identity permutation
+    for (size_t i = 0; i < block_length; i++) {
+        testbench->indices[i] = (int)i;
+    }
+    shuffle(testbench->indices, block_length);
+
+    for (size_t i = 0; i < num_erasures; i++) {
+        int index = testbench->indices[i];
+        uint8_t corruption_mask = (rand() % 255) + 1;  // 1..255: the byte always changes
+        testbench->corrupted_encoded[index] ^= corruption_mask;
+        testbench->erasure_locations[i] = index;
+    }
+    for (size_t i = 0; i < num_errors; i++) {
+        int index = testbench->indices[i + num_erasures];
+        uint8_t corruption_mask = (rand() % 255) + 1;
+        testbench->corrupted_encoded[index] ^= corruption_mask;
+    }
+
+    test->decode(test->decoder, testbench->corrupted_encoded, block_length,
+                 testbench->erasure_locations, num_erasures,
+                 testbench->recvmsg, pad_length, testbench->min_distance);
+    run.output_matches = (bool)(memcmp(testbench->msg, testbench->recvmsg, msg_length) == 0);
+    return run;
+}
diff --git a/libcorrect/tests/rs_tester_fec.c b/libcorrect/tests/rs_tester_fec.c
new file mode 100644
index 0000000..dcb49f6
--- /dev/null
+++ b/libcorrect/tests/rs_tester_fec.c
@@ -0,0 +1,30 @@
+#include "rs_tester_fec.h"
+void rs_fec_encode(void *encoder, uint8_t *msg, size_t msg_length,
+                   uint8_t *msg_out) {
+    // XXX the pad length the encoder was built with must correspond to msg_length
+    uint8_t *parity = msg_out + msg_length;  // encoder output goes right after the copied message
+    memcpy(msg_out, msg, msg_length);
+    encode_rs_char(encoder, msg_out, parity);
+}
+
+void rs_fec_decode(void *decoder, uint8_t *encoded, size_t encoded_length,
+                   uint8_t *erasure_locations, size_t erasure_length,
+                   uint8_t *msg, size_t pad_length, size_t num_roots) {
+    // Decode callback for the fec codec: erasure positions are shifted by
+    // pad_length, `encoded` is decoded in place, and everything except its
+    // trailing num_roots bytes is copied to msg.
+    // XXX the pad length the decoder was built with must match encoded_length
+    static size_t locations_len = 0;  // capacity of the cached buffer below
+    static int *locations = NULL;     // grown on demand and reused across calls
+    if (locations_len < erasure_length) {
+        int *grown = realloc(locations, erasure_length * sizeof(int));
+        if (!grown) abort();  // never write through a failed realloc
+        locations = grown;
+        locations_len = erasure_length;
+    }
+    for (size_t i = 0; i < erasure_length; i++) {
+        locations[i] = (unsigned int)(erasure_locations[i]) + pad_length;
+    }
+    decode_rs_char(decoder, encoded, erasure_length ? locations : NULL, erasure_length);
+    memcpy(msg, encoded, encoded_length - num_roots);
+}
diff --git a/libcorrect/tests/rs_tester_fec_shim.c b/libcorrect/tests/rs_tester_fec_shim.c
new file mode 100644
index 0000000..3e49a69
--- /dev/null
+++ b/libcorrect/tests/rs_tester_fec_shim.c
@@ -0,0 +1,26 @@
+#include "rs_tester_fec_shim.h"
+void rs_fec_encode(void *encoder, uint8_t *msg, size_t msg_length,
+                   uint8_t *msg_out) {
+    // XXX the pad length the encoder was built with must correspond to msg_length
+    uint8_t *parity = msg_out + msg_length;  // encoder output goes right after the copied message
+    memcpy(msg_out, msg, msg_length);
+    encode_rs_char(encoder, msg_out, parity);
+}
+
+void rs_fec_decode(void *decoder, uint8_t *encoded, size_t encoded_length,
+                   uint8_t *erasure_locations, size_t erasure_length,
+                   uint8_t *msg, size_t pad_length, size_t num_roots) {
+    // Decode callback for the fec shim: shifts erasure positions by pad_length, decodes
+    // in place. XXX the decoder's pad length must correspond to encoded_length
+    int *locations = NULL;
+    if (erasure_length) {
+        locations = malloc(erasure_length * sizeof(int));
+        if (!locations) abort();  // do not write through a failed allocation
+    }
+    for (size_t i = 0; i < erasure_length; i++) {
+        locations[i] = (unsigned int)(erasure_locations[i]) + pad_length;
+    }
+    decode_rs_char(decoder, encoded, locations, erasure_length);
+    free(locations);
+    memcpy(msg, encoded, encoded_length - num_roots);
+}
diff --git a/libcorrect/tools/CMakeLists.txt b/libcorrect/tools/CMakeLists.txt
new file mode 100644
index 0000000..f86990a
--- /dev/null
+++ b/libcorrect/tools/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Developer tools. Every executable is EXCLUDE_FROM_ALL; build them via `tools`.
+add_executable(rs_find_primitive_poly EXCLUDE_FROM_ALL find_rs_primitive_poly.c)
+target_link_libraries(rs_find_primitive_poly correct_static)
+set(all_tools ${all_tools} rs_find_primitive_poly)
+
+# The libfec polynomial finder links against fec, so it is only defined with HAVE_LIBFEC.
+if(HAVE_LIBFEC)
+    add_executable(conv_find_libfec_poly EXCLUDE_FROM_ALL find_conv_libfec_poly.c)
+    target_link_libraries(conv_find_libfec_poly correct_static fec)
+    set(all_tools ${all_tools} conv_find_libfec_poly)
+endif()
+
+# Both optimizer tools embed the error simulator objects; use the SSE build when enabled.
+if(HAVE_SSE)
+    set(error_sim_objects $<TARGET_OBJECTS:error_sim_sse>)
+else()
+    set(error_sim_objects $<TARGET_OBJECTS:error_sim>)
+endif()
+
+add_executable(conv_find_optim_poly EXCLUDE_FROM_ALL find_conv_optim_poly.c ${error_sim_objects})
+target_link_libraries(conv_find_optim_poly correct_static)
+set(all_tools ${all_tools} conv_find_optim_poly)
+
+add_executable(conv_find_optim_poly_annealing EXCLUDE_FROM_ALL find_conv_optim_poly_annealing.c ${error_sim_objects})
+target_link_libraries(conv_find_optim_poly_annealing correct_static)
+set(all_tools ${all_tools} conv_find_optim_poly_annealing)
+
+# Umbrella target that builds every tool collected in all_tools.
+add_custom_target(tools DEPENDS ${all_tools})
diff --git a/libcorrect/tools/find_conv_libfec_poly.c b/libcorrect/tools/find_conv_libfec_poly.c
new file mode 100644
index 0000000..4d15824
--- /dev/null
+++ b/libcorrect/tools/find_conv_libfec_poly.c
@@ -0,0 +1,279 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <string.h>
+#include <time.h>
+#include <stddef.h>
+#include <assert.h>
+
+#include <correct.h>
+#include <fec.h>
+
+// this program allows us to find all of the polynomials that come with libfec
+// this way, we can provide compatibility with libfec-encoded streams and vice versa
+// we can do this without directly copy-pasting from libfec's source, thanks
+//   to this finder
+
+typedef struct {  // one libfec Viterbi decoder plus the entry points used to drive it
+    void *vit;  // decoder instance returned by a create_viterbi* call
+    int update_len;  // length argument handed to update()
+    int (*init)(void *, int);
+    int (*update)(void *, unsigned char *, int);
+    int (*chainback)(void *, unsigned char *, unsigned int, unsigned int);
+} libfec_decoder_t;
+
+void byte2bit(uint8_t *bytes, uint8_t *bits, size_t n_bits) {
+    // Expands packed bits (most significant bit of each byte first) into one
+    // output byte per bit: 255 for a set bit, 0 for a clear bit.
+    for (size_t bit = 0; bit < n_bits; bit++) {
+        const size_t byte_index = bit / 8;
+        const unsigned char mask = (unsigned char)(0x80u >> (bit % 8));
+        const int is_set = (bytes[byte_index] & mask) != 0;
+        bits[bit] = is_set ? 255 : 0;
+    }
+}
+
+correct_convolutional_polynomial_t *resize_poly_list(correct_convolutional_polynomial_t *polys, size_t cap) {
+    // thin realloc wrapper: cap counts elements, not bytes; the result is not checked here
+    return realloc(polys, cap * sizeof(*polys));
+}
+
+void find_poly_coeff(size_t rate, size_t order, uint8_t *msg, size_t msg_len, libfec_decoder_t libfec, correct_convolutional_polynomial_t **polys_dest, size_t *polys_len, size_t search_coeff) {
+    // find a single coefficient (index search_coeff) of an unknown convolutional polynomial
+    // we are given a payload to encode, and we'll test all possible coefficients to see
+    //    which ones yield correct decodings by libfec, which has some unknown polynomial "baked in"
+    // output: *polys_dest is a malloc'd ascending list of the *polys_len matches (NULL if none); caller frees
+
+    // temp poly (this will be the one we search with)
+    correct_convolutional_polynomial_t *poly = malloc(rate * sizeof(correct_convolutional_polynomial_t));
+
+    // what's the largest coefficient value we'll test?
+    correct_convolutional_polynomial_t maxcoeff = (1 << order) - 1;
+
+    // note that we start about half way in
+    // this sum asks that we have the
+    //   a) highest order bit set
+    //   b) lowest order bit set
+    // we're only interested in coefficient values for which this is
+    //   true because if it weren't, the coefficient would actually be
+    //   of a smaller order than its supposed given order
+    correct_convolutional_polynomial_t startcoeff = (1 << (order - 1)) + 1;
+
+    // only the coeff we're searching for really matters, but to be safe we set them all
+    for (size_t i = 0; i < rate; i++) {
+        poly[i] = startcoeff;
+    }
+
+    // create a dummy encoder so that we can find how long the resulting encoded value is
+    correct_convolutional *conv_dummy = correct_convolutional_create(rate, order, poly);
+    size_t enclen_bits = correct_convolutional_encode_len(conv_dummy, msg_len);
+    size_t enclen = (enclen_bits % 8) ? (enclen_bits / 8 + 1) : enclen_bits / 8;
+    correct_convolutional_destroy(conv_dummy);
+
+    // compact encoded format (this comes from libcorrect)
+    uint8_t *encoded = malloc(enclen * sizeof(uint8_t));
+    // soft encoded format (this goes to libfec, one byte per bit)
+    uint8_t *encoded_bits = malloc(enclen * 8 * sizeof(uint8_t));
+    // resulting decoded message which we'll compare to our given payload
+    uint8_t *msg_cmp = malloc(msg_len * sizeof(uint8_t));
+
+    // we keep a list of coefficients which yielded correct decodings
+    // there could be 0, 1, or more than 1, and we'll return all of them
+    // we'll dynamically resize this as we go
+    size_t polys_cap = 1;
+    *polys_len = 0;
+    correct_convolutional_polynomial_t *polys = NULL;
+    polys = resize_poly_list(polys, polys_cap);
+
+    // we step by 2 because we want the lowest order bit to stay set
+    for (correct_convolutional_polynomial_t i = startcoeff; i <= maxcoeff; i += 2) {
+        poly[search_coeff] = i;
+        correct_convolutional *conv = correct_convolutional_create(rate, order, poly);
+
+        correct_convolutional_encode(conv, (uint8_t*)msg, msg_len, encoded);
+        byte2bit(encoded, encoded_bits, enclen * 8);  // length is in bits: expand every encoded byte
+
+        // now erase all the bits we're not searching for
+        for (size_t bit = 0; bit < msg_len * 8; bit++) {
+            for (size_t j = 0; j < rate; j++) {
+                if (j != search_coeff) {
+                    // 128 is a soft erasure
+                    encoded_bits[bit * rate + j] = 128;
+                }
+            }
+        }
+
+        libfec.init(libfec.vit, 0);
+        libfec.update(libfec.vit, encoded_bits, libfec.update_len);
+        libfec.chainback(libfec.vit, msg_cmp, 8 * msg_len, 0);
+
+        correct_convolutional_destroy(conv);
+
+        if (memcmp(msg_cmp, msg, msg_len) == 0) {
+            // match found
+            // resize list to make room
+            if (*polys_len == polys_cap) {
+                polys = resize_poly_list(polys, polys_cap * 2);
+                polys_cap *= 2;
+            }
+            polys[*polys_len] = i;
+            *polys_len = *polys_len + 1;
+        }
+    }
+
+    // shrink to fit; with no matches hand back NULL rather than calling realloc(ptr, 0)
+    if (*polys_len == 0) {
+        free(polys);
+    }
+    *polys_dest = *polys_len ? resize_poly_list(polys, *polys_len) : NULL;
+    free(poly);
+    free(msg_cmp);
+    free(encoded);
+    free(encoded_bits);
+}
+
+// payload length in bytes. we choose 2 bytes because the payload must be
+// longer than the shift register under test. the largest one searched
+// below is order 15, so we need at least 15 bits.
+size_t msg_len = 2;
+
+void find_poly(size_t rate, size_t order, libfec_decoder_t libfec, correct_convolutional_polynomial_t *poly) {
+    // find the complete set of coefficients that are "baked in" to
+    //   one particular method of libfec
+    // most of this method is described by find_poly_coeff
+    // results are stored in poly[0..rate-1] and also printed in octal
+    // for each coeff we want to find, we'll generate random 2-byte payloads and give
+    //   them to find_poly_coeff. If find_poly_coeff returns an empty list, we
+    //   try again. If it returns a nonempty list, then we find the intersection of
+    //   all the coefficient values find_poly_coeff has given us so far (we start
+    //   with the complete set). we are finished when only one coeff value remains
+
+    // we perform this process for each coeff e.g. 6 times for a rate 1/6 polynomial
+
+    uint8_t msg[msg_len];
+
+    // this is the list returned to us by find_poly_coeff
+    correct_convolutional_polynomial_t *polys;
+    // the list's length is written here
+    size_t polys_len;
+
+    printf("rate 1/%zu order %zu poly:", rate, order);
+
+    for (size_t search_coeff = 0; search_coeff < rate; search_coeff++) {
+        correct_convolutional_polynomial_t *fit = NULL;
+        size_t fit_len = 0;
+        size_t fit_cap = 0;
+        bool done = false;
+
+        while (!done) {
+            for (size_t i = 0; i < msg_len; i++) {
+                msg[i] = rand() % 256;
+            }
+            find_poly_coeff(rate, order, msg, msg_len, libfec, &polys, &polys_len, search_coeff);
+
+            if (polys_len == 0) {
+                free(polys);  // none fit (special case): release the empty list before retrying
+                continue;
+            }
+
+            if (fit_len == 0) {
+                // the very first intersection
+                // we'll just copy the list handed to us
+                fit_cap = polys_len;
+                fit_len = polys_len;
+                fit = resize_poly_list(fit, fit_cap);
+                for (size_t i = 0; i < polys_len; i++) {
+                    fit[i] = polys[i];
+                }
+            } else {
+                // find intersection (indices are size_t to match the lengths they are compared with)
+                size_t polys_iter = 0;
+                size_t fit_iter = 0;
+                size_t new_fit_iter = 0;
+                // the lists generated by find_poly_coeff are sorted
+                // so we just retain the sorted property and walk both
+                while (polys_iter < polys_len && fit_iter < fit_len) {
+                    if (polys[polys_iter] < fit[fit_iter]) {
+                        polys_iter++;
+                    } else if (polys[polys_iter] > fit[fit_iter]) {
+                        fit_iter++;
+                    } else {
+                        fit[new_fit_iter] = fit[fit_iter];
+                        polys_iter++;
+                        fit_iter++;
+                        new_fit_iter++;
+                    }
+                }
+                // if new_fit_iter is 0 here then we don't intersect at all
+                // in this case we have to restart the search for this coeff
+                if (new_fit_iter != 0) {
+                    fit_len = new_fit_iter;
+                } else {
+                    free(fit);
+                    fit = NULL;
+                    fit_cap = 0;
+                    fit_len = 0;
+                }
+            }
+
+            free(polys);
+
+            if (fit_len == 1) {
+                poly[search_coeff] = fit[0];
+                if (order <= 9) {
+                    printf(" %04o", fit[0]);
+                } else {
+                    printf(" %06o", fit[0]);
+                }
+                done = true;
+            }
+        }
+
+        free(fit);
+    }
+    printf("\n");
+}
+
+int main() {
+    libfec_decoder_t libfec;
+    // find_poly draws its payloads from rand(), so seed it per run
+    srand(time(NULL));
+    // unbuffered stdout: find_poly prints each coefficient without a newline
+    setbuf(stdout, NULL);
+    // room for the largest rate searched below (6)
+    correct_convolutional_polynomial_t poly[6];
+    // rate 1/2, order 7; update_len is always the payload bits plus order - 1
+    libfec.vit = create_viterbi27(8 * msg_len);
+    libfec.update_len = 8 * msg_len + 6;
+    libfec.init = init_viterbi27;
+    libfec.update = update_viterbi27_blk;
+    libfec.chainback = chainback_viterbi27;
+    find_poly(2, 7, libfec, poly);
+    delete_viterbi27(libfec.vit);
+    // rate 1/2, order 9
+    libfec.vit = create_viterbi29(8 * msg_len);
+    libfec.update_len = 8 * msg_len + 8;
+    libfec.init = init_viterbi29;
+    libfec.update = update_viterbi29_blk;
+    libfec.chainback = chainback_viterbi29;
+    find_poly(2, 9, libfec, poly);
+    delete_viterbi29(libfec.vit);
+    // rate 1/3, order 9
+    libfec.vit = create_viterbi39(8 * msg_len);
+    libfec.update_len = 8 * msg_len + 8;
+    libfec.init = init_viterbi39;
+    libfec.update = update_viterbi39_blk;
+    libfec.chainback = chainback_viterbi39;
+    find_poly(3, 9, libfec, poly);
+    delete_viterbi39(libfec.vit);
+    // rate 1/6, order 15
+    libfec.vit = create_viterbi615(8 * msg_len);
+    libfec.update_len = 8 * msg_len + 14;
+    libfec.init = init_viterbi615;
+    libfec.update = update_viterbi615_blk;
+    libfec.chainback = chainback_viterbi615;
+    find_poly(6, 15, libfec, poly);
+    delete_viterbi615(libfec.vit);
+
+    return 0;
+}
diff --git a/libcorrect/tools/find_conv_optim_poly.c b/libcorrect/tools/find_conv_optim_poly.c
new file mode 100644
index 0000000..8b22574
--- /dev/null
+++ b/libcorrect/tools/find_conv_optim_poly.c
@@ -0,0 +1,330 @@
+#include <stdbool.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <stddef.h>
+#include <limits.h>
+#include <pthread.h>
+
+#if HAVE_SSE
+#include "correct/util/error-sim-sse.h"
+typedef correct_convolutional_sse conv_t;  // SSE build: the conv_* hooks below bind to the *_sse implementations
+static conv_t*(*conv_create)(size_t, size_t, const uint16_t *) = correct_convolutional_sse_create;
+static void(*conv_destroy)(conv_t *) = correct_convolutional_sse_destroy;
+static size_t(*conv_enclen)(void *, size_t) = conv_correct_sse_enclen;
+static void(*conv_encode)(void *, uint8_t *, size_t, uint8_t *) = conv_correct_sse_encode;
+static void(*conv_decode)(void *, uint8_t *, size_t, uint8_t *) = conv_correct_sse_decode;  // NOTE(review): conv_correct_sse_decode is defined returning ssize_t, not void - pointer type looks mismatched; confirm against error-sim-sse.h
+#else
+#include "correct/util/error-sim.h"
+typedef correct_convolutional conv_t;  // non-SSE build: same hook names, bound to the plain implementations
+static conv_t*(*conv_create)(size_t, size_t, const uint16_t *) = correct_convolutional_create;
+static void(*conv_destroy)(conv_t *) = correct_convolutional_destroy;
+static size_t(*conv_enclen)(void *, size_t) = conv_correct_enclen;
+static void(*conv_encode)(void *, uint8_t *, size_t, uint8_t *) = conv_correct_encode;
+static void(*conv_decode)(void *, uint8_t *, size_t, uint8_t *) = conv_correct_decode;  // NOTE(review): presumably the same return-type question as the SSE variant; confirm against error-sim.h
+#endif
+
+typedef struct {  // a codec paired with the polynomials it was created from
+    conv_t *conv;  // created by conv_create() from poly
+    correct_convolutional_polynomial_t *poly;  // heap array of `rate` polynomials
+} conv_tester_t;
+
+typedef struct {  // one candidate polynomial set together with its measured results
+    int *distances;  // one accumulator per testbench (num_scratches entries), summed from test_conv_noise()
+    float cost;  // weighted sum of distances, filled in by search_exhaustive_fin()
+    correct_convolutional_polynomial_t *poly;  // heap array of `rate` polynomials
+} conv_result_t;
+
+// qsort comparator: orders by ascending cost and returns 0 for ties so the ordering stays consistent.
+int compare_conv_results(const void *avoid, const void *bvoid) {
+    const conv_result_t *a = (const conv_result_t *)avoid;
+    const conv_result_t *b = (const conv_result_t *)bvoid;
+    if (a->cost > b->cost) {
+        return 1;
+    }
+    return (a->cost < b->cost) ? -1 : 0;
+}
+
+typedef struct {  // work order handed to one search_exhaustive_thread
+    size_t rate;  // forwarded to conv_create
+    size_t order;  // forwarded to conv_create
+    conv_result_t *items;  // candidate list; the same array is given to every thread
+    size_t items_len;  // number of entries in items
+    conv_testbench *scratch;  // testbench assigned to this thread (scratches[i])
+    uint8_t *msg;  // message block to push through the testbench
+    size_t msg_len;  // length of msg in bytes
+    size_t test_offset;  // index into each item's distances[] that this thread adds to
+    double bpsk_voltage;  // forwarded to test_conv_noise
+} exhaustive_thread_args;
+
+// Thread body: scores each candidate in items[] on this thread's testbench, adding to distances[test_offset].
+void *search_exhaustive_thread(void *vargs) {
+    exhaustive_thread_args *job = (exhaustive_thread_args *)vargs;
+    for (size_t n = 0; n < job->items_len; n++) {
+        conv_t *codec = conv_create(job->rate, job->order, job->items[n].poly);
+        job->scratch->encoder = codec;
+        job->scratch->decoder = codec;
+        job->scratch->encode = conv_encode;
+        job->scratch->decode = conv_decode;
+        job->items[n].distances[job->test_offset] += test_conv_noise(job->scratch, job->msg, job->msg_len, job->bpsk_voltage);
+        conv_destroy(codec);
+    }
+    pthread_exit(NULL);
+}
+
+// Scores every item on every testbench in parallel: one thread per testbench, thread i
+// adding into distances[i] of each item. Returns once all threads have been joined.
+void search_exhaustive(size_t rate, size_t order,
+                       size_t n_bytes, uint8_t *msg,
+                       conv_testbench **scratches, size_t num_scratches,
+                       float *weights,
+                       conv_result_t *items,
+                       size_t items_len, double bpsk_voltage) {
+    exhaustive_thread_args *args = malloc(num_scratches * sizeof(exhaustive_thread_args));
+    pthread_t *threads = malloc(num_scratches * sizeof(pthread_t));
+    for (size_t i = 0; i < num_scratches; i++) {
+        args[i].rate = rate;
+        args[i].order = order;
+        args[i].items = items;
+        args[i].items_len = items_len;
+        args[i].scratch = scratches[i];
+        args[i].msg = msg;
+        args[i].msg_len = n_bytes;
+        args[i].test_offset = i;
+        args[i].bpsk_voltage = bpsk_voltage;
+        // NULL attributes already give a joinable thread; the previous pthread_attr_t was
+        // initialised on every iteration and never destroyed
+        if (pthread_create(&threads[i], NULL, search_exhaustive_thread, &args[i]) != 0) {
+            fprintf(stderr, "search_exhaustive: pthread_create failed\n");
+            exit(EXIT_FAILURE);  // joining a thread that was never created would be undefined
+        }
+    }
+
+    for (size_t i = 0; i < num_scratches; i++) {
+        pthread_join(threads[i], NULL);
+    }
+    free(args);
+    free(threads);
+}
+
+// Resets the first num_scratches distance counters of every item to zero before a new run.
+void search_exhaustive_init(conv_result_t *items, size_t items_len, size_t num_scratches) {
+    size_t item_idx = 0;
+    while (item_idx < items_len) {
+        int *counters = items[item_idx++].distances;
+        for (size_t k = 0; k < num_scratches; k++) { counters[k] = 0; }
+    }
+}
+
+// Folds each item's distances into one weighted cost (sum of weights[j] * distances[j]),
+// then sorts the items in place with compare_conv_results.
+void search_exhaustive_fin(conv_result_t *items, size_t items_len,
+                           float *weights, size_t weights_len) {
+    for (size_t n = 0; n < items_len; n++) {
+        conv_result_t *entry = &items[n];
+        entry->cost = 0;
+        for (size_t w = 0; w < weights_len; w++) { entry->cost += weights[w] * entry->distances[w]; }
+    }
+    qsort(items, items_len, sizeof(conv_result_t), compare_conv_results);
+}
+
+const size_t max_block_len = 16384;  // bytes per message block; also the size of the msg buffer in test()
+const size_t max_msg_len = 50000000;  // cap on n_bytes; once reached, test() stops doubling and keeps the top 20
+
+// Exhaustive search: builds every odd polynomial combination for (rate, order), scores them all,
+// then keeps the better half on a longer message until at most 20 remain, and prints those.
+void test(size_t rate, size_t order,
+          conv_tester_t start, conv_testbench **scratches,
+          size_t num_scratches, float *weights,
+          size_t n_bytes, double *eb_n0,
+          double bpsk_bit_energy, size_t n_iter,
+          double bpsk_voltage) {
+    uint8_t *msg = malloc(max_block_len * sizeof(uint8_t));
+    correct_convolutional_polynomial_t maxcoeff = (1 << order) - 1;
+    correct_convolutional_polynomial_t startcoeff = (1 << (order - 1)) + 1;
+    size_t num_polys = (maxcoeff - startcoeff) / 2 + 1;
+    size_t convs_len = 1;
+    for (size_t i = 0; i < rate; i++) {
+        convs_len *= num_polys;
+    }
+
+    conv_result_t *exhaustive = malloc(convs_len * sizeof(conv_result_t));
+    correct_convolutional_polynomial_t *iter_poly = malloc(rate * sizeof(correct_convolutional_polynomial_t));
+
+    for (size_t i = 0; i < rate; i++) {
+        iter_poly[i] = startcoeff;
+    }
+
+    // init exhaustive with all polys
+    for (size_t i = 0; i < convs_len; i++) {
+        exhaustive[i].poly = malloc(rate * sizeof(correct_convolutional_polynomial_t));
+        exhaustive[i].distances = calloc(num_scratches, sizeof(int));
+        exhaustive[i].cost = 0;
+        memcpy(exhaustive[i].poly, iter_poly, rate * sizeof(correct_convolutional_polynomial_t));
+        // this next loop adds 2 with "carry"
+        for (size_t j = 0; j < rate; j++) {
+            if (iter_poly[j] < maxcoeff) {
+                iter_poly[j] += 2;
+                // no more carries to propagate
+                break;
+            } else {
+                iter_poly[j] = startcoeff;
+            }
+        }
+    }
+    free(iter_poly);
+    size_t scored_bytes = n_bytes;  // message length the current distances were measured over
+    while (convs_len > 20) {
+        size_t bytes_remaining = n_bytes;
+        scored_bytes = n_bytes;  // remember it: n_bytes is doubled below, before the results are printed
+        // call init(), which sets all the error metrics to 0 for our new run
+        search_exhaustive_init(exhaustive, convs_len, num_scratches);
+
+        while (bytes_remaining) {
+            // in order to keep memory usage constant, we separate the msg into
+            // blocks and send each one through
+            // each time we do this, we have to calculate a new noise for each
+            // testbench
+
+            size_t block_len = (max_block_len < bytes_remaining) ? max_block_len : bytes_remaining;
+            bytes_remaining -= block_len;
+
+            for (unsigned int j = 0; j < block_len; j++) {
+                msg[j] = rand() % 256;
+            }
+
+            for (size_t i = 0; i < num_scratches; i++) {
+                scratches[i] = resize_conv_testbench(scratches[i], conv_enclen, start.conv, block_len);
+                build_white_noise(scratches[i]->noise, scratches[i]->enclen, eb_n0[i], bpsk_bit_energy);
+            }
+
+            search_exhaustive(rate, order,
+                              block_len, msg, scratches, num_scratches, weights,
+                              exhaustive, convs_len, bpsk_voltage);
+        }
+
+        // call fin(), which calculates a cost metric for all of the distances
+        // added by our msg block iterations and then sorts by this metric
+        search_exhaustive_fin(exhaustive, convs_len, weights, num_scratches);
+
+        // decide parameters for next loop iter
+        // if we've reduced to 20 or fewer items, we're going to just select
+        // those and declare the test done
+        size_t new_convs_len = (convs_len / 2) < 20 ? 20 : convs_len / 2;
+
+        // normally we'll double the message length each time we halve
+        // the number of entries so that each iter takes roughly the
+        // same time but has twice the resolution of the previous run.
+        //
+        // however, if we've reached max_msg_len, then we assume that
+        // the error stats collected are likely converged to whatever
+        // final value they'll take, and adding more length will not
+        // help us get better metrics. if we're at that point, then
+        // we just select the top 20 items and declare them winners
+        if (n_bytes >= max_msg_len) {
+            // converged case
+            new_convs_len = 20;
+        } else {
+            // increase our error metric resolution next run
+            n_bytes *= 2;
+            n_bytes = (n_bytes < max_msg_len) ? n_bytes : max_msg_len;
+        }
+        for (size_t i = new_convs_len; i < convs_len; i++) {
+            // these entries lost, free their memory here
+            free(exhaustive[i].poly);
+            free(exhaustive[i].distances);
+        }
+        convs_len = new_convs_len;
+        printf("exhaustive run: %zu items remain\n", convs_len);
+    }
+
+    for (size_t i = 0; i < convs_len; i++) {
+        for (size_t j = 0; j < rate; j++) {
+            printf(" %06o", exhaustive[i].poly[j]);
+        }
+        printf(":");
+        for (size_t j = 0; j < num_scratches; j++) {
+            printf(" %.2e@%.1fdB", exhaustive[i].distances[j]/((float)scored_bytes * 8), eb_n0[j]);
+        }
+        printf("\n");
+    }
+
+    for (size_t i = 0; i < convs_len; i++) {
+        free(exhaustive[i].poly);
+        free(exhaustive[i].distances);
+    }
+    free(exhaustive);
+    free(msg);
+}
+
+// Usage: find_conv_optim_poly <rate> <order> <n_bytes> <n_iter>
+// Sets up four testbenches (one per Eb/N0 point) and runs the exhaustive polynomial search.
+int main(int argc, char **argv) {
+    if (argc < 5) {
+        fprintf(stderr, "usage: find_conv_optim_poly rate order n_bytes n_iter\n");
+        return 1;
+    }
+    srand(time(NULL));
+    size_t rate, order, n_bytes, n_iter;
+    if (sscanf(argv[1], "%zu", &rate) != 1 || sscanf(argv[2], "%zu", &order) != 1 ||
+        sscanf(argv[3], "%zu", &n_bytes) != 1 || sscanf(argv[4], "%zu", &n_iter) != 1) {
+        fprintf(stderr, "rate, order, n_bytes and n_iter must be unsigned integers\n");
+        return 1;
+    }
+
+    double bpsk_voltage = 1.0/sqrt(2.0);
+    double bpsk_sym_energy = 2*pow(bpsk_voltage, 2.0);
+    double bpsk_bit_energy = bpsk_sym_energy * rate;  // rate bits transmitted for every input bit
+    correct_convolutional_polynomial_t maxcoeff = (1 << order) - 1;
+    correct_convolutional_polynomial_t startcoeff = (1 << (order - 1)) + 1;
+
+    // seed codec: its conv handle is what the testbenches are sized with
+    conv_tester_t start;
+    start.poly = malloc(rate * sizeof(correct_convolutional_polynomial_t));
+    for (size_t i = 0; i < rate; i++) {
+        start.poly[i] = ((maxcoeff - startcoeff) / 2) + startcoeff + 1;
+    }
+    start.conv = conv_create(rate, order, start.poly);
+
+    size_t num_scratches = 4;
+    conv_testbench **scratches = malloc(num_scratches * sizeof(conv_testbench *));
+    for (size_t i = 0; i < num_scratches; i++) {
+        scratches[i] = resize_conv_testbench(NULL, conv_enclen, start.conv, max_block_len);
+    }
+
+    // eb_n0 and weights are real arrays in main's scope. They used to point at compound
+    // literals created inside the switch body; those have automatic storage tied to that
+    // block, so the pointers were dangling by the time test() read them.
+    float weights[4] = {8000, 400, 20, 1};
+    double eb_n0[4];
+    double top_eb_n0;  // highest Eb/N0 for this order; the remaining points step down by 0.5
+    switch (order) {
+        case 6:
+            top_eb_n0 = 6.0;
+            break;
+        case 7:
+            top_eb_n0 = 5.5;
+            break;
+        case 8:
+        case 9:
+            top_eb_n0 = 5.0;
+            break;
+        default:
+            top_eb_n0 = 4.5;
+    }
+    for (size_t i = 0; i < num_scratches; i++) {
+        eb_n0[i] = top_eb_n0 - 0.5 * i;
+    }
+
+    test(rate, order, start, scratches, num_scratches, weights, n_bytes, eb_n0, bpsk_bit_energy, n_iter, bpsk_voltage);
+    free(start.poly);
+    conv_destroy(start.conv);
+    for (size_t i = 0; i < num_scratches; i++) {
+        free_scratch(scratches[i]);
+    }
+    free(scratches);
+    return 0;
+}
diff --git a/libcorrect/tools/find_conv_optim_poly_annealing.c b/libcorrect/tools/find_conv_optim_poly_annealing.c
new file mode 100644
index 0000000..a4f0203
--- /dev/null
+++ b/libcorrect/tools/find_conv_optim_poly_annealing.c
@@ -0,0 +1,350 @@
+#include <stdbool.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <stddef.h>
+#include <limits.h>
+#include <pthread.h>
+#include <signal.h>
+
+#if HAVE_SSE
+#include "correct/util/error-sim-sse.h"
+typedef correct_convolutional_sse conv_t;  // SSE build: the conv_* hooks below bind to the *_sse implementations
+static conv_t*(*conv_create)(size_t, size_t, const uint16_t *) = correct_convolutional_sse_create;
+static void(*conv_destroy)(conv_t *) = correct_convolutional_sse_destroy;
+static size_t(*conv_enclen)(void *, size_t) = conv_correct_sse_enclen;
+static void(*conv_encode)(void *, uint8_t *, size_t, uint8_t *) = conv_correct_sse_encode;
+static void(*conv_decode)(void *, uint8_t *, size_t, uint8_t *) = conv_correct_sse_decode;  // NOTE(review): conv_correct_sse_decode is defined returning ssize_t, not void - pointer type looks mismatched; confirm against error-sim-sse.h
+#else
+#include "correct/util/error-sim.h"
+typedef correct_convolutional conv_t;  // non-SSE build: same hook names, bound to the plain implementations
+static conv_t*(*conv_create)(size_t, size_t, const uint16_t *) = correct_convolutional_create;
+static void(*conv_destroy)(conv_t *) = correct_convolutional_destroy;
+static size_t(*conv_enclen)(void *, size_t) = conv_correct_enclen;
+static void(*conv_encode)(void *, uint8_t *, size_t, uint8_t *) = conv_correct_encode;
+static void(*conv_decode)(void *, uint8_t *, size_t, uint8_t *) = conv_correct_decode;  // NOTE(review): presumably the same return-type question as the SSE variant; confirm against error-sim.h
+#endif
+
+typedef struct {  // a codec paired with the polynomials it was created from
+    conv_t *conv;  // created by conv_create() from poly
+    correct_convolutional_polynomial_t *poly;  // heap array of `rate` polynomials; overwritten with the best set after each annealing pass
+} conv_tester_t;
+
+void shuffle(int *a, size_t len) {  // in-place Fisher-Yates shuffle of a[0..len)
+    for (size_t i = 0; i + 1 < len; i++) {  // not `len - 2`: that wraps for len < 2 and never visits the last pair
+        size_t j = rand() % (len - i) + i;  // pick from the not-yet-fixed tail [i, len)
+        int temp = a[i];
+        a[i] = a[j];
+        a[j] = temp;
+    }
+}
+
+// Geometric-style draw capped at max: starts at 1 and keeps counting up while a uniform
+// sample exceeds p. The result is never below 1, even when max is.
+int rand_geo(float p, int max) {
+    int count = 1;
+    for (; count < max; count++) {
+        if (!(rand() / (float)RAND_MAX > p)) {
+            break;
+        }
+    }
+    return count;
+}
+
+// Writes into neighbor a copy of start with rand_geo(0.4, ...) interior coefficient bits flipped.
+void next_neighbor(correct_convolutional_polynomial_t *start,
+                   correct_convolutional_polynomial_t *neighbor, size_t rate, size_t order) {
+    const size_t inner_bits = order - 2;  // bits per polynomial once both edge bits are excluded
+    const size_t total_bits = rate * inner_bits;
+    int coeffs[total_bits];
+    for (size_t k = 0; k < total_bits; k++) {
+        coeffs[k] = k;
+    }
+    shuffle(coeffs, total_bits);  // the first nflips entries then select which bits to flip
+    memcpy(neighbor, start, rate * sizeof(correct_convolutional_polynomial_t));
+    size_t nflips = rand_geo(0.4, total_bits);
+    for (size_t k = 0; k < nflips; k++) {
+        ptrdiff_t index = coeffs[k] / inner_bits;  // which polynomial the chosen bit belongs to
+        neighbor[index] ^= 1 << (coeffs[k] % inner_bits + 1);  // + 1 skips the lowest edge bit, avoiding a degenerate poly
+    }
+}
+
+// Annealing acceptance rule: a strictly lower cost_b is always accepted; otherwise it is
+// accepted when exp((cost_a - cost_b) / (cost_a * temperature)) exceeds a uniform draw.
+bool accept(float cost_a, float cost_b, double temperature) {
+    if (!(cost_b < cost_a)) {
+        float draw = (float)(rand()) / (float)(RAND_MAX);
+        return exp((cost_a - cost_b) / (cost_a * temperature)) > draw;
+    }
+    return true;
+}
+
+typedef struct {  // work order and result slot for one find_cost_thread
+    size_t rate;  // forwarded to conv_create
+    size_t order;  // forwarded to conv_create
+    correct_convolutional_polynomial_t *poly;  // polynomial set under test; the same pointer is given to every thread
+    unsigned int distance;  // output: sum of test_conv_noise() results, reset to 0 by the thread
+    conv_testbench *scratch;  // testbench assigned to this thread
+    size_t msg_len;  // total number of random message bytes to push through
+    double eb_n0;  // noise level for this thread, passed to build_white_noise
+    double bpsk_voltage;  // forwarded to test_conv_noise
+    double bpsk_bit_energy;  // forwarded to build_white_noise
+} thread_args;
+
+const size_t max_block_len = 16384;  // bytes per message block; also the size of each thread's msg buffer
+
+// Thread body: pushes args->msg_len random bytes through one testbench at one Eb/N0 point
+// and leaves the summed test_conv_noise() results in args->distance.
+void *find_cost_thread(void *vargs) {
+    thread_args *job = (thread_args *)vargs;
+    uint8_t *block = malloc(max_block_len);
+    conv_t *codec = conv_create(job->rate, job->order, job->poly);
+    conv_testbench *bench = job->scratch;
+    job->distance = 0;
+
+    // in order to keep memory usage constant, the message is split into blocks of at
+    // most max_block_len bytes and each one is sent through separately;
+    // every block gets fresh random data and freshly generated noise
+    for (size_t todo = job->msg_len; todo != 0;) {
+        size_t chunk = (max_block_len < todo) ? max_block_len : todo;
+        todo -= chunk;
+
+        // NOTE(review): rand() is called from several threads at once here; POSIX does not require it to be thread-safe
+        for (unsigned int b = 0; b < chunk; b++) {
+            block[b] = rand() % 256;
+        }
+
+        bench = resize_conv_testbench(bench, conv_enclen, codec, chunk);
+        bench->encoder = codec;
+        bench->decoder = codec;
+        bench->encode = conv_encode;
+        bench->decode = conv_decode;
+
+        build_white_noise(bench->noise, bench->enclen, job->eb_n0, job->bpsk_bit_energy);
+
+        job->distance += test_conv_noise(bench, block, chunk, job->bpsk_voltage);
+    }
+
+    conv_destroy(codec);
+    free(block);
+    pthread_exit(NULL);
+}
+
+// Scores one polynomial set: runs one find_cost_thread per testbench / Eb/N0 point, prints the
+// per-point error rates and returns the weighted sum of the error counts.
+float find_cost(size_t rate, size_t order, correct_convolutional_polynomial_t *poly, size_t msg_len,
+                conv_testbench **scratches, size_t num_scratches, float *weights, double *eb_n0,
+                double bpsk_voltage, double bpsk_bit_energy) {
+    thread_args *args = malloc(num_scratches * sizeof(thread_args));
+    pthread_t *threads = malloc(num_scratches * sizeof(pthread_t));
+    for (size_t i = 0; i < num_scratches; i++) {
+        args[i].rate = rate;
+        args[i].order = order;
+        args[i].poly = poly;
+        args[i].scratch = scratches[i];
+        args[i].msg_len = msg_len;
+        args[i].eb_n0 = eb_n0[i];
+        args[i].bpsk_voltage = bpsk_voltage;
+        args[i].bpsk_bit_energy = bpsk_bit_energy;
+        // NULL attributes already give a joinable thread; the previous pthread_attr_t was
+        // initialised on every iteration and never destroyed
+        if (pthread_create(&threads[i], NULL, find_cost_thread, &args[i]) != 0) {
+            fprintf(stderr, "find_cost: pthread_create failed\n");
+            exit(EXIT_FAILURE);  // joining a thread that was never created would be undefined
+        }
+    }
+
+    for (size_t i = 0; i < num_scratches; i++) {
+        pthread_join(threads[i], NULL);
+    }
+    float cost = 0;
+    printf("poly:");
+    for (size_t i = 0; i < rate; i++) {
+        printf(" %06o", poly[i]);
+    }
+    printf(" error:");
+    for (size_t i = 0; i < num_scratches; i++) {
+        cost += weights[i] * args[i].distance;
+        printf(" %.2e@%.1fdB", (args[i].distance / (float)(msg_len * 8)), eb_n0[i]);
+    }
+    printf("\n");
+    free(args);
+    free(threads);
+    return cost;
+}
+
+// Stop request flag: written by sig_handler, polled by the annealing loop, hence volatile sig_atomic_t.
+static volatile sig_atomic_t terminated = 0;
+void sig_handler(int sig) {
+    if (sig == SIGINT || sig == SIGTERM || sig == SIGHUP) {
+        if (!terminated) {
+            terminated = 1;
+            printf("terminating after current poly\n");  // NOTE(review): printf is not async-signal-safe; kept for the existing message
+        }
+    }
+}
+
+// Performs simulated annealing from start->poly for up to n_steps steps (or until a stop
+// signal is seen), prints the last and best states, and copies the best set back into start->poly.
+void search_simulated_annealing(size_t rate, size_t order, size_t n_steps, conv_tester_t *start,
+                                size_t n_bytes, conv_testbench **scratches, size_t num_scratches,
+                                float *weights, double start_temperature, double cooling_factor,
+                                double *eb_n0, double bpsk_voltage, double bpsk_bit_energy) {
+    float cost = find_cost(rate, order, start->poly, n_bytes, scratches, num_scratches, weights,
+                           eb_n0, bpsk_voltage, bpsk_bit_energy);
+
+    correct_convolutional_polynomial_t *neighbor_poly =
+        malloc(rate * sizeof(correct_convolutional_polynomial_t));
+
+    correct_convolutional_polynomial_t *state =
+        malloc(rate * sizeof(correct_convolutional_polynomial_t));
+    correct_convolutional_polynomial_t *best =
+        malloc(rate * sizeof(correct_convolutional_polynomial_t));
+
+    float best_cost = cost;
+
+    memcpy(state, start->poly, rate * sizeof(correct_convolutional_polynomial_t));
+    memcpy(best, start->poly, rate * sizeof(correct_convolutional_polynomial_t));
+
+    double temperature = start_temperature;
+
+    for (size_t i = 0; i < n_steps; i++) {
+        next_neighbor(state, neighbor_poly, rate, order);
+        float neighbor_cost =
+            find_cost(rate, order, neighbor_poly, n_bytes, scratches, num_scratches, weights, eb_n0,
+                      bpsk_voltage, bpsk_bit_energy);
+        if (accept(cost, neighbor_cost, temperature)) {
+            // we're moving to our neighbor's house
+            memcpy(state, neighbor_poly, rate * sizeof(correct_convolutional_polynomial_t));
+            cost = neighbor_cost;
+        } else {
+            // actually where we live now is nice
+        }
+
+        if (cost < best_cost) {
+            best_cost = cost;
+            memcpy(best, state, rate * sizeof(correct_convolutional_polynomial_t));
+        }
+
+        temperature *= cooling_factor;
+
+        if (terminated) {
+            break;
+        }
+    }
+
+    printf("last state:");
+    for (size_t i = 0; i < rate; i++) {
+        printf(" %06o", state[i]);
+    }
+    printf("\n");
+
+    printf("best state:");
+    for (size_t i = 0; i < rate; i++) {
+        printf(" %06o", best[i]);
+    }
+    printf("\n");  // end the "best state" line so the next pass's output starts on its own line
+    memcpy(start->poly, best, rate * sizeof(correct_convolutional_polynomial_t));
+
+    free(state);
+    free(best);
+    free(neighbor_poly);
+}
+
+// Runs n_iter annealing passes back to back. Each pass writes its best polynomial set into
+// start.poly, so the next pass begins from that result.
+void test_sa(size_t rate, size_t order, conv_tester_t start, conv_testbench **scratches,
+             size_t num_scratches, float *weights, size_t n_bytes, double *eb_n0,
+             double bpsk_bit_energy, size_t n_iter, double bpsk_voltage) {
+    for (size_t pass = 0; pass < n_iter; pass++) {
+        const bool first_pass = (pass == 0);
+        // first pass: 500 steps, temperature 0.5, cooling 0.985; later passes: 100 steps, 250, 0.95
+        search_simulated_annealing(rate, order, first_pass ? 500 : 100, &start, n_bytes, scratches,
+                                   num_scratches, weights, first_pass ? 0.5 : 250,
+                                   first_pass ? 0.985 : 0.95, eb_n0, bpsk_voltage, bpsk_bit_energy);
+    }
+}
+
+// Usage: find_conv_optim_poly_annealing <rate> <order> <n_bytes> <n_iter>
+// Starts from a random polynomial set and refines it with n_iter simulated-annealing passes.
+int main(int argc, char **argv) {
+    if (argc < 5) {
+        fprintf(stderr, "usage: find_conv_optim_poly_annealing rate order n_bytes n_iter\n");
+        return 1;
+    }
+    srand(time(NULL));
+    // SIGINT/SIGTERM/SIGHUP set a flag that stops the search after the polynomial being scored
+    signal(SIGINT, sig_handler);
+    signal(SIGTERM, sig_handler);
+    signal(SIGHUP, sig_handler);
+
+    size_t rate, order, n_bytes, n_iter;
+    if (sscanf(argv[1], "%zu", &rate) != 1 || sscanf(argv[2], "%zu", &order) != 1 ||
+        sscanf(argv[3], "%zu", &n_bytes) != 1 || sscanf(argv[4], "%zu", &n_iter) != 1) {
+        fprintf(stderr, "rate, order, n_bytes and n_iter must be unsigned integers\n");
+        return 1;
+    }
+
+    double bpsk_voltage = 1.0 / sqrt(2.0);
+    double bpsk_sym_energy = 2 * pow(bpsk_voltage, 2.0);
+    double bpsk_bit_energy = bpsk_sym_energy * rate;  // rate bits transmitted for every input bit
+
+    correct_convolutional_polynomial_t startcoeff = (1 << (order - 1)) + 1;
+
+    conv_tester_t start;
+    start.poly = malloc(rate * sizeof(correct_convolutional_polynomial_t));
+    // random starting point: top and bottom bits always set (startcoeff), interior bits random
+    for (size_t i = 0; i < rate; i++) {
+        start.poly[i] = ((rand() % (1 << (order - 2))) << 1) + startcoeff;
+    }
+    start.conv = conv_create(rate, order, start.poly);
+
+    size_t num_scratches = 4;
+    conv_testbench **scratches = malloc(num_scratches * sizeof(conv_testbench *));
+    for (size_t i = 0; i < num_scratches; i++) {
+        scratches[i] = resize_conv_testbench(NULL, conv_enclen, start.conv, max_block_len);
+    }
+
+    // eb_n0 and weights are real arrays in main's scope. They used to point at compound
+    // literals created inside the switch body; those have automatic storage tied to that
+    // block, so the pointers were dangling by the time test_sa() read them.
+    float weights[4] = {8000, 400, 20, 1};
+    double eb_n0[4];
+    double top_eb_n0;  // highest Eb/N0 for this order; the remaining points step down by 0.5
+    switch (order) {
+        case 6:
+            top_eb_n0 = 6.0;
+            break;
+        case 7:
+        case 8:
+            top_eb_n0 = 5.5;
+            break;
+        case 9:
+        case 10:
+            top_eb_n0 = 5.0;
+            break;
+        case 11:
+        case 12:
+        case 13:
+            top_eb_n0 = 4.5;
+            break;
+        default:
+            top_eb_n0 = 3.5;
+    }
+    for (size_t i = 0; i < num_scratches; i++) {
+        eb_n0[i] = top_eb_n0 - 0.5 * i;
+    }
+
+    test_sa(rate, order, start, scratches, num_scratches, weights, n_bytes, eb_n0, bpsk_bit_energy,
+            n_iter, bpsk_voltage);
+    free(start.poly);
+    conv_destroy(start.conv);
+    for (size_t i = 0; i < num_scratches; i++) {
+        free_scratch(scratches[i]);
+    }
+    free(scratches);
+    return 0;
+}
diff --git a/libcorrect/tools/find_rs_primitive_poly.c b/libcorrect/tools/find_rs_primitive_poly.c
new file mode 100644
index 0000000..5b5b3e0
--- /dev/null
+++ b/libcorrect/tools/find_rs_primitive_poly.c
@@ -0,0 +1,51 @@
+#include "correct/reed-solomon.h"
+
+size_t block_size = 255;  // largest field element value; log tables hold block_size + 1 entries
+int power_max = 8;  // exponent of the leading term printed by main (x^8)
+
+// Visits the elements generated by poly: doubles `element` block_size times (XORing in poly once it
+// exceeds block_size), storing the step index in log[element]. Returns false on the first repeat.
+bool trypoly(field_operation_t poly, field_logarithm_t *log) {
+    memset(log, 0, (block_size + 1) * sizeof(*log));  // clear every entry, matching the allocation size in main
+    field_operation_t element = 1;
+    for (field_operation_t i = 1; i < block_size + 1; i++) {
+        element = element * 2;
+        element = (element > block_size) ? (element ^ poly) : element;
+        if (log[element] != 0) {
+            return false;
+        }
+        log[element] = (field_logarithm_t)i;
+    }
+    return true;
+}
+
+int main() {  // prints every candidate polynomial that trypoly() accepts, in hex and as a sum of powers of x
+    field_logarithm_t *log = malloc((block_size + 1) * sizeof(field_logarithm_t));  // scratch table, cleared by each trypoly call
+    for (field_operation_t i = (block_size + 1); i < (block_size + 1) << 1; i++) {  // with block_size = 255: 256..511, i.e. bit 8 always set
+        if (trypoly(i, log)) {
+            printf("0x%x valid: ", i);
+            field_operation_t poly = i;
+            int power = power_max;
+            while(poly) {  // walk the coefficients from the highest power downwards
+                if (poly & (block_size + 1)) {  // top bit (block_size + 1) set: this power is present
+                    if (power > 1) {
+                        printf("x^%d", power);
+                    } else if (power) {
+                        printf("x");
+                    } else {
+                        printf("1");
+                    }
+                    if (poly & block_size) {  // lower bits still set, so another term follows
+                        printf(" + ");
+                    }
+                }
+                power--;
+                poly <<= 1;  // move the next lower coefficient into the top bit
+                poly &= (block_size << 1) + 1;  // drop whatever was shifted above the top bit
+            }
+            printf("\n");
+        }
+    }
+    free(log);
+    return 0;
+}
diff --git a/libcorrect/util/CMakeLists.txt b/libcorrect/util/CMakeLists.txt
new file mode 100644
index 0000000..739f649
--- /dev/null
+++ b/libcorrect/util/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_library(error_sim OBJECT error-sim.c)  # base simulation helpers as an object library
+
+add_library(error_sim_shim OBJECT error-sim.c error-sim-shim.c)  # base helpers plus the shim decode wrappers; not guarded by any option
+
+if(HAVE_LIBFEC)  # libfec-backed decode wrappers only when HAVE_LIBFEC is set
+    add_library(error_sim_fec OBJECT error-sim.c error-sim-fec.c)
+endif()
+
+if(HAVE_SSE)  # SSE wrappers only when HAVE_SSE is set
+    add_library(error_sim_sse OBJECT error-sim.c error-sim-sse.c)
+endif()
diff --git a/libcorrect/util/error-sim-fec.c b/libcorrect/util/error-sim-fec.c
new file mode 100644
index 0000000..d97aa87
--- /dev/null
+++ b/libcorrect/util/error-sim-fec.c
@@ -0,0 +1,29 @@
+#include "correct/util/error-sim-fec.h"
+
+void conv_fec27_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    const size_t steps = soft_len / 2;  // presumably two soft symbols per input bit - confirm
+    init_viterbi27(conv_v, 0);
+    update_viterbi27_blk(conv_v, soft, steps - 2);
+    chainback_viterbi27(conv_v, msg, steps - 8, 0);  // chains back 6 fewer bits than were fed to update
+}
+
+void conv_fec29_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    const size_t steps = soft_len / 2;  // presumably two soft symbols per input bit - confirm
+    init_viterbi29(conv_v, 0);
+    update_viterbi29_blk(conv_v, soft, steps - 2);
+    chainback_viterbi29(conv_v, msg, steps - 10, 0);  // chains back 8 fewer bits than were fed to update
+}
+
+void conv_fec39_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    const size_t steps = soft_len / 3;  // presumably three soft symbols per input bit - confirm
+    init_viterbi39(conv_v, 0);
+    update_viterbi39_blk(conv_v, soft, steps - 2);
+    chainback_viterbi39(conv_v, msg, steps - 10, 0);  // chains back 8 fewer bits than were fed to update
+}
+
+void conv_fec615_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    const size_t steps = soft_len / 6;  // presumably six soft symbols per input bit - confirm
+    init_viterbi615(conv_v, 0);
+    update_viterbi615_blk(conv_v, soft, steps - 2);
+    chainback_viterbi615(conv_v, msg, steps - 16, 0);  // chains back 14 fewer bits than were fed to update
+}
diff --git a/libcorrect/util/error-sim-shim.c b/libcorrect/util/error-sim-shim.c
new file mode 100644
index 0000000..d050d0b
--- /dev/null
+++ b/libcorrect/util/error-sim-shim.c
@@ -0,0 +1,33 @@
+#include "correct/util/error-sim-shim.h"
+
+ssize_t conv_shim27_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    const size_t bit_count = (soft_len / 2) - 8;  // number of bits requested from chainback
+    init_viterbi27(conv_v, 0);
+    update_viterbi27_blk(conv_v, soft, soft_len / 2 - 2);
+    chainback_viterbi27(conv_v, msg, bit_count, 0);
+    return bit_count / 8 + ((bit_count % 8) != 0);  // bytes needed for bit_count bits, rounded up
+}
+
+ssize_t conv_shim29_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    const size_t bit_count = (soft_len / 2) - 10;  // number of bits requested from chainback
+    init_viterbi29(conv_v, 0);
+    update_viterbi29_blk(conv_v, soft, soft_len / 2 - 2);
+    chainback_viterbi29(conv_v, msg, bit_count, 0);
+    return bit_count / 8 + ((bit_count % 8) != 0);  // bytes needed for bit_count bits, rounded up
+}
+
+ssize_t conv_shim39_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    const size_t bit_count = (soft_len / 3) - 10;  // number of bits requested from chainback
+    init_viterbi39(conv_v, 0);
+    update_viterbi39_blk(conv_v, soft, soft_len / 3 - 2);
+    chainback_viterbi39(conv_v, msg, bit_count, 0);
+    return bit_count / 8 + ((bit_count % 8) != 0);  // bytes needed for bit_count bits, rounded up
+}
+
+ssize_t conv_shim615_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    const size_t bit_count = (soft_len / 6) - 16;  // number of bits requested from chainback
+    init_viterbi615(conv_v, 0);
+    update_viterbi615_blk(conv_v, soft, soft_len / 6 - 2);
+    chainback_viterbi615(conv_v, msg, bit_count, 0);
+    return bit_count / 8 + ((bit_count % 8) != 0);  // bytes needed for bit_count bits, rounded up
+}
diff --git a/libcorrect/util/error-sim-sse.c b/libcorrect/util/error-sim-sse.c
new file mode 100644
index 0000000..e7b9a2f
--- /dev/null
+++ b/libcorrect/util/error-sim-sse.c
@@ -0,0 +1,13 @@
+#include "correct/util/error-sim-sse.h"
+
+size_t conv_correct_sse_enclen(void *conv_v, size_t msg_len) {  // void*-typed adapter so the call fits a generic function-pointer slot
+    return correct_convolutional_sse_encode_len((correct_convolutional_sse *)conv_v, msg_len);
+}
+
+void conv_correct_sse_encode(void *conv_v, uint8_t *msg, size_t msg_len, uint8_t *encoded) {  // void*-typed adapter; the encoder's return value is discarded
+    correct_convolutional_sse_encode((correct_convolutional_sse *)conv_v, msg, msg_len, encoded);
+}
+
+ssize_t conv_correct_sse_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {  // void*-typed adapter over the soft-decision decoder; passes its result through
+    return correct_convolutional_sse_decode_soft((correct_convolutional_sse *)conv_v, soft, soft_len, msg);
+}
diff --git a/libcorrect/util/error-sim.c b/libcorrect/util/error-sim.c
new file mode 100644
index 0000000..0cd3400
--- /dev/null
+++ b/libcorrect/util/error-sim.c
@@ -0,0 +1,188 @@
+#include "correct/util/error-sim.h"
+
+// Count the bits that differ (Hamming distance) between a[0..len) and b[0..len).
+// a, b: byte buffers, each readable for at least len bytes. Returns the differing-bit count.
+size_t distance(uint8_t *a, uint8_t *b, size_t len) {
+    size_t dist = 0;
+    for (size_t i = 0; i < len; i++) {
+        // XOR leaves a 1 wherever the two bytes disagree; popcount (declared elsewhere) tallies them
+        dist += popcount((unsigned int)a[i] ^ (unsigned int)b[i]);
+    }
+    return dist;
+}
+
+// Fill res[0..n_res) with zero-mean Gaussian samples of standard deviation sigma.
+// Uses the Marsaglia polar form of Box-Muller, which yields two samples per accepted
+// point; for an odd n_res the second sample of the last pair is simply not stored.
+// Draws from rand(), so the sequence is controlled by the caller's srand().
+void gaussian(double *res, size_t n_res, double sigma) {
+    for (size_t i = 0; i < n_res; i += 2) {
+        double s, u, v;
+        do {
+            // the polar method needs (u, v) uniform over the square [-1, 1] x [-1, 1];
+            // sampling only [0, 1] would make every output non-negative
+            u = 2.0 * ((double)rand() / (double)RAND_MAX) - 1.0;
+            v = 2.0 * ((double)rand() / (double)RAND_MAX) - 1.0;
+            s = u * u + v * v;
+            // reject points outside the unit disc and points so close to the origin
+            // that log(s)/s below would blow up
+        } while (!(s > DBL_EPSILON && s < 1));
+
+        double base = sqrt((-2.0 * log(s)) / s);
+        res[i] = u * base * sigma;
+
+        // only store the second sample of the pair if it still fits in res
+        if (i + 1 < n_res) {
+            res[i + 1] = v * base * sigma;
+        }
+    }
+}
+
+// Turn packed bits into BPSK voltages: voltages[i] is +bpsk_voltage when bit i of msg is
+// set and -bpsk_voltage otherwise. Bits are read MSB-first within each byte.
+void encode_bpsk(uint8_t *msg, double *voltages, size_t n_syms, double bpsk_voltage) {
+    for (size_t sym = 0; sym < n_syms; sym++) {
+        // 0x80 >> (sym % 8) walks the mask down from the top bit and wraps every 8 symbols
+        const uint8_t bit_mask = (uint8_t)(0x80u >> (sym % 8));
+        const int bit_is_set = (msg[sym / 8] & bit_mask) != 0;
+        voltages[sym] = bit_is_set ? bpsk_voltage : -bpsk_voltage;
+    }
+}
+
+// Expand packed bits into one byte per bit: bits[i] is 255 when bit i of bytes is set
+// and 0 when it is clear. Bits are taken MSB-first from each byte.
+void byte2bit(uint8_t *bytes, uint8_t *bits, size_t n_bits) {
+    for (size_t pos = 0; pos < n_bits; pos++) {
+        // bit 0 of the stream sits in the top bit of byte 0, so the shift counts down from 7
+        const unsigned int shift = 7u - (unsigned int)(pos % 8);
+        const int bit_is_set = (bytes[pos / 8] >> shift) & 1;
+        bits[pos] = bit_is_set ? 255 : 0;
+    }
+}
+
+// Hard-decide each soft symbol (> 127 means 1) and pack the bits MSB-first into msg.
+// Bits are only ever OR-ed in, so msg has to be zeroed by the caller beforehand.
+void decode_bpsk(uint8_t *soft, uint8_t *msg, size_t n_syms) {
+    for (size_t sym = 0; sym < n_syms; sym++) {
+        if (soft[sym] <= 127) {
+            // a 0 decision: leave the bit exactly as it already is
+            continue;
+        }
+        // a 1 decision: set bit (sym % 8), counted from the top of byte sym / 8
+        const uint8_t bit_mask = (uint8_t)(0x80u >> (sym % 8));
+        msg[sym / 8] |= bit_mask;
+    }
+}
+
+// Quantize BPSK voltages to 8-bit soft symbols: v / bpsk_voltage is clamped to [-1, 1], then mapped so -1 -> 0 and +1 -> 255.
+void decode_bpsk_soft(double *voltages, uint8_t *soft, size_t n_syms, double bpsk_voltage) {
+    for (size_t sym = 0; sym < n_syms; sym++) {
+        double level = voltages[sym] / bpsk_voltage;
+        if (level > 1) {
+            level = 1;   // saturate high: 127.5 + 127.5 is exactly 255
+        } else if (level < -1) {
+            level = -1;  // saturate low: 127.5 - 127.5 is exactly 0
+        }
+        soft[sym] = (uint8_t)(127.5 + 127.5 * level);
+    }
+}
+
+double log2amp(double l) { // decibels -> linear ratio
+    return pow(10.0, l/10.0); // 10^(l/10): the divisor is 10, not 20, so this is the inverse of 10*log10
+}
+
+double amp2log(double a) { // linear ratio -> decibels
+    return 10.0 * log10(a); // 10*log10(a); a <= 0 is passed to log10 unchecked
+}
+
+double sigma_for_eb_n0(double eb_n0, double bpsk_bit_energy) { // noise standard deviation for a given eb/n0 (in dB) and bit energy
+    // eb/n0 is the ratio of bit energy to noise energy
+    // eb/n0 is expressed in dB so first we convert to a linear ratio (log2amp computes 10^(x/10))
+    double eb_n0_amp = log2amp(eb_n0);
+    // now the conversion. sigma^2 = n0/2 = ((eb/n0)^-1 * eb)/2 = eb/(2 * (eb/n0))
+    return sqrt(bpsk_bit_energy/(double)(2.0 * eb_n0_amp));
+}
+
+void build_white_noise(double *noise, size_t n_syms, double eb_n0, double bpsk_bit_energy) { // fill noise[0..n_syms) with Gaussian samples scaled for the requested eb/n0
+    double sigma = sigma_for_eb_n0(eb_n0, bpsk_bit_energy); // standard deviation derived from eb_n0 and the bit energy
+    gaussian(noise, n_syms, sigma); // gaussian() draws from rand(), so results depend on the caller's srand() state
+}
+
+// Add noise into signal in place: signal[i] += noise[i] / sqrt(2) for i in [0, n_syms).
+//
+// Why the sqrt(2): the noise values are magnitudes of complex-valued noise, while the
+// signal is real-valued, so the two cannot be summed directly. The noise is assumed to
+// be split evenly between its real and imaginary parts (a 45-45-90 triangle in the
+// complex plane), which makes the real part magnitude / sqrt(2). Only that real part
+// is added; the imaginary part is thrown away.
+void add_white_noise(double *signal, double *noise, size_t n_syms) {
+    const double root_two = sqrt(2);
+    const double *const noise_end = noise + n_syms;
+    // walk both arrays in lockstep, one sample at a time
+    for (const double *sample = noise; sample != noise_end; sample++, signal++) {
+        *signal += *sample / root_two;
+    }
+}
+
+// (Re)size a test bench's buffers for msg_len-byte messages; a NULL scratch is allocated zeroed first. enclen_f(enc, msg_len) gives the encoded length in bits.
+// Returns the bench, or NULL when the bench itself cannot be allocated. The individual realloc() results are still stored unchecked.
+conv_testbench *resize_conv_testbench(conv_testbench *scratch, size_t (*enclen_f)(void *, size_t), void *enc, size_t msg_len) {
+    if (!scratch) {
+        scratch = calloc(1, sizeof(conv_testbench));
+    }
+    if (!scratch) return NULL;  // calloc failed: bail out instead of dereferencing NULL below
+    scratch->msg_out = realloc(scratch->msg_out, msg_len);
+    size_t enclen = enclen_f(enc, msg_len);
+    size_t enclen_bytes = (enclen % 8) ? (enclen/8 + 1) : enclen/8;  // bits -> bytes, rounded up
+    scratch->enclen = enclen;
+    scratch->enclen_bytes = enclen_bytes;
+    scratch->encoded = realloc(scratch->encoded, enclen_bytes);  // packed encoded bits
+    scratch->v = realloc(scratch->v, enclen * sizeof(double));  // one double per encoded bit
+    scratch->corrupted = realloc(scratch->corrupted, enclen * sizeof(double));
+    scratch->noise = realloc(scratch->noise, enclen * sizeof(double));
+    scratch->soft = realloc(scratch->soft, enclen);  // one byte per encoded bit
+    return scratch;
+}
+
+void free_scratch(conv_testbench *scratch) { // release the six buffers held by scratch, then scratch itself
+    free(scratch->msg_out); // NOTE(review): scratch is dereferenced unconditionally, so it must not be NULL
+    free(scratch->encoded);
+    free(scratch->v);
+    free(scratch->corrupted);
+    free(scratch->soft);
+    free(scratch->noise);
+    free(scratch); // the struct goes last, after every member has been read
+}
+
+// Run one encode -> BPSK -> add noise -> soft-decide -> decode round trip for msg
+// (n_bytes long) and return distance() between msg and the decoded output.
+// Uses the noise already stored in scratch->noise; exits the process with status 1
+// if the decoder reports a length other than n_bytes.
+int test_conv_noise(conv_testbench *scratch, uint8_t *msg, size_t n_bytes,
+                    double bpsk_voltage) {
+    scratch->encode(scratch->encoder, msg, n_bytes, scratch->encoded);
+    encode_bpsk(scratch->encoded, scratch->v, scratch->enclen, bpsk_voltage);
+    memcpy(scratch->corrupted, scratch->v, scratch->enclen * sizeof(double));
+    add_white_noise(scratch->corrupted, scratch->noise, scratch->enclen);
+    decode_bpsk_soft(scratch->corrupted, scratch->soft, scratch->enclen, bpsk_voltage);
+    memset(scratch->msg_out, 0, n_bytes);
+    ssize_t decode_len = scratch->decode(scratch->decoder, scratch->soft, scratch->enclen, scratch->msg_out);
+    // decode_len is signed: compare without an implicit conversion and print it with %zd
+    if (decode_len < 0 || (size_t)decode_len != n_bytes) {
+        printf("expected to decode %zu bytes, decoded %zd bytes instead\n", n_bytes, decode_len);
+        exit(1);
+    }
+    return distance((uint8_t*)msg, scratch->msg_out, n_bytes);
+}
+
+size_t conv_correct_enclen(void *conv_v, size_t msg_len) { // void*-typed adapter around correct_convolutional_encode_len
+    return correct_convolutional_encode_len((correct_convolutional *)conv_v, msg_len); // conv_v must actually point to a correct_convolutional
+}
+
+void conv_correct_encode(void *conv_v, uint8_t *msg, size_t msg_len, uint8_t *encoded) { // void*-typed adapter around correct_convolutional_encode
+    correct_convolutional_encode((correct_convolutional *)conv_v, msg, msg_len, encoded); // the callee's return value, if any, is discarded here
+}
+
+ssize_t conv_correct_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) { // void*-typed adapter around correct_convolutional_decode_soft
+    return correct_convolutional_decode_soft((correct_convolutional *)conv_v, soft, soft_len, msg); // result is passed through unchanged
+}