In [2]:
// Absolute location of the bootstrap script, resolved against the JVM working directory.
val path = s"${System.getProperty("user.dir")}/source/load-ivy.sc"
// Load that script into the running Ammonite session as a module.
interp.load.module(ammonite.ops.Path(java.nio.file.Paths.get(path)))

Compiling /workdir/Main.sc

Downloading https://repo1.maven.org/maven2/edu/berkeley/cs/chisel3_2.12/maven-metadata.xml
Downloaded https://repo1.maven.org/maven2/edu/berkeley/cs/chisel3_2.12/maven-metadata.xml
Downloading https://repo1.maven.org/maven2/edu/berkeley/cs/chisel3_2.12/3.4.4/chisel3_2.12-3.4.4.pom
Downloaded https://repo1.maven.org/maven2/edu/berkeley/cs/chisel3_2.12/3.4.4/chisel3_2.12-3.4.4.pom
Downloading https://repo1.maven.org/maven2/org/scala-lang/scala-library/2.12.12/scala-library-2.12.12.pom
Downloading https://repo1.maven.org/maven2/edu/berkeley/cs/chisel3-macros_2.12/3.4.4/chisel3-macros_2.12-3.4.4.pom
Downloading https://repo1.maven.org/maven2/edu/berkeley/cs/chisel3-core_2.12/3.4.4/chisel3-core_2.12-3.4.4.pom
Downloading https://repo1.maven.org/maven2/org/scala-lang/scala-reflect/2.12.12/scala-reflect-2.12.12.pom
Downloaded https://repo1.maven.org/maven2/org/scala-lang/scala-library/2.12.12/scala-library-2.12.12.pom
Downloaded https://repo1.maven.org/maven2/org/scala-lang/scala-reflect/2.12

Compiling /workdir/Main.sc #2

[36mpath[39m: [32mString[39m = [32m"/workdir/source/load-ivy.sc"[39m

In [3]:
import chisel3._
import chisel3.util._
import chisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester}

[32mimport [39m[36mchisel3._
[39m
[32mimport [39m[36mchisel3.util._
[39m
[32mimport [39m[36mchisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester}[39m

<h1 style="text-align: center;">Chisel4ml: Generating Fast Implementations of Deeply Quantized Neural Networks using Chisel Generators</h1>
<div style="text-align: center;">
    <img src="slike/e7-logo.png" width=30%>
</div>


<h3 style="text-align: center;">Jure Vreča</h3>
<h4 style="text-align: center;">jure.vreca@ijs.si</h4>

# Chisel
* Constructing Hardware in a Scala Embedded Language
* Chisel is not HLS.
* Type-Safe Meta-Programming for RTL in Scala:
    * Parametrized types
    * Object-oriented programming
    * Functional programming
    * Static type checking

Note: Some of the slides and material were taken from: https://github.com/freechipsproject/chisel-bootcamp

# Example
<div style="text-align: center;">
    <img src="slike/FIR_diagram.png" width=75%>
</div>

In [4]:
/** Three-tap FIR filter whose coefficients are all the unsigned literal 1.
  *
  * The output is the sum of the current input sample and the two preceding samples.
  * The taps are added with the non-expanding `+`, and the result is assigned to a
  * `bitWidth`-wide port, so the sum is neither divided nor widened.
  *
  * @param bitWidth width in bits of both the input and the output port
  */
class MovingAverage3(bitWidth: Int) extends Module {
  val io = IO(new Bundle {
    val in = Input(UInt(bitWidth.W))
    val out = Output(UInt(bitWidth.W))
  })

  // Delay line: z1 is io.in delayed by one cycle, z2 is z1 delayed by one more cycle.
  val z1 = RegNext(io.in)
  val z2 = RegNext(z1)

  // Multiply each tap by 1.U, then accumulate the products from left to right.
  io.out := Seq(io.in, z1, z2).map(_ * 1.U).reduce(_ + _)
}

defined [32mclass[39m [36mMovingAverage3[39m

In [5]:
// Pass a factory for an 8-bit MovingAverage3 to `visualize`.
// NOTE(review): `visualize` is not defined in this notebook; presumably it comes from
// the load-ivy.sc script loaded in the first cell — confirm.
visualize(() => new MovingAverage3(8))

In [6]:
// Print the Verilog produced for an 8-bit MovingAverage3.
// Uncomment the getFirrtl line to print the FIRRTL form of the same module instead.
//print(getFirrtl(new MovingAverage3(8)))
print(getVerilog(new MovingAverage3(8)))

Elaborating design...
Done elaborating.
module MovingAverage3(
  input        clock,
  input        reset,
  input  [7:0] io_in,
  output [7:0] io_out
);
`ifdef RANDOMIZE_REG_INIT
  reg [31:0] _RAND_0;
  reg [31:0] _RAND_1;
`endif // RANDOMIZE_REG_INIT
  reg [7:0] z1; // @[cmd3.sc 6:19]
  reg [7:0] z2; // @[cmd3.sc 7:19]
  wire [8:0] _T = io_in * 1'h1; // @[cmd3.sc 8:20]
  wire [8:0] _T_1 = z1 * 1'h1; // @[cmd3.sc 8:33]
  wire [8:0] _T_3 = _T + _T_1; // @[cmd3.sc 8:27]
  wire [8:0] _T_4 = z2 * 1'h1; // @[cmd3.sc 8:46]
  wire [8:0] _T_6 = _T_3 + _T_4; // @[cmd3.sc 8:40]
  assign io_out = _T_6[7:0]; // @[cmd3.sc 8:10]
  always @(posedge clock) begin
    z1 <= io_in; // @[cmd3.sc 6:19]
    z2 <= z1; // @[cmd3.sc 7:19]
  end
// Register and memory initialization
`ifdef RANDOMIZE_GARBAGE_ASSIGN
`define RANDOMIZE
`endif
`ifdef RANDOMIZE_INVALID_ASSIGN
`define RANDOMIZE
`endif
`ifdef RANDOMIZE_REG_INIT
`define RANDOMIZE
`endif
`ifdef RANDOMIZE_MEM_INIT
`define RANDOMIZE
`endif
`ifndef RANDOM


These are my notes. Bla bla.

# FIR Generator

In [None]:
/** Generalized FIR filter parameterized by the convolution coefficients.
  *
  * The input is first captured in a shift register with one stage per coefficient;
  * stage `i` is multiplied by `coeffs(i)` and all products are summed with the
  * width-expanding `+&`. Because stage 0 is itself a register, the newest sample
  * reaches the output one cycle after it appears on `io.in`.
  *
  * @param bitWidth width in bits of the input port and of each shift-register stage
  * @param coeffs   filter coefficients, one per tap; must not be empty
  */
class FirFilter(bitWidth: Int, coeffs: Seq[UInt]) extends Module {
  // Fail fast with a clear message: an empty coefficient list would otherwise break
  // on the `zs(0)` access and on `reduce` below.
  require(coeffs.nonEmpty, "FirFilter requires at least one coefficient")

  val io = IO(new Bundle {
    val in = Input(UInt(bitWidth.W))
    val out = Output(UInt()) // width left to inference, since the sum grows with the taps
  })

  // Create the serial-in, parallel-out shift register
  val zs = Reg(Vec(coeffs.length, UInt(bitWidth.W)))
  zs(0) := io.in
  for (i <- 1 until coeffs.length) {
    zs(i) := zs(i-1)
  }

  // Do the multiplies
  val products = VecInit.tabulate(coeffs.length)(i => zs(i) * coeffs(i))

  // Sum up the products (`+&` keeps the carry bit of every addition)
  io.out := products.reduce(_ +& _)
}

In [None]:
// Same unit coefficients as the 3-point moving average above. Unlike MovingAverage3,
// FirFilter registers the input before the first tap and sums with the expanding `+&`.
visualize(() => new FirFilter(8, Seq(1.U, 1.U, 1.U)))

// 1-cycle delay as a FIR filter
//visualize(() => new FirFilter(8, Seq(0.U, 1.U)))

// 5-point FIR filter with a triangle impulse response
//visualize(() => new FirFilter(8, Seq(1.U, 2.U, 3.U, 2.U, 1.U)))

# Artificial Neural Networks
<div style="text-align: center;">
    <img src="slike/neuron.drawio.png" width=45%>
</div>
<h1 style="text-align: center;">
$$
\displaystyle y=f(b + \sum_{i=0}^{N-1} x_i \cdot w_i)
$$
</h1>


# How does chisel4ml use Chisel?

In [7]:
/** Generator for a single neuron built from caller-supplied arithmetic. */
object Neuron {
    /** Builds `actFn(shifted(add(in(i) mul weights(i))), thresh)`.
      *
      * @tparam I input element type
      * @tparam W weight element type
      * @tparam M type of each input-times-weight product
      * @tparam A accumulator type, also the type of the threshold
      * @tparam O output type of the activation function
      * @param in      input values, one per weight
      * @param weights weights; must have the same length as `in`
      * @param thresh  threshold passed as the second argument of `actFn`
      * @param mul     multiplies one input by one weight
      * @param add     reduces the vector of products to one accumulator value
      * @param actFn   activation function applied to (shifted accumulator, threshold)
      * @param shift   scaling of the accumulator: a positive value shifts left by that
      *                many bits, a negative value shifts right by its magnitude
      * @return the result of `actFn`
      */
    def apply[I <: Bits,
              W <: Bits,
              M <: Bits,
              A <: Bits,
              O <: Bits](in: Seq[I],
                         weights: Seq[W],
                         thresh: A,
                         mul: (I, W) => M,
                         add: Vec[M] => A,
                         actFn: (A, A) => O,
                         shift: Int): O = {
        // `zip` would silently drop the surplus elements of the longer sequence.
        require(in.length == weights.length,
                s"Neuron needs one weight per input, got ${in.length} inputs and ${weights.length} weights")
        val muls = VecInit((in zip weights).map{
            case (a,b) => mul(a,b)
        })
        val pAct = add(muls)
        // Honour the sign of `shift` rather than always shifting left by its magnitude.
        val shifted = if (shift >= 0) pAct << shift else pAct >> -shift
        // Convert the shifted value back to the accumulator's type.
        // NOTE(review): presumably this also pins the width to that of pAct, dropping
        // bits shifted past it — confirm.
        val sAct = shifted.asTypeOf(pAct)
        actFn(sAct, thresh)
    }
}

defined [32mobject[39m [36mNeuron[39m

In [8]:
// Uniform quantization: signed multiply, then a tree of width-expanding additions.
def mulUQ(i: SInt, w: SInt): SInt = i * w
def addUQ: Vec[SInt] => SInt = products => products.reduceTree((a, b) => a +& b)

// Binary weight quantization: the weight bit selects between the input and its negation.
def mulBW: (SInt, Bool) => SInt = (i, w) => Mux(w, i, -i)

// Binarized quantization: XNOR as the multiply, population count as the sum.
def mulBNN(i: Bool, w: Bool): Bool = {
  val mismatch = i ^ w
  ~mismatch
}
def addBNN: Vec[Bool] => UInt = xs => PopCount(xs.asUInt)

// ReLU relative to a threshold: yields `act - thresh` when that difference is positive,
// otherwise 0. The difference uses the width-expanding `-&` so that it cannot wrap
// around (a wrapped difference would look negative and be clamped to 0).
def reluFn(act: SInt, thresh: SInt): UInt = {
  val diff = act -& thresh
  Mux(diff > 0.S, diff.asUInt, 0.U)
}

// Sign activation: true when the accumulator has reached the threshold.
def signFn(act:UInt, thresh: UInt): Bool = act >= thresh

defined [32mfunction[39m [36mmulUQ[39m
defined [32mfunction[39m [36maddUQ[39m
defined [32mfunction[39m [36mmulBW[39m
defined [32mfunction[39m [36mmulBNN[39m
defined [32mfunction[39m [36maddBNN[39m
defined [32mfunction[39m [36mreluFn[39m
defined [32mfunction[39m [36msignFn[39m

In [9]:
/** Demo module wrapping one `Neuron` with uniformly quantized (signed) arithmetic.
  *
  * Takes three 4-bit signed inputs and drives `io.out` with the neuron built from
  * `mulUQ`, `addUQ` and `reluFn`, using constant weights, a threshold of -1 and a
  * shift of 1.
  */
class DummyUniformModule extends Module {
  val io = IO(new Bundle {
    val in = Input(Vec(3, SInt(4.W)))
    val out = Output(UInt())
  })

  io.out := {
    // One constant weight per input element.
    val weights = Seq(1.S, -2.S, 3.S)
    Neuron[SInt, SInt, SInt, SInt, UInt](
      in = io.in,
      weights = weights,
      thresh = -1.S,
      mul = mulUQ,
      add = addUQ,
      actFn = reluFn,
      shift = 1
    )
  }
}

defined [32mclass[39m [36mDummyUniformModule[39m

In [10]:
    // Pass a factory for DummyUniformModule to `visualize`; uncomment one of the lines
    // below to print its FIRRTL or Verilog instead.
    visualize(() => new DummyUniformModule())
//print(getFirrtl(new DummyUniformModule()))
//print(getVerilog(new DummyUniformModule()))

In [11]:
/** Demo module wrapping one `Neuron` with binarized arithmetic.
  *
  * Takes three single-bit inputs and drives `io.out` with the neuron built from
  * `mulBNN`, `addBNN` and `signFn`, using constant weights, a threshold of 2 and no
  * shift. The neuron result is a `Bool`, connected to a `UInt` port of inferred width.
  */
class DummyBinarizedModule extends Module {
  val io = IO(new Bundle {
    val in = Input(Vec(3, Bool()))
    val out = Output(UInt())
  })

  io.out := {
    // One constant weight bit per input element.
    val weights = Seq(true.B, false.B, true.B)
    Neuron[Bool, Bool, Bool, UInt, Bool](
      in = io.in,
      weights = weights,
      thresh = 2.U,
      mul = mulBNN,
      add = addBNN,
      actFn = signFn,
      shift = 0
    )
  }
}

defined [32mclass[39m [36mDummyBinarizedModule[39m

In [12]:
// Pass a factory for DummyBinarizedModule to `visualize`, then print the module's FIRRTL.
// Uncomment the last line to print the Verilog as well.
visualize(() => new DummyBinarizedModule())
print(getFirrtl(new DummyBinarizedModule()))
//print(getVerilog(new DummyBinarizedModule()))

circuit DummyBinarizedModule :
  module DummyBinarizedModule :
    input clock : Clock
    input reset : UInt<1>
    output io : { flip in : UInt<1>[3], out : UInt}

    node _T = xor(io.in[0], UInt<1>("h1")) @[cmd7.sc 6:42]
    node _T_1 = not(_T) @[cmd7.sc 6:38]
    node _T_2 = xor(io.in[1], UInt<1>("h0")) @[cmd7.sc 6:42]
    node _T_3 = not(_T_2) @[cmd7.sc 6:38]
    node _T_4 = xor(io.in[2], UInt<1>("h1")) @[cmd7.sc 6:42]
    node _T_5 = not(_T_4) @[cmd7.sc 6:38]
    wire _WIRE : UInt<1>[3] @[cmd6.sc 13:27]
    _WIRE[0] <= _T_1 @[cmd6.sc 13:27]
    _WIRE[1] <= _T_3 @[cmd6.sc 13:27]
    _WIRE[2] <= _T_5 @[cmd6.sc 13:27]
    node hi = cat(_WIRE[2], _WIRE[1]) @[cmd7.sc 7:43]
    node _T_6 = cat(hi, _WIRE[0]) @[cmd7.sc 7:43]
    node _T_7 = bits(_T_6, 0, 0) @[Bitwise.scala 49:65]
    node _T_8 = bits(_T_6, 1, 1) @[Bitwise.scala 49:65]
    node _T_9 = bits(_T_6, 2, 2) @[Bitwise.scala 49:65]
    node _T_10 = add(_T_8, _T_9) @[Bitwise.scala 47:55]
    node _T_11 = bits(_T_10, 1, 0) @[Bitwi

## Other abstractions in Chisel4ml:
* ProcessingElement == layer
* ProcessingPipeline == model

<p align="center">
<img src="slike/ProcElementC4ml.png" width=450  height=400 align=left>
<img src="slike/ProcPipelineC4ml.png" width=450 height=400 align=right>
</p>