In [None]:
// Build the location of the Ammonite bootstrap script relative to the working directory
// and load it as a module into the running interpreter session.
val path = s"${System.getProperty("user.dir")}/source/load-ivy.sc"
interp.load.module(ammonite.ops.Path(java.nio.file.Paths.get(path)))

In [None]:
import chisel3._
import chisel3.util._
import chisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester}

<img src="slike/welcome_slide.png" width=100%>


# Chisel
* Constructing Hardware in a Scala Embedded Language
* Chisel is not HLS.
* Type-Safe Meta-Programming for RTL in Scala:
    * Parametrized types
    * Object-oriented programming
    * Functional programming
    * Static type checking

Note: Some of the slides and material are taken from the Chisel Bootcamp: https://github.com/freechipsproject/chisel-bootcamp

# Example

<center>
<img src="slike/FIR_diagram.png">
</center>

In [56]:
/** Three-tap filter whose taps are the current input sample and the two previous samples.
  *
  * Every tap is multiplied by the unsigned literal `1.U` and the three products are added
  * with the non-expanding `+`; the sum is not divided by the number of taps.
  *
  * @param bitWidth width in bits of both the input and the output port
  */
class MovingAverage3(bitWidth: Int) extends Module {
  val io = IO(new Bundle {
    val in  = Input(UInt(bitWidth.W))
    val out = Output(UInt(bitWidth.W))
  })

  // Two-stage delay line: z1 holds the previous sample, z2 the one before that.
  val z1 = RegNext(io.in)
  val z2 = RegNext(z1)

  // Weight each tap by 1 and accumulate left to right: ((in * 1) + (z1 * 1)) + (z2 * 1).
  io.out := Seq(io.in, z1, z2).map(_ * 1.U).reduce(_ + _)
}

defined [32mclass[39m [36mMovingAverage3[39m


<center>
<img src="slike/FIR_diagram.png">
</center>

In [57]:
// Hand a generator for an 8-bit MovingAverage3 to the notebook's `visualize` helper.
// NOTE(review): `visualize` is not defined in this cell; presumably it comes from the loaded
// load-ivy.sc script and renders the elaborated circuit — confirm.
visualize(() => new MovingAverage3(8))

In [59]:
// Print the FIRRTL text that `getFirrtl` returns for an 8-bit MovingAverage3.
print(getFirrtl(new MovingAverage3(8)))
// Disabled alternative: print the result of `getVerilog` for the same module instead.
//print(getVerilog(new MovingAverage3(8)))

circuit MovingAverage3 :
  module MovingAverage3 :
    input clock : Clock
    input reset : UInt<1>
    output io : { flip in : UInt<8>, out : UInt<8>}

    reg z1 : UInt, clock with :
      reset => (UInt<1>("h0"), z1) @[cmd55.sc 6:19]
    z1 <= io.in @[cmd55.sc 6:19]
    reg z2 : UInt, clock with :
      reset => (UInt<1>("h0"), z2) @[cmd55.sc 7:19]
    z2 <= z1 @[cmd55.sc 7:19]
    node _T = mul(io.in, UInt<1>("h1")) @[cmd55.sc 8:20]
    node _T_1 = mul(z1, UInt<1>("h1")) @[cmd55.sc 8:33]
    node _T_2 = add(_T, _T_1) @[cmd55.sc 8:27]
    node _T_3 = tail(_T_2, 1) @[cmd55.sc 8:27]
    node _T_4 = mul(z2, UInt<1>("h1")) @[cmd55.sc 8:46]
    node _T_5 = add(_T_3, _T_4) @[cmd55.sc 8:40]
    node _T_6 = tail(_T_5, 1) @[cmd55.sc 8:40]
    io.out <= _T_6 @[cmd55.sc 8:10]


# FIR Generator

In [None]:
/** FIR filter generator: one register stage and one multiplier per coefficient.
  *
  * The input is written into the first register of the delay line, and every product is
  * formed from a register output, so `io.in` itself never feeds a multiplier directly.
  *
  * @param bitWidth width in bits of the input port and of every delay-line register
  * @param coeffs   convolution coefficients, one per tap; `coeffs(i)` scales register `zs(i)`
  */
class FirFilter(bitWidth: Int, coeffs: Seq[UInt]) extends Module {
  val io = IO(new Bundle {
    val in  = Input(UInt(bitWidth.W))
    val out = Output(UInt()) // width is left to be inferred from the sum of products
  })

  private val numTaps = coeffs.length

  // Serial-in, parallel-out shift register: stage 0 samples the input,
  // every later stage samples its predecessor.
  val zs = Reg(Vec(numTaps, UInt(bitWidth.W)))
  zs(0) := io.in
  (1 until numTaps).foreach { stage =>
    zs(stage) := zs(stage - 1)
  }

  // One product per tap.
  val products = VecInit.tabulate(numTaps) { tap => zs(tap) * coeffs(tap) }

  // Accumulate with the expanding add so that no carry bit is dropped.
  io.out := products.reduce { (acc, term) => acc +& term }
}

In [62]:
// Disabled alternative: three unit coefficients, i.e. the sum of three consecutive samples.
// Unlike MovingAverage3, FirFilter stores the input in a register before the first tap.
//visualize(() => new FirFilter(8, Seq(1.U, 1.U, 1.U)))

// Disabled alternative: pure delay as a FIR filter; coefficients (0, 1) pass only the
// second register of the delay line to the output.
//visualize(() => new FirFilter(8, Seq(0.U, 1.U)))

// Active: 5-point FIR filter with a triangle impulse response (1, 2, 3, 2, 1).
visualize { () =>
  val triangle = Seq(1, 2, 3, 2, 1).map(_.U)
  new FirFilter(8, triangle)
}

# How does chisel4ml use Chisel?

In [63]:
/** Generic neuron generator: multiply inputs by weights, accumulate, scale, then activate.
  *
  * The arithmetic is supplied by the caller, so the same generator serves different
  * quantization schemes (different `mul`, `add` and `actFn` functions).
  */
object Neuron {
    /** Builds the hardware for a single neuron and returns its activation.
      *
      * @tparam I input element type
      * @tparam W weight element type
      * @tparam M type of a single input-weight product
      * @tparam A accumulator type (result of `add`, also the type of `thresh`)
      * @tparam O output type of the activation function
      * @param in      input values, one per weight
      * @param weights weights, one per input
      * @param thresh  threshold handed to `actFn` as its second argument
      * @param mul     multiplies one input by one weight
      * @param add     reduces the vector of products to a single accumulator value
      * @param actFn   activation function applied to (scaled accumulator, threshold)
      * @param shift   power-of-two scaling of the accumulator: a positive value shifts left,
      *                a negative value shifts right, zero leaves it unchanged
      * @return the value produced by `actFn`
      * @throws IllegalArgumentException if `in` and `weights` differ in length
      */
    def apply[I <: Bits,
              W <: Bits,
              M <: Bits,
              A <: Bits,
              O <: Bits](in: Seq[I],
                         weights: Seq[W],
                         thresh: A,
                         mul: (I, W) => M,
                         add: Vec[M] => A,
                         actFn: (A, A) => O,
                         shift: Int): O = {
        // `zip` silently truncates to the shorter sequence, which would drop inputs or weights.
        require(in.length == weights.length,
                s"Neuron needs one weight per input, got ${in.length} inputs and ${weights.length} weights")
        val muls = VecInit((in zip weights).map{
            case (a,b) => mul(a,b)
        })
        val pAct = add(muls)
        // Honour the sign of `shift`: the previous `shift.abs` turned a negative shift
        // into a left shift of the same magnitude.
        val shifted = if (shift >= 0) pAct << shift else pAct >> (-shift)
        // Cast back so the activation function sees the accumulator's own type.
        val sAct = shifted.asTypeOf(pAct)
        actFn(sAct, thresh)
    }
}

defined [32mobject[39m [36mNeuron[39m

In [64]:
// Multiply/accumulate building blocks, one pair per quantization scheme.

// Uniform quantization: signed product of input and weight.
def mulUQ(i: SInt, w: SInt): SInt = i * w    // Uniform quantization
// Uniform quantization: sums all products as a tree, using the expanding add `+&`.
def addUQ = (x: Vec[SInt]) => x.reduceTree(_ +& _)

// Binary weights: the weight bit selects the input (true) or the negated input (false).
def mulBW = (i: SInt, w: Bool) => Mux(w, i, -i)  // Binary weight quantization

// Binarized inputs and weights: XNOR of the two bits.
def mulBNN(i: Bool, w: Bool): Bool = ~(i ^ w) // Binarized quantization
// Binarized accumulation: PopCount over the product bits packed into a UInt.
def addBNN = (x: Vec[Bool]) => PopCount(x.asUInt)

/** Thresholded ReLU: `act - thresh` when that difference is positive, otherwise 0.
  *
  * @param act    signed pre-activation value
  * @param thresh signed threshold subtracted from `act`
  * @return the positive difference as an unsigned value, or 0
  */
def reluFn(act: SInt, thresh: SInt): UInt = {
  // `-&` is the expanding subtraction: the extra result bit keeps a large positive
  // difference from wrapping around to a negative value and being clamped to 0.
  val diff = act -& thresh
  Mux(diff > 0.S, diff.asUInt, 0.U)
}
// Sign activation: true when the activation is greater than or equal to the threshold.
def signFn(act:UInt, thresh: UInt): Bool = act >= thresh

defined [32mfunction[39m [36mmulUQ[39m
defined [32mfunction[39m [36maddUQ[39m
defined [32mfunction[39m [36mmulBW[39m
defined [32mfunction[39m [36mmulBNN[39m
defined [32mfunction[39m [36maddBNN[39m
defined [32mfunction[39m [36mreluFn[39m
defined [32mfunction[39m [36msignFn[39m

In [65]:
/** Demo module: one uniformly quantized neuron with three signed 4-bit inputs.
  *
  * Uses the weights (1, -2, 3), a threshold of -1, a shift of 1 and the
  * `mulUQ` / `addUQ` / `reluFn` functions.
  */
class DummyUniformModule extends Module {
  val io = IO(new Bundle {
    val in  = Input(Vec(3, SInt(4.W)))
    val out = Output(UInt()) // width is left to be inferred
  })

  // Type arguments are spelled out: [input, weight, product, accumulator, output].
  io.out := Neuron[SInt, SInt, SInt, SInt, UInt](
    in      = io.in,
    weights = Seq(1, -2, 3).map(_.S),
    thresh  = (-1).S,
    mul     = mulUQ,
    add     = addUQ,
    actFn   = reluFn,
    shift   = 1
  )
}

defined [32mclass[39m [36mDummyUniformModule[39m

In [68]:
// Disabled alternative: pass the module generator to the `visualize` helper instead.
//visualize(() => new DummyUniformModule())
// Print the FIRRTL text that `getFirrtl` returns for DummyUniformModule.
print(getFirrtl(new DummyUniformModule()))
// Disabled alternative: print the result of `getVerilog` for the same module instead.
//print(getVerilog(new DummyUniformModule()))

circuit DummyUniformModule :
  module DummyUniformModule :
    input clock : Clock
    input reset : UInt<1>
    output io : { flip in : SInt<4>[3], out : UInt}

    node _T = mul(io.in[0], asSInt(UInt<2>("h1"))) @[cmd63.sc 1:39]
    node _T_1 = mul(io.in[1], asSInt(UInt<2>("h2"))) @[cmd63.sc 1:39]
    node _T_2 = mul(io.in[2], asSInt(UInt<3>("h3"))) @[cmd63.sc 1:39]
    wire _WIRE : SInt<7>[3] @[cmd62.sc 13:27]
    _WIRE[0] <= _T @[cmd62.sc 13:27]
    _WIRE[1] <= _T_1 @[cmd62.sc 13:27]
    _WIRE[2] <= _T_2 @[cmd62.sc 13:27]
    node _T_3 = add(_WIRE[0], _WIRE[1]) @[cmd63.sc 2:46]
    wire _WIRE_1 : SInt<8>[2] @[cmd63.sc 2:43]
    _WIRE_1[0] <= _T_3 @[cmd63.sc 2:43]
    _WIRE_1[1] <= _WIRE[2] @[cmd63.sc 2:43]
    node _T_4 = add(_WIRE_1[0], _WIRE_1[1]) @[cmd63.sc 2:46]
    wire _WIRE_2 : SInt<9>[1] @[cmd63.sc 2:43]
    _WIRE_2[0] <= _T_4 @[cmd63.sc 2:43]
    node _T_5 = shl(_WIRE_2[0], 1) @[cmd62.sc 17:26]
    wire _WIRE_3 : SInt<9> @[cmd62.sc 17:48]
    node _T_6 = asUInt(_T_5) @[cmd6

In [69]:
/** Demo module: one binarized neuron with three single-bit inputs.
  *
  * Uses the weights (true, false, true), a threshold of 2, no shift and the
  * `mulBNN` / `addBNN` / `signFn` functions.
  */
class DummyBinarizedModule extends Module {
  val io = IO(new Bundle {
    val in  = Input(Vec(3, Bool()))
    val out = Output(UInt()) // width is left to be inferred
  })

  // Type arguments are spelled out: [input, weight, product, accumulator, output].
  io.out := Neuron[Bool, Bool, Bool, UInt, Bool](
    in      = io.in,
    weights = Seq(true, false, true).map(_.B),
    thresh  = 2.U,
    mul     = mulBNN,
    add     = addBNN,
    actFn   = signFn,
    shift   = 0
  )
}

defined [32mclass[39m [36mDummyBinarizedModule[39m

In [70]:
// Disabled alternative: pass the module generator to the `visualize` helper instead.
//visualize(() => new DummyBinarizedModule())
// Print the FIRRTL text that `getFirrtl` returns for DummyBinarizedModule.
print(getFirrtl(new DummyBinarizedModule()))
// Disabled alternative: print the result of `getVerilog` for the same module instead.
//print(getVerilog(new DummyBinarizedModule()))

circuit DummyBinarizedModule :
  module DummyBinarizedModule :
    input clock : Clock
    input reset : UInt<1>
    output io : { flip in : UInt<1>[3], out : UInt}

    node _T = xor(io.in[0], UInt<1>("h1")) @[cmd63.sc 6:42]
    node _T_1 = not(_T) @[cmd63.sc 6:38]
    node _T_2 = xor(io.in[1], UInt<1>("h0")) @[cmd63.sc 6:42]
    node _T_3 = not(_T_2) @[cmd63.sc 6:38]
    node _T_4 = xor(io.in[2], UInt<1>("h1")) @[cmd63.sc 6:42]
    node _T_5 = not(_T_4) @[cmd63.sc 6:38]
    wire _WIRE : UInt<1>[3] @[cmd62.sc 13:27]
    _WIRE[0] <= _T_1 @[cmd62.sc 13:27]
    _WIRE[1] <= _T_3 @[cmd62.sc 13:27]
    _WIRE[2] <= _T_5 @[cmd62.sc 13:27]
    node hi = cat(_WIRE[2], _WIRE[1]) @[cmd63.sc 7:43]
    node _T_6 = cat(hi, _WIRE[0]) @[cmd63.sc 7:43]
    node _T_7 = bits(_T_6, 0, 0) @[Bitwise.scala 49:65]
    node _T_8 = bits(_T_6, 1, 1) @[Bitwise.scala 49:65]
    node _T_9 = bits(_T_6, 2, 2) @[Bitwise.scala 49:65]
    node _T_10 = add(_T_8, _T_9) @[Bitwise.scala 47:55]
    node _T_11 = bits(_T_10, 1

## Other abstractions in chisel4ml
* ProcessingElement == layer
* ProcessingPipeline == model

<p align="center">
<img src="slike/ProcElementC4ml.png" width=450  height=400 align=left>
<img src="slike/ProcPipelineC4ml.png" width=450 height=400 align=right>
</p>