In [1]:
%install-location $cwd/swift-install
%install '.package(path: "$cwd/FastaiNotebook_01a_fastai_layers")' FastaiNotebook_01a_fastai_layers

Installing packages:
	.package(path: "/home/jmd/workspace/ml/fastai/nbs/swift/FastaiNotebook_01a_fastai_layers")
		FastaiNotebook_01a_fastai_layers
With SwiftPM flags: []
Working in: /tmp/tmpejz8j8mm/swift-install
/home/jmd/swift/usr/bin/swift-build: /home/jmd/anaconda3/lib/libcurl.so.4: no version information available (required by /home/jmd/swift/usr/lib/swift/linux/libFoundationNetworking.so)
Updating https://github.com/saeta/Just
Updating https://github.com/latenitesoft/NotebookExport
Updating https://github.com/mxcl/Path.swift
[1/6] Compiling FastaiNotebook_01a_fastai_layers 01_matmul.swift
[2/6] Compiling FastaiNotebook_01a_fastai_layers 00_load_data.swift
[3/6] Compiling FastaiNotebook_01a_fastai_layers 01a_fastai_layers.swift
[4/7] Merging module FastaiNotebook_01a_fastai_layers
[5/8] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
[6/9] Wrapping AST for FastaiNotebook_01a_fastai_layers for debugging
[7/9] Merging module jupyterInstalledPackages
[8/8] Linking 

In [2]:
//export
import Path
import TensorFlow

In [3]:
import FastaiNotebook_01a_fastai_layers

In [4]:
// export
/// Shorthand for a `Tensor` whose scalars are `Float`; used for every tensor in this notebook.
public typealias TF=Tensor<Float>

In [5]:
// export
/// Centers `x` on `mean` and rescales it by `std`.
///
/// - Parameters:
///   - x: The tensor to normalize.
///   - mean: The value subtracted from `x`.
///   - std: The value the centered tensor is divided by.
/// - Returns: `(x - mean) / std`.
public func normalize(_ x:TF, mean:TF, std:TF) -> TF {
    let centered = x - mean
    return centered / std
}

In [6]:
// Load the MNIST splits with `loadMNIST` (defined outside this notebook).
// Declared `var` because `xTrain` and `xValid` are overwritten with their normalized versions below.
// NOTE(review): `flat: true` presumably yields one flattened row per image — confirm against the loader.
var (xTrain, yTrain, xValid, yValid) = loadMNIST(path: mnistPath, flat: true)

In [14]:
// Mean and standard deviation over the whole training tensor; both are reused below
// to normalize the training and the validation inputs.
let trainMean = xTrain.mean()
let trainStd  = xTrain.standardDeviation()
print(trainMean, trainStd)

0.13066047 0.3081079


In [15]:
// Normalize both splits with the *training* statistics, so the validation set goes
// through exactly the same transform as the training set.
xTrain = normalize(xTrain, mean: trainMean, std: trainStd)
xValid = normalize(xValid, mean: trainMean, std: trainStd)

In [16]:
//export
/// Asserts that every element of `a` is within `tolerance` of zero.
///
/// - Parameters:
///   - a: The tensor expected to be (approximately) all zeros.
///   - tolerance: Strict upper bound on the absolute value of each element.
public func testNearZero(_ a: TF, tolerance: Float = 1e-3) {
    // The condition stays inside `assert` so it is only evaluated when assertions are enabled.
    assert(
        (abs(a) .< tolerance).all(),
        "Near zero: \(a)"
    )
}

/// Asserts that `a` and `b` have the same shape and are element-wise approximately equal.
///
/// - Parameters:
///   - a: The first tensor.
///   - b: The second tensor; must have exactly the same shape as `a`.
public func testSame(_ a: TF, _ b: TF) {
    // Check shapes match so broadcasting doesn't hide shape errors.
    // The message reports both shapes so a failure is diagnosable without a debugger.
    assert(a.shape == b.shape, "Shape mismatch: \(a.shape) vs \(b.shape)")
    testNearZero(a-b)
}

In [18]:
testNearZero(xTrain.mean())
testNearZero(xTrain.standardDeviation() - 1.0)

In [19]:
// n: number of training rows, m: number of columns (features) per row.
let (n,m) = (xTrain.shape[0],xTrain.shape[1])
// Largest label plus one.
// NOTE(review): this equals the number of classes only if labels are 0-based and contiguous — confirm.
let c = yTrain.max()+1
print(n, m, c)

60000 784 10


In [20]:
//num hidden
let nh = 50

In [21]:
// simplified kaiming init / he init
// Each weight matrix is drawn from a standard normal and divided by sqrt(number of inputs)
// of its layer; biases start at zero.
let w1 = TF(randomNormal: [m, nh]) / sqrt(Float(m))
let b1 = TF(zeros: [nh])
// The second layer has a single output column.
let w2 = TF(randomNormal: [nh,1]) / sqrt(Float(nh))
let b2 = TF(zeros: [1])

In [23]:
testNearZero(w1.mean())
testNearZero(w1.standardDeviation()-1/sqrt(Float(m)))

In [25]:
// This should be ~ (0,1) (mean,std)...
print(xValid.mean(), xValid.standardDeviation())

0.0060177343 1.0076997


In [26]:
/// Linear (affine) layer: multiplies `x` by `w` with the `•` operator and adds the bias `b`.
///
/// - Parameters:
///   - x: The layer input.
///   - w: The weights.
///   - b: The bias added to the product.
/// - Returns: `x • w + b`.
func lin(_ x: TF, _ w: TF, _ b: TF) -> TF {
    let product = x • w
    return product + b
}

In [27]:
let t = lin(xValid, w1, b1)

In [28]:
//...so should this, because we used kaiming init, which is designed to do this
print(t.mean(), t.standardDeviation())

0.087955624 0.9942245


In [29]:
/// Rectified linear unit: returns `max(x, 0)`.
func myRelu(_ x:TF) -> TF {
    let rectified = max(x, 0)
    return rectified
}

In [30]:
let t = myRelu(lin(xValid, w1, b1))

In [31]:
//...actually it really should be this!
print(t.mean(),t.standardDeviation())

0.43534523 0.6163497


In [32]:
// kaiming init / he init for relu
// Same shape as the earlier `w1`, but scaled by sqrt(2/m) instead of 1/sqrt(m).
// This cell re-declares `w1`, replacing the earlier definition in the notebook session.
let w1 = TF(randomNormal: [m,nh]) * sqrt(2.0/Float(m))

In [33]:
print(w1.mean(), w1.standardDeviation())

-0.00022416159 0.05053568


In [36]:
let t = myRelu(lin(xValid, w1, b1))
print(t.mean(), t.standardDeviation())

0.46108657 0.7441583


In [37]:
/// Two-layer network: linear -> ReLU -> linear, using the global parameters
/// `w1`, `b1`, `w2` and `b2`.
///
/// - Parameter xb: The input batch.
/// - Returns: The output of the second linear layer.
func model(_ xb: TF) -> TF {
    let hidden = myRelu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)
}

In [38]:
time(repeating: 10) { _ = model(xValid) }

average: 0.4939547 ms,   min: 0.456631 ms,   max: 0.567361 ms


In [39]:
let preds = model(xTrain)

In [40]:
// export
/// Mean squared error between predictions and targets.
///
/// - Parameters:
///   - out: Predictions; the last axis is squeezed away before comparing.
///   - targ: Targets.
/// - Returns: The mean of the squared differences.
public func mse(_ out: TF, _ targ: TF) -> TF {
    let residual = out.squeezingShape(at: -1) - targ
    return residual.squared().mean()
}

In [41]:
// Convert these to Float dtype.
// The float copies are what `mse` and the gradient code below take as targets.
var yTrainF = TF(yTrain)
var yValidF = TF(yValid)

In [42]:
mse(preds, yTrainF)

27.051638


In [43]:
/// Pairs a tensor with a slot for its gradient.
///
/// WARNING: This is designed to be similar to the PyTorch 02_fully_connected lesson,
/// this isn't idiomatic Swift code.
class TFGrad {
    /// The wrapped value.
    var inner: TF
    /// The gradient with respect to `inner`; starts as zeros of the same shape.
    var grad: TF

    /// Wraps `x` and initializes its gradient to zeros.
    init(_ x: TF) {
        inner = x
        grad = TF(zeros: x.shape)
    }
}

In [44]:
// Redefine our functions on TFGrad.

/// Linear layer on wrapped tensors: computes `x • w + b` from the `inner` values and
/// wraps the result in a new `TFGrad` (whose gradient starts at zero).
func lin(_ x: TFGrad, _ w: TFGrad, _ b: TFGrad) -> TFGrad {
    let affine = x.inner • w.inner + b.inner
    return TFGrad(affine)
}
/// ReLU on a wrapped tensor: applies `max(·, 0)` to `x.inner` and wraps the result
/// in a new `TFGrad`.
func myRelu(_ x: TFGrad) -> TFGrad {
    let rectified = max(x.inner, 0)
    return TFGrad(rectified)
}
/// Mean squared error between the wrapped predictions and `targ`.
///
/// This only computes the loss value; the matching gradient is written by `mseGrad`.
func mse(_ inp: TFGrad, _ targ: TF) -> TF {
    // Drop the trailing axis of the predictions before comparing with the targets.
    let residual = inp.inner.squeezingShape(at: -1) - targ
    return residual.squared().mean()
}

In [45]:
// Define our gradient functions.

/// Stores in `inp.grad` the gradient of the MSE loss with respect to `inp.inner`.
///
/// - Parameters:
///   - inp: The network output; its `grad` field is overwritten.
///   - targ: The targets.
func mseGrad(_ inp: TFGrad, _ targ: TF) {
    let residual = inp.inner.squeezingShape(at: -1) - targ
    // The size of the first axis of the predictions is used as the divisor.
    let rowCount = Float(inp.inner.shape[0])
    // Restore the trailing axis so the gradient has the same rank as `inp.inner`.
    inp.grad = 2.0 * residual.expandingShape(at: -1) / rowCount
}

/// Stores in `inp.grad` the gradient of ReLU with respect to its input activations.
///
/// The upstream gradient `out.grad` is passed through unchanged, except where the
/// input was strictly negative, where it is replaced by zero.
func reluGrad(_ inp: TFGrad, _ out: TFGrad) {
    let zeros = TF(zeros: inp.inner.shape)
    let wasNegative = inp.inner .< 0
    inp.grad = out.grad.replacing(with: zeros, where: wasNegative)
}

In [46]:
/// Back-propagates through a linear layer, filling in the gradients of its input,
/// weights and bias from the upstream gradient `out.grad`.
///
/// - Parameters:
///   - inp: The layer input; receives `out.grad • wᵀ`.
///   - out: The layer output, whose `grad` must already be set.
///   - w: The weights; receives `inpᵀ • out.grad`.
///   - b: The bias; receives `out.grad` summed over axis 0.
func linGrad(_ inp:TFGrad, _ out:TFGrad, _ w:TFGrad, _ b:TFGrad){
    let weightsT = w.inner.transposed()
    inp.grad = out.grad • weightsT

    let inputT = inp.inner.transposed()
    w.grad = inputT • out.grad

    b.grad = out.grad.sum(squeezingAxes: 0)
}

In [47]:
// Wrap each parameter in a TFGrad so the manual backward pass has somewhere to
// store its gradient.
let w1a = TFGrad(w1)
let b1a = TFGrad(b1)
let w2a = TFGrad(w2)
let b2a = TFGrad(b2)

In [48]:
/// Runs one forward pass through the two-layer network and then back-propagates the
/// MSE loss by hand, writing every gradient into the `grad` field of the matching
/// `TFGrad`.
///
/// Uses the global parameter wrappers `w1a`, `b1a`, `w2a` and `b2a`.
///
/// - Parameters:
///   - inp: The wrapped input batch; its `grad` is filled in as well.
///   - targ: The targets.
func forwardAndBackward(_ inp:TFGrad, _ targ:TF){
    // forward pass:
    let l1 = lin(inp, w1a, b1a)
    let l2 = myRelu(l1)
    let out = lin(l2, w2a, b2a)
    // We don't actually need the loss in backward! It is still computed to mirror a
    // full forward pass, but the result is discarded explicitly rather than bound to
    // an unused local.
    _ = mse(out, targ)
    
    // backward pass: each call consumes the `grad` written by the previous one, so
    // the order is the reverse of the forward pass.
    mseGrad(out, targ)
    linGrad(l2, out, w2a, b2a)
    reluGrad(l1, l2)
    linGrad(inp, l1, w1a, b1a)
}

In [49]:
let inp = TFGrad(xTrain)

In [50]:
forwardAndBackward(inp, yTrainF)

In [51]:
// Prints the name of the calling function. This is useful for seeing the order in
// which functions run in your program.
//
// `function` defaults to `#function`, so callers normally pass no argument and the
// caller's own name is what gets printed.
func trace(function: String = #function) {
  print(function)
}

// Try out the trace helper function.

/// Demo function: logs its own name via `trace()` and returns `a + b`.
func foo(a: Int, b: Int) -> Int {
  trace()
  let sum = a + b
  return sum
}
/// Demo function: logs its own name via `trace()` and returns `x * 42 + 17`.
func bar(x: Int) -> Int {
  trace()
  let scaled = x * 42
  return scaled + 17
}

foo(a: 1, b: 2)
bar(x: 17)


foo(a:b:)
bar(x:)


731


In [52]:
/// Element-wise square of `x`; logs the call via `trace()`.
func square(_ x: TF) -> TF {
    trace()
    let squared = x * x
    return squared
}
/// Derivative of `square` with respect to `x`, i.e. `2 * x`; logs the call via `trace()`.
func 𝛁square(_ x: TF) -> TF {
    trace()
    let derivative = 2 * x
    return derivative
}

/// Mean of `x`, delegating to `Tensor.mean()`; logs the call via `trace()`.
func mean(_ x: TF) -> TF {
    trace()
    // this is a reduction.  (can someone write this out longhand?)
    let average = x.mean()
    return average
}
/// Gradient of `mean(x)` with respect to `x`; logs the call via `trace()`.
///
/// Every element contributes equally to the mean, so each partial derivative is
/// `1 / N`, where `N` is the total number of elements of `x`.
///
/// - Parameter x: The tensor the mean was taken over.
/// - Returns: A tensor shaped like `x` in which every element is `1 / x.scalarCount`.
func 𝛁mean(_ x: TF) -> TF {
    trace()
    // Divide by the total element count rather than `x.shape[0]`: `x.mean()` reduces
    // over all elements, and `shape[0]` only exists when `x` has at least one axis.
    // For the 1-D inputs used in this notebook both give the same result.
    return TF(ones: x.shape) / Float(x.scalarCount)
}

In [53]:
/// The core of MSE: the mean of the element-wise square of `x`.
func mseInner(_ x: TF) -> TF {
    let squared = square(x)
    return mean(squared)
}

/// Gradient of `mseInner` with respect to `x`, built with the chain rule: the
/// gradient of `mean` evaluated at `square(x)`, times the gradient of `square` at `x`.
func 𝛁mseInner(_ x: TF) -> TF {
    // The calls stay in the original order so the trace output is unchanged.
    let squared = square(x)
    let outer = 𝛁mean(squared)
    let inner = 𝛁square(x)
    return outer * inner
}

In [54]:
/// Computes `mseInner(x)` together with its hand-written gradient.
///
/// - Returns: A pair `(value, gradient)`.
func mseInnerAndGrad(_ x: TF) -> (TF, TF) {
  let value = mseInner(x)
  let gradient = 𝛁mseInner(x)
  return (value, gradient)
}

// Small 1-D input used to exercise the hand-written gradient.
let exampleData = TF([1, 2, 3, 4])

// Evaluate the value and the hand-derived gradient; the helpers call trace(), so the
// order in which they run is printed as a side effect.
let (mseInnerResult1, mseInnerGrad1) = mseInnerAndGrad(exampleData)
print()

print("result:", mseInnerResult1)
print("gradient:", mseInnerGrad1)

// Check that our gradient matches builtin S4TF's autodiff.
let builtinGrad = gradient(at: exampleData) { x in (x*x).mean() }
testSame(mseInnerGrad1, builtinGrad)

square(_:)
mean(_:)
square(_:)
𝛁mean(_:)
𝛁square(_:)

result: 7.5
gradient: [0.5, 1.0, 1.5, 2.0]


In [55]:
// The chainer for the gradient of square(x).
//
// Multiplies the incoming gradient `ddx` by the local derivative `2 * x`, and logs
// the call via trace().
func square𝛁Chain(x: TF, ddx: TF) -> TF {
  trace()
  let doubled = ddx * 2
  return doubled * x
}

// The chainer for the gradient of mean(x).
//
// Spreads the incoming gradient `ddx` evenly over every element of `x`: each element
// receives `ddx / N`, where `N` is the total number of elements of `x`. Logs the
// call via trace().
func mean𝛁Chain(x: TF, ddx: TF) -> TF {
  trace()
  // Divide by the total element count rather than `x.shape[0]`: `x.mean()` reduces
  // over all elements, and `shape[0]` only exists when `x` has at least one axis.
  // For the 1-D inputs used in this notebook both give the same result.
  return ddx * TF(ones: x.shape) / Float(x.scalarCount)
}

In [56]:
// Returns x*x together with the chain (pullback) for the gradient of x*x.
// The value is computed immediately; the chain only runs when it is called later.
func squareVWC(_ x: TF) -> (value: TF, chain: (TF) -> TF) {
  trace()
  let squared = x * x
  let pullback: (TF) -> TF = { ddx in
    square𝛁Chain(x: x, ddx: ddx)
  }
  return (value: squared, chain: pullback)
}

// Returns the mean of x together with the chain (pullback) for the mean.
// The value is computed immediately; the chain only runs when it is called later.
func meanVWC(_ x: TF) -> (value: TF, chain: (TF) -> TF) {
  trace()
  let average = x.mean()
  let pullback: (TF) -> TF = { ddx in
    mean𝛁Chain(x: x, ddx: ddx)
  }
  return (value: average, chain: pullback)
}

In [57]:
// We implement mean(square(x)) by calling each of the VWCs in turn, then compose
// their chains in the opposite order to get the pullback of the whole expression.
func mseInnerVWC(_ x: TF) -> (value: TF, chain: (TF) -> TF) {
  // Each step carries both its value and its chain.
  let squareStep = squareVWC(x)
  let meanStep = meanVWC(squareStep.value)

  // Backward runs in reverse: first through mean, then through square.
  let pullback: (TF) -> TF = { v in
    let 𝛁squared = meanStep.chain(v)
    return squareStep.chain(𝛁squared)
  }
  return (value: meanStep.value, chain: pullback)
}

In [58]:
print("Calling the forward function:")
let mseInner2 = mseInnerVWC(exampleData)
print()

testSame(mseInner2.value, mseInnerResult1)


print("Calling the backward function:")
let mseInnerGrad2 = mseInner2.chain(TF(1))
print()

print(mseInnerGrad2)
// Check that we get the same result.
testSame(mseInnerGrad2, builtinGrad)


Calling the forward function:
squareVWC(_:)
meanVWC(_:)

Calling the backward function:
mean𝛁Chain(x:ddx:)
square𝛁Chain(x:ddx:)

[0.5, 1.0, 1.5, 2.0]


In [59]:
/// ReLU as a value-with-chain: returns `max(x, 0)` and its pullback.
///
/// The pullback passes the upstream gradient through unchanged, except where `x` was
/// strictly negative, where it is replaced by zero.
func reluVWC(_ x: TF) -> (value: TF, chain: (TF) -> TF) {
    let activated = max(x, 0)
    // Pullback for max(x, 0)
    let pullback: (TF) -> TF = { 𝛁out in
        let zeros = TF(zeros: x.shape)
        return 𝛁out.replacing(with: zeros, where: x .< 0)
    }
    return (value: activated, chain: pullback)
}

In [60]:
/// Linear layer as a value-with-chain: returns `inp • w + b` and its pullback.
///
/// The pullback maps the upstream gradient to three gradients, one per argument and
/// in argument order: input, weights, bias.
func linVWC(_ inp: TF, _ w: TF, _ b: TF) -> (value: TF,
                                             chain: (TF) -> (TF, TF, TF)) {
    let output = inp • w + b
    // Pullback for inp • w + b.  Three results because 'lin' has three args.
    let pullback: (TF) -> (TF, TF, TF) = { 𝛁out in
        let 𝛁inp = 𝛁out • w.transposed()
        let 𝛁w = inp.transposed() • 𝛁out
        // Reduce the gradient back down to the bias shape.
        let 𝛁b = 𝛁out.unbroadcasted(to: b.shape)
        return (𝛁inp, 𝛁w, 𝛁b)
    }
    return (value: output, chain: pullback)
}

In [61]:
/// MSE as a value-with-chain.
///
/// - Parameters:
///   - inp: Predictions; the last axis is squeezed away before comparing.
///   - targ: Targets.
/// - Returns: The loss value and a pullback yielding the gradient with respect to `inp`.
func mseVWC(_ inp: TF, _ targ: TF) -> (value: TF,
                                       chain: (TF) -> (TF)) {
    let residual = inp.squeezingShape(at: -1) - targ

    // A VWC for x.square().mean() already exists, so it is reused here.
    let inner = mseInnerVWC(residual)

    // The pullback re-adds the trailing axis that was squeezed off above.
    let pullback: (TF) -> TF = { v in
        inner.chain(v).expandingShape(at: -1)
    }
    return (value: inner.value, chain: pullback)
}

In [62]:
/// Forward and backward pass of the two-layer network, built from value-with-chain
/// steps and the global parameters `w1`, `b1`, `w2` and `b2`.
///
/// - Parameters:
///   - inp: The input batch.
///   - targ: The targets.
/// - Returns: The gradients with respect to, in order: the input, `w1`, `b1`, `w2`, `b2`.
func forwardAndBackward(_ inp: TF, _ targ: TF) -> (TF, TF, TF, TF, TF) {
    // Forward pass: every step keeps its value together with its pullback.
    let hidden    = linVWC(inp, w1, b1)
    let activated = reluVWC(hidden.value)
    let output    = linVWC(activated.value, w2, b2)
    // The loss value itself isn't needed for backward, but its pullback is.
    let loss      = mseVWC(output.value, targ)

    // Backward pass: call the pullbacks in reverse order, seeded with 1 (the
    // gradient of the loss with respect to itself).
    let seed = TF(1)
    let 𝛁output = loss.chain(seed)
    let (𝛁activated, 𝛁w2, 𝛁b2) = output.chain(𝛁output)
    let 𝛁hidden = activated.chain(𝛁activated)
    let (𝛁inp, 𝛁w1, 𝛁b1) = hidden.chain(𝛁hidden)
    return (𝛁inp, 𝛁w1, 𝛁b1, 𝛁w2, 𝛁b2)
}

In [63]:
// Run the value-with-chain implementation on the full training set.
let (𝛁xTrain, 𝛁w1, 𝛁b1, 𝛁w2, 𝛁b2) = forwardAndBackward(xTrain, yTrainF)
// Check this is still all correct
// (each gradient is compared with the one the TFGrad-based backward pass stored earlier).
testSame(inp.grad, 𝛁xTrain)
testSame(w1a.grad, 𝛁w1)
testSame(b1a.grad, 𝛁b1)
testSame(w2a.grad, 𝛁w2)
testSame(b2a.grad, 𝛁b2)


squareVWC(_:)
meanVWC(_:)
mean𝛁Chain(x:ddx:)
square𝛁Chain(x:ddx:)


In [64]:
/// Mean of the element-wise square of `x`, marked `@differentiable` so the built-in
/// automatic differentiation can derive its gradient.
@differentiable
func mseInnerForAD(_ x: TF) -> TF {
    let squared = x.squared()
    return squared.mean()
}

let mseInner𝛁Chain = pullback(at: exampleData, in: mseInnerForAD)
print(type(of: mseInner𝛁Chain))

(Tensor<Float>) -> Tensor<Float>


In [65]:
let (value, grad) = valueWithGradient(at: exampleData, in: mseInnerForAD)

print("value: \(value), grad:  \(grad)")

value: 7.5, grad:  [0.5, 1.0, 1.5, 2.0]


In [66]:
gradient(at: exampleData) { ($0*$0).mean() }

[0.5, 1.0, 1.5, 2.0]


In [67]:
/// Differentiable forward pass of the two-layer network followed by the MSE loss,
/// with every parameter passed in explicitly.
///
/// - Parameters:
///   - inp: The input batch.
///   - targ: The targets.
///   - w1: First-layer weights.
///   - b1: First-layer bias.
///   - w2: Second-layer weights.
///   - b2: Second-layer bias.
/// - Returns: The mean squared error between the network output and `targ`.
@differentiable
func forward(_ inp: TF, _ targ: TF, w1: TF, b1: TF, w2: TF, b2: TF) -> TF {
    // FIXME: use lin
    let hidden = relu(matmul(inp, w1) + b1)
    let output = matmul(hidden, w2) + b2
    let residual = output.squeezingShape(at: -1) - targ
    return residual.squared().mean()
}

In [68]:
/// The four parameters of the two-layer network, grouped in one `Differentiable`
/// struct so that a gradient can be taken with respect to all of them at once.
struct MyModel: Differentiable {
    /// First-layer weights.
    public var w1: TF
    /// First-layer bias.
    public var b1: TF
    /// Second-layer weights.
    public var w2: TF
    /// Second-layer bias.
    public var b2: TF
}

// Create an instance of our model with all the individual parameters we initialized.
// Note: this re-declares `model`, which earlier in the notebook was a function.
let model = MyModel(w1: w1, b1: b1, w2: w2, b2: b2)

In [69]:
extension MyModel {
    /// Differentiable forward pass followed by the MSE loss, using the parameters
    /// stored in the model.
    ///
    /// - Parameters:
    ///   - input: The input batch.
    ///   - target: The targets.
    /// - Returns: The mean squared error between the network output and `target`.
    @differentiable
    func forward(_ input: TF, _ target: TF) -> TF {
        // FIXME: use lin
        let hidden = relu(matmul(input, w1) + b1)
        let output = matmul(hidden, w2) + b2
        // use mse
        let residual = output.squeezingShape(at: -1) - target
        return residual.squared().mean()
    }
}

In [70]:
// Grads is a struct with one gradient per parameter.
// The gradient is taken with respect to the model only; the inputs and targets are
// captured by the closure.
let grads = gradient(at: model) { model in model.forward(xTrain, yTrainF) }

// Check that this still calculates the same thing.
// (compares the hand-written VWC gradients with the ones from automatic differentiation)
testSame(𝛁w1, grads.w1)
testSame(𝛁b1, grads.b1)
testSame(𝛁w2, grads.w2)
testSame(𝛁b2, grads.b2)

In [71]:
time(repeating: 10) { _ = forwardAndBackward(xTrain, yTrainF) }

squareVWC(_:)
meanVWC(_:)
mean𝛁Chain(x:ddx:)
square𝛁Chain(x:ddx:)
squareVWC(_:)
meanVWC(_:)
mean𝛁Chain(x:ddx:)
square𝛁Chain(x:ddx:)
squareVWC(_:)
meanVWC(_:)
mean𝛁Chain(x:ddx:)
square𝛁Chain(x:ddx:)
squareVWC(_:)
meanVWC(_:)
mean𝛁Chain(x:ddx:)
square𝛁Chain(x:ddx:)
squareVWC(_:)
meanVWC(_:)
mean𝛁Chain(x:ddx:)
square𝛁Chain(x:ddx:)
squareVWC(_:)
meanVWC(_:)
mean𝛁Chain(x:ddx:)
square𝛁Chain(x:ddx:)
squareVWC(_:)
meanVWC(_:)
mean𝛁Chain(x:ddx:)
square𝛁Chain(x:ddx:)
squareVWC(_:)
meanVWC(_:)
mean𝛁Chain(x:ddx:)
square𝛁Chain(x:ddx:)
squareVWC(_:)
meanVWC(_:)
mean𝛁Chain(x:ddx:)
square𝛁Chain(x:ddx:)
squareVWC(_:)
meanVWC(_:)
mean𝛁Chain(x:ddx:)
square𝛁Chain(x:ddx:)
squareVWC(_:)
meanVWC(_:)
mean𝛁Chain(x:ddx:)
square𝛁Chain(x:ddx:)
average: 16.4167796 ms,   min: 12.753511 ms,   max: 19.997692 ms


In [72]:
time(repeating: 10) {
    _ = valueWithGradient(at: model) { 
        model in model.forward(xTrain, yTrainF)
    }
}

average: 9.8112337 ms,   min: 8.089194 ms,   max: 12.696444 ms


In [73]:
// Export this notebook with NotebookExport and print the result of the export call.
// NOTE(review): presumably only cells marked `//export` are written out, into a
// package named with the given prefix — confirm against the NotebookExport docs.
import NotebookExport
let exporter = NotebookExport(Path.cwd/"02_fully_connected.ipynb")
print(exporter.export(usingPrefix: "FastaiNotebook_"))

success
