second recording
joelgrus committed Nov 21, 2017
1 parent ed10f0c commit 7c4c368
Showing 9 changed files with 73 additions and 56 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -24,9 +24,9 @@ type hinting, Python 3.6, and of course deep learning.
# Here's the plan:

1. Tensors
2. Layers
3. Neural Nets
4. Loss Functions
2. Loss Functions
3. Layers
4. Neural Nets
5. Optimizers
6. Data
7. Training
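Step 1, Tensors, isn't touched by this commit. Elsewhere in the repo, joelnet/tensor.py presumably just aliases the NumPy array type so the rest of the library can type-hint against Tensor; a minimal sketch under that assumption:

```python
# joelnet/tensor.py (not part of this diff) -- assumed to be a bare alias,
# so layers, losses, and optimizers can annotate arguments as Tensor
# while working with ordinary NumPy arrays.
import numpy as np

Tensor = np.ndarray
```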
20 changes: 11 additions & 9 deletions fizzbuzz.py
@@ -1,20 +1,20 @@
"""
fizzbuzz is the terrible programming interview problem
where for each of the numbers from 1 to 100,
FizzBuzz is the following problem:
For each of the numbers 1 to 100:
* if the number is divisible by 3, print "fizz"
* if the number is divisible by 5, print "buzz"
* if the number is divisible by 15, print "fizzbuzz"
* otherwise just print the number itself
* otherwise, just print the number
"""
from typing import List

import numpy as np

from joelnet.train import train
from joelnet.nn import NeuralNet
from joelnet.layers import Linear, Tanh
from joelnet.optim import SGD
from joelnet.train import train

def fizz_buzz_encode(x: int) -> List[int]:
if x % 15 == 0:
@@ -26,18 +26,21 @@ def fizz_buzz_encode(x: int) -> List[int]:
else:
return [1, 0, 0, 0]


def binary_encode(x: int) -> List[int]:
"""
return the 10 binary digits of x
10 digit binary encoding of x
"""
return [x >> i & 1 for i in range(10)]

inputs = np.array([
binary_encode(x) for x in range(101, 1024)
binary_encode(x)
for x in range(101, 1024)
])

targets = np.array([
fizz_buzz_encode(x) for x in range(101, 1024)
fizz_buzz_encode(x)
for x in range(101, 1024)
])

net = NeuralNet([
@@ -53,8 +56,7 @@ def binary_encode(x: int) -> List[int]:
optimizer=SGD(lr=0.001))

for x in range(1, 101):
inputs = binary_encode(x)
predicted = net.forward(inputs)
predicted = net.forward(binary_encode(x))
predicted_idx = np.argmax(predicted)
actual_idx = np.argmax(fizz_buzz_encode(x))
labels = [str(x), "fizz", "buzz", "fizzbuzz"]
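The middle branches of fizz_buzz_encode are collapsed in this view. Going by the label order [str(x), "fizz", "buzz", "fizzbuzz"] used at prediction time, the two encoders presumably look like the sketch below (the two middle branches are an assumption):

```python
from typing import List

def fizz_buzz_encode(x: int) -> List[int]:
    # One-hot target matching labels = [str(x), "fizz", "buzz", "fizzbuzz"]
    if x % 15 == 0:
        return [0, 0, 0, 1]   # "fizzbuzz"
    elif x % 5 == 0:
        return [0, 0, 1, 0]   # "buzz"
    elif x % 3 == 0:
        return [0, 1, 0, 0]   # "fizz"
    else:
        return [1, 0, 0, 0]   # the number itself

def binary_encode(x: int) -> List[int]:
    # 10 binary digits of x, least significant bit first
    return [x >> i & 1 for i in range(10)]

assert binary_encode(10) == [0, 1, 0, 1, 0, 0, 0, 0, 0, 0]   # 10 = 0b1010
assert fizz_buzz_encode(10) == [0, 0, 1, 0]                  # divisible by 5 -> "buzz"
```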
9 changes: 6 additions & 3 deletions joelnet/data.py
@@ -1,19 +1,21 @@
"""
We will train our networks by feeding
batches of data through them
We'll feed inputs into our network in batches.
So here are some tools for iterating over data in batches.
"""
from typing import NamedTuple, Iterator
from typing import Iterator, NamedTuple

import numpy as np

from joelnet.tensor import Tensor

Batch = NamedTuple("Batch", [("inputs", Tensor), ("targets", Tensor)])


class DataIterator:
def __call__(self, inputs: Tensor, targets: Tensor) -> Iterator[Batch]:
raise NotImplementedError


class BatchIterator(DataIterator):
def __init__(self, batch_size: int = 32, shuffle: bool = True) -> None:
self.batch_size = batch_size
@@ -23,6 +25,7 @@ def __call__(self, inputs: Tensor, targets: Tensor) -> Iterator[Batch]:
starts = np.arange(0, len(inputs), self.batch_size)
if self.shuffle:
np.random.shuffle(starts)

for start in starts:
end = start + self.batch_size
batch_inputs = inputs[start:end]
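The tail of BatchIterator.__call__ is cut off above. Presumably it slices both arrays with the same start offset and yields a Batch per slice; a self-contained sketch under that assumption:

```python
from typing import Iterator, NamedTuple

import numpy as np

Batch = NamedTuple("Batch", [("inputs", np.ndarray), ("targets", np.ndarray)])

class BatchIterator:
    def __init__(self, batch_size: int = 32, shuffle: bool = True) -> None:
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __call__(self, inputs: np.ndarray, targets: np.ndarray) -> Iterator[Batch]:
        # Batch start offsets: 0, batch_size, 2 * batch_size, ...
        starts = np.arange(0, len(inputs), self.batch_size)
        if self.shuffle:
            # Shuffle the order of the batches, not the rows inside them
            np.random.shuffle(starts)

        for start in starts:
            end = start + self.batch_size
            yield Batch(inputs[start:end], targets[start:end])

# e.g. 100 rows with batch_size=32 yields batches of 32, 32, 32, and 4 rows
```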
49 changes: 32 additions & 17 deletions joelnet/layers.py
@@ -1,11 +1,10 @@
"""
Our neural nets will be composed of layers.
We will push inputs forward through the layers
and propagate gradients backward through the layers.abs
Our neural nets will be made up of layers.
Each layer needs to pass its inputs forward
and propagate gradients backward. For example,
a neural net might look like
A sample neural net might look like
inputs -> linear -> tanh -> linear -> output
inputs -> Linear -> Tanh -> Linear -> output
"""
from typing import Dict, Callable

@@ -15,53 +14,65 @@


class Layer:
"""
base class for layers
"""
def __init__(self) -> None:
self.params: Dict[str, Tensor] = {}
self.grads: Dict[str, Tensor] = {}

def forward(self, inputs: Tensor) -> Tensor:
"""
Produce the outputs corresponding to these inputs
"""
raise NotImplementedError

def backward(self, grad: Tensor) -> Tensor:
"""
Backpropagate this gradient through the layer
"""
raise NotImplementedError


class Linear(Layer):
"""
A linear layer computes
output = input @ w + b
computes output = inputs @ w + b
"""
def __init__(self, input_size: int, output_size: int) -> None:
# inputs will be (batch_size, input_size)
# outputs will be (batch_size, output_size)
super().__init__()
self.params["w"] = np.random.randn(input_size, output_size)
self.params["b"] = np.random.randn(output_size)

def forward(self, inputs: Tensor) -> Tensor:
# batch_size, input_size = inputs.shape
"""
outputs = inputs @ w + b
"""
self.inputs = inputs
return inputs @ self.params["w"] + self.params["b"]

def backward(self, grad: Tensor) -> Tensor:
"""
if y = f(x) and x = a * b
if y = f(x) and x = a * b + c
then dy/da = f'(x) * b
then dy/db = f'(x) * a
and dy/db = f'(x) * a
and dy/dc = f'(x)
if now we have x = a @ b
if y = f(x) and x = a @ b + c
then dy/da = f'(x) @ b.T
and dy/db = a.T @ f'(x)
and dy/dc = f'(x)
"""
self.grads["b"] = np.sum(grad, axis=0)
self.grads["w"] = self.inputs.T @ grad
return grad @ self.params["w"].T
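The matrix-calculus rules in the Linear.backward docstring are easy to sanity-check numerically. A standalone sketch (not part of the repo) that takes f = sum, so f'(x) is a matrix of ones, and compares one analytic entry of dy/da against a finite difference:

```python
import numpy as np

np.random.seed(0)
a = np.random.randn(4, 3)   # stands in for a batch of inputs (batch_size=4, input_size=3)
b = np.random.randn(3, 2)   # stands in for the weights w
c = np.random.randn(2)      # stands in for the bias

def y(a, b, c):
    # take f = sum, so f'(x) is a matrix of ones shaped like x = a @ b + c
    return np.sum(a @ b + c)

grad_x = np.ones((4, 2))             # f'(x)
grad_a = grad_x @ b.T                # dy/da = f'(x) @ b.T
grad_b = a.T @ grad_x                # dy/db = a.T @ f'(x)
grad_c = np.sum(grad_x, axis=0)      # dy/dc = f'(x), summed over the batch

# finite-difference check of one entry of dy/da
eps, i, j = 1e-6, 1, 2
a_plus, a_minus = a.copy(), a.copy()
a_plus[i, j] += eps
a_minus[i, j] -= eps
numeric = (y(a_plus, b, c) - y(a_minus, b, c)) / (2 * eps)
assert np.isclose(numeric, grad_a[i, j])
```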


F = Callable[[Tensor], Tensor]

class Activation(Layer):
"""
An activation layer just applies a function
elementwise to its inputs
"""
def __init__(self, f: F, f_prime: F) -> None:
super().__init__()
self.f = f
@@ -72,6 +83,10 @@ def forward(self, inputs: Tensor) -> Tensor:
return self.f(inputs)

def backward(self, grad: Tensor) -> Tensor:
"""
if y = f(x) and x = g(z)
then dy/dz = f'(x) * g'(z)
"""
return self.f_prime(self.inputs) * grad


@@ -84,5 +99,5 @@ def tanh_prime(x: Tensor) -> Tensor:


class Tanh(Activation):
def __init__(self) -> None:
def __init__(self):
super().__init__(tanh, tanh_prime)
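The tanh helper itself is collapsed above; presumably it is a thin wrapper around NumPy, with the derivative written in terms of the forward value, since d/dx tanh(x) = 1 - tanh(x)^2:

```python
import numpy as np

def tanh(x: np.ndarray) -> np.ndarray:
    return np.tanh(x)

def tanh_prime(x: np.ndarray) -> np.ndarray:
    # derivative of tanh, reusing the forward value: 1 - tanh(x)**2
    y = tanh(x)
    return 1 - y ** 2
```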
13 changes: 5 additions & 8 deletions joelnet/loss.py
@@ -1,10 +1,6 @@
"""
We will train our models using loss functions
that indicate how good or bad our predictions are
for known input/output pairs. Then we can use the
gradients of this loss function with respect to the
various parameters of the net to adjust the parameters
and make our predictions better
A loss function measures how good our predictions are;
we can use this to adjust the parameters of our network
"""
import numpy as np

@@ -17,10 +13,11 @@ def loss(self, predicted: Tensor, actual: Tensor) -> float:
def grad(self, predicted: Tensor, actual: Tensor) -> Tensor:
raise NotImplementedError


class MSE(Loss):
"""
This is actually total squared error
not mean squared error
MSE is mean squared error, although we're
just going to do total squared error
"""
def loss(self, predicted: Tensor, actual: Tensor) -> float:
return np.sum((predicted - actual) ** 2)
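MSE.grad is cut off above. For total squared error sum((predicted - actual) ** 2), the gradient with respect to predicted is 2 * (predicted - actual), so the full class presumably reads (Loss stub included to keep the sketch self-contained):

```python
import numpy as np

class Loss:
    def loss(self, predicted: np.ndarray, actual: np.ndarray) -> float:
        raise NotImplementedError

    def grad(self, predicted: np.ndarray, actual: np.ndarray) -> np.ndarray:
        raise NotImplementedError

class MSE(Loss):
    # "MSE" is really total squared error here, as the docstring above admits
    def loss(self, predicted: np.ndarray, actual: np.ndarray) -> float:
        return np.sum((predicted - actual) ** 2)

    def grad(self, predicted: np.ndarray, actual: np.ndarray) -> np.ndarray:
        # gradient of sum((predicted - actual) ** 2) with respect to predicted
        return 2 * (predicted - actual)
```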
7 changes: 4 additions & 3 deletions joelnet/nn.py
@@ -1,13 +1,14 @@
"""
A neural net is just a series of layers.
In fact, it behaves a lot like a layer itself
although we're not going to make it one.
A NeuralNet is just a collection of layers.
It behaves a lot like a layer itself, although
we're not going to make it one.
"""
from typing import Sequence, Iterator, Tuple

from joelnet.tensor import Tensor
from joelnet.layers import Layer


class NeuralNet:
def __init__(self, layers: Sequence[Layer]) -> None:
self.layers = layers
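Only NeuralNet.__init__ is visible above. The rest of the class presumably chains forward through the layers in order, propagates gradients backward in reverse order, and exposes (param, grad) pairs for the optimizer; a sketch under that assumption:

```python
from typing import Iterator, Sequence, Tuple

import numpy as np

class NeuralNet:
    def __init__(self, layers: Sequence["Layer"]) -> None:
        self.layers = layers

    def forward(self, inputs: np.ndarray) -> np.ndarray:
        # Push the inputs through each layer in order
        for layer in self.layers:
            inputs = layer.forward(inputs)
        return inputs

    def backward(self, grad: np.ndarray) -> np.ndarray:
        # Propagate the gradient back through the layers in reverse order
        for layer in reversed(self.layers):
            grad = layer.backward(grad)
        return grad

    def params_and_grads(self) -> Iterator[Tuple[np.ndarray, np.ndarray]]:
        # Yield each (parameter, gradient) pair so an optimizer can update in place
        for layer in self.layers:
            for name, param in layer.params.items():
                yield param, layer.grads[name]
```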
11 changes: 5 additions & 6 deletions joelnet/optim.py
@@ -1,18 +1,17 @@
"""
An optimizer uses the computed gradients
to adjust the parameters of a neural net
We use an optimizer to adjust the parameters
of our network based on the gradients computed
during backpropagation
"""

from joelnet.nn import NeuralNet
from joelnet.tensor import Tensor


class Optimizer:
def step(self, net: NeuralNet) -> None:
raise NotImplementedError


class SGD(Optimizer):
def __init__(self, lr: float) -> None:
def __init__(self, lr: float = 0.01) -> None:
self.lr = lr

def step(self, net: NeuralNet) -> None:
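SGD.step is cut off above. Plain stochastic gradient descent just takes a small step against each gradient; a sketch, assuming NeuralNet.params_and_grads yields (param, grad) pairs as above:

```python
class Optimizer:
    def step(self, net: "NeuralNet") -> None:
        raise NotImplementedError

class SGD(Optimizer):
    def __init__(self, lr: float = 0.01) -> None:
        self.lr = lr

    def step(self, net: "NeuralNet") -> None:
        # Update the parameter arrays in place so the layers see the new values
        for param, grad in net.params_and_grads():
            param -= self.lr * grad
```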
4 changes: 2 additions & 2 deletions joelnet/train.py
@@ -1,5 +1,5 @@
"""
Here's the function for training neural nets
Here's a function that can train a neural net
"""

from joelnet.tensor import Tensor
@@ -15,7 +15,7 @@ def train(net: NeuralNet,
num_epochs: int = 5000,
iterator: DataIterator = BatchIterator(),
loss: Loss = MSE(),
optimizer: Optimizer = SGD(lr=0.01)) -> None:
optimizer: Optimizer = SGD()) -> None:
for epoch in range(num_epochs):
epoch_loss = 0.0
for batch in iterator(inputs, targets):
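The body of the epoch loop is collapsed above. Presumably each batch is pushed forward through the net, the loss gradient is pushed back, and the optimizer takes a step, with the running loss printed once per epoch:

```python
from joelnet.tensor import Tensor
from joelnet.nn import NeuralNet
from joelnet.data import DataIterator, BatchIterator
from joelnet.loss import Loss, MSE
from joelnet.optim import Optimizer, SGD

def train(net: NeuralNet,
          inputs: Tensor,
          targets: Tensor,
          num_epochs: int = 5000,
          iterator: DataIterator = BatchIterator(),
          loss: Loss = MSE(),
          optimizer: Optimizer = SGD()) -> None:
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for batch in iterator(inputs, targets):
            predicted = net.forward(batch.inputs)           # forward pass
            epoch_loss += loss.loss(predicted, batch.targets)
            grad = loss.grad(predicted, batch.targets)      # dLoss/dPredicted
            net.backward(grad)                              # backprop through the layers
            optimizer.step(net)                             # adjust the parameters
        print(epoch, epoch_loss)
```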
10 changes: 5 additions & 5 deletions xor.py
@@ -1,17 +1,17 @@
"""
The canonical example of a function that can't
be learned by a linear layer alone is XOR.
The canonical example of a function that can't be
learned with a simple linear model is XOR
"""
import numpy as np

from joelnet.train import train
from joelnet.nn import NeuralNet
from joelnet.layers import Linear, Tanh
from joelnet.train import train

inputs = np.array([
[0, 0],
[0, 1],
[1, 0],
[0, 1],
[1, 1]
])

@@ -28,7 +28,7 @@
Linear(input_size=2, output_size=2)
])

train(net, inputs, targets, num_epochs=5000)
train(net, inputs, targets)

for x, y in zip(inputs, targets):
predicted = net.forward(x)
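The targets array and most of the net definition are collapsed above. With a two-column output compared row by row against the targets, the targets are presumably one-hot encodings of XOR, and the net a small Linear -> Tanh -> Linear stack ending in the Linear(input_size=2, output_size=2) visible above; a sketch (the hidden layer size is an assumption):

```python
import numpy as np

from joelnet.nn import NeuralNet
from joelnet.layers import Linear, Tanh

# One-hot targets: column 0 = "XOR is 0", column 1 = "XOR is 1"
targets = np.array([
    [1, 0],   # 0 xor 0 = 0
    [0, 1],   # 0 xor 1 = 1
    [0, 1],   # 1 xor 0 = 1
    [1, 0]    # 1 xor 1 = 0
])

net = NeuralNet([
    Linear(input_size=2, output_size=2),
    Tanh(),
    Linear(input_size=2, output_size=2)   # the layer visible in the diff above
])
```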
