<br><br><br><br><br>

# Day 2 Homework

<br><br><br><br><br>

In [None]:
# We'll train a decision tree to reproduce an image, and then speed up the image prediction.

# First, set up the tools we'll need to view the original image and models of it.

import PIL.Image
import numpy

def show(data):
    """Render a two-dimensional array as a grayscale picture.

    The values are shifted so the smallest becomes 0 and scaled so the
    largest becomes 255, then replicated into three identical channels.

    Parameters:
        data: two-dimensional numpy array of numbers (the indexing below
            requires exactly two dimensions).

    Returns:
        The image built by ``PIL.Image.fromarray`` from the rescaled
        ``uint8`` array of shape ``data.shape + (3,)``.
    """
    # Broadcast the single-channel data into three equal channels.
    image = numpy.empty(data.shape + (3,))
    image[:, :, :] = data.reshape(data.shape + (1,))

    # Shift so the minimum is zero.
    image -= image.min()

    # Stretch to the 0-255 range. A constant input has a peak of zero after
    # the shift; skip the scaling then, rather than dividing by zero and
    # filling the picture with NaN before the integer cast.
    peak = image.max()
    if peak > 0:
        image *= 255 / peak

    return PIL.Image.fromarray(image.astype(numpy.uint8))

In [None]:
# Load the picture and average over its third axis (presumably the color
# channels), leaving one number per pixel.
y = numpy.array(PIL.Image.open("img/galaxy-infrared.jpg")).mean(axis=2)
height, width = y.shape

# One sample per pixel: the two features are the pixel's (row, column) indices.
X = numpy.stack(numpy.mgrid[0:height, 0:width], axis=-1).reshape(height * width, 2)
show(y)

In [None]:
import sklearn.tree      # Start with an imprecise model: tree depth 4 is recognizable but low res.
# Fit a depth-4 regressor that maps each pixel's coordinates to its value.
model = sklearn.tree.DecisionTreeRegressor(max_depth=4)
model.fit(X, y.reshape(-1, 1))

# Predict every pixel and fold the flat result back into the picture's shape.
show(model.predict(X).reshape(y.shape))

In [None]:
import awkward           # Now convert Scikit-Learn's tree model into a Pythonic one that we control.

def topython(model):
    """Convert a fitted Scikit-Learn tree into a nested plain-Python structure.

    Reads the ``feature``, ``threshold``, ``children_left``,
    ``children_right`` and ``value`` arrays of ``model.tree_`` and links the
    nodes together through an ``awkward.Table`` whose ``left`` and ``right``
    columns index back into the table itself.

    Parameters:
        model: fitted estimator exposing a ``tree_`` attribute.

    Returns:
        Row 0 of the table converted with ``tolist()``. Each node carries
        "feature", "threshold", "left", "right" and "value"; the first four
        are masked for nodes whose left-child index is negative.
    """
    sktree = model.tree_

    # Nodes with a negative left-child index have no children.
    is_leaf = sktree.children_left < 0

    # Point the child indexes of childless nodes at a valid row (0) so the
    # index arrays stay in range; those entries are masked out below.
    left_index = sktree.children_left.copy()
    left_index[is_leaf] = 0
    right_index = sktree.children_right.copy()
    right_index[is_leaf] = 0

    nodes = awkward.Table()
    nodes["feature"] = awkward.MaskedArray(is_leaf, sktree.feature)
    nodes["threshold"] = awkward.MaskedArray(is_leaf, sktree.threshold)
    # The child columns refer to the table being built, which makes the
    # structure recursive.
    nodes["left"] = awkward.MaskedArray(is_leaf, awkward.IndexedArray(left_index, nodes))
    nodes["right"] = awkward.MaskedArray(is_leaf, awkward.IndexedArray(right_index, nodes))
    # Every node keeps a value; this reshape assumes one number per node.
    nodes["value"] = sktree.value.reshape(is_leaf.shape)

    root = nodes[0]
    return root.tolist()

# Convert the fitted model and inspect the resulting nested structure
# (as the cell's last expression, presumably what the notebook displays).
topython(model)

In [None]:
import time              # The Pythonic model prediction code is very simple.

def predict(tree, x):
    """Evaluate one sample against a nested-dictionary decision tree.

    Parameters:
        tree: node mapping with "left", "right", "feature", "threshold" and
            "value" entries; a node whose "left" is None is a leaf.
        x: indexable sample; ``x[node["feature"]]`` is compared with the
            node's threshold.

    Returns:
        The "value" entry of the leaf that the sample reaches.
    """
    node = tree
    # Walk downward until a node without a left child is reached.
    while node["left"] is not None:
        if x[node["feature"]] < node["threshold"]:
            node = node["left"]
        else:
            node = node["right"]
    return node["value"]

def predictall(tree, X):
    """Run ``predict`` on every sample in X and collect the results.

    Parameters:
        tree: tree structure accepted by ``predict``.
        X: iterable of samples, visited in order.

    Returns:
        A numpy array holding one prediction per sample, in input order.
    """
    predictions = []
    for sample in X:
        predictions.append(predict(tree, sample))
    return numpy.array(predictions)

In [None]:
# A deeper tree → more resolution.
model = sklearn.tree.DecisionTreeRegressor(max_depth=16)
model.fit(X, y.reshape(height * width, 1))

# Time only Scikit-Learn's prediction over all pixels (fitting is excluded).
st = time.time()
image = model.predict(X).reshape(height, width)
print(time.time() - st, "sec")
show(image)

In [None]:
# But the Python version is super slow!
# The timed region covers both the conversion (topython) and the per-sample
# prediction loop (predictall) over every row of X.
starttime = time.time()
image = predictall(topython(model), X).reshape(height, width)
print(time.time() - starttime, "sec")
show(image)

### Homework

**This Python code is about 700× slower than Scikit-Learn.**

<br>

Your task is to speed up `predict(tree, x)` (and maybe `predictall(tree, X)`) using Numba. You are allowed to change the data structure, or even to use the `model.tree_` arrays directly from Scikit-Learn.

Simply calling `model.predict` doesn't count.

<br><br><br><br><br>