### Operator overloading for more concise model definitions

Thinc allows you to **overload operators** and bind arbitrary functions to Python operators such as `+` and `*`, but also `>>` or `@`. The `Model.define_operators` context manager takes a dict of operators mapped to functions – typically combinators like `chain`. The operators are only valid inside the `with` block. This lets us define the model like this:

In [None]:
!pip install "thinc>=8.0.0a0" ml_datasets "tqdm>=4.41" syntok

In [None]:
from thinc.api import prefer_gpu
# Ask Thinc to run on the GPU when one is available; per the Thinc docs the
# call returns whether a GPU was activated.
prefer_gpu()

Instead of passing the layers to `chain` as a comma-separated list of arguments, you can compose them with custom operators.
For example, you can transform the following code

```python
from thinc.api import Model, chain, Relu, Softmax
n_hidden = 32
dropout = 0.2

model = chain(
    Relu(nO=n_hidden, dropout=dropout), 
    Relu(nO=n_hidden, dropout=dropout), 
    Softmax()
)
```


into this:

In [None]:
# Make the sibling `src` directory importable from this notebook.

import sys
import os
from pathlib import PurePath

# The parent of the current working directory is taken as the project root.
root_path = PurePath(os.getcwd()).parents[0]
print(root_path)
src_path = str(root_path / 'src')

# Put `src` at the front of the module search path, but only once.
if src_path not in sys.path:
    sys.path.insert(0, src_path)

print(sys.path)



In [None]:
from thinc.api import Model, chain, Relu, Softmax

# Width of each hidden layer and the dropout rate passed to it.
n_hidden = 32
dropout = 0.2

# Inside this block `a >> b` is evaluated as `chain(a, b)`.
with Model.define_operators({">>": chain}):
    model = (
        Relu(nO=n_hidden, dropout=dropout)
        >> Relu(nO=n_hidden, dropout=dropout)
        >> Softmax()
    )

You can now pass the `model` object as an argument to the `train_model` function, which is imported below from the local `train` module.

In [None]:
from thinc.api import Adam, fix_random_seed
from tqdm.notebook import tqdm
import ml_datasets
from train import train_model

# Fix the random seed so repeated runs are comparable.
fix_random_seed(0)
optimizer = Adam(0.001)
batch_size = 128

# Keep both the full dataset tuple (handed to train_model) and its unpacked splits.
data = ml_datasets.mnist()
(train_X, train_Y), (dev_X, dev_Y) = data

print("Measuring performance across iterations:")
# NOTE(review): 20 is presumably the number of training iterations -- confirm
# against train.train_model.
train_model(data, model, optimizer, 20, batch_size)



## Text classification

Next is a definition for a text classification network, which expects a *list of arrays as input*, where each array should have two columns with different numeric identifier features. 

The model takes a list of 2-dimensional arrays (the tokenized texts mapped to vocab IDs) and outputs a 2d array.

The two features will be embedded using separate embedding tables, and the two vectors added and passed through a `Maxout` layer with layer `normalization` and `dropout`. The sequences then pass through two `pooling` functions, and the `concatenated` results are passed through 2 `Relu` layers with `dropout` and `residual` connections. Finally, the sequence vectors are passed through an output layer, which has a `Softmax` activation.

In [None]:
from syntok.tokenizer import Tokenizer
import numpy as np


def load_data():
    """Load DBPedia data (``limit=2000``) and encode it for the text classifier.

    Texts are tokenized with ``tokenize_texts`` and mapped to integer IDs using
    a vocabulary built from the training split only; ID 0 is reserved for
    tokens missing from that vocabulary. Categories are one-hot encoded over
    the sorted set of labels seen in either split. A short summary of the
    split sizes and class count is printed.

    Returns:
        A 5-tuple ``((train_X, train_y), (dev_X, dev_y), vocab, train_texts,
        dev_texts)``. The ``*_X`` values are lists holding one array of token
        IDs per text, the ``*_y`` values are float32 one-hot arrays with one
        row per text and one column per class, ``vocab`` maps token -> ID, and
        the ``*_texts`` values are the raw texts.
    """
    train_data, dev_data = ml_datasets.dbpedia(limit=2000)
    train_texts, train_cats = zip(*train_data)
    dev_texts, dev_cats = zip(*dev_data)

    # np.unique returns the labels sorted; tolist() yields plain Python values
    # so they can be used directly as dict keys.
    unique_cats = np.unique(np.concatenate((train_cats, dev_cats))).tolist()
    nr_class = len(unique_cats)
    print(f"{len(train_data)} training / {len(dev_data)} dev\n{nr_class} classes")

    # Constant-time label -> column lookup instead of a linear list search
    # for every example.
    cat_to_column = {cat: column for column, cat in enumerate(unique_cats)}

    def one_hot(cats):
        # One row per example with a single 1 in the column of its category.
        encoded = np.zeros((len(cats), nr_class), dtype="f")
        for row, cat in enumerate(cats):
            encoded[row, cat_to_column[cat]] = 1
        return encoded

    train_y = one_hot(train_cats)
    dev_y = one_hot(dev_cats)

    train_tokenized = tokenize_texts(train_texts)
    dev_tokenized = tokenize_texts(dev_texts)

    # Generate simple vocab mapping, <unk> is 0: IDs are handed out from 1 in
    # order of first appearance in the training texts.
    vocab = {}
    for text in train_tokenized:
        for token in text:
            if token not in vocab:
                vocab[token] = len(vocab) + 1

    def encode(tokenized):
        # Map every token to its ID; tokens unseen during training become 0.
        return [np.array([vocab.get(t, 0) for t in text]) for text in tokenized]

    train_X = encode(train_tokenized)
    dev_X = encode(dev_tokenized)
    return (train_X, train_y), (dev_X, dev_y), vocab, train_texts, dev_texts


def tokenize_texts(texts):
    """Tokenize each text with a single shared ``Tokenizer`` instance.

    Returns a list with one entry per input text, in input order; each entry
    is the list of ``.value`` attributes of the tokens produced for that text.
    """
    tokenizer = Tokenizer()
    tokenized = []
    for text in texts:
        tokenized.append([tok.value for tok in tokenizer.tokenize(text)])
    return tokenized


In [None]:

# Build the encoded train/dev splits, the token vocabulary and the raw texts.
# Note that this rebinds train_X and dev_X, which earlier held the MNIST data.
(train_X, train_y), (dev_X, dev_y), vocab, train_texts, dev_texts = load_data()

In [None]:
# Inspect one example: the shape of its token-ID array and its one-hot label row.
print(train_X[1].shape)
print(train_y[1])

In [101]:
# Text classifier assembled with overloaded operators.
# Note: `Relu` is used below but not imported in this cell; it relies on the
# earlier `from thinc.api import Model, chain, Relu, Softmax` cell.
from thinc.api import add, chain, concatenate, clone
from thinc.api import with_array, reduce_max, reduce_mean, residual
from thinc.api import Model, Embed, Maxout, Softmax, Dropout

# First positional argument handed to the Maxout layer below.
nH = 5

# Earlier attempt, kept for reference (no list2ragged() step before pooling):
# with Model.define_operators({">>": chain, "|": concatenate, "+": add, "**": clone}):
#     model = (
#         with_array(
#             # (Embed(128, column=0) + Embed(64, column=1))
#             # add(Embed(128, column=0), Embed(64, column=1))
#             Embed(nO=128, nV = len(vocab) + 1)
#             >> Maxout(nH, normalize=True, dropout=0.2)
#         )
#         >> (reduce_max() | reduce_mean())
#         >> residual(Relu() >> Dropout(0.2)) ** 2
#         >> Softmax()
#     )

# list2array is imported here but not used in this cell.
from thinc.api import Model, list2ragged, list2array, chain, with_array, reduce_mean


# Operator bindings valid inside the block:
#   >>  chain, |  concatenate, +  add, **  clone
# Python's `**` binds tighter than `>>`, so `residual(...) ** 2` is cloned
# before it is chained with its neighbours.
# NOTE(review): the output recorded for this cell is a DataValidationError
# raised in 'relu', which received a Ragged input. The cause is not established
# here -- TODO confirm which step hands the Ragged through.
with Model.define_operators({">>": chain, "|": concatenate, "+": add, "**": clone}):
    model = (
        with_array(
            # nV is len(vocab) + 1 because load_data reserves ID 0 for unknown tokens.
            Embed(nO=128, nV = len(vocab) + 1)
            >> Maxout(nH, normalize=True, dropout=0.2)
        )
        >> list2ragged() 
        # Two pooling operations whose outputs are concatenated.
        >> (reduce_mean() | reduce_max())
        >> residual(Relu() >> Dropout(0.2)) ** 2
        >> Softmax()
    )




# Variant marked as working: mean pooling only, without the concatenated
# pooling and without the residual Relu stack.
# with Model.define_operators({">>": chain, "|": concatenate, "+": add, "**": clone}):
#     model = (
#         with_array(
#             Embed(nO=128, nV = len(vocab) + 1)
#             >> Maxout(nH, normalize=True, dropout=0.2)
#         )
#         >> list2ragged() 
#         >> reduce_mean() 
#         >> Softmax()
#     )

# Initialize the model from a five-example sample of inputs and labels.
model.initialize(X=train_X[:5], Y=train_y[:5])


DataValidationError: 

Data validation error in 'relu'
X: <class 'thinc.types.Ragged'> Y: <class 'numpy.ndarray'>

X   not a valid numpy or cupy array


In [97]:
def evaluate_model(model, dev_X, dev_Y, batch_size):
    """Return the fraction of examples whose predicted class matches the label.

    For every example the predicted class is the arg-max of the model's output
    row and the true class is the arg-max of the corresponding label row.

    Args:
        model: Object providing ``ops.multibatch(batch_size, X, Y)`` to iterate
            over aligned batches and ``predict(X)`` to score a batch.
        dev_X: Evaluation inputs.
        dev_Y: Evaluation labels, one row per example.
        batch_size: Number of examples per batch.

    Returns:
        The accuracy as a float, or 0.0 when there are no examples (instead of
        dividing by zero).
    """
    correct = 0
    total = 0
    for X, Y in model.ops.multibatch(batch_size, dev_X, dev_Y):
        Yh = model.predict(X)
        # Compare predicted and true classes row by row.
        correct += sum(
            int(predicted.argmax(axis=0) == expected.argmax(axis=0))
            for predicted, expected in zip(Yh, Y)
        )
        total += len(Y)
    if total == 0:
        # Nothing to evaluate; avoid ZeroDivisionError.
        return 0.0
    return float(correct / total)

In [98]:
# train_model(data, model, optimizer, 20, batch_size)
# The helper call above is left disabled; the training loop is written out
# explicitly instead.
fix_random_seed(0)
for n in range(10):
    # Squared-error loss summed over every batch of this pass.
    loss = 0.0
    batches = model.ops.multibatch(batch_size, train_X, train_y, shuffle=True)
    for X, Y in tqdm(batches, leave=False):
        Yh, backprop = model.begin_update(X)
        # Prediction error for the whole batch at once; it is both the value
        # fed to backprop and the quantity squared for the reported loss.
        d_loss = Yh - Y
        loss += float((d_loss ** 2).sum())
        backprop(d_loss)
        model.finish_update(optimizer)
    score = evaluate_model(model, dev_X, dev_y, batch_size)
    print(f"{n}\t{loss:.2f}\t{score:.3f}")

HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

0	1856.33	0.171


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

1	1850.19	0.229


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

2	1840.18	0.330


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

3	1828.21	0.348


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

4	1815.07	0.399


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

5	1801.60	0.414


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

6	1787.43	0.414


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

7	1772.66	0.435


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

8	1757.87	0.440


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

9	1741.30	0.435
