# Tagger 

This example shows how to use operator overloading to define the same network architecture shown in example 2.

Therefore, it works like this:

* `MultiEmbed` layer: multiple numeric ID features are extracted for each word, and each feature is separately embedded. The separate vectors are concatenated and returned.

* `Hidden`: the concatenated embeddings are passed through a dense layer with a “maxout” activation [Goodfellow et al, 2013](https://arxiv.org/abs/1302.4389).

* `CNN layers`: several convolutional layers are applied in sequence for contextual encoding (`clone` function). Each CNN layer performs a “sequence-to-column” transformation, where a window of surrounding words is concatenated to each vector (`expand_window`). A `Hidden` layer then maps the result back to the original dimensionality. Residual connections and layer normalization are used to assist convergence (`residual` - a unary combinator creating a residual connection).

* `Softmax`: gives the most likely tag as the output of the model.

In [6]:
from thinc.api import prefer_gpu
# Ask Thinc to use the GPU when one is available; the return value is ignored here.
prefer_gpu()

from thinc.api import fix_random_seed
# Fix the random seed to 0 so that repeated runs are comparable.
fix_random_seed(0)

# Progress bar wrapped around the batch iterator in the training loop.
from tqdm import tqdm



In [7]:
def train_model(data, model, optimizer, n_iter, batch_size):
    """Train ``model`` for ``n_iter`` epochs, printing one summary line per epoch.

    Each printed line holds the epoch index, the summed squared error over the
    epoch's training batches and the score returned by ``evaluate`` on the
    test split, separated by tabs.

    Args:
        data: ``((train_X, train_y), (test_X, test_y))``.
        model: Object providing ``initialize``, ``begin_update``,
            ``finish_update`` and ``ops.multibatch``.
        optimizer: Handed to ``model.finish_update`` after every batch.
        n_iter: Number of passes over the training split.
        batch_size: Batch size used for training and for evaluation.
    """
    train_split, test_split = data
    train_X, train_y = train_split
    test_X, test_y = test_split
    # Initialise the model using the first five training examples as sample data.
    model.initialize(X=train_X[:5], Y=train_y[:5])
    for n in range(n_iter):
        loss = 0.0
        batches = model.ops.multibatch(batch_size, train_X, train_y, shuffle=True)
        for batch_X, batch_Y in tqdm(batches, leave=False):
            guesses, backward = model.begin_update(batch_X)
            gradients = []
            for i, guess in enumerate(guesses):
                # The same difference serves as the gradient and as the
                # residual whose square is added to the reported loss.
                diff = guess - batch_Y[i]
                gradients.append(diff)
                loss += (diff ** 2).sum()
            backward(gradients)
            model.finish_update(optimizer)
        score = evaluate(model, test_X, test_y, batch_size)
        print(f"{n}\t{loss:.2f}\t{score:.3f}")
        
def evaluate(model, test_X, test_Y, batch_size):
    """Return the fraction of rows whose predicted class equals the gold class.

    The data is walked in batches from ``model.ops.multibatch``. For every
    (prediction, gold) pair of 2-D arrays, the per-row ``argmax`` over axis 1
    is compared and matching rows are counted.

    Args:
        model: Object providing ``ops.multibatch`` and ``predict``.
        test_X: Evaluation inputs.
        test_Y: Gold outputs aligned with ``test_X``; each item is an array
            whose first dimension is the number of rows to score.
        batch_size: Number of items per evaluation batch.

    Returns:
        Accuracy as a Python ``float``. ``0.0`` when there are no rows to
        score, instead of raising ``ZeroDivisionError``.
    """
    correct = 0
    total = 0
    for X, Y in model.ops.multibatch(batch_size, test_X, test_Y):
        Yh = model.predict(X)
        for yh, y in zip(Yh, Y):
            correct += (y.argmax(axis=1) == yh.argmax(axis=1)).sum()
            total += y.shape[0]
    if total == 0:
        # Empty evaluation set: there is nothing to divide by.
        return 0.0
    return float(correct / total)

In [8]:
from thinc.api import Model, HashEmbed, Maxout, Softmax, expand_window, Relu
from thinc.api import residual, strings2arrays, with_array, clone, chain, concatenate


width = 128  # output width of the first HashEmbed table and of the Hidden/CNN layers
depth = 4  # number of times the CNN layer is cloned
n_tags = 17  # number of outputs of the final Softmax layer


def Hidden(nO, dropout=0.2):
    """Build a normalized ``Maxout`` layer with ``nO`` outputs and ``nP=3``.

    Args:
        nO: Output width of the layer.
        dropout: Dropout rate forwarded to ``Maxout``.
    """
    layer = Maxout(nO, nP=3, normalize=True, dropout=dropout)
    return layer

def CNN(width):
    """Build one residual block: ``expand_window(1)`` followed by ``Hidden(width)``.

    Args:
        width: Output width of the ``Hidden`` layer inside the block.
    """
    windowed = expand_window(1)
    projection = Hidden(width)
    return residual(chain(windowed, projection))


# Inside this block the infix operators are bound to Thinc combinators:
# ``>>`` is ``chain``, ``**`` is ``clone`` and ``|`` is ``concatenate``.
with Model.define_operators({">>": chain, "**": clone, "|": concatenate}):
    model = strings2arrays() >> with_array(
        (
            # Four hash-embedding tables over column 0, concatenated.
            HashEmbed(width, 4000, column=0)
            | HashEmbed(width // 2, 2000, column=0)
            | HashEmbed(width // 2, 2000, column=0)
            | HashEmbed(width // 2, 2000, column=0)
        )
        >> Hidden(width)
        # ``**`` binds tighter than ``>>``, so this is clone(CNN(width), depth).
        >> CNN(width) ** depth
        >> Softmax(n_tags)
    )

# model.to_dict()


In [9]:
import ml_datasets


# Training configuration as config-file text, parsed further down with
# ``Config().from_str``. The code visible in this notebook reads only the
# [training] values and the [optimizer] block (whose learn_rate is interpolated
# from [hyper_params]); ``width`` and ``vector_width`` are not read by any
# visible code.
CONFIG = """
[hyper_params]
width = 32
vector_width = 16
learn_rate = 0.001

[training]
n_iter = 10
batch_size = 128

[optimizer]
@optimizers = "Adam.v1"
learn_rate = ${hyper_params:learn_rate}
"""


from thinc.api import registry, Config

# Parse the config text, then let the registry build the objects it references.
config = Config().from_str(CONFIG)
loaded_config = registry.make_from_config(config)

# Pull out the settings used by the training call.
optimizer = loaded_config["optimizer"]
n_iter, batch_size = (
    loaded_config["training"][key] for key in ("n_iter", "batch_size")
)

# Load the dataset (limit=1000) and unpack the train/test splits.
data = ml_datasets.ud_ancora_pos_tags(limit=1000)
(train_X, train_y), (test_X, test_y) = data

In [10]:
# Run training; one tab-separated line (epoch, loss, score) is printed per epoch.
train_model(data, model, optimizer, n_iter, batch_size)

0%|          | 0/8 [00:00<?, ?it/s]0	28959.23	0.585
  0%|          | 0/8 [00:00<?, ?it/s]1	16876.99	0.703
  0%|          | 0/8 [00:00<?, ?it/s]2	11079.14	0.776
  0%|          | 0/8 [00:00<?, ?it/s]3	7632.26	0.818
  0%|          | 0/8 [00:00<?, ?it/s]4	5030.11	0.841
  0%|          | 0/8 [00:00<?, ?it/s]5	3160.11	0.851
  0%|          | 0/8 [00:00<?, ?it/s]6	1869.66	0.859
  0%|          | 0/8 [00:00<?, ?it/s]7	1112.38	0.862
  0%|          | 0/8 [00:00<?, ?it/s]8	724.43	0.863
100%|██████████| 8/8 [00:02<00:00,  3.21it/s]9	521.95	0.864
