In [1]:
! pip install "thinc>=8.0.0a0" ml_datasets "tqdm>=4.41" syntok

Collecting syntok
  Downloading syntok-1.3.1.tar.gz (23 kB)
Collecting regex
  Downloading regex-2020.6.8.tar.gz (690 kB)
[K     |████████████████████████████████| 690 kB 2.6 MB/s 
Building wheels for collected packages: syntok, regex
  Building wheel for syntok (setup.py) ... [?25ldone
[?25h  Created wheel for syntok: filename=syntok-1.3.1-py3-none-any.whl size=20916 sha256=3863d3dfc33f9bd46b4c3a41c5be810617349012ba96235a0e302131b43d940e
  Stored in directory: /Users/jean.metz/Library/Caches/pip/wheels/5e/c2/33/e5d7d8f2f8b0c391d76bf82b844c3151bf23a84d75d02b185f
  Building wheel for regex (setup.py) ... [?25ldone
[?25h  Created wheel for regex: filename=regex-2020.6.8-cp37-cp37m-macosx_10_9_x86_64.whl size=283191 sha256=cd462451177171fb9e0f8822fd2330acf274b743b8c844b5cdf2cd5ec938af23
  Stored in directory: /Users/jean.metz/Library/Caches/pip/wheels/46/f1/0b/a372e98f7103934a3573301c71b475143baf8ba6f6dffc876c
Successfully built syntok regex
Installing collected packages: regex, synt

In [1]:
from thinc.api import HashEmbed, Maxout, Softmax, expand_window, Relu
from thinc.api import residual, strings2arrays, with_array, clone, chain, concatenate
from thinc.layers import noop
from syntok.tokenizer import Tokenizer

import ml_datasets
import numpy

In [2]:
def tokenize_texts(texts):
    tok = Tokenizer()
    return [[token.value for token in tok.tokenize(text)] for text in texts]


def load_data(limit:int = 1000):
    (train_texts, train_cats), (dev_texts, dev_cats) = ml_datasets.dbpedia(limit=limit)
    # train_texts, train_cats = zip(*train_data)
    # dev_texts, dev_cats = zip(*dev_data)
    unique_cats = list(numpy.unique(numpy.concatenate((train_cats, dev_cats))))
    nr_class = len(unique_cats)
    print(f"{len(train_cats)} training / {len(dev_cats)} test")
    print(f"{nr_class} classes")

    train_y = numpy.zeros((len(train_cats), nr_class), dtype="f")
    for i, cat in enumerate(train_cats):
        train_y[i][unique_cats.index(cat)] = 1
    dev_y = numpy.zeros((len(dev_cats), nr_class), dtype="f")
    for i, cat in enumerate(dev_cats):
        dev_y[i][unique_cats.index(cat)] = 1

    train_tokenized = tokenize_texts(train_texts)
    dev_tokenized = tokenize_texts(dev_texts)
    # Generate simple vocab mapping, <unk> is 0
    vocab = {}
    count_id = 1
    for text in train_tokenized:
        for token in text:
            if token not in vocab:
                vocab[token] = count_id
                count_id += 1
    # Map texts using vocab
    train_X = []
    for text in train_tokenized:
        train_X.append(numpy.array([vocab.get(t, 0) for t in text]))
    dev_X = []
    for text in dev_tokenized:
        dev_X.append(numpy.array([vocab.get(t, 0) for t in text]))
    return (train_X, train_y), (dev_X, dev_y), vocab



# Define the model

In [3]:
from thinc.api import add, chain, concatenate, clone
from thinc.api import with_array, reduce_max, reduce_mean, residual
from thinc.api import Model, Embed, Maxout, Softmax, Dropout, Relu

nH = 5


with Model.define_operators({">>": chain, "|": concatenate, "+": add, "**": clone}):
    model = (
        with_array(
            (Embed(128, column=0) + Embed(64, column=1))
            >> Maxout(nH, normalize=True, dropout=0.2)
        )
        >> (reduce_max() | reduce_mean())
        >> residual(Relu() >> Dropout(0.2)) ** 2
        >> Softmax()
    )


In [4]:
(train_X, train_y), (dev_X, dev_y), vocab = load_data()

batch_size = C["training"]["batch_size"]
optimizer = C["optimizer"]
model = C["model"]
model.get_ref("embed").set_dim("nV", len(vocab))

model.initialize(X=train_X, Y=train_y)

68345856it [00:01, 36810592.28it/s]
Untaring file...
2000 training / 2000 dev
14 classes


NameError: name 'C' is not defined

In [None]:
def evaluate_model(model, dev_X, dev_Y, batch_size):
    correct = 0.0
    total = 0.0
    for X, Y in model.ops.multibatch(batch_size, dev_X, dev_Y):
        Yh = model.predict(X)
        for j in range(len(Yh)):
            correct += Yh[j].argmax(axis=0) == Y[j].argmax(axis=0)
        total += len(Y)
    return float(correct / total)

In [None]:
from thinc.api import fix_random_seed
from tqdm.notebook import tqdm

fix_random_seed(0)
for n in range(C["training"]["n_iter"]):
    loss = 0.0
    batches = model.ops.multibatch(batch_size, train_X, train_y, shuffle=True)
    for X, Y in tqdm(batches, leave=False):
        Yh, backprop = model.begin_update(X)
        d_loss = []
        for i in range(len(Yh)):
            d_loss.append(Yh[i] - Y[i])
            loss += ((Yh[i] - Y[i]) ** 2).sum()
        backprop(numpy.array(d_loss))
        model.finish_update(optimizer)
    score = evaluate_model(model, dev_X, dev_y, batch_size)
    print(f"{n}\t{loss:.2f}\t{score:.3f}")