<h3>Basic Recipe for Training a POS Tagger with SpaCy</h3>
<ol>
<li> Load Data
<ol><li>We'll be using a sample from the English Web Treebank corpus (Universal Dependencies), in CoNLL-U format (a CoNLL-X variant)</ol>
<li> Prepare environment for a new model
<ol><li>New model directory, with tagger and parser subdirectories. (Ensure you have permission)</ol>
<li> Build a vocabulary
<ol><li>We are just going to load the default English Vocabulary</ol>
<li> Build a Tagger (Ensure tagmap is provided if needed)
<ol><li>Our corpus has tags from Universal Tagset, so no need for a custom tag map</ol>
<li> Train Tagger

<ol><li>For each epoch: 
<ol><li>For each document in training data:
<ol><li>For each sentence in document:
<ol>
    <li>Create document with sentence words (tagger not yet applied)
    <li>Create GoldParse object with annotated labels
    <li>Apply the tagger to the document to get predictions
    <li>Update the tagger with the GoldParse and the document (actual vs. predicted)
    
</ol>
</ol>
<li> Score predictions on validation set
</ol>
</ol>
</ol>

### Load Data

In [14]:
import requests
from spacy.tokens import Doc

def read_conllx(text):
    """Parse CoNLL-X / CoNLL-U text into one training tuple per sentence.

    Parameters
    ----------
    text : unicode/str or bytes
        Corpus contents. Sentences are separated by blank lines and every
        token line has ten columns. Byte strings are decoded as UTF-8.

    Yields
    ------
    tuple
        ``(None, [[columns, []]])`` for every sentence that has at least one
        token, where ``columns`` is
        ``[ids, words, tags, heads, deps, entities]``:

        * ``ids`` and ``heads`` are 0-based ints; a root token (head ``0`` in
          the file) is given its own index as head,
        * ``tags`` is taken from the fifth column (POSTAG in CoNLL-X, XPOS in
          CoNLL-U),
        * the dependency label ``root`` is rewritten to ``ROOT``,
        * ``entities`` is ``'O'`` for every token.

    Raises
    ------
    ValueError
        If a token line does not have ten columns, or its ID/HEAD column is
        not an integer (the offending line is printed first).
    """
    # Text type on both Python 2 (unicode) and Python 3 (str).
    text_type = type(u'')
    # Downloads arrive as raw bytes; decode once so non-ASCII tokens survive.
    if isinstance(text, bytes):
        text = text.decode('utf8')

    for sent in text.strip().split('\n\n'):
        # Drop blank lines and comment lines wherever they occur. Token lines
        # always start with a numeric ID, so this never discards a token.
        lines = [line for line in sent.split('\n')
                 if line.strip() and not line.startswith('#')]
        tokens = []
        for line in lines:
            # Prefer tab separation (the format's real delimiter) so fields
            # containing spaces stay intact; fall back to any whitespace.
            fields = line.split('\t') if '\t' in line else line.split()
            if len(fields) != 10:
                raise ValueError(
                    'Expected 10 columns, found %d: %r' % (len(fields), line))
            id_, word, _lemma, _coarse_tag, tag, _morph, head, dep = fields[:8]
            # Skip multiword-token ranges ("1-2") and empty nodes ("8.1"):
            # neither is a real token in the basic dependency tree.
            if '-' in id_ or '.' in id_:
                continue
            try:
                id_ = int(id_) - 1
                head = (int(head) - 1) if head != '0' else id_
            except ValueError:
                # Show the offending line before propagating the error.
                print(line)
                raise
            dep = 'ROOT' if dep == 'root' else dep
            tokens.append((id_, text_type(word), text_type(tag), head,
                           text_type(dep), 'O'))
        if not tokens:
            # Nothing but comments / skipped lines in this chunk.
            continue
        # Transpose the per-token rows into per-column lists.
        tuples = [list(column) for column in zip(*tokens)]
        yield (None, [[tuples, []]])

            
            
def LoadData(url, make_projective = False):
    """Download a CoNLL-formatted corpus and parse it into a list of sentences.

    Parameters
    ----------
    url : str
        Location of the corpus file.
    make_projective : bool
        Accepted for interface compatibility; currently not used.

    Returns
    -------
    list
        Every item yielded by ``read_conllx`` for the downloaded text.

    Raises
    ------
    requests.exceptions.RequestException
        If the download fails or the server answers with an error status.
    """
    response = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    # `.content` is raw bytes; decode explicitly so non-ASCII tokens are kept.
    conll_string = response.content.decode('utf8')
    return list(read_conllx(conll_string))
    
    
# Training and test splits of the Universal Dependencies English treebank (.conllu files on GitHub).
# NOTE(review): both URLs track the `master` branch, so the files can change or move — confirm they still resolve.
train_url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_English/master/en-ud-train.conllu'
test_url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_English/master/en-ud-test.conllu'
# Both corpora are downloaded and parsed as soon as this cell runs (network access required).
train_sents = LoadData(train_url)
test_sents = LoadData(test_url)

### Prepare Environment for New Model

In [1]:
from pathlib import Path
def prepare_environment_for_new_tagger(model_path, new_tagger_name):
    """Create the directory that will hold the new tagger, if it is missing.

    Parameters
    ----------
    model_path : pathlib.Path or str
        Directory of the model the tagger belongs to.
    new_tagger_name : str
        Name of the sub-directory to create inside ``model_path``.
    """
    # Use the argument that was passed in rather than a notebook-level global,
    # so the function works for any model directory.
    tagger_dir = Path(model_path) / new_tagger_name
    if not tagger_dir.exists():
        # parents=True: also create model_path itself when it is missing.
        tagger_dir.mkdir(parents=True)
        

# NOTE(review): absolute, machine-specific path into a Python 2.7 site-wide spaCy data directory;
# adjust it for your installation and make sure the directory is writable.
model_dir = Path('/usr/local/lib/python2.7/dist-packages/spacy/data/en-1.1.0')
# Name of the sub-directory (inside model_dir) that will hold the new tagger.
new_tagger_name = 'web-treebank-tagger'
prepare_environment_for_new_tagger(model_dir, new_tagger_name)

### Build a vocabulary

In [2]:
from spacy.vocab import Vocab
def build_vocab(model_dir, vec_path = None, lexeme_path = None):
    """Load the vocabulary stored under ``model_dir``.

    When ``lexeme_path`` is given, lexemes are loaded from it; when
    ``vec_path`` is given, word vectors are then loaded from it as well.
    Returns the resulting vocabulary object.
    """
    loaded_vocab = Vocab.load(model_dir)

    # Optional extras, applied in this order: lexemes first, then vectors.
    if lexeme_path:
        loaded_vocab.load_lexemes(lexeme_path)

    if vec_path:
        loaded_vocab.load_vectors_from_bin_loc(vec_path)

    return loaded_vocab
    
lexeme_path = model_dir / 'vocab' / 'lexemes.bin'
# `vec_path` was used below without ever being defined, which raises a
# NameError on a fresh kernel. Define it explicitly: None skips loading word
# vectors in build_vocab. Point it at a vectors binary if vectors are required.
vec_path = None
vocab = build_vocab(model_dir, vec_path=vec_path, lexeme_path=lexeme_path)

In [9]:
# Sanity check: the loaded vocabulary exposes a cluster value for a known word.

doc = Doc(vocab, words=[u'He',u'ate',u'pizza',u'.'])
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Cluster Value for '{}': {}".format(doc[0], doc[0].cluster))


Cluster Value for 'He': 126


### Build a Tagger

In [22]:
from spacy.tagger import Tagger
def build_tagger(vocab, tag_map=None):
    """Return a new ``Tagger`` built over ``vocab``, forwarding ``tag_map`` unchanged."""
    return Tagger(vocab, tag_map=tag_map)

# No tag_map is passed, so build_tagger uses its default (tag_map=None).
tagger = build_tagger(vocab)

### Train Tagger

In [23]:
from spacy.scorer import Scorer
from spacy.gold import GoldParse
import random


def score_model(vocab, tagger, gold_docs, verbose=False):
    """Tag every gold sentence and accumulate the results in a ``Scorer``.

    ``gold_docs`` is an iterable of ``(_, sentences)`` pairs in which each
    sentence is ``((ids, words, tags, heads, deps, entities), _)``. Only the
    words and tags are used. Returns the ``Scorer`` that received one
    ``score`` call per sentence.
    """
    evaluation = Scorer()
    for _unused, annotated_sents in gold_docs:
        for annotation, _brackets in annotated_sents:
            ids, words, tags, heads, deps, entities = annotation
            # Build an untagged document, let the tagger predict, then
            # compare the prediction against the gold tags.
            predicted = Doc(vocab, words=map(unicode, words))
            tagger(predicted)
            reference = GoldParse(predicted, tags=tags)
            evaluation.score(predicted, reference, verbose=verbose)
    return evaluation


def train(tagger, vocab, train_sents, test_sents, model_dir, n_iter=20, seed = 0, feat_set = u'basic'):
    """Train ``tagger`` on ``train_sents`` and report accuracy on ``test_sents``.

    Parameters
    ----------
    tagger : callable with an ``update(doc, gold)`` method
        The tagger to train; it is updated in place.
    vocab
        Vocabulary used to build each training document.
    train_sents, test_sents : list
        Items of the form ``(_, [((ids, words, tags, heads, deps, ner), _), ...])``.
        ``train_sents`` is not modified.
    model_dir
        Accepted for interface compatibility; currently not used.
    n_iter : int
        Number of passes over the training data.
    seed
        Seed for the between-epoch shuffling, so that runs are reproducible.
        Pass ``None`` for a non-deterministic order.
    feat_set
        Accepted for interface compatibility; currently not used.

    Returns
    -------
    The same ``tagger`` object, after training.

    Tag accuracy on ``test_sents`` is printed once before training and once
    after every epoch.
    """
    # Text type on both Python 2 (unicode) and Python 3 (str).
    text_type = type(u'')
    # A private generator honours `seed` without touching the global RNG state.
    rng = random.Random(seed)
    # Shuffle a shallow copy so the caller's list keeps its original order.
    train_data = list(train_sents)

    scorer = score_model(vocab, tagger, test_sents)
    print('%s:\t%.3f' % ("Pretraining", scorer.tags_acc))
    for itn in range(n_iter):
        for _, doc_sents in train_data:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=[text_type(word) for word in words])
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                # Predict first, then update the model against the gold annotation.
                tagger(doc)
                tagger.update(doc, gold)
        # Reorder the data for the next epoch (the first epoch uses the given order).
        rng.shuffle(train_data)
        scorer = score_model(vocab, tagger, test_sents)
        print('%d:\t%.3f' % (itn, scorer.tags_acc))
    return tagger
# Run 10 training epochs; test-set tag accuracy is printed before training and after each epoch.
trained_tagger = train(tagger, vocab, train_sents, test_sents, model_dir, n_iter = 10)

Pretraining:	0.000
0:	89.620
1:	89.488
2:	90.819
3:	91.078
4:	91.178
5:	90.839
6:	91.369
7:	91.947
8:	92.353
9:	92.565


### Save Tagger

In [42]:
def ensure_dir(path):
    """Create the directory ``path`` unless it already exists.

    ``path`` is a ``pathlib.Path``. Missing parent directories are created
    as well, so a nested target does not fail.
    """
    if not path.exists():
        path.mkdir(parents=True)
        
# Make sure the tagger directory exists before anything is written into it.
ensure_dir(model_dir / new_tagger_name)
# Dump the trained tagger's model to <model_dir>/<new_tagger_name>/model; dump() is given a plain string path.
trained_tagger.model.dump(str(model_dir / new_tagger_name / 'model'))