# "Training spaCy on IDT"
> "I've forgotten where I put the output model, though"

- toc: false
- branch: master
- comments: true
- categories: [spacy, idt]


In [1]:
# Fetch the `dev` branch of the UD_Irish-IDT treebank into ./UD_Irish-IDT.
!git clone -b dev https://github.com/UniversalDependencies/UD_Irish-IDT

Cloning into 'UD_Irish-IDT'...
remote: Enumerating objects: 1119, done.[K
remote: Counting objects: 100% (181/181), done.[K
remote: Compressing objects: 100% (83/83), done.[K
remote: Total 1119 (delta 131), reused 142 (delta 98), pack-reused 938[K
Receiving objects: 100% (1119/1119), 12.62 MiB | 20.10 MiB/s, done.
Resolving deltas: 100% (756/756), done.


In [None]:
%%capture
!pip install -U spacy spacy-lookups-data

In [80]:
# Copy the `pipelines/tagger_parser_ud` project template into ./tagger_parser_ud.
!python -m spacy project clone pipelines/tagger_parser_ud

[38;5;2m✔ Cloned 'pipelines/tagger_parser_ud' from explosion/projects[0m
/content/tagger_parser_ud
[38;5;2m✔ Your project is now ready![0m
To fetch the assets, run:
python -m spacy project assets /content/tagger_parser_ud


In [81]:
%%writefile tagger_parser_ud/project.yml
title: "Part-of-speech Tagging & Dependency Parsing (Universal Dependencies)"
description: "This project template lets you train a part-of-speech tagger, morphologizer and dependency parser from a [Universal Dependencies](https://universaldependencies.org/) corpus. It takes care of downloading the treebank, converting it to spaCy's format and training and evaluating the model. The template uses the [`UD_English-EWT`](https://github.com/UniversalDependencies/UD_English-EWT) treebank by default, but you can swap it out for any other available treebank. Just make sure to adjust the `lang` and treebank settings in the variables below. Use `xx` for multi-language if no language-specific tokenizer is available in spaCy. Note that multi-word tokens will be merged together when the corpus is converted since spaCy does not support multi-word token expansion."

# Variables can be referenced across the project.yml using ${vars.var_name}
vars:
  config: "default"
  lang: "ga"
  treebank: "UD_Irish-IDT"
  train_name: "ga_idt-ud-train"
  dev_name: "ga_idt-ud-dev"
  test_name: "ga_idt-ud-test"
  package_name: "ud_ga_idt"
  package_version: "0.0.0"
  gpu: -1

# These are the directories that the project needs. The project CLI will make
# sure that they always exist.
directories: ["assets", "corpus", "training", "metrics", "configs", "packages"]

# Assets downloaded by `spacy project assets`: the treebank repository is
# placed under assets/${vars.treebank}.
# NOTE(review): this fetches the `master` branch, whereas the notebook's first
# cell clones `dev` -- confirm which branch the training data should come from.
assets:
  - dest: "assets/${vars.treebank}"
    git:
      repo: "https://github.com/UniversalDependencies/${vars.treebank}"
      branch: "master"
      path: ""

workflows:
  all:
    - preprocess
    - train
    - evaluate
    - package

commands:
  - name: preprocess
    help: "Convert the data to spaCy's format"
    script:
      - "mkdir -p corpus/${vars.treebank}"
      - "python -m spacy convert assets/${vars.treebank}/${vars.train_name}.conllu corpus/${vars.treebank}/ --converter conllu --n-sents 10 --merge-subtokens"
      - "python -m spacy convert assets/${vars.treebank}/${vars.dev_name}.conllu corpus/${vars.treebank}/ --converter conllu --n-sents 10 --merge-subtokens"
      - "python -m spacy convert assets/${vars.treebank}/${vars.test_name}.conllu corpus/${vars.treebank}/ --converter conllu --n-sents 10 --merge-subtokens"
      - "mv corpus/${vars.treebank}/${vars.train_name}.spacy corpus/${vars.treebank}/train.spacy"
      - "mv corpus/${vars.treebank}/${vars.dev_name}.spacy corpus/${vars.treebank}/dev.spacy"
      - "mv corpus/${vars.treebank}/${vars.test_name}.spacy corpus/${vars.treebank}/test.spacy"
    deps:
      - "assets/${vars.treebank}/${vars.train_name}.conllu"
      - "assets/${vars.treebank}/${vars.dev_name}.conllu"
      - "assets/${vars.treebank}/${vars.test_name}.conllu"
    outputs:
      - "corpus/${vars.treebank}/train.spacy"
      - "corpus/${vars.treebank}/dev.spacy"
      - "corpus/${vars.treebank}/test.spacy"

  # Trains from configs/${vars.config}.cfg and writes its output under
  # training/${vars.treebank}; train/dev corpora, GPU id and language are
  # passed as command-line overrides.
  # NOTE(review): the config.cfg generated elsewhere in this notebook is a
  # separate file -- confirm which config is meant to drive training.
  - name: train
    help: "Train ${vars.treebank}"
    script:
      - "python -m spacy train configs/${vars.config}.cfg --output training/${vars.treebank} --gpu-id ${vars.gpu} --paths.train corpus/${vars.treebank}/train.spacy --paths.dev corpus/${vars.treebank}/dev.spacy --nlp.lang=${vars.lang}"
    deps:
      - "corpus/${vars.treebank}/train.spacy"
      - "corpus/${vars.treebank}/dev.spacy"
      - "configs/${vars.config}.cfg"
    outputs:
      - "training/${vars.treebank}/model-best"

  - name: evaluate
    help: "Evaluate on the test data and save the metrics"
    script:
      - "python -m spacy evaluate ./training/${vars.treebank}/model-best ./corpus/${vars.treebank}/test.spacy --output ./metrics/${vars.treebank}.json --gpu-id ${vars.gpu}"
    deps:
      - "training/${vars.treebank}/model-best"
      - "corpus/${vars.treebank}/test.spacy"
    outputs:
      - "metrics/${vars.treebank}.json"

  # Builds an installable package from the best checkpoint into packages/.
  - name: package
    help: "Package the trained model so it can be installed"
    script:
      - "python -m spacy package training/${vars.treebank}/model-best packages --name ${vars.package_name} --version ${vars.package_version} --force"
    deps:
      - "training/${vars.treebank}/model-best"
    # The package directory and the archive inside it are both named
    # <lang>_<name>-<version>, so the archive prefix must follow ${vars.lang}
    # rather than a fixed "en_".
    outputs_no_cache:
      - "packages/${vars.lang}_${vars.package_name}-${vars.package_version}/dist/${vars.lang}_${vars.package_name}-${vars.package_version}.tar.gz"

  - name: clean
    help: "Remove intermediate files"
    script:
      - "rm -rf training/*"
      - "rm -rf metrics/*"
      - "rm -rf corpus/*"

Overwriting tagger_parser_ud/project.yml


In [82]:
# Download the assets declared in tagger_parser_ud/project.yml.
!python -m spacy project assets /content/tagger_parser_ud

[38;5;4mℹ Fetching 1 asset(s)[0m
[38;5;2m✔ Downloaded asset /content/tagger_parser_ud/assets/UD_Irish-IDT[0m


In [77]:
%%writefile base_config.cfg
# This is an auto-generated partial config. To use it with 'spacy train'
# you can run spacy init fill-config to auto-fill all default settings:
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
[paths]
train = null
dev = null

[system]
gpu_allocator = null

[nlp]
lang = "ga"
pipeline = ["tok2vec","tagger","morphologizer","parser","ner"]
batch_size = 1000

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["ORTH", "SHAPE"]
rows = [5000, 2500]
include_static_vectors = true

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 256
depth = 8
window_size = 1
maxout_pieces = 3

[components.morphologizer]
factory = "morphologizer"

[components.morphologizer.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.morphologizer.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}

[components.parser]
factory = "parser"

[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = true
nO = null

[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}

[components.ner]
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"

[training.optimizer]
@optimizers = "Adam.v1"

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001

# NOTE(review): paths.vectors is not declared in the [paths] section of this
# partial config; presumably `spacy init fill-config` supplies it -- confirm.
# The tok2vec embed layer sets include_static_vectors = true, so a real vectors
# source most likely has to be provided at training time (e.g. --paths.vectors).
[initialize]
vectors = ${paths.vectors}

Writing base_config.cfg


In [78]:
# Expand the partial base_config.cfg into a complete config.cfg in the current directory.
!python -m spacy init fill-config ./base_config.cfg ./config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [76]:
!rm -rf models
!mkdir models
!python -m spacy train -v /content/ga_vectors_cc -p 'tagger,parser,ner' ga models idt-json/ga_idt-ud-train.json idt-json/ga_idt-ud-dev.json

Usage: python -m spacy train [OPTIONS] CONFIG_PATH
Try 'python -m spacy train --help' for help.

Error: Invalid value for 'CONFIG_PATH': Path '-v' does not exist.


In [41]:
!mkdir modelout

In [39]:
%cd /content
!rm -rf modelout
!rm meta.json

/content


In [40]:
%%writefile meta.json
{
  "name": "idt_sm",
  "lang": "ga",
  "version": "1.0.0",
  "spacy_version": ">=2.0.0,<3.0.0",
  "description": "Irish model for spaCy trained on IDT",
  "author": "Jim O'Regan",
  "email": "jaoregan@tcd.ie",
  "license": "CC BY-SA 3.0",
  "url": "https://huggingface.co/jimregan",
  "pipeline": ["tagger", "parser", "ner"]
}

Writing meta.json


In [42]:
# Generate a package for the model in /content/models/model-best under modelout/,
# taking the package metadata from the hand-written meta.json.
!python -m spacy package --meta meta.json /content/models/model-best modelout

[38;5;2m✔ Loaded meta.json from file[0m
meta.json
[38;5;2m✔ Successfully created package 'ga_idt_sm-1.0.0'[0m
modelout/ga_idt_sm-1.0.0
To build the package, run `python setup.py sdist` in this directory.


In [44]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (1,931 kB/s)
Selecting previously unselected package git-lfs.
(Reading database ... 155219 files and directories currently installed.)
Preparing to unpack .../git-lfs_2.3.4-1_amd64.deb ...
Unpacking git-lfs (2.3.4-1) ...
Setting up git-lfs (2.3.4-1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [46]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 4.4 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 9.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 5.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 17.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 18.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  

In [48]:
# Log in to the Hugging Face Hub (prompts for username and password).
# NOTE: the command echoes the access token in its output -- clear this cell's
# output before saving or sharing the notebook.
!transformers-cli login



        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        
Username: jimregan
Password: 
Login successful
Your token: [REDACTED - the original token was published with this notebook and should be revoked]

Your token has been saved to /root/.huggingface/token


In [54]:
# List the contents of the package archive. The dist/ path is relative to the
# current directory; the step that produced the archive is not visible here.
!tar ztvf dist/ga_idt_sm-1.0.0.tar.gz



drwxr-xr-x root/root         0 2021-11-17 21:35 ga_idt_sm-1.0.0/
-rw-r--r-- root/root        17 2021-11-17 21:35 ga_idt_sm-1.0.0/MANIFEST.in
-rw-r--r-- root/root       242 2021-11-17 21:35 ga_idt_sm-1.0.0/PKG-INFO
drwxr-xr-x root/root         0 2021-11-17 21:35 ga_idt_sm-1.0.0/ga_idt_sm/
-rw-r--r-- root/root       291 2021-11-17 21:35 ga_idt_sm-1.0.0/ga_idt_sm/__init__.py
drwxr-xr-x root/root         0 2021-11-17 21:35 ga_idt_sm-1.0.0/ga_idt_sm/ga_idt_sm-1.0.0/
-rw-r--r-- root/root       333 2021-11-17 21:35 ga_idt_sm-1.0.0/ga_idt_sm/ga_idt_sm-1.0.0/meta.json
drwxr-xr-x root/root         0 2021-11-17 21:35 ga_idt_sm-1.0.0/ga_idt_sm/ga_idt_sm-1.0.0/ner/
-rw-r--r-- root/root       367 2021-11-17 20:59 ga_idt_sm-1.0.0/ga_idt_sm/ga_idt_sm-1.0.0/ner/cfg
-rw-r--r-- root/root   4227498 2021-11-17 20:59 ga_idt_sm-1.0.0/ga_idt_sm/ga_idt_sm-1.0.0/ner/model
-rw-r--r-- root/root        60 2021-11-17 20:59 ga_idt_sm-1.0.0/ga_idt_sm/ga_idt_sm-1.0.0/ner/moves
drwxr-xr-x root/root         0 2021-11-17

In [67]:
# Switch to the generated package directory; the relative paths in the cells below depend on it.
%cd /content/modelout/ga_idt_sm-1.0.0

/content/modelout/ga_idt_sm-1.0.0


In [68]:
# Inspect the contents of the current (package) directory.
!ls

dist  ga_idt_sm  ga_idt_sm.egg-info  MANIFEST.in  meta.json  setup.py


In [55]:
# Create the `ga_idt_sm` repository on the Hugging Face Hub (asks for confirmation).
!transformers-cli repo create ga_idt_sm

[90mgit version 2.17.1[0m
Error: unknown flag: --version

[90mSorry, no usage text found for "git-lfs"[0m

You are about to create [1mjimregan/ga_idt_sm[0m
Proceed? [Y/n] Y

Your repo now lives at:
  [1mhttps://huggingface.co/jimregan/ga_idt_sm[0m

You can clone it locally with the command below, and commit/push as usual.

  git clone https://huggingface.co/jimregan/ga_idt_sm



In [69]:
# Clone the Hub repository into ./hf so the packaged model can be committed to it.
!git clone https://huggingface.co/jimregan/ga_idt_sm hf

Cloning into 'hf'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0)[K
Unpacking objects: 100% (3/3), done.


In [74]:
# Try to build a wheel from ./ga_idt_sm into the cloned Hub repository (./hf).
# NOTE(review): this cell ended in a traceback when last run. ./ga_idt_sm looks
# like the Python package directory (it holds __init__.py), with the model data
# one level down in ga_idt_sm-1.0.0/ -- confirm the intended input path.
!python -m spacy package ./ga_idt_sm ./hf --build wheel

[38;5;4mℹ Building package artifacts: wheel[0m
Traceback (most recent call last):
  File "/usr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.7/dist-packages/spacy/__main__.py", line 4, in <module>
    setup_cli()
  File "/usr/local/lib/python3.7/dist-packages/spacy/cli/_util.py", line 71, in setup_cli
    command(prog_name=COMMAND)
  File "/usr/local/lib/python3.7/dist-packages/click/core.py", line 829, in __call__
    return self.main(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/click/core.py", line 782, in main
    rv = self.invoke(ctx)
  File "/usr/local/lib/python3.7/dist-packages/click/core.py", line 1259, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/usr/local/lib/python3.7/dist-packages/click/core.py", line 1066, in invoke
    return ctx.invoke(self.callback, **ctx.params

In [72]:
# Check what ended up inside the cloned Hub repository.
!ls hf

ga_idt_sm-1.0.0


In [66]:
# List the ga_idt_sm-1.0.0/ directory relative to the current directory.
!ls ga_idt_sm-1.0.0/

dist  ga_idt_sm  ga_idt_sm.egg-info  MANIFEST.in  meta.json  setup.py


In [61]:
# Delete the Git metadata of the current directory.
# NOTE(review): destructive and dependent on the working directory at the time
# it runs -- confirm which directory this was meant to target.
!rm -rf .git

In [None]:
# Stage everything in the current directory.
# NOTE(review): this cell has no execution count, and no commit/push step is
# visible after it -- the upload appears unfinished.
!git add .