# "Training spaCy 3 on IDT"
> "Not going well so far"

- toc: false
- branch: master
- comments: true
- categories: [spacy, idt]


In [1]:
%%capture
# Upgrade the packaging toolchain, then install the thinc pre-release and
# the latest spaCy plus its lookup tables. Every line uses a single `!`:
# `!!` is IPython's `%sx` shorthand, which returns the output as a list
# instead of streaming it, and was inconsistent with the lines above.
!pip install -U pip setuptools wheel
!pip install thinc --pre
!pip install -U spacy spacy-lookups-data

In [2]:
# Clone the `pipelines/tagger_parser_ud` project template into the current
# working directory (creates ./tagger_parser_ud).
!python -m spacy project clone pipelines/tagger_parser_ud

[38;5;2m✔ Cloned 'pipelines/tagger_parser_ud' from explosion/projects[0m
/content/tagger_parser_ud
[38;5;2m✔ Your project is now ready![0m
To fetch the assets, run:
python -m spacy project assets /content/tagger_parser_ud


In [3]:
%%writefile tagger_parser_ud/project.yml
# Title and description are carried over unchanged from the upstream template.
# NOTE(review): the description still names UD_English-EWT as the default
# treebank, while `vars` below is configured for UD_Irish-IDT (lang "ga").
title: "Part-of-speech Tagging & Dependency Parsing (Universal Dependencies)"
description: "This project template lets you train a part-of-speech tagger, morphologizer and dependency parser from a [Universal Dependencies](https://universaldependencies.org/) corpus. It takes care of downloading the treebank, converting it to spaCy's format and training and evaluating the model. The template uses the [`UD_English-EWT`](https://github.com/UniversalDependencies/UD_English-EWT) treebank by default, but you can swap it out for any other available treebank. Just make sure to adjust the `lang` and treebank settings in the variables below. Use `xx` for multi-language if no language-specific tokenizer is available in spaCy. Note that multi-word tokens will be merged together when the corpus is converted since spaCy does not support multi-word token expansion."

# Project-wide variables; reference them anywhere in this file as
# ${vars.var_name}.
vars:
  # Base name of the training config: configs/<config>.cfg
  config: 'default'
  # Language code passed to training as --nlp.lang
  lang: 'ga'
  # Treebank repository name; also used for asset/corpus/training paths
  treebank: 'UD_Irish-IDT'
  # File stems of the .conllu splits inside the treebank repository
  train_name: 'ga_idt-ud-train'
  dev_name: 'ga_idt-ud-dev'
  test_name: 'ga_idt-ud-test'
  # Name and version handed to `spacy package`
  package_name: 'ud_ga_idt'
  package_version: '0.0.0'
  # Value passed as --gpu-id to `spacy train` and `spacy evaluate`
  gpu: 0

# Directories required by the project. The project CLI creates any that are
# missing before commands run.
directories:
  - "assets"
  - "corpus"
  - "training"
  - "metrics"
  - "configs"
  - "packages"

# The only asset is the treebank itself, fetched from its
# UniversalDependencies GitHub repository into assets/<treebank>.
assets:
  - dest: 'assets/${vars.treebank}'
    git:
      repo: 'https://github.com/UniversalDependencies/${vars.treebank}'
      # Tracks the `dev` branch of the treebank repository.
      branch: 'dev'
      # Empty path: no sub-path of the repository is selected.
      path: ''

# The `all` workflow chains four of the commands defined under `commands`,
# in the order listed. The `clean` command is not part of it.
workflows:
  all:
    - preprocess
    - train
    - evaluate
    - package

commands:
  # Convert each CoNLL-U split to a .spacy file (10 sentences per document,
  # subtokens merged, morphology kept), then rename the results to the fixed
  # train/dev/test names the later commands depend on.
  - name: preprocess
    help: "Convert the data to spaCy's format"
    script:
      - 'mkdir -p corpus/${vars.treebank}'
      - >-
        python -m spacy convert
        assets/${vars.treebank}/${vars.train_name}.conllu
        corpus/${vars.treebank}/
        --converter conllu --n-sents 10 --merge-subtokens --morphology
      - >-
        python -m spacy convert
        assets/${vars.treebank}/${vars.dev_name}.conllu
        corpus/${vars.treebank}/
        --converter conllu --n-sents 10 --merge-subtokens --morphology
      - >-
        python -m spacy convert
        assets/${vars.treebank}/${vars.test_name}.conllu
        corpus/${vars.treebank}/
        --converter conllu --n-sents 10 --merge-subtokens --morphology
      - >-
        mv corpus/${vars.treebank}/${vars.train_name}.spacy
        corpus/${vars.treebank}/train.spacy
      - >-
        mv corpus/${vars.treebank}/${vars.dev_name}.spacy
        corpus/${vars.treebank}/dev.spacy
      - >-
        mv corpus/${vars.treebank}/${vars.test_name}.spacy
        corpus/${vars.treebank}/test.spacy
    deps:
      - 'assets/${vars.treebank}/${vars.train_name}.conllu'
      - 'assets/${vars.treebank}/${vars.dev_name}.conllu'
      - 'assets/${vars.treebank}/${vars.test_name}.conllu'
    outputs:
      - 'corpus/${vars.treebank}/train.spacy'
      - 'corpus/${vars.treebank}/dev.spacy'
      - 'corpus/${vars.treebank}/test.spacy'

  # Train from configs/<config>.cfg on the converted train/dev corpora and
  # write checkpoints under training/<treebank>. The language code and the
  # corpus paths are supplied as config overrides on the command line.
  - name: train
    help: 'Train ${vars.treebank}'
    script:
      - >-
        python -m spacy train configs/${vars.config}.cfg
        --output training/${vars.treebank}
        --gpu-id ${vars.gpu}
        --paths.train corpus/${vars.treebank}/train.spacy
        --paths.dev corpus/${vars.treebank}/dev.spacy
        --nlp.lang=${vars.lang}
    deps:
      - 'corpus/${vars.treebank}/train.spacy'
      - 'corpus/${vars.treebank}/dev.spacy'
      - 'configs/${vars.config}.cfg'
    outputs:
      - 'training/${vars.treebank}/model-best'

  # Score the best checkpoint on the held-out test split and store the
  # results as metrics/<treebank>.json.
  - name: evaluate
    help: 'Evaluate on the test data and save the metrics'
    script:
      - >-
        python -m spacy evaluate
        ./training/${vars.treebank}/model-best
        ./corpus/${vars.treebank}/test.spacy
        --output ./metrics/${vars.treebank}.json
        --gpu-id ${vars.gpu}
    deps:
      - 'training/${vars.treebank}/model-best'
      - 'corpus/${vars.treebank}/test.spacy'
    outputs:
      - 'metrics/${vars.treebank}.json'

  # Build an installable package from the best checkpoint into packages/.
  # The declared output uses ${vars.lang} for both the package directory and
  # the archive name. The archive prefix was previously hard-coded to "en_",
  # which cannot match a pipeline whose language is set to anything else
  # (here "ga"), so the declared output would never be found.
  - name: package
    help: "Package the trained model so it can be installed"
    script:
      - "python -m spacy package training/${vars.treebank}/model-best packages --name ${vars.package_name} --version ${vars.package_version} --force"
    deps:
      - "training/${vars.treebank}/model-best"
    outputs_no_cache:
      - "packages/${vars.lang}_${vars.package_name}-${vars.package_version}/dist/${vars.lang}_${vars.package_name}-${vars.package_version}.tar.gz"

  # Empty the training, metrics and corpus directories. The assets and
  # packages directories are left untouched.
  - name: clean
    help: 'Remove intermediate files'
    script:
      - 'rm -rf training/*'
      - 'rm -rf metrics/*'
      - 'rm -rf corpus/*'

Overwriting tagger_parser_ud/project.yml


In [4]:
# Download the assets declared in tagger_parser_ud/project.yml (the treebank).
!python -m spacy project assets /content/tagger_parser_ud

[38;5;4mℹ Fetching 1 asset(s)[0m
[38;5;2m✔ Downloaded asset /content/tagger_parser_ud/assets/UD_Irish-IDT[0m


In [5]:
# Run the project's `preprocess` command from /content, the directory that
# contains the tagger_parser_ud project.
%cd /content
!python -m spacy project run preprocess tagger_parser_ud

/content
[1m
Running command: mkdir -p corpus/UD_Irish-IDT
Running command: /usr/bin/python3 -m spacy convert assets/UD_Irish-IDT/ga_idt-ud-train.conllu corpus/UD_Irish-IDT/ --converter conllu --n-sents 10 --merge-subtokens --morphology
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (401 documents):
corpus/UD_Irish-IDT/ga_idt-ud-train.spacy[0m
Running command: /usr/bin/python3 -m spacy convert assets/UD_Irish-IDT/ga_idt-ud-dev.conllu corpus/UD_Irish-IDT/ --converter conllu --n-sents 10 --merge-subtokens --morphology
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (46 documents):
corpus/UD_Irish-IDT/ga_idt-ud-dev.spacy[0m
Running command: /usr/bin/python3 -m spacy convert assets/UD_Irish-IDT/ga_idt-ud-test.conllu corpus/UD_Irish-IDT/ --converter conllu --n-sents 10 --merge-subtokens --morphology
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (46 doc

In [6]:
# Run the project's `train` command (spacy train with configs/default.cfg).
!python -m spacy project run train tagger_parser_ud

[1m
Running command: /usr/bin/python3 -m spacy train configs/default.cfg --output training/UD_Irish-IDT --gpu-id 0 --paths.train corpus/UD_Irish-IDT/train.spacy --paths.dev corpus/UD_Irish-IDT/dev.spacy --nlp.lang=ga
[38;5;2m✔ Created output directory: training/UD_Irish-IDT[0m
[38;5;4mℹ Saving to output directory: training/UD_Irish-IDT[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2021-11-17 23:48:13,392] [INFO] Set up nlp object from config
[2021-11-17 23:48:13,407] [INFO] Pipeline: ['tok2vec', 'tagger', 'morphologizer', 'parser']
[2021-11-17 23:48:13,414] [INFO] Created vocabulary
[2021-11-17 23:48:13,416] [INFO] Finished initializing nlp object
[2021-11-17 23:49:40,722] [INFO] Initialized pipeline components: ['tok2vec', 'tagger', 'morphologizer', 'parser']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'tagger', 'morphologizer', 'parser'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TAGGER  LOSS MORPH...  LOSS PARSER  TAG_ACC  

In [44]:
# Install git-lfs non-interactively. `apt-get ... -y` never waits for a
# confirmation prompt, which would otherwise stall an unattended notebook run.
!apt-get install -y git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (1,931 kB/s)
Selecting previously unselected package git-lfs.
(Reading database ... 155219 files and directories currently installed.)
Preparing to unpack .../git-lfs_2.3.4-1_amd64.deb ...
Unpacking git-lfs (2.3.4-1) ...
Setting up git-lfs (2.3.4-1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [None]:
%%capture
# Install the transformers package; %%capture hides pip's output.
!pip install transformers

In [None]:
# NOTE(review): presumably logs in to the Hugging Face Hub so a model can be
# uploaded later; this command prompts for credentials — confirm intent.
!transformers-cli login
