# "Training spaCy 3 on IDT"
> "Not going well so far"

- toc: false
- branch: master
- comments: true
- categories: [spacy, idt]


In [1]:
%%capture
!pip install -U pip setuptools wheel
!pip install thinc --pre
!pip install -U spacy-lookups-data
!pip install -U datasets floret

In [2]:
%%capture
!pip install -U spacy spacy-lookups-data

In [3]:
!pip uninstall spacy


Found existing installation: spacy 3.2.0
Uninstalling spacy-3.2.0:
  Would remove:
    /usr/local/bin/spacy
    /usr/local/lib/python3.7/dist-packages/spacy-3.2.0.dist-info/*
    /usr/local/lib/python3.7/dist-packages/spacy/*
Proceed (Y/n)? Y
  Successfully uninstalled spacy-3.2.0


In [None]:
#!pip install -U Cython numpy
!pip install --verbose -U git+https://github.com/jimregan/spaCy@patch-2

In [5]:
!python -m spacy project clone pipelines/tagger_parser_ud

[38;5;2m✔ Cloned 'pipelines/tagger_parser_ud' from explosion/projects[0m
/content/tagger_parser_ud
[38;5;2m✔ Your project is now ready![0m
To fetch the assets, run:
python -m spacy project assets /content/tagger_parser_ud


In [6]:
%%writefile tagger_parser_ud/project.yml
title: "Part-of-speech Tagging & Dependency Parsing (Universal Dependencies)"
description: "This project template lets you train a part-of-speech tagger, morphologizer and dependency parser from a [Universal Dependencies](https://universaldependencies.org/) corpus. It takes care of downloading the treebank, converting it to spaCy's format and training and evaluating the model. The template uses the [`UD_English-EWT`](https://github.com/UniversalDependencies/UD_English-EWT) treebank by default, but you can swap it out for any other available treebank. Just make sure to adjust the `lang` and treebank settings in the variables below. Use `xx` for multi-language if no language-specific tokenizer is available in spaCy. Note that multi-word tokens will be merged together when the corpus is converted since spaCy does not support multi-word token expansion."

# Variables can be referenced across the project.yml using ${vars.var_name}
vars:
  config: "default"
  lang: "ga"
  treebank: "UD_Irish-IDT"
  train_name: "ga_idt-ud-train"
  dev_name: "ga_idt-ud-dev"
  test_name: "ga_idt-ud-test"
  package_name: "ud_ga_idt"
  package_version: "0.0.0"
  gpu: -1

# These are the directories that the project needs. The project CLI will make
# sure that they always exist.
directories: ["assets", "corpus", "training", "metrics", "configs", "packages"]

assets:
  - url: "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.vec.gz"
    dest: "assets/vectors.vec.gz"
    description: "FastText vectors"
  - dest: "assets/${vars.treebank}"
    git:
      repo: "https://github.com/UniversalDependencies/${vars.treebank}"
      branch: "master"
      path: ""

workflows:
  all:
    - preprocess
    - vectors
    - train
    - evaluate
    - package

commands:
  - name: preprocess
    help: "Convert the data to spaCy's format"
    script:
      - "mkdir -p corpus/${vars.treebank}"
      - "python -m spacy convert assets/${vars.treebank}/${vars.train_name}.conllu corpus/${vars.treebank}/ --converter conllu --n-sents 10 --merge-subtokens --morphology"
      - "python -m spacy convert assets/${vars.treebank}/${vars.dev_name}.conllu corpus/${vars.treebank}/ --converter conllu --n-sents 10 --merge-subtokens --morphology"
      - "python -m spacy convert assets/${vars.treebank}/${vars.test_name}.conllu corpus/${vars.treebank}/ --converter conllu --n-sents 10 --merge-subtokens --morphology"
      - "mv corpus/${vars.treebank}/${vars.train_name}.spacy corpus/${vars.treebank}/train.spacy"
      - "mv corpus/${vars.treebank}/${vars.dev_name}.spacy corpus/${vars.treebank}/dev.spacy"
      - "mv corpus/${vars.treebank}/${vars.test_name}.spacy corpus/${vars.treebank}/test.spacy"
    deps:
      - "assets/${vars.treebank}/${vars.train_name}.conllu"
      - "assets/${vars.treebank}/${vars.dev_name}.conllu"
      - "assets/${vars.treebank}/${vars.test_name}.conllu"
    outputs:
      - "corpus/${vars.treebank}/train.spacy"
      - "corpus/${vars.treebank}/dev.spacy"
      - "corpus/${vars.treebank}/test.spacy"

  - name: vectors
    help: "Convert, truncate and prune the vectors."
    script:
      - "python -m spacy init vectors ga assets/vectors.vec.gz corpus/ga_vectors -n ga_fasttext_cc_vectors_md"
    deps:
      - "assets/vectors.vec.gz"
    outputs:
      - "corpus/ga_vectors"

  - name: train
    help: "Train ${vars.treebank}"
    script:
      - "python -m spacy train configs/${vars.config}.cfg --output training/${vars.treebank} --gpu-id ${vars.gpu} --paths.train corpus/${vars.treebank}/train.spacy --paths.dev corpus/${vars.treebank}/dev.spacy --paths.vectors corpus/ga_vectors --nlp.lang=${vars.lang}"
    deps:
      - "corpus/${vars.treebank}/train.spacy"
      - "corpus/${vars.treebank}/dev.spacy"
      - "configs/${vars.config}.cfg"
      - "corpus/ga_vectors"
    outputs:
      - "training/${vars.treebank}/model-best"

  - name: evaluate
    help: "Evaluate on the test data and save the metrics"
    script:
      - "python -m spacy evaluate ./training/${vars.treebank}/model-best ./corpus/${vars.treebank}/test.spacy --output ./metrics/${vars.treebank}.json --gpu-id ${vars.gpu}"
    deps:
      - "training/${vars.treebank}/model-best"
      - "corpus/${vars.treebank}/test.spacy"
    outputs:
      - "metrics/${vars.treebank}.json"

  - name: package
    help: "Package the trained model so it can be installed"
    script:
      - "python -m spacy package training/${vars.treebank}/model-best packages --name ${vars.package_name} --version ${vars.package_version} --force"
    deps:
      - "training/${vars.treebank}/model-best"
    outputs_no_cache:
      - "packages/${vars.lang}_${vars.package_name}-${vars.package_version}/dist/en_${vars.package_name}-${vars.package_version}.tar.gz"

  - name: clean
    help: "Remove intermediate files"
    script:
      - "rm -rf training/*"
      - "rm -rf metrics/*"
      - "rm -rf corpus/*"

Overwriting tagger_parser_ud/project.yml


In [7]:
!python -m spacy project assets /content/tagger_parser_ud

[38;5;4mℹ Fetching 2 asset(s)[0m
[38;5;2m✔ Downloaded asset /content/tagger_parser_ud/assets/vectors.vec.gz[0m
[38;5;2m✔ Downloaded asset /content/tagger_parser_ud/assets/UD_Irish-IDT[0m


In [8]:
!python -m spacy project run vectors /content/tagger_parser_ud

[1m
Running command: /usr/bin/python3 -m spacy init vectors ga assets/vectors.vec.gz corpus/ga_vectors -n ga_fasttext_cc_vectors_md
[38;5;4mℹ Creating blank nlp object for language 'ga'[0m
[2021-12-05 21:34:09,293] [INFO] Reading vectors from assets/vectors.vec.gz
316836it [00:28, 11302.98it/s]
[2021-12-05 21:34:37,505] [INFO] Loaded vectors from assets/vectors.vec.gz
[38;5;2m✔ Successfully converted 316836 vectors[0m
[38;5;2m✔ Saved nlp object with vectors to output directory. You can now use
the path to it in your config as the 'vectors' setting in [initialize].[0m
/content/tagger_parser_ud/corpus/ga_vectors


In [9]:
!python -m spacy project run preprocess /content/tagger_parser_ud

[1m
Running command: mkdir -p corpus/UD_Irish-IDT
Running command: /usr/bin/python3 -m spacy convert assets/UD_Irish-IDT/ga_idt-ud-train.conllu corpus/UD_Irish-IDT/ --converter conllu --n-sents 10 --merge-subtokens --morphology
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (401 documents):
corpus/UD_Irish-IDT/ga_idt-ud-train.spacy[0m
Running command: /usr/bin/python3 -m spacy convert assets/UD_Irish-IDT/ga_idt-ud-dev.conllu corpus/UD_Irish-IDT/ --converter conllu --n-sents 10 --merge-subtokens --morphology
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (46 documents):
corpus/UD_Irish-IDT/ga_idt-ud-dev.spacy[0m
Running command: /usr/bin/python3 -m spacy convert assets/UD_Irish-IDT/ga_idt-ud-test.conllu corpus/UD_Irish-IDT/ --converter conllu --n-sents 10 --merge-subtokens --morphology
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (46 documents):


In [10]:
%%writefile /content/tagger_parser_ud/configs/base_default.cfg
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
seed = 0

[nlp]
lang = "ga"
pipeline = ["tok2vec","tagger","morphologizer","parser"]
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000

[components]

[components.morphologizer]
factory = "morphologizer"

[components.morphologizer.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.morphologizer.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 30
moves = null
update_with_oracle_cut_size = 100

[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = true
nO = null

[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode.width}
attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 2000
gold_preproc = false
limit = 0
augmenter = null

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
before_to_disk = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
morph_per_feat = null
dep_las_per_type = null
sents_p = null
sents_r = null
tag_acc = 0.33
pos_acc = 0.17
morph_acc = 0.17
dep_uas = 0.17
dep_las = 0.17
sents_f = 0.0

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]


Writing /content/tagger_parser_ud/configs/base_default.cfg


In [11]:
!rm /content/tagger_parser_ud/configs/default.cfg
!python -m spacy init fill-config /content/tagger_parser_ud/configs/base_default.cfg /content/tagger_parser_ud/configs/default.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/tagger_parser_ud/configs/default.cfg
You can now add your data and train your pipeline:
python -m spacy train default.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [12]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (2,010 kB/s)
Selecting previously unselected package git-lfs.
(Reading database ... 155222 files and directories currently installed.)
Preparing to unpack .../git-lfs_2.3.4-1_amd64.deb ...
Unpacking git-lfs (2.3.4-1) ...
Setting up git-lfs (2.3.4-1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [13]:
%%writefile tmp_config.cfg
# This is an auto-generated partial config. To use it with 'spacy train'
# you can run spacy init fill-config to auto-fill all default settings:
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
[paths]
train = null
dev = null

[system]
gpu_allocator = null

[nlp]
lang = "ga"
pipeline = ["tok2vec","tagger","morphologizer","parser"]
batch_size = 1000

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["ORTH", "SHAPE"]
rows = [5000, 2500]
include_static_vectors = true

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 256
depth = 8
window_size = 1
maxout_pieces = 3

[components.morphologizer]
factory = "morphologizer"

[components.morphologizer.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.morphologizer.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}

[components.parser]
factory = "parser"

[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = true
nO = null

[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}

[components.ner]
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"

[training.optimizer]
@optimizers = "Adam.v1"

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001

[initialize]
vectors = ${paths.vectors}

Writing tmp_config.cfg


In [14]:
!python -m spacy init fill-config tmp_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [15]:
#!rm -rf spacy_ud
!mkdir spacy_ud

In [16]:
!python -m spacy project clone pipelines/floret_vectors_demo

[38;5;2m✔ Cloned 'pipelines/floret_vectors_demo' from explosion/projects[0m
/content/floret_vectors_demo
[38;5;2m✔ Your project is now ready![0m
To fetch the assets, run:
python -m spacy project assets /content/floret_vectors_demo


In [17]:
%%writefile /content/floret_vectors_demo/project.yml
title: "Demo floret vectors"
description: "Train floret vectors and load them into a spaCy vectors model."
spacy_version: ">=3.2.0,<4.0.0"
# Variables can be referenced across the project.yml using ${vars.var_name}
vars:
  name: "floret_vectors"
  lang: "ga"
  oscar_dataset: "unshuffled_deduplicated_ga"
  max_texts: 1000
  # number of processes (tokenization) and threads (floret)
  n_process: 8

# These are the directories that the project needs. The project CLI will make
# sure that they always exist.
directories: ["corpus", "scripts", "vectors"]

## Workflows are sequences of commands (see below) executed in order. You can
## run them via "spacy project run [workflow]". If a commands's inputs/outputs
## haven't changed, it won't be re-run.
workflows:
  all:
    - tokenize-oscar
    - train-floret
    - init-floret-vectors
    - floret-nn

# Project commands, specified in a style similar to CI config files (e.g. Azure
# pipelines). The name is the command name that lets you trigger the command
# via "spacy project run [command] [path]". The help message is optional and
# shown when executing "spacy project run [optional command] [path] --help".
commands:
  - name: "tokenize-oscar"
    help: "Download, tokenize, and sentencize data"
    script:
      - "python scripts/tokenize_dataset.py ${vars.lang} ${vars.oscar_dataset} ${vars.max_texts} corpus/${vars.oscar_dataset}.${vars.max_texts}.tok.txt --n-process=${vars.n_process}"
    deps:
      - "scripts/tokenize_dataset.py"
    outputs:
      - "corpus/${vars.oscar_dataset}.${vars.max_texts}.tok.txt"

  - name: "train-floret"
    help: "Train floret vectors"
    script:
      - "python scripts/train_floret.py --model cbow --dim 300 --mincount 10 --minn 3 --maxn 6 --neg 10 --mode floret --hashcount 2 --bucket 20000 --thread ${vars.n_process} corpus/${vars.oscar_dataset}.${vars.max_texts}.tok.txt vectors/${vars.oscar_dataset}.${vars.max_texts}.dim300.minCount10.n3-6.neg10.modeFloret.hashCount2.bucket20000"
    deps:
      - "scripts/train_floret.py"
      - "corpus/${vars.oscar_dataset}.${vars.max_texts}.tok.txt"
    outputs:
      - "vectors/${vars.oscar_dataset}.${vars.max_texts}.dim300.minCount10.n3-6.neg10.modeFloret.hashCount2.bucket20000.floret"
      - "vectors/${vars.oscar_dataset}.${vars.max_texts}.dim300.minCount10.n3-6.neg10.modeFloret.hashCount2.bucket20000.vec"
      - "vectors/${vars.oscar_dataset}.${vars.max_texts}.dim300.minCount10.n3-6.neg10.modeFloret.hashCount2.bucket20000.bin"

  - name: "init-floret-vectors"
    help: "Create a floret vectors model"
    script:
      - "python -m spacy init vectors ${vars.lang} vectors/${vars.oscar_dataset}.${vars.max_texts}.dim300.minCount10.n3-6.neg10.modeFloret.hashCount2.bucket20000.floret vectors/${vars.oscar_dataset}.${vars.max_texts}_floret_model --mode floret" 
    deps:
      - "vectors/${vars.oscar_dataset}.${vars.max_texts}.dim300.minCount10.n3-6.neg10.modeFloret.hashCount2.bucket20000.floret"
    outputs:
      - "vectors/${vars.oscar_dataset}.${vars.max_texts}_floret_model"

  - name: "floret-nn"
    help: "Demo nearest neighbors for intentional OOV misspelling 'outdooor'"
    script:
      - "python scripts/nn_floret.py vectors/${vars.oscar_dataset}.${vars.max_texts}.dim300.minCount10.n3-6.neg10.modeFloret.hashCount2.bucket20000.bin outdooor"
    deps:
      - "scripts/nn_floret.py"
      - "vectors/${vars.oscar_dataset}.${vars.max_texts}.dim300.minCount10.n3-6.neg10.modeFloret.hashCount2.bucket20000.bin"


Overwriting /content/floret_vectors_demo/project.yml


In [18]:
!python -m spacy project run tokenize-oscar /content/floret_vectors_demo

[1m
Running command: /usr/bin/python3 scripts/tokenize_dataset.py ga unshuffled_deduplicated_ga 1000 corpus/unshuffled_deduplicated_ga.1000.tok.txt --n-process=8
Downloading: 14.8kB [00:00, 11.0MB/s]       
Downloading: 3.07MB [00:00, 78.3MB/s]      


In [19]:
!tail /content/floret_vectors_demo/corpus/unshuffled_deduplicated_ga.1000.tok.txt

Tubaiste cheart a bhí i gcogadh na hIaráice .
Maraíodh na céadta míle duine ; chuaigh an fhuath i dtaobh na Breataine sna tíortha Ioslamacha i méid ; rinneadh dochar don chomhaontas idir an Bhreatain agus an Eoraip ; Agus tarraingíodh míchliú ar oidhreacht Blair agus ar ghníomhréim a pháirtí .
Is í Mairéad Ní Chuaig , craoltóir agus bean na haimsire ar TG4 , a bhí i mbun cainte le Sara Ní Chuirreáin an tseachtain seo … Ceist mhaith !
Eicléicteach , banda agus boihéamach .
Tá suim mhór agam i stíl na seascaidí .
Bíonn an-tionchar ag ceol , scannáin agus taisteal ar mo stíl féin .
Ní thugam suntas do na faisin is deireanaí .
Bíonn mé ag bailiú éadaí ar fud na cruinne , agus is breá liom hataí .
Fachtóir cosanta ar an ngrian ( Tá mé ag úsáid uachtar gréine le spf ard ó bhí mé 16 , comhairle mhaith a fuair mé ó mo mhama ! ) ,
línitheoir súl agus maothóir .


In [20]:
!cat /content/tagger_parser_ud/assets/UD_Irish-IDT/ga_idt-ud-*.conllu |grep 'text =' |sed -e 's/^# text = //' > idt-sentences.txt

In [21]:
import re
import spacy
import typer
from itertools import islice
from pathlib import Path


lang = "ga"
n_process = 8
batch_size = 100
nlp = spacy.blank(lang)
input_file = "idt-sentences.txt"
output_file = "idt-split.txt"

nlp.add_pipe("sentencizer")
nlp.max_length = 10 ** 8

idt_lines = []
with open(input_file) as input_fileh:
    for line in input_fileh.readlines():
        idt_lines.append(line.strip())

with open(output_file, "w") as output_fileh:
    texts = (
        re.sub("\s+", " ", line.strip())
        for line in idt_lines
    )
    for doc in nlp.pipe(texts, batch_size=batch_size):
        for sent in doc.sents:
            output_fileh.write(" ".join([t.text for t in sent]) + "\n")

In [22]:
!cat idt-split.txt >> /content/floret_vectors_demo/corpus/unshuffled_deduplicated_ga.1000.tok.txt

In [23]:
!python -m spacy project run train-floret /content/floret_vectors_demo

[1m
Running command: /usr/bin/python3 scripts/train_floret.py --model cbow --dim 300 --mincount 10 --minn 3 --maxn 6 --neg 10 --mode floret --hashcount 2 --bucket 20000 --thread 8 corpus/unshuffled_deduplicated_ga.1000.tok.txt vectors/unshuffled_deduplicated_ga.1000.dim300.minCount10.n3-6.neg10.modeFloret.hashCount2.bucket20000
Read 0M words
Number of words:  3547
Number of labels: 0
Progress: 100.0% words/sec/thread:    7829 lr:  0.000000 avg.loss:  3.232815 ETA:   0h 0m 0s


In [24]:
!python -m spacy project run init-floret-vectors /content/floret_vectors_demo

[1m
Running command: /usr/bin/python3 -m spacy init vectors ga vectors/unshuffled_deduplicated_ga.1000.dim300.minCount10.n3-6.neg10.modeFloret.hashCount2.bucket20000.floret vectors/unshuffled_deduplicated_ga.1000_floret_model --mode floret
[38;5;4mℹ Creating blank nlp object for language 'ga'[0m
[2021-12-05 21:38:20,042] [INFO] Reading vectors from vectors/unshuffled_deduplicated_ga.1000.dim300.minCount10.n3-6.neg10.modeFloret.hashCount2.bucket20000.floret
20000it [00:01, 14267.75it/s]
[2021-12-05 21:38:21,458] [INFO] Loaded vectors from vectors/unshuffled_deduplicated_ga.1000.dim300.minCount10.n3-6.neg10.modeFloret.hashCount2.bucket20000.floret
[38;5;2m✔ Successfully converted 20000 vectors[0m
[38;5;2m✔ Saved nlp object with vectors to output directory. You can now use
the path to it in your config as the 'vectors' setting in [initialize].[0m
/content/floret_vectors_demo/vectors/unshuffled_deduplicated_ga.1000_floret_model


In [None]:
!python -m spacy train config.cfg --paths.train /content/tagger_parser_ud/corpus/UD_Irish-IDT/train.spacy --paths.dev /content/tagger_parser_ud/corpus/UD_Irish-IDT/dev.spacy --paths.vectors /content/floret_vectors_demo/vectors/unshuffled_deduplicated_ga.1000_floret_model --output spacy_ud

In [26]:
!python -m spacy debug data /content/spacy_ud/model-best/config.cfg --no-format --verbose

Data file validation
Pipeline can be initialized with data
Corpus is loadable
Training stats
Language: ga
Training pipeline: tok2vec, tagger, morphologizer, parser
401 training docs
46 evaluation docs
No overlap between training and evaluation data
Low number of examples to train a new pipeline (401)
It's recommended to use at least 2000 examples (minimum 100)
Vocab & Vectors
95881 total word(s) in the data (14191 unique)
661 misaligned tokens in the training data
90 misaligned tokens in the dev data
10 most common words: 'a' (3954), 'an' (3807), '.' (3805), ',' (3296), 'agus' (2704), 'ar' (2437), 'na' (2182), 'i' (1431), 'go' (1171), 'le' (1159)
20000 vectors (0 unique keys, 300 dimensions)
0 words in training data without vectors (0%)
10 most common words without vectors: 
Part-of-speech Tagging
1034 label(s) in train data
Some model labels are not present in the train data. The model performance may be degraded for these labels after training: 'ADJ__Abbr=Yes|Case=Nom|Gender=Fem|Numb

In [32]:
!mkdir /content/convout
!python -m spacy convert -t json /content/tagger_parser_ud/assets/UD_Irish-IDT/ga_idt-ud-dev.conllu /content/convout  --converter conllu --n-sents 10 --merge-subtokens --morphology
!python -m spacy convert -t json /content/tagger_parser_ud/assets/UD_Irish-IDT/ga_idt-ud-train.conllu /content/convout --converter conllu --n-sents 10 --merge-subtokens --morphology
!python -m spacy convert -t json /content/tagger_parser_ud/assets/UD_Irish-IDT/ga_idt-ud-test.conllu /content/convout --converter conllu --n-sents 10 --merge-subtokens --morphology

mkdir: cannot create directory ‘/content/convout’: File exists
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (1 documents):
/content/convout/ga_idt-ud-dev.json[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (1 documents):
/content/convout/ga_idt-ud-train.json[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (1 documents):
/content/convout/ga_idt-ud-test.json[0m


In [33]:
!ls convout/*json | zip json.zip -@

  adding: convout/ga_idt-ud-dev.json (deflated 94%)
  adding: convout/ga_idt-ud-test.json (deflated 94%)
  adding: convout/ga_idt-ud-train.json (deflated 94%)


In [34]:
import json

In [36]:
with open("/content/convout/ga_idt-ud-dev.json") as devf:
  file = devf.read()
  dev = json.loads(file)

In [62]:
import spacy
nlp = spacy.blank("ga")
nlp.add_pipe("sentencizer")

texts = []
for para in dev[0]["paragraphs"]:
  sents = []
  for sent in para["sentences"]:
    sentence = " ".join([t["orth"] for t in sent["tokens"]])
    sents.append(sentence)
  texts.append(" | ".join(sents))

In [64]:
len(texts)

46

<generator object Language.pipe at 0x7fbc97122b50>