# Stanza

## Setting up Stanza

In [None]:
# Clone the training wrapper repo, then clone stanza inside it
! git clone https://github.com/huberemanuel/stanza-train-br.git
! cd stanza-train-br && git clone https://github.com/huberemanuel/stanza
# Install the wrapper's requirements, stanza itself in editable mode (-e), and pythainlp
! pip install -r /content/stanza-train-br/requirements.txt
! pip install -e /content/stanza-train-br/stanza
! pip install pythainlp
# Overwrite stanza's config.sh and xpos_vocab_factory.py with the copies kept in the wrapper repo
! cd /content/stanza-train-br/ && cp config/config.sh stanza/scripts/config.sh
! cd /content/stanza-train-br/ && cp config/xpos_vocab_factory.py stanza/stanza/models/pos/xpos_vocab_factory.py
# Run the wrapper's make targets for the data and the word vectors
! cd /content/stanza-train-br/ && make download_data
! cd /content/stanza-train-br/ && make download_vectors_colab

## Training


In [None]:
# Run stanza's prepare_pos_treebank module on the Porttinari treebank
# NOTE(review): UDBASE and POS_DATA_DIR are only exported (and the working directory only
# changed) in a later cell - confirm that cell has been run before this one
! python3 -m stanza.utils.datasets.prepare_pos_treebank UD_Portuguese-Porttinari

In [None]:
%%writefile /content/stanza-train-br/stanza/stanza/utils/training/common.py
import argparse
import glob
import logging
import os
import pathlib
import sys
import tempfile

from enum import Enum

from stanza.models.common.constant import treebank_to_short_name
from stanza.resources.common import download, DEFAULT_MODEL_DIR
from stanza.utils.datasets import common
import stanza.utils.default_paths as default_paths
from stanza.utils import conll18_ud_eval as ud_eval

logger = logging.getLogger('stanza')

class Mode(Enum):
    """
    Which operation a training script should perform on each treebank

    Chosen on the command line via --train, --score_dev, --score_test
    or --score_train.  TRAIN is the default when no flag is given.
    """
    TRAIN = 1
    SCORE_DEV = 2
    SCORE_TEST = 3
    SCORE_TRAIN = 4

# Transformer model chosen for each language, keyed by language code.
# The values are model names as published on https://huggingface.co
# NOTE(review): nothing in the visible part of this file reads BERT;
# presumably the individual run_xyz scripts look it up - confirm
BERT = {
    # https://huggingface.co/Maltehb/danish-bert-botxo
    # contrary to normal expectations, this hurts F1
    # on a dev split by about 1 F1
    # "da": "Maltehb/danish-bert-botxo",
    #
    # the multilingual bert is a marginal improvement for conparse
    "da": "bert-base-multilingual-cased",

    # https://huggingface.co/roberta-base
    "en": "roberta-base",

    # NER scores for a couple options:
    # none:
    # dev:  2022-03-04 INFO: fi_turku 83.45
    # test: 2022-03-04 INFO: fi_turku 86.25
    #
    # bert-base-multilingual-cased
    # dev:  2022-03-04 INFO: fi_turku 85.23
    # test: 2022-03-04 INFO: fi_turku 89.00
    #
    # TurkuNLP/bert-base-finnish-cased-v1:
    # dev:  2022-03-04 INFO: fi_turku 88.41
    # test: 2022-03-04 INFO: fi_turku 91.36
    "fi": "TurkuNLP/bert-base-finnish-cased-v1",

    # from https://github.com/idb-ita/GilBERTo
    # annoyingly, it doesn't handle cased text
    # supposedly there is an argument "do_lower_case"
    # but that still leaves a lot of unk tokens
    # "it": "idb-ita/gilberto-uncased-from-camembert",
    #
    # from https://github.com/musixmatchresearch/umberto
    # on NER, this gets 88.37 dev and 91.02 test
    # another option is dbmdz/bert-base-italian-cased,
    # which gets 87.27 dev and 90.32 test
    "it": "Musixmatch/umberto-commoncrawl-cased-v1",

    # experiments on the cintil conparse dataset
    # ran a variety of transformer settings
    # found the following dev set scores after 400 iterations:
    # Geotrend/distilbert-base-pt-cased : not plug & play
    # no bert: 0.9082
    # xlm-roberta-base: 0.9109
    # xlm-roberta-large: 0.9254
    # adalbertojunior/distilbert-portuguese-cased: 0.9300
    # neuralmind/bert-base-portuguese-cased: 0.9307
    # neuralmind/bert-large-portuguese-cased: 0.9343
    "pt": "neuralmind/bert-large-portuguese-cased",

    # https://huggingface.co/dbmdz/bert-base-turkish-128k-cased
    # helps the Turkish model quite a bit
    "tr": "dbmdz/bert-base-turkish-128k-cased",

    # from https://github.com/VinAIResearch/PhoBERT
    # "vi": "vinai/phobert-base",
    # another option is phobert-large, but that doesn't
    # change the scores any
    "vi": "vinai/phobert-large",

    # https://github.com/ymcui/Chinese-BERT-wwm
    # there's also hfl/chinese-roberta-wwm-ext-large
    "zh-hans": "hfl/chinese-roberta-wwm-ext",
}

def build_argparse():
    """
    Build the argument parser shared by the run_xyz training scripts

    The parser accepts one or more treebank names, a run mode
    (--train, --score_dev, --score_test or --score_train) and the
    --save_output, --save_dir and --force options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--save_output',
        dest='temp_output',
        default=True,
        action='store_false',
        help="Save output - default is to use a temp directory.",
    )

    parser.add_argument(
        'treebanks',
        type=str,
        nargs='+',
        help='Which treebanks to run on.  Use all_ud or ud_all for all UD treebanks',
    )

    # Every mode flag stores into the same dest.  Only --train carries a
    # default, which is what makes training the mode when no flag is given.
    parser.add_argument(
        '--train',
        dest='mode',
        default=Mode.TRAIN,
        action='store_const',
        const=Mode.TRAIN,
        help='Run in train mode',
    )
    score_flags = (
        ('--score_dev', Mode.SCORE_DEV, 'Score the dev set'),
        ('--score_test', Mode.SCORE_TEST, 'Score the test set'),
        ('--score_train', Mode.SCORE_TRAIN, 'Score the train set'),
    )
    for flag, score_mode, description in score_flags:
        parser.add_argument(flag, dest='mode', action='store_const', const=score_mode, help=description)

    # Declared here, rather than only in the model's own parser, so the
    # driver can check whether a model already exists in the user-specified home
    parser.add_argument(
        '--save_dir',
        type=str,
        default=None,
        help="Root dir for saving models.  If set, will override the model's default.",
    )

    parser.add_argument('--force', dest='force', action='store_true', default=False, help='Retrain existing models')
    return parser

def main(run_treebank, model_dir, model_name, add_specific_args=None):
    """
    Shared driver behind each of the run_xyz training scripts

    Parses the command line, expands the requested treebanks, and calls
    run_treebank once per treebank.  In train mode, a treebank whose
    model file already exists is skipped unless --force is given.

    Everything after a literal --extra_args is handed to run_treebank
    untouched.  Without --extra_args, whatever the parser does not
    recognize is passed along instead.

    model_name can be a callable taking the parsed args
      - the charlm, for example, relies on this, since it builds
        both forward and backward models

    add_specific_args, if given, is called with the parser so that a
    script can register its own arguments.
    """
    logger.info("Training program called with:\n" + " ".join(sys.argv))

    paths = default_paths.get_default_paths()

    parser = build_argparse()
    if add_specific_args is not None:
        add_specific_args(parser)

    split_at = sys.argv.index('--extra_args') if '--extra_args' in sys.argv else None
    if split_at is None:
        command_args, extra_args = parser.parse_known_args()
    else:
        extra_args = sys.argv[split_at+1:]
        command_args = parser.parse_args(sys.argv[1:split_at])

    # The underlying model needs --save_dir as well, so forward it
    if command_args.save_dir:
        extra_args += ["--save_dir", command_args.save_dir]

    if callable(model_name):
        model_name = model_name(command_args)

    mode = command_args.mode

    treebanks = []
    for requested in command_args.treebanks:
        # A trailing slash is easy to pick up when copy/pasting a UD
        # directory name, and would otherwise kill the job shortly
        # after it finally leaves the queue
        if requested.endswith("/"):
            requested = requested[:-1]
        if requested.lower() not in ('ud_all', 'all_ud'):
            treebanks.append(requested)
            continue
        treebanks.extend(common.get_ud_treebanks(paths["UDBASE"]))

    for position, treebank in enumerate(treebanks):
        if position:
            logger.info("=========================================")

        short_name = treebank_to_short_name(treebank)
        logger.debug("%s: %s" % (treebank, short_name))

        is_ete = model_name == 'ete'

        if mode == Mode.TRAIN and not command_args.force and not is_ete:
            if command_args.save_dir:
                model_root = command_args.save_dir
            else:
                model_root = "saved_models/%s" % (model_dir,)
            model_path = "%s/%s_%s.pt" % (model_root, short_name, model_name)

            if os.path.exists(model_path):
                logger.info("%s: %s exists, skipping!" % (treebank, model_path))
                continue
            logger.info("%s: %s does not exist, training new model" % (treebank, model_path))

        if not command_args.temp_output or is_ete:
            # No temp file: run_treebank picks its own output location
            run_treebank(mode, paths, treebank, short_name,
                         None, command_args, extra_args)
            continue

        # The temp file only exists for the duration of this one treebank
        with tempfile.NamedTemporaryFile() as scratch_file:
            run_treebank(mode, paths, treebank, short_name,
                         scratch_file.name, command_args, extra_args)

def run_eval_script(gold_conllu_file, system_conllu_file, evals=None):
    """
    Score a system conllu file against a gold conllu file with the conll18 UD eval

    If evals is None, returns the table from build_evaluation_table.
    Otherwise evals lists the metrics to report, and the result is the
    F1 of each one, multiplied by 100 and printed with two decimals,
    joined by single spaces in the order given.
    """
    gold_ud = ud_eval.load_conllu_file(gold_conllu_file)
    system_ud = ud_eval.load_conllu_file(system_conllu_file)
    evaluation = ud_eval.evaluate(gold_ud, system_ud)

    if evals is not None:
        f1_scores = [evaluation[metric].f1 for metric in evals]
        formatted = ["{:.2f}".format(100 * score) for score in f1_scores]
        return " ".join(formatted)

    return ud_eval.build_evaluation_table(evaluation, verbose=True, counts=False)

def run_eval_script_tokens(eval_gold, eval_pred):
    """
    Report the Tokens, Sentences and Words F1 scores, in that order
    """
    metrics = ["Tokens", "Sentences", "Words"]
    return run_eval_script(eval_gold, eval_pred, evals=metrics)

def run_eval_script_mwt(eval_gold, eval_pred):
    """
    Report the Words F1 score
    """
    metrics = ["Words"]
    return run_eval_script(eval_gold, eval_pred, evals=metrics)

def run_eval_script_pos(eval_gold, eval_pred):
    """
    Report the UPOS, XPOS, UFeats and AllTags F1 scores, in that order
    """
    metrics = ["UPOS", "XPOS", "UFeats", "AllTags"]
    return run_eval_script(eval_gold, eval_pred, evals=metrics)

def run_eval_script_depparse(eval_gold, eval_pred):
    """
    Report the UAS, LAS, CLAS, MLAS and BLEX F1 scores, in that order
    """
    metrics = ["UAS", "LAS", "CLAS", "MLAS", "BLEX"]
    return run_eval_script(eval_gold, eval_pred, evals=metrics)


def find_wordvec_pretrain(language, default_pretrain):
    """
    Return the path of a word vector pretrain (.pt) for the given language

    Looks in DEFAULT_MODEL_DIR/<language>/pretrain.  If nothing is there,
    a download is attempted first.  default_pretrain maps a language to
    the name of its default pretrain; it decides which package to
    download and which file to pick when several are present.

    Raises FileNotFoundError if no pretrain can be found, or if there
    are several and the default cannot be identified among them.
    """
    pretrain_path = '{}/{}/pretrain/*.pt'.format(DEFAULT_MODEL_DIR, language)
    candidates = glob.glob(pretrain_path)

    if not candidates:
        # TODO: try to extract/remember the specific pretrain for the given model
        # That would also be a good way to archive which pretrains are used for which NER models
        # For now, just download the default and use that
        pretrain_package = default_pretrain.get(language, None)
        if pretrain_package is not None:
            logger.warning(f"Missing pretrain for '{language}'.  Will download the default pretrain '{pretrain_package}'")
            download(lang=language, package=None, processors={"pretrain": pretrain_package})
        else:
            logger.warning(f"Cannot figure out which pretrain to use for '{language}'.  Will download the default package and hope for the best")
            download(lang=language)
        # look again now that the download has had a chance to add files
        candidates = glob.glob(pretrain_path)

    if not candidates:
        raise FileNotFoundError(f"Cannot find any pretrains in {pretrain_path}  Try 'stanza.download(\"{language}\")' to get a default pretrain or use --wordvec_pretrain_file to specify a .pt file to use")

    if len(candidates) == 1:
        pt = candidates[0]
        logger.info(f"Using pretrain found in {pt}  To use a different pretrain, specify --wordvec_pretrain_file")
        return pt

    # Several pretrains: only the language's default can break the tie
    default_pt = default_pretrain.get(language, None)
    if default_pt is None:
        raise FileNotFoundError(f"Too many pretrains to choose from in {pretrain_path}  No default pretrain is specified for language {language}  Must specify an exact path to a --wordvec_pretrain_file")

    for pt_file in candidates:
        # compare on the file name with directory and extension removed
        stem = os.path.splitext(os.path.basename(pt_file))[0]
        if stem == default_pt:
            logger.info(f"Using default pretrain for language, found in {pt_file}  To use a different pretrain, specify --wordvec_pretrain_file")
            return pt_file

    raise FileNotFoundError(f"Too many pretrains to choose from in {pretrain_path}  Could not find default pt {default_pt} for language {language}  Must specify an exact path to a --wordvec_pretrain_file")

def find_charlm_file(direction, language, charlm):
    """
    Return the path to the forward or backward charlm if it exists for the given package

    The locations are tried in this order:
      - saved_models/charlm, relative to the current directory
      - the charlm directory for the language under DEFAULT_MODEL_DIR
      - the same DEFAULT_MODEL_DIR location again, after attempting a download

    direction is used to build the file and processor names
    ('forward' or 'backward' in build_charlm_args).

    Raises FileNotFoundError if the charlm still cannot be found.  If
    the download attempt itself failed with a ValueError, that error is
    attached as the cause so the reason is not lost.
    """
    saved_path = 'saved_models/charlm/{}_{}_{}_charlm.pt'.format(language, charlm, direction)
    if os.path.exists(saved_path):
        logger.info(f'Using model {saved_path} for {direction} charlm')
        return saved_path

    resource_path = '{}/{}/{}_charlm/{}.pt'.format(DEFAULT_MODEL_DIR, language, direction, charlm)
    if os.path.exists(resource_path):
        logger.info(f'Using model {resource_path} for {direction} charlm')
        return resource_path

    download_error = None
    try:
        download(lang=language, package=None, processors={f"{direction}_charlm": charlm})
    except ValueError as e:
        # Not fatal by itself: the FileNotFoundError below is what callers
        # see, but keep the reason so it shows up in the traceback
        download_error = e
    else:
        if os.path.exists(resource_path):
            logger.info(f'Downloaded model, using model {resource_path} for {direction} charlm')
            return resource_path

    raise FileNotFoundError(f"Cannot find {direction} charlm in either {saved_path} or {resource_path}  Attempted downloading {charlm} but that did not work") from download_error

def build_charlm_args(language, charlm, base_args=True):
    """
    Build the command line arguments which select a charlm

    Returns an empty list when charlm is empty or None.  Otherwise the
    forward and backward charlm files are located and returned as
    --charlm_forward_file / --charlm_backward_file arguments.  With
    base_args set, --charlm and --charlm_shorthand are put in front.
    """
    if not charlm:
        return []

    forward = find_charlm_file('forward', language, charlm)
    backward = find_charlm_file('backward', language, charlm)
    file_args = ['--charlm_forward_file', forward,
                 '--charlm_backward_file', backward]
    if base_args:
        shorthand = f'{language}_{charlm}'
        return ['--charlm', '--charlm_shorthand', shorthand] + file_args
    return file_args

def choose_charlm(language, dataset, charlm, language_charlms, dataset_charlms):
    """
    Decide which charlm to use for a language / dataset pair

    charlm=None means no charlm, and any value other than "default" is
    returned as is.  For "default", the dataset specific entry in
    dataset_charlms wins over the language entry in language_charlms;
    None is returned if neither has a usable entry.
    """
    language_choice = language_charlms.get(language)
    dataset_choice = dataset_charlms.get(language, {}).get(dataset)

    if charlm is None:
        return None
    if charlm != "default":
        return charlm
    return dataset_choice or language_choice or None



In [None]:
%%writefile /content/stanza-train-br/stanza/stanza/utils/training/run_pos.py


import logging
import os

from stanza.models import tagger

from stanza.utils.training import common
from stanza.utils.training.common import Mode

logger = logging.getLogger('stanza')

# TODO: move this somewhere common
def wordvec_args(short_language):
    """
    Return the extra tagger arguments related to word vectors for a language

    Languages with no known word vectors get ["--no_pretrain"], along
    with a logged warning.  Every other language gets an empty list.
    """
    # we couldn't find word vectors for these languages:
    # coptic, old russian, naija, turkish german, swedish sign language
    languages_without_vectors = ("cop", "orv", "pcm", "qtd", "swl")
    if short_language not in languages_without_vectors:
        return []

    logger.warning("No known word vectors for language {}  If those vectors can be found, please update the training scripts.".format(short_language))
    return ["--no_pretrain"]

def pos_batch_size(short_name):
    """
    Return the tagger batch size for a dataset, as a command line string

    short_name is the dataset short name, such as 'de_hdt'.  The
    default is "5000"; the datasets listed below get a smaller value.
    """
    reduced_batch_sizes = {
        'de_hdt': "2000",   # 'UD_German-HDT'
        'hr_set': "3000",   # 'UD_Croatian-SET'
    }
    return reduced_batch_sizes.get(short_name, "5000")

def _predict_and_score(split, paths, treebank, short_name, short_language,
                       eval_file, gold_file, pred_file, extra_args):
    """
    Run the tagger in predict mode on eval_file and log its POS scores against gold_file

    split ("train", "dev" or "test") is only used to label the log messages.
    """
    predict_args = ["--wordvec_dir", paths["WORDVEC_DIR"],
                    "--eval_file", eval_file,
                    "--output_file", pred_file,
                    "--gold_file", gold_file,
                    "--lang", short_language,
                    "--shorthand", short_name,
                    "--mode", "predict"]
    predict_args = predict_args + wordvec_args(short_language)
    predict_args = predict_args + extra_args
    logger.info("Running {} POS for {} with args {}".format(split, treebank, predict_args))
    tagger.main(predict_args)

    results = common.run_eval_script_pos(gold_file, pred_file)
    logger.info("Finished running {} set on\n{}\n{}".format(split, treebank, results))

def run_treebank(mode, paths, treebank, short_name,
                 temp_output_file, command_args, extra_args):
    """
    Train or score the POS tagger on one treebank

    mode is a Mode:
      - TRAIN trains the tagger and then scores the dev set
      - SCORE_TRAIN, SCORE_DEV, SCORE_TEST score the respective split

    All data files are looked up in paths["POS_DATA_DIR"] using
    short_name.  Predictions are written to temp_output_file if it is
    given, otherwise to <short_name>.<split>.pred.conllu in the same
    directory.  extra_args are appended to every tagger command line.

    command_args is accepted to match the run_treebank signature that
    common.main calls; it is not used here.
    """
    short_language = short_name.split("_")[0]

    pos_dir        = paths["POS_DATA_DIR"]
    train_file     = f"{pos_dir}/{short_name}.train.in.conllu"
    train_pred_file= temp_output_file if temp_output_file else f"{pos_dir}/{short_name}.train.pred.conllu"
    dev_in_file    = f"{pos_dir}/{short_name}.dev.in.conllu"
    dev_gold_file  = f"{pos_dir}/{short_name}.dev.gold.conllu"
    dev_pred_file  = temp_output_file if temp_output_file else f"{pos_dir}/{short_name}.dev.pred.conllu"
    test_in_file   = f"{pos_dir}/{short_name}.test.in.conllu"
    test_gold_file = f"{pos_dir}/{short_name}.test.gold.conllu"
    test_pred_file = temp_output_file if temp_output_file else f"{pos_dir}/{short_name}.test.pred.conllu"

    if mode == Mode.TRAIN:
        if not os.path.exists(train_file):
            logger.error("TRAIN FILE NOT FOUND: %s ... skipping" % train_file)
            return

        # some languages need reduced batch size
        batch_size = pos_batch_size(short_name)

        train_args = ["--wordvec_dir", paths["WORDVEC_DIR"],
                      "--train_file", train_file,
                      "--eval_file", dev_in_file,
                      "--output_file", dev_pred_file,
                      "--gold_file", dev_gold_file,
                      "--batch_size", batch_size,
                      "--lang", short_language,
                      "--shorthand", short_name,
                      "--mode", "train"]
        train_args = train_args + wordvec_args(short_language)
        train_args = train_args + extra_args
        logger.info("Running train POS for {} with args {}".format(treebank, train_args))
        tagger.main(train_args)

    if mode == Mode.SCORE_TRAIN:
        # the train split is scored against its own input file,
        # which serves as both the eval file and the gold file
        _predict_and_score("train", paths, treebank, short_name, short_language,
                           train_file, train_file, train_pred_file, extra_args)

    # a freshly trained model is always scored on dev as well
    if mode == Mode.SCORE_DEV or mode == Mode.TRAIN:
        _predict_and_score("dev", paths, treebank, short_name, short_language,
                           dev_in_file, dev_gold_file, dev_pred_file, extra_args)

    if mode == Mode.SCORE_TEST:
        _predict_and_score("test", paths, treebank, short_name, short_language,
                           test_in_file, test_gold_file, test_pred_file, extra_args)


def main():
    """
    Entry point: hand run_treebank to the shared training driver for the POS tagger
    """
    common.main(run_treebank, model_dir="pos", model_name="tagger")

if __name__ == "__main__":
    main()



In [None]:
# Root of the processed datasets; like the other ../data paths below, it is
# relative to the stanza checkout that %cd enters
DATA_ROOT="../data/processed"
%cd /content/stanza-train-br/stanza
# Locations of the raw UD and NER data
%env UDBASE=../data/udbase
%env NERBASE=../data/nerbase
# One processed-data directory per task, all under DATA_ROOT
# ($DATA_ROOT is filled in by IPython from the Python variable above)
%env TOKENIZE_DATA_DIR=$DATA_ROOT/tokenize
%env MWT_DATA_DIR=$DATA_ROOT/mwt
%env LEMMA_DATA_DIR=$DATA_ROOT/lemma
%env POS_DATA_DIR=$DATA_ROOT/pos
%env DEPPARSE_DATA_DIR=$DATA_ROOT/depparse
%env ETE_DATA_DIR=$DATA_ROOT/ete
%env NER_DATA_DIR=$DATA_ROOT/ner
%env CHARLM_DATA_DIR=$DATA_ROOT/charlm
# Location of the word vectors
%env WORDVEC_DIR=../data/wordvec

In [None]:
# Train and score one tagger per seed, for seeds 42 through 51
for seed in range(42, 52):
    # --force retrains even though a model file from the previous seed exists
    ! python3 -m stanza.utils.training.run_pos UD_Portuguese-Porttinari --max_steps 1000 --force --seed {seed}
    # Score the test and train splits; --save_output writes the predictions to the POS data dir instead of a temp file
    ! python3 -m stanza.utils.training.run_pos UD_Portuguese-Porttinari --score_test --save_output
    ! python3 -m stanza.utils.training.run_pos UD_Portuguese-Porttinari --score_train --save_output
    # Move both prediction files to Drive under a per-seed name
    # NOTE(review): assumes Google Drive is already mounted at /content/drive - no mount cell is visible here
    ! mv /content/stanza-train-br/data/processed/pos/pt_porttinari.test.pred.conllu /content/drive/MyDrive/Studies/Mestrado/Experimentos\ \Tagging\ \Porttinari-base/stanza/pt_porttinari_test_pred_{seed}.conllu
    ! mv /content/stanza-train-br/data/processed/pos/pt_porttinari.train.pred.conllu /content/drive/MyDrive/Studies/Mestrado/Experimentos\ \Tagging\ \Porttinari-base/stanza/pt_porttinari_train_pred_{seed}.conllu

In [None]:
# Run the conll18 UD eval script in verbose mode on a gold / predicted test pair
# NOTE(review): this points at pt_bosque, while every other cell prepares, trains and scores
# pt_porttinari (and moves its predictions to Drive) - confirm these two files exist
! python3 -m stanza.utils.conll18_ud_eval \
    $DATA_ROOT/pos/pt_bosque.test.gold.conllu \
    $DATA_ROOT/pos/pt_bosque.test.pred.conllu \
    --verbose