# CNCSR

## Setting up CNCSR

In [None]:
# Clone the tagger repository, install its Python requirements and unpack the
# bundled ud_1_2 archive in place (inside the clone's data/ directory).
# All paths are relative to the notebook's current working directory.
! git clone https://github.com/huberemanuel/subword-sequence-tagging.git
! pip install -r subword-sequence-tagging/requirements.txt
! cd subword-sequence-tagging/data && tar -xvf ud_1_2.tar.gz

In [None]:
%%bash
# Download the pretrained embeddings archive and replace the Portuguese
# ud_1_2 splits with the Porttinari-base treebank.

# -p keeps the cell re-runnable when the directories already exist.
mkdir -p data embeddings
# -O pins the local file name: with -P alone wget names the download after the
# last URL component ("download.php?file=..."), so the unzip below would not
# find data/skip_s300.zip. The URL is quoted because "?" is a shell glob
# character.
wget "http://143.107.183.175:22980/download.php?file=embeddings/fasttext/skip_s300.zip" -O data/skip_s300.zip
# Alternative source: copy the archive from a mounted Google Drive instead.
# cp /content/drive/MyDrive/Colab\ Notebooks/resources/skip_s300.zip data/
unzip data/skip_s300.zip -d embeddings

# Porttinari-base: overwrite the pt train/dev/test files of ud_1_2 so the
# stock "ud_1_2" dataset name loads Porttinari data.
git clone https://github.com/huberemanuel/UD_Portuguese-Porttinari.git
cp UD_Portuguese-Porttinari/pt_porttinari-ud-train.conllu /content/subword-sequence-tagging/data/ud_1_2/train/pt.conllu
cp UD_Portuguese-Porttinari/pt_porttinari-ud-dev.conllu /content/subword-sequence-tagging/data/ud_1_2/dev/pt.conllu
cp UD_Portuguese-Porttinari/pt_porttinari-ud-test.conllu /content/subword-sequence-tagging/data/ud_1_2/test/pt.conllu

In [None]:
# Mirror the Porttinari splits into a separate my_ud/{train,dev,test} layout.
! mkdir -p /content/subword-sequence-tagging/data/my_ud/train /content/subword-sequence-tagging/data/my_ud/dev /content/subword-sequence-tagging/data/my_ud/test
! cp /content/UD_Portuguese-Porttinari/pt_porttinari-ud-train.conllu /content/subword-sequence-tagging/data/my_ud/train/pt.conll
# NOTE(review): the dev copy is named pt-ud.conll while train/test use
# pt.conll -- confirm which name the my_ud loader expects.
! cp /content/UD_Portuguese-Porttinari/pt_porttinari-ud-dev.conllu /content/subword-sequence-tagging/data/my_ud/dev/pt-ud.conll
# The destination previously ended in a stray "!", which created a file
# literally named "pt.conll!".
! cp /content/UD_Portuguese-Porttinari/pt_porttinari-ud-test.conllu /content/subword-sequence-tagging/data/my_ud/test/pt.conll

In [None]:
# Show which sentencepiece version (if any) is currently installed.
! pip freeze | grep sentencepiece

In [None]:
# Pin sentencepiece to an exact version.
# NOTE(review): presumably required for compatibility with the tagger's
# subword tooling -- confirm that this version is still the intended one.
! pip install sentencepiece==0.1.1

In [None]:
%%writefile subword-sequence-tagging/trainer.py
import random

import numpy as np
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

from bert_wrapper import Transformer as Bert
from data import datasets
from model import SequenceTagger
from util import (
    ConllScore,
    EarlyStopping,
    LossTrackers,
    Score,
    dump_args,
    emb_layer,
    get_logger,
    get_optim,
    json_load,
    load_word2vec_file,
    mkdir,
    next_rundir,
    save_model,
    set_random_seed,
)


def load_dataset(conf, lang, bert=None):
    """Load the dataset selected by ``conf.dataset`` for ``lang``.

    When ``conf.best_vocab_size`` is truthy, ``conf.vocab_size`` is first
    overwritten with the entry for ``conf.lang`` read from
    ``conf.best_vocab_size_file``.

    Args:
        conf: Run configuration; mutated as described above.
        lang: Language passed through to the dataset's ``load``.
        bert: Optional object forwarded to ``load`` as ``bert``.

    Returns:
        The loaded dataset, after its ``describe()`` method has been called.
    """
    if conf.best_vocab_size:
        best_sizes = json_load(conf.best_vocab_size_file)
        conf.vocab_size = best_sizes[conf.lang]
    dataset_cls = datasets[conf.dataset]
    loaded = dataset_cls.load(conf, lang, bert=bert)
    loaded.describe()
    return loaded


class Trainer:
    """Bundle the data, model, optimizer, scorers and early stopping of a run.

    Everything is configured from the ``conf`` object handed to the
    constructor. The actual per-epoch training and evaluation steps are
    supplied by the caller as callables (see :meth:`train`).
    """

    def __init__(self, conf):
        """Create the run directory and build data, model and scorers.

        Args:
            conf: Run configuration. ``conf.rundir`` is assigned here when it
                is unset, and the configuration is dumped to
                ``conf.rundir / "conf.json"``.

        Raises:
            ValueError: If ``conf.lr_scheduler`` is set to anything other
                than ``"plateau"``.
        """
        self.conf = conf
        self.device = torch.device(f"cuda:{conf.gpu_id}")
        self.log = get_logger()
        torch.set_printoptions(precision=8)
        if conf.runid:
            conf.rundir = mkdir(conf.outdir / conf.runid)
        if not conf.rundir:
            conf.rundir = next_rundir(conf.outdir, log=self.log)
        self.rundir = conf.rundir
        dump_args(conf, conf.rundir / "conf.json")
        set_random_seed(conf.random_seed)
        if self.conf.use_bert:
            assert self.conf.lang in Bert.supported_langs, self.conf.lang
            self.bert = Bert(self.conf.bert_model_name, device=self.device)
        else:
            self.bert = None
        self.data = load_dataset(conf, conf.lang, bert=self.bert)
        self.log.info(
            f"{len(self.data.train_loader)} batches | bs {conf.batch_size}"
        )
        self.model = self.get_model()
        self.optimizer = get_optim(conf, self.model)
        optimum = "min"
        # ``lr_scheduler`` is only set for "plateau"; callers probe for it
        # with hasattr().
        if conf.lr_scheduler == "plateau":
            self.lr_scheduler = ReduceLROnPlateau(
                self.optimizer, factor=0.1, patience=2, mode=optimum, verbose=True
            )
        elif conf.lr_scheduler:
            raise ValueError("Unknown lr_scheduler: " + conf.lr_scheduler)
        self.losses = LossTrackers.from_names("loss", log=self.log)
        # NER-style tasks are scored with ConllScore ("f1"); everything else
        # with the default Score ("acc").
        if self.main_lang_data.tag == "ner" or self.conf.dataset.startswith("sr3de"):
            if self.data.is_multilingual:
                self.sentence_texts = {
                    split_name: self.main_lang_data.token_texts(split_name)
                    for split_name in ["dev", "test"]
                }
                self.conll_score = {
                    lang: ConllScore(tag_enc=self.main_lang_data.tag_enc)
                    for lang in self.data.dev
                }
                self.score = {
                    lang: Score(
                        "f1",
                        save_model=False,
                        log=self.log,
                        score_func=self.conll_score[lang],
                        add_mode="append",
                    )
                    for lang in self.data.dev
                }
                self.avg_score = Score(
                    "avg_f1", log=self.log, score_func="dummy", add_mode="append"
                )
            else:
                self.sentence_texts = {
                    split_name: self.main_lang_data.token_texts(split_name)[
                        : conf.max_eval_inst
                    ]
                    for split_name in ["dev", "test"]
                }
                self.conll_score = ConllScore(tag_enc=self.main_lang_data.tag_enc)
                self.score = Score(
                    "f1", log=self.log, score_func=self.conll_score, add_mode="append"
                )
        else:
            if self.data.is_multilingual:
                self.score = {
                    lang: Score("acc", log=self.log) for lang in self.data.dev
                }
                self.avg_score = Score(
                    "avg_acc", log=self.log, score_func="dummy", add_mode="append"
                )
            else:
                self.score = Score("acc", log=self.log)
        if conf.early_stop > 0:
            # In multilingual mode ``self.score`` is a dict, so it has no
            # single ``optimum``; "max" is used instead.
            score_optimum = (
                "max"
                if (
                    self.conf.dataset.startswith("wikiannmulti")
                    or self.data.is_multilingual
                )
                else self.score.optimum
            )
            self.early_stop = EarlyStopping(
                score_optimum,
                min_delta=conf.early_stop_min_delta,
                patience=conf.early_stop,
            )
        else:
            self.early_stop = None
        self.epoch = 0

    def get_model(self):
        """Build the ``SequenceTagger`` and optionally load saved weights.

        Returns:
            The model moved to ``self.device``. When ``conf.model_file`` is
            set, its state dict is loaded into the model.
        """
        ntags = self.data.tag_enc.nlabels
        nshapes = self.data.shape_enc.nlabels
        nchars = self.data.char_enc.nlabels
        bpe_emb = emb_layer(
            self.data.bpemb.vectors,
            trainable=not self.conf.emb_fixed,
            use_weights=not self.conf.emb_random_init,
        )
        if self.conf.use_fasttext:
            fasttext_file = self.conf.fasttext_emb_file.format(
                dataset=self.conf.dataset, lang=self.data.lang
            )
            fasttext_emb = emb_layer(
                load_word2vec_file(fasttext_file, add_unk=True),
                trainable=not self.conf.emb_fixed,
                use_weights=not self.conf.emb_random_init,
            )
        else:
            fasttext_emb = None
        model = SequenceTagger(
            bpe_emb,
            ntags,
            self.conf,
            nchars=nchars,
            nshapes=nshapes,
            fasttext_emb=fasttext_emb,
            bert=self.bert,
            tag_enc=self.main_lang_data.tag_enc,
        ).to(self.device)
        self.log.info(f"model repr dim: {model.repr_dim}")
        if self.conf.model_file:
            self.log.info(f"loading model {self.conf.model_file}")
            # map_location lets a checkpoint saved on another device be
            # restored onto the device this run actually uses.
            state_dict = torch.load(self.conf.model_file, map_location=self.device)
            model.load_state_dict(state_dict)
            self.log.info(f"loaded model {self.conf.model_file}")
        return model

    def train(self, train_epoch, do_eval, do_test=None, eval_ds_name=None):
        """Run the epoch loop with periodic evaluation and early stopping.

        Args:
            train_epoch: Callable invoked as ``train_epoch(epoch=epoch)``.
            do_eval: Evaluation callable passed to :meth:`do_eval`.
            do_test: Optional second evaluation callable, run under the
                dataset name ``"test"`` after every evaluation.
            eval_ds_name: Dataset name used when logging ``do_eval``.

        A ``KeyboardInterrupt`` ends training gracefully instead of
        propagating.
        """
        try:
            for epoch in range(1, self.conf.max_epochs + 1):
                self.epoch = epoch
                self.model.train()
                train_epoch(epoch=epoch)
                self.losses.interval_end_log(epoch, ds_name="train")
                burnin_done = epoch >= self.conf.first_eval_epoch
                if burnin_done and not epoch % self.conf.eval_every:
                    score, _, _ = self.do_eval(
                        do_eval, epoch=epoch, eval_ds_name=eval_ds_name
                    )
                    if do_test:
                        self.do_eval(do_test, epoch=epoch, eval_ds_name="test")
                    if score is not None and self.early_stop:
                        if self.early_stop.step(score):
                            # Early stopping is ignored until min_epochs.
                            if epoch >= self.conf.min_epochs:
                                patience = self.early_stop.patience
                                self.log.info(f"Early stop after {patience} steps")
                                break
        except KeyboardInterrupt:
            self.log.info("Stopping training due to keyboard interrupt")

    def do_eval(self, eval_func, epoch=None, eval_ds_name=None):
        """Evaluate with ``eval_func`` and log the resulting score.

        Args:
            eval_func: Callable invoked as ``eval_func(epoch=epoch)``. It may
                return either ``(score, preds, trues)`` or a bare score.
            epoch: Epoch number forwarded to ``eval_func`` and the logger.
            eval_ds_name: Dataset name forwarded to :meth:`log_eval`.

        Returns:
            Always a ``(score, preds, trues)`` tuple, so callers can unpack
            it in both monolingual and multilingual mode. ``score`` is the
            current value of ``self.avg_score`` (multilingual) or
            ``self.score``; ``preds`` and ``trues`` are whatever
            ``eval_func`` returned, or ``None`` when it returned a bare
            score.
        """
        self.model.eval()
        result = eval_func(epoch=epoch)
        if isinstance(result, tuple):
            _, preds, trues = result
        else:
            # Evaluation callables that only report a score collect no
            # per-sentence output.
            preds = trues = None
        self.log_eval(ds_name=eval_ds_name, epoch=epoch)
        if self.data.is_multilingual:
            return self.avg_score.current, preds, trues
        return self.score.current, preds, trues

    def log_eval(self, ds_name=None, epoch=None):
        """Close the loss interval and update/log the score(s) for ``ds_name``.

        When a ``conll_score`` attribute exists, its ``sentences`` and
        ``outfile`` (a ``.conll`` file inside the run directory) are set
        before the score update.
        """
        self.losses.interval_end(ds_name=ds_name)
        if self.data.is_multilingual:
            for lang in getattr(self.data, ds_name):
                if hasattr(self, "conll_score"):
                    self.conll_score[lang].sentences = self.sentence_texts[ds_name][
                        lang
                    ]
                    fname = f"{epoch}.{ds_name}.{lang}.conll"
                    self.conll_score[lang].outfile = self.rundir / fname
                self.score[lang].update()
            avg_score = np.average([score.current for score in self.score.values()])
            self.avg_score.update_log(
                model=self.model, rundir=self.rundir, epoch=epoch, score=avg_score
            )
        else:
            if hasattr(self, "conll_score"):
                self.conll_score.sentences = self.sentence_texts[ds_name]
                fname = f"{epoch}.{ds_name}.conll"
                self.conll_score.outfile = self.rundir / fname
            self.score.update_log(self.model, self.rundir, epoch)

    def save_model(self):
        """Save the model to ``model.e<epoch>.pt`` inside the run directory."""
        model_file = self.rundir / f"model.e{self.epoch}.pt"
        save_model(self.model, model_file, self.log)

    @property
    def main_lang_data(self):
        """Return ``self.data``, or its first element when it is a list."""
        return self.data[0] if isinstance(self.data, list) else self.data

    @property
    def batch_iter_train_multilang(self):
        """Yield ``(dataset_index, batch)`` pairs in shuffled dataset order.

        Dataset 0 contributes all of its training batches; every other
        dataset contributes at most ``conf.sim_lang_ratio`` times that many.

        NOTE(review): this indexes ``self.data`` as a sequence of datasets,
        whereas ``__init__`` assigns a single dataset -- confirm how the
        multi-dataset case is meant to be set up.
        """
        main_lang_len = len(self.data[0].train_loader)
        max_sim_lang_len = int(self.conf.sim_lang_ratio * main_lang_len)

        def get_sim_lang_len(i):
            sim_lang_len = len(self.data[i].train_loader)
            return min(sim_lang_len, max_sim_lang_len)

        # One index entry per batch that dataset ``i`` contributes.
        lang_idxs = [
            i
            for i, data in enumerate(self.data)
            for _ in range(main_lang_len if i == 0 else get_sim_lang_len(i))
        ]
        random.shuffle(lang_idxs)
        iters = [data.batch_iter_train for data in self.data]
        return ((i, next(iters[i])) for i in lang_idxs)


In [None]:
%%writefile /content/subword-sequence-tagging/main.py
from copy import deepcopy
from functools import partial
from itertools import islice

import torch

from argparser import get_args
from eval import save_conllu
from trainer import Trainer


def train(conf):
    """Train a tagger, then score the best checkpoint on the test split.

    Builds a ``Trainer`` from ``conf``, runs its training loop with the
    closures defined below, stores the best model path in
    ``conf.model_file`` and finally calls ``test(conf)``.

    Args:
        conf: Run configuration; ``conf.model_file`` is overwritten with the
            best model recorded by the score tracker.
    """
    t = Trainer(conf)
    optim = t.optimizer
    model = t.model

    def train_epoch(*args, **kwargs):
        """Run one optimisation pass over the training batches."""
        for batch in t.data.batch_iter_train:
            optim.zero_grad()
            tag_true = batch["token"][3]
            _, loss = model(batch, tag_true=tag_true)
            loss.backward()
            optim.step()
            t.losses.append(loss)
        # The scheduler only exists when conf.lr_scheduler == "plateau".
        if hasattr(t, "lr_scheduler"):
            t.lr_scheduler.step(t.losses[0].current)

    def train_epoch_multilang(*args, **kwargs):
        """Run one pass over the interleaved multi-dataset batches.

        NOTE(review): this closure is never selected below --
        ``_train_epoch`` is always ``train_epoch``. Confirm whether
        multilingual runs are meant to use it.
        """
        for lang_idx, batch in t.batch_iter_train_multilang:
            optim.zero_grad()
            tag_true = batch["token"][3]
            _, loss = model(batch, tag_true=tag_true, lang_idx=lang_idx)
            loss.backward()
            optim.step()
            t.losses.append(loss)

    def do_eval(ds_iter, *args, **kwargs):
        """Score up to ``conf.max_eval_inst`` batches from ``ds_iter()``.

        Returns ``(score, preds, trues)`` where ``preds`` and ``trues`` hold
        one length-trimmed row of predicted / gold tags per sentence.
        """
        preds = []
        trues = []
        for batch in islice(ds_iter(), conf.max_eval_inst):
            sorted_len, sort_idx, tag_true = batch["token"][1:4]
            tag_pred, loss = model(batch, tag_true=tag_true)
            # Index with the inverse permutation of sort_idx.
            unsort_idx = torch.sort(sort_idx)[1]
            # The loop names follow the zip order: predictions first, gold
            # second. Previously they were bound the other way round, which
            # left ``preds`` holding gold tags and ``trues`` predictions.
            for length, pred, true in zip(
                sorted_len[unsort_idx], tag_pred[unsort_idx], tag_true[unsort_idx]
            ):
                t.score.add(pred[:length], true[:length])
                preds.append(pred[:length])
                trues.append(true[:length])
        return (t.score.current, preds, trues)

    def do_eval_multi(ds_iter, *args, **kwargs):
        """Score every (lang, dataset) pair produced by ``ds_iter()``.

        Returns ``(avg_score, None, None)``: the same three-value shape as
        ``do_eval``, without per-sentence output.
        """
        for lang, ds in ds_iter():
            for batch in ds:
                sorted_len, sort_idx, tag_true = batch["token"][1:4]
                tag_pred, loss = model(batch, tag_true=tag_true)
                unsort_idx = torch.sort(sort_idx)[1]
                for length, pred, true in zip(
                    sorted_len[unsort_idx], tag_pred[unsort_idx], tag_true[unsort_idx]
                ):
                    t.score[lang].add(pred[:length], true[:length])
        return (t.avg_score.current, None, None)

    _train_epoch = train_epoch
    if t.data.is_multilingual:
        _do_eval = partial(do_eval_multi, lambda: t.data.iter_dev)
        do_test = partial(do_eval_multi, lambda: t.data.iter_test)
    else:
        _do_eval = partial(do_eval, lambda: t.main_lang_data.iter_dev)
        do_test = partial(do_eval, lambda: t.main_lang_data.iter_test)
    # Test-set evaluation during training is opt-in.
    _do_test = do_test if conf.test_every_eval else None
    t.train(_train_epoch, _do_eval, do_test=_do_test, eval_ds_name="dev")
    if t.data.is_multilingual:
        score = t.avg_score
    else:
        score = t.score
    # test() builds a fresh Trainer that loads this checkpoint.
    conf.model_file = score.best_model
    test_score = test(conf)
    if t.data.is_multilingual:
        test_score, lang_scores = test_score
        for lang, lang_score in lang_scores.items():
            t.log.info(f"{lang} score: {lang_score.current:.4}")
    t.log.info(f"final score: {test_score:.4}")


def test(conf, model=None):
    """Evaluate a model on the test split and export gold/predicted CoNLL-U.

    Args:
        conf: Run configuration used to build a fresh ``Trainer``.
        model: Model to evaluate; defaults to the trainer's own model.

    Returns:
        In multilingual mode ``(avg_score, lang_scores)``, where
        ``lang_scores`` is the trainer's per-language score dict; otherwise
        the test score. In the monolingual case the gold data and a copy
        carrying the predicted ``upos`` tags are also passed to
        ``save_conllu`` with ``rundir="."``.
    """
    t = Trainer(conf)
    if model is None:
        model = t.model

    if t.data.is_multilingual:

        def do_test(*args, **kwargs):
            """Score every test language; no per-sentence output is kept."""
            for lang, ds in t.data.iter_test:
                for batch in ds:
                    sorted_len, sort_idx, tag_true = batch["token"][1:4]
                    tag_pred, loss = model(batch, tag_true=tag_true)
                    unsort_idx = torch.sort(sort_idx)[1]
                    for length, pred, true in zip(
                        sorted_len[unsort_idx],
                        tag_pred[unsort_idx],
                        tag_true[unsort_idx],
                    ):
                        t.score[lang].add(pred[:length], true[:length])
            return (t.avg_score.current, None, None)

    else:

        def do_test(*args, **kwargs):
            """Score up to ``conf.max_eval_inst`` test batches."""
            preds = []
            trues = []
            for batch in islice(t.main_lang_data.iter_test, conf.max_eval_inst):
                sorted_len, sort_idx, tag_true = batch["token"][1:4]
                tag_pred, loss = model(batch, tag_true=tag_true)
                # Index with the inverse permutation of sort_idx.
                unsort_idx = torch.sort(sort_idx)[1]
                # The loop names follow the zip order: predictions first,
                # gold second. Previously they were swapped, which is why
                # the export below had to write "gold_tag" to get the
                # predicted tags.
                for length, pred, true in zip(
                    sorted_len[unsort_idx], tag_pred[unsort_idx], tag_true[unsort_idx]
                ):
                    t.score.add(pred[:length], true[:length])
                    preds.append(pred[:length])
                    trues.append(true[:length])
            return (t.score.current, preds, trues)

    result = t.do_eval(do_test, eval_ds_name="test")
    # Trainer.do_eval may hand back a bare score in multilingual mode.
    if isinstance(result, tuple):
        score, preds, trues = result
    else:
        score, preds, trues = result, None, None

    if t.data.is_multilingual:
        # No per-sentence predictions are collected here, so nothing is
        # exported.
        avg_score = score
        lang_scores = t.score
        for lang, lang_score in lang_scores.items():
            t.log.info(f"{lang} score: {lang_score.current:.4}")
        t.log.info(f"avg score: {avg_score:.4}")
        return avg_score, lang_scores

    # Work on a copy so t.data.test_raw keeps the gold annotation.
    test_dataset = deepcopy(t.data.test_raw)
    diff = 0
    for i, (sent, sent_pred, sent_true) in enumerate(
        zip(test_dataset, preds, trues)
    ):
        pred_tags = t.data.tag_enc.inverse_transform(sent_pred)
        gold_tags = t.data.tag_enc.inverse_transform(sent_true)
        for j, (_, pred_tag, gold_tag) in enumerate(
            zip(sent, pred_tags, gold_tags)
        ):
            test_dataset[i][j]["upos"] = pred_tag
            if pred_tag != gold_tag:
                diff += 1
                t.log.info("{} - {}".format(pred_tag, gold_tag))
    t.log.info("Total diff tokens: {}".format(diff))
    # Logs True only when no token's tag was changed by the loop above.
    t.log.info("---- {}".format(t.data.test_raw == test_dataset))
    save_conllu(t.data.test_raw, rundir=".", eval_name="gold")
    save_conllu(test_dataset, rundir=".", eval_name="pred")
    return score


if __name__ == "__main__":
    # Parse the CLI arguments, reuse the language for bpemb, then dispatch to
    # the module-level function named after the command ("-" becomes "_").
    conf = get_args()
    conf.bpemb_lang = conf.lang
    command_name = conf.command.replace("-", "_")
    command = globals()[command_name]
    command(conf)


In [None]:
# Remove the output directory of any previous run.
!  rm -rf /content/subword-sequence-tagging/out

In [None]:
import glob

# Execute 10 experiments
# Each iteration trains with a different random seed, re-evaluates the saved
# model with the "test" command and archives the outputs.
for seed in range(42, 42 + 10):
    # Start from an empty output directory so the run lands in out/0.
    ! rm -rf /content/subword-sequence-tagging/out
    ! cd subword-sequence-tagging && python main.py train \
        --dataset ud_1_2 \
        --lang pt \
        --tag upostag \
        --use-char \
        --use-bpe \
        --use-meta-rnn \
        --best-vocab-size \
        --char-emb-dim 50 \
        --char-nhidden 256 \
        --bpe-nhidden 256 \
        --meta-nhidden 256 \
        --dropout 0.2 \
        --random-seed {seed}

    # First file in out/0 whose name starts with "acc"; raises IndexError when
    # there is none.
    # NOTE(review): presumably the best checkpoint written during training --
    # confirm against the Score helper.
    model_path = glob.glob("/content/subword-sequence-tagging/out/0/acc*")[0]

    # NOTE(review): unlike the train call, no --random-seed is passed here.
    ! cd subword-sequence-tagging && python main.py test \
        --dataset ud_1_2 \
        --lang pt \
        --tag upostag \
        --model-file {model_path} \
        --use-char \
        --use-bpe \
        --use-meta-rnn \
        --best-vocab-size \
        --char-emb-dim 50 \
        --char-nhidden 256 \
        --bpe-nhidden 256 \
        --meta-nhidden 256 \
        --dropout 0.2

    # Collect the exported .conllu files and the model, zip them and move the
    # archive to Google Drive. mkdir and zip run in the notebook's working
    # directory; the "../logs_{seed}" targets assume that directory is /content.
    ! mkdir logs_{seed}
    ! cd /content/subword-sequence-tagging && mv *.conllu ../logs_{seed}
    ! cd /content/subword-sequence-tagging && mv {model_path} ../logs_{seed}
    ! zip -r -9 logs_{seed}.zip logs_{seed}
    ! mv logs_{seed}.zip "/content/drive/MyDrive/Studies/Mestrado/Experimentos Tagging Porttinari-base/cncsr"