### Imports

In [None]:
from IPython.core.display import display, HTML
display(HTML('<style>.container { width:80% !important; }</style>'))
%config InlineBackend.figure_format = 'retina'

In [None]:
from pathlib import Path
from subprocess import PIPE, Popen
from typing import List

import flair
import torch
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import (FlairEmbeddings, StackedEmbeddings,
                              TokenEmbeddings, WordEmbeddings)
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.visual.training_curves import Plotter

from helpers import get_gittag
from test_funcs import test_paths

In [None]:
# Select the device flair runs on; change DEVICE_ID to pick another GPU.
DEVICE_ID = 1
# Only use the requested GPU if it exists on this machine. Otherwise fall back
# to the first GPU, or to the CPU when CUDA is unavailable, so the notebook
# still runs from a fresh kernel on hosts with fewer (or no) GPUs.
if not torch.cuda.is_available():
    flair.device = torch.device('cpu')
elif DEVICE_ID < torch.cuda.device_count():
    flair.device = torch.device(f'cuda:{DEVICE_ID}')
else:
    flair.device = torch.device('cuda:0')
# Print the device that was actually selected so a fallback is visible.
print(flair.device)

In [None]:
# Git tag as returned by the project's `get_gittag` helper; it is embedded in
# the training folder name so a run can be traced back to a code version.
GIT_TAG = get_gittag()
print(f'Current git tag is: {GIT_TAG}')

### Prepare Data

In [None]:
# Project root: the parent of the current working directory.
ROOT_PATH = Path.cwd().parent
# Data folder: `conll-data` directly below the project root.
DATA_PATH = ROOT_PATH.joinpath('conll-data')
# File names of the training and test splits inside the data folder.
TRAIN_FILE, TEST_FILE = 'train.conll', 'test.conll'

# Echo the resolved locations for a quick visual check.
print(f'Project path: {ROOT_PATH}')
print(f'Data path: {DATA_PATH}')

In [None]:
# --- Run configuration -------------------------------------------------------
# Label that becomes part of the training folder name; change it accordingly.
FINE_TUNED = 'ft_false'
# Fraction of the data to train on.
FRACTION = 1.0
# Mini-batch size.
BATCH_SZ = 8
# Learning rate (default for the flair tagger model).
LR = 0.1
# Dropout (default).
DROPOUT = 0.0
# Hidden size of the rnn model (use rnn is true by default).
HIDDEN_SZ = 256
# Number of rnn layers (default).
RNN_LAYERS = 1

In [None]:
TRAIN_FOLDER_NAME = f'resume-ner-{GIT_TAG}-frac-{FRACTION}-{FINE_TUNED}'

In [None]:
# Hand the project/data directories and BOTH corpus files to the project's
# `test_paths` helper (presumably an existence check -- confirm in test_funcs).
# The test file is included because the corpus is built from it as well, so a
# missing test split is reported here rather than at corpus construction.
test_paths(dirs=[ROOT_PATH, DATA_PATH],
           files=[DATA_PATH/TRAIN_FILE, DATA_PATH/TEST_FILE])

### Create a Corpus 

In [None]:
# what tag do we want to work with?
# This name is passed as the tag type to the corpus statistics, the tag
# dictionary and the sequence tagger further down.
TAG_TYPE = 'ner'

In [None]:
# define columns: column 0 holds the token text, column 1 holds the tag.
# The tag column is named via TAG_TYPE instead of a repeated 'ner' literal so
# it cannot drift from the tag type used for the statistics, the tag
# dictionary and the tagger.
columns = {0: 'text', 1: TAG_TYPE}

In [None]:
# Build the corpus from the column-formatted files in DATA_PATH. Only the
# train and test file names are passed explicitly (no dev file is named).
corpus: Corpus = ColumnCorpus(
    DATA_PATH,
    columns,
    train_file=TRAIN_FILE,
    test_file=TEST_FILE,
)
# Downsample to FRACTION, restricted to the training split.
corpus = corpus.downsample(percentage=FRACTION, only_downsample_train=True)

In [None]:
# Print the corpus' string representation as a quick overview after loading.
print(corpus)

In [None]:
# Number of items in the training split (after the downsampling step above).
len(corpus.train)

In [None]:
# Show the first training item as a tagged string for TAG_TYPE -- a sanity
# check that the column mapping was read as intended.
print(corpus.train[0].to_tagged_string(TAG_TYPE))

In [None]:
# Collect and print corpus statistics for the chosen tag type.
stats = corpus.obtain_statistics(tag_type=TAG_TYPE)
print(stats)

### Prepare Model

In [None]:
# make the tag dictionary from the corpus for the chosen tag type
tag_dictionary = corpus.make_tag_dictionary(tag_type=TAG_TYPE)
# print the dictionary's idx2item attribute to inspect the collected tags
print(tag_dictionary.idx2item)

In [None]:
# Embeddings to be stacked, in this order:
#   1. 'crawl' word embeddings
#   2. flair 'news-forward' and 'news-backward' embeddings
# To additionally use character embeddings, add `CharacterEmbeddings()` to the
# list (note: it is not among the names imported at the top of this notebook).
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('crawl'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]

In [None]:
# Combine the individual embeddings into one StackedEmbeddings object; this is
# what gets passed to the sequence tagger below.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

In [None]:
# Build the sequence tagger. Hidden size, number of RNN layers and dropout come
# from the run-configuration constants; `use_crf` is switched on.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=TAG_TYPE,
    hidden_size=HIDDEN_SZ,
    rnn_layers=RNN_LAYERS,
    dropout=DROPOUT,
    use_crf=True,
)

In [None]:
# initialize trainer: pairs the tagger with the prepared corpus
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

In [None]:
# optional: find a learning rate (uncomment the lines below to run)
# learning_rate_tsv = trainer.find_learning_rate(base_path='resources/taggers/debug-ner',
#                                                file_name='learning_rate.tsv',
#                                                mini_batch_size=8,
#                                                start_learning_rate=1e-1)

In [None]:
# optional: plot the learning rate finder curve (needs `learning_rate_tsv` from the cell above)
# plotter = Plotter()
# plotter.plot_learning_rate(learning_rate_tsv)

In [None]:
# Run training. Output goes to the run-specific folder below
# `resources/taggers/`; learning rate and batch size come from the
# run-configuration constants.
trainer.train(
    base_path=f'resources/taggers/{TRAIN_FOLDER_NAME}',
    # optimisation settings
    learning_rate=LR,
    mini_batch_size=BATCH_SZ,
    max_epochs=120,
    # annealing-related settings
    anneal_factor=0.5,
    patience=3,
    # data usage / mode flags
    train_with_dev=False,
    param_selection_mode=False,
    # monitoring on the train and test splits
    monitor_train=True,
    monitor_test=True,
    # where embeddings are stored during training
    embeddings_storage_mode='cpu',
)

In [None]:
# plot training curves (optional)
# Both files are read from the same folder that was passed as `base_path` to
# `trainer.train`, so this cell only works after training has run.
# NOTE(review): assumes training wrote `loss.tsv` and `weights.txt` there --
# confirm for the installed flair version.
plotter = Plotter()
plotter.plot_training_curves(f'resources/taggers/{TRAIN_FOLDER_NAME}/loss.tsv')
plotter.plot_weights(f'resources/taggers/{TRAIN_FOLDER_NAME}/weights.txt')