In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

from tqdm.notebook import tqdm

In [3]:
import os

# Loguru log level for this session; set here, ahead of the cell that imports loguru.
os.environ.update(LOGURU_LEVEL='INFO')

In [4]:
import logging

from loguru import logger

class InterceptHandler(logging.Handler):
    """Forward records from the standard `logging` module to Loguru's `logger`."""

    def emit(self, record):
        """Re-emit a `logging.LogRecord` through Loguru.

        Args:
            record: the `logging.LogRecord` handed over by the `logging` machinery.
        """
        # Get corresponding Loguru level if it exists
        try:
            level = logger.level(record.levelname).name
        except ValueError:
            # Unknown level name: fall back to the numeric level of the record
            level = record.levelno

        # Find caller from where originated the logged message
        frame, depth = logging.currentframe(), 2
        # Guard against running off the end of the stack: `f_back` is None at the
        # outermost frame, and dereferencing it would raise AttributeError.
        while frame is not None and frame.f_code.co_filename == logging.__file__:
            frame = frame.f_back
            depth += 1

        logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())

# Install InterceptHandler on the root logger; NOTSET (== 0) applies no level threshold there.
logging.basicConfig(level=logging.NOTSET, handlers=[InterceptHandler()])

In [5]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# Column layout of the tab-delimited files: column 0 is the text, column 1 the 'ocr_mistake' label
columns = {0: 'text', 1: 'ocr_mistake'}

# Folder that holds the train, dev and test files
data_folder = '.'

# Build the corpus from the three split files found in `data_folder`
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt',
    column_delimiter='\t',
)

2021-03-05 15:36:38,447 Reading data from .
2021-03-05 15:36:38,451 Train: train.txt
2021-03-05 15:36:38,474 Dev: dev.txt
2021-03-05 15:36:38,487 Test: test.txt


In [6]:
# Print the corpus summary (number of train / dev / test sentences)
print(corpus)

Corpus: 118812 train + 13060 dev + 35361 test sentences


In [None]:
# Downsample for testing
# Optional step: rebinds `corpus` to the result of `downsample(0.1)`
# (presumably keeps a 10% sample of the data — confirm against the flair docs).
# Skip this cell for a full training run.
corpus = corpus.downsample(0.1)

In [7]:
# Inspect one training sentence. The index is clamped to the size of the training
# split so the cell does not raise IndexError when the optional downsampling cell
# has been run and fewer than 15310 sentences remain.
corpus.train[min(15309, len(corpus.train) - 1)]

Sentence: "Quant aux ossements rapportés du cimetière des Pestiférés, ils surent sur-le-champ déposés chez M. Gérard Pâté; et le procès-ver bal de l’enlèvemcnt, rédigé sur les lieux et signé de tous les témoins, lut remis entre les mains des vicaires généraux, MM."   [− Tokens: 41  − Token-Labels: "Quant <0> aux <0> ossements <0> rapportés <0> du <0> cimetière <0> des <0> Pestiférés, <0> ils <0> surent <1> sur-le-champ <0> déposés <0> chez <0> M. <0> Gérard <0> Pâté; <0> et <0> le <0> procès-ver <1> bal <0> de <0> l’enlèvemcnt, <1> rédigé <0> sur <0> les <0> lieux <0> et <0> signé <0> de <0> tous <0> les <0> témoins, <0> lut <1> remis <0> entre <0> les <0> mains <0> des <0> vicaires <0> généraux, <0> MM. <0>"]

In [8]:
# 2. what tag do we want to predict?
# Must match the label column name used when the corpus was loaded.
tag_type = 'ocr_mistake'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
# Show the tags that were collected
print(tag_dictionary)

Dictionary with 6 tags: <unk>, O, 1, 0, <START>, <STOP>


In [11]:
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings


# 4. initialize embeddings
# Forward and backward Flair embeddings, built in that order.
# Other embedding types (e.g. WordEmbeddings('glove') or CharacterEmbeddings())
# can be appended to this list.
embedding_types = [
    FlairEmbeddings(model_name)
    for model_name in ('multi-forward', 'multi-backward')
]

# Combine the individual embeddings into one stacked embedding
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

2021-03-05 15:50:40,935 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/lm-jw300-forward-v0.1.pt not found in cache, downloading to /var/folders/wf/s_2kht555m52w66184wpyzl80000gn/T/tmpucbimb0s
100%|██████████| 172513724/172513724 [00:14<00:00, 11526998.21B/s]2021-03-05 15:50:56,258 copying /var/folders/wf/s_2kht555m52w66184wpyzl80000gn/T/tmpucbimb0s to cache at /Users/janneke/.flair/embeddings/lm-jw300-forward-v0.1.pt

2021-03-05 15:50:57,353 removing temp file /var/folders/wf/s_2kht555m52w66184wpyzl80000gn/T/tmpucbimb0s
2021-03-05 15:50:58,852 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/lm-jw300-backward-v0.1.pt not found in cache, downloading to /var/folders/wf/s_2kht555m52w66184wpyzl80000gn/T/tmp_1_0042y
100%|██████████| 172513724/172513724 [00:15<00:00, 10889906.83B/s]2021-03-05 15:51:14,906 copying /var/folders/wf/s_2kht555m52w66184wpyzl80000gn/T/tmp_1_0042y to cache at /Users/janneke/.flair/embeddings/lm-jw300-backward-v0.1.pt

2021-03-05 15:

In [12]:
# 5. initialize sequence tagger
from flair.models import SequenceTagger

# Tagger over the stacked embeddings, predicting `tag_type` labels drawn from
# `tag_dictionary`, with the CRF enabled (`use_crf=True`).
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    hidden_size=256,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

In [13]:
# 6. initialize trainer
from flair.trainers import ModelTrainer

# The trainer is given the tagger together with the corpus loaded earlier
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

In [None]:
# 7. start training
# NOTE(review): the output folder is named 'example-pos' although `tag_type` is
# 'ocr_mistake' — confirm this path is intended before relying on it.
trainer.train(
    'resources/taggers/example-pos',
    max_epochs=150,
    mini_batch_size=32,
    learning_rate=0.1,
)