In [1]:
from flair.data import Sentence, Token

domains = []
with open('conll-unique.txt') as fhandle:
    domain = None
    
    for line in fhandle:
        line = line.rstrip()
        
        if not line:
            domains.append(domain)
            continue
        
        if line[0] == '#':            
            domain = Sentence()
            continue
            
        domain.add_token(line.split('\t')[0])

In [2]:
len(domains)

91733

In [13]:
import random

random.sample(domains, 1)

[Sentence: "www . montana sass boutique . com" - 7 Tokens]

### Use pre-trained models

In [25]:
from flair.models import SequenceTagger

tagger = SequenceTagger.load('ner')
tagger.predict(domains)

2019-12-11 16:25:10,858 loading file /Users/hydo/.flair/models/en-ner-conll03-v0.4.pt


[Sentence: "www . vila melodi ja . com" - 7 Tokens,
 Sentence: "fbom9 . l . fna . fb cdn . net" - 10 Tokens,
 Sentence: "mail . dsd services . net" - 6 Tokens,
 Sentence: "biohazard . free box os . fr" - 7 Tokens,
 Sentence: "www . azia tisch . massage . city . nl" - 10 Tokens,
 Sentence: "gassan kanko . jp" - 4 Tokens,
 Sentence: "7l8 tech support . com" - 5 Tokens,
 Sentence: "gmc bois . fr" - 4 Tokens,
 Sentence: "www . evergreen and ruby . com" - 7 Tokens,
 Sentence: "3caefeoe35569883399aeele . keene tic . io" - 6 Tokens,
 Sentence: "i kacy biz al . ga" - 6 Tokens,
 Sentence: "mail . ever fresh skill development . in" - 8 Tokens,
 Sentence: "towering . goes . rip" - 5 Tokens,
 Sentence: "www . emma collison . com" - 6 Tokens,
 Sentence: "web disk . dr vos je ca . ba" - 9 Tokens,
 Sentence: "www . pastel beach . com" - 6 Tokens,
 Sentence: "jl junior designer . com" - 5 Tokens,
 Sentence: "webmail . unc style . com" - 6 Tokens,
 Sentence: "web disk . bk ecc . com" - 7 Tokens,
 Sente

In [27]:
result = random.sample(domains, k=1)[0]
result.to_dict(tag_type='ner')

{'text': 'www . edu credito . com . br', 'labels': [], 'entities': []}

### Prepare training corpus

In [86]:
import json

label_mapping = {
    0: 'O',
    2230: 'PUNCT',
    2231: 'TLD',
    2232: 'B-ORG',
    2233: 'E-ORG',    
    2234: 'EXT'
}

all_domains = []
with open('file.json') as fhandle:
    for line in fhandle:
        record = json.loads(line)
        
        if not record['annotation_approver']:
            continue
        
        domain = Sentence()
        
        prev_offset = -1
        for span in sorted(record['annotations'], key=lambda r: r['start_offset']):
            if prev_offset+1 != span['start_offset']:
                tmp_offset = prev_offset if prev_offset != -1 else 0
                for not_tagged in record['text'][tmp_offset:span['start_offset']].split(' '):
                    not_tagged = not_tagged.strip()                    
                    
                    if not not_tagged:
                        continue
                        
                    token = Token(not_tagged)
                    token.add_tag('ner', 'O')                    
                    domain.add_token(token)
            
            for tagged in record['text'][span['start_offset']:span['end_offset']].split(' '):
                tagged = tagged.strip()
                
                token = Token(tagged)
                token.add_tag('ner', label_mapping[span['label']])
                domain.add_token(token)
            
            prev_offset = span['end_offset']
            
        all_domains.append(domain)

In [87]:
len(all_domains)

from sklearn.model_selection import train_test_split

x_train, x_test = train_test_split(all_domains, test_size=0.2)

In [88]:
len(x_train)

156

In [93]:
with open('data/train.txt', 'w') as fhandle:
    for domain in x_train:        
        for token in domain:            
            print(f"{token.text}\t{token.get_tag('ner')}", file=fhandle)                  
        print('', file=fhandle)

In [94]:
len(x_test)

40

In [95]:
with open('data/test.txt', 'w') as fhandle:
    for domain in x_test:
        for token in domain:
            print(f"{token.text}\t{token.get_tag('ner')}", file=fhandle)                  
        print('', file=fhandle)

### Train the model

In [96]:
columns = {0: 'text', 1: 'ner'}

from flair.data import Corpus
from flair.datasets import ColumnCorpus

corpus: Corpus = ColumnCorpus('data', columns,
                              train_file='train.txt',
                              test_file='test.txt')

2019-12-12 18:20:44,890 Reading data from data
2019-12-12 18:20:44,891 Train: data/train.txt
2019-12-12 18:20:44,893 Dev: None
2019-12-12 18:20:44,896 Test: data/test.txt


In [102]:
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings

tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

[b'<unk>', b'O', b'B-ORG', b'E-ORG', b'PUNCT', b'TLD', b'EXT', b'<START>', b'<STOP>']


In [113]:
from typing import List

embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

In [114]:
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

In [115]:
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('data',
              learning_rate=0.1,
              mini_batch_size=4,
              max_epochs=50)

2019-12-13 17:44:19,674 ----------------------------------------------------------------------------------------------------
2019-12-13 17:44:19,677 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('glove')
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4196, out_features=4196, bias=True)
  (rnn): LSTM(4196, 256, batch_first=True, b

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-12-13 17:45:03,223 ----------------------------------------------------------------------------------------------------
2019-12-13 17:45:03,740 epoch 2 - iter 0/35 - loss 4.66993761 - samples/sec: 23.37
2019-12-13 17:45:05,182 epoch 2 - iter 3/35 - loss 3.81871265 - samples/sec: 8.64
2019-12-13 17:45:06,682 epoch 2 - iter 6/35 - loss 3.92123049 - samples/sec: 8.33
2019-12-13 17:45:08,112 epoch 2 - iter 9/35 - loss 3.80853474 - samples/sec: 8.72
2019-12-13 17:45:09,549 epoch 2 - iter 12/35 - loss 3.77091663 - samples/sec: 8.69
2019-12-13 17:45:11,186 epoch 2 - iter 15/35 - loss 3.81041889 - samples/sec: 7.60
2019-12-13 17:45:12,755 epoch 2 - iter 18/35 - loss 3.75447224 - samples/sec: 8.02
2019-12-13 17:45:14,262 epoch 2 - iter 21/35 - loss 3.91682648 - samples/sec: 8.31
2019-12-13 17:45:15,741 epoch 2 - iter 24/35 - loss 3.72034348 - samples/sec: 8.46
2019-12-13 17:45:17,222 epoch 2 - iter 27/35 - loss 3.71355672 - samples/sec: 8.44
2019-12-13 17:45:18,683 epoch 2 - iter 30/35 - l

2019-12-13 17:46:57,834 ----------------------------------------------------------------------------------------------------
2019-12-13 17:46:57,836 EPOCH 7 done: loss 1.6742 - lr 0.1000
2019-12-13 17:46:57,959 DEV : loss 2.7561254501342773 - score 0.9274
2019-12-13 17:46:57,961 BAD EPOCHS (no improvement): 0
2019-12-13 17:47:00,877 ----------------------------------------------------------------------------------------------------
2019-12-13 17:47:01,408 epoch 8 - iter 0/35 - loss 3.72414207 - samples/sec: 22.71
2019-12-13 17:47:02,923 epoch 8 - iter 3/35 - loss 1.52295494 - samples/sec: 8.22
2019-12-13 17:47:04,653 epoch 8 - iter 6/35 - loss 1.52095876 - samples/sec: 7.17
2019-12-13 17:47:06,467 epoch 8 - iter 9/35 - loss 1.41200776 - samples/sec: 6.87
2019-12-13 17:47:08,059 epoch 8 - iter 12/35 - loss 1.43713243 - samples/sec: 7.85
2019-12-13 17:47:09,635 epoch 8 - iter 15/35 - loss 1.53386033 - samples/sec: 7.91
2019-12-13 17:47:11,187 epoch 8 - iter 18/35 - loss 1.54450389 - samp

2019-12-13 17:48:58,613 epoch 13 - iter 21/35 - loss 0.99023416 - samples/sec: 7.90
2019-12-13 17:49:00,085 epoch 13 - iter 24/35 - loss 0.93623379 - samples/sec: 8.45
2019-12-13 17:49:01,566 epoch 13 - iter 27/35 - loss 0.91110817 - samples/sec: 8.41
2019-12-13 17:49:03,148 epoch 13 - iter 30/35 - loss 0.85292716 - samples/sec: 7.85
2019-12-13 17:49:04,710 epoch 13 - iter 33/35 - loss 0.85472132 - samples/sec: 7.95
2019-12-13 17:49:05,265 ----------------------------------------------------------------------------------------------------
2019-12-13 17:49:05,266 EPOCH 13 done: loss 0.8484 - lr 0.1000
2019-12-13 17:49:05,391 DEV : loss 1.4369769096374512 - score 0.948
2019-12-13 17:49:05,393 BAD EPOCHS (no improvement): 3
2019-12-13 17:49:05,394 ----------------------------------------------------------------------------------------------------
2019-12-13 17:49:05,945 epoch 14 - iter 0/35 - loss 1.10873890 - samples/sec: 21.93
2019-12-13 17:49:07,503 epoch 14 - iter 3/35 - loss 1.867084

2019-12-13 17:50:44,637 epoch 19 - iter 3/35 - loss 0.63250768 - samples/sec: 5.61
2019-12-13 17:50:46,241 epoch 19 - iter 6/35 - loss 0.62847144 - samples/sec: 7.76
2019-12-13 17:50:47,812 epoch 19 - iter 9/35 - loss 0.52216640 - samples/sec: 7.93
2019-12-13 17:50:49,368 epoch 19 - iter 12/35 - loss 0.47825006 - samples/sec: 7.98
2019-12-13 17:50:50,929 epoch 19 - iter 15/35 - loss 0.50675946 - samples/sec: 7.96
2019-12-13 17:50:52,484 epoch 19 - iter 18/35 - loss 0.50240552 - samples/sec: 8.04
2019-12-13 17:50:54,032 epoch 19 - iter 21/35 - loss 0.50394080 - samples/sec: 8.00
2019-12-13 17:50:56,142 epoch 19 - iter 24/35 - loss 0.51399813 - samples/sec: 5.82
2019-12-13 17:50:57,804 epoch 19 - iter 27/35 - loss 0.56776198 - samples/sec: 7.54
2019-12-13 17:50:59,368 epoch 19 - iter 30/35 - loss 0.59658554 - samples/sec: 7.92
2019-12-13 17:51:00,810 epoch 19 - iter 33/35 - loss 0.57630460 - samples/sec: 8.60
2019-12-13 17:51:01,352 -------------------------------------------------------

2019-12-13 17:52:34,399 EPOCH 24 done: loss 0.4056 - lr 0.0250
2019-12-13 17:52:34,520 DEV : loss 1.2227187156677246 - score 0.9708
2019-12-13 17:52:34,521 BAD EPOCHS (no improvement): 0
2019-12-13 17:52:37,452 ----------------------------------------------------------------------------------------------------
2019-12-13 17:52:37,946 epoch 25 - iter 0/35 - loss 0.12649012 - samples/sec: 25.45
2019-12-13 17:52:39,528 epoch 25 - iter 3/35 - loss 0.34546959 - samples/sec: 7.85
2019-12-13 17:52:41,124 epoch 25 - iter 6/35 - loss 0.37668208 - samples/sec: 7.77
2019-12-13 17:52:42,623 epoch 25 - iter 9/35 - loss 0.30046268 - samples/sec: 8.30
2019-12-13 17:52:44,153 epoch 25 - iter 12/35 - loss 0.27651684 - samples/sec: 8.08
2019-12-13 17:52:45,673 epoch 25 - iter 15/35 - loss 0.32565027 - samples/sec: 8.16
2019-12-13 17:52:47,184 epoch 25 - iter 18/35 - loss 0.39797462 - samples/sec: 8.17
2019-12-13 17:52:48,718 epoch 25 - iter 21/35 - loss 0.37473783 - samples/sec: 8.10
2019-12-13 17:52:51

2019-12-13 17:54:36,193 epoch 30 - iter 27/35 - loss 0.32198496 - samples/sec: 8.22
2019-12-13 17:54:37,680 epoch 30 - iter 30/35 - loss 0.31980965 - samples/sec: 8.39
2019-12-13 17:54:39,332 epoch 30 - iter 33/35 - loss 0.35879629 - samples/sec: 7.49
2019-12-13 17:54:39,972 ----------------------------------------------------------------------------------------------------
2019-12-13 17:54:39,973 EPOCH 30 done: loss 0.3660 - lr 0.0250
2019-12-13 17:54:40,113 DEV : loss 1.0892693996429443 - score 0.9765
Epoch    29: reducing learning rate of group 0 to 1.2500e-02.
2019-12-13 17:54:40,115 BAD EPOCHS (no improvement): 4
2019-12-13 17:54:40,117 ----------------------------------------------------------------------------------------------------
2019-12-13 17:54:40,716 epoch 31 - iter 0/35 - loss 1.12165260 - samples/sec: 20.10
2019-12-13 17:54:42,417 epoch 31 - iter 3/35 - loss 0.38033867 - samples/sec: 7.30
2019-12-13 17:54:44,239 epoch 31 - iter 6/35 - loss 0.29424490 - samples/sec: 6.80

2019-12-13 17:56:24,007 epoch 36 - iter 9/35 - loss 0.33975620 - samples/sec: 8.22
2019-12-13 17:56:25,516 epoch 36 - iter 12/35 - loss 0.29917361 - samples/sec: 8.26
2019-12-13 17:56:27,799 epoch 36 - iter 15/35 - loss 0.31760058 - samples/sec: 5.42
2019-12-13 17:56:29,656 epoch 36 - iter 18/35 - loss 0.28877055 - samples/sec: 6.72
2019-12-13 17:56:31,412 epoch 36 - iter 21/35 - loss 0.27953089 - samples/sec: 7.06
2019-12-13 17:56:33,262 epoch 36 - iter 24/35 - loss 0.29865999 - samples/sec: 6.73
2019-12-13 17:56:34,859 epoch 36 - iter 27/35 - loss 0.29023904 - samples/sec: 7.79
2019-12-13 17:56:36,477 epoch 36 - iter 30/35 - loss 0.27323060 - samples/sec: 7.69
2019-12-13 17:56:38,279 epoch 36 - iter 33/35 - loss 0.25766335 - samples/sec: 6.85
2019-12-13 17:56:38,987 ----------------------------------------------------------------------------------------------------
2019-12-13 17:56:38,989 EPOCH 36 done: loss 0.2519 - lr 0.0063
2019-12-13 17:56:39,208 DEV : loss 1.1204771995544434 - s

2019-12-13 17:58:19,570 DEV : loss 1.1593666076660156 - score 0.9765
2019-12-13 17:58:19,572 BAD EPOCHS (no improvement): 3
2019-12-13 17:58:19,576 ----------------------------------------------------------------------------------------------------
2019-12-13 17:58:20,121 epoch 42 - iter 0/35 - loss 0.00571728 - samples/sec: 22.09
2019-12-13 17:58:21,848 epoch 42 - iter 3/35 - loss 0.10316217 - samples/sec: 7.20
2019-12-13 17:58:23,622 epoch 42 - iter 6/35 - loss 0.16471692 - samples/sec: 7.04
2019-12-13 17:58:25,251 epoch 42 - iter 9/35 - loss 0.13373036 - samples/sec: 7.67
2019-12-13 17:58:26,853 epoch 42 - iter 12/35 - loss 0.17340121 - samples/sec: 7.79
2019-12-13 17:58:28,436 epoch 42 - iter 15/35 - loss 0.20374134 - samples/sec: 7.97
2019-12-13 17:58:29,987 epoch 42 - iter 18/35 - loss 0.34455337 - samples/sec: 8.04
2019-12-13 17:58:31,563 epoch 42 - iter 21/35 - loss 0.35822805 - samples/sec: 7.91
2019-12-13 17:58:33,144 epoch 42 - iter 24/35 - loss 0.36085443 - samples/sec: 7.8

2019-12-13 18:00:08,136 epoch 47 - iter 24/35 - loss 0.28235069 - samples/sec: 8.32
2019-12-13 18:00:09,724 epoch 47 - iter 27/35 - loss 0.31599862 - samples/sec: 7.85
2019-12-13 18:00:11,217 epoch 47 - iter 30/35 - loss 0.31386903 - samples/sec: 8.35
2019-12-13 18:00:12,671 epoch 47 - iter 33/35 - loss 0.31787669 - samples/sec: 8.58
2019-12-13 18:00:13,194 ----------------------------------------------------------------------------------------------------
2019-12-13 18:00:13,195 EPOCH 47 done: loss 0.3095 - lr 0.0008
2019-12-13 18:00:13,324 DEV : loss 1.1429295539855957 - score 0.9765
2019-12-13 18:00:13,325 BAD EPOCHS (no improvement): 1
2019-12-13 18:00:13,327 ----------------------------------------------------------------------------------------------------
2019-12-13 18:00:13,817 epoch 48 - iter 0/35 - loss 0.65879250 - samples/sec: 24.56
2019-12-13 18:00:15,285 epoch 48 - iter 3/35 - loss 0.53530169 - samples/sec: 8.48
2019-12-13 18:00:16,780 epoch 48 - iter 6/35 - loss 0.425049

{'test_score': 0.8978,
 'dev_score_history': [0.8334,
  0.9036,
  0.908,
  0.907,
  0.895,
  0.8791,
  0.9274,
  0.9385,
  0.9257,
  0.954,
  0.9318,
  0.9318,
  0.948,
  0.895,
  0.9647,
  0.931,
  0.9704,
  0.9596,
  0.9647,
  0.9647,
  0.9651,
  0.9529,
  0.9535,
  0.9708,
  0.9647,
  0.9823,
  0.9708,
  0.9529,
  0.948,
  0.9765,
  0.9765,
  0.9765,
  0.9647,
  0.9651,
  0.9765,
  0.9765,
  0.9765,
  0.9765,
  0.9765,
  0.9765,
  0.9765,
  0.9765,
  0.9765,
  0.9765,
  0.9765,
  0.9765,
  0.9765,
  0.9765,
  0.9765,
  0.9765],
 'train_loss_history': [8.049862357548305,
  3.7572667598724365,
  3.033046204703195,
  2.4653559820992608,
  1.9635121413639613,
  1.7871274130684989,
  1.6741735322134836,
  1.4896295411246163,
  1.5619366645812989,
  1.4209679739815848,
  1.3800765446254186,
  1.1132631846836636,
  0.8483681678771973,
  0.9803770882742745,
  1.0590538569859096,
  0.6230416570390974,
  0.6843278476170132,
  0.5305566106523786,
  0.5620737348284041,
  0.4680426325116839,
  0

In [1]:
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_weights('data/weights.txt')

Weights plots are saved in data/weights.png
