【チュートリアル】機械学習を使って30分で固有表現抽出器を作る
@Hironsan
2017年05月03日に更新
https://qiita.com/Hironsan/items/326b66711eb4196aa9d4

In [18]:
import os
from pathlib import Path
task = 'ja-wikinews-ner'
data_folder = Path(os.environ['HOME']) / '.flair/datasets' / task

def load_ner(filepath):
    x = [[]]
    with open(filepath) as f:
        for line in f:
            cols = line.strip().split('\t')
            if len(cols) > 1:
                x[-1].append(cols)
            else:
                x.append([])
    x.pop()
    return x
x_train = load_ner(data_folder / 'train.txt')
x_test = load_ner(data_folder / 'test.txt')

In [22]:
def is_hiragana(ch):
    return 0x3040 <= ord(ch) <= 0x309F

def is_katakana(ch):
    return 0x30A0 <= ord(ch) <= 0x30FF

def get_character_type(ch):
    if ch.isspace():
        return 'ZSPACE'
    elif ch.isdigit():
        return 'ZDIGIT'
    elif ch.islower():
        return 'ZLLET'
    elif ch.isupper():
        return 'ZULET'
    elif is_hiragana(ch):
        return 'HIRAG'
    elif is_katakana(ch):
        return 'KATAK'
    else:
        return 'OTHER'

def get_character_types(string):
    character_types = map(get_character_type, string)
    character_types_str = '-'.join(sorted(set(character_types)))

    return character_types_str

In [23]:
def extract_pos_with_subtype(morph):
    idx = morph.index('*')

    return '-'.join(morph[1:idx])

def word2features(sent, i):
    word = sent[i][0]
    chtype = get_character_types(sent[i][0])
    postag = extract_pos_with_subtype(sent[i])
    features = [
        'bias',
        'word=' + word,
        'type=' + chtype,
        'postag=' + postag,
    ]
    if i >= 2:
        word2 = sent[i-2][0]
        chtype2 = get_character_types(sent[i-2][0])
        postag2 = extract_pos_with_subtype(sent[i-2])
        iobtag2 = sent[i-2][-1]
        features.extend([
            '-2:word=' + word2,
            '-2:type=' + chtype2,
            '-2:postag=' + postag2,
            '-2:iobtag=' + iobtag2,
        ])
    else:
        features.append('BOS')

    if i >= 1:
        word1 = sent[i-1][0]
        chtype1 = get_character_types(sent[i-1][0])
        postag1 = extract_pos_with_subtype(sent[i-1])
        iobtag1 = sent[i-1][-1]
        features.extend([
            '-1:word=' + word1,
            '-1:type=' + chtype1,
            '-1:postag=' + postag1,
            '-1:iobtag=' + iobtag1,
        ])
    else:
        features.append('BOS')

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        chtype1 = get_character_types(sent[i+1][0])
        postag1 = extract_pos_with_subtype(sent[i+1])
        features.extend([
            '+1:word=' + word1,
            '+1:type=' + chtype1,
            '+1:postag=' + postag1,
        ])
    else:
        features.append('EOS')

    if i < len(sent)-2:
        word2 = sent[i+2][0]
        chtype2 = get_character_types(sent[i+2][0])
        postag2 = extract_pos_with_subtype(sent[i+2])
        features.extend([
            '+2:word=' + word2,
            '+2:type=' + chtype2,
            '+2:postag=' + postag2,
        ])
    else:
        features.append('EOS')

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


def sent2labels(sent):
    return [morph[-1] for morph in sent]


def sent2tokens(sent):
    return [morph[0] for morph in sent]

In [24]:
sent2features(x_train[0])

[['bias',
  'word=2005',
  'type=ZDIGIT',
  'postag=名詞-数',
  'BOS',
  'BOS',
  '+1:word=年',
  '+1:type=OTHER',
  '+1:postag=名詞-接尾-助数詞',
  '+2:word=7',
  '+2:type=ZDIGIT',
  '+2:postag=名詞-数'],
 ['bias',
  'word=年',
  'type=OTHER',
  'postag=名詞-接尾-助数詞',
  'BOS',
  '-1:word=2005',
  '-1:type=ZDIGIT',
  '-1:postag=名詞-数',
  '-1:iobtag=B-DAT',
  '+1:word=7',
  '+1:type=ZDIGIT',
  '+1:postag=名詞-数',
  '+2:word=月',
  '+2:type=OTHER',
  '+2:postag=名詞-一般'],
 ['bias',
  'word=7',
  'type=ZDIGIT',
  'postag=名詞-数',
  '-2:word=2005',
  '-2:type=ZDIGIT',
  '-2:postag=名詞-数',
  '-2:iobtag=B-DAT',
  '-1:word=年',
  '-1:type=OTHER',
  '-1:postag=名詞-接尾-助数詞',
  '-1:iobtag=I-DAT',
  '+1:word=月',
  '+1:type=OTHER',
  '+1:postag=名詞-一般',
  '+2:word=14',
  '+2:type=ZDIGIT',
  '+2:postag=名詞-数'],
 ['bias',
  'word=月',
  'type=OTHER',
  'postag=名詞-一般',
  '-2:word=年',
  '-2:type=OTHER',
  '-2:postag=名詞-接尾-助数詞',
  '-2:iobtag=I-DAT',
  '-1:word=7',
  '-1:type=ZDIGIT',
  '-1:postag=名詞-数',
  '-1:iobtag=I-DAT',
  '+1:word

In [25]:
X_train = [sent2features(s) for s in x_train]
y_train = [sent2labels(s) for s in x_train]

X_test = [sent2features(s) for s in x_test]
y_test = [sent2labels(s) for s in x_test]

In [26]:
len(X_train)

400

In [28]:
y_train[0]

['B-DAT',
 'I-DAT',
 'I-DAT',
 'I-DAT',
 'I-DAT',
 'I-DAT',
 'O',
 'B-LOC',
 'I-LOC',
 'I-LOC',
 'I-LOC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'I-ORG',
 'I-ORG',
 'O',
 'O',
 'B-LOC',
 'O',
 'O',
 'O',
 'B-LOC',
 'O',
 'B-LOC',
 'I-LOC',
 'O',
 'B-LOC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [29]:
from itertools import chain
import pycrfsuite
import sklearn
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer

trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)
    
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [30]:
trainer.train('model.crfsuite')

In [31]:
tagger = pycrfsuite.Tagger()
tagger.open('model.crfsuite')
example_sent = x_test[0]
print(' '.join(sent2tokens(example_sent)))
print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2labels(example_sent)))

昨年 10 月 に は 、 34 人 が 、 今回 の 現場 に 近い エジプト の タバ で 爆発 事件 の ため 死亡 し て いる 。
Predicted: B-DAT I-DAT I-DAT O O O O O O O O O O O O B-LOC O B-LOC O O O O O O O O O O
Correct:   B-DAT I-DAT I-DAT O O O O O O O O O O O O B-LOC O B-LOC O O O O O O O O O O


In [34]:
def bio_classification_report(y_true, y_pred):
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

for i in range(len(X_test)):
    for j in range(len(X_test[i])):
        if j < len(X_test[i]) - 1:
            X_test[i][j + 1][11] = "-1:iobtag=" + tagger.tag(X_test[i][:j+1])[j]
        if j < len(X_test[i]) - 2:
            X_test[i][j + 2][7] = "-2:iobtag=" + tagger.tag(X_test[i][:j+1])[j]
            
y_pred = [tagger.tag(xseq) for xseq in X_test]

In [35]:
print(bio_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       B-ART       1.00      0.44      0.62         9
       I-ART       0.50      0.50      0.50        12
       B-DAT       0.89      0.67      0.76        12
       I-DAT       0.86      0.55      0.67        22
       B-LOC       0.93      0.73      0.82        55
       I-LOC       0.37      0.82      0.51        17
       B-ORG       0.62      0.71      0.67        14
       I-ORG       0.21      0.40      0.28        10
       B-PSN       0.00      0.00      0.00         3
       B-TIM       1.00      0.14      0.25         7
       I-TIM       1.00      0.25      0.40        16

   micro avg       0.64      0.58      0.61       177
   macro avg       0.67      0.47      0.50       177
weighted avg       0.77      0.58      0.62       177
 samples avg       0.09      0.09      0.09       177

