In [1]:
#-*-coding:utf-8-*-
#created time:2017-12-12 14:39:00
#Author:kyla

In [2]:
import codecs
import os
import re
import subprocess
from itertools import chain

import matplotlib.pyplot as plt
# plt.style.use('ggplot')
import numpy as np
# import nltk
import scipy.stats
import sklearn
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics



In [3]:
def zero_digits(s):
    """
    Replace every digit in the first column of a token row by a zero.

    Args:
        s: list of columns for one token, as produced by ``line.split()``;
            ``s[0]`` is the word itself.

    Returns:
        The same list object ``s``. Only ``s[0]`` is rewritten (in place);
        the remaining columns are left untouched.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    s[0] = re.sub(r'\d', '0', s[0])
    return s


def load_sentences(path, lower, zeros):
    """
    Load sentences from a whitespace-separated, one-token-per-line file.

    A line must contain at least a word and its tag. Sentences are separated
    by empty (or whitespace-only) lines. Sentences whose first word contains
    'DOCSTART' are dropped.

    Args:
        path: path of the UTF-8 encoded file to read.
        lower: accepted for interface compatibility; not used by this function.
        zeros: when truthy, every token row is passed through ``zero_digits``.

    Returns:
        A list of sentences; each sentence is a list of token rows and each
        row is the list of columns found on the corresponding line.

    Raises:
        AssertionError: if a non-empty line has fewer than two columns.
    """
    sentences = []
    sentence = []
    # The context manager guarantees the handle is closed, even on error.
    with codecs.open(path, 'r', 'utf8') as handle:
        for line in handle:
            line = line.rstrip()
            if not line:
                # Blank line: close the current sentence, if there is one.
                if sentence and 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
                continue
            word = line.split()
            if zeros:
                word = zero_digits(word)
            assert len(word) >= 2
            sentence.append(word)
    # The file may end without a trailing blank line.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences


In [4]:
def word2features(sent, i):
    """
    Build the feature dict for the token at position ``i`` of ``sent``.

    Each row of ``sent`` is indexed as ``row[0]`` (word) and ``row[1]``
    (POS tag). The result describes the token itself plus its left and right
    neighbours; a missing neighbour is signalled with 'BOS' / 'EOS'.
    """
    def neighbour_features(position, prefix):
        # Same five features as for the current token, namespaced by prefix.
        near_word = sent[position][0]
        near_tag = sent[position][1]
        return {
            prefix + ':word.lower()': near_word.lower(),
            prefix + ':word.isalpha()': near_word.isalpha(),
            prefix + ':word.isdigit()': near_word.isdigit(),
            prefix + ':postag': near_tag,
            prefix + ':postag[:2]': near_tag[:2],
        }

    token = sent[i][0]
    tag = sent[i][1]

    features = {'bias': 1.0}
    features['word.lower()'] = token.lower()
    features['word.isalpha()'] = token.isalpha()
    features['word.isdigit()'] = token.isdigit()
    features['postag'] = tag
    features['postag[:2]'] = tag[:2]

    # Left context, or a beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        features.update(neighbour_features(i - 1, '-1'))

    # Right context, or an end-of-sentence marker.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        features.update(neighbour_features(i + 1, '+1'))

    return features


def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the third column (label) of every 3-column row in ``sent``."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the first column (token) of every 3-column row in ``sent``."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [5]:
%%time
# Loader switches shared by all three splits.
# `lower` is passed through, but load_sentences does not use it.
lower, zeros = 1, 0

# One input file per data split.
train_path, dev_path, test_path = './train.txt', './dev.txt', './test.txt'

train_sents = load_sentences(train_path, lower, zeros)
dev_sents = load_sentences(dev_path, lower, zeros)
test_sents = load_sentences(test_path, lower, zeros)

CPU times: user 2.11 s, sys: 129 ms, total: 2.24 s
Wall time: 2.37 s


In [6]:
%%time
# Per-token feature dicts for every sentence of each split.
X_train = [sent2features(s) for s in train_sents]
X_dev = [sent2features(s) for s in dev_sents]
X_test = [sent2features(s) for s in test_sents]

# Gold label sequences, aligned with the feature lists above.
y_train = [sent2labels(s) for s in train_sents]
y_dev = [sent2labels(s) for s in dev_sents]
y_test = [sent2labels(s) for s in test_sents]

CPU times: user 2.54 s, sys: 387 ms, total: 2.93 s
Wall time: 3.36 s


In [8]:
%%time
# train
# Fit the sklearn_crfsuite CRF model on the training features and labels.
# NOTE(review): per the sklearn-crfsuite docs, c1 and c2 should be the L1 and
# L2 regularisation coefficients — confirm against the installed version.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 59.5 s, sys: 775 ms, total: 1min
Wall time: 1min 7s


In [10]:
def evaluate(test_sents, y_test, y_pred):
    """
    Score predicted tag sequences with the external CoNLL ``conlleval`` script
    and print a per-tag confusion matrix.

    The predictions are written to ``./evaluation/temp/eval.<id>.output`` (one
    token per line: the original columns minus the last one, then the gold tag,
    then the predicted tag). ``perl ./evaluation/conlleval`` is run on that file
    and its report is stored in ``eval.<id>.scores``. Both files are kept on
    disk so they can be inspected afterwards.

    Args:
        test_sents: sentences as lists of token rows; the last column of each
            row is dropped before writing.
        y_test: gold tag sequences, one list per sentence.
        y_pred: predicted tag sequences, aligned with ``y_test``.

    Returns:
        The last whitespace-separated field of the second line of the script
        report, as a float (the overall FB1 in conlleval's output).

    Raises:
        AssertionError: if a gold and a predicted sequence differ in length.
        KeyError: if a tag is not one of B-ORG / I-ORG / E-ORG / S-ORG / O.
        RuntimeError: if the script produced fewer than two lines of output.
    """
    tag_to_id = {"B-ORG": 0, "I-ORG": 1, "E-ORG": 2, "S-ORG": 3, "O": 4}
    id_to_tag = dict((idx, tag) for tag, idx in tag_to_id.items())
    # Size the matrix from the tag set, not from the tags that happen to be
    # predicted: a model emitting fewer distinct tags must not shrink it.
    n_tags = len(tag_to_id)
    count = np.zeros((n_tags, n_tags), dtype=np.int32)

    predictions = []
    for raw_sentence, y_reals, y_preds in zip(test_sents, y_test, y_pred):
        assert len(y_preds) == len(y_reals)
        for i, (yr, yp) in enumerate(zip(y_reals, y_preds)):
            new_line = " ".join(list(raw_sentence[i][:-1]) + [yr, yp])
            predictions.append(new_line)
            count[tag_to_id[yr], tag_to_id[yp]] += 1
        # Empty line = sentence boundary in the CoNLL format.
        predictions.append("")

    # Write predictions to disk and run the CoNLL script externally.
    eval_script = './evaluation/conlleval'
    eval_temp = './evaluation/temp'
    if not os.path.isdir(eval_temp):
        os.makedirs(eval_temp)
    eval_id = np.random.randint(1000000, 2000000)
    print("eval_id: %d" % (eval_id))
    output_path = os.path.join(eval_temp, "eval.%i.output" % eval_id)
    scores_path = os.path.join(eval_temp, "eval.%i.scores" % eval_id)
    with codecs.open(output_path, 'w', 'utf8') as f:
        f.write("\n".join(predictions))
    # Same effect as `perl script < output > scores`, without going through a shell.
    with open(output_path, 'rb') as script_in, open(scores_path, 'wb') as script_out:
        return_code = subprocess.call(
            ["perl", eval_script], stdin=script_in, stdout=script_out
        )

    # CoNLL evaluation results
    with codecs.open(scores_path, 'r', 'utf8') as f:
        eval_lines = [l.rstrip() for l in f]
    print("eval_lines:")
    for line in eval_lines:
        print(line)

    # Confusion matrix with accuracy for each tag
    row_format = "{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)
    print(row_format.format(
        "ID", "NE", "Total",
        *([id_to_tag[i] for i in range(n_tags)] + ["Percent"])
    ))
    for i in range(n_tags):
        print(row_format.format(
            str(i), id_to_tag[i], str(count[i].sum()),
            *([int(count[i][j]) for j in range(n_tags)] +
              ["%.3f" % (count[i][i] * 100. / max(1, count[i].sum()))])
        ))

    # Global accuracy
    print("%i/%i (%.5f%%)" % (
        count.trace(), count.sum(), 100. * count.trace() / max(1, count.sum())
    ))

    if len(eval_lines) < 2:
        raise RuntimeError(
            "%s produced no usable report (exit status %r); see %s"
            % (eval_script, return_code, scores_path)
        )

    # F1 on all entities
    return float(eval_lines[1].strip().split()[-1])


In [11]:
%%time
# dev
# Predict tags for the dev split and score them with evaluate(), which prints
# the conlleval report and a per-tag confusion matrix.
# NOTE: the test cell reuses the name y_pred, so it holds the predictions of
# whichever of the two cells ran last.
y_pred = crf.predict(X_dev)
evaluate(dev_sents, y_dev, y_pred)

eval_id: 1263949
eval_lines:
processed 63241 tokens with 4554 phrases; found: 4166 phrases; correct: 3920.
accuracy:  97.53%; precision:  94.10%; recall:  86.08%; FB1:  89.91
              ORG: precision:  94.10%; recall:  86.08%; FB1:  89.91  4166
ID     NE  Total  B-ORG  I-ORG  E-ORG  S-ORG      O  Percent
 0  B-ORG   2825   2457     13      0     22    333   86.973
 1  I-ORG   1346     33   1065     19      1    228   79.123
 2  E-ORG   2833      0     25   2463      3    342   86.940
 3  S-ORG   1842     36      2      5   1573    226   85.396
 4      O  54395     75     57    118     23  54122   99.498
61680/63241 (97.53166%)
CPU times: user 1.28 s, sys: 114 ms, total: 1.4 s
Wall time: 2.25 s


In [12]:
%%time
# test
# Predict tags for the test split and score them with evaluate(), which prints
# the conlleval report and a per-tag confusion matrix.
# NOTE: this overwrites the y_pred produced by the dev cell.
y_pred = crf.predict(X_test)
evaluate(test_sents, y_test, y_pred)

eval_id: 1915435
eval_lines:
processed 68798 tokens with 4926 phrases; found: 4502 phrases; correct: 4266.
accuracy:  97.61%; precision:  94.76%; recall:  86.60%; FB1:  90.50
              ORG: precision:  94.76%; recall:  86.60%; FB1:  90.50  4502
ID     NE  Total  B-ORG  I-ORG  E-ORG  S-ORG      O  Percent
 0  B-ORG   3078   2686     15      5     27    345   87.264
 1  I-ORG   1379     20   1090     21      1    247   79.043
 2  E-ORG   3086      1     33   2672      1    379   86.585
 3  S-ORG   1978     27      0      7   1697    247   85.794
 4      O  59277     74     68    105     23  59007   99.545
67152/68798 (97.60749%)
CPU times: user 1.56 s, sys: 127 ms, total: 1.69 s
Wall time: 3 s
