In [16]:
import re, sys

def warning(msg):
    """Print ``msg`` on standard output, prefixed with ``WARNING:``."""
    prefix = "WARNING:"
    print(prefix, msg)

def convert_bio_to_spans(bio_sequence):
    """Convert a BIO tag sequence into a list of labelled spans.

    Args:
        bio_sequence: sequence of tag strings, each starting with "B", "I"
            or "O" and optionally followed by "-<label>" (e.g. "B-PER").
            The input is never modified.

    Returns:
        List of ``(label, start_index, end_index)`` tuples with an exclusive
        end index. The label is "" for bare "B"/"I" tags.

    Raises:
        AssertionError: if a tag is empty or does not start with B, I or O,
            or if the resulting spans are not unique (see ``spancheck``).

    Inconsistent "I" tags (an "I" with no open span, or an "I" whose label
    differs from the open span's label) are reported through ``warning`` and
    treated as if they were the corresponding "B" tag.  This is done in a
    single left-to-right pass: no list copies and no recursion, so long
    sequences with many inconsistencies cannot hit the recursion limit.
    """
    spans = []  # (label, startindex, endindex)
    cur_start = None
    cur_label = None

    for t, tag in enumerate(bio_sequence):
        assert tag and tag[0] in ("B", "I", "O")

        if tag.startswith("I"):
            if cur_start is None:
                warning("BIO inconsistency: I without starting B. Rewriting to B.")
                tag = "B" + tag[1:]
            else:
                # Strip like the B branch does, so "B-x" / "I-x " agree.
                continuation_label = re.sub("^I-?", "", tag).strip()
                if continuation_label != cur_label:
                    rewritten = "B" + tag[1:]
                    warning("BIO inconsistency: %s but current label is '%s'. Rewriting to %s" % (tag, cur_label, rewritten))
                    tag = rewritten

        # A "B" or "O" (including a rewritten "I") closes any open span.
        if cur_start is not None and tag[0] in ("B", "O"):
            spans.append((cur_label, cur_start, t))
            cur_start = None
            cur_label = None

        if tag.startswith("B"):
            cur_start = t
            cur_label = re.sub("^B-?", "", tag).strip()

    # A span still open at the end of the sequence runs to the last token.
    if cur_start is not None:
        spans.append((cur_label, cur_start, len(bio_sequence)))

    spancheck(spans)
    return spans

def test_bio_conversion():
    """Check convert_bio_to_spans on well-formed and inconsistent BIO input."""
    cases = [
        # (input tags, expected spans)
        (["B"], [("", 0, 1)]),
        (["B", "I"], [("", 0, 2)]),
        (["B", "I", "O"], [("", 0, 2)]),
        (["O", "B", "I", "O", "O"], [("", 1, 3)]),
        (["B", "B"], [("", 0, 1), ("", 1, 2)]),
        (["B", "I", "B"], [("", 0, 2), ("", 2, 3)]),
        (["B-asdf", "I-asdf", "B"], [("asdf", 0, 2), ("", 2, 3)]),
        # label switch inside a span starts a new span
        (["B-asdf", "I-difftype", "B"], [("asdf", 0, 1), ("difftype", 1, 2), ("", 2, 3)]),
        # leading I is treated as B
        (["I", "I"], [("", 0, 2)]),
        (["B-a", "I-b"], [("a", 0, 1), ("b", 1, 2)]),
    ]
    for tags, expected in cases:
        spans = convert_bio_to_spans(tags)
        assert spans == expected


def spancheck(spanlist):
    """Assert that no span occurs more than once in ``spanlist``."""
    distinct_count = len(set(spanlist))
    assert distinct_count == len(spanlist), "spans are non-unique ... is this a bug in the eval script?"

def kill_labels(bio_seq):
    """Return a copy of ``bio_seq`` with the type suffix removed from B/I tags.

    Tags beginning with "B" become "B", tags beginning with "I" become "I";
    every other tag is passed through unchanged.
    """
    def _bare(tag):
        if not re.search("^[BI]", tag):
            return tag
        without_b_label = re.sub("^B.*", "B", tag)
        return re.sub("^I.*", "I", without_b_label)

    return [_bare(tag) for tag in bio_seq]

def evaluate_taggings(goldseq_predseq_pairs, ignore_labels=False):
    """Span-level precision / recall / F1 over (gold, predicted) tag sequences.

    Args:
        goldseq_predseq_pairs: iterable of ``(goldtags, predtags)`` pairs.
            Both are lists of BIO tag strings of the same length.
        ignore_labels: when True, span types are dropped (via ``kill_labels``)
            so that only span boundaries are compared.

    Returns:
        The F1 score. Precision and recall default to 1 when their
        denominator is zero. F1 is 0.0 when precision and recall are both 0
        (previously this raised ZeroDivisionError).

    Side effects:
        Prints a two-line report (scores, then corpus counts).
    """
    num_sent = 0
    num_tokens = 0
    num_goldspans = 0
    num_predspans = 0

    tp, fp, fn = 0, 0, 0

    for goldseq, predseq in goldseq_predseq_pairs:
        N = len(goldseq)
        assert N == len(predseq), "gold and predicted sequences differ in length"
        num_sent += 1
        num_tokens += N

        if ignore_labels:
            goldseq = kill_labels(goldseq)
            predseq = kill_labels(predseq)

        goldspans = convert_bio_to_spans(goldseq)
        predspans = convert_bio_to_spans(predseq)

        num_goldspans += len(goldspans)
        num_predspans += len(predspans)

        goldspans_set = set(goldspans)
        predspans_set = set(predspans)

        # tp: number of spans that gold and pred have
        # fp: number of spans that pred had that gold didn't (incorrect predictions)
        # fn: number of spans that gold had that pred didn't (didn't recall)
        tp += len(goldspans_set & predspans_set)
        fp += len(predspans_set - goldspans_set)
        fn += len(goldspans_set - predspans_set)

    prec = tp / (tp + fp) if (tp + fp) > 0 else 1
    rec = tp / (tp + fn) if (tp + fn) > 0 else 1
    # prec == rec == 0 happens when no predicted span matches any gold span.
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0

    print("F = {f1:.4f},  Prec = {prec:.4f} ({tp}/{tpfp}),  Rec = {rec:.4f} ({tp}/{tpfn})".format(
        f1=f1, prec=prec, rec=rec, tp=tp, tpfp=tp + fp, tpfn=tp + fn))
    print("({num_sent} sentences, {num_tokens} tokens, {num_goldspans} gold spans, {num_predspans} predicted spans)".format(
        num_sent=num_sent, num_tokens=num_tokens,
        num_goldspans=num_goldspans, num_predspans=num_predspans))
    return f1

def read_tokens_tags_file(filename):
    """Read a token<TAB>tag file with blank lines between sentences.

    Args:
        filename: path of the file to read.

    Returns:
        List of sentences. Each sentence is a pair ``(tokens, tags)``, both
        lists of strings of the same length. An empty (or whitespace-only)
        file yields an empty list.

    Raises:
        AssertionError: if a line does not hold exactly 2 tab-separated items.
    """
    # The context manager closes the handle even if reading fails.
    with open(filename) as fh:
        content = fh.read().strip()
    if not content:
        return []

    ret = []
    for sent in content.split("\n\n"):
        lines = sent.strip().split("\n")
        pairs = [L.split("\t") for L in lines]
        for pair in pairs:
            assert len(pair)==2, "Was expecting 2 tab-separated items per line."
        tokens = [tok for tok, tag in pairs]
        tags = [tag for tok, tag in pairs]
        ret.append((tokens, tags))
    return ret

def read_tags_file(filename):
    """Read a file with one tag per line and blank lines between sentences.

    Args:
        filename: path of the file to read.

    Returns:
        List of sentences, each a list of tag strings. An empty (or
        whitespace-only) file yields an empty list.

    Raises:
        AssertionError: if a line does not hold exactly 1 item.
    """
    # The context manager closes the handle even if reading fails.
    with open(filename) as fh:
        content = fh.read().strip()
    if not content:
        return []

    ret = []
    for sent in content.split("\n\n"):
        lines = sent.strip().split("\n")
        for line in lines:
            assert len(line.split())==1, "Was expecting 1 item per line"
        ret.append([line.strip() for line in lines])
    return ret

def evaluate_tagging_file(gold_tags_file, predicted_tags_file):
    """Score a file of predicted tags against a gold token/tag file.

    The gold file is read with ``read_tokens_tags_file`` (token<TAB>tag
    lines); the prediction file is read with ``read_tags_file`` (the simple
    crfsuite output format: one bare tag per line, blank lines separating
    sentences).  The report is printed by ``evaluate_taggings``; nothing is
    returned.
    """
    # Only the tag column of the gold file is needed.
    gold_tag_seqs = [pair[1] for pair in read_tokens_tags_file(gold_tags_file)]

    # Alternative prediction format (token<TAB>tag): read it with
    # read_tokens_tags_file and keep the tag column instead.
    pred_tag_seqs = read_tags_file(predicted_tags_file)

    assert len(gold_tag_seqs) == len(pred_tag_seqs)

    print("Span-level NER evaluation")
    # Only the untyped evaluation (is the span a name or not?) is run here;
    # the typed variant would call evaluate_taggings without ignore_labels.
    paired = list(zip(gold_tag_seqs, pred_tag_seqs))
    evaluate_taggings(paired, ignore_labels=True)

In [3]:
import json
import pandas as pd
import numpy as np
import sklearn_crfsuite
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, f1_score
from nltk.stem import WordNetLemmatizer
import nltk
import eli5
import scipy
import tqdm
import copy
# Fetch the WordNet corpus through nltk's downloader (the recorded cell
# output shows the package was already up to date on the author's machine).
nltk.download('wordnet')
# Module-level WordNet lemmatizer instance.
LEMMATIZER = WordNetLemmatizer()
# Sentinel tokens: extract_feature_crf pads two START and two END markers
# around every sentence (and its POS-tag list), so the i-1 / i+1 neighbour
# lookups stay in range for the first and last real token.
START1 = 'START1'
START2 = 'START2'
END1 = 'END1'
END2 = 'END2'

[nltk_data] Downloading package wordnet to /Users/leon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def data_loader(file, isTrain=False):
    """Load a whitespace-separated token file and build CRF features.

    The file holds one token per line (first column) and, for labelled
    data, its tag in the second column.  A blank line ends a sentence.

    Args:
        file: path of the file to read.
        isTrain: when True, the second column of every token line is
            collected as that token's label.

    Returns:
        ``(features, labels)``.  ``features`` is the result of
        ``extract_feature_crf`` on the sentences.  ``labels`` has one list
        per sentence; the lists are empty when ``isTrain`` is False.

    Compared with the original, blank lines are detected by content rather
    than by ``len(line) is 1`` (an identity test on ints), the file is closed
    via a context manager, and a final sentence that is not followed by a
    blank line is kept instead of being silently dropped.
    """
    sentences = []
    labels = []
    sentence = []
    label = []

    with open(file, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank (or whitespace-only) line: sentence boundary.
                sentences.append(sentence)
                sentence = []
                labels.append(label)
                label = []
            else:
                sentence.append(fields[0])

                # get the label if is train/dev
                if isTrain:
                    label.append(fields[1])

    # Flush the last sentence when the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    # extract features
    train_X = extract_feature_crf(sentences)

    return train_X, labels

In [5]:
def get_wordClass(word):
    """Classify a token by its ASCII letter-case / digit shape.

    The first matching rule wins:

    * ``'LOWERCASE'``      only a-z
    * ``'allCaps'``        only A-Z
    * ``'initCap'``        one A-Z followed by a-z
    * ``'containCap'``     a-z*, then one or more A-Z, then a-z*
    * ``'containsDigit'``  (case-insensitively) letters*, digits+, letters*
    * ``'UNK'``            anything else, including the empty string

    Each rule must match the whole word.  ``re.fullmatch`` replaces the
    original ``span()[0] is 0 and span()[1] is len(word)`` test, which
    compared ints by identity and therefore failed for long words.
    """
    if re.fullmatch(r'[a-z]+', word):
        return 'LOWERCASE'

    if re.fullmatch(r'[A-Z]+', word):
        return 'allCaps'

    if re.fullmatch(r'[A-Z][a-z]*', word):
        return 'initCap'

    if re.fullmatch(r'[a-z]*[A-Z]+[a-z]*', word):
        return 'containCap'

    # The digit rule ignores case, so it is checked on the lowercased word.
    if re.fullmatch(r'[a-z]*[0-9]+[a-z]*', word.lower()):
        return 'containsDigit'

    return 'UNK'

In [6]:
def extract_feature_crf(sentences):
    """Build one feature dict per token for every sentence.

    Each sentence (a list of token strings) is POS-tagged with
    ``nltk.pos_tag`` and padded with the START/END sentinels.  For every
    real token the dict holds, in this insertion order: the ``word_process``
    features of the current, previous and next token, then the token's
    prefixes and suffixes of length 1 to 4.

    Returns a list (one entry per sentence) of lists of feature dicts.
    """
    all_features = []
    for raw_sentence in sentences:
        pos_tags = [pos for _, pos in nltk.pos_tag(raw_sentence)]
        padded_tags = [START1, START2] + pos_tags + [END1, END2]
        padded_words = [START1, START2] + raw_sentence + [END1, END2]

        token_features = []
        # Skip the two sentinels at each end.
        for idx in range(2, len(padded_words) - 2):
            feats = {}
            # Context window; the +/-2 neighbours are currently not used.
            for offset, role in ((0, 'cur'), (-1, 'prev'), (1, 'next')):
                word_process(padded_words[idx + offset], padded_tags[idx + offset], role, feats)

            # more customized feature
            current = padded_words[idx]
            feats.update({
                'pre-1': current[:1],
                'pre-2': current[:2],
                'pre-3': current[:3],
                'pre-4': current[:4],
                'post-1': current[-1:],
                'post-2': current[-2:],
                'post-3': current[-3:],
                'post-4': current[-4:],
            })

            token_features.append(feats)
        all_features.append(token_features)

    return all_features

def extract_feature_log(train_X_crf, labels=None):
    """Flatten per-sentence CRF features into per-token rows.

    Args:
        train_X_crf: list of sentences, each a list of feature dicts.
        labels: optional list of sentences, each a list of tag strings.

    Returns:
        ``(features, Y)``.  ``features`` has one list per token holding that
        token's feature values in dict order.  ``Y`` is a numpy array with
        one entry per label: 0 for the tag ``'O'`` and 1 for any other tag.
        ``Y`` is empty when ``labels`` is None.

    The original looped over the characters of each tag, which only gave one
    target per token for single-character tags; the whole tag is compared
    here, so typed tags such as "B-PER" also produce exactly one target.
    """
    features = [list(word.values()) for sen in train_X_crf for word in sen]

    Y = []
    if labels is not None:
        for sentence_tags in labels:
            for tag in sentence_tags:
                Y.append(0 if tag == 'O' else 1)

    return features, np.array(Y)


In [7]:
def word_process(word, pos_tag, prefix, feature_dict):
    """Add the features of one token to ``feature_dict``.

    Args:
        word: the token string.
        pos_tag: the token's part-of-speech tag.
        prefix: role of the token in the window (e.g. 'cur', 'prev', 'next');
            it is prepended to every feature key.
        feature_dict: dict that is updated in place with the new features.

    Returns:
        A dict holding only the features added by this call, in the order
        ``<prefix>-pos-tag``, ``<prefix>-word-class``, ``<prefix>-word``.

    The lemma is no longer computed: its feature was disabled, so the
    per-token WordNet lookup was pure overhead.
    """
    _dict = {
        prefix + '-pos-tag': pos_tag,
        prefix + '-word-class': get_wordClass(word),
        prefix + '-word': word,
    }

    # Disabled feature experiments (uncomment to re-enable):
    #     _dict[prefix+'-lemma'] = LEMMATIZER.lemmatize(word)
    #     _dict[prefix+'-len'] = len(word)
    #     _dict[prefix+'-pre-1'] = word[:1]     # likewise pre-2 .. pre-4
    #     _dict[prefix+'-post-1'] = word[-1:]   # likewise post-2 .. post-4

    feature_dict.update(_dict)
    return _dict

In [8]:
def write2file(file_path, predict):
    """Write predicted tags to ``file_path``, one tag per line.

    Args:
        file_path: destination path; the file is overwritten (UTF-8).
        predict: iterable of sentences, each an iterable of tag strings.

    Every sentence is followed by one blank line.  The context manager
    closes the file even when a write raises.
    """
    with open(file_path, "w", encoding='utf-8') as fh:
        for sen in predict:
            for tag in sen:
                fh.write(tag+'\n')
            fh.write('\n')

def convert_t0_obi(res):
    """Map a flat sequence of class indices to 'O' / 'B' / 'I' tags.

    An index is looked up in ``['O', 'B', 'I']``, except that a 1 directly
    after a 'B' or 'I' tag continues that span and becomes 'I'.
    """
    tag_names = ['O', 'B', 'I']
    tags = []
    for code in res:
        # The very first item has no predecessor, so it is a plain lookup.
        inside_span = bool(tags) and tags[-1] in ('B', 'I')
        if inside_span and code == 1:
            tags.append('I')
        else:
            tags.append(tag_names[code])
    return tags

def convertLog2Crf(predict_log, train_crf):
    """Regroup a flat prediction sequence into per-sentence lists.

    ``train_crf`` supplies the sentence lengths: consecutive items of
    ``predict_log`` are dealt out so that the result has the same nested
    shape as ``train_crf``.
    """
    grouped = []
    cursor = 0
    for sentence in train_crf:
        width = len(sentence)
        grouped.append([predict_log[cursor + step] for step in range(width)])
        cursor += width
    return grouped

def ensemble_union(results):
    """Union several tagging outputs, preferring earlier ones.

    Args:
        results: non-empty sequence of predictions.  Each prediction is a
            list of sentences and each sentence a list of tag strings.  All
            predictions must have the same shape.

    Returns:
        A deep copy of ``results[0]`` in which every "O" position is replaced
        by the first non-"O" tag that a later prediction has at that
        position.  The inputs are left unmodified.

    Raises:
        ValueError: if ``results`` is empty.
        AssertionError: if the predictions differ in shape.

    The original read ``result`` before assigning it (UnboundLocalError on
    every call) and compared tags with ``is`` instead of ``==``.
    """
    if not results:
        raise ValueError("ensemble_union needs at least one prediction")

    result = results[0]
    _len = len(result)
    for r in results:
        assert len(r) == _len

    output = copy.deepcopy(result)

    # results[0] is already the starting point, so only the rest can add tags.
    for other in results[1:]:
        for i, sentence in enumerate(other):
            assert len(sentence) == len(output[i]), "sentence lengths differ between predictions"
            for j, tag in enumerate(sentence):
                if tag != "O" and output[i][j] == "O":
                    output[i][j] = tag

    return output
        

In [9]:
# Build CRF-style features (per sentence, one feature dict per token) and
# labels for each split. The test file is loaded with isTrain=False, so its
# returned labels are discarded.
train_X_crf, train_y_crf = data_loader('data/train/train.txt', isTrain=True)
dev_X_crf, dev_y_crf = data_loader('data/dev/dev.txt', isTrain=True)
test_X_crf, _ = data_loader('data/test/test.nolabels.txt', isTrain=False)

# get log features: flatten to one row of feature values per token, plus
# 0/1 targets ('O' vs. anything else) for the logistic-regression model.
train_X_log, train_y_log = extract_feature_log(train_X_crf, labels=train_y_crf)
dev_X_log, dev_y_log = extract_feature_log(dev_X_crf, labels=dev_y_crf)
test_X_log, _ = extract_feature_log(test_X_crf)

# Stack train, dev and test rows (in that order) so that one DictVectorizer
# is fitted on all three; later cells slice rows back out by position
# using len(train_X_log) and len(dev_X_log).
X_dataframe = pd.DataFrame(np.array(train_X_log + dev_X_log + test_X_log))
# NOTE(review): DictVectorizer is already imported in the import cell; this
# second import is redundant.
from sklearn.feature_extraction import DictVectorizer
vectorizer = DictVectorizer(sparse=True)
one_hot_x_training = vectorizer.fit_transform(X_dataframe.to_dict("records"))

In [34]:
# Peek at the first vectorized row (the recorded output shows a 1-row sparse matrix).
one_hot_x_training[0]

<1x107601 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [14]:
# Fit the logistic-regression token classifier on the training rows only,
# i.e. the first len(train_X_log) rows of the stacked one-hot matrix.
# NOTE(review): C=1e7 is very large; if C is the inverse regularization
# strength (as in scikit-learn's docs) the model is close to unregularized.
# Confirm that this is intended.
from sklearn import linear_model
logistic = linear_model.LogisticRegression(C=1e7)
logistic.fit(one_hot_x_training[0:len(train_X_log)], train_y_log)

LogisticRegression(C=10000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [None]:
# Predict a class per dev token (the dev rows follow the train rows in the
# stacked matrix) and turn the flat predictions into O/B/I tags.
# NOTE(review): convert_t0_obi sees all dev tokens as one flat stream, so a
# span can continue across a sentence boundary (a sentence may open with
# "I"). Verify that this is acceptable.
dev_predict_log_flat = convert_t0_obi(logistic.predict(
    one_hot_x_training[len(train_X_log) : len(train_X_log) + len(dev_X_log)]))

In [None]:
# Regroup the flat tag list into per-sentence lists shaped like dev_X_crf,
# then score it against the gold dev tags. evaluate_taggings prints its own
# report; the outer print additionally echoes the returned F1.
dev_predict_log = convertLog2Crf(dev_predict_log_flat, dev_X_crf)
print (evaluate_taggings(list(zip(dev_y_crf, dev_predict_log)), ignore_labels=True ))

In [10]:
# Train the sklearn_crfsuite CRF tagger on the per-sentence feature dicts.
# c1 / c2 are presumably the L1 / L2 regularization coefficients of the
# lbfgs trainer — TODO confirm against the sklearn-crfsuite documentation.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0,
    c2=0.0005,
    max_iterations=1000,
    all_possible_transitions=False,
)
# The trailing ';' suppresses the repr of the fitted estimator.
crf.fit(train_X_crf, train_y_crf);

In [18]:
%%time
# Sanity check: score the CRF on its own training data (span level, span
# types ignored).
train_predict = crf.predict(train_X_crf)
evaluate_taggings(list(zip(train_y_crf, train_predict)), ignore_labels=True )

F = 0.9943,  Prec = 0.9966 (1483/1488),  Rec = 0.9920 (1483/1495)
(2394 sentences, 46469 tokens, 1495 gold spans, 1488 predicted spans)
CPU times: user 739 ms, sys: 7.46 ms, total: 746 ms
Wall time: 750 ms


In [17]:
# Score the CRF on the dev split. `predict` is reused by later cells
# (ensemble_union and write2file).
predict = crf.predict(dev_X_crf)
evaluate_taggings(list(zip(dev_y_crf, predict)), ignore_labels=True)

F = 0.4973,  Prec = 0.6436 (186/289),  Rec = 0.4052 (186/459)
(959 sentences, 13360 tokens, 459 gold spans, 289 predicted spans)


0.49732620320855614

In [None]:
# (Stray unfinished cell: the bare name `trai` was undefined and raised
# NameError on Restart & Run All, so the expression was removed.)

In [None]:
# Combine the CRF dev predictions (`predict`) with the logistic-regression
# dev predictions (`dev_predict_log`, built in an earlier cell) and score the
# ensemble; the CRF output is listed first.
en_output = ensemble_union([predict, dev_predict_log])
print (evaluate_taggings(list(zip(dev_y_crf, en_output)), ignore_labels=True ))

In [28]:
# Inspect the 'cur-word' feature of the token at index 10 in the first dev sentence.
print (dev_X_crf[0][10]['cur-word'])

🎢


In [177]:
# Round-trip check: write the CRF dev predictions to disk, then re-score them
# through the file-based evaluator against the gold dev file.
write2file('dev_predict.txt', predict)
evaluate_tagging_file('data/dev/dev.txt', 'dev_predict.txt')

Span-level NER evaluation


In [35]:
# Tag the test split with the trained CRF and write the predictions file.
# The test features returned by data_loader are held in `test_X_crf`; the
# name `test_X` used here before is never defined in this notebook.
write2file('results/test_predictions.out', crf.predict(test_X_crf))

In [15]:
# Show the 10 largest weights of the logistic model.
# NOTE(review): the recorded output lists features as x<index>; passing the
# fitted vectorizer to eli5 would presumably give readable names. Confirm
# against the eli5 documentation.
eli5.show_weights(logistic, top=10)

Weight?,Feature
+20.739,x76947
+19.934,x74773
+19.771,x105829
+19.731,x82143
+19.449,x95487
+18.028,x98049
+17.662,x105431
+17.344,x77871
+16.508,x79936
+16.465,x94309
