In [1]:
%load_ext autoreload
%autoreload 2

from util import readAllSeqLabelFigure,cleanFigureAnnotation, blockBIO, sortFigureAnnotation, \
    flatten,BIO2FigureLabel, read_passages

from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

from figureSpanExtractor import extractDocumentFigureSpan

[nltk_data] Downloading package punkt to /Users/xiangcili/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Paths of the two inference inputs that are loaded with read_passages:
# the passages to tag and the discourse labels for those same passages.
# NOTE(review): presumably "test.out" is the output of an upstream discourse
# tagger run on the input file -- confirm.
inference_input_file = "test_coronavirus.txt"
inference_discourse_label_file = "test.out"

In [3]:
# Load the inference passages and their discourse labels. Only the first
# value returned by read_passages is kept; the second one is discarded.
# NOTE(review): the meaning of the positional `False` flag is defined in
# util.read_passages and is not visible here -- confirm before changing it.
test_str_seqs, _ = read_passages(inference_input_file, False)
test_label_seqs, _ = read_passages(inference_discourse_label_file, False)

In [5]:
# Figure mentions extracted from the inference text, then sorted and cleaned
# with the same "NaN" placeholder that the training pipeline uses.
test_figure_appearances = cleanFigureAnnotation(
    sortFigureAnnotation(
        extractDocumentFigureSpan(test_str_seqs),
        placeholder="NaN",
    ),
    placeholder="NaN",
)

In [6]:
def process(trainfilename):
    """Load the annotated training data and normalise its figure annotations.

    Args:
        trainfilename: location handed unchanged to ``readAllSeqLabelFigure``.

    Returns:
        A 6-tuple ``(str_seqs, label_seqs, figure_seqs, figure_appearances,
        figure_BIO, str_seq_lens)``. ``figure_seqs`` and ``figure_appearances``
        have been sorted and then cleaned with the ``"NaN"`` placeholder, and
        ``figure_BIO`` is what ``blockBIO`` derives from the cleaned
        ``figure_seqs``.
    """
    (str_seqs, label_seqs, figure_seqs,
     figure_appearances, str_seq_lens) = readAllSeqLabelFigure(trainfilename)

    # Sort both annotation streams first, then clean both; within each step the
    # appearances are handled before the gold figure sequences.
    for normalise in (sortFigureAnnotation, cleanFigureAnnotation):
        figure_appearances = normalise(figure_appearances, placeholder="NaN")
        figure_seqs = normalise(figure_seqs, placeholder="NaN")

    figure_BIO = blockBIO(figure_seqs, placeholder="NaN")
    return (str_seqs, label_seqs, figure_seqs,
            figure_appearances, figure_BIO, str_seq_lens)

In [7]:
# Directory that holds the annotated *_spans.tsv training files. Despite the
# name this is a directory, not a single file; it is passed to process().
trainfilename = "train+test/all/"

In [8]:
# Load and pre-process the whole training set: sentences, discourse labels,
# gold figure annotations, detected figure appearances, BIO tags and lengths.
str_seqs, label_seqs, figure_seqs, figure_appearances, figure_BIO, str_seq_lens = process(trainfilename)

train+test/all/19734906_spans.tsv 14 14
train+test/all/10704436_spans.tsv 5 19
train+test/all/10087260_spans.tsv 12 31
train+test/all/15314656_spans.tsv 5 36
train+test/all/9128250_spans.tsv 10 46
train+test/all/18583988_spans_test.tsv 8 54
train+test/all/11238593_spans.tsv 8 62
train+test/all/10790433_spans.tsv 15 77
train+test/all/10085298_spans.tsv 17 94
train+test/all/14707117_spans.tsv 12 106
train+test/all/16729043_spans_test.tsv 15 121
train+test/all/16602827_spans.tsv 21 142
train+test/all/9625767_spans.tsv 6 148
train+test/all/9971737_spans.tsv 14 162
train+test/all/18604198_spans.tsv 19 181
train+test/all/11777939_spans.tsv 13 194
train+test/all/16848641_spans.tsv 13 207
train+test/all/17276402_spans.tsv 6 213
train+test/all/24835508_spans.tsv 6 219
train+test/all/9700154_spans.tsv 14 233


In [9]:
# BIO tag set of the figure-span labelling task; handed to the F1 scorer so
# that the score is computed over exactly these tags.
labels = ["B","I","O"]

In [10]:
def n_grams(clause, N=1):
    """Return the set of word N-grams of ``clause``, padded with ``"#"``.

    The clause is split on whitespace and padded with ``N - 1`` ``"#"``
    tokens on each side, so the first and last words also appear in N-grams
    that mark the clause boundary. Each N-gram is a single space-joined
    string.

    Args:
        clause: text to tokenise.
        N: N-gram width. Values below 1 behave like ``N=1``.

    Returns:
        A ``set`` of N-gram strings (duplicates collapse). An empty clause
        yields an empty set for ``N=1`` and the all-padding N-gram otherwise.
    """
    width = max(N, 1)
    padding = ["#"] * (width - 1)
    tokens = padding + clause.split() + padding
    window_count = len(tokens) - width + 1
    return {" ".join(tokens[start:start + width]) for start in range(window_count)}

In [11]:
def sentence2features(paragraph, discourses, appearances, i):
    """Build the CRF feature dict for sentence ``i`` of one paragraph.

    Features for the sentence itself: a constant ``bias``, its padded
    uni-/bi-/tri-gram sets, its discourse label and its figure appearance.
    The same lexical, discourse and appearance features are added for the
    previous sentence (keys prefixed ``-1:``) and the next sentence (keys
    prefixed ``+1:``) when they exist. ``BOP`` / ``EOP`` are ``True`` when
    there is no previous / next sentence.

    Args:
        paragraph: sequence of sentence strings.
        discourses: per-sentence discourse labels, aligned with ``paragraph``.
        appearances: per-sentence figure appearances, aligned with ``paragraph``.
        i: index of the sentence to featurise.

    Returns:
        dict mapping feature names to values.
    """
    def lexical(text, prefix=''):
        # Uni-, bi- and tri-gram sets of one sentence under a key prefix.
        return {
            prefix + 'unigrams': n_grams(text, N=1),
            prefix + 'bigrams': n_grams(text, N=2),
            prefix + 'trigrams': n_grams(text, N=3),
        }

    has_previous = i > 0
    has_next = i < len(paragraph) - 1

    features = {'bias': 1.0}
    features.update(lexical(paragraph[i]))
    features['BOP'] = not has_previous
    features['EOP'] = not has_next
    features['discourse'] = discourses[i]  # Include scientific discourses as input features.
    features['appearance'] = appearances[i]

    # Context window of one sentence on each side, previous first.
    for present, offset, prefix in ((has_previous, -1, '-1:'), (has_next, 1, '+1:')):
        if not present:
            continue
        neighbour = i + offset
        features.update(lexical(paragraph[neighbour], prefix))
        features[prefix + 'discourse'] = discourses[neighbour]
        features[prefix + 'appearance'] = appearances[neighbour]

    return features

In [12]:
def paragraph2features(paragraph, label_para, appearance_para):
    """Return one feature dict per sentence of ``paragraph``, in order.

    Args:
        paragraph: sequence of sentence strings.
        label_para: per-sentence discourse labels for the paragraph.
        appearance_para: per-sentence figure appearances for the paragraph.

    Returns:
        list of the dicts produced by ``sentence2features`` for each index.
    """
    feature_sequence = []
    for position in range(len(paragraph)):
        feature_sequence.append(
            sentence2features(paragraph, label_para, appearance_para, position)
        )
    return feature_sequence

In [13]:
# One feature sequence per training paragraph; the targets are the BIO tag
# sequences derived from the gold figure annotations.
X_train = [
    paragraph2features(sentences, discourse_tags, figure_mentions)
    for sentences, discourse_tags, figure_mentions
    in zip(str_seqs, label_seqs, figure_appearances)
]
y_train = figure_BIO

In [14]:
# Base CRF estimator (L-BFGS training, at most 100 iterations) without fixed
# regularisation strengths. It only serves as the template estimator for the
# optional randomized hyper-parameter search; the model used for prediction is
# re-created with fixed c1/c2 further down.
crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

In [15]:
# Sampling distributions for the randomized search: c1 and c2 are drawn from
# exponential distributions with scale 0.5 and 0.05 respectively.
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

In [16]:
# Model-selection metric: weighted-average F1 from sklearn_crfsuite's
# flat_f1_score, restricted to the tags listed in `labels`.
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

In [17]:
# Optional randomized search for the CRF hyper-parameters c1 and c2.
# Empirically the hyper-parameters of the CRF do not affect the results much,
# so the search is disabled by default. Set RUN_HYPERPARAM_SEARCH = True to
# run it. The search code is guarded by a flag instead of being kept inside a
# string literal, so the cell no longer echoes the code as its output.
# NOTE: the next training cell rebuilds `crf` with fixed c1/c2, so copy
# rs.best_params_ there if you want to train with the searched values.
RUN_HYPERPARAM_SEARCH = False

if RUN_HYPERPARAM_SEARCH:
    rs = RandomizedSearchCV(crf, params_space,
                            cv=5,
                            verbose=1,
                            n_jobs=-1,
                            n_iter=10,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)
    crf = rs.best_estimator_

    print('best params:', rs.best_params_)
    print('best CV score:', rs.best_score_)
    print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

"\nrs = RandomizedSearchCV(crf, params_space,\n                        cv=5,\n                        verbose=1,\n                        n_jobs=-1,\n                        n_iter=10,\n                        scoring=f1_scorer)\nrs.fit(X_train, y_train)\ncrf = rs.best_estimator_\n\nprint('best params:', rs.best_params_)\nprint('best CV score:', rs.best_score_)\nprint('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))\n"

In [18]:

# Final model: CRF trained with L-BFGS (at most 100 iterations) and fixed
# regularisation coefficients c1=0.3 and c2=0.1 (the L1 and L2 coefficients
# according to the sklearn-crfsuite CRF documentation).
crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True,
    c1 = 0.3,
    c2 = 0.1
)
# Kept as the last expression so the fitted estimator's repr is displayed.
crf.fit(X_train, y_train)


CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.3, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [19]:
# Feature sequences for the inference paragraphs, built exactly like X_train
# from the sentences, their discourse labels and the detected figure mentions.
X_test = [
    paragraph2features(sentences, discourse_tags, figure_mentions)
    for sentences, discourse_tags, figure_mentions
    in zip(test_str_seqs, test_label_seqs, test_figure_appearances)
]

In [20]:
# Predict a tag sequence for every inference paragraph (one tag per sentence
# feature dict in X_test).
y_pred = crf.predict(X_test)

In [21]:
# Turn the predicted BIO tags into per-sentence figure labels, using the
# figure mentions detected in the text and "NaN" where no figure applies.
# NOTE(review): exact mapping rules live in util.BIO2FigureLabel -- confirm.
pred_figure = BIO2FigureLabel(y_pred, test_figure_appearances, placeholder="NaN")

In [22]:
# Print the predicted figure labels, one line per paragraph, numbered from 1.
for para_number, para in enumerate(pred_figure, start=1):
    print(para_number, para)

1 [['NaN'], ['NaN']]
2 [['NaN'], ['NaN'], ['NaN'], ['NaN']]
3 [['NaN'], ['NaN'], ['NaN'], ['NaN']]
4 [['NaN'], ['NaN']]
5 [['NaN'], ['NaN']]
6 [['NaN'], ['NaN'], ['NaN']]
7 [['NaN'], ['NaN']]
8 [['NaN'], ['NaN'], ['NaN'], ['NaN']]
9 [['NaN'], ['NaN'], ['NaN'], ['NaN'], ['NaN']]
10 [['NaN'], ['NaN'], ['NaN']]
11 [['NaN'], ['NaN'], ['NaN'], ['1'], ['NaN'], ['NaN'], ['NaN'], ['NaN'], ['NaN']]
12 [['NaN'], ['NaN'], ['NaN'], ['NaN'], ['NaN'], ['NaN'], ['NaN']]
13 [['NaN'], ['NaN'], ['NaN'], ['NaN'], ['NaN'], ['NaN']]
14 [['2a', '2b'], ['2a', '2b']]
15 [['NaN'], ['NaN']]
16 [['NaN'], ['1', '2'], ['1', '2'], ['1', '2'], ['1', '2'], ['1', '2'], ['1', '2']]
17 [['NaN'], ['NaN']]
18 [['NaN'], ['NaN'], ['NaN'], ['NaN'], ['NaN'], ['3'], ['3'], ['3']]
19 [['NaN'], ['NaN'], ['NaN'], ['NaN'], ['NaN']]
20 [['NaN'], ['NaN'], ['NaN']]
21 [['NaN'], ['NaN'], ['NaN'], ['NaN'], ['NaN'], ['NaN']]
22 [['NaN'], ['NaN']]
23 [['NaN'], ['NaN'], ['NaN'], ['NaN'], ['NaN']]
24 [['NaN']]
25 [['NaN'], ['NaN'], ['NaN']

In [24]:
# Write the predictions as a TSV file: one line per sentence in the form
# "sentence<TAB>discourse label<TAB>figure labels joined by '|'", with an
# empty line after every paragraph.
out_file = "test_span.txt"
# The encoding is pinned to UTF-8: the default is locale-dependent, so
# non-ASCII characters in the passages could otherwise be written differently
# (or fail to encode) depending on the machine running the notebook.
with open(out_file, "w", encoding="utf-8") as f:
    for para_seq, para_label, para_span in zip(test_str_seqs, test_label_seqs, pred_figure):
        for seq, label, span in zip(para_seq, para_label, para_span):
            f.write(seq + "\t" + label + "\t" + "|".join(span) + "\n")
        f.write("\n")