In [1]:
%load_ext autoreload
%autoreload 2

import os

import numpy as np
import scipy.stats
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics

from util import readAllSeqLabelFigure,cleanFigureAnnotation, blockBIO, sortFigureAnnotation, \
    flatten, candidatesPerParagraph,BIO2FigureLabel, computeF1, computePaperF1

[nltk_data] Downloading package punkt to /Users/lixiangci/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def process(trainfilename):
    """Load one data split and derive the structures used by the CRF cells.

    Despite the parameter name, this notebook calls the function for both the
    training and the test location.

    Args:
        trainfilename: Location passed straight to `readAllSeqLabelFigure`.

    Returns:
        A 9-tuple:
        (str_seqs, label_seqs, figure_seqs, figure_appearances, candidate_seqs,
        figure_BIO, original_figure_appearances, original_figure_seqs,
        str_seq_lens).
        Positions 3/8 (figure sequences) and 4/7 (figure appearances) refer to
        the same sorted-then-cleaned objects.
    """
    str_seqs, label_seqs, raw_figure_seqs, raw_appearances, str_seq_lens = readAllSeqLabelFigure(trainfilename)

    # Sort first, then clean; both steps use the same "NaN" placeholder.
    sorted_appearances = sortFigureAnnotation(raw_appearances, placeholder="NaN")
    sorted_figure_seqs = sortFigureAnnotation(raw_figure_seqs, placeholder="NaN")

    # NOTE(review): the candidates computed from the sorted-but-uncleaned
    # sequences are never returned (they are recomputed below). The call is
    # kept in place in case the helper has side effects -- confirm and drop.
    candidatesPerParagraph(sorted_figure_seqs)

    figure_appearances = cleanFigureAnnotation(sorted_appearances, placeholder="NaN")
    figure_seqs = cleanFigureAnnotation(sorted_figure_seqs, placeholder="NaN")

    candidate_seqs = candidatesPerParagraph(figure_seqs)
    figure_BIO = blockBIO(figure_seqs, placeholder="NaN")

    return (
        str_seqs,
        label_seqs,
        figure_seqs,
        figure_appearances,
        candidate_seqs,
        figure_BIO,
        figure_appearances,  # returned again as "original_figure_appearances"
        figure_seqs,         # returned again as "original_figure_seqs"
        str_seq_lens,
    )

In [3]:
# Root directory that contains the `train+dev/` and `test/` folders of tsv files.
# Set the FIGURE_DATA_ROOT environment variable to point at your own copy instead
# of editing the notebook; the default is the original author's local path.
DATA_ROOT = os.environ.get("FIGURE_DATA_ROOT", "/Users/lixiangci/Downloads/train+test").rstrip("/")

# The trailing slash is kept so the resulting strings match the original
# hard-coded values exactly when the default root is used.
trainfilename = DATA_ROOT + "/train+dev/"
testfilename = DATA_ROOT + "/test/"

In [4]:
# Load and preprocess the training split.
(
    str_seqs,
    label_seqs,
    figure_seqs,
    figure_appearances,
    candidate_seqs,
    figure_BIO,
    original_figure_appearances,
    original_figure_seqs,
    str_seq_lens,
) = process(trainfilename)

# Load and preprocess the test split with the same pipeline.
(
    test_str_seqs,
    test_label_seqs,
    test_figure_seqs,
    test_figure_appearances,
    test_candidate_seqs,
    test_figure_BIO,
    test_original_figure_appearances,
    test_original_figure_seqs,
    test_str_seq_lens,
) = process(testfilename)

/Users/lixiangci/Downloads/train+test/train+dev/19734906_spans.tsv 14 14
/Users/lixiangci/Downloads/train+test/train+dev/10087260_spans.tsv 12 26
/Users/lixiangci/Downloads/train+test/train+dev/15314656_spans.tsv 5 31
/Users/lixiangci/Downloads/train+test/train+dev/9128250_spans.tsv 10 41
/Users/lixiangci/Downloads/train+test/train+dev/11238593_spans.tsv 8 49
/Users/lixiangci/Downloads/train+test/train+dev/10790433_spans.tsv 15 64
/Users/lixiangci/Downloads/train+test/train+dev/10085298_spans.tsv 17 81
/Users/lixiangci/Downloads/train+test/train+dev/14707117_spans.tsv 12 93
/Users/lixiangci/Downloads/train+test/train+dev/16602827_spans.tsv 21 114
/Users/lixiangci/Downloads/train+test/train+dev/9625767_spans.tsv 6 120
/Users/lixiangci/Downloads/train+test/train+dev/18604198_spans.tsv 19 139
/Users/lixiangci/Downloads/train+test/train+dev/11777939_spans.tsv 13 152
/Users/lixiangci/Downloads/train+test/train+dev/16848641_spans.tsv 13 165
/Users/lixiangci/Downloads/train+test/train+dev/172

In [5]:
# Tag inventory of the BIO scheme, passed to the F1 scorer.
labels = list("BIO")

In [6]:
def n_grams(clause, N=1):
    """Return the set of word N-grams of `clause`, padded with "#" at both ends.

    The clause is split on whitespace. For N > 1 the token list is padded with
    N-1 "#" markers on each side, so every word appears in every position of
    some N-gram. Each N-gram is returned as a single space-joined string.

    Args:
        clause: Text to tokenize with `str.split()`.
        N: N-gram order. Values below 1 yield the same result as N=1.

    Returns:
        A set of N-gram strings (duplicates collapse).
    """
    # Orders below 1 degenerate to plain unigrams, so clamp the window width.
    width = max(N, 1)
    padding = ["#"] * (width - 1)
    tokens = padding + clause.split() + padding

    # Slide a window of `width` tokens across the padded sequence.
    window_count = len(tokens) - width + 1
    return {" ".join(tokens[start:start + width]) for start in range(window_count)}

In [7]:
def sentence2features(paragraph, discourses, candidates, appearances, i):
    """Build the CRF feature dict for sentence `i` of a paragraph.

    Features cover the sentence itself plus, where they exist, its immediate
    predecessor ("-1:" keys) and successor ("+1:" keys): uni-/bi-/tri-gram
    sets, the discourse entry and the appearance entry. `BOP` / `EOP` flag a
    sentence without a predecessor / successor.

    Args:
        paragraph: Sequence of sentence strings.
        discourses: Per-sentence discourse values, aligned with `paragraph`.
        candidates: Currently unused (the candidate feature is disabled).
        appearances: Per-sentence appearance values, aligned with `paragraph`.
        i: Index of the sentence to featurize.

    Returns:
        dict mapping feature names to values.
    """
    has_previous = i > 0
    has_next = i < len(paragraph) - 1
    current = paragraph[i]

    features = {
        'bias': 1.0,
        'unigrams': n_grams(current, N=1),
        'bigrams': n_grams(current, N=2),
        'trigrams': n_grams(current, N=3),
        'BOP': not has_previous,
        'EOP': not has_next,
        'discourse': discourses[i],  # Include scientific discourses as input features.
        'appearance': appearances[i],
    }

    # Context window: previous sentence first, then the next one, so the key
    # insertion order stays the same as before.
    for tag, j, available in (('-1:', i - 1, has_previous), ('+1:', i + 1, has_next)):
        if not available:
            continue
        neighbour = paragraph[j]
        features[tag + 'unigrams'] = n_grams(neighbour, N=1)
        features[tag + 'bigrams'] = n_grams(neighbour, N=2)
        features[tag + 'trigrams'] = n_grams(neighbour, N=3)
        features[tag + 'discourse'] = discourses[j]
        features[tag + 'appearance'] = appearances[j]

    return features

In [8]:
def paragraph2features(paragraph, label_para, candidate_para, appearance_para):
    """Featurize every sentence of a paragraph.

    Args:
        paragraph: Sequence of sentence strings.
        label_para: Per-sentence discourse values for the paragraph.
        candidate_para: Candidate values for the paragraph (forwarded as-is).
        appearance_para: Per-sentence appearance values for the paragraph.

    Returns:
        List with one `sentence2features` dict per sentence, in order.
    """
    per_sentence = []
    for position in range(len(paragraph)):
        per_sentence.append(
            sentence2features(paragraph, label_para, candidate_para, appearance_para, position)
        )
    return per_sentence

In [9]:
# One feature sequence per paragraph; `map` walks the four parallel lists in
# lock-step and stops at the shortest, exactly like `zip`.
X_train = list(map(paragraph2features, str_seqs, label_seqs, candidate_seqs, figure_appearances))
y_train = figure_BIO

X_test = list(map(paragraph2features, test_str_seqs, test_label_seqs, test_candidate_seqs, test_figure_appearances))
y_test = test_figure_BIO

In [10]:
# Untuned base estimator; it is the model handed to the (optional)
# randomized hyperparameter search.
crf = CRF(
    all_possible_transitions=True,
    max_iterations=100,
    algorithm='lbfgs',
)

In [11]:
# Sampling distributions for the CRF regularization coefficients used by the
# randomized search.
params_space = dict(
    c1=scipy.stats.expon(scale=0.5),
    c2=scipy.stats.expon(scale=0.05),
)

In [12]:
# Scorer for model selection: weighted F1 over the flattened B/I/O tags.
f1_scorer = make_scorer(
    metrics.flat_f1_score,
    labels=labels,
    average='weighted',
)

In [13]:
# Optional hyperparameter search, deliberately disabled by wrapping it in a
# string literal. To run it, remove the surrounding triple quotes.
# It samples `c1`/`c2` from `params_space` (10 iterations, 5-fold CV, scored
# with `f1_scorer`), replaces `crf` with the best estimator and prints the best
# parameters, the best CV score and the model size.
# Empirically the CRF hyperparameters do not affect the results much, and the
# following cell re-creates `crf` with fixed `c1`/`c2` values anyway.
# Note: as the last expression of the cell, the string is echoed as its output.
"""
rs = RandomizedSearchCV(crf, params_space,
                        cv=5,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=10,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)
crf = rs.best_estimator_

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))
"""

"\nrs = RandomizedSearchCV(crf, params_space,\n                        cv=5,\n                        verbose=1,\n                        n_jobs=-1,\n                        n_iter=10,\n                        scoring=f1_scorer)\nrs.fit(X_train, y_train)\ncrf = rs.best_estimator_\n\nprint('best params:', rs.best_params_)\nprint('best CV score:', rs.best_score_)\nprint('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))\n"

In [14]:

# Final model: same L-BFGS setup as the base estimator, with fixed
# regularization coefficients (c1 = L1, c2 = L2), trained on the training split.
crf = CRF(
    c1=0.3,
    c2=0.1,
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)


CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.3, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [15]:
def test_f1(label_seqs, pred_label_seqs):
    """Weighted F1 between gold and predicted label sequences.

    Both inputs are flattened with `flatten` before being compared, so the
    score is computed over individual labels rather than whole sequences.

    Args:
        label_seqs: Gold label sequences.
        pred_label_seqs: Predicted label sequences, aligned with `label_seqs`.

    Returns:
        The value of `f1_score(..., average="weighted")`.
    """
    return f1_score(flatten(label_seqs), flatten(pred_label_seqs), average="weighted")

In [16]:
# Predict BIO tags for the training split first, then for the test split.
y_train_pred, y_pred = map(crf.predict, (X_train, X_test))

In [17]:
# Weighted F1 of the BIO tags on the data the model was fitted on.
f1 = test_f1(y_train, y_train_pred)
print("Training set BIO F1 score:", f1)

Training set BIO F1 score: 1.0


In [18]:
# Weighted F1 of the BIO tags on the held-out test split.
f1 = test_f1(y_test, y_pred)
print("Test set BIO F1 score:", f1)

Test set BIO F1 score: 0.8159948979591838


In [19]:
# Convert the predicted BIO tags back into figure labels, using each split's
# figure appearances and the same "NaN" placeholder as in preprocessing.
train_pred_figure = BIO2FigureLabel(
    y_train_pred,
    original_figure_appearances,
    placeholder="NaN",
)
test_pred_figure = BIO2FigureLabel(
    y_pred,
    test_original_figure_appearances,
    placeholder="NaN",
)

In [22]:
# Score each paper separately: `test_str_seq_lens` gives the number of
# sequences per paper, so consecutive [prev, stop) slices cover one paper each.
paperF1s = []
prev = 0
for seq_len in test_str_seq_lens:
    stop = prev + seq_len
    paperF1 = computePaperF1(test_original_figure_seqs, test_pred_figure, prev, stop)
    paperF1s.append(paperF1)
    prev = stop

# Average of the per-paper scores.
print("Reconstructed test F1 score:", np.mean(paperF1s))

Reconstructed test F1 score: 0.8071524064171123
