### Conditional Random Fields (CRFs)

In [5]:
import pandas as pd
import numpy as np
import sklearn as sk
import sklearn_crfsuite
import os
import random
from collections import Counter, defaultdict, namedtuple, OrderedDict
from itertools import chain
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from io import BytesIO
from itertools import chain


In [8]:
Sentence = namedtuple("Sentence", "words tags")

def read_data(filename):
    """Read tagged sentence data"""
    with open(filename, 'r') as f:
        sentence_lines = [l.split("\n") for l in f.read().split("\n\n")]
        index = 1
        a = OrderedDict()
        for s in sentence_lines:
            temp = []
            for l in s:
                temp.append(l.strip().split("\t")[1:])
            
            temp2 = []
            temp3 = []
            for val in temp:
                if len(val) == 2:
                    temp2.append(val[0])
                    temp3.append(val[1])
                          
            a[index] = Sentence(tuple(temp2),tuple(temp3))
            index += 1
        return a

def read_tags(filename):
    """Read a list of word tag classes"""
    with open(filename, 'r') as f:
        tags = f.read().split("\n")
    return frozenset(tags)

class Subset(namedtuple("BaseSet", "sentences keys vocab X tagset Y N stream")):
    def __new__(cls, sentences, keys):
        word_sequences = tuple([sentences[k].words for k in keys])
        tag_sequences = tuple([sentences[k].tags for k in keys])
        wordset = frozenset(chain(*word_sequences))
        tagset = frozenset(chain(*tag_sequences))
        N = sum(1 for _ in chain(*(sentences[k].words for k in keys)))
        stream = tuple(zip(chain(*word_sequences), chain(*tag_sequences)))
        return super().__new__(cls, {k: sentences[k] for k in keys}, keys, wordset, word_sequences,
                               tagset, tag_sequences, N, stream.__iter__)

    def __len__(self):
        return len(self.sentences)

    def __iter__(self):
        return iter(self.sentences.items())

class Dataset(namedtuple("_Dataset", "sentences keys vocab X tagset Y training_set testing_set N stream")):
    def __new__(cls, tagfile, datafile, train_test_split=0.8, seed=112890):
        tagset = read_tags(tagfile)
        sentences = read_data(datafile)
        keys = tuple(sentences.keys())
        wordset = frozenset(chain(*[s.words for s in sentences.values()]))
        word_sequences = tuple([sentences[k].words for k in keys])
        tag_sequences = tuple([sentences[k].tags for k in keys])
        N = sum(1 for _ in chain(*(s.words for s in sentences.values())))
        
        # split data into train/test sets
        _keys = list(keys)
        if seed is not None: random.seed(seed)
        random.shuffle(_keys)
        split = int(train_test_split * len(_keys))
        training_data = Subset(sentences, _keys[:split])
        testing_data = Subset(sentences, _keys[split:])
        stream = tuple(zip(chain(*word_sequences), chain(*tag_sequences)))
        return super().__new__(cls, dict(sentences), keys, wordset, word_sequences, tagset,
                               tag_sequences, training_data, testing_data, N, stream.__iter__)

    def __len__(self):
        return len(self.sentences)

    def __iter__(self):
        return iter(self.sentences.items())

In [9]:
data = Dataset("tags-universal.txt", "S21-gene-train.txt", train_test_split=0.8)

In [None]:
data.sentences.keys()

In [None]:
sentences

['I', 'O', 'B']

#### Features extraction

Next, we extract more features (word parts, simplified POS tags, lower/title/upper flags, features of nearby words) and convert them to sklear-crfsuite format - each sentence should be converted to a list of dicts.

Stopwords,containsANumber-->(I,B),camelCase,endingWithASE,endingWithIN,wordLength,(max-word)/(max-min)*check*, 

In [11]:
def word2features(sent, i):
    word = sent[i][0]    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token,label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

The above code were taken from sklearn-crfsuite official site.

Split train and test sets.

In [66]:
classes = list(data.tagset)

def getMeSentences(data):
    sentences = []
    for key in data:
        sentence = []
        for val in zip(data[key].words,data[key].tags):
            sentence.append(val)
        sentences.append(sentence)
    return sentences

In [67]:
X_train = [sent2features(s) for s in getMeSentences(data.training_set.sentences)]
X_test = [sent2features(s) for s in getMeSentences(data.testing_set.sentences)]
y_train = [sent2labels(s) for s in getMeSentences(data.training_set.sentences)]
y_test = [sent2labels(s) for s in getMeSentences(data.testing_set.sentences)]

In [78]:
# data.training_set.sentences
# y_train[:2]

{10685: Sentence(words=('In', 'these', 'cells', ',', 'ras', '-', 'induced', 'transition', 'is', 'accompanied', 'by', 'a', 'strong', 'induction', 'of', 'AP', '-', '1', '-', 'binding', 'activity', 'along', 'with', 'increased', 'expression', 'of', 'CD44', 'mRNA', 'and', 'protein', '.'), tags=('O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'O', 'O', 'O')),
 2707: Sentence(words=('The', 'galactose', 'transporter', 'shows', 'both', 'sequence', 'and', 'structural', 'homology', 'with', 'a', 'superfamily', 'of', 'sugar', 'transporters', 'which', 'includes', 'the', 'human', 'HepG2', '-', 'erythrocyte', 'and', 'fetal', 'muscle', 'glucose', 'transporters', ',', 'the', 'rat', 'brain', 'and', 'liver', 'glucose', 'transporters', ',', 'the', 'Escherichia', 'coli', 'xylose', 'and', 'arabinose', 'permeases', ',', 'and', 'the', 'S', '.', 'cerevisiae', 'glucose', ',', 'maltose', ',', 'and', 'galactose', 'transport

In [77]:
for key in data.training_set.sentences:
    print(key)
    break

10685


In [13]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [79]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [80]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=classes)

0.9530159715745871

In [88]:
len(data.testing_set.sentences[10080].tags)

48

In [87]:
print(len(y_test[0]))

48


In [89]:
len(y_pred[0])

48

In [22]:
print(metrics.flat_classification_report(y_test, y_pred, labels = classes))



              precision    recall  f1-score   support

           I       0.79      0.72      0.75      7979
           O       0.97      0.98      0.98    112634
           B       0.81      0.70      0.75      5384

    accuracy                           0.95    125997
   macro avg       0.86      0.80      0.83    125997
weighted avg       0.95      0.95      0.95    125997



In [None]:
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.grid_search import RandomizedSearchCV

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=classes)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

In [None]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [45]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels=new_classes))

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

      B-art       1.00      0.03      0.07        29
      B-eve       0.83      0.21      0.33        24
      B-geo       0.75      0.87      0.81      1043
      B-gpe       0.88      0.78      0.83       588
      B-nat       0.67      0.20      0.31        10
      B-org       0.74      0.63      0.68       649
      B-per       0.81      0.80      0.81       546
      B-tim       0.90      0.84      0.87       589
      I-art       0.00      0.00      0.00         7
      I-eve       0.67      0.22      0.33        18
      I-geo       0.67      0.71      0.69       204
      I-gpe       0.39      0.53      0.45        17
      I-nat       1.00      0.50      0.67         2
      I-org       0.78      0.72      0.75       545
      I-per       0.81      0.89      0.85       574
      I-tim       0.79      0.66      0.72       185

avg / total       0.80      0.78      0.78      5030



In [43]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common())



Top likely transitions:
I      -> I       2.548123
O      -> B       2.486112
O      -> O       2.463966
B      -> I       1.755441
I      -> O       -0.783299
B      -> O       -1.524286


It is very likely that the beginning of a geographical entity (B-geo) will be followed by a token inside geographical entity (I-geo), but transitions to inside of an organization name (I-org) from tokens with other labels are penalized hugely.

In [27]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
7.381354 O        word.lower():release
6.905846 O        BOS
6.714438 B        BOS
5.771993 B        word.lower():interferon
5.636952 O        word.lower():increase
5.101526 O        word.lower():contains
4.995895 B        word.lower():homeodomain
4.847196 O        word.lower():disease
4.781059 O        word.lower():phase
4.511437 O        word.lower():strains
4.493968 B        word.lower():fibrinogen
4.470155 B        word.lower():histone
4.423800 B        word.lower():collagen
4.345207 I        word.lower():sites
4.341344 O        word.lower():inhibitors
4.295511 B        word.lower():ras
4.290223 O        EOS
4.260480 I        word.lower():antibodies
4.244574 B        word.lower():stats
4.230769 O        word[-3:]:in
4.206672 B        word.lower():hindiii
4.151153 B        word[-2:]:4p
4.147725 O        -1:word.lower():spc1
4.144089 O        word.lower():case
4.081697 O        word.lower():represses
4.060289 O        word[-2:]:he
4.060257 O        word.lower():decrease

### ELI5

ELI5 is a Python package which helps to debug machine learning classifiers and explain their predictions. ELI5 allows to check weights of sklearn_crfsuite.CRF models.

In [127]:
import eli5

eli5.show_weights(crf, top=10)



From \ To,B,I,O
B,-5.696,1.307,-0.873
I,-6.373,2.111,-1.05
O,-0.249,-10.799,1.407

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+7.895,BOS,
+6.564,word.lower():interferon,
+5.716,word.lower():ets,
+5.073,word.lower():fibrinogen,
+4.769,word.lower():histone,
+4.620,word.lower():albumin,
+4.383,word.lower():ras,
+4.187,word.lower():insulin,
… 9364 more positive …,… 9364 more positive …,
… 1425 more negative …,… 1425 more negative …,

Weight?,Feature
+7.895,BOS
+6.564,word.lower():interferon
+5.716,word.lower():ets
+5.073,word.lower():fibrinogen
+4.769,word.lower():histone
+4.620,word.lower():albumin
+4.383,word.lower():ras
+4.187,word.lower():insulin
… 9364 more positive …,… 9364 more positive …
… 1425 more negative …,… 1425 more negative …

Weight?,Feature
+4.226,word.lower():sites
+3.660,-1:word.lower():activation
+3.586,-1:word.lower():gcn3
+3.429,-1:word.lower():alkaline
+3.297,-1:word.lower():hly
+3.241,-1:word.lower():cych
+3.196,word.lower():promoters
+3.191,word.lower():sequence
+3.188,-1:word.lower():histocompatibility
… 7969 more positive …,… 7969 more positive …

Weight?,Feature
+8.192,BOS
+7.200,word.lower():release
+6.085,word.lower():increase
+5.563,word.lower():contains
+5.400,word.lower():disease
+4.685,word.lower():phase
+4.663,word.lower():strains
+4.472,word.lower():min
+4.450,-1:word.lower():transcriptase
+4.311,word.lower():orf1


It does make sense that I-entity must follow B-entity, such as I-geo follows B-geo, I-org follows B-org, I-per follows B-per, and so on. 

We can also see that it is not common in this dataset to have a person right after an organization name (B-org -> I-per has a large negative weight).

If we regularize CRF more, we can expect that only features which are generic will remain, and memoized tokens will go. Let’s check what effect does regularization have on CRF weights:

In [None]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=200,
    c2=0.1,
    max_iterations=20,
    all_possible_transitions=False,
)
crf.fit(X_train, y_train)
eli5.show_weights(crf, top=10)

In [None]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train);
eli5.show_weights(crf, top=5, show=['transition_features'])

The model learned large negative weights for impossible transitions like O -> I-geo, O -> I-org and O -> I-tim, and so on.

In order to easy to read, we can check only a subset of tags.

In [None]:
eli5.show_weights(crf, top=10, targets=['O', 'B', 'I'])

Or check only some of the features for all tags.

In [None]:
eli5.show_weights(crf, top=10, feature_re='^word\.is',
                  horizontal_layout=False, show=['targets'])

In [93]:
with open('yoursystemoutput.txt', 'w') as f:
    k = 0
    for key in data.testing_set.sentences:
        for i,val in enumerate(zip(data.testing_set.sentences[key].words,y_pred[k])):
            f.write("\t".join([str(i+1),val[0],val[1]]) + "\n")
        k += 1
        f.write("\n")

In [94]:
with open('goldstandardfile.txt', 'w') as f:
    for key in data.testing_set.sentences:
        for i,val in enumerate(zip(data.testing_set.sentences[key].words,data.testing_set.sentences[key].tags)):
            f.write("\t".join([str(i+1),val[0],val[1]]) + "\n")
        f.write("\n")

In [None]:
y_pred

In [114]:
Test_Sentence = namedtuple("Sentence", "words")

def read_test_data(filename):
    """Read tagged sentence data"""
    with open(filename, 'r') as f:
        sentence_lines = [l.split("\n") for l in f.read().split("\n\n")]
        index = 1
        a = OrderedDict()
        for s in sentence_lines:
            temp = []
            for l in s:
                temp.append(l.strip().split("\t")[1:])   
            temp2 = []
            for val in temp:
                if len(val) == 1:
                    temp2.append(val[0])
                          
            a[index] = Test_Sentence(tuple(temp2))
            index += 1
        return a

In [115]:
class TestDataset(namedtuple("_TDataset", "sentences keys vocab X N")):
    def __new__(cls, tagfile, datafile, train_test_split=0.8, seed=112890):
        sentences = read_test_data(datafile)
        keys = tuple(sentences.keys())
        wordset = frozenset(chain(*[s.words for s in sentences.values()]))
        word_sequences = tuple([sentences[k].words for k in keys])
        N = sum(1 for _ in chain(*(s.words for s in sentences.values())))
        
        return super().__new__(cls, dict(sentences), keys, wordset, word_sequences,N)

    def __len__(self):
        return len(self.sentences)

    def __iter__(self):
        return iter(self.sentences.items())

In [116]:
test_data = TestDataset("tags-universal.txt", "S21-gene-test.txt")

In [125]:
test_data.sentences

{1: Sentence(words=('However', ',', 'C3', 'toxin', 'alone', 'or', 'in', 'combination', 'with', 'growth', 'factors', 'did', 'not', 'stimulate', 'AP', '-', '1', ':', 'Luc', 'activity', 'and', 'actually', 'antagonized', 'the', 'synergistic', 'activation', 'of', 'AP', '-', '1', ':', 'Luc', 'observed', 'in', 'response', 'to', 'co', '-', 'stimulation', 'with', 'growth', 'factors', 'and', 'Ro', '-', '31', '-', '8220', '.')),
 2: Sentence(words=('The', 'aim', 'of', 'the', 'present', 'study', 'was', 'to', 'examine', 'the', 'antimicrobial', 'susceptibility', 'to', '10', 'currently', 'used', 'antimicrobial', 'agents', 'of', '50', 'strains', 'of', 'P', '.', 'acnes', 'isolated', 'from', 'acne', 'lesions', 'and', 'identified', 'using', 'a', 'Rap', 'ID', 'ANA', 'II', 'panel', '.')),
 3: Sentence(words=('The', 'role', 'of', 'intraoperative', 'echocardiography', 'in', 'surgery', 'of', 'the', 'heart', 'and', 'large', 'vessels')),
 4: Sentence(words=('Taken', 'together', ',', 'our', 'results', 'demonstra

In [120]:
def getMeTestSentences(data):
    sentences = []
    for key in data:
        sentence = []
        for val in zip(data[key].words):
            sentence.append(val)
        sentences.append(sentence)
    return sentences

In [None]:
getMeTestSentences(test_data.sentences)

In [122]:
X_testFinal = [sent2features(s) for s in getMeTestSentences(test_data.sentences)]

In [124]:
y_predTestFinal = crf.predict(X_testFinal)

In [126]:
with open('testFinal.txt', 'w') as f:
    k = 0
    for key in test_data.sentences:
        for i,val in enumerate(zip(test_data.sentences[key].words,y_predTestFinal[k])):
            f.write("\t".join([str(i+1),val[0],val[1]]) + "\n")
        k += 1
        f.write("\n")

In [None]:
data.vocab

True