### Conditional Random Fields (CRFs)

In [3]:
import pandas as pd
import numpy as np
import sklearn as sk
import sklearn_crfsuite
import os
import random
from collections import Counter, defaultdict, namedtuple, OrderedDict
from itertools import chain
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from io import BytesIO
from itertools import chain


In [2]:
Test_Sentence = namedtuple("Sentence", "words")

def getMeTestSentences(data):
    sentences = []
    for key in data:
        sentence = []
        for val in zip(data[key].words):
            sentence.append(val)
        sentences.append(sentence)
    return sentences

def read_test_data(filename):
    """Read tagged sentence data"""
    with open(filename, 'r') as f:
        sentence_lines = [l.split("\n") for l in f.read().split("\n\n")]
        index = 1
        a = OrderedDict()
        for s in sentence_lines:
            temp = []
            for l in s:
                temp.append(l.strip().split("\t")[1:])   
            temp2 = []
            for val in temp:
                if len(val) == 1:
                    temp2.append(val[0])
                          
            a[index] = Test_Sentence(tuple(temp2))
            index += 1
        return a
        
class TestDataset(namedtuple("_TDataset", "sentences keys vocab X N")):
    def __new__(cls, tagfile, datafile, train_test_split=0.8, seed=112890):
        sentences = read_test_data(datafile)
        keys = tuple(sentences.keys())
        wordset = frozenset(chain(*[s.words for s in sentences.values()]))
        word_sequences = tuple([sentences[k].words for k in keys])
        N = sum(1 for _ in chain(*(s.words for s in sentences.values())))
        
        return super().__new__(cls, dict(sentences), keys, wordset, word_sequences,N)

    def __len__(self):
        return len(self.sentences)

    def __iter__(self):
        return iter(self.sentences.items())

In [3]:
Sentence = namedtuple("Sentence", "words tags")

def getMeSentences(data):
    sentences = []
    for key in data:
        sentence = []
        for val in zip(data[key].words,data[key].tags):
            sentence.append(val)
        sentences.append(sentence)
    return sentences

def read_data(filename):
    """Read tagged sentence data"""
    with open(filename, 'r') as f:
        sentence_lines = [l.split("\n") for l in f.read().split("\n\n")]
        index = 1
        a = OrderedDict()
        for s in sentence_lines:
            temp = []
            for l in s:
                temp.append(l.strip().split("\t")[1:])
            
            temp2 = []
            temp3 = []
            for val in temp:
                if len(val) == 2:
                    temp2.append(val[0])
                    temp3.append(val[1])
                          
            a[index] = Sentence(tuple(temp2),tuple(temp3))
            index += 1
        return a

def read_tags(filename):
    """Read a list of word tag classes"""
    with open(filename, 'r') as f:
        tags = f.read().split("\n")
    return frozenset(tags)

class Subset(namedtuple("BaseSet", "sentences keys vocab X tagset Y N stream")):
    def __new__(cls, sentences, keys):
        word_sequences = tuple([sentences[k].words for k in keys])
        tag_sequences = tuple([sentences[k].tags for k in keys])
        wordset = frozenset(chain(*word_sequences))
        tagset = frozenset(chain(*tag_sequences))
        N = sum(1 for _ in chain(*(sentences[k].words for k in keys)))
        stream = tuple(zip(chain(*word_sequences), chain(*tag_sequences)))
        return super().__new__(cls, {k: sentences[k] for k in keys}, keys, wordset, word_sequences,
                               tagset, tag_sequences, N, stream.__iter__)

    def __len__(self):
        return len(self.sentences)

    def __iter__(self):
        return iter(self.sentences.items())

class Dataset(namedtuple("_Dataset", "sentences keys vocab X tagset Y training_set testing_set N stream")):
    def __new__(cls, tagfile, datafile, train_test_split=0.8, seed=112890):
        tagset = read_tags(tagfile)
        sentences = read_data(datafile)
        keys = tuple(sentences.keys())
        wordset = frozenset(chain(*[s.words for s in sentences.values()]))
        word_sequences = tuple([sentences[k].words for k in keys])
        tag_sequences = tuple([sentences[k].tags for k in keys])
        N = sum(1 for _ in chain(*(s.words for s in sentences.values())))
        
        # split data into train/test sets
        _keys = list(keys)
        if seed is not None: random.seed(seed)
        random.shuffle(_keys)
        split = int(train_test_split * len(_keys))
        training_data = Subset(sentences, _keys[:split])
        testing_data = Subset(sentences, _keys[split:])
        stream = tuple(zip(chain(*word_sequences), chain(*tag_sequences)))
        return super().__new__(cls, dict(sentences), keys, wordset, word_sequences, tagset,
                               tag_sequences, training_data, testing_data, N, stream.__iter__)

    def __len__(self):
        return len(self.sentences)

    def __iter__(self):
        return iter(self.sentences.items())

In [4]:
data = Dataset("tags-universal.txt", "S21-gene-train.txt", train_test_split=0.8)

In [5]:
classes = list(data.tagset)

#### Features extraction

Next, we extract more features (word parts, simplified POS tags, lower/title/upper flags, features of nearby words) and convert them to sklear-crfsuite format - each sentence should be converted to a list of dicts.

Stopwords,containsANumber-->(I,B),camelCase,endingWithASE,endingWithIN,wordLength,(max-word)/(max-min)*check*, 

In [6]:
def word2features(sent, i):
    word = sent[i][0]    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token,label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [7]:
X_train = [sent2features(s) for s in getMeSentences(data.training_set.sentences)]
X_test = [sent2features(s) for s in getMeSentences(data.testing_set.sentences)]
y_train = [sent2labels(s) for s in getMeSentences(data.training_set.sentences)]
y_test = [sent2labels(s) for s in getMeSentences(data.testing_set.sentences)]

In [8]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [9]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=classes)

0.9530159715745871

In [10]:
print(metrics.flat_classification_report(y_test, y_pred, labels = classes))



              precision    recall  f1-score   support

           O       0.97      0.98      0.98     68134
           B       0.82      0.72      0.77      3309
           I       0.78      0.72      0.75      4928

    accuracy                           0.95     76371
   macro avg       0.86      0.81      0.83     76371
weighted avg       0.95      0.95      0.95     76371



In [11]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common())

Top likely transitions:
I      -> I       2.110688
O      -> O       1.406979
B      -> I       1.306918
O      -> B       -0.249158
B      -> O       -0.873324
I      -> O       -1.049799
B      -> B       -5.695768
I      -> B       -6.372783
O      -> I       -10.799272


In [12]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(3))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-3:])

Top positive:
8.192295 O        BOS
7.894808 B        BOS
7.199896 O        word.lower():release

Top negative:
-4.006191 B        word[-2:]:ll
-4.243715 B        word[-2:]:he
-5.730898 B        word.isdigit()


### ELI5

ELI5 is a Python package which helps to debug machine learning classifiers and explain their predictions. ELI5 allows to check weights of sklearn_crfsuite.CRF models.

In [14]:
import eli5


eli5.show_weights(crf, top=10)



From \ To,B,I,O
B,-5.696,1.307,-0.873
I,-6.373,2.111,-1.05
O,-0.249,-10.799,1.407

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+7.895,BOS,
+6.564,word.lower():interferon,
+5.716,word.lower():ets,
+5.073,word.lower():fibrinogen,
+4.769,word.lower():histone,
+4.620,word.lower():albumin,
+4.383,word.lower():ras,
+4.187,word.lower():insulin,
… 9364 more positive …,… 9364 more positive …,
… 1425 more negative …,… 1425 more negative …,

Weight?,Feature
+7.895,BOS
+6.564,word.lower():interferon
+5.716,word.lower():ets
+5.073,word.lower():fibrinogen
+4.769,word.lower():histone
+4.620,word.lower():albumin
+4.383,word.lower():ras
+4.187,word.lower():insulin
… 9364 more positive …,… 9364 more positive …
… 1425 more negative …,… 1425 more negative …

Weight?,Feature
+4.226,word.lower():sites
+3.660,-1:word.lower():activation
+3.586,-1:word.lower():gcn3
+3.429,-1:word.lower():alkaline
+3.297,-1:word.lower():hly
+3.241,-1:word.lower():cych
+3.196,word.lower():promoters
+3.191,word.lower():sequence
+3.188,-1:word.lower():histocompatibility
… 7969 more positive …,… 7969 more positive …

Weight?,Feature
+8.192,BOS
+7.200,word.lower():release
+6.085,word.lower():increase
+5.563,word.lower():contains
+5.400,word.lower():disease
+4.685,word.lower():phase
+4.663,word.lower():strains
+4.472,word.lower():min
+4.450,-1:word.lower():transcriptase
+4.311,word.lower():orf1


In [15]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=200,
    c2=0.1,
    max_iterations=20,
    all_possible_transitions=False,
)
crf.fit(X_train, y_train)
eli5.show_weights(crf, top=10)

From \ To,B,I,O
B,0.0,1.965,-1.455
I,0.0,2.447,-0.983
O,2.815,0.0,2.49

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+1.122,-1:word.lower():the,
+0.944,-1:word.lower():of,
+0.890,+1:word.lower():-,
+0.719,word.isupper(),
+0.618,-1:word.lower():and,
… 14 more positive …,… 14 more positive …,
… 14 more negative …,… 14 more negative …,
-0.335,word[-2:]:-,
-0.335,word.lower():-,
-0.335,word[-3:]:-,

Weight?,Feature
+1.122,-1:word.lower():the
+0.944,-1:word.lower():of
+0.890,+1:word.lower():-
+0.719,word.isupper()
+0.618,-1:word.lower():and
… 14 more positive …,… 14 more positive …
… 14 more negative …,… 14 more negative …
-0.335,word[-2:]:-
-0.335,word.lower():-
-0.335,word[-3:]:-

Weight?,Feature
+0.684,-1:word.lower():-
+0.619,word[-3:]:ase
+0.506,word[-3:]:tor
+0.446,word[-2:]:se
+0.430,word[-2:]:ne
+0.395,word[-3:]:ene
+0.371,word.lower():gene
+0.333,word.isdigit()
… 23 more positive …,… 23 more positive …
… 36 more negative …,… 36 more negative …

Weight?,Feature
+2.106,EOS
+0.911,word[-2:]:ed
+0.790,BOS
+0.787,word[-2:]:.
+0.787,word[-3:]:.
+0.787,word.lower():.
+0.738,word.lower():the
+0.737,word[-2:]:he
… 48 more positive …,… 48 more positive …
… 43 more negative …,… 43 more negative …


In [16]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train);
eli5.show_weights(crf, top=5, show=['transition_features'])

From \ To,B,I,O
B,-5.696,1.307,-0.873
I,-6.373,2.111,-1.05
O,-0.249,-10.799,1.407


In order to easy to read, we can check only a subset of tags.

In [17]:
eli5.show_weights(crf, top=10, targets=['O', 'B', 'I'])

From \ To,O,B,I
O,1.407,-0.249,-10.799
B,-0.873,-5.696,1.307
I,-1.05,-6.373,2.111

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+8.192,BOS,
+7.200,word.lower():release,
+6.085,word.lower():increase,
+5.563,word.lower():contains,
+5.400,word.lower():disease,
+4.685,word.lower():phase,
+4.663,word.lower():strains,
+4.472,word.lower():min,
+4.450,-1:word.lower():transcriptase,
+4.311,word.lower():orf1,

Weight?,Feature
+8.192,BOS
+7.200,word.lower():release
+6.085,word.lower():increase
+5.563,word.lower():contains
+5.400,word.lower():disease
+4.685,word.lower():phase
+4.663,word.lower():strains
+4.472,word.lower():min
+4.450,-1:word.lower():transcriptase
+4.311,word.lower():orf1

Weight?,Feature
+7.895,BOS
+6.564,word.lower():interferon
+5.716,word.lower():ets
+5.073,word.lower():fibrinogen
+4.769,word.lower():histone
+4.620,word.lower():albumin
+4.383,word.lower():ras
+4.187,word.lower():insulin
… 9364 more positive …,… 9364 more positive …
… 1425 more negative …,… 1425 more negative …

Weight?,Feature
+4.226,word.lower():sites
+3.660,-1:word.lower():activation
+3.586,-1:word.lower():gcn3
+3.429,-1:word.lower():alkaline
+3.297,-1:word.lower():hly
+3.241,-1:word.lower():cych
+3.196,word.lower():promoters
+3.191,word.lower():sequence
+3.188,-1:word.lower():histocompatibility
… 7969 more positive …,… 7969 more positive …


Or check only some of the features for all tags.

In [18]:
eli5.show_weights(crf, top=10, feature_re='^word\.is',
                  horizontal_layout=False, show=['targets'])

Weight?,Feature
0.179,word.istitle()
-0.654,word.isupper()
-5.731,word.isdigit()

Weight?,Feature
0.303,word.isupper()
0.162,word.istitle()
0.078,word.isdigit()

Weight?,Feature
0.068,word.isdigit()
-0.127,word.istitle()
-0.916,word.isupper()


## Writing To Files

In [19]:
with open('yoursystemoutput.txt', 'w') as f:
    k = 0
    for key in data.testing_set.sentences:
        for i,val in enumerate(zip(data.testing_set.sentences[key].words,y_pred[k])):
            f.write("\t".join([str(i+1),val[0],val[1]]) + "\n")
        k += 1
        f.write("\n")

with open('goldstandardfile.txt', 'w') as f:
    for key in data.testing_set.sentences:
        for i,val in enumerate(zip(data.testing_set.sentences[key].words,data.testing_set.sentences[key].tags)):
            f.write("\t".join([str(i+1),val[0],val[1]]) + "\n")
        f.write("\n")

In [20]:
test_data = TestDataset("tags-universal.txt", "S21-gene-test.txt")

X_testFinal = [sent2features(s) for s in getMeTestSentences(test_data.sentences)]

y_predTestFinal = crf.predict(X_testFinal)

with open('testFinal.txt', 'w') as f:
    k = 0
    for key in test_data.sentences:
        for i,val in enumerate(zip(test_data.sentences[key].words,y_predTestFinal[k])):
            f.write("\t".join([str(i+1),val[0],val[1]]) + "\n")
        k += 1
        f.write("\n")

## Remaining Tasks

In [6]:
import sklearn
print(sklearn.__version__)

0.23.2


In [1]:
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.grid_search import RandomizedSearchCV

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=classes)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

ModuleNotFoundError: No module named 'sklearn.grid_search'

In [None]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels=new_classes))