## NER using CRFs

#### AnatEM anatomical entity mention corpus 
http://nactem.ac.uk/anatomytagger/

In [1]:
import pandas as pd
import numpy as np
import itertools

In [2]:
# Directories holding the CoNLL-format train and test splits of AnatEM 1.0.2,
# given relative to the directory the notebook is run from.
PATH_TRAIN = 'data/AnatEM-1.0.2/conll/train/'
PATH_TEST = 'data/AnatEM-1.0.2/conll/test/'

In [3]:
import os
# os.listdir returns names in arbitrary, platform-dependent order. Sort them so the
# order in which files are read (and therefore the sentence numbering and the
# cross-validation folds derived from it) is identical on every run.
arr_train = sorted(os.listdir(PATH_TRAIN))
arr_test = sorted(os.listdir(PATH_TEST))

In [4]:
def process(arr, path):
    """Read CoNLL files and return their lines as one flat list of token rows.

    Parameters
    ----------
    arr : iterable of str
        File names located inside ``path``. Names containing ``'caption'`` or
        ``'sec'`` are skipped.
    path : str
        Directory that contains the files; a trailing separator is optional.

    Returns
    -------
    list of list of str
        One entry per input line, split on whitespace, in the order the files
        appear in ``arr``. A blank line yields ``[]``.
    """
    wanted = [name for name in arr
              if 'caption' not in name and 'sec' not in name]
    rows = []
    for name in wanted:
        # The context manager closes each file as soon as it has been read
        # instead of leaving one open handle per corpus file.
        with open(os.path.join(path, name), "rt") as handle:
            rows.extend(line.strip('\n').split() for line in handle)
    return rows

In [5]:
# Flatten every train / test file into one list of token rows; the empty rows
# that process() keeps for blank lines are what to_df() uses as sentence breaks.
train_processed = process(arr_train, PATH_TRAIN)
test_processed = process(arr_test, PATH_TEST )

In [6]:
import pandas as pd
def to_df(arr):
    """Turn a flat list of token rows into a token-level DataFrame.

    Parameters
    ----------
    arr : list of list of str
        Rows as produced by splitting CoNLL lines on whitespace. A non-empty
        row must hold the word in position 0 and the tag in position 1
        (a shorter row raises ``IndexError``); an empty row ends a sentence.

    Returns
    -------
    pandas.DataFrame
        Columns ``Word``, ``Tag`` and ``Sentence#``, where ``Sentence#`` is
        ``'Sentence: <n>'`` with ``n`` counting from 1 without gaps.
    """
    rows = []
    sentence_no = 1
    sentence_has_tokens = False
    for fields in arr:
        if not fields:
            # Only close a sentence that actually contains tokens, so leading
            # or repeated blank rows cannot leave holes in the numbering.
            if sentence_has_tokens:
                sentence_no += 1
                sentence_has_tokens = False
            continue
        rows.append((fields[0], fields[1], 'Sentence: ' + str(sentence_no)))
        sentence_has_tokens = True
    return pd.DataFrame(rows, columns=['Word', 'Tag', 'Sentence#'])

In [7]:
df_train = to_df(train_processed)

In [8]:
df_train.describe()

Unnamed: 0,Word,Tag,Sentence#
count,118947,118947,118947
unique,10562,25,4517
top,.,O,Sentence: 157
freq,5188,108253,183


In [9]:
df_test = to_df(test_processed)

In [10]:
df_test.describe()

Unnamed: 0,Word,Tag,Sentence#
count,76715,76715,76715
unique,8352,25,2869
top,.,O,Sentence: 1901
freq,3297,69696,145


In [11]:
words = list(set(df_train['Word'].values))

In [12]:
n_words = len(words); n_words

10562

In [13]:
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences of ``(word, tag)`` pairs.

    ``data`` must provide the columns ``Word``, ``Tag`` and ``Sentence#``.
    After construction ``sentences`` holds every sentence as a list of pairs
    and ``get_next()`` walks through them by sentence number.
    """

    def __init__(self, data):
        # Number used to build the 'Sentence: <n>' key that get_next() looks up.
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Pair every word of one sentence with its tag, keeping row order.
            return list(zip(s["Word"].values.tolist(), s["Tag"].values.tolist()))

        # Select the two needed columns explicitly so the grouping column is
        # not handed to the applied function.
        self.grouped = self.data.groupby("Sentence#")[["Word", "Tag"]].apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence, or ``None`` when that number is absent."""
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing sentence key means "no more sentences"; any other
            # error is a real problem and is allowed to propagate.
            return None
        self.n_sent += 1
        return s

In [14]:
getter = SentenceGetter(df_train)

In [15]:
sent = getter.get_next()

In [16]:
sent

[('Obstructive', 'O'),
 ('nephropathy', 'O'),
 (':', 'O'),
 ('lessons', 'O'),
 ('from', 'O'),
 ('cystic', 'B-Pathological_formation'),
 ('kidney', 'B-Organ'),
 ('disease', 'O'),
 ('.', 'O')]

In [17]:
sentences = getter.sentences

In [18]:
def word2features(sent, i):
    """Return the CRF feature dict for the token at position ``i`` of ``sent``.

    Parameters
    ----------
    sent : sequence of tuple
        One sentence; element 0 of every tuple is the token text. Any further
        tuple elements are ignored, so gold labels never leak into the
        features.
    i : int
        Index of the token to describe.

    Returns
    -------
    dict
        Features of the token itself, of its left and right neighbours when
        they exist, and ``'BOS'`` / ``'EOS'`` flags at the sentence edges.
    """
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }

    if i > 0:
        prev_word = sent[i-1][0]
        features.update({
            '-1:word.lower()': prev_word.lower(),
            '-1:word.istitle()': prev_word.istitle(),
            '-1:word.isupper()': prev_word.isupper(),
        })
    else:
        # First token of the sentence.
        features['BOS'] = True

    if i < len(sent)-1:
        next_word = sent[i+1][0]
        features.update({
            '+1:word.lower()': next_word.lower(),
            '+1:word.istitle()': next_word.istitle(),
            '+1:word.isupper()': next_word.isupper(),
        })
    else:
        # Last token of the sentence.
        features['EOS'] = True

    return features

In [19]:
def sent2features(sent):
    """Build the feature dict of every token in ``sent``, in sentence order."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features

In [20]:
def sent2labels(sent):
    """Return the label of every ``(token, label)`` pair in ``sent``, in order."""
    labels = []
    for _token, tag in sent:
        labels.append(tag)
    return labels

In [21]:
def sent2tokens(sent):
    """Return the token of every ``(token, label)`` pair in ``sent``, in order."""
    tokens = []
    for text, _tag in sent:
        tokens.append(text)
    return tokens

In [22]:
X = [sent2features(s) for s in sentences]

In [23]:
y = [sent2labels(s) for s in sentences]

In [24]:
from sklearn_crfsuite import CRF

# Unfitted CRF estimator; it is used for cross-validation and for the final fit.
# NOTE(review): c1 / c2 are presumably the L1 / L2 regularisation strengths of
# the L-BFGS trainer, and all_possible_transitions=False presumably restricts
# transition features to label pairs seen in training — confirm both against
# the sklearn-crfsuite CRF documentation before tuning.
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [25]:
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

In [26]:
from timeit import default_timer as timer
# Time the cross-validated prediction (cv=5) over the training sentences and
# print the elapsed seconds; `pred` holds one predicted label sequence per sentence.
start = timer()
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
end = timer()
print(end - start)



132.752258829




In [27]:
report = flat_classification_report(y_pred=pred, y_true=y)


  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
print(report)

                                   precision    recall  f1-score   support

              B-Anatomical_system       0.90      0.47      0.62        60
                         B-Cancer       0.84      0.78      0.81      1415
                           B-Cell       0.86      0.75      0.80      2153
             B-Cellular_component       0.83      0.50      0.62       309
B-Developing_anatomical_structure       0.67      0.12      0.21        33
   B-Immaterial_anatomical_entity       0.94      0.46      0.62       102
         B-Multi-tissue_structure       0.76      0.49      0.59       751
                          B-Organ       0.84      0.54      0.66       353
           B-Organism_subdivision       0.50      0.12      0.20       113
             B-Organism_substance       0.92      0.56      0.70       263
         B-Pathological_formation       0.74      0.30      0.43       164
                         B-Tissue       0.66      0.42      0.51       409
              I-Anatomic

In [29]:
crf.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

## Test on some unseen data

In [30]:
sample = "Lung cancer and other causes of death in relation to smoking"
sample2 = "The patient complained of heart pains. He suffers from high blood pressure."
sample3 = "The partient is suffering from trauma to the brain."
sample4 = "She complains of pain in the larynx."

In [31]:
import nltk
import ssl
# NOTE(review): security-sensitive. When the private helper exists, this swaps the
# process-wide default HTTPS context factory for one that does NOT verify
# certificates, so every later HTTPS request made through the ssl module's
# default context in this kernel is unverified.
# Presumably added so that nltk.download() works on machines with a broken
# certificate store — confirm it is still needed, and prefer fixing the local
# certificates over disabling verification.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Interpreter without the helper: leave the default context untouched.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context


In [32]:
def make_prediction(text):
    """Tag ``text`` with the fitted module-level ``crf`` model.

    Parameters
    ----------
    text : str
        Raw text; it is tokenized with ``nltk.word_tokenize`` and tagged as a
        single sequence.

    Returns
    -------
    pandas.DataFrame
        One row per token with the columns ``Word`` and ``Entity_Label``.
    """
    tokens = nltk.word_tokenize(text)
    # The features are built from the token text only, so no POS tagging is
    # needed; None just fills the second slot of the (token, tag) pair shape
    # that the feature helpers receive during training.
    sent = [(token, None) for token in tokens]
    features = sent2features(sent)
    labels = crf.predict_single(features)
    # NOTE(review): multi-sentence input is tagged as one sequence, whereas the
    # model is trained on single sentences — consider splitting sentences first.
    return pd.DataFrame(list(zip(tokens, labels)), columns=['Word', 'Entity_Label'])

In [33]:
preds = make_prediction(sample)

In [34]:
preds

Unnamed: 0,Word,Entity_Label
0,Lung,B-Cancer
1,cancer,I-Cancer
2,and,O
3,other,O
4,causes,O
5,of,O
6,death,O
7,in,O
8,relation,O
9,to,O


In [35]:
preds2 = make_prediction(sample2)

In [36]:
preds2

Unnamed: 0,Word,Entity_Label
0,The,O
1,patient,O
2,complained,O
3,of,O
4,heart,B-Organ
5,pains,O
6,.,O
7,He,O
8,suffers,O
9,from,O


In [37]:
preds3 = make_prediction(sample3)

In [38]:
preds3

Unnamed: 0,Word,Entity_Label
0,The,O
1,partient,O
2,is,O
3,suffering,O
4,from,O
5,trauma,O
6,to,O
7,the,O
8,brain,B-Organ
9,.,O


In [39]:
preds4 = make_prediction(sample4)

In [40]:
preds4

Unnamed: 0,Word,Entity_Label
0,She,O
1,complains,O
2,of,O
3,pain,O
4,in,O
5,the,O
6,larynx,B-Multi-tissue_structure
7,.,O
