## NER using CRFs and Word Embeddings

#### AnatEM anatomical entity mention corpus 
http://nactem.ac.uk/anatomytagger/

In [1]:
import pandas as pd
import numpy as np
import itertools
import gensim

In [2]:
PATH_TRAIN = 'data/AnatEM-1.0.2/conll/train/'
PATH_TEST = 'data/AnatEM-1.0.2/conll/test/'

In [3]:
import os
arr_train = os.listdir(PATH_TRAIN)
arr_test = os.listdir(PATH_TEST)

In [4]:
def process(arr, path):
    arr = list(filter(lambda x: x.find('caption') == -1, arr))
    arr = list(filter(lambda x: x.find('sec') == -1, arr))
    texts = []
    for el in arr:
        f = open(path+el, "rt")
        text = f.readlines()
        text = [el.strip('\n').split() for el in text]
        texts.append(text)
    out = list(itertools.chain.from_iterable(texts))
    return out

In [5]:
train_processed = process(arr_train, PATH_TRAIN)
test_processed = process(arr_test, PATH_TEST )

In [6]:
def prepare_for_w2v(arr):
    out = []
    tmp = []
    for i in range(len(arr)):
        if arr[i] != []:
            tmp.append(arr[i][0])
        else:
            out.append(tmp)
            tmp = []
    return out

In [7]:
w2v_sents_train = prepare_for_w2v(train_processed)
w2v_sents_test = prepare_for_w2v(test_processed)
w2v_all = w2v_sents_train + w2v_sents_test + [['UKN']]

In [8]:
import pandas as pd
def to_df(arr):
    processed = []
    c = 1
    for i in range(len(arr)):
        if arr[i] == []:
            processed.append('EOS')
            c += 1
        else:
            processed.append((arr[i], 'Sentence: ' + str(c)))
    filtered = list(filter(lambda x: x != 'EOS', processed))
    filtered = [(x[0][0], x[0][1], x[1]) for x in filtered]
    df = pd.DataFrame(filtered, columns=['Word', 'Tag', 'Sentence#'])
    return df

In [9]:
df_train = to_df(train_processed)

In [10]:
df_test = to_df(test_processed)

In [11]:
df_test.describe()

Unnamed: 0,Word,Tag,Sentence#
count,76715,76715,76715
unique,8352,25,2869
top,.,O,Sentence: 1901
freq,3297,69696,145


In [12]:
## word2vec
from gensim.models import Word2Vec

In [13]:
model = Word2Vec(w2v_all, min_count=1,size= 200,workers=3, window =5)

In [14]:
train_w2v_tmp = list(zip(list(df_train['Word']), list(df_train['Tag']), list(df_train['Sentence#'])))
test_w2v_tmp = list(zip(list(df_test['Word']), list(df_test['Tag']), list(df_test['Sentence#'])))

In [15]:
def get_vecs(arr, model):
    output = []
    for el in arr:
        if el[0] in model:
            tmp = (el[0], el[1], el[2], model[el[0]])
            output.append(tmp)
        else:
            print('False')
            tmp = (el[0], el[1], el[2], model['UKN'])
            output.append(tmp)
    return output

In [16]:
train_vecs = get_vecs(train_w2v_tmp, model)
test_vecs = get_vecs(test_w2v_tmp, model)

  after removing the cwd from sys.path.
  """


In [17]:
df_train = pd.DataFrame(train_vecs, columns=['Word', 'Tag', 'Sentence#', 'Embedding'])

In [18]:
df_train.head()

Unnamed: 0,Word,Tag,Sentence#,Embedding
0,Obstructive,O,Sentence: 1,"[0.009252742, -0.00500164, -0.00021493642, -0...."
1,nephropathy,O,Sentence: 1,"[0.073529035, -0.061625864, 0.010470589, -0.02..."
2,:,O,Sentence: 1,"[1.4891775, -0.77105534, 0.10571906, -0.219595..."
3,lessons,O,Sentence: 1,"[0.005032762, -0.003789113, 0.00022727302, -0...."
4,from,O,Sentence: 1,"[1.1600312, -1.0661284, 0.34372744, -0.5662684..."


In [19]:
df_test = pd.DataFrame(test_vecs, columns=['Word', 'Tag', 'Sentence#', 'Embedding'])

In [20]:
df_test.head()

Unnamed: 0,Word,Tag,Sentence#,Embedding
0,Nuclear,B-Cellular_component,Sentence: 1,"[0.00991862, -0.0093637565, 0.0032952111, -0.0..."
1,translocation,O,Sentence: 1,"[0.14319539, -0.13288176, 0.030554807, -0.0573..."
2,of,O,Sentence: 1,"[1.0930728, -1.03911, 0.26274154, -0.4945856, ..."
3,a,O,Sentence: 1,"[1.2311697, -0.96898663, -0.017469464, -0.3312..."
4,clusterin,O,Sentence: 1,"[0.09582306, -0.07466529, 0.0030629383, -0.026..."


In [21]:
words = list(set(df_train['Word'].values))

In [22]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t, e) for w, t, e in zip(s["Word"].values.tolist(),
                                                           s["Tag"].values.tolist(), s['Embedding'])]
        self.grouped = self.data.groupby("Sentence#").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [23]:
getter = SentenceGetter(df_train)

In [24]:
sent = getter.get_next()

In [25]:
sentences = getter.sentences

In [26]:
def word2features(sent, i):
    word = sent[i][0]
    embedding = sent[i][2]
    
    features = {
        'bias': 1.0,  
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:], 
        'word.isupper()': word.isupper(), 
        'word.istitle()': word.istitle(), 
        'word.isdigit()': word.isdigit(), 

    }
    
    for el in embedding:
        if ('embedding' + str(i)) not in features:
            features[('embedding' + str(i))] = el
    
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True
    
    return features

In [27]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [28]:
def sent2labels(sent):
    return [label for token,label, embedding in sent]

In [29]:
def sent2tokens(sent):
    return [token for token,label, embedding in sent]

In [30]:
X = [sent2features(s) for s in sentences]

In [31]:
y = [sent2labels(s) for s in sentences]

In [32]:
from sklearn_crfsuite import CRF

crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [33]:
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

In [34]:
from timeit import default_timer as timer
start = timer()
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
end = timer()
print(end - start)



149.96216401700002




In [35]:
report = flat_classification_report(y_pred=pred, y_true=y)


  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
print(report)

                                   precision    recall  f1-score   support

              B-Anatomical_system       0.90      0.47      0.62        60
                         B-Cancer       0.84      0.78      0.81      1415
                           B-Cell       0.86      0.74      0.80      2153
             B-Cellular_component       0.83      0.50      0.62       309
B-Developing_anatomical_structure       0.67      0.12      0.21        33
   B-Immaterial_anatomical_entity       0.94      0.44      0.60       102
         B-Multi-tissue_structure       0.76      0.49      0.59       751
                          B-Organ       0.85      0.54      0.66       353
           B-Organism_subdivision       0.52      0.13      0.21       113
             B-Organism_substance       0.89      0.56      0.69       263
         B-Pathological_formation       0.72      0.31      0.43       164
                         B-Tissue       0.67      0.42      0.52       409
              I-Anatomic

In [37]:
crf.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

## Test on some unseen data 

# TODO