# Block 3: Sequence Level. Building a NER chunker with CRF.
Jordi Armengol - Joan Llop

In [1]:
import nltk
from nltk.corpus.reader import ConllCorpusReader
import pycrfsuite
import gensim
from gensim.models import Word2Vec 
from sklearn import svm, metrics
# gensim can be downloaded using pip install -U gensim

The 'conll2003' folder with the files 'eng.train', 'eng.testa' and 'eng.testb' is required

In [2]:
train = ConllCorpusReader('conll2003', 'eng.train', ['words', 'pos', 'ne', 'chunk']).iob_sents()[1:]
testa = ConllCorpusReader('conll2003', 'eng.testa', ['words', 'pos', 'ne', 'chunk']).iob_sents()[1:]
# testb = ConllCorpusReader('conll2003', 'eng.testb', ['words', 'pos', 'ne', 'chunk']).iob_sents()[1:]

In [3]:
def get_words_from_sent(sent):
    return [words for words, postag, label in sent]


# used when all features are embeddings
def get_embedded_word_features(i, words, model):
    word_features = []
    for j in range(len(words)):
        word_features.append(str(model.wv.similarity(words[i], words[j])))
    return word_features


# used when all features are embeddings
def get_embedded_sentence_features(sent, model):
    words = get_words_from_sent(sent)
    features = [get_embedded_word_features(i, words, model) for i in range(len(words))]
    return features


# used when all features are embeddings
def get_embedded_features(corpus):
    words = [get_words_from_sent(sent) for sent in corpus]
    model = gensim.models.Word2Vec(words, min_count = 1, size = 100, window = 5)
    return [get_embedded_sentence_features(sent, model) for sent in corpus]


def get_word_features(i, sent):
    words = get_words_from_sent(sent)
    word = words[i]
    features = []
    features.append('word is upper: ' + str(word.isupper())) # is uppercase
    features.append('word is digit: ' + str(word.isdigit())) # is a digit
    features.append('Beggining of sentence: ' + str(i==0)) # beggining of a sentence
    features.append('End of sentence: ' + str(i==len(words)-1)) # end of sentence
    features.append('lenght of word: ' + str(len(words))) # lenght of word
    features.append('postag: ' + str(sent[i][1])) # Postag                   
    if (i > 0):
        previous_word = words[i-1]
        features.append('postag previous word: ' + str(sent[i-1][1]))
        features.append('lenght of previous word: ' + str(len(previous_word)))
        features.append('previous word is upper: ' + str(previous_word.isupper()))
        features.append('previous word is digit: ' + str(previous_word.isdigit()))
    if (i > 1):
        previous_word = words[i-2]
        features.append('postag second previous word: ' + str(sent[i-1][1]))
        features.append('lenght of second previous word: ' + str(len(previous_word)))
        features.append('second previous word is upper: ' + str(previous_word.isupper()))
        features.append('second previous word is digit: ' + str(previous_word.isdigit()))
    if (i < len(words)-1):
        next_word = words[i+1]
        features.append('postag next word: ' + str(sent[i+1][1]))
        features.append('lenght of next word: ' + str(len(next_word)))
        features.append('next word is upper: ' + str(next_word.isupper()))
        features.append('next word is digit: ' + str(next_word.isdigit()))
    if (i < len(words)-2):
        next_word = words[i+2]
        features.append('postag second next word: ' + str(sent[i+1][1]))
        features.append('lenght of second next word: ' + str(len(next_word)))
        features.append('second next word is upper: ' + str(next_word.isupper()))
        features.append('second next word is digit: ' + str(next_word.isdigit()))
    return features


def get_sentence_features(sent):
    return [get_word_features(i, sent) for i in range(len(sent))]


def get_features(corpus):
    return [get_sentence_features(sent) for sent in corpus]
    
                        
def get_sentence_labels(sent):
    return [label for words, postag, label in sent]
                        
                        
def get_labels(corpus):
    return [get_sentence_labels(sent) for sent in corpus]


In [4]:
%%time
# train_features = get_embedded_features(train)
train_features = get_features(train)
train_labels = get_labels(train)

# testa_features = get_embedded_features(testa)
testa_features = get_features(testa)
testa_labels = get_labels(testa)

# testb_features = get_features(testb)
# testb_labels = get_labels(testb)

CPU times: user 5.38 s, sys: 284 ms, total: 5.67 s
Wall time: 5.85 s


In [5]:
%%time
CRF = pycrfsuite.Trainer(verbose=False)

for x, y in zip(train_features, train_labels):
    CRF.append(x, y)

CPU times: user 2.89 s, sys: 27.8 ms, total: 2.92 s
Wall time: 2.94 s


In [6]:
%%time
CRF.train('conll2003-eng.train')

CPU times: user 1min 56s, sys: 120 ms, total: 1min 56s
Wall time: 1min 58s


In [9]:
tagger = pycrfsuite.Tagger()
tagger.open('conll2003-eng.train')
y_pred = []

# use testa_features for validation and testb_features for testing
for sentence_pred in [tagger.tag(x) for x in testa_features]:
    for pred in sentence_pred:
        y_pred.append(pred)
        
# use testa_labels for validation and testb_labels for testing
y_test = []
for sentence_labels in testa_labels:
    for label in sentence_labels:
        y_test.append(label)
        
# Print results 
print('accuracy =', metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

accuracy = 0.907928040185


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

      B-MISC       0.00      0.00      0.00         4
       I-LOC       0.53      0.56      0.55      2094
      I-MISC       0.35      0.14      0.20      1264
       I-ORG       0.55      0.49      0.52      2092
       I-PER       0.74      0.77      0.76      3149
           O       0.96      0.98      0.97     42759

    accuracy                           0.91     51362
   macro avg       0.52      0.49      0.50     51362
weighted avg       0.90      0.91      0.90     51362

