# Block 3: Sequence Level. Building a NER chunker with CRF.
Jordi Armengol - Joan Llop

In this block we want to discover the name entities of sentences using conditional random fields. 
#### CRFs 
Usually a neural network is model that takes a single input and returns the most likely label, but with conditional random fields, the previous inputs and the next inputs matter in the task of assigning a label to an instance. Therefore, we can think of them as a way of modeling the join distribution of a whole sequence of inputs.

In [7]:
import nltk
from nltk.corpus.reader import ConllCorpusReader
# import pycrfsuite
from nltk.tag import CRFTagger
# gensim can be downloaded using pip install -U gensim
import gensim # m'ha semblat inutil, crec que ho hauriem de borrar de cara a l'entrega
from gensim.models import Word2Vec # " "
from sklearn import svm, metrics

In [8]:
train = ConllCorpusReader('conll2003', 'eng.train', ['words', 'pos', 'ne', 'chunk']).iob_sents()[1:]
testa = ConllCorpusReader('conll2003', 'eng.testa', ['words', 'pos', 'ne', 'chunk']).iob_sents()[1:]
testb = ConllCorpusReader('conll2003', 'eng.testb', ['words', 'pos', 'ne', 'chunk']).iob_sents()[1:]
train[0]

[('EU', 'NNP', 'I-ORG'),
 ('rejects', 'VBZ', 'O'),
 ('German', 'JJ', 'I-MISC'),
 ('call', 'NN', 'O'),
 ('to', 'TO', 'O'),
 ('boycott', 'VB', 'O'),
 ('British', 'JJ', 'I-MISC'),
 ('lamb', 'NN', 'O'),
 ('.', '.', 'O')]

## Preprocessing
#### The features
In order to train our CRF we need to create the features (from the data we have the words and the pos). We have decided to use the following features:
- The word in lowercase
- The POS
- The lenght of the word
- A bool that indicates if the word is the beginning of the sentence
- A bool that indicates if the word is the end of the sentence
- A bool that indicates if the word is all in uppercase
- A bool that indicates if the word is a digit
- A bool that indicates if the word is a title

We repeat all these features for the two previous words and for the two next words of the sentence (if they exist).

In [9]:
def get_words_and_pos_tags_from_tokens(tokens):
    words, pos_tags = [[], []]
    for token in tokens:
        word, pos_tag = token.split(' ')
        words.append(word)
        pos_tags.append(pos_tag)
    return words, pos_tags


def get_word_features(tokens, i):
    words, pos_tags = get_words_and_pos_tags_from_tokens(tokens)
    word = words[i]
    pos_tag = pos_tags[i]
    features = []
    features.append('lowercase word: ' + word.lower()) # word in lowercase
    features.append('postag: ' + str(pos_tag)) # Postag                   
    features.append('lenght of word: ' + str(len(words))) # lenght of word
    features.append('BOS: ' + str(i==0)) # beggining of a sentence
    features.append('EOF: ' + str(i==len(words)-1)) # end of sentence
    features.append('word is upper: ' + str(word.isupper())) # is uppercase
    features.append('word is digit: ' + str(word.isdigit())) # is a digit
    features.append('word is title: ' + str(word.istitle())) # is a title
    if (i > 0):
        previous_word = words[i-1]
        pos_tag = pos_tags[i-1]
        features.append('lowercase previous word: ' + previous_word.lower())
        features.append('postag previous word: ' + str(pos_tag))
        features.append('lenght of previous word: ' + str(len(previous_word)))
        features.append('previous word is BOS: ' + str(i-1==0))
        features.append('previous word is EOF: ' + str(i-1==len(words)-1))
        features.append('previous word is upper: ' + str(previous_word.isupper()))
        features.append('previous word is digit: ' + str(previous_word.isdigit()))
        features.append('previous word is title: ' + str(previous_word.istitle()))
    if (i > 1):
        previous_word = words[i-2]
        pos_tag = pos_tags[i-2]
        features.append('lowercase second previous word: ' + previous_word.lower())
        features.append('postag second previous word: ' + str(pos_tag))
        features.append('lenght of second previous word: ' + str(len(previous_word)))
        features.append('second previous word is BOS: ' + str(i-2==0))
        features.append('second previous word is EOF: ' + str(i-2==len(words)-1))
        features.append('second previous word is upper: ' + str(previous_word.isupper()))
        features.append('second previous word is digit: ' + str(previous_word.isdigit()))
        features.append('second previous word is title: ' + str(previous_word.istitle()))
    if (i < len(words)-1):
        next_word = words[i+1]
        pos_tag = pos_tags[i+1]
        features.append('lowercase next word: ' + next_word.lower())
        features.append('postag next word: ' + str(pos_tag))
        features.append('lenght of next word: ' + str(len(next_word)))
        features.append('next word is BOS: ' + str(i+1==0))
        features.append('next word is EOF: ' + str(i+1==len(words)-1))
        features.append('next word is upper: ' + str(next_word.isupper()))
        features.append('next word is digit: ' + str(next_word.isdigit()))
        features.append('next word is title: ' + str(next_word.istitle()))
    if (i < len(words)-2):
        next_word = words[i+2]
        pos_tag = pos_tags[i+2]
        features.append('lowercase second next word: ' + next_word.lower())
        features.append('postag second next word: ' + str(pos_tag))
        features.append('lenght of second next word: ' + str(len(next_word)))
        features.append('second next word is BOS: ' + str(i+2==0))
        features.append('second next word is EOF: ' + str(i+2==len(words)-1))
        features.append('second next word is upper: ' + str(next_word.isupper()))
        features.append('second next word is digit: ' + str(next_word.isdigit()))
        features.append('second next word is title: ' + str(next_word.istitle()))
    return features


def get_token_label(word_tag_ne):
    word, tag, label = word_tag_ne
    return (word + ' ' + tag, label)


def get_list_of_word_tag_ne(sent):
    return [get_token_label(word_tag_ne) for word_tag_ne in sent]


def get_tokens_from_iob_sents(iob_corpus):
    tokens = [get_list_of_word_tag_ne(sent) for sent in iob_corpus]
    return tokens
            

In [18]:
train_tokens = get_tokens_from_iob_sents(train)
testa_tokens = get_tokens_from_iob_sents(testa)
testb_tokens = get_tokens_from_iob_sents(testb)
emp = 0
for t in train_tokens:
    if (len(t) == 0):
        emp += 1
print(emp, '/', len(train_tokens))

945 / 14986


'conll2003-eng.model'

In [13]:
CRF = CRFTagger()
CRF.train(train_tokens, 'conll2003-eng.model')

ValueError: not enough values to unpack (expected 2, got 0)