# Named Entity Recognition on Ontonotes Dataset

In [11]:
# Necessary Imports
import os
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import f1_score, classification_report
from bs4 import BeautifulSoup
from nltk.corpus import names,gazetteers 
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score, flat_classification_report
import re
import nltk

## Data Source

The data is available from the [UBC library](http://dvn.library.ubc.ca.ezproxy.library.ubc.ca/dvn/dv/UBCLDS/faces/study/StudyPage.xhtml?globalId=hdl:11272/DSBHN). In this task, I will work with a subset of it: [ontonotes_NE](https://github.com/krieya/Name-Entity_Recognition/blob/master/ontonotes_NE/)

In [3]:
ontonotes_path = "./ontonotes_NE/"

## Initial Data Processing


Generate three lists (`train_data`, `dev_data`, and `test_data`) which consist of the paths of all the NER datafiles from their respective folders.

In [4]:
paths = ['train/', 'dev/', 'test/']
train_data = []
dev_data = []
test_data = []
datas = [train_data, dev_data, test_data]

for path, data in zip(paths, datas):
    p = ontonotes_path + path
    data += [path + c for c in os.listdir(p)]

   

Convert your data from the .name files to standard IOB (**I**nside-**O**utside-**B**eginning) tags for NER. Each line of the data file contains a sentence with XML tags indicating the named entities. For example, if the sentence contains a *GPE* tag such as:

 < ENAMEX TYPE="GPE" > Hong Kong < /ENAMEX >

The tag for 'Hong' is *B-GPE* and 'Kong' is *I-GPE* (GPE stands for Geopolitical Entity). The function below reads in a sentence from the dataset and converts it to a list of tokens with corresponding IOB-tags.

In [5]:
def sentence2iob(sentence):
    '''Input sentence is a string from the Ontonotes corpus, with xml tags indicating named enitites
    output is a list of tokens and a list of NER IOB-tags corresponding to those tokens'''
    # your code here
    
    if sentence.startswith('</DOC>') or sentence.startswith("\n"):
        return [], []
    
    curr_tokens = []
    curr_tags = []
    split_list = re.split("<|>", sentence)
    name_yes = False
    
    for s in split_list:
        if s == "":
            continue
        
        if s.startswith("ENAMEX"):
            enamex_type = s.split('''TYPE="''')[1].split('"')[0]
            name_yes = True
            
        elif s == '/ENAMEX':
            name_yes = False
            
        else:
            tokens = s.strip().split(" ")
            if name_yes:
                first_word = tokens.pop(0)

                curr_tokens.append(first_word)
                curr_tags.append("B-" + enamex_type)
                
                if tokens:
                    for word in tokens:
                        if word == " " or word == "":
                            continue
                        curr_tokens.append(word)
                        curr_tags.append("I-" + enamex_type)

            else:
                for word in tokens:
                    if word == " " or word == "":
                        continue
                    curr_tokens.append(word)
                    curr_tags.append("O")
    return curr_tokens, curr_tags         
    

In [356]:
# tests
check_sentence = 'While <ENAMEX TYPE="PERSON">Galloway</ENAMEX> \'s <ENAMEX TYPE="ORG" S_OFF="4">pro-Wal-Mart</ENAMEX> film introduces us to grateful employees /-'
curr_tokens, curr_tags = sentence2iob(check_sentence)
assert curr_tags == ['O', 'B-PERSON', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

check_sentence = '<ENAMEX TYPE="GPE">Moscow</ENAMEX> , overcast changing to moderate snow , <ENAMEX TYPE="QUANTITY">2 degrees below zero</ENAMEX> to <ENAMEX TYPE="QUANTITY">1 degree</ENAMEX> .'
curr_tokens, curr_tags = sentence2iob(check_sentence)
assert curr_tags == ['B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-QUANTITY', 'I-QUANTITY', 'I-QUANTITY', 'I-QUANTITY', 'O', 'B-QUANTITY', 'I-QUANTITY', 'O']

token_count = 0
for filename in train_data:
    with open(ontonotes_path + filename, encoding="utf-8") as f:
        f.readline()
        for sentence in f:
            curr_tokens, curr_tags = sentence2iob(sentence)
            token_count += len(curr_tokens)
            assert len(curr_tokens) == len(curr_tags)
            assert "" not in curr_tokens # if you have empty strings, you've done something wrong

assert token_count == 1096878

print("Success!")

Success!


## Naive Bayes Classification

Train a simple Naive bayes classifer to perform NER. 

The quality of the model depends on utilizing informative features for our task. So the features below are used to boost the classfier performance

In [6]:
gaz_vocab_ori = set(gazetteers.words())
gaz_vocab = set()

for v in gaz_vocab_ori:
    for v_s in v.split(" "):
        gaz_vocab.add(v_s)

In [7]:

names_vocab = set(names.words())
female_names = set(names.words('female.txt'))
male_names = set(names.words('male.txt'))
    

In [38]:
pad_token = '<pad>'

def word2features(sentence, idx, postags):
    word_features = {}
    word_features['word_lowercase'] = sentence[idx].lower()
    
    padded_sent = [pad_token, pad_token] + [s for s in sentence] + [pad_token, pad_token]
    
    left_word2 = padded_sent[idx]
    left_word = padded_sent[idx + 1]
    right_word = padded_sent[idx + 3]
    right_word2 = padded_sent[idx + 4]
    
    # Neighbouring word features
       
    word_features['skip_gram'] = left_word + " " + right_word
    word_features['left_bigram'] = left_word + " " + sentence[idx]
    word_features['right_bigram'] = sentence[idx] +" "+ right_word
    word_features['left_trigram'] = " ".join([left_word2, left_word, sentence[idx]])
#     word_features['right_trigram'] = " ".join([sentence[idx], right_word, right_word2])
        
    # Word shape
    
    if sentence[idx].istitle():
        word_features['word_istitle'] = 1
    else:
        word_features['word_istitle'] = 0
        
    
    if sentence[idx].isupper():
        word_features['word_isupper'] = 1
    else:
        word_features['word_isupper'] = 0
        
    if sentence[idx].isdigit():
        word_features['word_isdigit'] = 1
    else:
        word_features['word_isdigit'] = 0
    
    # Subword feature
    
    word_features['subword_pre3'] = sentence[idx][:3]
    word_features['subword_end3'] = sentence[idx][-3:]
    
    word_features['subword_pre2'] = sentence[idx][:2]
    word_features['subword_end2'] = sentence[idx][-2:]
    
    word_features['left_subword_pre3'] = left_word[:3]
    word_features['left_subword_end3'] = left_word[-3:]
    
    # gazetteer feature
    
    
    if sentence[idx] in names_vocab:
        word_features['in_names'] = 1
        
        
    else:
        word_features['in_names'] = 0
    
    if sentence[idx] in gaz_vocab:
        word_features['in_gaz_vocab'] = 1

    else:
        word_features['in_gaz_vocab'] = 0
        
    if left_word in gaz_vocab:
        word_features['left_in_gaz_vocab'] = 1
        
    else:
        word_features['left_in_gaz_vocab'] = 0
        
#     if left_word2 in gaz_vocab:
#         word_features['left2_in_gaz_vocab'] = 1
        
#     else:
#         word_features['left2_in_gaz_vocab'] = 0

        
    if right_word in gaz_vocab:
        word_features['right_in_gaz_vocab'] = 1
        
    else:
        word_features['right_in_gaz_vocab'] = 0
        
    # More features
    
#     if postags[idx][1] == 'NNP':
#         word_features['pos_nnp'] = 1
#     else:
#         word_features['pos_nnp'] = 0
    
    word_features['pos_tag'] = postags[idx][1]
    
    if left_word != pad_token:
        word_features['left_pos_tag'] = postags[idx - 1][1]
        
    if left_word2 != pad_token:
        word_features['left2_pos_tag'] = postags[idx - 2][1]
        
    if right_word != pad_token:
        word_features['right_pos_tag'] = postags[idx + 1][1]
        
    if right_word2 != pad_token:
        word_features['right2_pos_tag'] = postags[idx + 2][1]
        
    
    return word_features
    
def sentence2features(sentence, postags):
    return [word2features(sentence, idx, postags) for idx in range(len(sentence))]


In [13]:
def prepare_ner_feature_dicts(ner_files, sent_level = False):
    '''ner_files is a list of Ontonotes files with NER annotations. Returns feature dictionaries and 
    IOB tags for each token in the entire dataset'''
    
    train_dicts = []
    train_tags = []

    for filename in ner_files:
        with open(ontonotes_path + filename, encoding="utf-8") as f:
            f.readline()
            for sentence in f:
                curr_tokens, curr_tags = sentence2iob(sentence)
                
                postags = nltk.pos_tag(curr_tokens)
                
                sentence_features = sentence2features(curr_tokens, postags)
                if sent_level:
                    train_dicts.append(sentence_features)
                    train_tags.append(curr_tags)
                else:
                    train_dicts.extend(sentence_features)
                    train_tags.extend(curr_tags)
                
                

    return train_dicts, train_tags

Use features to train a Multinomial Naive Bayes classifer, with default settings.

In [711]:
train_dicts, train_tags = prepare_ner_feature_dicts(train_data)
dev_dicts, dev_tags = prepare_ner_feature_dicts(dev_data)

vectorizer = DictVectorizer()
train_features = vectorizer.fit_transform(train_dicts)
dev_features = vectorizer.transform(dev_dicts)

In [712]:
classifier = MultinomialNB()

classifier.fit(train_features, train_tags)
dev_pred = classifier.predict(dev_features)



In [713]:
f1_score(dev_tags,dev_pred,average='macro')

0.3368414413105045

In [714]:
f1_score(dev_tags,dev_pred,average='micro')

0.92594507573532

In [522]:
print(classification_report(dev_tags, dev_pred))

  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

   B-CARDINAL       0.59      0.56      0.57      1216
       B-DATE       0.74      0.59      0.65      2230
      B-EVENT       1.00      0.02      0.03       130
        B-FAC       0.00      0.00      0.00       149
        B-GPE       0.79      0.86      0.82      2738
   B-LANGUAGE       0.00      0.00      0.00       114
        B-LAW       0.00      0.00      0.00        47
        B-LOC       0.77      0.07      0.13       231
      B-MONEY       0.73      0.64      0.68       712
       B-NORP       0.85      0.72      0.78       928
    B-ORDINAL       0.74      0.25      0.37       222
        B-ORG       0.87      0.36      0.51      3024
    B-PERCENT       0.71      0.69      0.70       574
     B-PERSON       0.65      0.79      0.71      2082
    B-PRODUCT       1.00      0.04      0.08       101
   B-QUANTITY       1.00      0.01      0.02       125
       B-TIME       0.50      0.01      0.02       203
B-WORK_OF

### Observation

In macroaveraged f-score, class balance is not considered. As shown in the reports, the major class is "O", which should give more weights during averaging. The microaveraged f-score considers the class balance in the dataset,
which gives more weights to larger classes like "O", and the returned f-score is much higher. 


One problem with using a regular (non-sequential) classifier for `IOB-based NER` is that it may create ill-formed named entities, i.e. `I-` tags with no corresponding `B-` or `I-` tags before it. Check how often this is happening in the dev set with your classifier (the answer is "a lot")

In [715]:
ill_count = 0
total_count = 0

for i, pred in enumerate(dev_pred):
    total_count += 1
    if pred.startswith("B-"):
        curr_type = pred
        
    if pred.startswith("I-"):
        if dev_pred[i-1].startswith("B-") or dev_pred[i-1].startswith("I-"):
            if pred != curr_type:
                ill_count += 1
                
        else:
            ill_count += 1
            
Percentage = ill_count/total_count
print(Percentage)

0.053749162739867684


In [716]:
ill_count

9148

## Training a CRF

train a CRF model using the `sklearn_crfsuite` package.

In [39]:
train_dicts, train_tags = prepare_ner_feature_dicts(train_data, sent_level = True)
dev_dicts, dev_tags = prepare_ner_feature_dicts(dev_data, sent_level = True)

In [42]:
crf = CRF(algorithm = 'l2sgd', max_iterations = 15, c2 = 1e-5)
crf.fit(train_dicts, train_tags)
dev_pred = crf.predict(dev_dicts)
print(flat_f1_score(dev_tags,dev_pred,average='macro'))
print(flat_f1_score(dev_tags,dev_pred,average='micro'))

0.7087665903819654
0.9690125618397396


Explore the top and bottom 10 transitions in terms of weight.

In [482]:
transition_weights = crf.transition_features_

sorted_weights = sorted(transition_weights, key = transition_weights.get)
print("bottom 10:",sorted_weights[:10])
print("top 10:",sorted_weights[-10:])

bottom 10: [('B-PERCENT', 'O'), ('B-LANGUAGE', 'O'), ('B-PERSON', 'B-PERSON'), ('B-PERSON', 'B-ORG'), ('B-PERSON', 'B-GPE'), ('B-GPE', 'B-PERSON'), ('B-PERSON', 'B-NORP'), ('B-LAW', 'O'), ('I-ORDINAL', 'O'), ('I-ORG', 'B-NORP')]
top 10: [('B-ORG', 'I-ORG'), ('B-LAW', 'I-LAW'), ('I-QUANTITY', 'I-QUANTITY'), ('B-PRODUCT', 'I-PRODUCT'), ('B-CARDINAL', 'I-CARDINAL'), ('B-MONEY', 'I-MONEY'), ('B-DATE', 'I-DATE'), ('B-PERCENT', 'I-PERCENT'), ('B-QUANTITY', 'I-QUANTITY'), ('B-TIME', 'I-TIME')]


These transition weights make sense.

For the bottom 10s:

- It is likely that `percentages` and `language` would contain more than 1 token, like `50 %` contains 2 tokens, so the transition from the `B-` to `O` for these types are less likely to happen.
- Also, it is less likely that two differnet `B-` tags are placed together because we are less likely to mention several names from different aspects simultaneously.

For the top 10s:

- These are tags that are in the same type, so they are very likely to be in the sequential order like this, so high transition weights for them are reasonable.

### Kaggle Competition Preparation


In [41]:
kaggle_data = [ontonotes_path + 'kaggle_untagged/' + file for file in os.listdir(ontonotes_path + 'kaggle_untagged/')]
kaggle_data = sorted(kaggle_data) #ensures the files are in a standard order for consistency
header = 'Id,Predicted\n'

kaggle_dicts = []
for file in kaggle_data:
    with open(file) as f:
        for sentence in f:
            curr_tokens = sentence.strip().split(" ")

            postags = nltk.pos_tag(curr_tokens)
                
            sentence_features = sentence2features(curr_tokens, postags)
            kaggle_dicts.append(sentence_features)
    

kaggle_pred = crf.predict(kaggle_dicts)

curr_idx = 0
with open('./kaggle_tags.csv', 'w') as f:
    f.write(header)
    for tags in kaggle_pred:
        for c in tags:
            f.write(str(curr_idx) + "," + c + "\n")
            curr_idx += 1
    

In [637]:
# test
with open('kaggle_tags.csv') as f:
    lines = f.readlines()
    assert len(lines) == 306118
    assert lines[0] == header
    assert lines[1].startswith("0,")
    assert lines[-1].startswith("306116,")    
print("Success!")

Success!


Kaggle competition [here](https://www.kaggle.com/c/ubc-mdscl-colx563-ner/overview)