In [1]:
import reader
import scorer
import utils
import classifiers.sequence_classifier as sc

In [2]:
from itertools import chain
from sklearn.model_selection import cross_val_score, KFold 
import os

import nltk
import sklearn
import numpy as np

import sklearn_crfsuite
from collections import Counter

In [3]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    chunktag = sent[i][2]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.isalpha()': word.isalpha(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'chunktag': chunktag,
        'chunktag[:2]': chunktag[:2],
    }
    if i > 0:
        word1 = sent[i - 1][0]
        postag1 = sent[i - 1][1]
        chunktag1 = sent[i - 1][2]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.isalpha()': word1.isalpha(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:chunktag': chunktag1,
            '-1:chunktag[:2]': chunktag1[:2],
        })
    else:
        features['BOS'] = True
    
    if i > 1:
        word2 = sent[i - 2][0]
        postag2 = sent[i - 2][1]
        chunktag2 = sent[i - 2][2]
        features.update({
            '-2:word.lower()': word2.lower(),
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
            '-2:word.isalpha()': word2.isalpha(),
            '-2:postag': postag2,
            '-2:postag[:2]': postag2[:2],
            '-2:chunktag': chunktag2,
            '-2:chunktag[:2]': chunktag2[:2],
        })

    if i < len(sent) - 1:
        word1 = sent[i + 1][0]
        postag1 = sent[i + 1][1]
        chunktag1 = sent[i + 1][2]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.isalpha()': word1.isalpha(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:chunktag': chunktag1,
            '+1:chunktag[:2]': chunktag1[:2],
        })
    else:
        features['EOS'] = True
        
    if i < len(sent) - 2:
        word2 = sent[i + 2][0]
        postag2 = sent[i + 2][1]
        chunktag2 = sent[i + 2][2]
        features.update({
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:word.isalpha()': word2.isalpha(),
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
            '+2:chunktag': chunktag2,
            '+2:chunktag[:2]': chunktag2[:2],
        })

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [4]:
def add_history(docs):
    for doc in docs:
        all_tokens = []
        for sent in doc:
            all_tokens += sent
        for i in range(0, len(all_tokens), 1):
            for j in range(i - 1, max(0, i - 1000), -1):
                if all_tokens[i]['word.lower()'] == all_tokens[j]['word.lower()']:
                    all_tokens[i].update({key + '_history': value 
                                          for key, value in all_tokens[j].items() if key != 'word.lower()'})
                    break

### First-level predictions using 10-fold split

In [5]:
kf = KFold(n_splits=10)

x_train_docs, y_train_docs = utils.docs_from_dataset('./dataset', 'eng.train.txt', 
                                                     ('words', 'pos', 'chunk', 'ne'), 
                                                     ['words', 'pos', 'chunk'], sent2features)

add_history(x_train_docs)
x_train_docs = np.array(x_train_docs)
y_train_docs = np.array(y_train_docs)

if os.path.exists('./prepared_data/y_train_docs_predicted.npy'):
    y_train_docs_predicted = np.load('./prepared_data/y_train_docs_predicted.npy')
else:
    y_train_docs_predicted = [0 for i in range(len(y_train_docs))]

    for train_ids, test_ids in kf.split(x_train_docs):      
        crf_first_train = sc.SequenceClassifier(cls='CRF')
        crf_first_train.fit(x_train_docs[train_ids], y_train_docs[train_ids])                                            
        y_predicted_docs = crf_first_train.predict(x_train_docs[test_ids])
        i = 0
        for index in test_ids:
            y_train_docs_predicted[index] = y_predicted_docs[i]
            i += 1
    np.save('./prepared_data/y_train_docs_predicted.npy', y_train_docs_predicted)

### Token-majority features

In [6]:
def add_token_features(x_train_docs, y_train_docs_predicted):
    tokens_corp = dict()

    for x_doc, y_doc in zip(x_train_docs, y_train_docs_predicted):
        tokens_doc = dict()
        for x_sent, y_sent in zip(x_doc, y_doc):
            for x_token, y_token in zip(x_sent, y_sent):
                token = x_token['word.lower()']
                # prefixes
                if y_token != 'O':
                    y_token = y_token[2:] 
                if token in tokens_corp:
                    tokens_corp[token].append(y_token)
                else:
                    tokens_corp[token]= [y_token]
                if token in tokens_doc:
                    tokens_doc[token].append(y_token)
                else:
                    tokens_doc[token]= [y_token]

        for key in tokens_doc.keys():
            tokens_doc[key] = Counter(tokens_doc[key])
        for x_sent in x_doc:
            for x_token in x_sent:
                x_token['doc_token_maj'] = tokens_doc[x_token['word.lower()']].most_common(1)[0][0]

    for key in tokens_corp.keys():
        tokens_corp[key] = Counter(tokens_corp[key])   

    for x_doc in x_train_docs:          
        for x_sent in x_doc:
            for x_token in x_sent:
                x_token['corp_token_maj'] = tokens_corp[x_token['word.lower()']].most_common(1)[0][0]

### Entity-majority features

In [7]:
def add_entity_features(x_train_docs, y_train_docs_predicted):
    entities_corp = dict()
    current_entity = ''

    for x_doc, y_doc in zip(x_train_docs, y_train_docs_predicted):
        entities_doc = dict()
        for x_sent, y_sent in zip(x_doc, y_doc):
            current_indexes = []
            current_entity = ''
            for x_token, y_token in zip(x_sent, y_sent):
                token = x_token['word.lower()']
                if y_token[0] == 'S':
                    current_entity = token
                    if current_entity in entities_doc:
                        entities_doc[current_entity].append(y_token[2:])
                    else:
                        entities_doc[current_entity] = [y_token[2:]]
                    if current_entity in entities_corp:
                        entities_corp[current_entity].append(y_token[2:])
                    else:
                        entities_corp[current_entity] = [y_token[2:]]
                if y_token[0] == 'B':
                    current_entity = token
                if y_token[0] == 'I':
                    current_entity += ' ' + token
                if y_token[0] == 'E':
                    current_entity += ' ' + token
                    if current_entity in entities_doc:
                        entities_doc[current_entity].append(y_token[2:])
                    else:
                        entities_doc[current_entity] = [y_token[2:]]
                    if current_entity in entities_corp:
                        entities_corp[current_entity].append(y_token[2:])
                    else:
                        entities_corp[current_entity] = [y_token[2:]]
                    current_entity = ''
                if y_token == 'O':
                    current_entity = ''
        for key in entities_doc.keys():
            entities_doc[key] = Counter(entities_doc[key])
        for x_sent, y_sent in zip(x_doc, y_doc):
            current_indexes = []
            current_entity = ''
            for i in range(len(x_sent)):
                if y_sent[i][0] == 'S':
                    current_entity = x_sent[i]['word.lower()']
                    current_indexes = [i]
                    for j in current_indexes:
                        x_sent[j]['doc_entity_maj'] = entities_doc[current_entity].most_common(1)[0][0]
                    current_entity = ''
                    current_indexes = []
                if y_sent[i][0] == 'B':
                    current_entity = x_sent[i]['word.lower()']
                    current_indexes = [i]
                if y_sent[i][0] == 'I':
                    current_entity = current_entity + ' ' + x_sent[i]['word.lower()']
                    current_indexes.append(i)
                if y_sent[i][0] == 'E':
                    current_entity = current_entity + ' ' + x_sent[i]['word.lower()']
                    current_indexes.append(i)
                    if current_entity in entities_doc:
                        for j in current_indexes:
                            x_sent[j]['doc_entity_maj'] = entities_doc[current_entity].most_common(1)[0][0]
                    current_entity = ''
                    current_indexes = []
                if y_sent[i] == 'O':
                    current_entity = x_sent[i]['word.lower()']
                    current_indexes = [i]
                    if current_entity in entities_doc:
                        for j in current_indexes:
                            x_sent[j]['corp_entity_maj'] = entities_doc[current_entity].most_common(1)[0][0]
                    current_entity = ''
                    current_indexes = []


    for key in entities_corp.keys():
        entities_corp[key] = Counter(entities_corp[key])   

    for x_doc, y_doc in zip(x_train_docs, y_train_docs_predicted):          
        for x_sent, y_sent in zip(x_doc, y_doc):
            current_indexes = []
            for i in range(len(x_sent)):
                if y_sent[i][0] == 'S':
                    current_entity = x_sent[i]['word.lower()']
                    current_indexes = [i]
                    for j in current_indexes:
                        x_sent[j]['corp_entity_maj'] = entities_corp[current_entity].most_common(1)[0][0]
                    current_indexes = []
                    current_entity = ''
                if y_sent[i][0] == 'B':
                    current_entity = x_sent[i]['word.lower()']
                    current_indexes = [i]
                if y_sent[i][0] == 'I':
                    current_entity += ' ' + x_sent[i]['word.lower()']
                    current_indexes.append(i)
                if y_sent[i][0] == 'E':
                    current_entity += ' ' + x_sent[i]['word.lower()']
                    current_indexes.append(i)
                    if current_entity in entities_corp:
                        for j in current_indexes:
                            x_sent[j]['corp_entity_maj'] = entities_corp[current_entity].most_common(1)[0][0]
                    current_entity = ''
                    current_indexes = []
                if y_sent[i] == 'O':
                    current_entity = x_sent[i]['word.lower()']
                    current_indexes = [i]
                    if current_entity in entities_corp:
                        for j in current_indexes:
                            x_sent[j]['corp_entity_maj'] = entities_corp[current_entity].most_common(1)[0][0]
                    current_entity = ''
                    current_indexes = []

### Superentity-majority features

In [8]:
def add_super_features(x_train_docs, y_train_docs_predicted):
    super_corp = dict()
    current_entity = ''

    for x_doc, y_doc in zip(x_train_docs, y_train_docs_predicted):
        super_doc = dict()
        for x_sent, y_sent in zip(x_doc, y_doc):
            for x_token, y_token in zip(x_sent, y_sent):
                token = x_token['word.lower()']
                if y_token[0] == 'S':
                    current_entity = token
                    if current_entity in super_doc:
                        super_doc[current_entity].append(y_token[2:])
                    else:
                        super_doc[current_entity] = [y_token[2:]]
                    if current_entity in super_corp:
                        super_corp[current_entity].append(y_token[2:])
                    else:
                        super_corp[current_entity] = [y_token[2:]]
                if y_token[0] == 'B':
                    current_entity = token
                if y_token[0] == 'I':
                    current_entity += ' ' + token
                if y_token[0] == 'E':
                    current_entity += ' ' + token
                    if current_entity in super_doc:
                        super_doc[current_entity].append(y_token[2:])
                    else:
                        super_doc[current_entity] = [y_token[2:]]
                    if current_entity in super_corp:
                        super_corp[current_entity].append(y_token[2:])
                    else:
                        super_corp[current_entity] = [y_token[2:]]
                    current_entity = ''
                if y_token == 'O':
                    current_entity = ''
        for key in super_doc.keys():
            super_doc[key] = Counter(super_doc[key])
        for x_sent, y_sent in zip(x_doc, y_doc):
            current_indexes = []
            for i in range(len(x_sent)):
                if y_sent[i][0] == 'S':
                    current_entity = x_sent[i]['word.lower()']
                    current_indexes = [i]
                    current_counter = Counter()
                    for key in super_doc.keys():
                        if current_entity in key and len(current_entity) != len(key):
                            current_counter += super_doc[key]
                    if len(current_counter.keys()) != 0:
                        for j in current_indexes:
                            x_sent[j]['doc_super_maj'] = current_counter.most_common(1)[0][0]
                    current_entity = ''
                    current_indexes = []
                if y_sent[i][0] == 'B':
                    current_entity = x_sent[i]['word.lower()']
                    current_indexes = [i]
                if y_sent[i][0] == 'I':
                    current_entity += ' ' + x_sent[i]['word.lower()']
                    current_indexes.append(i)
                if y_sent[i][0] == 'E':
                    current_entity += ' ' + x_sent[i]['word.lower()']
                    current_indexes.append(i)
                    current_counter = Counter()
                    for key in super_doc.keys():
                        if current_entity in key and len(current_entity) != len(key):
                            current_counter += super_doc[key]
                    if len(current_counter.keys()) != 0:
                        for j in current_indexes:
                            x_sent[j]['doc_super_maj'] = current_counter.most_common(1)[0][0]
                    current_entity = ''
                    current_indexes = []
                if y_sent[i] == 'O':
                    current_entity = x_sent[i]['word.lower()']
                    current_indexes = [i]
                    current_counter = Counter()
                    for key in super_doc.keys():
                        if current_entity in key and len(current_entity) != len(key):
                            current_counter += super_doc[key]
                    if len(current_counter.keys()) != 0:
                        for j in current_indexes:
                            x_sent[j]['doc_super_maj'] = current_counter.most_common(1)[0][0]
                    current_entity = ''
                    current_indexes = []

    for key in super_corp.keys():
        super_corp[key] = Counter(super_corp[key])
    for x_doc, y_doc in zip(x_train_docs, y_train_docs_predicted):
        for x_sent, y_sent in zip(x_doc, y_doc):
            current_indexes = []
            for i in range(len(x_sent)):
                if y_sent[i][0] == 'S':
                    current_entity = x_sent[i]['word.lower()']
                    current_indexes = [i]
                    current_counter = Counter()
                    for key in super_corp.keys():
                        if current_entity in key and len(current_entity) != len(key):
                            current_counter += super_corp[key]
                    if len(current_counter.keys()) != 0:
                        for j in current_indexes:
                            x_sent[j]['corp_super_maj'] = current_counter.most_common(1)[0][0]
                    current_entity = ''
                    current_indexes = []
                if y_sent[i][0] == 'B':
                    current_entity = x_sent[i]['word.lower()']
                    current_indexes = [i]
                if y_sent[i][0] == 'I':
                    current_entity += ' ' + x_sent[i]['word.lower()']
                    current_indexes.append(i)
                if y_sent[i][0] == 'E':
                    current_entity += ' ' + x_sent[i]['word.lower()']
                    current_indexes.append(i)
                    current_counter = Counter()
                    for key in super_corp.keys():
                        if current_entity in key and len(current_entity) != len(key):
                            current_counter += super_corp[key]
                    if len(current_counter.keys()) != 0:
                        for j in current_indexes:
                            x_sent[j]['corp_super_maj'] = current_counter.most_common(1)[0][0]
                    current_entity = ''
                    current_indexes = []
                if y_sent[i] == 'O':
                    current_entity = x_sent[i]['word.lower()']
                    current_indexes = [i]
                    current_counter = Counter()
                    for key in super_corp.keys():
                        if current_entity in key and len(current_entity) != len(key):
                            current_counter += super_corp[key]
                    if len(current_counter.keys()) != 0:
                        for j in current_indexes:
                            x_sent[j]['corp_super_maj'] = current_counter.most_common(1)[0][0]
                    current_entity = ''
                    current_indexes = []

### Prediction CRFs

In [9]:
crf_first_train = sc.SequenceClassifier(cls='CRF')
crf_first_train.fit(x_train_docs, y_train_docs)

add_token_features(x_train_docs, y_train_docs_predicted)
add_entity_features(x_train_docs, y_train_docs_predicted)
add_super_features(x_train_docs, y_train_docs_predicted)

crf_second_train = sc.SequenceClassifier(cls='CRF')
crf_second_train.fit(x_train_docs, y_train_docs)

SequenceClassifier(algorithm='lbfgs', all_possible_states=None,
          all_possible_transitions=True, averaging=None, c=None, c1=0.1,
          c2=0.1, calibration_candidates=None, calibration_eta=None,
          calibration_max_trials=None, calibration_rate=None,
          calibration_samples=None, cls='CRF', delta=None, epsilon=None,
          error_sensitive=None, gamma=None, keep_tempfiles=None,
          linesearch=None, max_iterations=100, max_linesearch=None,
          min_freq=None, model_filename=None, num_memories=None,
          pa_type=None, period=None, trainer_cls=None, variance=None,
          verbose=False)

### Test prediction

In [10]:
x_testb_docs, y_testb_docs = utils.docs_from_dataset('./dataset', 'eng.testb.test.txt', 
                                                     ('words', 'pos', 'chunk', 'ne'), 
                                                     ['words', 'pos', 'chunk'], sent2features)
add_history(x_testb_docs)
y_testb_docs_predicted = crf_first_train.predict(x_testb_docs)
add_token_features(x_testb_docs, y_testb_docs_predicted)
add_entity_features(x_testb_docs, y_testb_docs_predicted)
add_super_features(x_testb_docs, y_testb_docs_predicted)
print('Результаты EXTENDED + HISTORY + KRISHNAN_MANNING')
crf_second_train.get_full_score(x_testb_docs, y_testb_docs)

Результаты EXTENDED + HISTORY + KRISHNAN_MANNING
label    precision    recall    f1-score

PER      0.9132       0.9420    0.9274
ORG      0.7586       0.7880    0.7730
LOC      0.8778       0.8283    0.8524
MISC     0.8363       0.7417    0.7862

total    0.8464       0.8377    0.8420
