In [1]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

from nltk.tag.stanford import StanfordPOSTagger
from stanford_postagger.stanford_wrapper import StanfordPOSTagger as StanfordPOSTaggerWrapper

from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

import scipy
from sklearn.grid_search import RandomizedSearchCV



In [2]:
%load_ext autoreload
%autoreload 2

# Read Dataset

In [3]:
f = open('datasets/conll2003/train.txt', 'r')
lines = f.readlines()
f.close()

del lines[0]
del lines[0]

dataset = []
sentence = []
for line in lines:
    splitter = line.strip().split(' ')
    if splitter[0] == '':
        continue
    elif (splitter[0] == '-DOCSTART-'):
        dataset.append(sentence)
        sentence = []
    else:
        token = splitter[0]
        tag = splitter[3]
        sentence.append((token, tag))

In [4]:
def convert_conlltxt2dataset(filename):
    f = open(filename, 'r')
    lines = f.readlines()
    f.close()
    
    del lines[0]
    del lines[0]
    
    dataset = []
    sentence = []
    for line in lines:
        splitter = line.strip().split(' ')
        if splitter[0] == '':
            continue
        elif (splitter[0] == '-DOCSTART-'):
            dataset.append(sentence)
            sentence = []
        else:
            token = splitter[0]
            tag = splitter[3]
            sentence.append((token, tag))
    return dataset

In [5]:
train_dataset = convert_conlltxt2dataset('datasets/conll2003/train.txt')
validation_dataset = convert_conlltxt2dataset('datasets/conll2003/valid.txt')
test_dataset = convert_conlltxt2dataset('datasets/conll2003/test.txt')

In [6]:
train_dataset[0][0:5]

[('EU', 'B-ORG'),
 ('rejects', 'O'),
 ('German', 'B-MISC'),
 ('call', 'O'),
 ('to', 'O')]

# Add Postag to Dataset

## Example

In [7]:
postagger = StanfordPOSTaggerWrapper()
postag = postagger.tag('+44 171')
postag

[('+44', 'CD'), ('171', 'CD')]

In [8]:
def add_postag2dataset(dataset):
    postagger = StanfordPOSTaggerWrapper()
    dataset_with_postag = []
    for sent in dataset:
        postagged_sent = []
        for index, (token, tag) in enumerate(sent):
            postagged_token = postagger.tag(token)
            postagged_sent.append((token, postagged_token[0][1], tag))
        dataset_with_postag.append(postagged_sent)
        
    return dataset_with_postag

postagged_train_dataset = add_postag2dataset(train_dataset)
postagged_validation_dataset = add_postag2dataset(validation_dataset)
postagged_test_dataset = add_postag2dataset(test_dataset)

del train_dataset
del validation_dataset
del test_dataset

In [9]:
postagged_train_dataset[0][0:5]

[('EU', 'NNP', 'B-ORG'),
 ('rejects', 'VBZ', 'O'),
 ('German', 'JJ', 'B-MISC'),
 ('call', 'NN', 'O'),
 ('to', 'TO', 'O')]

# Extract Feature

In [10]:
def word2features(sent, i, word_check, ortographic_check, ngram_check, postag_check, position_check, bow_check):
    word = sent[i][0]
    postag = sent[i][1]
    features = {}
    # Ortographic Feature, Word, POSTag & N-Gram
    if word_check == True:
        features.update({
            'word': word,
            'word.lower()': word.lower()
        })
    
    if ortographic_check == True:
        features.update({
            'word.istitle()': word.istitle(),
            'word.isdigit()': word.isdigit(),
            'word.isupper()': word.isupper()
        })
    
    if ngram_check == True:
        features.update({
            'word[-3:]': word[-3:],
            'word[-2:]': word[-2:],
            'word[:2]': word[:2],
            'word[:3]': word[:3]
        })
        
    if postag_check == True:
        features.update({
            'postag': postag
        })
    
    if postag_check == True and ngram_check == True:
        features.update({
            'postag[:2]': postag[:2]
        })
    
    if position_check == True:
        # Position
        features.update({
            'pos_front': i,
            'pos_end': len(sent) - i
        })
    
    if bow_check == True:
        # Bag Of Words
        if i > 0:
            word1 = sent[i-1][0]
            postag1 = sent[i-1][1]
            features.update({
                '-1:word.lower()': word1.lower(),
                '-1:word.istitle()': word1.istitle(),
                '-1:word.isupper()': word1.isupper(),
                '-1:postag': postag1,
                '-1:postag[:2]': postag1[:2]
            })
        else:
            features['BOS'] = True

        if i < len(sent) - 1:
            word1 = sent[i+1][0]
            postag1 = sent[i+1][1]
            features.update({
                '+1:word.lower()': word1.lower(),
                '+1:word.istitle()': word1.istitle(),
                '+1:word.isupper()': word1.isupper(),
                '+1:postag': postag1,
                '+1:postag[:2]': postag1[:2]
            })
        else:
            features['EOS'] = True

    return features

def sent2features(sent, word, ortographic, ngram, postag, position, bow):
    return [word2features(sent, i, word, ortographic, ngram, postag, position, bow) for i in range(len(sent))]

def sent2postag(sent):
    return [postag for token, postag, label in sent]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [11]:
sent2features(postagged_train_dataset[0], True, True, True, True, True, True)[0]

{'word': 'EU',
 'word.lower()': 'eu',
 'word.istitle()': False,
 'word.isdigit()': False,
 'word.isupper()': True,
 'word[-3:]': 'EU',
 'word[-2:]': 'EU',
 'word[:2]': 'EU',
 'word[:3]': 'EU',
 'postag': 'NNP',
 'postag[:2]': 'NN',
 'pos_front': 0,
 'pos_end': 469,
 'BOS': True,
 '+1:word.lower()': 'rejects',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'VBZ',
 '+1:postag[:2]': 'VB'}

# Feature Experiment 

In [12]:
result = {}

## All

In [13]:
X_train = [sent2features(sent, True, True, True, True, True, True) for sent in postagged_train_dataset]
y_train = [sent2labels(sent) for sent in postagged_train_dataset]

X_val = [sent2features(sent, True, True, True, True, True, True) for sent in postagged_validation_dataset]
y_val = [sent2labels(sent) for sent in postagged_validation_dataset]

In [14]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)
labels = list(crf.classes_)
labels.remove('O')

y_pred = crf.predict(X_val)
result['all'] = metrics.flat_f1_score(y_val, y_pred,
                      average='weighted', labels=labels)

CPU times: user 32 s, sys: 819 ms, total: 32.8 s
Wall time: 32.9 s


## Delete Word

In [15]:
X_train = [sent2features(sent, False, True, True, True, True, True) for sent in postagged_train_dataset]
y_train = [sent2labels(sent) for sent in postagged_train_dataset]

X_val = [sent2features(sent, False, True, True, True, True, True) for sent in postagged_validation_dataset]
y_val = [sent2labels(sent) for sent in postagged_validation_dataset]

In [16]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

y_pred = crf.predict(X_val)
result['delete_word'] = metrics.flat_f1_score(y_val, y_pred,
                      average='weighted', labels=labels)

CPU times: user 29 s, sys: 347 ms, total: 29.3 s
Wall time: 29.3 s


## Delete Ortographic

In [17]:
X_train = [sent2features(sent, True, False, True, True, True, True) for sent in postagged_train_dataset]
y_train = [sent2labels(sent) for sent in postagged_train_dataset]

X_val = [sent2features(sent, True, False, True, True, True, True) for sent in postagged_validation_dataset]
y_val = [sent2labels(sent) for sent in postagged_validation_dataset]

In [18]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

y_pred = crf.predict(X_val)
result['delete_ortographic'] = metrics.flat_f1_score(y_val, y_pred,
                      average='weighted', labels=labels)

CPU times: user 32 s, sys: 235 ms, total: 32.2 s
Wall time: 32.2 s


## Delete Ngram

In [19]:
X_train = [sent2features(sent, True, True, False, True, True, True) for sent in postagged_train_dataset]
y_train = [sent2labels(sent) for sent in postagged_train_dataset]

X_val = [sent2features(sent, True, True, False, True, True, True) for sent in postagged_validation_dataset]
y_val = [sent2labels(sent) for sent in postagged_validation_dataset]

In [20]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

y_pred = crf.predict(X_val)
result['delete_ngram'] = metrics.flat_f1_score(y_val, y_pred,
                      average='weighted', labels=labels)

CPU times: user 24.7 s, sys: 123 ms, total: 24.9 s
Wall time: 24.9 s


## Delete Postag

In [21]:
X_train = [sent2features(sent, True, True, True, False, True, True) for sent in postagged_train_dataset]
y_train = [sent2labels(sent) for sent in postagged_train_dataset]

X_val = [sent2features(sent, True, True, True, False, True, True) for sent in postagged_validation_dataset]
y_val = [sent2labels(sent) for sent in postagged_validation_dataset]

In [22]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

y_pred = crf.predict(X_val)
result['delete_postag'] = metrics.flat_f1_score(y_val, y_pred,
                      average='weighted', labels=labels)

CPU times: user 34.5 s, sys: 126 ms, total: 34.6 s
Wall time: 34.6 s


## Delete Position

In [23]:
X_train = [sent2features(sent, True, True, True, True, False, True) for sent in postagged_train_dataset]
y_train = [sent2labels(sent) for sent in postagged_train_dataset]

X_val = [sent2features(sent, True, True, True, True, False, True) for sent in postagged_validation_dataset]
y_val = [sent2labels(sent) for sent in postagged_validation_dataset]

In [24]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

y_pred = crf.predict(X_val)
result['delete_position'] = metrics.flat_f1_score(y_val, y_pred,
                      average='weighted', labels=labels)

CPU times: user 30.2 s, sys: 83.8 ms, total: 30.2 s
Wall time: 30.2 s


## Delete BOW

In [25]:
X_train = [sent2features(sent, True, True, True, True, True, False) for sent in postagged_train_dataset]
y_train = [sent2labels(sent) for sent in postagged_train_dataset]

X_val = [sent2features(sent, True, True, True, True, True, False) for sent in postagged_validation_dataset]
y_val = [sent2labels(sent) for sent in postagged_validation_dataset]

In [26]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

y_pred = crf.predict(X_val)
result['delete_bow'] = metrics.flat_f1_score(y_val, y_pred,
                      average='weighted', labels=labels)

CPU times: user 21.6 s, sys: 44 ms, total: 21.6 s
Wall time: 21.6 s


# Print Result and Importance of Each Feature

In [27]:
result

{'all': 0.8924325828151741,
 'delete_word': 0.8824051054269132,
 'delete_ortographic': 0.8879887554440393,
 'delete_ngram': 0.8890286700522596,
 'delete_postag': 0.8912385871879948,
 'delete_position': 0.8915977529617232,
 'delete_bow': 0.8659510361894482}

In [28]:
for key in result:
    print(key, result[key] - result['all'])

all 0.0
delete_word -0.01002747738826093
delete_ortographic -0.004443827371134801
delete_ngram -0.0034039127629145183
delete_postag -0.0011939956271793672
delete_position -0.000834829853450958
delete_bow -0.026481546625725882
