In [1]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

from nltk.tag.stanford import StanfordPOSTagger
from stanford_postagger.stanford_wrapper import StanfordPOSTagger as StanfordPOSTaggerWrapper

from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

import scipy
from sklearn.grid_search import RandomizedSearchCV



In [2]:
%load_ext autoreload
%autoreload 2

# Read Dataset

In [3]:
f = open('datasets/conll2003/train.txt', 'r')
lines = f.readlines()
f.close()

del lines[0]
del lines[0]

dataset = []
sentence = []
for line in lines:
    splitter = line.strip().split(' ')
    if splitter[0] == '':
        continue
    elif (splitter[0] == '-DOCSTART-'):
        dataset.append(sentence)
        sentence = []
    else:
        token = splitter[0]
        tag = splitter[3]
        sentence.append((token, tag))

In [4]:
def convert_conlltxt2dataset(filename):
    f = open(filename, 'r')
    lines = f.readlines()
    f.close()
    
    del lines[0]
    del lines[0]
    
    dataset = []
    sentence = []
    for line in lines:
        splitter = line.strip().split(' ')
        if splitter[0] == '':
            continue
        elif (splitter[0] == '-DOCSTART-'):
            dataset.append(sentence)
            sentence = []
        else:
            token = splitter[0]
            tag = splitter[3]
            sentence.append((token, tag))
    return dataset

In [5]:
train_dataset = convert_conlltxt2dataset('datasets/conll2003/train.txt')
validation_dataset = convert_conlltxt2dataset('datasets/conll2003/valid.txt')
test_dataset = convert_conlltxt2dataset('datasets/conll2003/test.txt')

In [6]:
train_dataset[0][0:5]

[('EU', 'B-ORG'),
 ('rejects', 'O'),
 ('German', 'B-MISC'),
 ('call', 'O'),
 ('to', 'O')]

# Add Postag to Dataset

## Example

In [7]:
postagger = StanfordPOSTaggerWrapper()
postag = postagger.tag('+44 171')
postag

[('+44', 'CD'), ('171', 'CD')]

In [8]:
def add_postag2dataset(dataset):
    postagger = StanfordPOSTaggerWrapper()
    dataset_with_postag = []
    for sent in dataset:
        postagged_sent = []
        for index, (token, tag) in enumerate(sent):
            postagged_token = postagger.tag(token)
            postagged_sent.append((token, postagged_token[0][1], tag))
        dataset_with_postag.append(postagged_sent)
        
    return dataset_with_postag

postagged_train_dataset = add_postag2dataset(train_dataset)
postagged_validation_dataset = add_postag2dataset(validation_dataset)
postagged_test_dataset = add_postag2dataset(test_dataset)

# Delete Unused Dataset
del train_dataset
del validation_dataset
del test_dataset

In [9]:
postagged_train_dataset[0][0:5]

[('EU', 'NNP', 'B-ORG'),
 ('rejects', 'VBZ', 'O'),
 ('German', 'JJ', 'B-MISC'),
 ('call', 'NN', 'O'),
 ('to', 'TO', 'O')]

# Extract Feature

In [10]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    # Ortographic Feature, Word, POSTag & N-Gram
    features = {
        'word': word,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[:2]': word[:2],
        'word[:3]': word[:3],
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.isupper()': word.isupper(),
        'postag': postag,
        'postag[:2]': postag[:2]
    }
    
    # Position
    features.update({
        'pos_front': i,
        'pos_end': len(sent) - i
    })
    
    # Bag Of Words
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
        
    if i < len(sent) - 1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2postag(sent):
    return [postag for token, postag, label in sent]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [11]:
sent2features(postagged_train_dataset[0])[0]

{'word': 'EU',
 'word.lower()': 'eu',
 'word[-3:]': 'EU',
 'word[-2:]': 'EU',
 'word[:2]': 'EU',
 'word[:3]': 'EU',
 'word.istitle()': False,
 'word.isdigit()': False,
 'word.isupper()': True,
 'postag': 'NNP',
 'postag[:2]': 'NN',
 'pos_front': 0,
 'pos_end': 469,
 'BOS': True,
 '+1:word.lower()': 'rejects',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'VBZ',
 '+1:postag[:2]': 'VB'}

# Feature Extraction

In [12]:
X_train = [sent2features(sent) for sent in postagged_train_dataset]
y_train = [sent2labels(sent) for sent in postagged_train_dataset]

X_val = [sent2features(sent) for sent in postagged_validation_dataset]
y_val = [sent2labels(sent) for sent in postagged_validation_dataset]

X_test = [sent2features(sent) for sent in postagged_test_dataset]
y_test = [sent2labels(sent) for sent in postagged_test_dataset]

del postagged_train_dataset
del postagged_validation_dataset
del postagged_test_dataset

In [13]:
X_train[0][0]

{'word': 'EU',
 'word.lower()': 'eu',
 'word[-3:]': 'EU',
 'word[-2:]': 'EU',
 'word[:2]': 'EU',
 'word[:3]': 'EU',
 'word.istitle()': False,
 'word.isdigit()': False,
 'word.isupper()': True,
 'postag': 'NNP',
 'postag[:2]': 'NN',
 'pos_front': 0,
 'pos_end': 469,
 'BOS': True,
 '+1:word.lower()': 'rejects',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'VBZ',
 '+1:postag[:2]': 'VB'}

# Train

In [14]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 30.9 s, sys: 538 ms, total: 31.5 s
Wall time: 31.5 s


# Evaluation

In [15]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-ORG', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']

In [16]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.8233657955716351

In [17]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

             precision    recall  f1-score   support

      B-LOC      0.865     0.875     0.870      1658
      I-LOC      0.776     0.745     0.760       255
     B-MISC      0.820     0.777     0.798       694
     I-MISC      0.584     0.704     0.638       213
      B-ORG      0.803     0.730     0.765      1660
      I-ORG      0.695     0.766     0.729       834
      B-PER      0.868     0.848     0.858      1608
      I-PER      0.908     0.944     0.926      1154

avg / total      0.827     0.821     0.823      8076



# Cross Validation Score

In [18]:
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

In [19]:
x_val_score = cross_val_score(crf, X_val, y_val, cv=5, scoring=f1_scorer)

In [20]:
print(x_val_score)
print(x_val_score.mean())

[0.82146043 0.69796028 0.8176525  0.81892228 0.69484772]
0.7701686404202621


# Hyperparameter Optimization

## Train

In [21]:
# %%time
# params_space = {
#     'c1': scipy.stats.expon(scale=0.5),
#     'c2': scipy.stats.expon(scale=0.05),
# }

# rs_train = RandomizedSearchCV(crf, params_space,
#                         cv=3,
#                         verbose=1,
#                         n_jobs=-1,
#                         n_iter=50,
#                         scoring=f1_scorer)
# rs_train.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 21.5min finished


CPU times: user 3min 53s, sys: 15.7 s, total: 4min 9s
Wall time: 21min 59s


Fitting 3 folds for each of 50 candidates, totalling 150 fits  
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  6.7min  
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 21.5min finished  
CPU times: user 3min 53s, sys: 15.7 s, total: 4min 9s  
Wall time: 21min 59s

In [21]:
rs_train_best_params_ = {'c1': 0.001262621084804322, 'c2': 0.07748342053200617}
rs_train_best_score_ = 0.856466684355955

In [22]:
print('Best params:', rs_train_best_params_)
print('Best CV score:', rs_train_best_score_)

Best params: {'c1': 0.001262621084804322, 'c2': 0.07748342053200617}
Best CV score: 0.856466684355955


## Validation

In [23]:
# %%time
# params_space = {
#     'c1': scipy.stats.expon(scale=0.5),
#     'c2': scipy.stats.expon(scale=0.05),
# }

# rs_val = RandomizedSearchCV(crf, params_space,
#                         cv=3,
#                         verbose=1,
#                         n_jobs=-1,
#                         n_iter=50,
#                         scoring=f1_scorer)
# rs_val.fit(X_val, y_val)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  5.5min finished


CPU times: user 1min 2s, sys: 844 ms, total: 1min 3s
Wall time: 5min 37s


Fitting 3 folds for each of 50 candidates, totalling 150 fits  
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.0min  
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  5.5min finished  
CPU times: user 1min 2s, sys: 844 ms, total: 1min 3s  
Wall time: 5min 37s

In [23]:
rs_val_best_params_ = {'c1': 0.097424410654595, 'c2': 0.02559303567607237}
rs_val_best_score_ = 0.7813431798834048

In [24]:
print('Best params:', rs_val_best_params_)
print('Best CV score:', rs_val_best_score_)

Best params: {'c1': 0.097424410654595, 'c2': 0.02559303567607237}
Best CV score: 0.7813431798834048


# New Classifier Parameter

## Train RS

In [25]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=rs_train_best_params_['c1'],
    c2=rs_train_best_params_['c2'],
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 31.9 s, sys: 134 ms, total: 32.1 s
Wall time: 32.1 s


In [26]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.8276477008704834

In [27]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

             precision    recall  f1-score   support

      B-LOC      0.866     0.880     0.873      1658
      I-LOC      0.798     0.729     0.762       255
     B-MISC      0.816     0.772     0.793       694
     I-MISC      0.561     0.690     0.619       213
      B-ORG      0.814     0.732     0.771      1660
      I-ORG      0.695     0.775     0.732       834
      B-PER      0.879     0.855     0.867      1608
      I-PER      0.913     0.952     0.932      1154

avg / total      0.832     0.825     0.828      8076



## Validation RS

In [31]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=rs_val_best_params_['c1'],
    c2=rs_val_best_params_['c2'],
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 31.4 s, sys: 83.1 ms, total: 31.5 s
Wall time: 31.5 s


In [32]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.8204993488141177

In [33]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

             precision    recall  f1-score   support

      B-LOC      0.864     0.869     0.866      1658
      I-LOC      0.760     0.745     0.752       255
     B-MISC      0.798     0.784     0.791       694
     I-MISC      0.561     0.690     0.619       213
      B-ORG      0.813     0.717     0.762      1660
      I-ORG      0.706     0.748     0.726       834
      B-PER      0.858     0.860     0.859      1608
      I-PER      0.893     0.956     0.923      1154

avg / total      0.823     0.820     0.820      8076

