In [1]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

from nltk.tag.stanford import StanfordPOSTagger
from stanford_postagger.stanford_wrapper import StanfordPOSTagger as StanfordPOSTaggerWrapper

from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

import scipy
from sklearn.grid_search import RandomizedSearchCV

%load_ext autoreload
%autoreload 2



In [2]:
def convert_conlltxt2dataset(filename):
    f = open(filename, 'r')
    lines = f.readlines()
    f.close()
    
    del lines[0]
    del lines[0]
    
    dataset = []
    sentence = []
    for line in lines:
        splitter = line.strip().split(' ')
        if splitter[0] == '':
            continue
        elif (splitter[0] == '-DOCSTART-'):
            dataset.append(sentence)
            sentence = []
        else:
            token = splitter[0]
            tag = splitter[3]
            sentence.append((token, tag))
    return dataset

In [3]:
train_dataset = convert_conlltxt2dataset('datasets/conll2003/train.txt')
test_dataset = convert_conlltxt2dataset('datasets/conll2003/test.txt')

In [4]:
train_dataset[0][0:3]

[('EU', 'B-ORG'), ('rejects', 'O'), ('German', 'B-MISC')]

In [5]:
def add_postag2dataset(dataset):
    postagger = StanfordPOSTaggerWrapper()
    dataset_with_postag = []
    for sent in dataset:
        postagged_sent = []
        for index, (token, tag) in enumerate(sent):
            postagged_token = postagger.tag(token)
            postagged_sent.append((token, postagged_token[0][1], tag))
        dataset_with_postag.append(postagged_sent)
        
    return dataset_with_postag

In [6]:
postagged_train_dataset = add_postag2dataset(train_dataset)
postagged_test_dataset = add_postag2dataset(test_dataset)

In [7]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    # Ortographic Feature, Word, POSTag & N-Gram
    features = {
        'word': word,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[:2]': word[:2],
        'word[:3]': word[:3],
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.isupper()': word.isupper(),
        'postag': postag,
        'postag[:2]': postag[:2]
    }
    
    # Position
    features.update({
        'pos_front': i,
        'pos_end': len(sent) - i
    })
    
    # Bag Of Words
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
        
    if i < len(sent) - 1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2postag(sent):
    return [postag for token, postag, label in sent]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [8]:
postagged_train_dataset[0][0:3]

[('EU', 'NNP', 'B-ORG'), ('rejects', 'VBZ', 'O'), ('German', 'JJ', 'B-MISC')]

In [9]:
X_train = [sent2features(sent) for sent in postagged_train_dataset]
y_train = [sent2labels(sent) for sent in postagged_train_dataset]

X_test = [sent2features(sent) for sent in postagged_test_dataset]
y_test = [sent2labels(sent) for sent in postagged_test_dataset]

In [10]:
c1_ = 0.001262621084804322
c2_ = 0.07748342053200617

In [11]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=c1_,
    c2=c2_,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 30.7 s, sys: 776 ms, total: 31.5 s
Wall time: 31.5 s


In [12]:
labels = crf.classes_

In [13]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-ORG', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']

In [14]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.8276477008704834

In [15]:
import pickle

In [16]:
model_filename = 'finalized_model.sav'
pickle.dump(crf, open(model_filename, 'wb'))

# Predict New String

In [17]:
test_text_list = [['Beyoncé',
  'Giselle',
  'Knowles-Carter',
  'bee-YON-say',
  'born',
  'September',
  '1981',
  'is',
  'an',
  'American',
  'singer',
  'songwriter',
  'record',
  'producer',
  'and',
  'actress'],
 ['Born',
  'and',
  'raised',
  'in',
  'Houston',
  'Texas',
  'she',
  'performed',
  'in',
  'various',
  'singing',
  'and',
  'dancing',
  'competitions',
  'as',
  'a',
  'child',
  'and',
  'rose',
  'to',
  'fame',
  'in',
  'the',
  'late',
  '1990s',
  'as',
  'lead',
  'singer',
  'of',
  'R&B',
  'girl-group',
  'Destiny',
  'Child'],
 ['Managed',
  'by',
  'her',
  'father',
  'Mathew',
  'Knowles',
  'the',
  'group',
  'became',
  'one',
  'of',
  'the',
  'world',
  'best-selling',
  'girl',
  'groups',
  'of',
  'all',
  'time'],
 ['Their',
  'hiatus',
  'saw',
  'the',
  'release',
  'of',
  'Beyoncé',
  'debut',
  'album',
  'Dangerously',
  'in',
  'Love',
  '2003',
  'which',
  'established',
  'her',
  'as',
  'a',
  'solo',
  'artist',
  'worldwide',
  'earned',
  'five',
  'Grammy',
  'Awards',
  'and',
  'featured',
  'the',
  'Billboard',
  'Hot',
  '100',
  'number-one',
  'singles',
  'Crazy',
  'in',
  'Love',
  'and',
  'Baby',
  'Boy']]

In [18]:
def add_other_label2dataset(dataset):
    other_label_dataset = []
    for sent in dataset:
        sent_list = []
        for token in sent:
            sent_list.append((token, 'O'))
        other_label_dataset.append(sent_list)
    return other_label_dataset

In [19]:
other_label_dataset = add_other_label2dataset(test_text_list)
other_label_dataset[0]

[('Beyoncé', 'O'),
 ('Giselle', 'O'),
 ('Knowles-Carter', 'O'),
 ('bee-YON-say', 'O'),
 ('born', 'O'),
 ('September', 'O'),
 ('1981', 'O'),
 ('is', 'O'),
 ('an', 'O'),
 ('American', 'O'),
 ('singer', 'O'),
 ('songwriter', 'O'),
 ('record', 'O'),
 ('producer', 'O'),
 ('and', 'O'),
 ('actress', 'O')]

In [20]:
postagged_test = add_postag2dataset(other_label_dataset)
postagged_test[0]

[('Beyoncé', 'NNP', 'O'),
 ('Giselle', 'NNP', 'O'),
 ('Knowles-Carter', 'NNP', 'O'),
 ('bee-YON-say', 'NN', 'O'),
 ('born', 'VBN', 'O'),
 ('September', 'NNP', 'O'),
 ('1981', 'CD', 'O'),
 ('is', 'VBZ', 'O'),
 ('an', 'DT', 'O'),
 ('American', 'NNP', 'O'),
 ('singer', 'NN', 'O'),
 ('songwriter', 'NN', 'O'),
 ('record', 'NN', 'O'),
 ('producer', 'NN', 'O'),
 ('and', 'CC', 'O'),
 ('actress', 'NN', 'O')]

In [21]:
features = [sent2features(sent) for sent in postagged_test]

In [22]:
crf.predict(features)

[['B-PER',
  'I-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'B-LOC',
  'I-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'I-MISC',
  'O',
  'O',
  'O',
  'B-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'B-PER',
  'O',
  'O',
  'O',
  'B-ORG',
  'I-ORG']]

# Test with Text Data

In [23]:
text = "John Doe is the most handsome person in the world"
sentence = text.split()
sentence

['John',
 'Doe',
 'is',
 'the',
 'most',
 'handsome',
 'person',
 'in',
 'the',
 'world']

In [24]:
other_label_dataset = add_other_label2dataset([sentence])

In [25]:
postagged_test = add_postag2dataset(other_label_dataset)

In [26]:
features = [sent2features(sent) for sent in postagged_test]

In [27]:
crf.predict(features)

[['B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]

# Test with New Module

In [28]:
from ner.NER import NER

In [29]:
ner = NER('finalized_model.sav')

Load Model Success


In [30]:
ner.predict_class_text_list(test_text_list)

[['B-PER',
  'I-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'B-LOC',
  'I-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'I-MISC',
  'O',
  'O',
  'O',
  'B-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'B-PER',
  'O',
  'O',
  'O',
  'B-ORG',
  'I-ORG']]

In [31]:
ner.predict_class_text(text)

[['B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]

In [32]:
ner.predict_marginal_class_text(text)

[[{'B-ORG': 0.0007559024654277962,
   'O': 1.9755432037347083e-05,
   'B-MISC': 5.523440012265124e-05,
   'B-PER': 0.9962545385251304,
   'I-PER': 0.0027229147485909066,
   'B-LOC': 0.00011879024625599744,
   'I-ORG': 5.3514887609208765e-05,
   'I-MISC': 1.4866292352674246e-05,
   'I-LOC': 4.4830024720018395e-06},
  {'B-ORG': 3.699566917079212e-05,
   'O': 7.134945874731716e-05,
   'B-MISC': 2.3603605069636875e-05,
   'B-PER': 0.00022768884042872998,
   'I-PER': 0.9987219757091576,
   'B-LOC': 1.994387821576716e-05,
   'I-ORG': 0.0007598134366588535,
   'I-MISC': 5.285984822983397e-05,
   'I-LOC': 8.576955432094797e-05},
  {'B-ORG': 9.635405742680737e-08,
   'O': 0.9999340236308972,
   'B-MISC': 1.251807318723364e-06,
   'B-PER': 1.708631580181513e-07,
   'I-PER': 5.899954012330967e-05,
   'B-LOC': 4.391770904934039e-08,
   'I-ORG': 9.026373087700323e-07,
   'I-MISC': 3.735471748194755e-06,
   'I-LOC': 7.757776787617347e-07},
  {'B-ORG': 3.867632428295636e-06,
   'O': 0.999990295335272