In [1]:
# https://nlpforhackers.io/training-pos-tagger
from nltk import word_tokenize, pos_tag
import nltk

In [2]:
print pos_tag(word_tokenize("I'm learning NLP with NLTK."))

[('I', 'PRP'), ("'m", 'VBP'), ('learning', 'VBG'), ('NLP', 'NNP'), ('with', 'IN'), ('NLTK', 'NNP'), ('.', '.')]


In [32]:
# choose a training corpus to learn POS tagging
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print tagged_sentences[0]


[(u'Pierre', u'NNP'), (u'Vinken', u'NNP'), (u',', u','), (u'61', u'CD'), (u'years', u'NNS'), (u'old', u'JJ'), (u',', u','), (u'will', u'MD'), (u'join', u'VB'), (u'the', u'DT'), (u'board', u'NN'), (u'as', u'IN'), (u'a', u'DT'), (u'nonexecutive', u'JJ'), (u'director', u'NN'), (u'Nov.', u'NNP'), (u'29', u'CD'), (u'.', u'.')]


In [159]:
def features(sentence, index):
    # sentence: [w1, w2, ...], index: the index of the word
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalised': sentence[index][0].isupper(),
        'is_all_caps': sentence[index].isupper(),
        'is_all_lower': sentence[index].islower(),
        'prefix_1': sentence[index][0],
        'prefix_2': sentence[index][:2],
        'prefix_3': sentence[index][:3],
        'suffix_1': sentence[index][-1:],
        'suffix_2': sentence[index][-2:],
        'suffix_3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index-1],
        'next_word': '' if index == len(sentence)-1 else sentence[index+1],
        'has_hypen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].isupper()        
    }

In [51]:
features(word_tokenize("I love Python programming."), 2)

{'capitals_inside': False,
 'has_hypen': False,
 'is_all_caps': False,
 'is_all_lower': False,
 'is_capitalised': True,
 'is_first': False,
 'is_last': False,
 'is_numeric': False,
 'next_word': 'programming',
 'prefix_1': 'P',
 'prefix_2': 'Py',
 'prefix_3': 'Pyt',
 'prev_word': 'love',
 'suffix_1': 'n',
 'suffix_2': 'on',
 'suffix_3': 'hon',
 'word': 'Python'}

In [27]:
def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

from sklearn.cross_validation import train_test_split

def transform_to_dataset(tagged_sentences):
    X, y = [], []
    for tagged in tagged_sentences:
        #for idx in range(len(tagged)):
         #   X.append(features(untag(tagged), idx))
          #  y.append(tagged[idx][1])    
        w, t = zip(*tagged)        
        X.extend([features(w, idx) for idx, _ in enumerate(w)])
        y.extend(t)
    
    return X, y

X, y = transform_to_dataset(tagged_sentences)



In [28]:
# encode class labels
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
y = class_le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 0)

In [29]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

#clf = Pipeline([
#    ('vectoriser', DictVectorizer(sparse=True)),
#    ('classifier', DecisionTreeClassifier(criterion='entropy'))    
#])

clf = Pipeline([
    ('vectoriser', DictVectorizer(sparse=True)),
    ('classifier', RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=1))
])
clf.fit(X_train, y_train)

print "Training completed successfully!"

Training completed successfully!


In [30]:
# check accuracy score of classifier
clf.score(X_test, y_test)


0.945369303508284

In [31]:
test_sent = word_tokenize("President of the United States announces state of emergency.")
test_feat = [features(test_sent, idx) for idx in range(len(test_sent))]
print [f['word'] for f in test_feat]
class_le.inverse_transform(clf.predict(test_feat))

['President', 'of', 'the', 'United', 'States', 'announces', 'state', 'of', 'emergency', '.']


  if diff:


array([u'NNP', u'IN', u'DT', u'NNP', u'NNPS', u'NNS', u'NN', u'IN', u'NN',
       u'.'], dtype='<U6')

# Conditional Random Fields

In [160]:
# prepare dataset for CRF classifier
# each item is now a sequence rather than a single word
def transform_to_CRF_dataset(tagged_sentences):
    X, y = [], []
    for tagged in tagged_sentences:
        #for idx in range(len(tagged)):
         #   X.append(features(untag(tagged), idx))
          #  y.append(tagged[idx][1])    
        w, t = zip(*tagged)        
        X.append([features(w, idx) for idx, _ in enumerate(w)])
        y.append(list(t))
    
    return X, y

X, y = transform_to_CRF_dataset(tagged_sentences)


In [161]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 0)

In [205]:
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn.cross_validation import cross_val_predict


model = CRF()
#model = CRF(algorithm='lbfgs', 
 #           c1=10, 
 #           c2=0.1, 
 #           max_iterations=100, 
 #           all_possible_transitions=False)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
#print metrics.flat_accuracy_score(y_test, y_pred)
print metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=model.classes_)

0.960463620053468


In [208]:
# calculate model performance
y_cross = cross_val_predict(estimator=model, X=X_train, y=y_train, cv=5)


In [209]:
# display cross-validated f1 score
metrics.flat_f1_score(y_pred=y_cross, y_true = y_train, average='weighted', labels=model.classes_)

0.9578061293281852

In [134]:
# predict test sentence
idx = 101
print len([f['word']for f in X_test[idx]])
#print y_test[idx]
print 
print zip([f['word']for f in X_test[idx]], y_test[idx])

model.predict_single(X_test[idx])


31

[(u'Until', u'IN'), (u'the', u'DT'), (u'other', u'JJ'), (u'day', u'NN'), (u',', u','), (u'you', u'PRP'), (u'as', u'IN'), (u'an', u'DT'), (u'ordinary', u'JJ'), (u'citizen', u'NN'), (u'of', u'IN'), (u'this', u'DT'), (u'democracy', u'NN'), (u'had', u'VBD'), (u'no', u'DT'), (u'right', u'NN'), (u'*', u'-NONE-'), (u'to', u'TO'), (u'see', u'VB'), (u'what', u'WP'), (u'your', u'PRP$'), (u'government', u'NN'), (u'was', u'VBD'), (u'telling', u'VBG'), (u'your', u'PRP$'), (u'cousins', u'NNS'), (u'around', u'IN'), (u'the', u'DT'), (u'world', u'NN'), (u'*T*-21', u'-NONE-'), (u'.', u'.')]


['IN',
 'DT',
 'JJ',
 'NN',
 ',',
 'PRP',
 'IN',
 'DT',
 'JJ',
 'NN',
 'IN',
 'DT',
 'NN',
 'VBD',
 'DT',
 'NN',
 '-NONE-',
 'TO',
 'VB',
 'WP',
 'PRP$',
 'NN',
 'VBD',
 'VBG',
 'PRP$',
 'NNS',
 'IN',
 'DT',
 'NN',
 '-NONE-',
 '.']

In [108]:
print model.predict(X_test[10])[0]
print model.predict(X_test[20])[0]

['MD', 'VB', 'PRP', 'MD', 'VB', 'PRP', 'MD', 'VB', 'PRP', 'MD', 'VB', 'PRP', 'MD', 'VB', 'PRP', 'MD', 'VB']
['MD', 'VB', 'PRP', 'MD', 'VB', 'PRP', 'MD', 'VB', 'PRP', 'MD', 'VB', 'PRP', 'MD', 'VB', 'PRP', 'MD', 'VB']


In [194]:
# predicting a new sentence, which did not feature in the treebank set at all
sentence = ['I', 'ate', 'the', 'custard', 'tart', 'ravenously']
sentence_features = [features(sentence, idx) for idx, _  in enumerate(sentence)]
model.predict_single(sentence_features)

['PRP', 'VBP', 'DT', 'NN', 'NN', 'RB']

In [153]:

#len(model.state_features_)
model.attributes_

[u'is_first',
 u'word:Among',
 u'is_all_lower',
 u'next_word:other',
 u'suffix_3:ong',
 u'is_numeric',
 u'prefix_1:A',
 u'prefix_3:Amo',
 u'prefix_2:Am',
 u'is_last',
 u'has_hypen',
 u'is_all_caps',
 u'suffix_2:ng',
 u'is_capitalised',
 u'suffix_1:g',
 u'capitals_inside',
 u'prev_word:',
 u'word:other',
 u'next_word:things',
 u'suffix_3:her',
 u'prefix_1:o',
 u'prefix_3:oth',
 u'prefix_2:ot',
 u'suffix_2:er',
 u'suffix_1:r',
 u'prev_word:Among',
 u'word:things',
 u'next_word:,',
 u'suffix_3:ngs',
 u'prefix_1:t',
 u'prefix_3:thi',
 u'prefix_2:th',
 u'suffix_2:gs',
 u'suffix_1:s',
 u'prev_word:other',
 u'word:,',
 u'next_word:the',
 u'suffix_3:,',
 u'prefix_1:,',
 u'prefix_3:,',
 u'prefix_2:,',
 u'suffix_2:,',
 u'suffix_1:,',
 u'prev_word:things',
 u'word:the',
 u'next_word:survey',
 u'suffix_3:the',
 u'prefix_3:the',
 u'suffix_2:he',
 u'suffix_1:e',
 u'prev_word:,',
 u'word:survey',
 u'next_word:found',
 u'suffix_3:vey',
 u'prefix_1:s',
 u'prefix_3:sur',
 u'prefix_2:su',
 u'suffix_2:ey'

In [188]:
import eli5
import string
# conduct detailed evaluation

labels = list(model.classes_)
# remove symbol labels
labels = [l for l in labels if len(l) >1 and l[1] in string.ascii_uppercase]
print metrics.flat_f1_score(y_test, y_pred, average='weighted',labels=labels)

# display per-class results
print metrics.flat_classification_report(y_test, y_pred, labels=sorted(labels), digits=3)

0.9554268148413771
             precision    recall  f1-score   support

      -LRB-      1.000     1.000     1.000        30
     -NONE-      1.000     1.000     1.000      1644
      -RRB-      1.000     1.000     1.000        33
         CC      0.995     0.997     0.996       590
         CD      0.992     0.994     0.993       871
         DT      0.994     0.989     0.992      2138
         EX      0.852     1.000     0.920        23
         FW      0.000     0.000     0.000         2
         IN      0.967     0.982     0.975      2506
         JJ      0.887     0.864     0.876      1564
        JJR      0.837     0.854     0.845        96
        JJS      1.000     0.843     0.915        51
         LS      1.000     0.667     0.800         3
         MD      1.000     0.987     0.994       234
         NN      0.937     0.946     0.941      3403
        NNP      0.949     0.987     0.968      2436
       NNPS      0.744     0.525     0.615        61
        NNS      0.953    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [189]:
# check what classifier learned
from collections import Counter
def print_transitions(trans_features):
    for (l_from, l_to), weight in trans_features:
        print "%-6s -> %-7s %0.6f" % (l_from, l_to, weight)
        
print ("Top likely transitions:")
print_transitions(Counter(model.transition_features_).most_common(20))

Top likely transitions:
MD     -> VB      4.944188
TO     -> VB      3.035100
JJ     -> NN      2.768665
NNS    -> VBP     2.669795
PRP    -> VBP     2.605842
WDT    -> -NONE-  2.479925
NNP    -> NNP     2.454811
NNP    -> POS     2.420472
JJ     -> NNS     2.356060
CD     -> CD      2.333455
PRP    -> VBD     2.326176
PRP    -> VBZ     2.152002
PRP$   -> NN      2.091057
-NONE- -> VBP     2.073427
VBN    -> -NONE-  2.049816
$      -> CD      2.018674
NNP    -> VBD     2.014314
DT     -> NN      2.012249
NN     -> POS     1.964675
CD     -> NNS     1.962200


In [198]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print "%0.6f %-8s %s" % (weight, label, attr)

print ("Top positive:")
print_state_features(Counter(model.state_features_).most_common(30))
print ("Top negative:")
print_state_features(Counter(model.state_features_).most_common()[-30:])

Top positive:
6.151299 NNP      is_capitalised
4.387084 -NONE-   prefix_1:*
4.383760 VBZ      suffix_1:s
4.369644 NNS      suffix_1:s
4.342867 RB       suffix_2:ly
3.938686 NN       is_all_lower
3.738135 VBG      suffix_3:ing
3.734816 VBN      suffix_2:ed
3.538607 JJ       has_hypen
3.377862 VBD      suffix_2:ed
3.085612 CD       prefix_1:1
3.010594 NNPS     suffix_1:s
2.902327 JJ       suffix_3:ous
2.744854 JJ       is_all_lower
2.739277 JJ       word:many
2.669771 VBD      suffix_1:d
2.633840 NN       suffix_2:ss
2.617963 VBN      suffix_1:d
2.588813 JJR      suffix_2:er
2.530668 DT       suffix_2:he
2.497807 JJ       suffix_3:ble
2.446247 RB       is_all_lower
2.427643 VB       prev_word:n't
2.403446 IN       prefix_3:tha
2.400196 NNS      suffix_2:ts
2.381873 JJ       suffix_2:al
2.364360 JJS      suffix_2:st
2.346303 CD       is_numeric
2.337198 VBN      prev_word:has
2.335452 VB       prev_word:*
Top negative:
-1.049359 JJ       suffix_3:eed
-1.108168 NN       prev_word:are
-1.11

[5, 4, 3, 2, 1]