In [132]:
from nltk import word_tokenize, pos_tag
import nltk

In [133]:
print(pos_tag(word_tokenize("I'm learning NLP for my homework")))

[('I', 'PRP'), ("'m", 'VBP'), ('learning', 'VBG'), ('NLP', 'NNP'), ('for', 'IN'), ('my', 'PRP$'), ('homework', 'NN')]


In [134]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()

In [135]:
print(tagged_sentences[0])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


In [136]:
print('Tagged sentences : ',len(tagged_sentences))

Tagged sentences :  3914


In [137]:
print('Tagged words : ', len(nltk.corpus.treebank.tagged_words()))

Tagged words :  100676


In [138]:
def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }

In [139]:
import pprint

In [140]:
pprint.pprint(features(['This','is','a','sentence'], 3))

{'capitals_inside': False,
 'has_hyphen': False,
 'is_all_caps': False,
 'is_all_lower': True,
 'is_capitalized': False,
 'is_first': False,
 'is_last': True,
 'is_numeric': False,
 'next_word': '',
 'prefix-1': 's',
 'prefix-2': 'se',
 'prefix-3': 'sen',
 'prev_word': 'a',
 'suffix-1': 'e',
 'suffix-2': 'ce',
 'suffix-3': 'nce',
 'word': 'sentence'}


In [141]:
def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

In [142]:
# Split the dataset for training and testing
cutoff = int(.75 * len(tagged_sentences))
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]

In [143]:
print('training_sentences : ',len(training_sentences))

training_sentences :  2935


In [144]:
print('test_sentences : ', len(test_sentences))

test_sentences :  979


In [145]:
def transform_to_dataset(tagged_sentences):
    X, y = [], []
 
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1])
 
    return X, y

In [146]:
X, y = transform_to_dataset(training_sentences)

In [147]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
 
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
    ])
 
clf.fit(X[:10000], y[:10000])

Pipeline(memory=None,
         steps=[('vectorizer',
                 DictVectorizer(dtype=<class 'numpy.float64'>, separator='=',
                                sort=True, sparse=False)),
                ('classifier',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='entropy', max_depth=None,
                                        max_features=None, max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort='deprecated', random_state=None,
                                        splitter='best'))],
         verbose=False)

In [148]:
print ('Training completed')

Training completed


In [149]:
X_test, y_test = transform_to_dataset(test_sentences)

In [150]:
print("Accuracy:", clf.score(X_test, y_test))

Accuracy: 0.8928571428571429


In [151]:
def pos_tags(sentence):
    tags = clf.predict([features(sentence, index) for index in range(len(sentence))])
    return list(zip(sentence, tags))

In [152]:
print(pos_tags(word_tokenize('This is my friend, John.')))

[('This', 'DT'), ('is', 'VBZ'), ('my', 'NN'), ('friend', 'NN'), (',', ','), ('John', 'NNP'), ('.', '.')]
