# Creating a POS Tagger

We can train a classifier to work out which suffixes are most informative for POS tagging. We can begin by finding out what the most common suffixes are:

In [1]:
from nltk.corpus import brown
from nltk import FreqDist

# Count every word-final substring of length 1, 2 and 3 in the Brown corpus,
# ignoring case.
suffix_fdist = FreqDist()
for word in map(str.lower, brown.words()):
    # Slices are clamped to the word's length, so a token shorter than three
    # characters contributes the same string more than once.
    for width in (1, 2, 3):
        suffix_fdist[word[-width:]] += 1

suffix_fdist

FreqDist({'e': 202946,
          'he': 92084,
          'the': 70026,
          'n': 87889,
          'on': 33382,
          'ton': 1019,
          'y': 59146,
          'ty': 6458,
          'nty': 391,
          'd': 105687,
          'nd': 36418,
          'and': 31057,
          'ry': 7500,
          'ury': 482,
          'id': 4272,
          'aid': 2460,
          'ay': 6482,
          'day': 1613,
          'an': 17650,
          'ion': 14905,
          'f': 43173,
          'of': 72978,
          's': 128722,
          "'s": 5865,
          "a's": 202,
          't': 94459,
          'nt': 13151,
          'ent': 9369,
          'ary': 2122,
          'ed': 41527,
          'ced': 1262,
          '`': 8837,
          '``': 17674,
          'o': 42363,
          'no': 4402,
          'ce': 10953,
          'nce': 5971,
          "'": 10455,
          "''": 17639,
          'at': 25410,
          'hat': 12692,
          'ny': 3437,
          'any': 2793,
          'es': 22408,
  

In [2]:
# Keep only the suffix strings of the 100 most frequent entries, in the order
# that most_common() returns them; the counts themselves are discarded.
common_suffixes = [suffix for suffix, _count in suffix_fdist.most_common(100)]
common_suffixes[:10]

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of']

Next, we'll define a feature extractor function which checks a given word for these suffixes:

In [3]:
def pos_features(word, suffixes=None):
    """Build suffix-membership features for a single word.

    Args:
        word: The token to describe. It is lowercased before matching, so
            the check is case-insensitive with respect to ``word``.
        suffixes: Optional iterable of suffix strings to test. When omitted,
            the notebook-level ``common_suffixes`` list is used.

    Returns:
        A dict mapping ``'endswith(<suffix>)'`` to ``True`` or ``False`` for
        every suffix tested.
    """
    if suffixes is None:
        # Looked up at call time so the function follows the current value
        # of the notebook global.
        suffixes = common_suffixes
    # Lowercase once instead of once per suffix.
    lowered = word.lower()
    return {
        'endswith({})'.format(suffix): lowered.endswith(suffix)
        for suffix in suffixes
    }

# Try the extractor on a sample word; each key maps to a boolean.
pos_features('test')

{"endswith('')": False,
 "endswith(')": False,
 "endswith('s)": False,
 'endswith(()': False,
 'endswith())': False,
 'endswith(,)': False,
 'endswith(--)': False,
 'endswith(.)': False,
 'endswith(:)': False,
 'endswith(;)': False,
 'endswith(?)': False,
 'endswith(`)': False,
 'endswith(``)': False,
 'endswith(a)': False,
 'endswith(ad)': False,
 'endswith(al)': False,
 'endswith(an)': False,
 'endswith(and)': False,
 'endswith(are)': False,
 'endswith(as)': False,
 'endswith(at)': False,
 'endswith(ay)': False,
 'endswith(be)': False,
 'endswith(by)': False,
 'endswith(c)': False,
 'endswith(ce)': False,
 'endswith(ch)': False,
 'endswith(d)': False,
 'endswith(e)': False,
 'endswith(ed)': False,
 'endswith(en)': False,
 'endswith(ent)': False,
 'endswith(er)': False,
 'endswith(ere)': False,
 'endswith(ers)': False,
 'endswith(es)': False,
 'endswith(ey)': False,
 'endswith(f)': False,
 'endswith(for)': False,
 'endswith(g)': False,
 'endswith(h)': False,
 'endswith(had)': False,
 

Now that we've defined our feature extractor, we can use it to train a new decision tree classifier:

In [4]:
# Pair the suffix features of every token in the 'news' category with the
# tag that accompanies it in the corpus.
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(token), tag) for token, tag in tagged_words]
featuresets[0]

({"endswith('')": False,
  "endswith(')": False,
  "endswith('s)": False,
  'endswith(()': False,
  'endswith())': False,
  'endswith(,)': False,
  'endswith(--)': False,
  'endswith(.)': False,
  'endswith(:)': False,
  'endswith(;)': False,
  'endswith(?)': False,
  'endswith(`)': False,
  'endswith(``)': False,
  'endswith(a)': False,
  'endswith(ad)': False,
  'endswith(al)': False,
  'endswith(an)': False,
  'endswith(and)': False,
  'endswith(are)': False,
  'endswith(as)': False,
  'endswith(at)': False,
  'endswith(ay)': False,
  'endswith(be)': False,
  'endswith(by)': False,
  'endswith(c)': False,
  'endswith(ce)': False,
  'endswith(ch)': False,
  'endswith(d)': False,
  'endswith(e)': True,
  'endswith(ed)': False,
  'endswith(en)': False,
  'endswith(ent)': False,
  'endswith(er)': False,
  'endswith(ere)': False,
  'endswith(ers)': False,
  'endswith(es)': False,
  'endswith(ey)': False,
  'endswith(f)': False,
  'endswith(for)': False,
  'endswith(g)': False,
  'endswit

In [5]:
from nltk import DecisionTreeClassifier
from nltk.classify import accuracy

# Hold out the first 10% of the feature sets for testing and train on the rest.
cutoff = int(len(featuresets) * 0.1)
test_set = featuresets[:cutoff]
train_set = featuresets[cutoff:]

In [None]:
# Fit a decision tree on the training split.
# NOTE: NLTK is a teaching toolkit that is not really optimized for speed, so
# this call may take a very long time. For faster training, use scikit-learn's
# classifiers instead.
classifier = DecisionTreeClassifier.train(train_set)

In [None]:
# Score the trained classifier against the held-out test split.
accuracy(classifier, test_set)

In [None]:
# Tag a single word from its suffix features.
classifier.classify(pos_features('cats'))

In [None]:
# Render the learned tree as pseudocode, limited to a depth of 4.
classifier.pseudocode(depth=4)

To improve the classifier, we can add contextual features:

```py
def pos_features(sentence, i):
    """Extract suffix and left-context features for the word at index ``i``.

    Args:
        sentence: Sequence of word strings (untagged).
        i: Index of the target word within ``sentence``.

    Returns:
        A dict holding the last one, two and three characters of the word
        under ``"suffix(1)"``, ``"suffix(2)"`` and ``"suffix(3)"``, plus the
        preceding word under ``"prev-word"`` (``"<START>"`` when the target
        is the first word of the sentence).
    """
    word = sentence[i]
    features = {"suffix(1)": word[-1:],
                "suffix(2)": word[-2:],
                "suffix(3)": word[-3:]}
    # The first word has no left neighbour, so use a sentinel value instead.
    features["prev-word"] = "<START>" if i == 0 else sentence[i - 1]
    return features
```

Then, instead of working with tagged words, we work with tagged sentences:
```py
# Load the 'news' category as tagged sentences rather than a flat word list,
# so each word can be seen together with its neighbours.
tagged_sents = brown.tagged_sents(categories='news')
```

We can then improve this further by adding more features, such as `prev-tag`.