In [1]:
from nltk.corpus import brown
from nltk import FreqDist
from nltk import DecisionTreeClassifier
from nltk import classify

In [2]:
# Empty frequency counter for word endings; the counting loop in the next section fills it.
suff_fdist = FreqDist()
suff_fdist

FreqDist()

<h3>1. Feature: frequency of word suffixes of length 1, 2 and 3</h3>

In [3]:
# Start from a fresh counter so that re-running this cell does not add the
# whole corpus to the existing counts a second time.
suff_fdist = FreqDist()
for word in brown.words():
    word = word.lower()
    # Count the last 1, 2 and 3 characters of every token.
    # NOTE(review): for tokens shorter than 3 characters these slices coincide
    # (e.g. ','[-1:] == ','[-3:]), so such tokens are counted more than once
    # under the same key -- confirm this is intended.
    for suffix_len in (1, 2, 3):
        suff_fdist[word[-suffix_len:]] += 1

In [4]:
# Show the ten most frequent word endings together with their counts.
suff_fdist.most_common(10)

[('e', 202946),
 (',', 175002),
 ('.', 152999),
 ('s', 128722),
 ('d', 105687),
 ('t', 94459),
 ('he', 92084),
 ('n', 87889),
 ('a', 74912),
 ('of', 72978)]

<h3>2. Find the 100 most common suffixes</h3>

In [5]:
# Keep only the suffix strings of the 100 most frequent endings; the counts are dropped.
common_suffix = [pair[0] for pair in suff_fdist.most_common(100)]
common_suffix

['e',
 ',',
 '.',
 's',
 'd',
 't',
 'he',
 'n',
 'a',
 'of',
 'the',
 'y',
 'r',
 'to',
 'in',
 'f',
 'o',
 'ed',
 'nd',
 'is',
 'on',
 'l',
 'g',
 'and',
 'ng',
 'er',
 'as',
 'ing',
 'h',
 'at',
 'es',
 'or',
 're',
 'it',
 '``',
 'an',
 "''",
 'm',
 ';',
 'i',
 'ly',
 'ion',
 'en',
 'al',
 '?',
 'nt',
 'be',
 'hat',
 'st',
 'his',
 'th',
 'll',
 'le',
 'ce',
 'by',
 'ts',
 'me',
 've',
 "'",
 'se',
 'ut',
 'was',
 'for',
 'ent',
 'ch',
 'k',
 'w',
 'ld',
 '`',
 'rs',
 'ted',
 'ere',
 'her',
 'ne',
 'ns',
 'ith',
 'ad',
 'ry',
 ')',
 '(',
 'te',
 '--',
 'ay',
 'ty',
 'ot',
 'p',
 'nce',
 "'s",
 'ter',
 'om',
 'ss',
 ':',
 'we',
 'are',
 'c',
 'ers',
 'uld',
 'had',
 'so',
 'ey']

<h3>3. Feature extraction </h3>

In [6]:
def pos_features(word, suffixes=None):
    """Build a suffix-based feature dict for a single word.

    Parameters
    ----------
    word : str
        Token to describe; it is lower-cased before matching.
    suffixes : iterable of str, optional
        Suffixes to test against the word. When omitted, the notebook-level
        ``common_suffix`` list is used, which keeps existing one-argument
        calls working unchanged.

    Returns
    -------
    dict
        Maps ``'endswith(<suffix>)'`` to ``True`` when the lower-cased word
        ends with that suffix and ``False`` otherwise, one entry per suffix.
    """
    if suffixes is None:
        # Resolved at call time so the function picks up the current list.
        suffixes = common_suffix
    word = word.lower()
    return {
        'endswith({})'.format(suffix): word.endswith(suffix)
        for suffix in suffixes
    }

In [7]:
# Try the extractor on a made-up token to inspect the feature dict it produces.
pos_features('casng')

{"endswith('')": False,
 "endswith(')": False,
 "endswith('s)": False,
 'endswith(()': False,
 'endswith())': False,
 'endswith(,)': False,
 'endswith(--)': False,
 'endswith(.)': False,
 'endswith(:)': False,
 'endswith(;)': False,
 'endswith(?)': False,
 'endswith(`)': False,
 'endswith(``)': False,
 'endswith(a)': False,
 'endswith(ad)': False,
 'endswith(al)': False,
 'endswith(an)': False,
 'endswith(and)': False,
 'endswith(are)': False,
 'endswith(as)': False,
 'endswith(at)': False,
 'endswith(ay)': False,
 'endswith(be)': False,
 'endswith(by)': False,
 'endswith(c)': False,
 'endswith(ce)': False,
 'endswith(ch)': False,
 'endswith(d)': False,
 'endswith(e)': False,
 'endswith(ed)': False,
 'endswith(en)': False,
 'endswith(ent)': False,
 'endswith(er)': False,
 'endswith(ere)': False,
 'endswith(ers)': False,
 'endswith(es)': False,
 'endswith(ey)': False,
 'endswith(f)': False,
 'endswith(for)': False,
 'endswith(g)': True,
 'endswith(h)': False,
 'endswith(had)': False,
 '

"Feature extraction functions behave like tinted glasses, highlighting some of the properties (colors) in our data and making it impossible to see other properties. The classifier will rely exclusively on these highlighted properties when determining how to label inputs. In this case, the classifier will make its decisions based only on information about which of the common suffixes (if any) a given word has."

<h4> Applying feature extraction </h4>

In [8]:
# Tagged tokens from the 'news' category, each turned into a (features, tag) pair.
tagged_words = brown.tagged_words(categories='news')
feat_sets = [(pos_features(token), tag) for token, tag in tagged_words]

In [9]:
# Peek at the first ten (word, tag) pairs.
tagged_words[:10]

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN')]

In [10]:
# Hold out the first 10% of the feature sets for testing; train on the rest.
size = int(len(feat_sets) * 0.1)
test_set = feat_sets[:size]
train_set = feat_sets[size:]

In [14]:
# Fit a decision tree on the training split and print the resulting classifier.
# NOTE(review): the saved output for this cell is a KeyboardInterrupt, so the
# fit was stopped before finishing -- presumably it is slow on the full
# training set; confirm and consider timing it on a smaller sample first.
clf = DecisionTreeClassifier.train(train_set)
print(clf)

KeyboardInterrupt: 