In [1]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.corpus import brown

In [3]:
documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() 
            for fileid in movie_reviews.fileids(categories=category)]

In [5]:
random.shuffle(documents)

In [7]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

In [9]:
word_features = all_words.keys()

In [15]:
word_features = list(word_features)[:2000]

In [16]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [17]:
document_features(movie_reviews.words('pos/cv957_8737.txt'))

{'contains(plot)': True,
 'contains(:)': True,
 'contains(two)': True,
 'contains(teen)': False,
 'contains(couples)': False,
 'contains(go)': False,
 'contains(to)': True,
 'contains(a)': True,
 'contains(church)': False,
 'contains(party)': False,
 'contains(,)': True,
 'contains(drink)': False,
 'contains(and)': True,
 'contains(then)': True,
 'contains(drive)': False,
 'contains(.)': True,
 'contains(they)': True,
 'contains(get)': True,
 'contains(into)': True,
 'contains(an)': True,
 'contains(accident)': False,
 'contains(one)': True,
 'contains(of)': True,
 'contains(the)': True,
 'contains(guys)': False,
 'contains(dies)': False,
 'contains(but)': True,
 'contains(his)': True,
 'contains(girlfriend)': True,
 'contains(continues)': False,
 'contains(see)': False,
 'contains(him)': True,
 'contains(in)': True,
 'contains(her)': False,
 'contains(life)': False,
 'contains(has)': True,
 'contains(nightmares)': False,
 'contains(what)': True,
 "contains(')": True,
 'contains(s)': T

In [18]:
featuresets = [(document_features(d), c) for (d,c) in documents]

In [19]:
train_set, test_set = featuresets[100:], featuresets[:100]

In [20]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [21]:
nltk.classify.accuracy(classifier, test_set)

0.87

In [22]:
classifier.show_most_informative_features(5)

Most Informative Features
 contains(unimaginative) = True              neg : pos    =      7.8 : 1.0
    contains(schumacher) = True              neg : pos    =      7.1 : 1.0
     contains(atrocious) = True              neg : pos    =      6.7 : 1.0
        contains(suvari) = True              neg : pos    =      6.4 : 1.0
          contains(mena) = True              neg : pos    =      6.4 : 1.0


In [24]:
suffix_fdist = nltk.FreqDist()

In [26]:
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

In [28]:
common_suffixs = list(suffix_fdist.keys())[:100]

In [29]:
common_suffixs

['e',
 'he',
 'the',
 'n',
 'on',
 'ton',
 'y',
 'ty',
 'nty',
 'd',
 'nd',
 'and',
 'ry',
 'ury',
 'id',
 'aid',
 'ay',
 'day',
 'an',
 'ion',
 'f',
 'of',
 's',
 "'s",
 "a's",
 't',
 'nt',
 'ent',
 'ary',
 'ed',
 'ced',
 '`',
 '``',
 'o',
 'no',
 'ce',
 'nce',
 "'",
 "''",
 'at',
 'hat',
 'ny',
 'any',
 'es',
 'ies',
 'k',
 'ok',
 'ook',
 'ace',
 '.',
 'r',
 'er',
 'her',
 'in',
 'end',
 'ts',
 'nts',
 'ity',
 've',
 'ive',
 'ee',
 'tee',
 ',',
 'h',
 'ch',
 'ich',
 'ad',
 'had',
 'l',
 'll',
 'all',
 'ge',
 'rge',
 'ves',
 'se',
 'ise',
 'ks',
 'nks',
 'a',
 'ta',
 'nta',
 'or',
 'for',
 'ner',
 'as',
 'was',
 'ted',
 'ber',
 'm',
 'rm',
 'erm',
 'en',
 'een',
 'ged',
 'by',
 'ior',
 'rt',
 'urt',
 'dge',
 'od']

In [30]:
def pos_features(word):
    features = {}
    for suffix in common_suffixs:
        features['endswith(%s)'%suffix] = word.lower().endswith(suffix)
    return features

In [31]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for n,g in tagged_words]
size = int( len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.DecisionTreeClassifier.train(train_set)

In [32]:
nltk.classify.accuracy(classifier, test_set)

0.5689706613625062

In [33]:
classifier.classify(pos_features('cats'))

'NNS'

In [34]:
print(classifier.pseudocode(depth=4))

if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: return '.'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(was) == False: return 'PP$'
      if endswith(was) == True: return 'BEDZ'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



In [2]:
def pos_feature(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
               "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [3]:
pos_feature(brown.sents()[0], 8)

{'prev-word': 'an', 'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion'}

In [4]:
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append((pos_feature(untagged_sent, i), tag))

In [5]:
size = int(len(untagged_sent)*0.1)
train_set , test_set = featuresets[size:], featuresets[:size]

In [6]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [8]:
nltk.classify.accuracy(classifier, test_set)

0.6666666666666666

In [9]:
classifier.show_most_informative_features()

Most Informative Features
               suffix(1) = '.'                 . : NN     =   7868.8 : 1.0
               suffix(2) = 'he'               AT : NN     =   3709.9 : 1.0
               suffix(2) = 'ho'              WPS : NN     =   3476.7 : 1.0
               suffix(1) = 'r'               JJR : NNS    =   2630.7 : 1.0
               suffix(2) = 'to'               TO : JJ     =   2490.1 : 1.0
               suffix(1) = 'h'               ABX : NNS    =   2335.5 : 1.0
               suffix(3) = 'hat'              CS : NN     =   1914.9 : 1.0
               suffix(2) = 'es'              NNS : IN     =   1844.1 : 1.0
               suffix(2) = 'I'              PPSS : NN     =   1779.7 : 1.0
               suffix(1) = "'"                '' : JJ     =   1689.8 : 1.0
