#### 06 - Learning to Classify Text

Resource: https://www.nltk.org/book/ch06.html

In [83]:
import nltk

In [84]:
def gender_features(word):
  return {'last_letter' : word[-1]}
gender_features('Shrek')

{'last_letter': 'k'}

In [85]:
 	
import random
from nltk.corpus import names

labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
  [(name, 'female') for name in names.words('female.txt')])
random.shuffle(labeled_names)

In [86]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [87]:
classifier.classify(gender_features('Neo'))

'male'

In [88]:
classifier.classify(gender_features('Trinity'))

'female'

In [89]:
nltk.classify.accuracy(classifier, test_set)

0.77

In [90]:
classifier.show_most_informative_features(10)

Most Informative Features
             last_letter = 'a'            female : male   =     34.4 : 1.0
             last_letter = 'k'              male : female =     32.3 : 1.0
             last_letter = 'f'              male : female =     16.7 : 1.0
             last_letter = 'p'              male : female =     12.6 : 1.0
             last_letter = 'v'              male : female =      9.9 : 1.0
             last_letter = 'd'              male : female =      9.8 : 1.0
             last_letter = 'm'              male : female =      8.9 : 1.0
             last_letter = 'o'              male : female =      8.6 : 1.0
             last_letter = 'r'              male : female =      6.7 : 1.0
             last_letter = 'g'              male : female =      6.3 : 1.0


In [91]:
from nltk.classify import apply_features
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])

In [92]:
def gender_features2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    # Count occurances of each letter
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features[f"count({letter})"] = name.lower().count(letter)
        features[f"has {letter}"] = letter.lower() in name.lower()
    return features

In [93]:
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.728

In [94]:
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [95]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
words_features = list(all_words)[:2000]

def document_features(document):
  document_words = set(document)
  features = {}
  for word in words_features:
    features[f"contains {word}"] = word in document_words
  return features

In [96]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.89

In [97]:
classifier.show_most_informative_features(10)

Most Informative Features
    contains outstanding = True              pos : neg    =     10.6 : 1.0
          contains mulan = True              pos : neg    =      7.7 : 1.0
          contains damon = True              pos : neg    =      7.6 : 1.0
         contains seagal = True              neg : pos    =      7.4 : 1.0
    contains wonderfully = True              pos : neg    =      6.8 : 1.0
          contains waste = True              neg : pos    =      5.5 : 1.0
           contains lame = True              neg : pos    =      5.4 : 1.0
         contains wasted = True              neg : pos    =      5.3 : 1.0
            contains era = True              pos : neg    =      5.3 : 1.0
          contains awful = True              neg : pos    =      5.2 : 1.0


In [98]:
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in sents:
  tokens.extend(sent)
  offset += len(sent)
  boundaries.add(offset-1)

In [99]:
def punct_features(tokens, i):
  return {'next-word-capitalized': tokens[i+1][0].isupper(),
          'prev-word': tokens[i-1].lower(),
          'punct': tokens[i],
          'prev-word-is-one-char': len(tokens[i-1]) == 1}

In [100]:
featuresets = [(punct_features(tokens, i), (i in boundaries))
                for i in range(1, len(tokens)-1)
                if tokens[i] in '.?!']

In [101]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.936026936026936

In [102]:
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words):
        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:
            sents.append(words[start:i+1])
            start = i+1
    if start < len(words):
        sents.append(words[start:])
    return sents

In [103]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [104]:
def dialogue_act_features(post):
  features = {}
  for word in nltk.word_tokenize(post):
    features[f"contains({word.lower()})"] = True
  return features

In [105]:
featuresets = [(dialogue_act_features(post.text), post.get('class'))
                for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.667