In [1]:
%matplotlib inline

In [2]:
import nltk, re, pprint
from nltk import word_tokenize

In [3]:
# Sentences of NLTK's `treebank_raw` corpus; each sentence is iterated as a
# sequence of token strings by the flattening loop further down.
sents = nltk.corpus.treebank_raw.sents()

In [4]:
# Flat list that will hold every token of every sentence, in corpus order.
tokens = []

In [5]:
# Indices into `tokens` of the last token of each sentence (the gold
# sentence boundaries used as classification labels).
boundaries = set()

In [6]:
# Running count of tokens consumed so far while flattening `sents`.
offset = 0

In [7]:
# Flatten `sents` into `tokens` and record, in `boundaries`, the index of the
# final token of each sentence.
#
# The accumulators are rebuilt from scratch here so this cell is idempotent:
# re-running it no longer appends a second copy of the corpus to `tokens` or
# shifts the boundary indices.
tokens, boundaries, offset = [], set(), 0
for sent in sents:
    if not sent:
        # An empty sentence has no final token; recording `offset-1` for it
        # would add a bogus index (-1 if it were the first sentence).
        continue
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)

In [8]:
def punct_features(tokens, i):
    """Build the feature dict describing the punctuation token at ``tokens[i]``.

    Args:
        tokens: Indexable sequence of token strings.
        i: Non-negative index of the punctuation token to describe.

    Returns:
        dict with four features:
          * ``'next-word-capitalized'`` -- whether the following token starts
            with an uppercase character (``False`` when there is no following
            token or it is empty).
          * ``'prev-word'`` -- the preceding token, lower-cased (``''`` when
            ``i`` is the first position).
          * ``'punct'`` -- the punctuation token itself.
          * ``'prev-word-is-one-char'`` -- whether the preceding token is a
            single character.

    Raises:
        IndexError: if ``i`` itself is outside ``tokens``.
    """
    # Treat a missing neighbour as an empty string instead of indexing past
    # the end (IndexError) or silently wrapping around to tokens[-1] at i == 0.
    prev_word = tokens[i-1] if i > 0 else ''
    next_word = tokens[i+1] if i + 1 < len(tokens) else ''
    return {'next-word-capitalized': next_word[:1].isupper(),  # [:1] is safe on ''
            'prev-word': prev_word.lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(prev_word) == 1}

In [9]:
# Labelled examples for the sentence-boundary classifier: one
# (features, is_boundary) pair per candidate punctuation token.  The first
# and last corpus positions are excluded so every candidate has both a
# preceding and a following token.
featuresets = [
    (punct_features(tokens, pos), pos in boundaries)
    for pos in range(1, len(tokens) - 1)
    if tokens[pos] in '.?!'
]

In [10]:
# Number of examples held out for testing: 10% of `featuresets`, truncated
# to an integer.
size = int(len(featuresets) * 0.1)

In [11]:
# The first `size` examples form the test set and the remainder the training
# set; the examples are taken in their existing order (no shuffling).
train_set, test_set = featuresets[size:], featuresets[:size]

In [12]:
# Train the sentence-boundary model.  `segment_sentences` reads this
# module-level `classifier` name each time it is called.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [13]:
# Accuracy of the sentence-boundary classifier on the held-out test set.
nltk.classify.accuracy(classifier, test_set)
# 0.936026936026936

0.936026936026936

In [14]:
def segment_sentences(words):
    """Split a flat token sequence into sentences with the trained classifier.

    Every token accepted by the ``word in '.?!'`` test (the same candidate
    filter used to build the training examples) is passed through
    ``punct_features`` and the module-level ``classifier``; a sentence ends
    after each token the classifier labels ``True``.

    Args:
        words: Sequence of token strings supporting ``len``, indexing and
            slicing (e.g. a list).

    Returns:
        list of slices of ``words``, one per sentence, in order.  Tokens after
        the last detected boundary are returned as a final sentence, so
        concatenating the result reproduces ``words``.  Empty input yields
        ``[]``.

    Note:
        ``classifier`` and ``punct_features`` are looked up in the module
        namespace at call time.
    """
    last = len(words) - 1
    start = 0
    segments = []  # renamed from `sents` to avoid shadowing the corpus variable
    for i, word in enumerate(words):
        if i == last:
            # The final token has no following word to build features from
            # (previously an IndexError for input ending in '.', '?' or '!').
            # It always closes the last sentence, which is flushed below.
            break
        # Labels are booleans, so the explicit comparison is kept as-is.
        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:
            segments.append(words[start:i+1])
            start = i+1
    if start < len(words):
        segments.append(words[start:])
    return segments

In [15]:
# First 10,000 posts of NLTK's `nps_chat` corpus as XML elements; later cells
# read each post's `.text` and its `class` attribute.
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [16]:
def dialogue_act_features(post):
    """Return bag-of-words features for one post.

    Args:
        post: The post's text as a string.

    Returns:
        dict mapping ``'contains(<token>)'`` to ``True`` for every distinct
        lower-cased token that ``nltk.word_tokenize`` finds in ``post``.
    """
    return {'contains({})'.format(token.lower()): True
            for token in nltk.word_tokenize(post)}

In [None]:
# Pair each post's bag-of-words features with the value of its `class`
# attribute (presumably the dialogue-act label -- confirm against the corpus).
# NOTE(review): this cell has no execution count (`In [None]`) and it rebinds
# `featuresets`, which until now held the sentence-boundary examples.  It must
# be executed before the split/train cells that follow, otherwise they operate
# on the earlier data.
featuresets = [(dialogue_act_features(post.text), post.get('class'))
               for post in posts]

In [17]:
# 10% hold-out size for whatever `featuresets` currently holds; rebinds the
# `size` computed for the earlier split.
size = int(len(featuresets) * 0.1)

In [18]:
# Same convention as the earlier split: first `size` examples for testing,
# the rest for training.  Rebinds `train_set` / `test_set`.
train_set, test_set = featuresets[size:], featuresets[:size]

In [19]:
# NOTE(review): this rebinds the module-level `classifier` that
# `segment_sentences` looks up at call time, so any call to
# `segment_sentences` made after this cell uses this model instead of the
# sentence-boundary one.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [20]:
# Accuracy of the classifier trained in the previous cell on its held-out set.
# NOTE(review): the output recorded below (0.936026936026936) is identical to
# the sentence-boundary accuracy shown earlier and does not match the 0.67
# noted here.  That is consistent with the dialogue-act `featuresets` cell
# (`In [None]`) not having been run -- restart the kernel and run all cells to
# confirm.
print(nltk.classify.accuracy(classifier, test_set))
# 0.67

0.936026936026936


In [21]:
def rte_features(rtepair):
    """Summarise a text/hypothesis pair as four count features.

    Args:
        rtepair: An RTE pair accepted by ``nltk.RTEFeatureExtractor``.

    Returns:
        dict with the sizes of the sets returned by the extractor's
        ``overlap`` and ``hyp_extra`` methods, for the ``'word'`` and ``'ne'``
        token types: ``word_overlap``, ``word_hyp_extra``, ``ne_overlap``,
        ``ne_hyp_extra``.
    """
    fx = nltk.RTEFeatureExtractor(rtepair)
    return {
        'word_overlap': len(fx.overlap('word')),
        'word_hyp_extra': len(fx.hyp_extra('word')),
        'ne_overlap': len(fx.overlap('ne')),
        'ne_hyp_extra': len(fx.hyp_extra('ne')),
    }

In [22]:
# Pick a single pair (index 33) from the `rte3_dev.xml` file of NLTK's RTE
# corpus to inspect by hand.
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]

In [23]:
# Build the feature extractor for that pair so the word sets behind
# `rte_features` can be examined in the following cells.
extractor = nltk.RTEFeatureExtractor(rtepair)

In [24]:
# Words the extractor kept from the text side of the pair.  The comment below
# lists only a sample; a set prints in arbitrary order.
print(extractor.text_words)
# {'Russia', 'Organisation', 'Shanghai', 'Asia', 'four', 'at',
# 'operation', 'SCO', ...}

{'SCO', 'republics', 'Co', 'Russia', 'was', 'terrorism.', 'binds', 'at', 'former', 'China', 'meeting', 'Iran', 'association', 'together', 'fight', 'Shanghai', 'Asia', 'Soviet', 'representing', 'central', 'Parviz', 'operation', 'that', 'four', 'Organisation', 'Davudi', 'fledgling'}


In [25]:
# Words the extractor kept from the hypothesis side of the pair.
# Expected-output comment updated to match the recorded run: the hypothesis
# token is 'SCO.' (trailing period attached), not 'SCO'.
print(extractor.hyp_words)
# {'China', 'member', 'SCO.'}

{'China', 'member', 'SCO.'}


In [26]:
# Overlap between text and hypothesis for token type 'word'; empty for this
# pair.
print(extractor.overlap('word'))
# set()

set()


In [27]:
# Overlap between text and hypothesis for token type 'ne'.
# Expected-output comment updated to match the recorded run, which yields only
# 'China'.  NOTE(review): presumably 'SCO' is missing because the hypothesis
# token is 'SCO.' while the text has 'SCO' -- confirm.
print(extractor.overlap('ne'))
# {'China'}

{'China'}


In [28]:
# Result of `hyp_extra` for token type 'word' (presumably words found only in
# the hypothesis -- confirm against the NLTK documentation).
print(extractor.hyp_extra('word'))
# {'member'}

{'member'}
