In [1]:
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
BASE_DIR = os.getcwd()

data_path = BASE_DIR + '/imp_set.txt'

### Data

In [3]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\narho_000\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\narho_000\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
data = open(data_path, 'r').read()

### Tokenization

In [5]:
lines = []
for l in data.split('\n'):
    lines.append(l)

In [6]:
sentences = [s for l in lines for s in sent_tokenize(l)] # punkt
sentences

['Be kind',
 'Get out of here',
 'Look this over',
 'Paul, do your homework now',
 'Do not clean soot off the window',
 'Turn your phones off, please',
 'Run down to the shop, will you, Peter']

In [7]:
tagged_sentences = []
for s in sentences:
    words = word_tokenize(s)
    tagged = nltk.pos_tag(words) # averaged_perceptron_tagger
    tagged_sentences.append(tagged)
print(tagged_sentences)

[[('Be', 'VB'), ('kind', 'NN')], [('Get', 'VB'), ('out', 'IN'), ('of', 'IN'), ('here', 'RB')], [('Look', 'VB'), ('this', 'DT'), ('over', 'IN')], [('Paul', 'NNP'), (',', ','), ('do', 'VB'), ('your', 'PRP$'), ('homework', 'NN'), ('now', 'RB')], [('Do', 'VB'), ('not', 'RB'), ('clean', 'VB'), ('soot', 'NN'), ('off', 'IN'), ('the', 'DT'), ('window', 'NN')], [('Turn', 'VB'), ('your', 'PRP$'), ('phones', 'NNS'), ('off', 'RP'), (',', ','), ('please', 'VB')], [('Run', 'NNP'), ('down', 'RB'), ('to', 'TO'), ('the', 'DT'), ('shop', 'NN'), (',', ','), ('will', 'MD'), ('you', 'PRP'), (',', ','), ('Peter', 'NNP')]]


#### Note: POS accuracy

`Run down to the shop, will you, Peter` is parsed unexpectedly by `nltk.pos_tag`:
> `[('Run', 'NNP'), ('down', 'RB'), ('to', 'TO'), ('the', 'DT'), ('shop', 'NN'), (',', ','), ('will', 'MD'), ('you', 'PRP'), (',', ','), ('Peter', 'NNP')]`

`Run` is tagged as a `NNP (proper noun, singular)`

I expected an output more like what the [Stanford Parser](http://nlp.stanford.edu:8080/parser/) provides:
> `Run/VBG down/RP to/TO the/DT shop/NN ,/, will/MD you/PRP ,/, Peter/NNP`

`Run` is tagged as a `VGB (verb, gerund/present participle)` - still not quite the `VB` I want, but at least it's a `V*`

_MEANWHILE..._

`nltk.pos_tag` did better with:
> `[('Do', 'VB'), ('not', 'RB'), ('clean', 'VB'), ('soot', 'NN'), ('off', 'IN'), ('the', 'DT'), ('window', 'NN')]`

Compared to [Stanford CoreNLP](http://nlp.stanford.edu:8080/corenlp/process) (note that this is different than what [Stanford Parser](http://nlp.stanford.edu:8080/parser/) outputs):
> `(ROOT (S (VP (VB Do) (NP (RB not) (JJ clean) (NN soot)) (PP (IN off) (NP (DT the) (NN window))))))`

Concern: _clean_ as `VB (verb, base form)` vs `JJ (adjective)` 

**IMPROVE** POS taggers should vote: nltk.pos_tag (averaged_perceptron_tagger), Stanford Parser, CoreNLP, etc.

### Featurization

In [8]:
from enum import Enum, auto
class FeatureName(Enum):
    VERB = auto() # does this sentence contain a VB*?
    FOLLOWING = auto() # is the following word a <POS>? postfixed with _<POS>
    VERB_MODIFIER = auto() # is a <POS> modifying the verb?
    VERB_MODIFYING = auto() # is the verb modifying a <POS>?

In [11]:
import re
from collections import defaultdict

featuresets = []
for ts in tagged_sentences:
    s_features = defaultdict(int)
    for idx, tup in enumerate(ts):
        #print(tup)
        pos = tup[1]
        # FeatureName.VERB
        is_verb = re.match(r"VB.?", pos) is not None
        print(tup, is_verb)
        if is_verb:
            s_features[FeatureName.VERB] += 1
            # FOLLOWING_POS
            next_idx = idx + 1;
            if next_idx < len(ts):
                s_features[f"{FeatureName.FOLLOWING}_{ts[next_idx][1]}"] += 1
            # VERB_MODIFIER
            # VERB_MODIFYING
        else:
            s_features[FeatureName.VERB] = 0
    featuresets.append(dict(s_features))

print()
print(featuresets)

('Be', 'VB') True
('kind', 'NN') False
('Get', 'VB') True
('out', 'IN') False
('of', 'IN') False
('here', 'RB') False
('Look', 'VB') True
('this', 'DT') False
('over', 'IN') False
('Paul', 'NNP') False
(',', ',') False
('do', 'VB') True
('your', 'PRP$') False
('homework', 'NN') False
('now', 'RB') False
('Do', 'VB') True
('not', 'RB') False
('clean', 'VB') True
('soot', 'NN') False
('off', 'IN') False
('the', 'DT') False
('window', 'NN') False
('Turn', 'VB') True
('your', 'PRP$') False
('phones', 'NNS') False
('off', 'RP') False
(',', ',') False
('please', 'VB') True
('Run', 'NNP') False
('down', 'RB') False
('to', 'TO') False
('the', 'DT') False
('shop', 'NN') False
(',', ',') False
('will', 'MD') False
('you', 'PRP') False
(',', ',') False
('Peter', 'NNP') False

[{<FeatureName.VERB: 1>: 0, 'FeatureName.FOLLOWING_NN': 1}, {<FeatureName.VERB: 1>: 0, 'FeatureName.FOLLOWING_IN': 1}, {<FeatureName.VERB: 1>: 0, 'FeatureName.FOLLOWING_DT': 1}, {<FeatureName.VERB: 1>: 0, 'FeatureName.FOLLOW