### Exercise 2

In [1]:
import re
import pprint
from nltk.corpus import brown
import nltk
import random
import numpy as np
import scipy as sp
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

In [2]:
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]

In [3]:
def pos_features(sentence, i, history):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [4]:
class ConsecutivePosTagger(nltk.TaggerI):
    
    def __init__(self, train_sents, features=pos_features):
        self.features = features
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = features(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)
        
    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = self.features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

In [5]:
tagger = ConsecutivePosTagger(train_sents)
print(round(tagger.evaluate(test_sents), 4))

0.7915


### Exercise 2.1.1

In [6]:
def originize(tagged_sents):
    """Change tags to original Brown tags in tagged_sents"""
    return [ [(word, tag.split('-')[0]) for (word,tag) in sent]
            for sent in tagged_sents]

orig_train_sents = originize(train_sents)
orig_test_sents = originize(test_sents)

orig_tagger_1 = ConsecutivePosTagger(orig_train_sents)
print(round(orig_tagger_1.evaluate(orig_test_sents), 4))

0.8314


In [7]:
# I found out that len(tagged_sents) is 4623, and 10% of that is = 462.3, so I rounded it down to 462
# 10% for final testing
news_test = tagged_sents[:462]
# 10% for development testing
news_dev_test = tagged_sents[462:924]
# 80% for training
news_train = tagged_sents[924:]
# print(len(news_test), len(news_dev_test),len(news_train))

### Exercise 2.1.2

In [8]:
train_sents = originize(news_train)
test_sents = originize(news_dev_test)
baseline = ConsecutivePosTagger(orig_train_sents)
print(round(baseline.evaluate(test_sents), 4))

0.8569


Yes, the baseline beats the NLTK tagger

### Exercise 2.2

In [9]:
import numpy as np
import sklearn

from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer


class ScikitConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents,
                 features=pos_features, clf = BernoulliNB(alpha=0.5)):
        # Using pos_features as default.
        # Using BernoulliNB() (with alpha/lidstone 0.5)
        self.features = features
        train_features = []
        train_labels = []
        for tagged_sent in train_sents:
            history = []
            untagged_sent = nltk.tag.untag(tagged_sent)
            
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = features(untagged_sent, i, history)
                train_features.append(featureset)
                train_labels.append(tag)
                history.append(tag)
        v = DictVectorizer()
        X_train = v.fit_transform(train_features)
        y_train = np.array(train_labels)
        clf.fit(X_train, y_train)
        self.classifier = clf
        self.dict = v

    def tag(self, sentence):
        test_features = []
        history = []
        for i, word in enumerate(sentence):
            featureset = self.features(sentence, i, history)
            test_features.append(featureset)
        X_test = self.dict.transform(test_features)
        tags = self.classifier.predict(X_test)
        return zip(sentence, tags)

### Exercise 2.2.1

In [10]:
orig_tagger_2 = ScikitConsecutivePosTagger(train_sents)
print(round(orig_tagger_2.evaluate(test_sents), 4))

0.7654


No, it did not yield the same result

### Exercise 2.2.2

In [11]:
alpha = [1, 0.5, 0.1, 0.01, 0.001, 0.0001]

for a in alpha:
    orig_tagger = ScikitConsecutivePosTagger(train_sents, clf=BernoulliNB(alpha=a))
    print("Alpha: {}, score: {}".format(a,round(orig_tagger.evaluate(test_sents), 4)))

Alpha: 1, score: 0.6646
Alpha: 0.5, score: 0.7654
Alpha: 0.1, score: 0.8309
Alpha: 0.01, score: 0.8276
Alpha: 0.001, score: 0.8237
Alpha: 0.0001, score: 0.8269


The best result with the new tagger was with alpha: 0.1 with a score of: 0.8309

This was a little better than NLTK, which was at 0.8249

### Exercise 2.2.3

In [12]:
def new_pos_features(sentence, i, history):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    features["this-word"] = sentence[i]
    return features

In [13]:
for a in alpha:
    orig_tagger = ScikitConsecutivePosTagger(train_sents, features= new_pos_features, clf=BernoulliNB(alpha=a))
    print("Alpha: {}, score: {}".format(a,round(orig_tagger.evaluate(test_sents), 4)))

Alpha: 1, score: 0.6553
Alpha: 0.5, score: 0.7857
Alpha: 0.1, score: 0.8824
Alpha: 0.01, score: 0.8912
Alpha: 0.001, score: 0.8971
Alpha: 0.0001, score: 0.9024


We extended the feature selector, so it should get a better result.
Yes, the extended feature selector beat the baseline.

The best result was with alpha: 0.0001 with a score of 0.9024

### Exercise 2.3.1

In [14]:
class ScikitConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents,
                 features=pos_features, clf = LogisticRegression(solver='lbfgs', C=1.0)): # Default C to 1, since it already does this
        # Using pos_features as default.
        # Using BernoulliNB() (with alpha/lidstone 0.5)
        self.features = features
        train_features = []
        train_labels = []
        for tagged_sent in train_sents:
            history = []
            untagged_sent = nltk.tag.untag(tagged_sent)
            
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = features(untagged_sent, i, history)
                train_features.append(featureset)
                train_labels.append(tag)
                history.append(tag)
        v = DictVectorizer()
        X_train = v.fit_transform(train_features)
        y_train = np.array(train_labels)
        clf.fit(X_train, y_train)
        self.classifier = clf
        self.dict = v

    def tag(self, sentence):
        test_features = []
        history = []
        for i, word in enumerate(sentence):
            featureset = self.features(sentence, i, history)
            test_features.append(featureset)
        X_test = self.dict.transform(test_features)
        tags = self.classifier.predict(X_test)
        return zip(sentence, tags)

In [15]:
log_tagger = ScikitConsecutivePosTagger(train_sents, features= new_pos_features, clf=LogisticRegression(solver='lbfgs'))
print("score: {}".format(round(log_tagger.evaluate(test_sents), 4)))

score: 0.9263


Yes, this worked better than Bernoulli

### Exercise 2.3.2

In [16]:
c = [0.01, 0.1, 1.0, 10.0, 100.0]

for a in c:
    orig_tagger = ScikitConsecutivePosTagger(train_sents, features= new_pos_features, clf=LogisticRegression(C=a))
    print("C: {}, score: {}".format(a,round(orig_tagger.evaluate(test_sents), 4)))

C: 0.01, score: 0.769
C: 0.1, score: 0.8739
C: 1.0, score: 0.926
C: 10.0, score: 0.9356
C: 100.0, score: 0.9345


C = 10.0 yields the best result with 0.9356, and is better than Naive Bayes

### Exercise 2.4.1

In [17]:
def new_new_pos_features(sentence, i, history):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    features["this-word"] = sentence[i]
    
    if i+1 >= len(sentence):
        features["next-word"] = "<END>"
    else:
        features["next-word"] = sentence[i+1]
    return features

In [18]:
orig_tagger = ScikitConsecutivePosTagger(train_sents, features=new_new_pos_features, clf=LogisticRegression(C=10.0))
print("score with next-word feature : {}".format(round(orig_tagger.evaluate(test_sents), 4)))

score with next-word feature : 0.948


### Exercise 2.4.2

In [19]:
def new_new_new_pos_features(sentence, i, history):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    features["this-word"] = sentence[i]
    
    if i+1 >= len(sentence):
        features["next-word"] = "<END>"
    else:
        features["next-word"] = sentence[i+1]
    
    features["isnumeric"] = sentence[i].isnumeric()
    features["upperFirstLetter"] = sentence[i][0].isupper()
    
    return features

In [20]:
orig_tagger = ScikitConsecutivePosTagger(train_sents, features=new_new_new_pos_features, clf=LogisticRegression(C=10.0, solver='lbfgs'))
print("score with isnumeric and upperFirstLetter feature : {}".format(round(orig_tagger.evaluate(test_sents), 4)))

score with isnumeric and upperFirstLetter feature : 0.9535


### Exercise 2.5.1

In [21]:
print("score with isnumeric and upperFirstLetter feature : {}".format(round(orig_tagger.evaluate(news_test), 4)))

score with isnumeric and upperFirstLetter feature : 0.8938


### Exercise 2.5.2

In [22]:
# brown.categories()
cat = ['belles_lettres', 'editorial', 'fiction', 'government', 'humor', 'learned',
       'lore', 'mystery', 'religion', 'reviews', 'romance', 'science_fiction']
all_tagged_sents = brown.tagged_sents(categories=cat)
size = int(len(all_tagged_sents) * 0.1)
rest_train, rest_dev_test, rest_test = all_tagged_sents[size:-size], all_tagged_sents[:size], all_tagged_sents[-size:]
rest_train = originize(rest_train)
rest_dev_test = originize(rest_dev_test)
rest_test = originize(rest_test)

train = rest_train+news_train
test = rest_test + news_test

orig_tagger = ScikitConsecutivePosTagger(news_train, features=new_new_new_pos_features, clf=LogisticRegression(C=10.0, solver='lbfgs'))
print("Base' : {}".format(round(orig_tagger.evaluate(news_test), 4)))

Base' : 0.9414


### Exercise 2.5.3

In [23]:
orig_tagger = ScikitConsecutivePosTagger(train, features=new_new_new_pos_features, clf=LogisticRegression(C=10.0, solver='lbfgs'))
print("score with all categories except 'hobbies' and 'adventure' : {}".format(round(orig_tagger.evaluate(test), 4)))


score with all categories except 'hobbies' and 'adventure' : 0.9556


### Exercise 2.5.4

In [24]:
hobbies_sents = brown.tagged_sents(categories='hobbies')
adventure_sents = brown.tagged_sents(categories='adventure')

hobbies_sents = originize(hobbies_sents)
adventure_sents = originize(adventure_sents)

print("score with hobbies : {}".format(round(orig_tagger.evaluate(hobbies_sents), 4)))
print("score with adventure : {}".format(round(orig_tagger.evaluate(adventure_sents), 4)))

score with hobbies : 0.9501
score with adventure : 0.9594


The results were within margin the same, but there are of course some words used in the different categories that did not occur that often in other categories.

The fact that they are so close (adventure beating the 'all categories but hobbies and adventure'), I think maybe I did something wrong..

### Exercise 2.6.1

In [25]:
import time

In [29]:
# Start timer
start_time = time.clock()
news_hmm_tagger = nltk.HiddenMarkovModelTagger.train(news_train)
print(time.clock() - start_time, "seconds")
print("HMM-tagger with news_train: {}".format(round(news_hmm_tagger.evaluate(news_test), 4)))

0.8924119999992399 seconds
HMM-tagger with news_train: 0.8597


In [30]:
# Start timer
start_time = time.clock()
news_hmm_tagger = nltk.HiddenMarkovModelTagger.train(train)
print("HMM-tagger with news_train: {}".format(round(news_hmm_tagger.evaluate(test), 4)))
print(time.clock() - start_time, "seconds")

HMM-tagger with news_train: 0.9225
224.80315399999927 seconds


The HMM tagger was a little worse in this test than my tagger, but it ran pretty fast.

### Exercise 2.6.2

In [31]:
start_time = time.clock()
per_tagger = nltk.PerceptronTagger(load=False)
per_tagger.train(train)
print("Perceptron-tagger with news_train: {}".format(round(news_hmm_tagger.evaluate(test), 4)))
print(time.clock() - start_time, "seconds")

Perceptron-tagger with news_train: 0.9225
694.7592920000006 seconds


This took way longer, and was not as good as mine was. (Not saying that I made a better overall tagger, and that I did not do anything wrong)

The time it took compared to the HMM-tagger was about three times longer.